Source code for wikidata2df.wikidata2df

"""Main module."""

from collections import defaultdict
from functools import lru_cache
from itertools import product, chain
import pandas as pd
import requests


[docs]def perform_query(query): """Perform a SPARQL query to the wikidata endpoint. A simple request with the header 'Accept' pointing to a json result. Args: query(str): A string containing a functional sparql query Returns: A json (dict) with the response content. Raises: requests.exceptions.HTTPError """ endpoint_url = "https://query.wikidata.org/sparql" try: response = requests.get( endpoint_url, params={"query": query}, headers={"Accept": "application/sparql-results+json"}, ) response.raise_for_status() except requests.exceptions.HTTPError as err: raise requests.exceptions.HTTPError(err) else: raw_results = response.json() return raw_results
[docs]def parse_query_results(query_result): """Parse wikidata query json into a nice dataframe Args: query_result(dict): A json dict with the results from the query Returns: A Pandas DataFrame with the query results. """ parsed_results = defaultdict(list) data = query_result["results"]["bindings"] keys = frozenset(chain.from_iterable(data)) for json_key, item in product(data, keys): try: parsed_results[item].append(json_key[item]["value"]) except: # If there is no data for a key, append None parsed_results[item].append(None) results_df = pd.DataFrame.from_dict(parsed_results).replace( {"http://www.wikidata.org/entity/": ""}, regex=True ) return results_df
[docs]@lru_cache(maxsize=10) def wikidata2df(query): """Transform a wikidata SPARQL query into a Pandas DataFrame Wrapper function that performs a request to the wikidata endpoint and returns a dataframe. If there is no result found, it will raise an exception. If there were optional fields in your query, the result will have rows with value "None", corresponding to values that were not found. Args: query(str): A string containing a valid SPARQL query. Returns: A Pandas DataFrame with the results of the query. """ query_res = perform_query(query) parsed_res = parse_query_results(query_res) return parsed_res