Utilities used for pattern and ID replacements
def get_entity_dict_from_api_v2(id_, timeout=10):
    """Fetch the Wikidata record for a single entity or property ID.

    Calls the ``wbgetentities`` action of the Wikidata API and returns the
    object stored under ``id_`` in the ``entities`` part of the JSON response.

    Args:
        id_: Wikidata identifier (a letter followed by digits, e.g. ``"P31"``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON object describing the entity.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        KeyError: If the response holds no ``entities`` entry for ``id_``.
    """
    # Pass the query via ``params`` so the values are URL-encoded instead of
    # being pasted verbatim into the query string.
    response = requests.get(
        "https://www.wikidata.org/w/api.php",
        params={"action": "wbgetentities", "ids": id_, "format": "json"},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()["entities"][id_]
def get_entity_id_from_api_v2(label, timeout=10):
    """Look up the Wikidata entity ID that best matches an English label.

    Calls the ``wbsearchentities`` action of the Wikidata API and returns the
    ``id`` of the first search hit.

    Args:
        label: Text to search for (searched with ``language=en``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``id`` field of the first entry in the ``search`` result list.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        IndexError: If the search returned no hits.
        KeyError: If the response does not contain a ``search`` list.
    """
    # ``params`` URL-encodes the label, so characters such as ``&``, ``#`` or
    # ``=`` inside it can no longer corrupt the query string.
    response = requests.get(
        "https://www.wikidata.org/w/api.php",
        params={
            "action": "wbsearchentities",
            "search": label,
            "language": "en",
            "format": "json",
        },
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()["search"][0]["id"]
def get_property_id_from_api_v2(label, timeout=10):
    """Look up the Wikidata property ID that best matches an English label.

    Same as the entity search, but the ``wbsearchentities`` request is
    restricted with ``type=property``.

    Args:
        label: Text to search for (searched with ``language=en``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``id`` field of the first entry in the ``search`` result list.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        IndexError: If the search returned no hits.
        KeyError: If the response does not contain a ``search`` list.
    """
    # ``params`` URL-encodes the label, so characters such as ``&``, ``#`` or
    # ``=`` inside it can no longer corrupt the query string.
    response = requests.get(
        "https://www.wikidata.org/w/api.php",
        params={
            "action": "wbsearchentities",
            "search": label,
            "language": "en",
            "format": "json",
            "type": "property",
        },
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()["search"][0]["id"]
def get_patterns(wiki_prefixes):
    """Build one compiled regex per prefix for spotting prefixed IDs in a query.

    For a prefix such as ``wdt`` the resulting pattern finds every occurrence
    of ``wdt:<ID>`` where ``<ID>`` is exactly one letter followed by digits
    (e.g. ``wdt:P31``). Matching is case-insensitive.

    Args:
        wiki_prefixes: Iterable of prefix names; when a dict is given, its
            keys are used.

    Returns:
        A dict mapping each prefix to its compiled pattern.
    """
    return {
        prefix: re.compile(
            # - lookbehind: the prefix must not be the tail of a longer name
            #   (so prefix "d" does not fire inside "wd:Q42");
            # - re.escape: the prefix is literal text, not regex syntax;
            # - lookahead: the ID must end after its digits (no "wd:Q42abc").
            rf"(?<![\w-]){re.escape(prefix)}:[A-Z]\d+(?!\w)",
            re.IGNORECASE,
        )
        for prefix in wiki_prefixes
    }
def get_schema_patterns(schema_dict):
    """Build one compiled regex per schema prefix for spotting its use in a query.

    For a prefix such as ``wikibase`` the resulting pattern finds every
    ``wikibase:<name>`` occurrence (e.g. ``wikibase:directClaim``), where
    ``<name>`` is a run of word characters. Matching is case-insensitive.

    Args:
        schema_dict: Iterable of prefix names; when a dict is given, its keys
            are used.

    Returns:
        A dict mapping each prefix to its compiled pattern.
    """
    return {
        prefix: re.compile(
            # The lookbehind stops the prefix from matching as the tail of a
            # longer name ("schema" inside "xschema:name"); re.escape keeps
            # the prefix literal even if it contains regex metacharacters.
            rf"(?<![\w-]){re.escape(prefix)}:\w+",
            re.IGNORECASE,
        )
        for prefix in schema_dict
    }
def clean_label(label):
    """Strip every character that is not an ASCII letter, a digit or a space."""
    disallowed = re.compile(r"[^a-zA-Z0-9 ]")
    return disallowed.sub("", label)
def replace_ids(query, pattern, wiki_prefix):
    """Replace prefixed Wikidata IDs in a query with readable label names.

    Every distinct ``prefix:ID`` occurrence found by ``pattern`` is looked up
    through ``get_entity_dict_from_api``; its English label is cleaned with
    ``clean_label``, spaces are turned into underscores, and the occurrence is
    rewritten as ``<wiki_prefix>:<label>``. For each distinct ID one ``#``
    comment line documenting the label/ID/URI mapping is put in front of the
    query, and a ``PREFIX`` declaration for ``wiki_prefix`` is put in front of
    those. A query without matches is returned unchanged.

    Args:
        query: Query text to rewrite.
        pattern: Regex (string or compiled) matching ``prefix:ID`` tokens.
        wiki_prefix: Prefix name; must be a key of the module-level
            ``wiki_prefixes`` mapping, whose value is used as the base URI.

    Returns:
        The rewritten query string.

    Raises:
        KeyError: If ``wiki_prefix`` is not in ``wiki_prefixes`` or an entity
            has no English label.
    """
    # Use the whole matched text even if ``pattern`` contains groups.
    found = [m.group(0) for m in re.finditer(pattern, query)]
    base_uri = wiki_prefixes[wiki_prefix]
    if not found:
        return query

    replacements = {}
    comment_lines = []
    # dict.fromkeys de-duplicates while keeping first-occurrence order, so an
    # ID used several times costs one API call and yields one comment line.
    for match in dict.fromkeys(found):
        id_ = re.sub(r"[^a-zA-Z0-9]", "", match.split(":")[1])
        entity = get_entity_dict_from_api(id_)
        label = clean_label(entity["labels"]["en"]["value"]).replace(" ", "_")
        replacements[match] = f"{wiki_prefix}:{label}"
        comment_lines.append(
            f"# {wiki_prefix}:{label} is the text label corresponding to ID:{id_} (<{base_uri}{id_}>)\n"
        )

    # Substitute in one regex pass: each token is replaced exactly where the
    # pattern matched it, so a short ID (wd:Q4) can never clobber the start of
    # a longer one (wd:Q42) the way a plain str.replace would.
    query = re.sub(pattern, lambda m: replacements[m.group(0)], query)

    # Comment lines are stacked so that the last ID found ends up on top,
    # directly below the PREFIX declaration.
    header = "PREFIX " + wiki_prefix + ": <" + base_uri + "> \n"
    return header + "".join(reversed(comment_lines)) + query
def add_schema_prefix(query, pattern, schema_prefix):
    """Declare ``schema_prefix`` at the top of the query when the query uses it.

    The base URI is read from the module-level ``schema_dict``. If ``pattern``
    matches nowhere in ``query``, the query is returned untouched.
    """
    prefix_is_used = re.search(pattern, query) is not None
    namespace = schema_dict[schema_prefix]
    if not prefix_is_used:
        return query
    # Put the PREFIX declaration on its own line ahead of the query text.
    return "".join(("PREFIX ", schema_prefix, ": <", namespace, "> \n", query))