Skip to content

Utilities used for pattern and ID replacements

def get_entity_dict_from_api_v2(id_):
    """Fetch the entity dict for a Wikidata ID through the ``wbgetentities`` API.

    Args:
        id_: Wikidata identifier to look up (e.g. ``"Q42"``).

    Returns:
        The value stored under ``id_`` in the ``entities`` field of the JSON
        response.

    Raises:
        KeyError: if the response has no ``entities`` field or no entry
            keyed by ``id_``.
    """
    # Hand the ID over via ``params`` so requests URL-encodes it instead of
    # splicing raw text into the query string.
    params = {
        "action": "wbgetentities",
        "ids": id_,
        "format": "json",
    }
    r = requests.get("https://www.wikidata.org/w/api.php", params=params)
    return r.json()["entities"][id_]


def get_entity_id_from_api_v2(label):
    """Return the ID of the first ``wbsearchentities`` hit for an English label.

    Args:
        label: Free-text label to search for.

    Returns:
        The ``id`` field of the first search result.

    Raises:
        KeyError: if the response has no ``search`` field.
        IndexError: if the search returned no results.
    """
    # Labels are free text: characters such as '&', '#', '+' or '%' would
    # corrupt a hand-built query string, so let requests encode the value.
    params = {
        "action": "wbsearchentities",
        "search": label,
        "language": "en",
        "format": "json",
    }
    r = requests.get("https://www.wikidata.org/w/api.php", params=params)
    return r.json()["search"][0]["id"]


def get_property_id_from_api_v2(label):
    """Return the ID of the first property found by ``wbsearchentities`` for a label.

    Same search as for entities, but restricted with ``type=property``.

    Args:
        label: Free-text English label of the property.

    Returns:
        The ``id`` field of the first search result.

    Raises:
        KeyError: if the response has no ``search`` field.
        IndexError: if the search returned no results.
    """
    # Labels are free text: characters such as '&', '#', '+' or '%' would
    # corrupt a hand-built query string, so let requests encode the value.
    params = {
        "action": "wbsearchentities",
        "search": label,
        "language": "en",
        "format": "json",
        "type": "property",
    }
    r = requests.get("https://www.wikidata.org/w/api.php", params=params)
    return r.json()["search"][0]["id"]


def get_patterns(wiki_prefixes):
    """Build one compiled regex per prefix for spotting ``prefix:ID`` tokens.

    E.g. the prefix ``wdt`` yields a pattern that finds ``wdt:P31`` in a query.
    An ID is a single letter followed by digits; matching is case-insensitive.

    Args:
        wiki_prefixes: Iterable of prefix strings (iterating a dict yields
            its keys).

    Returns:
        A dict mapping each prefix to its compiled pattern.
    """
    # ``re.escape`` keeps the prefix literal. The lookarounds force a whole
    # token: the prefix may not be the tail of a longer name (``p`` inside
    # ``wdp:P31``) and the ID may not run into further word characters.
    return {
        prefix: re.compile(
            rf"(?<!\w){re.escape(prefix)}:[A-Z]\d+(?!\w)", re.IGNORECASE
        )
        for prefix in wiki_prefixes
    }


def get_schema_patterns(schema_dict):
    """Build one compiled regex per schema prefix for spotting ``prefix:name`` tokens.

    E.g. the prefix ``wikibase`` yields a pattern that finds
    ``wikibase:directClaim`` in a query. Matching is case-insensitive.

    Args:
        schema_dict: Iterable of prefix strings (iterating a dict yields
            its keys).

    Returns:
        A dict mapping each prefix to its compiled pattern.
    """
    # ``re.escape`` keeps the prefix literal; the lookbehind stops a prefix
    # from matching as the tail of a longer name (``rdfs`` inside ``xrdfs:``).
    return {
        prefix: re.compile(rf"(?<!\w){re.escape(prefix)}:\w+", re.IGNORECASE)
        for prefix in schema_dict
    }


def clean_label(label):
    """Return ``label`` with every character removed that is not an ASCII letter, a digit or a space."""
    disallowed = re.compile(r"[^a-zA-Z0-9 ]")
    return disallowed.sub("", label)


def replace_ids(query, pattern, wiki_prefix):
    """Replace Wikidata IDs in a query with their English text labels.

    Every token matched by ``pattern`` (e.g. ``wdt:P31``) is rewritten as
    ``<wiki_prefix>:<label>``, where the label is the entity's English label
    with non-alphanumeric characters dropped and spaces turned into
    underscores. One comment line per distinct token, mapping the label back
    to its ID and full URI, is prepended to the query, and a ``PREFIX``
    declaration is placed above those comments.

    Args:
        query: Query text to rewrite.
        pattern: Regex (string or compiled) matching ``prefix:ID`` tokens.
        wiki_prefix: Key into the module-level ``wiki_prefixes`` mapping; also
            used as the prefix of the rewritten tokens.

    Returns:
        The rewritten query, or ``query`` unchanged when nothing matches.

    Raises:
        KeyError: if ``wiki_prefix`` is not in ``wiki_prefixes`` or the entity
            has no English label.
    """
    base_uri = wiki_prefixes[wiki_prefix]

    # Distinct matched tokens in first-seen order, so an ID that occurs
    # several times is looked up once and documented once.
    tokens = dict.fromkeys(m.group(0) for m in re.finditer(pattern, query))
    if not tokens:
        return query

    replacements = {}
    header_lines = []
    for token in tokens:
        id_ = re.sub(r"[^a-zA-Z0-9]", "", token.split(":")[1])
        # NOTE(review): only get_entity_dict_from_api_v2 is visible in this
        # part of the file -- confirm the un-suffixed helper exists.
        entity = get_entity_dict_from_api(id_)
        label = clean_label(entity["labels"]["en"]["value"]).replace(" ", "_")
        replacements[token] = f"{wiki_prefix}:{label}"
        header_lines.append(
            f"# {wiki_prefix}:{label} is the text label corresponding to ID:{id_} (<{base_uri}{id_}>)\n"
        )

    # Substitute at the regex match positions rather than with str.replace on
    # the match text: replacing "wdt:P3" textually would also mangle "wdt:P31".
    query = re.sub(pattern, lambda m: replacements[m.group(0)], query)

    # Comments are stacked so that the most recently resolved ID ends up on
    # top, directly below the PREFIX declaration.
    header_lines.reverse()
    return "PREFIX " + wiki_prefix + ": <" + base_uri + "> \n" + "".join(header_lines) + query


def add_schema_prefix(query, pattern, schema_prefix):
    """Declare ``schema_prefix`` at the top of ``query`` when the query uses it.

    The namespace URI is read from the module-level ``schema_dict``. The
    lookup happens whether or not anything matches, so an unknown prefix
    raises ``KeyError`` either way.

    Args:
        query: Query text to inspect.
        pattern: Regex (string or compiled) detecting uses of the prefix.
        schema_prefix: Key into ``schema_dict``.

    Returns:
        ``query`` with a ``PREFIX`` line prepended if ``pattern`` occurs in
        it, otherwise ``query`` unchanged.
    """
    found = re.findall(pattern, query)
    namespace = schema_dict[schema_prefix]
    if not found:
        return query
    # The declaration goes on its own line above the existing query text.
    return "".join(("PREFIX ", schema_prefix, ": <", namespace, "> \n", query))