Toponyms and their Countries

By Eduardo Graells-Garrido.

In this notebook we generate a dictionary of {place_uri => country_uri}, so that other notebooks that need to determine the country of origin of a biography do not have to parse the entire database again to find this information.

The result is one dictionary per language. Equivalences between languages have not been resolved yet.


In [ ]:
from __future__ import print_function, unicode_literals
import gzip
import dbpedia_config
import ujson as json
import os
import networkx as nx
import rdflib.graph as rdf

from dbpedia_utils import iter_entities_from

In [ ]:
# Input folder, output folder and the language editions to process, all taken
# from the shared project configuration module.
data_folder, target_folder, languages = (
    dbpedia_config.DATA_FOLDER,
    dbpedia_config.TARGET_FOLDER,
    dbpedia_config.LANGUAGES,
)

In [ ]:
# Parse the ontology file stored in the data folder into an RDF graph.
ontology_path = '{0}/dbpedia.owl'.format(data_folder)

with open(ontology_path, 'r') as ontology_file:
    empty_graph = rdf.Graph()
    ontology = empty_graph.parse(ontology_file)

In [ ]:
# Directed class hierarchy: every rdfs:subClassOf triple (child, parent) becomes
# an edge parent -> child, so the descendants of a node are its subclasses.
SUBCLASS_OF = 'http://www.w3.org/2000/01/rdf-schema#subClassOf'

ontology_graph = nx.DiGraph()
ontology_graph.add_edges_from(
    (str(parent), str(child))
    for child, predicate, parent in ontology
    if str(predicate) == SUBCLASS_OF
)

In [ ]:
# PopulatedPlace itself plus every class reachable from it in the hierarchy graph.
populated_place = 'http://dbpedia.org/ontology/PopulatedPlace'
place_classes = nx.descendants(ontology_graph, populated_place) | {populated_place}
place_classes

In [ ]:
def create_country_dictionary(language):
    """Build the {place_uri: country_uri} dictionary for one language and save it.

    Two passes are made over the dumps found in ``data_folder``:

    1. ``instance_types_<language>.ttl.bz2`` is scanned to collect the resources
       typed as ``dbo:Country`` and the resources whose types intersect
       ``place_classes``.
    2. ``mappingbased_objects_<language>.ttl.bz2`` is scanned and, for every
       collected place that has a ``country`` attribute pointing to at least one
       collected country, that country is recorded.

    The mapping is written as gzip-compressed JSON to
    ``<target_folder>/countries_<language>.json.gz``.

    Args:
        language: language edition code used to build the dump file names.

    Returns:
        None. The result is written to disk; progress counts are printed.

    Note:
        Relies on the notebook-level names ``data_folder``, ``target_folder``
        and ``place_classes``. Entities yielded by ``iter_entities_from`` are
        presumably dicts whose property values are sets -- TODO confirm against
        ``dbpedia_utils``.
    """
    instance_types = '{1}/instance_types_{0}.ttl.bz2'.format(language, data_folder)
    properties = '{1}/mappingbased_objects_{0}.ttl.bz2'.format(language, data_folder)

    print(instance_types)
    country_names = set()
    places = set()

    # First pass: find out which resources are countries and which are places.
    for ent in iter_entities_from(instance_types):
        types = ent['22-rdf-syntax-ns#type']

        if 'http://dbpedia.org/ontology/Country' in types:
            country_names.add(ent['resource'])

        if place_classes.intersection(types):
            places.add(ent['resource'])

    print(len(country_names), len(places))

    countries = {}

    # Second pass: look up the country attribute of every known place.
    for ent in iter_entities_from(properties):
        resource = ent['resource']

        if resource not in places or 'country' not in ent:
            continue

        # Keep only the values that are known country URIs.
        candidates = ent['country'] & country_names
        if not candidates:
            # doesn't have a valid country URI attribute
            continue

        # A place may list several valid countries. Taking the smallest URI
        # (instead of an arbitrary set element) makes the output reproducible
        # across runs, since set iteration order over strings is not stable.
        countries[resource] = min(candidates)

    print(len(countries))

    with gzip.open('{0}/countries_{1}.json.gz'.format(target_folder, language), 'wt') as f:
        json.dump(countries, f)

In [ ]:
# Build the dictionary only for the languages whose output file is not there yet.
for lang in languages:
    output_path = '{0}/countries_{1}.json.gz'.format(target_folder, lang)

    if os.path.exists(output_path):
        print(lang, 'already exists')
        continue

    create_country_dictionary(lang)

In [ ]: