By Eduardo Graells-Garrido.
In this notebook we generate a dictionary of {place_uri => country_uri}, so that other notebooks can determine the country of origin of a biography without having to parse the entire database again looking for this information.
The result is one dictionary per language. Equivalences between languages have not been resolved yet.
In [ ]:
from __future__ import print_function, unicode_literals
import gzip
import dbpedia_config
import ujson as json
import os
import networkx as nx
import rdflib.graph as rdf
from dbpedia_utils import iter_entities_from
In [ ]:
# Project-wide settings: where the DBpedia dumps are read from, where the
# generated dictionaries are written, and which language editions to process.
data_folder, target_folder, languages = (
    dbpedia_config.DATA_FOLDER,
    dbpedia_config.TARGET_FOLDER,
    dbpedia_config.LANGUAGES,
)
In [ ]:
# Load the DBpedia ontology into an rdflib graph.
#
# The format is stated explicitly: a bare file object gives rdflib no file
# extension to guess from, and the format it falls back to is not the same in
# every rdflib release. NOTE(review): this assumes dbpedia.owl is serialized as
# RDF/XML, which is how DBpedia distributes its ontology -- confirm for the
# local copy.
#
# The file is opened in binary mode so that the XML parser itself honours the
# encoding declared inside the document instead of the platform default.
with open('{0}/dbpedia.owl'.format(data_folder), 'rb') as f:
    ontology = rdf.Graph().parse(f, format='xml')
In [ ]:
# Directed graph of the ontology's class hierarchy. For every rdfs:subClassOf
# triple an edge is added from the object (the superclass) to the subject (the
# subclass), so following edges from a class leads to its subclasses.
ontology_graph = nx.DiGraph()
ontology_graph.add_edges_from(
    (str(superclass), str(subclass))
    for subclass, predicate, superclass in ontology
    if str(predicate) == 'http://www.w3.org/2000/01/rdf-schema#subClassOf'
)
In [ ]:
# Every class reachable from PopulatedPlace in the hierarchy graph, together
# with PopulatedPlace itself (nx.descendants does not include the source node).
place_classes = {'http://dbpedia.org/ontology/PopulatedPlace'} | nx.descendants(
    ontology_graph, 'http://dbpedia.org/ontology/PopulatedPlace'
)
# Bare expression so the notebook displays the resulting set.
place_classes
In [ ]:
def create_country_dictionary(language):
    """Build and save the {place URI: country URI} mapping for one language.

    Two passes are made over the DBpedia dumps found in ``data_folder``:

    1. ``instance_types_<language>.ttl.bz2`` is scanned to collect the
       resources typed as ``Country`` and the resources whose types include
       any class in the module-level ``place_classes`` set.
    2. ``mappingbased_objects_<language>.ttl.bz2`` is scanned; for each
       collected place that has a ``country`` attribute containing at least
       one known country resource, that country is recorded.

    The mapping is written as JSON to
    ``<target_folder>/countries_<language>.json.gz``. Progress counts are
    printed along the way. Nothing is returned.

    When a place lists several known countries, the lexicographically
    smallest URI is kept so that repeated runs produce the same file (taking
    the first element of a set would depend on set iteration order).

    Args:
        language: language edition code used to build the dump file names
            and the output file name.

    NOTE(review): assumes ``iter_entities_from`` yields mappings with a
    ``'resource'`` key and set-valued attributes -- confirm against
    ``dbpedia_utils``.
    """
    instance_types = '{1}/instance_types_{0}.ttl.bz2'.format(language, data_folder)
    properties = '{1}/mappingbased_objects_{0}.ttl.bz2'.format(language, data_folder)
    print(instance_types)

    country_names = set()
    places = set()

    # First pass: classify resources by their rdf:type values.
    for ent in iter_entities_from(instance_types):
        types = ent['22-rdf-syntax-ns#type']

        if 'http://dbpedia.org/ontology/Country' in types:
            country_names.add(ent['resource'])

        if place_classes.intersection(types):
            places.add(ent['resource'])

    print(len(country_names), len(places))

    countries = {}

    # Second pass: attach a country to every known place that declares one.
    for ent in iter_entities_from(properties):
        resource = ent['resource']

        if resource not in places or 'country' not in ent:
            continue

        # Keep only values that are themselves known country resources.
        candidates = ent['country'] & country_names
        if not candidates:
            # doesn't have a valid country URI attribute
            continue

        # Deterministic choice when more than one country matches.
        countries[resource] = min(candidates)

    print(len(countries))

    with gzip.open('{0}/countries_{1}.json.gz'.format(target_folder, language), 'wt') as f:
        json.dump(countries, f)
In [ ]:
# Generate the dictionary for each configured language, skipping any language
# whose output file is already present in the target folder.
for lang in languages:
    output_path = '{0}/countries_{1}.json.gz'.format(target_folder, lang)
    if os.path.exists(output_path):
        print(lang, 'already exists')
    else:
        create_country_dictionary(lang)
In [ ]: