In [ ]:
from __future__ import print_function, unicode_literals
import pandas as pd
import gzip
import csv
import regex as re
import json
import time
import datetime
import requests
import os
import dbpedia_config
from collections import Counter, defaultdict
from cytoolz import partition_all
from dbpedia_utils import iter_entities_from, get_date
In [ ]:
data_folder = dbpedia_config.DATA_FOLDER
target_folder = dbpedia_config.TARGET_FOLDER
your_email = dbpedia_config.YOUR_EMAIL
query_wikidata = dbpedia_config.QUERY_WIKIDATA_GENDER
languages = dbpedia_config.LANGUAGES
We need to know the subclasses of Person in the DBpedia ontology. We use rdflib and networkx to find them.
In [ ]:
import bz2file
import networkx as nx
import rdflib.graph as rdf
In [ ]:
with open('{0}/dbpedia.owl'.format(data_folder), 'r') as f:
    ontology = rdf.Graph().parse(f)
In [ ]:
ontology_graph = nx.DiGraph()
In [ ]:
for s, p, o in ontology:
    src = str(s)
    attr = str(p)
    dst = str(o)
    # print(s, p, o)
    if attr == 'http://www.w3.org/2000/01/rdf-schema#subClassOf':
        ontology_graph.add_edge(dst, src)
In [ ]:
ontology_graph.number_of_nodes()
In [ ]:
person_classes = set(nx.neighbors(ontology_graph, 'http://dbpedia.org/ontology/Person'))
len(person_classes)
In [ ]:
person_classes
There are a variety of classes in the ontology, but we do not need that much detail. We build a dictionary that maps each class to its ancestor that is a direct child of Person, so that every entity can be assigned to one of these top-level classes.
In [ ]:
class_parents = {}
for level_1 in person_classes:
    for descendant in nx.descendants(ontology_graph, level_1):
        class_parents[descendant] = level_1
    class_parents[level_1] = level_1
# to avoid querying another dictionary/set
class_parents['http://dbpedia.org/ontology/Person'] = 'http://dbpedia.org/ontology/Person'
len(class_parents)
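With this mapping, any specific person class collapses to its direct child of Person. A minimal illustrative lookup, assuming the ontology contains the usual SoccerPlayer and Athlete hierarchy:
In [ ]:
# hypothetical example: SoccerPlayer descends from Athlete, a direct child of Person
class_parents.get('http://dbpedia.org/ontology/SoccerPlayer')
# expected: 'http://dbpedia.org/ontology/Athlete' (if that hierarchy is present in dbpedia.owl)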
These are the Wikidata identifiers for the different gender values, along with the API URL and the headers we send when querying for entity genders.
In [ ]:
value_dict = {6581097: 'male', 6581072: 'female', 1052281: 'transgender female', 2449503: 'transgender male'}
wikidata_api_url = 'http://www.wikidata.org/w/api.php?action=wbgetentities&ids={0}&format=json&props=claims'
headers = {'user-agent': 'gender-research-crawler/0.0.1 (contact: {0})'.format(your_email)}
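As a sanity check, the URL template takes a pipe-separated batch of entity identifiers; a sketch with illustrative ids:
In [ ]:
# illustrative only: build the request URL for a hypothetical batch of two entities
wikidata_api_url.format('|'.join(['Q42', 'Q7259']))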
Recall that we have two extra sources of gender information. Here we load them so we can look up the gender of specific biographies.
In [ ]:
gender_by_dbpedia_uri = defaultdict(lambda: None)
for ent in iter_entities_from('{0}/wiki.genders.txt'.format(data_folder)):
    gender_by_dbpedia_uri[ent['resource']] = ent['gender'].pop()
len(gender_by_dbpedia_uri)
In [ ]:
for ent in iter_entities_from('{0}/genders_en.nt.bz2'.format(data_folder)):
    if ent['resource'] not in gender_by_dbpedia_uri:
        gender_by_dbpedia_uri[ent['resource']] = ent['gender'].pop()
len(gender_by_dbpedia_uri)
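A lookup is then a plain dictionary access by DBpedia URI; since this is a defaultdict, unknown resources yield None. A hypothetical example:
In [ ]:
# hypothetical lookup: returns e.g. 'male' if the URI appears in the source files, otherwise None
gender_by_dbpedia_uri['http://dbpedia.org/resource/Paul_Otlet']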
If this notebook has been run before, we reuse the cached Wikidata genders to avoid querying the API more than necessary.
In [ ]:
wikidata_gender = defaultdict(lambda: None)
if os.path.exists('{0}/wikidata_entity_gender.json'.format(target_folder)):
    with open('{0}/wikidata_entity_gender.json'.format(target_folder), 'r') as f:
        wikidata_gender.update(json.load(f))
len(wikidata_gender)
In [ ]:
# to avoid multiple queries of the same entity
no_gender_available = set()
In [ ]:
def get_entity_gender(req_json, entity_id):
"""
Given a JSON structure from Wikidata, get the gender of the specified entity.
"""
try:
ent_value = req_json['entities'][entity_id]['claims']['P21'][0]['mainsnak']['datavalue']['value']['numeric-id']
return value_dict[ent_value]
except KeyError:
return None
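The nested lookup mirrors the structure of the wbgetentities response. A minimal sketch of the fragment we rely on, with an illustrative entity id and value:
In [ ]:
# illustrative response fragment: the P21 (sex or gender) claim carries a numeric-id
example_response = {'entities': {'Q42': {'claims': {'P21': [
    {'mainsnak': {'datavalue': {'value': {'numeric-id': 6581097}}}}]}}}
get_entity_gender(example_response, 'Q42')  # expected: 'male'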
The DBpedia data includes links between the different editions of each article. This function determines whether such a link points to another language edition or to Wikidata.
In [ ]:
dbpedia_url = re.compile(r'http://(.+)\.dbpedia.org/*+')
wikidata_url = re.compile(r'http://www.wikidata.org/entity/(.+)|http://wikidata.org/entity/(.+)')
discarded_editions = {'simple', 'commons', 'wikidata'}
def get_edition(url):
    edition = None
    wikidata_id = None
    if url.startswith('http://dbpedia.org/'):
        edition = 'en'
    else:
        dbp_prefix = dbpedia_url.match(url)
        if dbp_prefix:
            prefix = dbp_prefix.groups()[0]
            if prefix not in discarded_editions:
                edition = prefix
        else:
            wikidata = wikidata_url.match(url)
            if wikidata:
                if wikidata.groups()[0]:
                    wikidata_id = wikidata.groups()[0]
                else:
                    wikidata_id = wikidata.groups()[1]
    return edition, wikidata_id
get_edition('http://mg.dbpedia.org/resource/Paul_Otlet')
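A couple of additional illustrative calls, assuming the URL patterns defined above:
In [ ]:
# a Wikidata entity URL yields no edition but a Wikidata id
get_edition('http://www.wikidata.org/entity/Q42')  # expected: (None, 'Q42')
# the English edition has no language prefix
get_edition('http://dbpedia.org/resource/Paul_Otlet')  # expected: ('en', None)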
This is the function that creates a CSV file containing biographical metadata.
In [ ]:
def generate_person_data(language, query_wikidata=True, skip_labels=False, skip_countries=False, include_without_gender=False):
"""
Creates a csv file with person data from the specified language edition.
If query_wikidata is true, entities not found in our gender dictionaries will be queried in Wikidata.
"""
# indexed by URI
person_uris = {}
person_ontologies = {}
person_birth = defaultdict(lambda: None)
person_birth_place = defaultdict(lambda: None)
person_death = defaultdict(lambda: None)
person_death_place = defaultdict(lambda: None)
person_gender = defaultdict(lambda: None)
person_editions = defaultdict(lambda: list([language]))
person_labels = defaultdict(lambda: None)
person_alternate_uri = defaultdict(lambda: None)
country_dict = defaultdict(lambda: None)
countries = set()
if not skip_countries:
try:
print('{0}/countries_{1}.json.gz'.format(target_folder, language))
with gzip.open('{0}/countries_{1}.json.gz'.format(target_folder, language), 'rt') as f:
country_dict.update(json.load(f))
countries.update(country_dict.values())
print('# toponyms', len(country_dict))
except Exception as err:
print('error loading countries', err)
skip_countries = True
def get_country(toponyms):
for t in toponyms:
if t in countries:
return t
if t in country_dict:
return country_dict[t]
return None
instance_types = '{1}/instance_types_{0}.ttl.bz2'.format(language, data_folder)
interlanguage_links = '{1}/interlanguage_links_{0}.ttl.bz2'.format(language, data_folder)
labels = '{1}/labels_{0}.ttl.bz2'.format(language, data_folder)
object_properties = '{1}/mappingbased_objects_{0}.ttl.bz2'.format(language, data_folder)
literal_properties = '{1}/mappingbased_literals_{0}.ttl.bz2'.format(language, data_folder)
    for i, ent in enumerate(iter_entities_from(instance_types)):
        ent_class = ent['22-rdf-syntax-ns#type'].pop()
        if ent_class in class_parents:
            person_uris[ent['resource']] = class_parents[ent_class]
    print('# persons', len(person_uris))
    entity_wikidata = defaultdict(lambda: None)
    entity_uri = defaultdict(lambda: None)
    without_gender = []
    for i, ent in enumerate(iter_entities_from(literal_properties)):
        resource = ent['resource']
        if resource not in person_uris:
            continue
        if 'birthDate' in ent:
            birth_year = get_date(ent, 'birthDate')
            if birth_year is not None:
                person_birth[resource] = birth_year.year
        if 'deathDate' in ent:
            death_year = get_date(ent, 'deathDate')
            if death_year is not None:
                person_death[resource] = death_year.year
    if not skip_countries:
        for i, ent in enumerate(iter_entities_from(object_properties)):
            resource = ent['resource']
            if resource not in person_uris:
                continue
            if 'birthPlace' in ent:
                place = get_country(ent['birthPlace'])
                if place is not None:
                    person_birth_place[resource] = place
            if 'deathPlace' in ent:
                place = get_country(ent['deathPlace'])
                if place is not None:
                    person_death_place[resource] = place
    if not skip_labels:
        for i, ent in enumerate(iter_entities_from(labels)):
            resource = ent['resource']
            if resource not in person_uris:
                continue
            if ent['rdf-schema#label']:
                person_labels[resource] = ent['rdf-schema#label'].pop()
    for i, ent in enumerate(iter_entities_from(interlanguage_links)):
        resource = ent['resource']
        if resource not in person_uris:
            continue
        this_entity_editions = set()
        this_entity_wikidata = None
        alt_url = None
        for url in ent['owl#sameAs']:
            edition, wikidata_id = get_edition(url)
            if edition is not None:
                this_entity_editions.add(edition)
                if edition == 'en':
                    alt_url = url
            elif wikidata_id is not None:
                this_entity_wikidata = wikidata_id
        if alt_url is None:
            alt_url = ent['owl#sameAs'].pop()
        person_alternate_uri[resource] = alt_url
        person_editions[resource].extend(this_entity_editions)
        if this_entity_wikidata:
            entity_wikidata[resource] = this_entity_wikidata
            entity_uri[this_entity_wikidata] = resource
    for ent_uri, ent_id in entity_wikidata.items():
        if ent_uri in person_gender:
            continue
        # do we know the URI?
        if ent_uri in gender_by_dbpedia_uri:
            person_gender[ent_uri] = gender_by_dbpedia_uri[ent_uri]
            continue
        # perhaps using same as...
        if person_alternate_uri[ent_uri] is not None:
            alt_uri = person_alternate_uri[ent_uri]
            if alt_uri in gender_by_dbpedia_uri:
                person_gender[ent_uri] = gender_by_dbpedia_uri[alt_uri]
                continue
        # have we seen it on wikidata?
        if ent_id in wikidata_gender:
            person_gender[ent_uri] = wikidata_gender[ent_id]
        elif ent_id not in no_gender_available:
            without_gender.append(ent_id)
    print('without gender', len(without_gender))
    if query_wikidata:
        for ids in partition_all(50, without_gender):
            try:
                req = requests.get(wikidata_api_url.format(u'|'.join(ids)), headers=headers)
                req_json = req.json()
            except Exception as ex:
                print(ex)
                time.sleep(1)
                continue
            for i, ent_id in enumerate(ids):
                ent_gender = get_entity_gender(req_json, ent_id)
                if ent_gender is None:
                    no_gender_available.add(ent_id)
                else:
                    person_gender[entity_uri[ent_id]] = ent_gender
                    wikidata_gender[ent_id] = ent_gender
    stats = dict(Counter(person_gender.values()))
    stats['total_biographies'] = len(person_uris)
    stats['language'] = language
    stats['wikidata_entities'] = len(entity_wikidata)
    stats['with_gender'] = len(person_gender)
    with open('{1}/person_stats_{0}.json'.format(language, target_folder), 'w') as f:
        json.dump(stats, f)
    print(stats)
    with gzip.open('{1}/person_data_{0}.csv.gz'.format(language, target_folder), 'wt') as f:
        fields = ['uri', 'wikidata_entity', 'class', 'gender', 'edition_count', 'available_english', 'available_editions',
                  'birth_year', 'death_year', 'birth_place', 'death_place', 'same_as', 'label']
        writer = csv.DictWriter(f, fieldnames=fields)
        writer.writeheader()
        for resource in person_uris.keys():
            ent_gender = person_gender[resource]
            if ent_gender is None and not include_without_gender:
                continue
            writer.writerow({
                'wikidata_entity': entity_wikidata[resource],
                'uri': resource,
                'label': person_labels[resource] if person_labels[resource] else None,
                'gender': ent_gender,
                'available_english': 'en' in person_editions[resource],
                'edition_count': len(person_editions[resource]),
                'available_editions': u'|'.join(person_editions[resource]),
                'birth_year': person_birth[resource],
                'death_year': person_death[resource],
                'birth_place': person_birth_place[resource],
                'death_place': person_death_place[resource],
                'class': person_uris[resource],
                'same_as': person_alternate_uri[resource] if person_alternate_uri[resource] else None
            })
    return stats
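For a quick test of a single edition without hitting the Wikidata API or the country files, the optional flags can be combined as in this sketch (the language code is just an example):
In [ ]:
# illustrative quick run; uncomment to execute
# generate_person_data('en', query_wikidata=False, skip_countries=True, skip_labels=True)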
In [ ]:
len(wikidata_gender)
In [ ]:
records = []
for lang in languages:
    print(lang)
    records.append(generate_person_data(lang, query_wikidata=query_wikidata))
In [ ]:
records
We save the Wikidata gender cache so that this data can be reused in future runs.
In [ ]:
with open('{0}/wikidata_entity_gender.json'.format(target_folder), 'w') as f:
    json.dump(dict(wikidata_gender), f)
In [ ]: