Consolidate DBpedia Person Data

In this notebook we consolidate the person data from all DBpedia editions into a single file by removing duplicate records.

By Eduardo Graells-Garrido.


In [ ]:
from __future__ import print_function, unicode_literals

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import dbpedia_config

sns.set_context('poster', font_scale=0.8)
%matplotlib inline

In [ ]:
target_folder = dbpedia_config.TARGET_FOLDER
languages = dbpedia_config.LANGUAGES

In [ ]:
filenames = ['{0}/person_data_{1}.csv.gz'.format(target_folder, lang) for lang in languages]
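
The resulting paths follow the pattern target_folder/person_data_language.csv.gz. A quick way to eyeball them (a trivial check, assuming the per-edition extraction notebooks already produced these files):

In [ ]:
filenames[:3]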

Person DataFrame

We will create a single DataFrame with the biographies from all editions, tagging each row with its source language.


In [ ]:
all_bios = None
for lang, filename in zip(languages, filenames):
    this_edition = pd.read_csv(filename, encoding='utf-8')
    this_edition['language'] = lang
    
    if all_bios is None:
        all_bios = this_edition
    else:
        all_bios = pd.concat([all_bios, this_edition])
        
all_bios.language.value_counts()
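
Before aggregating, it helps to confirm that the merge produced every column used below (a small sanity check; these are the column names referenced later in this notebook):

In [ ]:
expected = {'label', 'gender', 'edition_count', 'available_english',
            'same_as', 'wikidata_entity', 'language'}
print('missing columns:', expected - set(all_bios.columns) or 'none')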

In [ ]:
def w_fraction(arr):
    """Fraction of entries in `arr` labeled 'female'."""
    return np.sum(arr == 'female') / float(len(arr))

col_labels = all_bios.groupby('language').aggregate(
    {'edition_count': lambda x: len(x), 'gender': w_fraction, 'available_english': np.mean}
    ).sort_values('edition_count', ascending=False)

# Per-language median and mean edition counts, split by gender.
col_labels['female_median_count'] = [all_bios[(all_bios.language == idx)
                                              & (all_bios.gender == 'female')].edition_count.median()
                                     for idx in col_labels.index]
col_labels['male_median_count'] = [all_bios[(all_bios.language == idx)
                                            & (all_bios.gender == 'male')].edition_count.median()
                                   for idx in col_labels.index]

col_labels['female_mean_count'] = [all_bios[(all_bios.language == idx)
                                            & (all_bios.gender == 'female')].edition_count.mean()
                                   for idx in col_labels.index]
col_labels['male_mean_count'] = [all_bios[(all_bios.language == idx)
                                          & (all_bios.gender == 'male')].edition_count.mean()
                                 for idx in col_labels.index]

col_labels
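
As a cross-check, the same per-gender medians can be computed in one pass with a single groupby (a sketch that should match the female_median_count and male_median_count columns above):

In [ ]:
median_check = all_bios.groupby(['language', 'gender']).edition_count.median().unstack()
median_check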

Consolidation

Let's remove duplicates and build a single dataset covering all languages. Each drop_duplicates call keeps the first occurrence, so the order of languages determines which edition's record survives.


In [ ]:
# Deduplicate on cross-language links, then Wikidata IDs, then labels.
all_bios.drop_duplicates(subset=['same_as'], inplace=True)
all_bios.drop_duplicates(subset=['wikidata_entity'], inplace=True)
all_bios.drop_duplicates(subset=['label'], inplace=True)
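
Note that drop_duplicates treats missing key values as equal to one another, so if several rows lack a same_as or wikidata_entity value, all but one of them are dropped as well. If that is undesirable, a sketch of a variant (a hypothetical helper, not part of the original pipeline) that deduplicates only the rows with a non-null key:

In [ ]:
def drop_duplicates_keep_nulls(df, column):
    """Deduplicate on `column`, but keep every row whose key is missing."""
    has_key = df[column].notnull()
    return pd.concat([df[has_key].drop_duplicates(subset=[column]), df[~has_key]])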

In [ ]:
all_bios.gender.value_counts()

In [ ]:
# Keep only biographies with a recorded binary gender value.
all_bios = all_bios[all_bios.gender.isin(['male', 'female'])].copy()

In [ ]:
print(all_bios.shape)

In [ ]:
all_bios.gender.value_counts()

In [ ]:
all_bios.sample(n=5)

In [ ]:
# compression='gzip' lets pandas write the gzipped CSV directly; writing text
# through gzip.open(..., 'wb') breaks on Python 3.
all_bios.to_csv('{0}/consolidated_person_data.csv.gz'.format(target_folder),
                encoding='utf-8', compression='gzip')
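
To verify the export, we can read the file back; passing compression='gzip' explicitly keeps this working on older pandas versions (a quick round-trip check):

In [ ]:
consolidated = pd.read_csv('{0}/consolidated_person_data.csv.gz'.format(target_folder),
                           compression='gzip', encoding='utf-8')
print(consolidated.shape)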
