In [1]:
import pandas as pd
from os import path
%pylab inline
#plt.style.use('fivethirtyeight')
plt.style.use('ggplot')
import matplotlib as mpl
mpl.rc("savefig", dpi=300)


Populating the interactive namespace from numpy and matplotlib

In [2]:
snapshot_dir = '/media/notconfusing/9d9b45fc-55f7-428c-a228-1c4c4a1b728c/home/maximilianklein/snapshot_data/'

In [3]:
lssnaps = !ls $snapshot_dir

In [4]:
snapdirs = [snap for snap in lssnaps if snap[0].isdigit()]

In [5]:
all_genders = set()
#find the genders
for snapdir in snapdirs:
    sitelinkspath = path.join(snapshot_dir,snapdir,'property_indexes','site_linkss-index.csv')
    sldf = pd.read_csv(sitelinkspath).fillna(0)
    snap_genders = sldf.columns
    for g in snap_genders: 
        if g.endswith('1'):
            print(sitelinkspath)
        all_genders.add(g)

In [6]:
#populate a dict with numbers:
longitudinal_totals = {}
for snapdir in snapdirs:
    sitelinkspath = path.join(snapshot_dir,snapdir,'property_indexes','site_linkss-index.csv')
    sldf = pd.read_csv(sitelinkspath,index_col=0).fillna(0)
    snap_genders = sldf.columns
    gendersum = sldf.sum()
    longitudinal_totals[snapdir] = gendersum

ldf = pd.DataFrame.from_dict(longitudinal_totals,orient='index').fillna(0)

In [7]:
print(len(ldf.columns))


37

In [8]:
ldf.index = pd.to_datetime(ldf.index)

In [9]:
ldf['total'] = ldf.sum(axis=1)

In [10]:
ldf['total']


Out[10]:
2014-09-17    5869606
2014-10-13    6589799
2015-07-28    6589799
2015-08-03    6589799
2015-08-09    6609617
2015-08-12    6617934
2015-08-16    6617934
2015-08-21    6630052
2015-08-23    6630052
2015-09-06    6656076
2015-09-13    6680340
2015-09-20    6698141
2015-09-27    6714732
2015-10-13    6763390
2015-10-19    6763390
2015-10-26    6807585
2015-11-02    6824561
2015-11-09    6851235
2015-11-21    6876328
2015-11-22    6876328
2015-11-23    6901147
2015-12-04    6929623
2015-12-21    6987464
2016-01-03    6999542
Name: total, dtype: float64

In [11]:
ldf['ungendered'] = ldf['nan'] / ldf['total']

In [12]:
ldf['gender'] = 1 - ldf['ungendered']

In [13]:
ldf['total'].plot()


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f83ba6d8860>

In [14]:
ldf['total']


Out[14]:
2014-09-17    5869606
2014-10-13    6589799
2015-07-28    6589799
2015-08-03    6589799
2015-08-09    6609617
2015-08-12    6617934
2015-08-16    6617934
2015-08-21    6630052
2015-08-23    6630052
2015-09-06    6656076
2015-09-13    6680340
2015-09-20    6698141
2015-09-27    6714732
2015-10-13    6763390
2015-10-19    6763390
2015-10-26    6807585
2015-11-02    6824561
2015-11-09    6851235
2015-11-21    6876328
2015-11-22    6876328
2015-11-23    6901147
2015-12-04    6929623
2015-12-21    6987464
2016-01-03    6999542
Name: total, dtype: float64

In [15]:
r_total = pd.rolling_mean(ldf['total'], 2, 1)

In [16]:
ax = r_total.plot(title="Total Humans in Wikidata Over Time")
ax.get_yaxis().set_major_formatter(
    matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
plt.ylabel('Number of humans')
plt.xlabel('Wikidata snapshot')


Out[16]:
<matplotlib.text.Text at 0x7f83b96af4e0>

In [17]:
r_gendered = pd.rolling_mean(ldf['gender'], 2, 1)
r_gendered.plot(secondary_y=True, title="% of Humans with Gender in Wikidata")


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f83b86591d0>

In [ ]:


In [69]:
#populate a dict with numbers:
longitudinal_countrys = {}
longitudinal_citizenships = {}
longitudinal_egs = {}
longitudinal_sites = {}

dict_csv_map = { 'place_of_births-index.csv': longitudinal_countrys,
'citizenships-index.csv' : longitudinal_citizenships,
'ethnic_groups-index.csv': longitudinal_egs,
'site_linkss-index.csv': longitudinal_sites}

for snapdir in snapdirs:
    for csvtype, longitudinal_dict in dict_csv_map.items():
        sitelinkspath = path.join(snapshot_dir,snapdir,'property_indexes',csvtype)
        cdf = pd.read_csv(sitelinkspath,index_col=0).fillna(0)
        csum = cdf.sum()
        total = csum.sum()
        nothaving = cdf.ix[0].sum()
        having = total - nothaving
        if snapdir == '2016-01-03':
            print(csvtype, having)
        having_per = having / float(total)
        longitudinal_dict[snapdir] = having_per


2016-01-03 ethnic_groups-index.csv 16881.0
2016-01-03 site_linkss-index.csv 6869792.0
2016-01-03 citizenships-index.csv 1802041.0
2016-01-03 place_of_births-index.csv 927166.0

In [70]:
longitudinal_dobs = {}
longitudinal_dods= {}
longitudinal_occ = {}
longitudinal_fow = {}
dict_csv_map = { 'dod-index.csv': longitudinal_dods,
'dob-index.csv' : longitudinal_dobs,
               'field_of_work-index.csv': longitudinal_fow,
               'occupation-index.csv': longitudinal_occ}

for snapdir in snapdirs:
    bigfname = 'gender-index-data-{}.csv'.format(snapdir)
    bigdf = pd.read_csv(path.join(snapshot_dir,snapdir,bigfname))
    humans =  len(bigdf)

    for csvtype, longitudinal_dict in dict_csv_map.items():
        sitelinkspath = path.join(snapshot_dir,snapdir,'property_indexes',csvtype)
        try:
            cdf = pd.read_csv(sitelinkspath,index_col=0).fillna(0)
            csum = cdf.sum()
            total = csum.sum()
        except IOError:
            total = 0
        if snapdir == '2016-01-03':
            print(csvtype, total)
        having_per = total/ float(humans)
        longitudinal_dict[snapdir] = having_per


field_of_work-index.csv 7812.0
occupation-index.csv 1782850.0
dob-index.csv 2177738.0
dod-index.csv 1097562.0
/usr/lib/python3/dist-packages/pandas/io/parsers.py:1150: DtypeWarning: Columns (4,7) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)

In [73]:
java_min_int = -2147483648
bigdf = pd.read_csv(path.join(snapshot_dir,'2016-01-03',bigfname),na_values=[java_min_int])

In [84]:
bigdf['dob'].apply(lambda x: not math.isnan(x)).sum()


Out[84]:
2177602

In [20]:
dobdf = pd.DataFrame.from_dict(longitudinal_dobs,orient='index').fillna(0)
dobdf.columns = ['date of birth']
doddf = pd.DataFrame.from_dict(longitudinal_dods,orient='index').fillna(0)
doddf.columns = ['date of death']
citdf = pd.DataFrame.from_dict(longitudinal_citizenships,orient='index').fillna(0)
citdf.columns = ['citizenship']
countrydf = pd.DataFrame.from_dict(longitudinal_countrys,orient='index').fillna(0)
countrydf.columns = ['place of birth']
egdf = pd.DataFrame.from_dict(longitudinal_egs,orient='index').fillna(0)
egdf.columns = ['ethnic group']
fowdf = pd.DataFrame.from_dict(longitudinal_fow,orient='index').fillna(0)
fowdf.columns = ['field of work']
occdf = pd.DataFrame.from_dict(longitudinal_occ,orient='index').fillna(0)
occdf.columns = ['occupation']
sitedf = pd.DataFrame.from_dict(longitudinal_sites,orient='index').fillna(0)
sitedf.columns = ['at least 1 site link']

In [21]:
aggdf = dobdf.join(doddf).join(citdf).join(countrydf).join(egdf).join(sitedf).join(ldf['gender']).join(fowdf).join(occdf)

In [22]:
aggdf.index = pd.to_datetime(aggdf.index)

In [23]:
aggdf.head(24)


Out[23]:
date of birth date of death citizenship place of birth ethnic group at least 1 site link gender field of work occupation
2015-11-21 0.709272 0.360168 0.575037 0.292995 0.004189 0.986067 0.969182 0.002574 0.567623
2015-08-03 0.667751 0.337255 0.560040 0.266263 0.003391 0.989771 0.971760 0.000000 0.000000
2015-08-09 0.669641 0.338769 0.560887 0.266321 0.003396 0.989634 0.971606 0.002548 0.557724
2015-08-16 0.669993 0.339033 0.560946 0.266382 0.003559 0.989620 0.971775 0.002551 0.558631
2015-12-04 0.713121 0.360298 0.575444 0.291580 0.005109 0.982407 0.966120 0.002559 0.568582
2015-11-22 0.709272 0.360168 0.575037 0.292995 0.004189 0.986067 0.969182 0.002574 0.567623
2015-09-13 0.674378 0.343581 0.571644 0.284418 0.003554 0.989358 0.970622 0.002555 0.563043
2015-11-23 0.712787 0.361195 0.574745 0.292245 0.004536 0.984862 0.967960 0.002569 0.571086
2015-12-21 0.716232 0.361078 0.581477 0.303596 0.005413 0.981570 0.965295 0.002571 0.582688
2016-01-03 0.717249 0.361488 0.582185 0.305102 0.005559 0.981463 0.965416 0.002573 0.587191
2014-10-13 0.740799 0.374148 0.560040 0.266263 0.003391 0.989771 0.971760 0.000000 0.000000
2015-11-02 0.700751 0.357170 0.579068 0.294258 0.003605 0.988327 0.972842 0.002568 0.566348
2015-09-27 0.673458 0.343068 0.577288 0.283896 0.003558 0.989244 0.972117 0.002553 0.562528
2015-10-13 0.675539 0.347995 0.578089 0.291084 0.003584 0.989097 0.973522 0.002561 0.559922
2015-07-28 0.667751 0.337255 0.560040 0.266263 0.003391 0.989771 0.971760 0.000000 0.000000
2014-09-17 0.575736 0.285827 0.428217 0.240124 0.003109 0.996164 0.952879 0.000000 0.000000
2015-08-23 0.673135 0.340938 0.561927 0.276930 0.003561 0.989528 0.971867 0.002556 0.563302
2015-09-06 0.675439 0.343853 0.572870 0.283842 0.003567 0.989411 0.971435 0.002561 0.565040
2015-09-20 0.673881 0.343459 0.575933 0.284116 0.003569 0.989312 0.971570 0.002552 0.562254
2015-10-26 0.695236 0.354858 0.579960 0.294588 0.003597 0.988819 0.973161 0.002568 0.564161
2015-08-12 0.669993 0.339033 0.560946 0.266382 0.003559 0.989620 0.971775 0.002551 0.558631
2015-10-19 0.675539 0.347995 0.578089 0.291084 0.003584 0.989097 0.973522 0.002561 0.559922
2015-11-09 0.702570 0.358822 0.576679 0.293571 0.004148 0.988171 0.971133 0.002571 0.564788
2015-08-21 0.673135 0.340938 0.561927 0.276930 0.003561 0.989528 0.971867 0.002556 0.563302

In [24]:
aggdf.ix[[6,5],['gender','date of birth', 'date of death','citizenship','place of birth','ethnic group', 'at least 1 site link']].T


Out[24]:
2015-09-13 00:00:00 2015-11-22 00:00:00
gender 0.970622 0.969182
date of birth 0.674378 0.709272
date of death 0.343581 0.360168
citizenship 0.571644 0.575037
place of birth 0.284418 0.292995
ethnic group 0.003554 0.004189
at least 1 site link 0.989358 0.986067

In [25]:
print(aggdf.ix[[6,5],['gender','date of birth', 'date of death','citizenship','place of birth','ethnic group','field of work','occupation', 'at least 1 site link']].T.to_latex(float_format=lambda x: "{0:.1f}%".format(x*100)))


\begin{tabular}{lrr}
\toprule
{} &  2015-09-13 &  2015-11-22 \\
\midrule
gender               &       97.1\% &       96.9\% \\
date of birth        &       67.4\% &       70.9\% \\
date of death        &       34.4\% &       36.0\% \\
citizenship          &       57.2\% &       57.5\% \\
place of birth       &       28.4\% &       29.3\% \\
ethnic group         &        0.4\% &        0.4\% \\
field of work        &        0.3\% &        0.3\% \\
occupation           &       56.3\% &       56.8\% \\
at least 1 site link &       98.9\% &       98.6\% \\
\bottomrule
\end{tabular}


In [26]:
r_aggdf = pd.rolling_mean(aggdf, 2,1)
ax = aggdf.plot( figsize = (5,5), colormap=plt.cm.jet, linewidth=3)
title("Coverage of Accompanying Properties Over Time", y = 1.08,size=18)
legend(bbox_to_anchor=(1.6,0.75))
vals = ax.get_yticks()
ax.set_yticklabels(['{:3.0f}%'.format(x*100) for x in vals])
plt.ylabel('Ratio of biographies with property')
plt.xlabel('Wikidata snapshot')


Out[26]:
<matplotlib.text.Text at 0x7f83b11ede10>

FRB over time


In [27]:
ldf['female%'] = ldf['female'] / ldf['total']

In [28]:
ldf['nonbinary'] = ldf['total'] - ldf['female'] - ldf['male']
ldf['nonbinary%'] = ldf['nonbinary'] / ldf['total']

In [29]:
r_femper = pd.rolling_mean(ldf['female%'], 2, 1)
ax =r_femper.plot()
title('Female Ratio of Humans in Wikidata Over Time')
vals = ax.get_yticks()
ax.set_yticklabels(['{:3.1f}%'.format(x*100) for x in vals])
plt.ylabel('Female ratio')
plt.xlabel('Wikidata snapshot')


Out[29]:
<matplotlib.text.Text at 0x7f83966e84a8>