In [1]:
import os
from os import path

import pandas as pd
# NOTE(review): `plt` is not imported explicitly anywhere in this notebook, and later
# cells use bare `matplotlib`, `title`, `legend` and `math`; they appear to rely on the
# names this magic injects into the namespace -- confirm before replacing it.
%pylab inline
#plt.style.use('fivethirtyeight')
plt.style.use('ggplot')
import matplotlib as mpl
# Save figures at 300 dpi.
mpl.rc("savefig", dpi=300)
In [2]:
# Root directory that holds one sub-directory per snapshot.
# The SNAPSHOT_DIR environment variable overrides the location so the notebook is
# not tied to one machine; the original absolute path remains the default.
snapshot_dir = os.environ.get(
    'SNAPSHOT_DIR',
    '/media/notconfusing/9d9b45fc-55f7-428c-a228-1c4c4a1b728c/home/maximilianklein/snapshot_data/')
In [3]:
# Names of the entries directly under the snapshot root, in sorted order.
# Pure Python replaces the `!ls` shell escape (no shell, no interpolation of the
# path into a command line); hidden entries are skipped, as `ls` does by default.
lssnaps = sorted(entry for entry in os.listdir(snapshot_dir) if not entry.startswith('.'))
In [4]:
# Keep only the entries whose name starts with a digit; these are the directories
# that the following cells open as snapshots.
snapdirs = list(filter(lambda entry: entry[0].isdigit(), lssnaps))
In [5]:
# Collect every column label that occurs in any snapshot's site-links index.
all_genders = set()
for snapdir in snapdirs:
    sitelinkspath = path.join(snapshot_dir, snapdir, 'property_indexes', 'site_linkss-index.csv')
    # index_col=0 mirrors how this file is read when the per-snapshot totals are
    # computed, so the header of the row-label column is not collected as a gender.
    # Only the header is needed here, so no NaN filling is applied.
    snap_genders = pd.read_csv(sitelinkspath, index_col=0).columns
    for g in snap_genders:
        # Report the file for every label ending in '1'
        # (presumably a duplicated header that pandas renamed, e.g. 'male.1' -- confirm).
        if g.endswith('1'):
            print(sitelinkspath)
    all_genders.update(snap_genders)
In [6]:
# Column sums of each snapshot's site-links index, keyed by snapshot directory name.
longitudinal_totals = {
    snapdir: pd.read_csv(
        path.join(snapshot_dir, snapdir, 'property_indexes', 'site_linkss-index.csv'),
        index_col=0,
    ).fillna(0).sum()
    for snapdir in snapdirs
}
# One row per snapshot, one column per label; a label missing from a snapshot becomes 0.
ldf = pd.DataFrame.from_dict(longitudinal_totals, orient='index').fillna(0)
In [7]:
# Number of distinct column labels found across all snapshots.
print(ldf.shape[1])
In [8]:
# Parse the snapshot directory names used as row labels into timestamps.
ldf = ldf.set_index(pd.to_datetime(ldf.index))
In [9]:
# Total humans per snapshot: the row-wise sum of the per-label count columns.
# Columns that this notebook derives and stores on `ldf` are excluded, so
# re-running the cell cannot fold an earlier 'total' or a ratio back into the sum.
derived_cols = {'total', 'ungendered', 'gender', 'female%', 'nonbinary', 'nonbinary%'}
count_cols = [col for col in ldf.columns if col not in derived_cols]
ldf['total'] = ldf[count_cols].sum(axis=1)
In [10]:
# Display the per-snapshot totals.
ldf.loc[:, 'total']
Out[10]:
In [11]:
# Share of each snapshot's total that falls in the 'nan' column.
ldf['ungendered'] = ldf['nan'].div(ldf['total'])
In [12]:
# Complement of the ungendered share (computed as 1 - ungendered).
ldf['gender'] = ldf['ungendered'].rsub(1)
In [13]:
# Quick look at the unsmoothed totals.
ldf.loc[:, 'total'].plot()
Out[13]:
In [14]:
# Display the per-snapshot totals (same expression as an earlier cell).
ldf.loc[:, 'total']
Out[14]:
In [15]:
# 2-snapshot rolling mean of the totals. min_periods=1 keeps the first snapshot
# (whose window holds a single value) instead of turning it into NaN.
# Series.rolling replaces pd.rolling_mean, which current pandas no longer provides.
r_total = ldf['total'].rolling(window=2, min_periods=1).mean()
In [16]:
# Plot the smoothed totals; y tick labels are whole numbers with thousands separators.
def _with_thousands_sep(value, _pos):
    """Format a tick value as an integer with ',' as the thousands separator."""
    return format(int(value), ',')


ax = r_total.plot(title="Total Humans in Wikidata Over Time")
ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(_with_thousands_sep))
ax.set_ylabel('Number of humans')
ax.set_xlabel('Wikidata snapshot')
Out[16]:
In [17]:
# 2-snapshot rolling mean of the gendered share (min_periods=1 keeps the first point).
# Series.rolling replaces pd.rolling_mean, which current pandas no longer provides.
r_gendered = ldf['gender'].rolling(window=2, min_periods=1).mean()
r_gendered.plot(secondary_y=True, title="% of Humans with Gender in Wikidata")
Out[17]:
In [ ]:
In [69]:
# For every snapshot and each of four property indexes, store the fraction of the
# index's summed counts that lies outside its first row.
longitudinal_countrys = {}
longitudinal_citizenships = {}
longitudinal_egs = {}
longitudinal_sites = {}
dict_csv_map = {'place_of_births-index.csv': longitudinal_countrys,
                'citizenships-index.csv': longitudinal_citizenships,
                'ethnic_groups-index.csv': longitudinal_egs,
                'site_linkss-index.csv': longitudinal_sites}
for snapdir in snapdirs:
    for csvtype, longitudinal_dict in dict_csv_map.items():
        sitelinkspath = path.join(snapshot_dir, snapdir, 'property_indexes', csvtype)
        cdf = pd.read_csv(sitelinkspath, index_col=0).fillna(0)
        total = cdf.sum().sum()
        # .iloc[0] selects the first row by position and replaces the removed `.ix[0]`.
        # NOTE(review): assumes the first row of every index file is the
        # "no value for this property" bucket -- confirm against the CSV layout.
        nothaving = cdf.iloc[0].sum()
        having = total - nothaving
        # Print the absolute count for one reference snapshot.
        if snapdir == '2016-01-03':
            print(csvtype, having)
        longitudinal_dict[snapdir] = having / float(total)
In [70]:
# For every snapshot, divide the summed counts of four property indexes by the
# number of rows in the snapshot's main data file.
longitudinal_dobs = {}
longitudinal_dods = {}
longitudinal_occ = {}
longitudinal_fow = {}
dict_csv_map = {
    'dod-index.csv': longitudinal_dods,
    'dob-index.csv': longitudinal_dobs,
    'field_of_work-index.csv': longitudinal_fow,
    'occupation-index.csv': longitudinal_occ,
}
for snapdir in snapdirs:
    bigfname = 'gender-index-data-{}.csv'.format(snapdir)
    bigdf = pd.read_csv(path.join(snapshot_dir, snapdir, bigfname))
    humans = len(bigdf)
    for csvtype, longitudinal_dict in dict_csv_map.items():
        index_path = path.join(snapshot_dir, snapdir, 'property_indexes', csvtype)
        try:
            total = pd.read_csv(index_path, index_col=0).fillna(0).sum().sum()
        except IOError:
            # An index file that cannot be read contributes a count of zero.
            total = 0
        # Print the absolute count for one reference snapshot.
        if snapdir == '2016-01-03':
            print(csvtype, total)
        longitudinal_dict[snapdir] = total / float(humans)
In [73]:
# -2147483648 is read as a missing value in the data file.
java_min_int = -2147483648
# Build the file name from the snapshot being loaded. Reusing the `bigfname` left
# over from the loop over all snapshots would point at the last snapshot's file
# name inside the 2016-01-03 directory.
snapshot_date = '2016-01-03'
bigfname = 'gender-index-data-{}.csv'.format(snapshot_date)
bigdf = pd.read_csv(path.join(snapshot_dir, snapshot_date, bigfname), na_values=[java_min_int])
In [84]:
# Number of rows with a non-missing 'dob'. The vectorised notnull() replaces a
# per-element math.isnan lambda and does not depend on `math` being in the namespace.
bigdf['dob'].notnull().sum()
Out[84]:
In [20]:
def _single_column_frame(per_snapshot, label):
    """Turn a {snapshot: value} dict into a one-column DataFrame whose column is `label`."""
    frame = pd.DataFrame.from_dict(per_snapshot, orient='index').fillna(0)
    frame.columns = [label]
    return frame


# One frame per property, each indexed by snapshot directory name.
dobdf = _single_column_frame(longitudinal_dobs, 'date of birth')
doddf = _single_column_frame(longitudinal_dods, 'date of death')
citdf = _single_column_frame(longitudinal_citizenships, 'citizenship')
countrydf = _single_column_frame(longitudinal_countrys, 'place of birth')
egdf = _single_column_frame(longitudinal_egs, 'ethnic group')
fowdf = _single_column_frame(longitudinal_fow, 'field of work')
occdf = _single_column_frame(longitudinal_occ, 'occupation')
sitedf = _single_column_frame(longitudinal_sites, 'at least 1 site link')
In [21]:
# Combine all coverage columns into one frame with a datetime index.
# The per-property frames are indexed by snapshot directory name (strings), while
# `ldf` has already had its index parsed to timestamps. Joining the two directly
# only aligns if pandas silently coerces the strings, so both sides are converted
# to timestamps before 'gender' is joined.
before_gender = dobdf.join(doddf).join(citdf).join(countrydf).join(egdf).join(sitedf)
after_gender = fowdf.join(occdf)
# Column order of the original join chain: 'gender' sits between the two groups.
column_order = list(before_gender.columns) + ['gender'] + list(after_gender.columns)

property_cov = before_gender.join(after_gender)
property_cov.index = pd.to_datetime(property_cov.index)

gender_cov = ldf['gender'].copy()
gender_cov.index = pd.to_datetime(gender_cov.index)  # no-op when already timestamps

aggdf = property_cov.join(gender_cov)[column_order]
In [22]:
# Make sure the combined frame is indexed by timestamps.
aggdf = aggdf.set_index(pd.to_datetime(aggdf.index))
In [23]:
# Show up to the first 24 snapshots.
aggdf.iloc[:24]
Out[23]:
In [24]:
# Compare two snapshots side by side: rows are picked by position (7th, then 6th)
# with .iloc and columns by label; this replaces the removed mixed-mode `.ix`.
table_cols = ['gender', 'date of birth', 'date of death', 'citizenship',
              'place of birth', 'ethnic group', 'at least 1 site link']
aggdf.iloc[[6, 5]][table_cols].T
Out[24]:
In [25]:
# Same two snapshots (positions 6 and 5) with all nine coverage columns, printed
# as LaTeX with values shown as percentages to one decimal place.
# .iloc + label selection replaces the removed mixed-mode `.ix`.
latex_cols = ['gender', 'date of birth', 'date of death', 'citizenship',
              'place of birth', 'ethnic group', 'field of work', 'occupation',
              'at least 1 site link']
print(aggdf.iloc[[6, 5]][latex_cols].T.to_latex(float_format=lambda x: "{0:.1f}%".format(x*100)))
In [26]:
# 2-snapshot rolling mean of every coverage column (min_periods=1 keeps the first row).
# DataFrame.rolling replaces pd.rolling_mean, which current pandas no longer provides.
r_aggdf = aggdf.rolling(window=2, min_periods=1).mean()
# NOTE(review): the unsmoothed `aggdf` is what gets plotted here; `r_aggdf` is not
# used in this cell -- confirm which one the figure is meant to show.
ax = aggdf.plot(figsize=(5, 5), colormap=plt.cm.jet, linewidth=3)
ax.set_title("Coverage of Accompanying Properties Over Time", y=1.08, size=18)
ax.legend(bbox_to_anchor=(1.6, 0.75))
# A formatter renders ratios as percentages and stays correct if the ticks move,
# unlike fixed labels computed once from get_yticks().
ax.yaxis.set_major_formatter(
    mpl.ticker.FuncFormatter(lambda value, _pos: '{:3.0f}%'.format(value * 100)))
ax.set_ylabel('Ratio of biographies with property')
ax.set_xlabel('Wikidata snapshot')
Out[26]:
In [27]:
# Female count as a fraction of each snapshot's total.
ldf['female%'] = ldf['female'].div(ldf['total'])
In [28]:
# Humans whose recorded gender is neither female nor male.
# 'total' also contains the 'nan' column (the count used for the ungendered share),
# so it is subtracted as well; otherwise humans with no gender recorded would be
# counted as nonbinary.
ldf['nonbinary'] = ldf['total'] - ldf['female'] - ldf['male'] - ldf['nan']
ldf['nonbinary%'] = ldf['nonbinary'] / ldf['total']
In [29]:
# 2-snapshot rolling mean of the female share (min_periods=1 keeps the first point).
# Series.rolling replaces pd.rolling_mean, which current pandas no longer provides.
r_femper = ldf['female%'].rolling(window=2, min_periods=1).mean()
ax = r_femper.plot()
ax.set_title('Female Ratio of Humans in Wikidata Over Time')
# A formatter renders ratios as percentages and stays correct if the ticks move,
# unlike fixed labels computed once from get_yticks().
ax.yaxis.set_major_formatter(
    mpl.ticker.FuncFormatter(lambda value, _pos: '{:3.1f}%'.format(value * 100)))
ax.set_ylabel('Female ratio')
ax.set_xlabel('Wikidata snapshot')
Out[29]: