In [1]:
import pandas as pd
from os import path
%pylab inline
#plt.style.use('fivethirtyeight')
plt.style.use('ggplot')
import matplotlib as mpl
mpl.rc("savefig", dpi=300)
In [2]:
snapshot_dir = '/media/notconfusing/9d9b45fc-55f7-428c-a228-1c4c4a1b728c/home/maximilianklein/snapshot_data/'
In [3]:
lssnaps = !ls $snapshot_dir
In [4]:
snapdirs = [snap for snap in lssnaps if snap[0].isdigit()]
In [5]:
all_genders = set()
# Collect every gender column label seen across all snapshots.  Columns ending
# in '1' presumably indicate malformed/duplicated headers -- TODO confirm; the
# offending file path is printed for inspection.
for snapdir in snapdirs:
    sitelinkspath = path.join(snapshot_dir, snapdir, 'property_indexes', 'site_linkss-index.csv')
    sldf = pd.read_csv(sitelinkspath).fillna(0)
    for gender_label in sldf.columns:
        if gender_label.endswith('1'):
            print(sitelinkspath)
        all_genders.add(gender_label)
In [6]:
# Sum each snapshot's per-wiki gender counts into one Series per snapshot,
# then assemble a snapshot-by-gender DataFrame.
longitudinal_totals = {}
for snapdir in snapdirs:
    index_path = path.join(snapshot_dir, snapdir, 'property_indexes', 'site_linkss-index.csv')
    snapshot_counts = pd.read_csv(index_path, index_col=0).fillna(0)
    longitudinal_totals[snapdir] = snapshot_counts.sum()
ldf = pd.DataFrame.from_dict(longitudinal_totals, orient='index').fillna(0)
In [7]:
print(len(ldf.columns))
In [8]:
ldf.index = pd.to_datetime(ldf.index)
In [9]:
ldf['total'] = ldf.sum(axis=1)
In [10]:
ldf['total']
Out[10]:
In [11]:
ldf['ungendered'] = ldf['nan'] / ldf['total']
In [12]:
ldf['gender'] = 1 - ldf['ungendered']
In [13]:
ldf['total'].plot()
Out[13]:
In [14]:
ldf['total']
Out[14]:
In [15]:
r_total = pd.rolling_mean(ldf['total'], 2, 1)
In [16]:
ax = r_total.plot(title="Total Humans in Wikidata Over Time")
# Thousands separators on the y axis (counts run into the millions).
thousands_fmt = mpl.ticker.FuncFormatter(lambda value, _pos: format(int(value), ','))
ax.get_yaxis().set_major_formatter(thousands_fmt)
plt.ylabel('Number of humans')
plt.xlabel('Wikidata snapshot')
Out[16]:
In [17]:
# Smoothed gender-coverage ratio.  (pd.rolling_mean was removed in pandas
# 0.23 -- use the .rolling accessor.)
r_gendered = ldf['gender'].rolling(window=2, min_periods=1).mean()
r_gendered.plot(secondary_y=True, title="% of Humans with Gender in Wikidata")
Out[17]:
In [ ]:
In [69]:
#populate a dict with numbers:
longitudinal_countrys = {}
longitudinal_citizenships = {}
longitudinal_egs = {}
longitudinal_sites = {}
dict_csv_map = { 'place_of_births-index.csv': longitudinal_countrys,
'citizenships-index.csv' : longitudinal_citizenships,
'ethnic_groups-index.csv': longitudinal_egs,
'site_linkss-index.csv': longitudinal_sites}
for snapdir in snapdirs:
for csvtype, longitudinal_dict in dict_csv_map.items():
sitelinkspath = path.join(snapshot_dir,snapdir,'property_indexes',csvtype)
cdf = pd.read_csv(sitelinkspath,index_col=0).fillna(0)
csum = cdf.sum()
total = csum.sum()
nothaving = cdf.ix[0].sum()
having = total - nothaving
if snapdir == '2016-01-03':
print(csvtype, having)
having_per = having / float(total)
longitudinal_dict[snapdir] = having_per
In [70]:
# Same idea for dates of birth/death, occupation and field of work, but these
# ratios are relative to the number of humans in the snapshot's gender index.
longitudinal_dobs = {}
longitudinal_dods = {}
longitudinal_occ = {}
longitudinal_fow = {}
dict_csv_map = {'dod-index.csv': longitudinal_dods,
                'dob-index.csv': longitudinal_dobs,
                'field_of_work-index.csv': longitudinal_fow,
                'occupation-index.csv': longitudinal_occ}
for snapdir in snapdirs:
    bigfname = 'gender-index-data-{}.csv'.format(snapdir)
    bigdf = pd.read_csv(path.join(snapshot_dir, snapdir, bigfname))
    humans = len(bigdf)
    for csvtype, longitudinal_dict in dict_csv_map.items():
        index_path = path.join(snapshot_dir, snapdir, 'property_indexes', csvtype)
        try:
            cdf = pd.read_csv(index_path, index_col=0).fillna(0)
            total = cdf.sum().sum()
        except IOError:  # index file absent in early snapshots
            total = 0
        if snapdir == '2016-01-03':
            print(csvtype, total)  # spot-check the latest snapshot
        longitudinal_dict[snapdir] = total / float(humans)
In [73]:
# The dumps encode missing dates as Java's Integer.MIN_VALUE; read them as NaN.
java_min_int = -2147483648
# Build the filename explicitly instead of relying on `bigfname` leaking from
# the previous cell's loop (hidden-state bug on fresh-kernel or partial re-runs).
bigfname = 'gender-index-data-2016-01-03.csv'
bigdf = pd.read_csv(path.join(snapshot_dir, '2016-01-03', bigfname),
                    na_values=[java_min_int])
In [84]:
bigdf['dob'].apply(lambda x: not math.isnan(x)).sum()
Out[84]:
In [20]:
def _coverage_df(counts_by_snapshot, label):
    """One-column DataFrame (snapshot -> coverage ratio) named `label`."""
    df = pd.DataFrame.from_dict(counts_by_snapshot, orient='index').fillna(0)
    df.columns = [label]
    return df

# Replaces eight copy-pasted construction blocks with one helper per property.
dobdf = _coverage_df(longitudinal_dobs, 'date of birth')
doddf = _coverage_df(longitudinal_dods, 'date of death')
citdf = _coverage_df(longitudinal_citizenships, 'citizenship')
countrydf = _coverage_df(longitudinal_countrys, 'place of birth')
egdf = _coverage_df(longitudinal_egs, 'ethnic group')
fowdf = _coverage_df(longitudinal_fow, 'field of work')
occdf = _coverage_df(longitudinal_occ, 'occupation')
sitedf = _coverage_df(longitudinal_sites, 'at least 1 site link')
In [21]:
aggdf = dobdf.join(doddf).join(citdf).join(countrydf).join(egdf).join(sitedf).join(ldf['gender']).join(fowdf).join(occdf)
In [22]:
aggdf.index = pd.to_datetime(aggdf.index)
In [23]:
aggdf.head(24)
Out[23]:
In [24]:
aggdf.ix[[6,5],['gender','date of birth', 'date of death','citizenship','place of birth','ethnic group', 'at least 1 site link']].T
Out[24]:
In [25]:
print(aggdf.ix[[6,5],['gender','date of birth', 'date of death','citizenship','place of birth','ethnic group','field of work','occupation', 'at least 1 site link']].T.to_latex(float_format=lambda x: "{0:.1f}%".format(x*100)))
In [26]:
# Smooth coverage ratios over 2 snapshots.  (pd.rolling_mean was removed in
# pandas 0.23 -- use the .rolling accessor.)
r_aggdf = aggdf.rolling(window=2, min_periods=1).mean()
# NOTE(review): r_aggdf is computed but the raw aggdf is plotted below --
# confirm which was intended.
ax = aggdf.plot(figsize=(5, 5), colormap=plt.cm.jet, linewidth=3)
title("Coverage of Accompanying Properties Over Time", y=1.08, size=18)
legend(bbox_to_anchor=(1.6, 0.75))
vals = ax.get_yticks()
ax.set_yticklabels(['{:3.0f}%'.format(x * 100) for x in vals])
plt.ylabel('Ratio of biographies with property')
plt.xlabel('Wikidata snapshot')
Out[26]:
In [27]:
ldf['female%'] = ldf['female'] / ldf['total']
In [28]:
# Everything not recorded as female or male.  NOTE(review): since 'total'
# includes the 'nan' (ungendered) column, this also counts humans with no
# recorded gender -- confirm that is intended.
ldf['nonbinary'] = ldf['total'] - ldf['female'] - ldf['male']
ldf['nonbinary%'] = ldf['nonbinary'] / ldf['total']
In [29]:
# Smoothed female ratio.  (pd.rolling_mean was removed in pandas 0.23 -- use
# the .rolling accessor.)
r_femper = ldf['female%'].rolling(window=2, min_periods=1).mean()
ax = r_femper.plot()
title('Female Ratio of Humans in Wikidata Over Time')
vals = ax.get_yticks()
ax.set_yticklabels(['{:3.1f}%'.format(x * 100) for x in vals])
plt.ylabel('Female ratio')
plt.xlabel('Wikidata snapshot')
Out[29]:
In [30]:
# Totals and female ratio on one chart with twin y axes.
twoseries = pd.concat([r_total, r_femper], axis=1)
twoseries.columns = ['Total bios', 'Female ratio']
ax = twoseries.plot(secondary_y=['Female ratio'], figsize=(6, 4), linewidth=3)
ax.set_xlabel('Wikidata snapshot')
# Left axis in millions; right axis rendered as a percentage.
millions_fmt = matplotlib.ticker.FuncFormatter(lambda value, _pos: value / 1000000)
ax.get_yaxis().set_major_formatter(millions_fmt)
ax.set_ylabel('Total biographies (millions)')
ax.right_ax.set_yticklabels(['{:3.1f}%'.format(tick * 100) for tick in ax.right_ax.get_yticks()])
ax.right_ax.set_ylabel('Female ratio of biographies')
plt.title('Human Biographies in Wikidata over Time', y=1.08, size=18)
Out[30]:
In [31]:
ldf['time'] = ldf.index
In [32]:
ldf['timesince'] = ldf['time'] - ldf['time'].ix[0]
In [33]:
ldf['dayssince'] = ldf['timesince'].dt.days
In [34]:
#from pandas.stats.api import ols
#result = ols(y=ldf['female%'], x=ldf['dayssince'])
In [35]:
# Alias without the '%' character so formula interfaces (e.g. the commented
# statsmodels "femper ~ dayssince" below) can reference the column by name;
# 'constant' provides an explicit intercept term.
ldf['femper'] = ldf['female%']
ldf['constant'] = 1
In [36]:
#import statsmodels.formula.api as sm
#smresult = sm.ols(formula="femper ~ dayssince", data=ldf).fit()
In [37]:
#smresult.summary()
In [38]:
# Per-wiki female ratio over time, restricted to wikis with >100k biographies.
longitudinal_langs = {}
for snapdir in snapdirs:
    index_path = path.join(snapshot_dir, snapdir, 'property_indexes', 'site_linkss-index.csv')
    per_wiki = pd.read_csv(index_path, index_col=0).fillna(0)
    per_wiki['total'] = per_wiki.sum(axis=1)
    per_wiki['femper'] = per_wiki['female'] / per_wiki['total']
    big_wikis = per_wiki['total'] > 100000
    longitudinal_langs[snapdir] = per_wiki['femper'][big_wikis]
In [39]:
langdf = pd.DataFrame.from_dict(longitudinal_langs, orient='index')
In [40]:
langdf.dropna(axis=1, how='any', inplace=True)
In [41]:
langdf
Out[41]:
In [42]:
langdf.index = pd.to_datetime(langdf.index)
In [43]:
# Shared label style for plt.text annotations in the charts below.
font = {'family': 'serif',
'color': 'darkred',
'weight': 'normal',
'size': 16,
}
In [44]:
import json

# NOTE(review): hardcoded absolute path -- parameterize if this notebook moves.
# `with` ensures the file handle is closed (the original leaked it).
with open('/home/notconfusing/workspace/WIGI/helpers/wiki_code_map.json', 'r') as f:
    wikicodes = json.load(f)

def lookup_wikicode(wikicode, codes=None):
    """Map a wiki database code like 'enwiki' to a short language name.

    Parameters
    ----------
    wikicode : str
        Wiki code ending in 'wiki', e.g. 'enwiki'.
    codes : dict, optional
        Language-prefix -> full-name mapping.  Defaults to the module-level
        `wikicodes` loaded above, so existing single-argument calls still work.

    Returns
    -------
    str
        The first word of the full language name.
    """
    if codes is None:
        codes = wikicodes
    letters = wikicode.split('wiki')[0]
    fullname = codes[letters]
    return fullname.split(' ')[0]

lookup_wikicode('enwiki')
Out[44]:
In [45]:
# One line per wiki, labelled at the right edge instead of a legend.
ax = langdf.plot(legend=False, figsize=(8, 12))
for wiki in langdf:
    # Nudge jawiki's label up so it does not collide with a neighbour.
    offset = 5 if wiki == 'jawiki' else 0
    # NOTE(review): .ix is removed in modern pandas -- '2016-01-03' is a label
    # lookup, so .loc is the drop-in replacement.
    ax.annotate(wiki, (langdf.index[-1], langdf[wiki].ix['2016-01-03']),
                xytext=(2, 0 + offset), textcoords='offset points')
ax.yaxis.set_major_formatter(
    FuncFormatter(lambda y, pos: ('{0:.2f}'.format(y * 100)).rstrip('0').rstrip('.') + '%'))
plt.ylabel('Female ratio of biographies')
plt.xlabel('Wikidata snapshot')
plt.title('Change in Female Ratio of Biographies in \n Oct 2014 - Jan 2016', size=24)
plt.show()
In [ ]:
In [46]:
begin, end = '2015-07-28', '2016-01-03'
# DataFrame.sort() was removed in pandas 0.20 -- sort_values is the replacement.
endsorted = pd.DataFrame(langdf.ix[end]).sort_values(by=['2016-01-03 00:00:00'])
oddeven = {wiki: pos % 2 for pos, wiki in list(enumerate(endsorted.index))}
ax = langdf.ix[begin:end].plot(legend=False, figsize=(4, 8))
for wiki in langdf:
    # NOTE(review): `side` is computed but never used -- the labels were
    # probably meant to alternate between the begin and end edges.
    side = begin if oddeven[wiki] == 0 else end
    offset = 6 if wiki == 'jawiki' else 0
    ax.annotate(lookup_wikicode(wiki), (end, langdf[wiki].ix[end]),
                xytext=(2, 0 + offset), textcoords='offset points')
ax.yaxis.set_major_formatter(
    FuncFormatter(lambda y, pos: ('{0:.2f}'.format(y * 100)).rstrip('0').rstrip('.') + '%'))
plt.ylabel('Female ratio of biographies')
plt.xlabel('Wikidata snapshot')
plt.title('Change in Female Ratio of Biographies in \n July 2015 - Jan 2016', size=17)
plt.show()
In [47]:
#populate a dict with numbers:
long_fem_per = {}
long_tot = {}
for snapdir in [min(snapdirs), max(snapdirs)]:
sitelinkspath = path.join(snapshot_dir,snapdir,'property_indexes','site_linkss-index.csv')
sldf = pd.read_csv(sitelinkspath,index_col=0).fillna(0)
sldf['total'] = sldf.sum(axis=1)
sldf['femper'] = sldf['female'] / sldf['total']
femperseries = sldf['femper'][sldf['total']>100000]
totalseries = sldf['total'][sldf['total']>100000]
long_fem_per[snapdir] = femperseries
long_tot[snapdir] = totalseries
In [48]:
# Columns = the two endpoint snapshot dates; rows = wikis.
femperdf = pd.DataFrame.from_dict(long_fem_per)
totaldf = pd.DataFrame.from_dict(long_tot)
In [49]:
arrowdf =femperdf.join(totaldf, lsuffix='femper', rsuffix='total').T
In [50]:
arrowdf = arrowdf.T.dropna(how='any').T
In [51]:
arrowdf.ix[1]-arrowdf.ix[0]
Out[51]:
In [52]:
import math  # NOTE(review): unused in this cell (needed earlier); imports belong at the top

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)
for wiki in arrowdf:
    # Presumably rows 0/1 are begin/end female ratio and rows 2/3 are begin/end
    # totals -- TODO confirm.  NOTE(review): .ix is removed in modern pandas;
    # port to .iloc once the row semantics are confirmed.
    plt.arrow(arrowdf[wiki].ix[2],                         # x1: total at begin
              arrowdf[wiki].ix[0],                         # y1: ratio at begin
              arrowdf[wiki].ix[3] - arrowdf[wiki].ix[2],   # dx
              arrowdf[wiki].ix[1] - arrowdf[wiki].ix[0],   # dy
              head_width=0.005, head_length=10000, fc='k', ec='k')
    # Manual nudges so a few labels do not overlap their arrows.
    xoffset = 0 if wiki not in ['fiwiki'] else 10000
    yoffset = 0 if wiki not in ['jawiki', 'nowiki'] else 0.005
    plt.text(arrowdf[wiki].ix[3] + xoffset, arrowdf[wiki].ix[1] ** 0.99 + yoffset,
             lookup_wikicode(wiki), fontdict=font)
plt.ylim(0.12, 0.22)
plt.ylabel("Female ratio of biographies", size=18)
plt.xlim(100000, 1500000)
plt.xlabel("Total Biographies (log scale)", size=18)
plt.xscale('log')
plt.xticks(size=18)
ax.yaxis.set_major_formatter(
    FuncFormatter(lambda y, pos: ('{0:.2f}'.format(y * 100)).rstrip('0').rstrip('.') + '%'))
plt.yticks(size=18)
plt.title('Change in Female Ratio of Biographies and Size \n September 17 2014 - January 3 2016', size=24)
Out[52]:
In [53]:
# Same arrow chart as above with the axes swapped (ratio on x, totals on y).
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)
for wiki in arrowdf:
    # NOTE(review): .ix is removed in modern pandas; port to .iloc once the
    # row semantics (0/1 ratio, 2/3 totals -- presumed) are confirmed.
    plt.arrow(arrowdf[wiki].ix[0],                         # x1: ratio at begin
              arrowdf[wiki].ix[2],                         # y1: total at begin
              arrowdf[wiki].ix[1] - arrowdf[wiki].ix[0],   # dx
              arrowdf[wiki].ix[3] - arrowdf[wiki].ix[2],   # dy
              head_width=0.005, head_length=10000, fc='k', ec='k')
    yoffset = 0 if wiki not in ['fiwiki', 'ruwiki', 'nlwiki', 'nowiki'] else 10000
    xoffset = 0 if wiki not in ['jawiki'] else 0.01
    plt.text(arrowdf[wiki].ix[1] ** 0.99 + xoffset, arrowdf[wiki].ix[3] + yoffset,
             lookup_wikicode(wiki), fontdict=font)
plt.xlim(0.12, 0.22)
plt.xlabel("Female ratio of biographies", size=18)
plt.ylim(100000, 1500000)
plt.ylabel("Total Biographies (log scale)", size=18)
plt.yscale('log')
plt.yticks(size=18)
ax.xaxis.set_major_formatter(
    FuncFormatter(lambda y, pos: ('{0:.2f}'.format(y * 100)).rstrip('0').rstrip('.') + '%'))
plt.xticks(size=18)
plt.title('Change in Female Ratio of Biographies and Size \n September 17 2014 - January 3 2016', size=24)
Out[53]:
In [54]:
# English-Wikipedia vs all-wiki female biography counts per snapshot.
eng_fem_totals = {}
all_fem_totals = {}
for snapdir in snapdirs:
    index_path = path.join(snapshot_dir, snapdir, 'property_indexes', 'site_linkss-index.csv')
    per_wiki = pd.read_csv(index_path, index_col=0).fillna(0)
    snapdate = pd.to_datetime(snapdir)
    # NOTE(review): .ix is removed in modern pandas; 'enwiki' is a label, so
    # .loc is the drop-in replacement.
    eng_fem_totals[snapdate] = per_wiki.ix['enwiki']['female']
    all_fem_totals[snapdate] = per_wiki['female'].sum()
engfemdf = pd.DataFrame.from_dict(eng_fem_totals, orient='index').fillna(0)
engfemdf.columns = ['eng_fems']
engfemdf = engfemdf.sort_index()
allfemdf = pd.DataFrame.from_dict(all_fem_totals, orient='index').fillna(0)
allfemdf.columns = ['all_fems']
allfemdf = allfemdf.sort_index()
In [55]:
# Net new female biographies between consecutive snapshots (first row is NaN).
engfemdf['added']= engfemdf['eng_fems']-engfemdf['eng_fems'].shift()
allfemdf['added']= allfemdf['all_fems']-allfemdf['all_fems'].shift()
In [56]:
# Bucket each snapshot by its 2015 month; snapshots outside 2015 fall into 0.
engfemdf['month'] = engfemdf.index.map(lambda x: x.month if x.year == 2015 else 0)
allfemdf['month'] = allfemdf.index.map(lambda x: x.month if x.year == 2015 else 0)
In [57]:
# Group snapshot deltas by 2015 month for comparison with Women-in-Red data.
eng_months = engfemdf.groupby('month')
all_months = allfemdf.groupby('month')
In [58]:
# Keep only August-December 2015 (month labels 8..12; label slices are
# inclusive).  .ix is removed in modern pandas; these are integer *labels* on
# the groupby result, so .loc is the equivalent accessor.
wd_eng_months = eng_months.sum().loc[8:12]
wd_all_months = all_months.sum().loc[8:12]
In [59]:
# Women in Red: biographies created per month, Aug-Dec 2015 (hand-collected).
wir_data = {8: 1854, 9: 1590, 10: 1989, 11: 1787, 12: 1473}
wir_months = pd.DataFrame.from_dict(wir_data, orient='index')
wir_months.columns = ['added']
In [60]:
# Align Women-in-Red monthly counts with Wikidata monthly additions by month.
addeddf = wir_months.join(wd_eng_months,lsuffix='wir',rsuffix='wd')
alladdeddf= wir_months.join(wd_all_months,lsuffix='wir',rsuffix='wd')
In [61]:
# Keep only the 'added...' comparison columns; the running totals are noise here.
del addeddf['eng_fems']
del alladdeddf['all_fems']
In [62]:
alladdeddf.plot()
Out[62]:
In [63]:
addeddf.corr()
Out[63]:
In [64]:
alladdeddf.corr()
Out[64]:
In [65]:
alladdeddf['addedwir'].corr(alladdeddf['addedwd'].shift())
Out[65]:
In [66]:
smooth = pd.rolling_mean(alladdeddf, window=2, min_periods=2).dropna()
In [67]:
# Import here as well: in the original notebook linregress was only imported
# in a *later* cell, so this cell failed on a fresh-kernel top-to-bottom run.
from scipy.stats import linregress

slope, intercept, r_value, p_value, std_er = linregress(smooth['addedwir'], smooth['addedwd'])
print(r_value, p_value)
In [ ]:
np.correlate(addeddf['addedwir'],addeddf['addedwd'])
In [ ]:
from scipy.stats import linregress
In [ ]:
slope, intercept, r_value, p_value, std_er = linregress(alladdeddf['addedwir'][-4:],alladdeddf['addedwd'].shift()[-4:])
In [ ]:
r_value, p_value
In [ ]:
def mon2015(x):
    """Return the month number (1-12) for dates in 2015, else 0.

    Accepts anything pd.to_datetime can parse (string, Timestamp, ...).
    Bug fix: the original parsed `x` into `dt` but then read `.month`/`.year`
    off the raw `x`, which raises AttributeError for string input.
    """
    dt = pd.to_datetime(x)
    return dt.month if dt.year == 2015 else 0
# TODO(review): `enwiki` is never defined anywhere in this notebook, so this
# cell cannot run as-is -- it looks like abandoned scratch work.
enwiki['months'] = langdf.index.map(mon2015)
# The original line was `enmons = .groupby(by='months')` -- a SyntaxError with
# the target object missing; `enwiki` appears to be the intended object.
enmons = enwiki.groupby(by='months')
In [ ]:
wir_months
In [ ]:
yeardiff = langdf.ix[-1] - langdf.ix[0]
In [ ]:
# Series.sort() (in-place) was removed in pandas 0.20; reassigning the result
# of sort_values preserves the original net behaviour.
yeardiff = yeardiff.sort_values(ascending=False)
yeardiff.hist()
In [ ]:
print pd.DataFrame(yeardiff).ix[0:10].to_latex(float_format=lambda x: "{0:.2f}%".format(x*100))
In [ ]:
pd.DataFrame(yeardiff).ix['enwiki']
In [ ]:
def trend(langseries):
    """OLS slope of one wiki's female ratio against days elapsed.

    NOTE(review): relies on `ols` (from the removed pandas.stats.api), which
    is never imported in this notebook -- this cell cannot run as-is.
    """
    fit = ols(y=langseries, x=ldf['dayssince'])
    return fit.beta[0]
In [ ]:
trends = langdf.apply(lambda x: trend(x), axis=0)
In [ ]:
# Series.sort() (in-place) was removed in pandas 0.20; reassigning the result
# of sort_values preserves the original net behaviour.
trends = trends.sort_values(ascending=False)
trends
In [ ]:
def splitcol(i):
    """Break a pipe-delimited cell value like 'a|b|c' into its parts."""
    parts = i.split("|")
    return parts
In [ ]:
# Track two occupation QIDs across snapshots (rows whose index string contains
# the QID are summed): Q1079215 ("av idols") and Q488111.
av_tot = {}
act_tot = {}
for snapdir in snapdirs:
    occ_path = path.join(snapshot_dir, snapdir, 'property_indexes', 'occupation-index.csv')
    try:
        avdf = pd.read_csv(occ_path, index_col=0).fillna(0)
        av_rows = avdf.index.map(lambda label: 'Q1079215' in label)
        av_tot[snapdir] = avdf[av_rows].sum().sum()
        act_rows = avdf.index.map(lambda label: 'Q488111' in label)
        act_tot[snapdir] = avdf[act_rows].sum().sum()
    except OSError:  # snapshot predates the occupation index
        pass
In [ ]:
act_tot
In [ ]:
In [ ]: