notebook.community

Edit and run



In [1]:

    
import pandas as pd
from os import path
%pylab inline
#plt.style.use('fivethirtyeight')
plt.style.use('ggplot')
import matplotlib as mpl
mpl.rc("savefig", dpi=300)









    



Populating the interactive namespace from numpy and matplotlib



In [2]:

    
snapshot_dir = '/media/notconfusing/9d9b45fc-55f7-428c-a228-1c4c4a1b728c/home/maximilianklein/snapshot_data/'



In [3]:

    
lssnaps = !ls $snapshot_dir



In [4]:

    
snapdirs = [snap for snap in lssnaps if snap[0].isdigit()]



In [5]:

    
all_genders = set()
#find the genders
for snapdir in snapdirs:
    sitelinkspath = path.join(snapshot_dir,snapdir,'property_indexes','site_linkss-index.csv')
    sldf = pd.read_csv(sitelinkspath).fillna(0)
    snap_genders = sldf.columns
    for g in snap_genders: 
        if g.endswith('1'):
            print(sitelinkspath)
        all_genders.add(g)



In [6]:

    
#populate a dict with numbers:
longitudinal_totals = {}
for snapdir in snapdirs:
    sitelinkspath = path.join(snapshot_dir,snapdir,'property_indexes','site_linkss-index.csv')
    sldf = pd.read_csv(sitelinkspath,index_col=0).fillna(0)
    snap_genders = sldf.columns
    gendersum = sldf.sum()
    longitudinal_totals[snapdir] = gendersum

ldf = pd.DataFrame.from_dict(longitudinal_totals,orient='index').fillna(0)



In [7]:

    
print(len(ldf.columns))



In [8]:

    
ldf.index = pd.to_datetime(ldf.index)



In [9]:

    
ldf['total'] = ldf.sum(axis=1)



In [10]:

    
ldf['total']









    Out[10]:





2014-09-17    5869606
2014-10-13    6589799
2015-07-28    6589799
2015-08-03    6589799
2015-08-09    6609617
2015-08-12    6617934
2015-08-16    6617934
2015-08-21    6630052
2015-08-23    6630052
2015-09-06    6656076
2015-09-13    6680340
2015-09-20    6698141
2015-09-27    6714732
2015-10-13    6763390
2015-10-19    6763390
2015-10-26    6807585
2015-11-02    6824561
2015-11-09    6851235
2015-11-21    6876328
2015-11-22    6876328
2015-11-23    6901147
2015-12-04    6929623
2015-12-21    6987464
2016-01-03    6999542
Name: total, dtype: float64



In [11]:

    
ldf['ungendered'] = ldf['nan'] / ldf['total']



In [12]:

    
ldf['gender'] = 1 - ldf['ungendered']



In [13]:

    
ldf['total'].plot()









    Out[13]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f83ba6d8860>



In [14]:

    
ldf['total']









    Out[14]:





2014-09-17    5869606
2014-10-13    6589799
2015-07-28    6589799
2015-08-03    6589799
2015-08-09    6609617
2015-08-12    6617934
2015-08-16    6617934
2015-08-21    6630052
2015-08-23    6630052
2015-09-06    6656076
2015-09-13    6680340
2015-09-20    6698141
2015-09-27    6714732
2015-10-13    6763390
2015-10-19    6763390
2015-10-26    6807585
2015-11-02    6824561
2015-11-09    6851235
2015-11-21    6876328
2015-11-22    6876328
2015-11-23    6901147
2015-12-04    6929623
2015-12-21    6987464
2016-01-03    6999542
Name: total, dtype: float64



In [15]:

    
r_total = pd.rolling_mean(ldf['total'], 2, 1)



In [16]:

    
ax = r_total.plot(title="Total Humans in Wikidata Over Time")
ax.get_yaxis().set_major_formatter(
    matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
plt.ylabel('Number of humans')
plt.xlabel('Wikidata snapshot')









    Out[16]:





<matplotlib.text.Text at 0x7f83b96af4e0>



In [17]:

    
r_gendered = pd.rolling_mean(ldf['gender'], 2, 1)
r_gendered.plot(secondary_y=True, title="% of Humans with Gender in Wikidata")









    Out[17]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f83b86591d0>



In [ ]:



In [69]:

    
#populate a dict with numbers:
longitudinal_countrys = {}
longitudinal_citizenships = {}
longitudinal_egs = {}
longitudinal_sites = {}

dict_csv_map = { 'place_of_births-index.csv': longitudinal_countrys,
'citizenships-index.csv' : longitudinal_citizenships,
'ethnic_groups-index.csv': longitudinal_egs,
'site_linkss-index.csv': longitudinal_sites}

for snapdir in snapdirs:
    for csvtype, longitudinal_dict in dict_csv_map.items():
        sitelinkspath = path.join(snapshot_dir,snapdir,'property_indexes',csvtype)
        cdf = pd.read_csv(sitelinkspath,index_col=0).fillna(0)
        csum = cdf.sum()
        total = csum.sum()
        nothaving = cdf.ix[0].sum()
        having = total - nothaving
        if snapdir == '2016-01-03':
            print(csvtype, having)
        having_per = having / float(total)
        longitudinal_dict[snapdir] = having_per









    



2016-01-03 ethnic_groups-index.csv 16881.0
2016-01-03 site_linkss-index.csv 6869792.0
2016-01-03 citizenships-index.csv 1802041.0
2016-01-03 place_of_births-index.csv 927166.0



In [70]:

    
longitudinal_dobs = {}
longitudinal_dods= {}
longitudinal_occ = {}
longitudinal_fow = {}
dict_csv_map = { 'dod-index.csv': longitudinal_dods,
'dob-index.csv' : longitudinal_dobs,
               'field_of_work-index.csv': longitudinal_fow,
               'occupation-index.csv': longitudinal_occ}

for snapdir in snapdirs:
    bigfname = 'gender-index-data-{}.csv'.format(snapdir)
    bigdf = pd.read_csv(path.join(snapshot_dir,snapdir,bigfname))
    humans =  len(bigdf)

    for csvtype, longitudinal_dict in dict_csv_map.items():
        sitelinkspath = path.join(snapshot_dir,snapdir,'property_indexes',csvtype)
        try:
            cdf = pd.read_csv(sitelinkspath,index_col=0).fillna(0)
            csum = cdf.sum()
            total = csum.sum()
        except IOError:
            total = 0
        if snapdir == '2016-01-03':
            print(csvtype, total)
        having_per = total/ float(humans)
        longitudinal_dict[snapdir] = having_per









    



field_of_work-index.csv 7812.0
occupation-index.csv 1782850.0
dob-index.csv 2177738.0
dod-index.csv 1097562.0






    



/usr/lib/python3/dist-packages/pandas/io/parsers.py:1150: DtypeWarning: Columns (4,7) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)



In [73]:

    
java_min_int = -2147483648
bigdf = pd.read_csv(path.join(snapshot_dir,'2016-01-03',bigfname),na_values=[java_min_int])



In [84]:

    
bigdf['dob'].apply(lambda x: not math.isnan(x)).sum()









    Out[84]:





2177602



In [20]:

    
dobdf = pd.DataFrame.from_dict(longitudinal_dobs,orient='index').fillna(0)
dobdf.columns = ['date of birth']
doddf = pd.DataFrame.from_dict(longitudinal_dods,orient='index').fillna(0)
doddf.columns = ['date of death']
citdf = pd.DataFrame.from_dict(longitudinal_citizenships,orient='index').fillna(0)
citdf.columns = ['citizenship']
countrydf = pd.DataFrame.from_dict(longitudinal_countrys,orient='index').fillna(0)
countrydf.columns = ['place of birth']
egdf = pd.DataFrame.from_dict(longitudinal_egs,orient='index').fillna(0)
egdf.columns = ['ethnic group']
fowdf = pd.DataFrame.from_dict(longitudinal_fow,orient='index').fillna(0)
fowdf.columns = ['field of work']
occdf = pd.DataFrame.from_dict(longitudinal_occ,orient='index').fillna(0)
occdf.columns = ['occupation']
sitedf = pd.DataFrame.from_dict(longitudinal_sites,orient='index').fillna(0)
sitedf.columns = ['at least 1 site link']



In [21]:

    
aggdf = dobdf.join(doddf).join(citdf).join(countrydf).join(egdf).join(sitedf).join(ldf['gender']).join(fowdf).join(occdf)



In [22]:

    
aggdf.index = pd.to_datetime(aggdf.index)



In [23]:

    
aggdf.head(24)









    Out[23]:






  
    
      
      date of birth
      date of death
      citizenship
      place of birth
      ethnic group
      at least 1 site link
      gender
      field of work
      occupation
    
  
  
    
      2015-11-21
       0.709272
       0.360168
       0.575037
       0.292995
       0.004189
       0.986067
       0.969182
       0.002574
       0.567623
    
    
      2015-08-03
       0.667751
       0.337255
       0.560040
       0.266263
       0.003391
       0.989771
       0.971760
       0.000000
       0.000000
    
    
      2015-08-09
       0.669641
       0.338769
       0.560887
       0.266321
       0.003396
       0.989634
       0.971606
       0.002548
       0.557724
    
    
      2015-08-16
       0.669993
       0.339033
       0.560946
       0.266382
       0.003559
       0.989620
       0.971775
       0.002551
       0.558631
    
    
      2015-12-04
       0.713121
       0.360298
       0.575444
       0.291580
       0.005109
       0.982407
       0.966120
       0.002559
       0.568582
    
    
      2015-11-22
       0.709272
       0.360168
       0.575037
       0.292995
       0.004189
       0.986067
       0.969182
       0.002574
       0.567623
    
    
      2015-09-13
       0.674378
       0.343581
       0.571644
       0.284418
       0.003554
       0.989358
       0.970622
       0.002555
       0.563043
    
    
      2015-11-23
       0.712787
       0.361195
       0.574745
       0.292245
       0.004536
       0.984862
       0.967960
       0.002569
       0.571086
    
    
      2015-12-21
       0.716232
       0.361078
       0.581477
       0.303596
       0.005413
       0.981570
       0.965295
       0.002571
       0.582688
    
    
      2016-01-03
       0.717249
       0.361488
       0.582185
       0.305102
       0.005559
       0.981463
       0.965416
       0.002573
       0.587191
    
    
      2014-10-13
       0.740799
       0.374148
       0.560040
       0.266263
       0.003391
       0.989771
       0.971760
       0.000000
       0.000000
    
    
      2015-11-02
       0.700751
       0.357170
       0.579068
       0.294258
       0.003605
       0.988327
       0.972842
       0.002568
       0.566348
    
    
      2015-09-27
       0.673458
       0.343068
       0.577288
       0.283896
       0.003558
       0.989244
       0.972117
       0.002553
       0.562528
    
    
      2015-10-13
       0.675539
       0.347995
       0.578089
       0.291084
       0.003584
       0.989097
       0.973522
       0.002561
       0.559922
    
    
      2015-07-28
       0.667751
       0.337255
       0.560040
       0.266263
       0.003391
       0.989771
       0.971760
       0.000000
       0.000000
    
    
      2014-09-17
       0.575736
       0.285827
       0.428217
       0.240124
       0.003109
       0.996164
       0.952879
       0.000000
       0.000000
    
    
      2015-08-23
       0.673135
       0.340938
       0.561927
       0.276930
       0.003561
       0.989528
       0.971867
       0.002556
       0.563302
    
    
      2015-09-06
       0.675439
       0.343853
       0.572870
       0.283842
       0.003567
       0.989411
       0.971435
       0.002561
       0.565040
    
    
      2015-09-20
       0.673881
       0.343459
       0.575933
       0.284116
       0.003569
       0.989312
       0.971570
       0.002552
       0.562254
    
    
      2015-10-26
       0.695236
       0.354858
       0.579960
       0.294588
       0.003597
       0.988819
       0.973161
       0.002568
       0.564161
    
    
      2015-08-12
       0.669993
       0.339033
       0.560946
       0.266382
       0.003559
       0.989620
       0.971775
       0.002551
       0.558631
    
    
      2015-10-19
       0.675539
       0.347995
       0.578089
       0.291084
       0.003584
       0.989097
       0.973522
       0.002561
       0.559922
    
    
      2015-11-09
       0.702570
       0.358822
       0.576679
       0.293571
       0.004148
       0.988171
       0.971133
       0.002571
       0.564788
    
    
      2015-08-21
       0.673135
       0.340938
       0.561927
       0.276930
       0.003561
       0.989528
       0.971867
       0.002556
       0.563302



In [24]:

    
aggdf.ix[[6,5],['gender','date of birth', 'date of death','citizenship','place of birth','ethnic group', 'at least 1 site link']].T









    Out[24]:






  
    
      
      2015-09-13 00:00:00
      2015-11-22 00:00:00
    
  
  
    
      gender
       0.970622
       0.969182
    
    
      date of birth
       0.674378
       0.709272
    
    
      date of death
       0.343581
       0.360168
    
    
      citizenship
       0.571644
       0.575037
    
    
      place of birth
       0.284418
       0.292995
    
    
      ethnic group
       0.003554
       0.004189
    
    
      at least 1 site link
       0.989358
       0.986067



In [25]:

    
print(aggdf.ix[[6,5],['gender','date of birth', 'date of death','citizenship','place of birth','ethnic group','field of work','occupation', 'at least 1 site link']].T.to_latex(float_format=lambda x: "{0:.1f}%".format(x*100)))









    



\begin{tabular}{lrr}
\toprule
{} &  2015-09-13 &  2015-11-22 \\
\midrule
gender               &       97.1\% &       96.9\% \\
date of birth        &       67.4\% &       70.9\% \\
date of death        &       34.4\% &       36.0\% \\
citizenship          &       57.2\% &       57.5\% \\
place of birth       &       28.4\% &       29.3\% \\
ethnic group         &        0.4\% &        0.4\% \\
field of work        &        0.3\% &        0.3\% \\
occupation           &       56.3\% &       56.8\% \\
at least 1 site link &       98.9\% &       98.6\% \\
\bottomrule
\end{tabular}



In [26]:

    
r_aggdf = pd.rolling_mean(aggdf, 2,1)
ax = aggdf.plot( figsize = (5,5), colormap=plt.cm.jet, linewidth=3)
title("Coverage of Accompanying Properties Over Time", y = 1.08,size=18)
legend(bbox_to_anchor=(1.6,0.75))
vals = ax.get_yticks()
ax.set_yticklabels(['{:3.0f}%'.format(x*100) for x in vals])
plt.ylabel('Ratio of biographies with property')
plt.xlabel('Wikidata snapshot')









    Out[26]:





<matplotlib.text.Text at 0x7f83b11ede10>

FRB over time



In [27]:

    
ldf['female%'] = ldf['female'] / ldf['total']



In [28]:

    
ldf['nonbinary'] = ldf['total'] - ldf['female'] - ldf['male']
ldf['nonbinary%'] = ldf['nonbinary'] / ldf['total']



In [29]:

    
r_femper = pd.rolling_mean(ldf['female%'], 2, 1)
ax =r_femper.plot()
title('Female Ratio of Humans in Wikidata Over Time')
vals = ax.get_yticks()
ax.set_yticklabels(['{:3.1f}%'.format(x*100) for x in vals])
plt.ylabel('Female ratio')
plt.xlabel('Wikidata snapshot')









    Out[29]:





<matplotlib.text.Text at 0x7f83966e84a8>



In [30]:

    
twoseries = pd.concat([r_total, r_femper], axis=1)
twoseries.columns = ['Total bios', 'Female ratio']
ax = twoseries.plot(secondary_y=['Female ratio'],figsize=(6,4), linewidth=3)
ax.set_xlabel('Wikidata snapshot')
ax.get_yaxis().set_major_formatter(
    matplotlib.ticker.FuncFormatter(lambda x, p: x/1000000))
ax.set_ylabel('Total biographies (millions)')
ax.right_ax.set_yticklabels(['{:3.1f}%'.format(x*100) for x in ax.right_ax.get_yticks()])
ax.right_ax.set_ylabel('Female ratio of biographies')
plt.title('Human Biographies in Wikidata over Time',y = 1.08,size=18)









    Out[30]:





<matplotlib.text.Text at 0x7f83b07a38d0>



In [31]:

    
ldf['time'] = ldf.index



In [32]:

    
ldf['timesince'] = ldf['time'] - ldf['time'].ix[0]



In [33]:

    
ldf['dayssince'] = ldf['timesince'].dt.days



In [34]:

    
#from pandas.stats.api import ols
#result = ols(y=ldf['female%'], x=ldf['dayssince'])



In [35]:

    
ldf['femper'] = ldf['female%']
ldf['constant'] = 1



In [36]:

    
#import statsmodels.formula.api as sm
#smresult = sm.ols(formula="femper ~ dayssince", data=ldf).fit()



In [37]:

    
#smresult.summary()



In [38]:

    
#populate a dict with numbers:
longitudinal_langs = {}
for snapdir in snapdirs:
    sitelinkspath = path.join(snapshot_dir,snapdir,'property_indexes','site_linkss-index.csv')
    sldf = pd.read_csv(sitelinkspath,index_col=0).fillna(0)
    sldf['total'] = sldf.sum(axis=1)
    sldf['femper'] = sldf['female'] / sldf['total']
    retseries = sldf['femper'][sldf['total']>100000]
    longitudinal_langs[snapdir] = retseries



In [39]:

    
langdf = pd.DataFrame.from_dict(longitudinal_langs, orient='index')



In [40]:

    
langdf.dropna(axis=1, how='any', inplace=True)



In [41]:

    
langdf



In [42]:

    
langdf.index = pd.to_datetime(langdf.index)



In [43]:

    
font = {'family': 'serif',
        'color':  'darkred',
        'weight': 'normal',
        'size': 16,
        }



In [44]:

    
import json
wikicodes = json.load(open('/home/notconfusing/workspace/WIGI/helpers/wiki_code_map.json','r'))
def lookup_wikicode(wikicode):
    letters = wikicode.split('wiki')[0]
    fullname = wikicodes[letters]
    shortname = fullname.split(' ')[0]
    return shortname
lookup_wikicode('enwiki')









    Out[44]:





'English'



In [45]:

    
ax = langdf.plot(legend=False, figsize=(8,12))
for wiki in langdf:
    offset=5 if wiki == 'jawiki' else 0
    ax.annotate(wiki, (langdf.index[-1],langdf[wiki].ix['2016-01-03']), xytext=(2, 0+offset), 
                textcoords='offset points')
    #plt.text(langdf.index[-1],langdf[wiki].ix['2016-01-03'], wiki, fontdict=font)
ax.yaxis.set_major_formatter(
    FuncFormatter(lambda y,pos: ('{0:.2f}'.format(y*100)).rstrip('0').rstrip('.')+'%'))
plt.ylabel('Female ratio of biographies')
plt.xlabel('Wikidata snapshot')
plt.title('Change in Female Ratio of Biographies in \n Oct 2014 - Jan 2016',size=24)
plt.show()



In [ ]:



In [46]:

    
begin, end = '2015-07-28', '2016-01-03'
endsorted = pd.DataFrame(langdf.ix[end]).sort(columns=['2016-01-03 00:00:00'])
oddeven = {wiki: pos % 2 for pos, wiki in list(enumerate(endsorted.index))}



ax = langdf.ix[begin:end].plot(legend=False, figsize=(4,8))
for wiki in langdf:
    side = begin if oddeven[wiki] == 0 else end
    offset = 6 if wiki == 'jawiki' else 0
    ax.annotate(lookup_wikicode(wiki), (end ,langdf[wiki].ix[end]), xytext=(2, 0+offset), 
                textcoords='offset points')
    #plt.text(langdf.index[-1],langdf[wiki].ix['2016-01-03'], wiki, fontdict=font)
ax.yaxis.set_major_formatter(
    FuncFormatter(lambda y,pos: ('{0:.2f}'.format(y*100)).rstrip('0').rstrip('.')+'%'))
plt.ylabel('Female ratio of biographies')
plt.xlabel('Wikidata snapshot')
plt.title('Change in Female Ratio of Biographies in \n July 2015 - Jan 2016',size=17)
plt.show()



In [47]:

    
#populate a dict with numbers:
long_fem_per = {}
long_tot = {}
for snapdir in [min(snapdirs), max(snapdirs)]:
    sitelinkspath = path.join(snapshot_dir,snapdir,'property_indexes','site_linkss-index.csv')
    sldf = pd.read_csv(sitelinkspath,index_col=0).fillna(0)
    sldf['total'] = sldf.sum(axis=1)
    sldf['femper'] = sldf['female'] / sldf['total']
    femperseries = sldf['femper'][sldf['total']>100000]
    totalseries = sldf['total'][sldf['total']>100000]
    long_fem_per[snapdir] = femperseries
    long_tot[snapdir] = totalseries



In [48]:

    
femperdf = pd.DataFrame.from_dict(long_fem_per)
totaldf = pd.DataFrame.from_dict(long_tot)



In [49]:

    
arrowdf =femperdf.join(totaldf, lsuffix='femper', rsuffix='total').T



In [50]:

    
arrowdf = arrowdf.T.dropna(how='any').T



In [51]:

    
arrowdf.ix[1]-arrowdf.ix[0]









    Out[51]:





dewiki    0.001822
enwiki    0.004789
eswiki    0.009225
fiwiki    0.013299
frwiki    0.003542
itwiki    0.001296
jawiki    0.047647
nlwiki    0.005291
nowiki    0.006824
plwiki   -0.001679
ptwiki    0.003673
ruwiki    0.001085
svwiki    0.000601
zhwiki   -0.007243
dtype: float64



In [52]:

    
import math

fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(111)


for wiki in arrowdf:
    plt.arrow(arrowdf[wiki].ix[2],  #x1
              arrowdf[wiki].ix[0],  # y1
              arrowdf[wiki].ix[3]-arrowdf[wiki].ix[2], # x2 - x1
              arrowdf[wiki].ix[1]-arrowdf[wiki].ix[0], # y2 - y1
             head_width=0.005, head_length=10000, fc='k', ec='k')
    xoffset = 0 if wiki not in ['fiwiki'] else 10000
    yoffset = 0 if wiki not in ['jawiki','nowiki'] else 0.005
    plt.text(arrowdf[wiki].ix[3]+xoffset, arrowdf[wiki].ix[1]**0.99+yoffset, lookup_wikicode(wiki), fontdict=font)
plt.ylim(0.12, 0.22)
plt.ylabel("Female ratio of biographies",size=18)
plt.xlim(100000,1500000)
plt.xlabel("Total Biographies (log scale)", size=18)
plt.xscale('log')
plt.xticks(size=18)

ax.yaxis.set_major_formatter(
    FuncFormatter(lambda y,pos: ('{0:.2f}'.format(y*100)).rstrip('0').rstrip('.')+'%'))
plt.yticks(size=18)
plt.title('Change in Female Ratio of Biographies and Size \n September 17 2014 - January 3 2016', size=24)









    Out[52]:





<matplotlib.text.Text at 0x7f83916424a8>



In [53]:

    
fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(111)


for wiki in arrowdf:
    plt.arrow(arrowdf[wiki].ix[0],  #x1
              arrowdf[wiki].ix[2],  # y1
              arrowdf[wiki].ix[1]-arrowdf[wiki].ix[0], # x2 - x1
              arrowdf[wiki].ix[3]-arrowdf[wiki].ix[2], # y2 - y1
             head_width=0.005, head_length=10000, fc='k', ec='k')
    yoffset = 0 if wiki not in ['fiwiki','ruwiki','nlwiki','nowiki'] else 10000
    xoffset = 0 if wiki not in ['jawiki'] else 0.01
    plt.text(arrowdf[wiki].ix[1]**0.99+xoffset, arrowdf[wiki].ix[3]+yoffset, lookup_wikicode(wiki), fontdict=font)
plt.xlim(0.12, 0.22)
plt.xlabel("Female ratio of biographies",size=18)
plt.ylim(100000,1500000)
plt.ylabel("Total Biographies (log scale)", size=18)
plt.yscale('log')
plt.yticks(size=18)

ax.xaxis.set_major_formatter(
    FuncFormatter(lambda y,pos: ('{0:.2f}'.format(y*100)).rstrip('0').rstrip('.')+'%'))
plt.xticks(size=18)
plt.title('Change in Female Ratio of Biographies and Size \n September 17 2014 - January 3 2016', size=24)









    Out[53]:





<matplotlib.text.Text at 0x7f8390e66d68>



In [54]:

    
#populate a dict with numbers:
eng_fem_totals = {}
all_fem_totals ={}
for snapdir in snapdirs:
    sitelinkspath = path.join(snapshot_dir,snapdir,'property_indexes','site_linkss-index.csv')
    sldf = pd.read_csv(sitelinkspath,index_col=0).fillna(0)
    eng_fem = sldf.ix['enwiki']['female']
    all_fem = sldf['female'].sum()
    snapdate = pd.to_datetime(snapdir)
    eng_fem_totals[snapdate] = eng_fem
    all_fem_totals[snapdate] = all_fem

engfemdf = pd.DataFrame.from_dict(eng_fem_totals,orient='index').fillna(0)
engfemdf.columns = ['eng_fems']
engfemdf = engfemdf.sort_index()

allfemdf = pd.DataFrame.from_dict(all_fem_totals,orient='index').fillna(0)
allfemdf.columns = ['all_fems']
allfemdf = allfemdf.sort_index()



In [55]:

    
engfemdf['added']= engfemdf['eng_fems']-engfemdf['eng_fems'].shift()
allfemdf['added']= allfemdf['all_fems']-allfemdf['all_fems'].shift()



In [56]:

    
engfemdf['month'] = engfemdf.index.map(lambda x: x.month if x.year == 2015 else 0)
allfemdf['month'] = allfemdf.index.map(lambda x: x.month if x.year == 2015 else 0)



In [57]:

    
eng_months = engfemdf.groupby('month')
all_months = allfemdf.groupby('month')



In [58]:

    
wd_eng_months = eng_months.sum().ix[8:12]
wd_all_months = all_months.sum().ix[8:12]



In [59]:

    
wir_data = {8:1854, 9:1590,10:1989,11:1787,12:1473}
wir_months = pd.DataFrame.from_dict(wir_data, orient='index')
wir_months.columns=['added']



In [60]:

    
addeddf = wir_months.join(wd_eng_months,lsuffix='wir',rsuffix='wd')
alladdeddf= wir_months.join(wd_all_months,lsuffix='wir',rsuffix='wd')



In [61]:

    
del addeddf['eng_fems']
del alladdeddf['all_fems']



In [62]:

    
alladdeddf.plot()









    Out[62]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f8391e47358>



In [63]:

    
addeddf.corr()









    Out[63]:






  
    
      
      addedwir
      addedwd
    
  
  
    
      addedwir
       1.000000
       0.083634
    
    
      addedwd
       0.083634
       1.000000



In [64]:

    
alladdeddf.corr()









    Out[64]:






  
    
      
      addedwir
      addedwd
    
  
  
    
      addedwir
       1.000000
       0.225557
    
    
      addedwd
       0.225557
       1.000000



In [65]:

    
alladdeddf['addedwir'].corr(alladdeddf['addedwd'].shift())









    Out[65]:





0.61052951838651781



In [66]:

    
smooth = pd.rolling_mean(alladdeddf, window=2, min_periods=2).dropna()



In [67]:

    
slope, intercept, r_value, p_value, std_er = linregress(smooth['addedwir'],smooth['addedwd'])
print(r_value, p_value)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-67-d46b1d5fc253> in <module>()
----> 1 slope, intercept, r_value, p_value, std_er = linregress(smooth['addedwir'],smooth['addedwd'])
      2 print(r_value, p_value)

NameError: name 'linregress' is not defined



In [ ]:

    
np.correlate(addeddf['addedwir'],addeddf['addedwd'])



In [ ]:

    
from scipy.stats import linregress



In [ ]:

    
slope, intercept, r_value, p_value, std_er = linregress(alladdeddf['addedwir'][-4:],alladdeddf['addedwd'].shift()[-4:])



In [ ]:

    
r_value, p_value



In [ ]:

    
def mon2015(x):
    dt = pd.to_datetime(x)
    return x.month if x.year == 2015 else 0

enwiki['months'] = langdf.index.map(mon2015)

enmons = .groupby(by='months')



In [ ]:

    
wir_months



In [ ]:

    
yeardiff = langdf.ix[-1] - langdf.ix[0]



In [ ]:

    
yeardiff.sort(ascending=False)
yeardiff.hist()



In [ ]:

    
print pd.DataFrame(yeardiff).ix[0:10].to_latex(float_format=lambda x: "{0:.2f}%".format(x*100))



In [ ]:

    
pd.DataFrame(yeardiff).ix['enwiki']



In [ ]:

    
def trend(langseries):
    result = ols(y=langseries, x=ldf['dayssince'])
    return result.beta[0]



In [ ]:

    
trends = langdf.apply(lambda x: trend(x), axis=0)



In [ ]:

    
trends.sort(ascending=False)
trends



In [ ]:

    
def splitcol(i):
    return i.split("|")



In [ ]:

    
#av idols adult video
av_tot = {}
act_tot ={}
for snapdir in snapdirs:
    sitelinkspath = path.join(snapshot_dir,snapdir,'property_indexes','occupation-index.csv')
    try:
        avdf = pd.read_csv(sitelinkspath,index_col=0).fillna(0)
        avindexes = avdf.index.map(lambda x: 'Q1079215' in x)
        avtotal = avdf[avindexes].sum().sum()
        av_tot[snapdir] = avtotal
        actindexes = avdf.index.map(lambda x: 'Q488111' in x)
        acttotal = avdf[actindexes].sum().sum()
        act_tot[snapdir] = acttotal
    except OSError: #no occupation yet
        pass



In [ ]:

    
act_tot



In [ ]:



In [ ]:

	dewiki	enwiki	eswiki	fiwiki	frwiki	itwiki	jawiki	nlwiki	nowiki	plwiki	ptwiki	ruwiki	svwiki	zhwiki
2014-09-17	0.149411	0.154439	0.158812	0.159695	0.151706	0.143675	0.126048	0.155467	0.199618	0.155391	0.160003	0.135623	0.197613	0.137898
2014-10-13	0.150655	0.157080	0.165257	0.172855	0.153639	0.145269	0.174524	0.160288	0.204345	0.153818	0.163643	0.136425	0.198135	0.136322
2015-07-28	0.150655	0.157080	0.165257	0.172855	0.153639	0.145269	0.174524	0.160288	0.204345	0.153818	0.163643	0.136425	0.198135	0.136322
2015-08-03	0.150655	0.157080	0.165257	0.172855	0.153639	0.145269	0.174524	0.160288	0.204345	0.153818	0.163643	0.136425	0.198135	0.136322
2015-08-09	0.150725	0.156740	0.165606	0.172678	0.153615	0.145325	0.174614	0.160279	0.204307	0.153780	0.163508	0.136546	0.198180	0.136776
2015-08-12	0.150796	0.156791	0.165613	0.172671	0.153971	0.145331	0.174670	0.160253	0.204418	0.153765	0.163371	0.136678	0.198229	0.136972
2015-08-16	0.150796	0.156791	0.165613	0.172671	0.153971	0.145331	0.174670	0.160253	0.204418	0.153765	0.163371	0.136678	0.198229	0.136972
2015-08-21	0.150881	0.156708	0.165541	0.172636	0.153999	0.145287	0.174978	0.160219	0.204455	0.153742	0.163389	0.136647	0.198234	0.137294
2015-08-23	0.150881	0.156708	0.165541	0.172636	0.153999	0.145287	0.174978	0.160219	0.204455	0.153742	0.163389	0.136647	0.198234	0.137294
2015-09-06	0.150958	0.156835	0.165377	0.172876	0.153993	0.145247	0.174302	0.160349	0.204367	0.153490	0.163401	0.136669	0.198290	0.137378
2015-09-13	0.150977	0.157026	0.165736	0.172606	0.154115	0.145235	0.174541	0.160562	0.205009	0.153453	0.163085	0.135852	0.198557	0.132569
2015-09-20	0.151077	0.157484	0.166200	0.171887	0.154168	0.145325	0.174649	0.160524	0.205385	0.153507	0.163036	0.136702	0.198581	0.131109
2015-09-27	0.151119	0.157667	0.166350	0.172374	0.154301	0.145290	0.174746	0.160376	0.205325	0.153541	0.162862	0.136972	0.198562	0.131097
2015-10-13	0.151115	0.158535	0.166726	0.172636	0.154417	0.145374	0.174496	0.160373	0.206378	0.153805	0.163798	0.136952	0.198443	0.131029
2015-10-19	0.151115	0.158535	0.166726	0.172636	0.154417	0.145374	0.174496	0.160373	0.206378	0.153805	0.163798	0.136952	0.198443	0.131029
2015-10-26	0.151140	0.158520	0.167108	0.172858	0.154428	0.145324	0.174517	0.160449	0.206349	0.153750	0.163883	0.137034	0.198529	0.130892
2015-11-02	0.151159	0.158632	0.167167	0.172880	0.154494	0.145181	0.174514	0.160469	0.206377	0.153424	0.163601	0.136951	0.198509	0.130923
2015-11-09	0.151207	0.158766	0.167338	0.172881	0.154680	0.145141	0.174511	0.160572	0.206527	0.153795	0.163693	0.137021	0.198441	0.131021
2015-11-21	0.151209	0.158802	0.167377	0.172881	0.154751	0.145069	0.174499	0.160539	0.206638	0.153803	0.164024	0.137165	0.198397	0.131047
2015-11-22	0.151209	0.158802	0.167377	0.172881	0.154751	0.145069	0.174499	0.160539	0.206638	0.153803	0.164024	0.137165	0.198397	0.131047
2015-11-23	0.151150	0.158836	0.167476	0.172894	0.154730	0.145055	0.174509	0.160600	0.206734	0.153782	0.163952	0.137242	0.198398	0.130724
2015-12-04	0.151173	0.158876	0.167620	0.172902	0.154954	0.145037	0.174527	0.160635	0.206731	0.153744	0.163872	0.137208	0.198219	0.130603
2015-12-21	0.151218	0.159199	0.168030	0.173033	0.155225	0.145170	0.173655	0.160725	0.206923	0.153685	0.163633	0.137107	0.198193	0.130627
2016-01-03	0.151233	0.159228	0.168038	0.172994	0.155248	0.144971	0.173695	0.160758	0.206442	0.153712	0.163676	0.136708	0.198214	0.130656

	date of birth	date of death	citizenship	place of birth	ethnic group	at least 1 site link	gender	field of work	occupation
2015-11-21	0.709272	0.360168	0.575037	0.292995	0.004189	0.986067	0.969182	0.002574	0.567623
2015-08-03	0.667751	0.337255	0.560040	0.266263	0.003391	0.989771	0.971760	0.000000	0.000000
2015-08-09	0.669641	0.338769	0.560887	0.266321	0.003396	0.989634	0.971606	0.002548	0.557724
2015-08-16	0.669993	0.339033	0.560946	0.266382	0.003559	0.989620	0.971775	0.002551	0.558631
2015-12-04	0.713121	0.360298	0.575444	0.291580	0.005109	0.982407	0.966120	0.002559	0.568582
2015-11-22	0.709272	0.360168	0.575037	0.292995	0.004189	0.986067	0.969182	0.002574	0.567623
2015-09-13	0.674378	0.343581	0.571644	0.284418	0.003554	0.989358	0.970622	0.002555	0.563043
2015-11-23	0.712787	0.361195	0.574745	0.292245	0.004536	0.984862	0.967960	0.002569	0.571086
2015-12-21	0.716232	0.361078	0.581477	0.303596	0.005413	0.981570	0.965295	0.002571	0.582688
2016-01-03	0.717249	0.361488	0.582185	0.305102	0.005559	0.981463	0.965416	0.002573	0.587191
2014-10-13	0.740799	0.374148	0.560040	0.266263	0.003391	0.989771	0.971760	0.000000	0.000000
2015-11-02	0.700751	0.357170	0.579068	0.294258	0.003605	0.988327	0.972842	0.002568	0.566348
2015-09-27	0.673458	0.343068	0.577288	0.283896	0.003558	0.989244	0.972117	0.002553	0.562528
2015-10-13	0.675539	0.347995	0.578089	0.291084	0.003584	0.989097	0.973522	0.002561	0.559922
2015-07-28	0.667751	0.337255	0.560040	0.266263	0.003391	0.989771	0.971760	0.000000	0.000000
2014-09-17	0.575736	0.285827	0.428217	0.240124	0.003109	0.996164	0.952879	0.000000	0.000000
2015-08-23	0.673135	0.340938	0.561927	0.276930	0.003561	0.989528	0.971867	0.002556	0.563302
2015-09-06	0.675439	0.343853	0.572870	0.283842	0.003567	0.989411	0.971435	0.002561	0.565040
2015-09-20	0.673881	0.343459	0.575933	0.284116	0.003569	0.989312	0.971570	0.002552	0.562254
2015-10-26	0.695236	0.354858	0.579960	0.294588	0.003597	0.988819	0.973161	0.002568	0.564161
2015-08-12	0.669993	0.339033	0.560946	0.266382	0.003559	0.989620	0.971775	0.002551	0.558631
2015-10-19	0.675539	0.347995	0.578089	0.291084	0.003584	0.989097	0.973522	0.002561	0.559922
2015-11-09	0.702570	0.358822	0.576679	0.293571	0.004148	0.988171	0.971133	0.002571	0.564788
2015-08-21	0.673135	0.340938	0.561927	0.276930	0.003561	0.989528	0.971867	0.002556	0.563302

	2015-09-13 00:00:00	2015-11-22 00:00:00
gender	0.970622	0.969182
date of birth	0.674378	0.709272
date of death	0.343581	0.360168
citizenship	0.571644	0.575037
place of birth	0.284418	0.292995
ethnic group	0.003554	0.004189
at least 1 site link	0.989358	0.986067