In [1]:
import pandas as pd
from os import path
%pylab inline
#plt.style.use('fivethirtyeight')
plt.style.use('ggplot')
import matplotlib as mpl
mpl.rc("savefig", dpi=300)


Populating the interactive namespace from numpy and matplotlib
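
Note: %pylab is deprecated in newer IPython releases; it pulls the numpy and matplotlib namespaces into the session. A minimal equivalent setup for a current environment (a sketch, not part of the original session):

%matplotlib inline
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.style.use('ggplot')
mpl.rc("savefig", dpi=300)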

In [2]:
snapshot_dir = '/media/notconfusing/9d9b45fc-55f7-428c-a228-1c4c4a1b728c/home/maximilianklein/snapshot_data/'

In [3]:
lssnaps = !ls $snapshot_dir

In [4]:
snapdirs = [snap for snap in lssnaps if snap[0].isdigit()]

In [5]:
all_genders = set()
#find the genders
for snapdir in snapdirs:
    sitelinkspath = path.join(snapshot_dir,snapdir,'property_indexes','site_linkss-index.csv')
    sldf = pd.read_csv(sitelinkspath).fillna(0)
    snap_genders = sldf.columns
    for g in snap_genders: 
        if g.endswith('1'):
            print(sitelinkspath)  # report snapshots with a column name ending in '1' (e.g. pandas' '.1' suffix for duplicate headers)
        all_genders.add(g)

In [6]:
#populate a dict with numbers:
longitudinal_totals = {}
for snapdir in snapdirs:
    sitelinkspath = path.join(snapshot_dir,snapdir,'property_indexes','site_linkss-index.csv')
    sldf = pd.read_csv(sitelinkspath,index_col=0).fillna(0)
    snap_genders = sldf.columns
    gendersum = sldf.sum()
    longitudinal_totals[snapdir] = gendersum

ldf = pd.DataFrame.from_dict(longitudinal_totals,orient='index').fillna(0)

In [7]:
print(len(ldf.columns))


37

In [8]:
ldf.index = pd.to_datetime(ldf.index)

In [9]:
ldf['total'] = ldf.sum(axis=1)

In [10]:
ldf['total']


Out[10]:
2014-09-17    5869606
2014-10-13    6589799
2015-07-28    6589799
2015-08-03    6589799
2015-08-09    6609617
2015-08-12    6617934
2015-08-16    6617934
2015-08-21    6630052
2015-08-23    6630052
2015-09-06    6656076
2015-09-13    6680340
2015-09-20    6698141
2015-09-27    6714732
2015-10-13    6763390
2015-10-19    6763390
2015-10-26    6807585
2015-11-02    6824561
2015-11-09    6851235
2015-11-21    6876328
2015-11-22    6876328
2015-11-23    6901147
2015-12-04    6929623
2015-12-21    6987464
2016-01-03    6999542
Name: total, dtype: float64

In [11]:
ldf['ungendered'] = ldf['nan'] / ldf['total']  # the 'nan' column holds records with no gender recorded

In [12]:
ldf['gender'] = 1 - ldf['ungendered']

In [13]:
ldf['total'].plot()


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f83ba6d8860>


In [15]:
r_total = pd.rolling_mean(ldf['total'], 2, 1)
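
pd.rolling_mean(series, window, min_periods) was the pandas API when this notebook ran; on pandas 0.18 and later the same smoothing is written with the rolling accessor. An equivalent sketch:

# 2-snapshot rolling mean with min_periods=1, matching pd.rolling_mean(ldf['total'], 2, 1)
r_total = ldf['total'].rolling(window=2, min_periods=1).mean()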

In [16]:
ax = r_total.plot(title="Total Humans in Wikidata Over Time")
ax.get_yaxis().set_major_formatter(
    matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
plt.ylabel('Number of humans')
plt.xlabel('Wikidata snapshot')


Out[16]:
<matplotlib.text.Text at 0x7f83b96af4e0>

In [17]:
r_gendered = pd.rolling_mean(ldf['gender'], 2, 1)
r_gendered.plot(secondary_y=True, title="% of Humans with Gender in Wikidata")


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f83b86591d0>

In [69]:
#populate a dict with numbers:
longitudinal_countrys = {}
longitudinal_citizenships = {}
longitudinal_egs = {}
longitudinal_sites = {}

dict_csv_map = { 'place_of_births-index.csv': longitudinal_countrys,
'citizenships-index.csv' : longitudinal_citizenships,
'ethnic_groups-index.csv': longitudinal_egs,
'site_linkss-index.csv': longitudinal_sites}

for snapdir in snapdirs:
    for csvtype, longitudinal_dict in dict_csv_map.items():
        sitelinkspath = path.join(snapshot_dir,snapdir,'property_indexes',csvtype)
        cdf = pd.read_csv(sitelinkspath,index_col=0).fillna(0)
        csum = cdf.sum()
        total = csum.sum()
        nothaving = cdf.ix[0].sum()
        having = total - nothaving
        if snapdir == '2016-01-03':
            print(snapdir, csvtype, having)
        having_per = having / float(total)
        longitudinal_dict[snapdir] = having_per


2016-01-03 ethnic_groups-index.csv 16881.0
2016-01-03 site_linkss-index.csv 6869792.0
2016-01-03 citizenships-index.csv 1802041.0
2016-01-03 place_of_births-index.csv 927166.0
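
The loop above treats the row indexed 0 of each property-index CSV as "no value recorded" and reports the remainder as coverage. The same computation, factored into a small helper (a sketch, not part of the original notebook; it keeps the original's .ix call):

# coverage of a property: share of humans with at least one value,
# where the row indexed 0 counts those lacking the property
def property_coverage(csv_path):
    cdf = pd.read_csv(csv_path, index_col=0).fillna(0)
    total = cdf.sum().sum()
    nothaving = cdf.ix[0].sum()
    return (total - nothaving) / float(total)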

In [70]:
longitudinal_dobs = {}
longitudinal_dods= {}
longitudinal_occ = {}
longitudinal_fow = {}
dict_csv_map = { 'dod-index.csv': longitudinal_dods,
'dob-index.csv' : longitudinal_dobs,
               'field_of_work-index.csv': longitudinal_fow,
               'occupation-index.csv': longitudinal_occ}

for snapdir in snapdirs:
    bigfname = 'gender-index-data-{}.csv'.format(snapdir)
    bigdf = pd.read_csv(path.join(snapshot_dir,snapdir,bigfname))
    humans =  len(bigdf)

    for csvtype, longitudinal_dict in dict_csv_map.items():
        sitelinkspath = path.join(snapshot_dir,snapdir,'property_indexes',csvtype)
        try:
            cdf = pd.read_csv(sitelinkspath,index_col=0).fillna(0)
            csum = cdf.sum()
            total = csum.sum()
        except IOError:
            total = 0
        if snapdir == '2016-01-03':
            print(csvtype, total)
        having_per = total/ float(humans)
        longitudinal_dict[snapdir] = having_per


field_of_work-index.csv 7812.0
occupation-index.csv 1782850.0
dob-index.csv 2177738.0
dod-index.csv 1097562.0
/usr/lib/python3/dist-packages/pandas/io/parsers.py:1150: DtypeWarning: Columns (4,7) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)

In [73]:
java_min_int = -2147483648
bigfname = 'gender-index-data-2016-01-03.csv'  # make the snapshot explicit rather than relying on the loop variable above
bigdf = pd.read_csv(path.join(snapshot_dir,'2016-01-03',bigfname),na_values=[java_min_int])

In [84]:
bigdf['dob'].apply(lambda x: not math.isnan(x)).sum()


Out[84]:
2177602
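
The same count can be expressed with pandas' missing-value handling instead of a per-element math.isnan call. An equivalent sketch:

# count rows with a non-missing date of birth
bigdf['dob'].notnull().sum()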

In [20]:
dobdf = pd.DataFrame.from_dict(longitudinal_dobs,orient='index').fillna(0)
dobdf.columns = ['date of birth']
doddf = pd.DataFrame.from_dict(longitudinal_dods,orient='index').fillna(0)
doddf.columns = ['date of death']
citdf = pd.DataFrame.from_dict(longitudinal_citizenships,orient='index').fillna(0)
citdf.columns = ['citizenship']
countrydf = pd.DataFrame.from_dict(longitudinal_countrys,orient='index').fillna(0)
countrydf.columns = ['place of birth']
egdf = pd.DataFrame.from_dict(longitudinal_egs,orient='index').fillna(0)
egdf.columns = ['ethnic group']
fowdf = pd.DataFrame.from_dict(longitudinal_fow,orient='index').fillna(0)
fowdf.columns = ['field of work']
occdf = pd.DataFrame.from_dict(longitudinal_occ,orient='index').fillna(0)
occdf.columns = ['occupation']
sitedf = pd.DataFrame.from_dict(longitudinal_sites,orient='index').fillna(0)
sitedf.columns = ['at least 1 site link']

In [21]:
aggdf = dobdf.join(doddf).join(citdf).join(countrydf).join(egdf).join(sitedf).join(ldf['gender']).join(fowdf).join(occdf)

In [22]:
aggdf.index = pd.to_datetime(aggdf.index)

In [23]:
aggdf.head(24)


Out[23]:
date of birth date of death citizenship place of birth ethnic group at least 1 site link gender field of work occupation
2015-11-21 0.709272 0.360168 0.575037 0.292995 0.004189 0.986067 0.969182 0.002574 0.567623
2015-08-03 0.667751 0.337255 0.560040 0.266263 0.003391 0.989771 0.971760 0.000000 0.000000
2015-08-09 0.669641 0.338769 0.560887 0.266321 0.003396 0.989634 0.971606 0.002548 0.557724
2015-08-16 0.669993 0.339033 0.560946 0.266382 0.003559 0.989620 0.971775 0.002551 0.558631
2015-12-04 0.713121 0.360298 0.575444 0.291580 0.005109 0.982407 0.966120 0.002559 0.568582
2015-11-22 0.709272 0.360168 0.575037 0.292995 0.004189 0.986067 0.969182 0.002574 0.567623
2015-09-13 0.674378 0.343581 0.571644 0.284418 0.003554 0.989358 0.970622 0.002555 0.563043
2015-11-23 0.712787 0.361195 0.574745 0.292245 0.004536 0.984862 0.967960 0.002569 0.571086
2015-12-21 0.716232 0.361078 0.581477 0.303596 0.005413 0.981570 0.965295 0.002571 0.582688
2016-01-03 0.717249 0.361488 0.582185 0.305102 0.005559 0.981463 0.965416 0.002573 0.587191
2014-10-13 0.740799 0.374148 0.560040 0.266263 0.003391 0.989771 0.971760 0.000000 0.000000
2015-11-02 0.700751 0.357170 0.579068 0.294258 0.003605 0.988327 0.972842 0.002568 0.566348
2015-09-27 0.673458 0.343068 0.577288 0.283896 0.003558 0.989244 0.972117 0.002553 0.562528
2015-10-13 0.675539 0.347995 0.578089 0.291084 0.003584 0.989097 0.973522 0.002561 0.559922
2015-07-28 0.667751 0.337255 0.560040 0.266263 0.003391 0.989771 0.971760 0.000000 0.000000
2014-09-17 0.575736 0.285827 0.428217 0.240124 0.003109 0.996164 0.952879 0.000000 0.000000
2015-08-23 0.673135 0.340938 0.561927 0.276930 0.003561 0.989528 0.971867 0.002556 0.563302
2015-09-06 0.675439 0.343853 0.572870 0.283842 0.003567 0.989411 0.971435 0.002561 0.565040
2015-09-20 0.673881 0.343459 0.575933 0.284116 0.003569 0.989312 0.971570 0.002552 0.562254
2015-10-26 0.695236 0.354858 0.579960 0.294588 0.003597 0.988819 0.973161 0.002568 0.564161
2015-08-12 0.669993 0.339033 0.560946 0.266382 0.003559 0.989620 0.971775 0.002551 0.558631
2015-10-19 0.675539 0.347995 0.578089 0.291084 0.003584 0.989097 0.973522 0.002561 0.559922
2015-11-09 0.702570 0.358822 0.576679 0.293571 0.004148 0.988171 0.971133 0.002571 0.564788
2015-08-21 0.673135 0.340938 0.561927 0.276930 0.003561 0.989528 0.971867 0.002556 0.563302

In [24]:
aggdf.ix[[6,5],['gender','date of birth', 'date of death','citizenship','place of birth','ethnic group', 'at least 1 site link']].T


Out[24]:
2015-09-13 00:00:00 2015-11-22 00:00:00
gender 0.970622 0.969182
date of birth 0.674378 0.709272
date of death 0.343581 0.360168
citizenship 0.571644 0.575037
place of birth 0.284418 0.292995
ethnic group 0.003554 0.004189
at least 1 site link 0.989358 0.986067

In [25]:
print(aggdf.ix[[6,5],['gender','date of birth', 'date of death','citizenship','place of birth','ethnic group','field of work','occupation', 'at least 1 site link']].T.to_latex(float_format=lambda x: "{0:.1f}%".format(x*100)))


\begin{tabular}{lrr}
\toprule
{} &  2015-09-13 &  2015-11-22 \\
\midrule
gender               &       97.1\% &       96.9\% \\
date of birth        &       67.4\% &       70.9\% \\
date of death        &       34.4\% &       36.0\% \\
citizenship          &       57.2\% &       57.5\% \\
place of birth       &       28.4\% &       29.3\% \\
ethnic group         &        0.4\% &        0.4\% \\
field of work        &        0.3\% &        0.3\% \\
occupation           &       56.3\% &       56.8\% \\
at least 1 site link &       98.9\% &       98.6\% \\
\bottomrule
\end{tabular}
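
.ix mixes label and positional indexing and was removed in later pandas. On a current version the positional row selection above would use .iloc followed by a label-based column subset, for example (a sketch):

# same two snapshots selected by position, property columns selected by label
aggdf.iloc[[6, 5]][['gender', 'date of birth', 'date of death', 'citizenship',
                    'place of birth', 'ethnic group', 'field of work',
                    'occupation', 'at least 1 site link']].T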


In [26]:
r_aggdf = pd.rolling_mean(aggdf, 2, 1)  # 2-snapshot rolling mean; note the unsmoothed aggdf is what gets plotted below
ax = aggdf.plot( figsize = (5,5), colormap=plt.cm.jet, linewidth=3)
title("Coverage of Accompanying Properties Over Time", y = 1.08,size=18)
legend(bbox_to_anchor=(1.6,0.75))
vals = ax.get_yticks()
ax.set_yticklabels(['{:3.0f}%'.format(x*100) for x in vals])
plt.ylabel('Ratio of biographies with property')
plt.xlabel('Wikidata snapshot')


Out[26]:
<matplotlib.text.Text at 0x7f83b11ede10>

Female Ratio of Biographies (FRB) over time


In [27]:
ldf['female%'] = ldf['female'] / ldf['total']

In [28]:
ldf['nonbinary'] = ldf['total'] - ldf['female'] - ldf['male']
ldf['nonbinary%'] = ldf['nonbinary'] / ldf['total']

In [29]:
r_femper = pd.rolling_mean(ldf['female%'], 2, 1)
ax =r_femper.plot()
title('Female Ratio of Humans in Wikidata Over Time')
vals = ax.get_yticks()
ax.set_yticklabels(['{:3.1f}%'.format(x*100) for x in vals])
plt.ylabel('Female ratio')
plt.xlabel('Wikidata snapshot')


Out[29]:
<matplotlib.text.Text at 0x7f83966e84a8>

In [30]:
twoseries = pd.concat([r_total, r_femper], axis=1)
twoseries.columns = ['Total bios', 'Female ratio']
ax = twoseries.plot(secondary_y=['Female ratio'],figsize=(6,4), linewidth=3)
ax.set_xlabel('Wikidata snapshot')
ax.get_yaxis().set_major_formatter(
    matplotlib.ticker.FuncFormatter(lambda x, p: x/1000000))
ax.set_ylabel('Total biographies (millions)')
ax.right_ax.set_yticklabels(['{:3.1f}%'.format(x*100) for x in ax.right_ax.get_yticks()])
ax.right_ax.set_ylabel('Female ratio of biographies')
plt.title('Human Biographies in Wikidata over Time',y = 1.08,size=18)


Out[30]:
<matplotlib.text.Text at 0x7f83b07a38d0>

In [31]:
ldf['time'] = ldf.index

In [32]:
ldf['timesince'] = ldf['time'] - ldf['time'].ix[0]

In [33]:
ldf['dayssince'] = ldf['timesince'].dt.days

In [34]:
#from pandas.stats.api import ols
#result = ols(y=ldf['female%'], x=ldf['dayssince'])

In [35]:
ldf['femper'] = ldf['female%']
ldf['constant'] = 1

In [36]:
#import statsmodels.formula.api as sm
#smresult = sm.ols(formula="femper ~ dayssince", data=ldf).fit()

In [37]:
#smresult.summary()

In [38]:
#populate a dict with numbers:
longitudinal_langs = {}
for snapdir in snapdirs:
    sitelinkspath = path.join(snapshot_dir,snapdir,'property_indexes','site_linkss-index.csv')
    sldf = pd.read_csv(sitelinkspath,index_col=0).fillna(0)
    sldf['total'] = sldf.sum(axis=1)
    sldf['femper'] = sldf['female'] / sldf['total']
    retseries = sldf['femper'][sldf['total']>100000]
    longitudinal_langs[snapdir] = retseries

In [39]:
langdf = pd.DataFrame.from_dict(longitudinal_langs, orient='index')

In [40]:
langdf.dropna(axis=1, how='any', inplace=True)

In [41]:
langdf


Out[41]:
dewiki enwiki eswiki fiwiki frwiki itwiki jawiki nlwiki nowiki plwiki ptwiki ruwiki svwiki zhwiki
2014-09-17 0.149411 0.154439 0.158812 0.159695 0.151706 0.143675 0.126048 0.155467 0.199618 0.155391 0.160003 0.135623 0.197613 0.137898
2014-10-13 0.150655 0.157080 0.165257 0.172855 0.153639 0.145269 0.174524 0.160288 0.204345 0.153818 0.163643 0.136425 0.198135 0.136322
2015-07-28 0.150655 0.157080 0.165257 0.172855 0.153639 0.145269 0.174524 0.160288 0.204345 0.153818 0.163643 0.136425 0.198135 0.136322
2015-08-03 0.150655 0.157080 0.165257 0.172855 0.153639 0.145269 0.174524 0.160288 0.204345 0.153818 0.163643 0.136425 0.198135 0.136322
2015-08-09 0.150725 0.156740 0.165606 0.172678 0.153615 0.145325 0.174614 0.160279 0.204307 0.153780 0.163508 0.136546 0.198180 0.136776
2015-08-12 0.150796 0.156791 0.165613 0.172671 0.153971 0.145331 0.174670 0.160253 0.204418 0.153765 0.163371 0.136678 0.198229 0.136972
2015-08-16 0.150796 0.156791 0.165613 0.172671 0.153971 0.145331 0.174670 0.160253 0.204418 0.153765 0.163371 0.136678 0.198229 0.136972
2015-08-21 0.150881 0.156708 0.165541 0.172636 0.153999 0.145287 0.174978 0.160219 0.204455 0.153742 0.163389 0.136647 0.198234 0.137294
2015-08-23 0.150881 0.156708 0.165541 0.172636 0.153999 0.145287 0.174978 0.160219 0.204455 0.153742 0.163389 0.136647 0.198234 0.137294
2015-09-06 0.150958 0.156835 0.165377 0.172876 0.153993 0.145247 0.174302 0.160349 0.204367 0.153490 0.163401 0.136669 0.198290 0.137378
2015-09-13 0.150977 0.157026 0.165736 0.172606 0.154115 0.145235 0.174541 0.160562 0.205009 0.153453 0.163085 0.135852 0.198557 0.132569
2015-09-20 0.151077 0.157484 0.166200 0.171887 0.154168 0.145325 0.174649 0.160524 0.205385 0.153507 0.163036 0.136702 0.198581 0.131109
2015-09-27 0.151119 0.157667 0.166350 0.172374 0.154301 0.145290 0.174746 0.160376 0.205325 0.153541 0.162862 0.136972 0.198562 0.131097
2015-10-13 0.151115 0.158535 0.166726 0.172636 0.154417 0.145374 0.174496 0.160373 0.206378 0.153805 0.163798 0.136952 0.198443 0.131029
2015-10-19 0.151115 0.158535 0.166726 0.172636 0.154417 0.145374 0.174496 0.160373 0.206378 0.153805 0.163798 0.136952 0.198443 0.131029
2015-10-26 0.151140 0.158520 0.167108 0.172858 0.154428 0.145324 0.174517 0.160449 0.206349 0.153750 0.163883 0.137034 0.198529 0.130892
2015-11-02 0.151159 0.158632 0.167167 0.172880 0.154494 0.145181 0.174514 0.160469 0.206377 0.153424 0.163601 0.136951 0.198509 0.130923
2015-11-09 0.151207 0.158766 0.167338 0.172881 0.154680 0.145141 0.174511 0.160572 0.206527 0.153795 0.163693 0.137021 0.198441 0.131021
2015-11-21 0.151209 0.158802 0.167377 0.172881 0.154751 0.145069 0.174499 0.160539 0.206638 0.153803 0.164024 0.137165 0.198397 0.131047
2015-11-22 0.151209 0.158802 0.167377 0.172881 0.154751 0.145069 0.174499 0.160539 0.206638 0.153803 0.164024 0.137165 0.198397 0.131047
2015-11-23 0.151150 0.158836 0.167476 0.172894 0.154730 0.145055 0.174509 0.160600 0.206734 0.153782 0.163952 0.137242 0.198398 0.130724
2015-12-04 0.151173 0.158876 0.167620 0.172902 0.154954 0.145037 0.174527 0.160635 0.206731 0.153744 0.163872 0.137208 0.198219 0.130603
2015-12-21 0.151218 0.159199 0.168030 0.173033 0.155225 0.145170 0.173655 0.160725 0.206923 0.153685 0.163633 0.137107 0.198193 0.130627
2016-01-03 0.151233 0.159228 0.168038 0.172994 0.155248 0.144971 0.173695 0.160758 0.206442 0.153712 0.163676 0.136708 0.198214 0.130656

In [42]:
langdf.index = pd.to_datetime(langdf.index)

In [43]:
font = {'family': 'serif',
        'color':  'darkred',
        'weight': 'normal',
        'size': 16,
        }

In [44]:
import json
wikicodes = json.load(open('/home/notconfusing/workspace/WIGI/helpers/wiki_code_map.json','r'))
def lookup_wikicode(wikicode):
    letters = wikicode.split('wiki')[0]
    fullname = wikicodes[letters]
    shortname = fullname.split(' ')[0]
    return shortname
lookup_wikicode('enwiki')


Out[44]:
'English'

In [45]:
ax = langdf.plot(legend=False, figsize=(8,12))
for wiki in langdf:
    offset=5 if wiki == 'jawiki' else 0
    ax.annotate(wiki, (langdf.index[-1],langdf[wiki].ix['2016-01-03']), xytext=(2, 0+offset), 
                textcoords='offset points')
    #plt.text(langdf.index[-1],langdf[wiki].ix['2016-01-03'], wiki, fontdict=font)
ax.yaxis.set_major_formatter(
    FuncFormatter(lambda y,pos: ('{0:.2f}'.format(y*100)).rstrip('0').rstrip('.')+'%'))
plt.ylabel('Female ratio of biographies')
plt.xlabel('Wikidata snapshot')
plt.title('Change in Female Ratio of Biographies in \n Oct 2014 - Jan 2016',size=24)
plt.show()



In [46]:
begin, end = '2015-07-28', '2016-01-03'
endsorted = pd.DataFrame(langdf.ix[end]).sort(columns=['2016-01-03 00:00:00'])
oddeven = {wiki: pos % 2 for pos, wiki in list(enumerate(endsorted.index))}



ax = langdf.ix[begin:end].plot(legend=False, figsize=(4,8))
for wiki in langdf:
    side = begin if oddeven[wiki] == 0 else end  # computed but unused; every label is anchored at the end snapshot
    offset = 6 if wiki == 'jawiki' else 0
    ax.annotate(lookup_wikicode(wiki), (end ,langdf[wiki].ix[end]), xytext=(2, 0+offset), 
                textcoords='offset points')
    #plt.text(langdf.index[-1],langdf[wiki].ix['2016-01-03'], wiki, fontdict=font)
ax.yaxis.set_major_formatter(
    FuncFormatter(lambda y,pos: ('{0:.2f}'.format(y*100)).rstrip('0').rstrip('.')+'%'))
plt.ylabel('Female ratio of biographies')
plt.xlabel('Wikidata snapshot')
plt.title('Change in Female Ratio of Biographies in \n July 2015 - Jan 2016',size=17)
plt.show()



In [47]:
#populate a dict with numbers:
long_fem_per = {}
long_tot = {}
for snapdir in [min(snapdirs), max(snapdirs)]:
    sitelinkspath = path.join(snapshot_dir,snapdir,'property_indexes','site_linkss-index.csv')
    sldf = pd.read_csv(sitelinkspath,index_col=0).fillna(0)
    sldf['total'] = sldf.sum(axis=1)
    sldf['femper'] = sldf['female'] / sldf['total']
    femperseries = sldf['femper'][sldf['total']>100000]
    totalseries = sldf['total'][sldf['total']>100000]
    long_fem_per[snapdir] = femperseries
    long_tot[snapdir] = totalseries

In [48]:
femperdf = pd.DataFrame.from_dict(long_fem_per)
totaldf = pd.DataFrame.from_dict(long_tot)

In [49]:
arrowdf =femperdf.join(totaldf, lsuffix='femper', rsuffix='total').T

In [50]:
arrowdf = arrowdf.T.dropna(how='any').T

In [51]:
arrowdf.ix[1]-arrowdf.ix[0]


Out[51]:
dewiki    0.001822
enwiki    0.004789
eswiki    0.009225
fiwiki    0.013299
frwiki    0.003542
itwiki    0.001296
jawiki    0.047647
nlwiki    0.005291
nowiki    0.006824
plwiki   -0.001679
ptwiki    0.003673
ruwiki    0.001085
svwiki    0.000601
zhwiki   -0.007243
dtype: float64

In [52]:
import math

fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(111)


for wiki in arrowdf:
    plt.arrow(arrowdf[wiki].ix[2],  #x1
              arrowdf[wiki].ix[0],  # y1
              arrowdf[wiki].ix[3]-arrowdf[wiki].ix[2], # x2 - x1
              arrowdf[wiki].ix[1]-arrowdf[wiki].ix[0], # y2 - y1
             head_width=0.005, head_length=10000, fc='k', ec='k')
    xoffset = 0 if wiki not in ['fiwiki'] else 10000
    yoffset = 0 if wiki not in ['jawiki','nowiki'] else 0.005
    plt.text(arrowdf[wiki].ix[3]+xoffset, arrowdf[wiki].ix[1]**0.99+yoffset, lookup_wikicode(wiki), fontdict=font)
plt.ylim(0.12, 0.22)
plt.ylabel("Female ratio of biographies",size=18)
plt.xlim(100000,1500000)
plt.xlabel("Total Biographies (log scale)", size=18)
plt.xscale('log')
plt.xticks(size=18)

ax.yaxis.set_major_formatter(
    FuncFormatter(lambda y,pos: ('{0:.2f}'.format(y*100)).rstrip('0').rstrip('.')+'%'))
plt.yticks(size=18)
plt.title('Change in Female Ratio of Biographies and Size \n September 17 2014 - January 3 2016', size=24)


Out[52]:
<matplotlib.text.Text at 0x7f83916424a8>

In [53]:
fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(111)


for wiki in arrowdf:
    plt.arrow(arrowdf[wiki].ix[0],  #x1
              arrowdf[wiki].ix[2],  # y1
              arrowdf[wiki].ix[1]-arrowdf[wiki].ix[0], # x2 - x1
              arrowdf[wiki].ix[3]-arrowdf[wiki].ix[2], # y2 - y1
             head_width=0.005, head_length=10000, fc='k', ec='k')
    yoffset = 0 if wiki not in ['fiwiki','ruwiki','nlwiki','nowiki'] else 10000
    xoffset = 0 if wiki not in ['jawiki'] else 0.01
    plt.text(arrowdf[wiki].ix[1]**0.99+xoffset, arrowdf[wiki].ix[3]+yoffset, lookup_wikicode(wiki), fontdict=font)
plt.xlim(0.12, 0.22)
plt.xlabel("Female ratio of biographies",size=18)
plt.ylim(100000,1500000)
plt.ylabel("Total Biographies (log scale)", size=18)
plt.yscale('log')
plt.yticks(size=18)

ax.xaxis.set_major_formatter(
    FuncFormatter(lambda y,pos: ('{0:.2f}'.format(y*100)).rstrip('0').rstrip('.')+'%'))
plt.xticks(size=18)
plt.title('Change in Female Ratio of Biographies and Size \n September 17 2014 - January 3 2016', size=24)


Out[53]:
<matplotlib.text.Text at 0x7f8390e66d68>

In [54]:
#populate a dict with numbers:
eng_fem_totals = {}
all_fem_totals ={}
for snapdir in snapdirs:
    sitelinkspath = path.join(snapshot_dir,snapdir,'property_indexes','site_linkss-index.csv')
    sldf = pd.read_csv(sitelinkspath,index_col=0).fillna(0)
    eng_fem = sldf.ix['enwiki']['female']
    all_fem = sldf['female'].sum()
    snapdate = pd.to_datetime(snapdir)
    eng_fem_totals[snapdate] = eng_fem
    all_fem_totals[snapdate] = all_fem

engfemdf = pd.DataFrame.from_dict(eng_fem_totals,orient='index').fillna(0)
engfemdf.columns = ['eng_fems']
engfemdf = engfemdf.sort_index()

allfemdf = pd.DataFrame.from_dict(all_fem_totals,orient='index').fillna(0)
allfemdf.columns = ['all_fems']
allfemdf = allfemdf.sort_index()

In [55]:
engfemdf['added']= engfemdf['eng_fems']-engfemdf['eng_fems'].shift()
allfemdf['added']= allfemdf['all_fems']-allfemdf['all_fems'].shift()

In [56]:
engfemdf['month'] = engfemdf.index.map(lambda x: x.month if x.year == 2015 else 0)
allfemdf['month'] = allfemdf.index.map(lambda x: x.month if x.year == 2015 else 0)

In [57]:
eng_months = engfemdf.groupby('month')
all_months = allfemdf.groupby('month')

In [58]:
wd_eng_months = eng_months.sum().ix[8:12]
wd_all_months = all_months.sum().ix[8:12]

In [59]:
wir_data = {8:1854, 9:1590,10:1989,11:1787,12:1473}
wir_months = pd.DataFrame.from_dict(wir_data, orient='index')
wir_months.columns=['added']

In [60]:
addeddf = wir_months.join(wd_eng_months,lsuffix='wir',rsuffix='wd')
alladdeddf= wir_months.join(wd_all_months,lsuffix='wir',rsuffix='wd')

In [61]:
del addeddf['eng_fems']
del alladdeddf['all_fems']

In [62]:
alladdeddf.plot()


Out[62]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8391e47358>

In [63]:
addeddf.corr()


Out[63]:
addedwir addedwd
addedwir 1.000000 0.083634
addedwd 0.083634 1.000000

In [64]:
alladdeddf.corr()


Out[64]:
addedwir addedwd
addedwir 1.000000 0.225557
addedwd 0.225557 1.000000

In [65]:
alladdeddf['addedwir'].corr(alladdeddf['addedwd'].shift())


Out[65]:
0.61052951838651781

In [66]:
smooth = pd.rolling_mean(alladdeddf, window=2, min_periods=2).dropna()

In [67]:
from scipy.stats import linregress  # import needed here; without it this cell raised "NameError: name 'linregress' is not defined"
slope, intercept, r_value, p_value, std_er = linregress(smooth['addedwir'],smooth['addedwd'])
print(r_value, p_value)

In [ ]:
np.correlate(addeddf['addedwir'],addeddf['addedwd'])

In [ ]:
from scipy.stats import linregress

In [ ]:
slope, intercept, r_value, p_value, std_er = linregress(alladdeddf['addedwir'][-4:],alladdeddf['addedwd'].shift()[-4:])

In [ ]:
r_value, p_value

In [ ]:
def mon2015(x):
    dt = pd.to_datetime(x)
    return dt.month if dt.year == 2015 else 0

# assumed intent: group the enwiki female-ratio series by month of 2015
enwiki = pd.DataFrame(langdf['enwiki'])
enwiki['months'] = langdf.index.map(mon2015)

enmons = enwiki.groupby(by='months')

In [ ]:
wir_months

In [ ]:
yeardiff = langdf.ix[-1] - langdf.ix[0]

In [ ]:
yeardiff.sort(ascending=False)
yeardiff.hist()

In [ ]:
print(pd.DataFrame(yeardiff).ix[0:10].to_latex(float_format=lambda x: "{0:.2f}%".format(x*100)))

In [ ]:
pd.DataFrame(yeardiff).ix['enwiki']

In [ ]:
from pandas.stats.api import ols  # deprecated pandas OLS (the commented-out import earlier); provides result.beta used below

def trend(langseries):
    result = ols(y=langseries, x=ldf['dayssince'])
    return result.beta[0]

In [ ]:
trends = langdf.apply(lambda x: trend(x), axis=0)

In [ ]:
trends.sort(ascending=False)
trends

In [ ]:
def splitcol(i):
    return i.split("|")

In [ ]:
#av idols adult video
av_tot = {}
act_tot ={}
for snapdir in snapdirs:
    sitelinkspath = path.join(snapshot_dir,snapdir,'property_indexes','occupation-index.csv')
    try:
        avdf = pd.read_csv(sitelinkspath,index_col=0).fillna(0)
        avindexes = avdf.index.map(lambda x: 'Q1079215' in x)
        avtotal = avdf[avindexes].sum().sum()
        av_tot[snapdir] = avtotal
        actindexes = avdf.index.map(lambda x: 'Q488111' in x)
        acttotal = avdf[actindexes].sum().sum()
        act_tot[snapdir] = acttotal
    except OSError: #no occupation yet
        pass
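
The same pattern, counting humans whose occupation index value contains a given QID, can be factored into a reusable helper. A sketch (the helper name and signature are illustrative, not from the original notebook):

# total humans in one snapshot whose occupation index value contains the given QID
def occupation_total(snapdir, qid):
    occpath = path.join(snapshot_dir, snapdir, 'property_indexes', 'occupation-index.csv')
    occdf = pd.read_csv(occpath, index_col=0).fillna(0)
    mask = occdf.index.map(lambda x: qid in x)
    return occdf[mask].sum().sum()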

In [ ]:
act_tot
