notebook.community

Edit and run



In [1]:

    
import pandas
import math
import datetime
import os
import json

import matplotlib.pyplot as plt

plt.style.use('fivethirtyeight')

%pylab inline

java_min_int = -2147483648
gender_ordered = [u'female', u'male', u'transgender female', u'intersex', u"fa'afafine", u'transgender male', u'female animal', u'male animal', u'woman', u'genderqueer', u'kathoey']

import pywikibot
#Tranforming QIDs into English labels.
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()

retrieved = dict()

def english_label(qid):
    if type(qid) is float:
        if math.isnan(qid):
            return qid
    #first see if we've done it
    try:
        return retrieved[qid]
    except KeyError:
        try:
            page = pywikibot.ItemPage(wikidata, qid)
            data = page.get()
            lab = data['labels']['en']
            retrieved[qid] = lab
            return lab
        except:
            retrieved[qid] = qid
            return qid

def engify_labels(df, index=False):
    if index:
        axis = df.index
    else:
        axis = df.columns
    qids = [str(q) for q in axis]
    labels = [english_label(qid) for qid in qids]
    axis = labels









    



VERBOSE:pywiki:Starting 1 threads...






    



Populating the interactive namespace from numpy and matplotlib

This is how you'd get a dataframe for a specific snapshot



In [28]:

    
df = pandas.read_csv('snapshot_data/2014-09-17/gender-index-data-2014-09-17.csv', 
                                 na_values=[java_min_int])



In [2]:

    
ls snapshot_data/









    



2014-09-17/  2014-10-13/



In [151]:

    
#in case these things need ungzipping
snap_folders = !ls snapshot_data/
for folder in snap_folders:
    !gunzip -k snapshot_data/$folder/*.gz









    



gzip: snapshot_data/2014-09-17/gender-index-data-2014-09-17.csv already exists; do you wish to overwrite (y or n)? ^C
gzip: snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv already exists; do you wish to overwrite (y or n)? ^C



In [152]:

    
folders = !ls snapshot_data/
locs = []
for folder in folders:
        loc = !ls snapshot_data/$folder/gender-index*.csv
        locs.append(loc[0])
print locs









    



['snapshot_data/2014-09-17/gender-index-data-2014-09-17.csv', 'snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv']



In [153]:

    
snap_dfs = {loc[-14:-4]: pandas.read_csv(loc, na_values=[java_min_int]) for loc in locs}



In [35]:

    
def split_column(q_str):
    if type(q_str) is float:
        if numpy.isnan(q_str):
            return [q_str] #returning this way so we can gurantee that column contains list
    if type(q_str) is str:
        qs = q_str.split('|')
        return qs[:-1] #cos the format will always end with a |

for snap, df in snap_dfs.iteritems():
    for column in ['gender', 'ethnic_group', 'citizenship', 'place_of_birth', 'site_links']:
        column_plural = column+'s'
        df[column_plural] = df[column].apply(split_column)



In [36]:

    
latest = snap_dfs[max(snap_dfs.keys())]
earliest = snap_dfs[min(snap_dfs.keys())]



In [37]:

    
latest.query('dob == 999')









    Out[37]:






  
    
      
      qid
      dob
      dod
      gender
      ethnic_group
      citizenship
      place_of_birth
      site_links
      genders
      ethnic_groups
      citizenships
      place_of_births
      site_linkss
    
  
  
    
      25981  
         Q337805
       999
       1062
       Q6581097|
       NaN
       Q29520|
       Q1209298|
       zhwiki|wuuwiki|zh_classicalwiki|bowiki|ruwiki|...
       [Q6581097]
       [nan]
       [Q29520]
       [Q1209298]
       [zhwiki, wuuwiki, zh_classicalwiki, bowiki, ru...
    
    
      555620 
       Q15645727
       999
       1025
       Q6581072|
       NaN
           NaN
             NaN
                                                 jawiki|
       [Q6581072]
       [nan]
          [nan]
            [nan]
                                                [jawiki]
    
    
      808984 
        Q3012130
       999
       1082
       Q6581072|
       NaN
           NaN
             NaN
                     jawiki|kowiki|frwiki|eswiki|cawiki|
       [Q6581072]
       [nan]
          [nan]
            [nan]
                [jawiki, kowiki, frwiki, eswiki, cawiki]
    
    
      1947827
         Q336180
       999
       1072
       Q6581097|
       NaN
         Q794|
             NaN
       dewiki|jawiki|tgwiki|fawiki|ruwiki|frwiki|enwi...
       [Q6581097]
       [nan]
         [Q794]
            [nan]
       [dewiki, jawiki, tgwiki, fawiki, ruwiki, frwik...
    
    
      1993569
        Q1093355
       999
       1025
       Q6581072|
       NaN
           NaN
             NaN
         zhwiki|jawiki|kowiki|frwiki|eswiki|jawikiquote|
       [Q6581072]
       [nan]
          [nan]
            [nan]
       [zhwiki, jawiki, kowiki, frwiki, eswiki, jawik...



In [ ]:

    
for df in [earliest, latest]:
    for qid in latest.query('dob == 999')['qid']:
        print 'http://wikidata.org/wiki/'+qid
    print '\n'



In [38]:

    
from collections import defaultdict
import time

def make_reindex(snap_df):

    def int_dict_factory():
        return defaultdict(int)

    def nan_test(v):
        try:
            if math.isnan(v):
                return True
        except TypeError:
                return False
    #abstracted: we want year-gender, but also
    #gender-ethnicity -citizenship -place of birth, site-links

    params = ['dob','dod','genders','ethnic_groups','citizenships','place_of_births','site_linkss']
    gender_param = {param:defaultdict(int_dict_factory) for param in params}

    for index, row in snap_df.iterrows():
        row_data = {p : row[p] for p in params}
        for param in params:
            gender_dict = gender_param[param]
            vrs = row_data[param]
            genders = row_data['genders']
            if not nan_test(vrs):
                if not nan_test(genders):
                    for gender in genders:
                            if type(vrs) is list:
                                for var in vrs:
                                    gender_dict[gender][var] += 1
                            else: 
                                    gender_dict[gender][vrs] +=  1
                                    
    gender_dfs = {param: pandas.DataFrame.from_dict(gender_param[param], orient='columns') for param in params}
    return gender_dfs



In [148]:

    
gender_indexes = {snap: make_reindex(df) for snap, df in snap_dfs.iteritems()}









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-148-4a18d1828f7e> in <module>()
----> 1 gender_indexes = {snap: make_reindex(df) for snap, df in snap_dfs.iteritems()}

NameError: name 'snap_dfs' is not defined



In [238]:

    
for snap, gender_dfs in gender_indexes.iteritems():
    for param, gender_df in gender_dfs.iteritems():
        print param
        engify_labels(gender_df)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-238-4c36c56aa8d6> in <module>()
----> 1 for snap, gender_dfs in gender_indexes.iteritems():
      2     for param, gender_df in gender_dfs.iteritems():
      3         print param
      4         engify_labels(gender_df)
      5 

NameError: name 'gender_indexes' is not defined



In [47]:

    
for snap, gender_df in gender_indexes.iteritems():
    for param, gender_df in gender_dfs.iteritems():
        property_index_dir = 'snapshot_data/%s/property_indexes' % (snap)
        if not os.path.exists(property_index_dir):
            os.makedirs(property_index_dir)
        filename = '%s/%s-index.csv' % (property_index_dir, param)
        filepoint = open(filename, 'w')
        filepoint.write(gender_df.to_csv())
        filepoint.close()



In [49]:

    
latest_date = max(snap_dfs.keys())
earliest_date = min(snap_dfs.keys())



In [88]:

    
gender_indexes[latest_date]['dob'].ix[999]









    Out[88]:





nan                  NaN
transgender female   NaN
intersex             NaN
fa'afafine           NaN
transgender male     NaN
male animal          NaN
woman                NaN
genderqueer          NaN
female                 3
male                   2
kathoey              NaN
Name: 999.0, dtype: float64



In [58]:

    
gender_dfs['dob'].plot(kind='area',stacked=True, figsize=(24,8))









    Out[58]:





<matplotlib.axes.AxesSubplot at 0x7f1c63aa9810>



In [43]:

    
plt = gender_dfs['dob'].ix[1800:].plot(kind='area',stacked=True, figsize=(24,8))
plt.set_title('''Wikidata Biography Gender Quantities by Year
    1800 onwards''', size=24)
plt.set_xlabel('Year', size=18)
plt.set_ylabel('Biographies', size=18)
plt.set_xlim((1800,2014))
plt.legend(title='Gender', loc=2)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-43-c0de4e952f65> in <module>()
----> 1 plt = gender_dfs['dob'].ix[1800:].plot(kind='area',stacked=True, figsize=(24,8))
      2 plt.set_title('''Wikidata Biography Gender Quantities by Year
      3     1800 onwards''', size=24)
      4 plt.set_xlabel('Year', size=18)
      5 plt.set_ylabel('Biographies', size=18)

NameError: name 'gender_dfs' is not defined



In [117]:

    
nonmale_early = gender_indexes[earliest_date]['dod'].copy(deep=True)
nonmale_late = gender_indexes[latest_date]['dod'].copy(deep=True)

del nonmale_early['nan']
nonmale_early['nonmale'] = nonmale_early.sum(axis=1) - nonmale_early['male']
nonmale_early['nm_per'] = (nonmale_early['nonmale'] ) / (nonmale_early['nonmale'] + nonmale_early['male'])

del nonmale_late['nan']
nonmale_late['nonmale'] = nonmale_late.sum(axis=1) - nonmale_late['male']
nonmale_late['nm_per'] = (nonmale_late['nonmale'] ) / (nonmale_late['nonmale'] + nonmale_late['male'])



In [128]:

    
plt = nonmale_early['nm_per'].plot(figsize(24,8), kind='line')
plt.set_xlim((-1000,2014))









    Out[128]:





(-1000, 2014)



In [133]:

    
ma = pandas.rolling_mean(nonmale_early['nm_per'], 10)



In [134]:

    
plt = ma.plot()
plt.set_xlim((-1000,2014))









    Out[134]:





(-1000, 2014)



In [ ]:

    
nonmale_late['nm_per'].plot(figsize=(24,8))
plt.set_xlim((-1000,2014))



In [67]:

    
gender_ordered









    Out[67]:





[u'female',
 u'male',
 u'transgender female',
 u'intersex',
 u"fa'afafine",
 u'transgender male',
 u'female animal',
 u'male animal',
 u'woman',
 u'genderqueer',
 u'kathoey']



In [34]:

    
['Date of Birth']*2









    Out[34]:





['Date of Birth', 'Date of Birth']



In [114]:

    
plt.style









    Out[114]:





'/usr/local/lib/python2.7/dist-packages/matplotlib/style/__init__.pyc'



In [16]:

    
infogram = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/dob-index.csv')
infogram.fillna(0, inplace=True)
del infogram['nan']
infogram['total'] = infogram.sum(axis=1)
infogram['nonbin'] = infogram['total'] - infogram['male'] - infogram['female']
infogram['fem_per'] = infogram['female']  / (infogram['total'])
infogram['nonbin_per'] = infogram['nonbin'] / infogram['total']
rm = pandas.rolling_mean(infogram[['fem_per','nonbin_per']], 10, min_periods=10)



In [19]:

    
partyears = range(1800,2000,int(200/6.0))
rm.ix[partyears][['nonbin_per','fem_per']].T.to_csv('Magnus Gender analysis/infogram_dob_rm.csv')
infogram.ix[partyears][['nonbin_per','fem_per','nonbin']].T.to_csv('Magnus Gender analysis/infogram_dob.csv')



In [28]:

    
rm.ix[1880:1910]['nonbin_per']









    Out[28]:





1880    0.000000
1881    0.000000
1882    0.000022
1883    0.000022
1884    0.000022
1885    0.000043
1886    0.000043
1887    0.000043
1888    0.000043
1889    0.000043
1890    0.000043
1891    0.000043
1892    0.000021
1893    0.000021
1894    0.000021
1895    0.000000
1896    0.000000
1897    0.000000
1898    0.000000
1899    0.000000
1900    0.000000
1901    0.000000
1902    0.000000
1903    0.000000
1904    0.000000
1905    0.000000
1906    0.000000
1907    0.000000
1908    0.000000
1909    0.000000
1910    0.000000
Name: nonbin_per, dtype: float64



In [3]:

    
fig, axes = pylab.subplots(nrows=2, ncols=2, sharey=True)
for ax, beginning, l, (xtext, ytext) in zip(axes.ravel(), [-4000, 1800] * 2, ['b']*2+['d']*2, [('-4000 BCE to present','Date of Birth'),('1800 CE to present',''),('','Date of Death'),('','')]):
    acro = 'do'+l
    df = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/%s-index.csv' % acro)
    del df['nan']
    df   = df.reindex_axis(gender_ordered,axis=1)
    p = df.plot(kind='area', figsize=(9,10), cmap='Accent', ax=ax, legend=False, linewidth=1)
    p.set_xlim((beginning,2014))
    #p.set_xlabel(xtext)
    p.set_ylabel(ytext)
    p.set_title(xtext, fontsize=12)

fig.legend = legend(bbox_to_anchor=(1.05, 1.5), loc=2, borderaxespad=0)
fig.suptitle('Volumes of Dates of Birth and Death in Wikidata \n by Gender \n Total and Modern Timeframes', fontsize=24)
fig.tight_layout()
subplots_adjust(hspace=0.1, top=0.82)



In [177]:

    
ra_len = 1

dox = pandas.DataFrame()
nonbindox = pandas.DataFrame()

for l in ['b','d']:
    acro = 'do'+l
    df = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/%s-index.csv' % acro)
    del df['nan']
    df['total'] = df.sum(axis=1)
    df['nonbin'] = df['total'] - df['male'] - df['female']
    df['fem_per'] = df['female']  / (df['total'])
    df['nonbin_per'] = df['nonbin'] / df['total']
    
    ra = pandas.rolling_mean(df['fem_per'], ra_len)
    dox[acro] = ra
    
    nonbinra = pandas.rolling_mean(df['nonbin_per'], ra_len)
    nonbindox[acro] = nonbinra

fig, (pltf, pltb) =  pylab.subplots(nrows=2, ncols=1, sharex=True, figsize=(9,6))
dox.plot(kind='line',  cmap='Paired', linewidth=2, ax=pltf)
pltf.set_xlim((1400,2014))
pltf.set_ylim((0,0.7))
pltf.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{}%'.format(int(x*100) )))
pltf.set_title('Female ratio')
pltf.legend(('Date of Birth', 'Date of Death'),loc=4, bbox_to_anchor=(1.25,-0.25))

nonbindox.plot(kind='line',  cmap='Paired', linewidth=2, ax=pltb, legend=False)
pltb.set_xlim((1400,2014))
pltb.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{}%'.format(x*100)))
pltb.set_title('Non-binary Ratio')

fig.suptitle('Composition of Wikidata Genders in Modern History', fontsize=24)
fig.subplots_adjust(top=0.87)



In [207]:









    Out[207]:





<matplotlib.text.Text at 0x7f1990eb4290>

Place of Birth Ivestiagation



In [4]:

    
pob = gender_indexes[latest_date]['place_of_births'].copy(deep=True)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-4e0945f5cefe> in <module>()
----> 1 pob = gender_indexes[latest_date]['place_of_births'].copy(deep=True)

NameError: name 'gender_indexes' is not defined



In [70]:

    
len(pob)









    Out[70]:





72894

72,000 "places" how, many of these are countries. that is instance of "Q6256"



In [ ]:

    
%timeit -n 1 -r 1
def is_or_has_country(qid):
    countries = list() #we're going to return this
    page = pywikibot.ItemPage(wikidata, qid)
    data = page.get()
    claims = data['claims']
    for pid, claimlist in claims.iteritems():
        if pid == 'P17':
            for claim in claimlist:
                countries.append(claim.target.title()) #this is part of a country
        if pid == 'P31':
            for claim in claimlist:
                    if claim.target.title() == 'Q6256':
                        countries.append(qid) #this actually is a  country

    return countries 

place_country = dict()

count=0
for place in pob.index[1:]: #1 because the first index is nan
    place_country[place] = is_or_has_country(place)
    count += 1
    if count % 100 == 0:
        print count



In [97]:

    
pobs = [p for p in pob.index[1:]]
json.dump(pobs, open('pobs_list.json','w'))

do some processing on wmflabs to save on bandwidth



In [5]:

    
pobs_map = json.load(open('helpers/pobs_map.json','r'))

What percentage of pobs are of have a country? Which have more than one country?



In [9]:

    
have_country = [c[0] for c in pobs_map.values() if len(c) != 0]



In [10]:

    
len(list(set(have_country)))









    Out[10]:





235



In [14]:

    
len(have_country) / float(len(pobs_map.values()))









    Out[14]:





0.9363862099241353



In [15]:

    
have_no_country = [p for p, c in  pobs_map.iteritems() if len(c) == 0]



In [20]:

    
len(have_country)









    Out[20]:





68256



In [16]:

    
len(have_no_country)









    Out[16]:





4637



In [19]:

    
for place in have_no_country[10:20]:
    print 'http://wikidata.org/wiki/'+place









    



http://wikidata.org/wiki/Q361099
http://wikidata.org/wiki/Q4180803
http://wikidata.org/wiki/Q579468
http://wikidata.org/wiki/Q504912
http://wikidata.org/wiki/Q1013242
http://wikidata.org/wiki/Q1958565
http://wikidata.org/wiki/Q15763
http://wikidata.org/wiki/Q1091714
http://wikidata.org/wiki/Q448469
http://wikidata.org/wiki/Q7285906

there's not an obvious easy way to programmatically determine these, would probably have to go over them by hand

the sad part is that they probaby represent minority locations (and thus pepople)



In [23]:

    
country_lengths = sum([len(c) for c in have_country])



In [24]:

    
country_lengths / float(len(have_country))









    Out[24]:





1.0032524613220817

Ok, which have more than one country.



In [33]:

    
for place, country_list in pobs_map.iteritems():
    if len(country_list) > 1:
        if reduce(lambda a, b: a != b, country_list ):#all elements are the not  same
            pass#print ['http://wikidata.org/wiki/'+place for place in country_list]



In [34]:

    
import IPython.display



In [40]:

    
IPython.display.Image('helpers/Inglehart_Values_Map2.svg.png')









    Out[40]:

confcian
orthodox
isalmic
south asia
africa
catholic europea
protestant europe
english speaking



In [3]:

    
coun = [c for c in pobs_map.itervalues() if c]









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-03814802914c> in <module>()
----> 1 coun = [c for c in pobs_map.itervalues() if c]

NameError: name 'pobs_map' is not defined



In [ ]:

    
country_culture = dict()
for place, country_list in pobs_map.iteritems():
    if country_list:
        qid =  country_list[0]
        if qid in country_culture.keys():
            continue
        else:
            link = 'http://wikidata.org/wiki/%s' % qid
            print english_label(qid)
            print link
            culture_num = input('enter culture num')
            country_culture[qid] = culture_num



In [39]:

    
fp = open('helpers/pob_agg.txt','w')
seen = list()
for place, country_list in pobs_map.iteritems():
    if country_list:
        qid =  country_list[0]
        if qid in seen:
            continue
        else:
            enlab = english_label(qid)
            writestr = u'%s\t%s\t\n' % (qid, enlab)
            fp.write(writestr.encode('utf-8'))
            seen.append(qid)
fp.close()



In [6]:

    
culture_map ={1:'confucian',
                        2:'orthodox',
                        3:'islamic',
                        4:'south asia',
                        5:'africa',
                        6: 'catholic europe',
                        7: 'protestant europe',
                        8: 'english speaking',
                        9: 'latin america'}



In [53]:

    
cultures_df.to_csv('helpers/culture_names.csv')



In [7]:

    
cultures_df = pandas.DataFrame.from_csv('helpers/culture_names.csv')
cultures_df['qid'] = cultures_df.index
cultures_df['cutlure_name'] = cultures_df['culture_number'].apply(lambda x: culture_map[x])



In [8]:

    
pob = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/place_of_births-index.csv')
pob['qid'] = pob.index
#pob = pob.ix[1:] #remove nan row



In [9]:

    
qid_countryqid = json.load(open('helpers/pobs_map.json','r'))

def qid_to_country(qid):
    if type(qid) is float:
        if math.isnan(qid):
            return 'no_data'
    else:
        return qid_countryqid[qid]
    
pob['country_qid'] = pob['qid'].apply(lambda qid: qid_to_country(qid))



In [10]:

    
country_culture = dict(zip(cultures_df['qid'], cultures_df['culture_name']))



In [11]:

    
def aggregate_culture(qid_list):
    if not type(qid_list) is list and qid_list == 'no_data':
                return 'no_data'
    if len(qid_list) > 0:
        culture_name = country_culture[qid_list[0]]
        return culture_name
    else:
        return 'not_easily_aggregatable'



In [12]:

    
pob['culture_name'] = pob['country_qid'].apply(lambda qid_list: aggregate_culture(qid_list))



In [13]:

    
culture_groups = pob.groupby(by=['culture_name'])[u'transgender female', u'intersex', u"fa'afafine", u'transgender male', u'female animal', u'male animal', u'woman', u'genderqueer', u'female', u'male', u'kathoey'].sum().copy(deep=True)



In [35]:

    
culture_groups.to_csv('helpers/pob_plot_data_oct.csv')



In [15]:

    
culture_groups['total'] = culture_groups.sum(axis=1)



In [16]:

    
normed_pobs_agg = culture_groups.apply(lambda x: x/ float(x['total']), axis=1)



In [17]:

    
pobs_plot = normed_pobs_agg.sort('female')[normed_pobs_agg.columns[:-1]]
pobs_plot_mf = normed_pobs_agg.sort('female')[['male','female']]
pobs_plot_nmf = normed_pobs_agg[[u'transgender female', u'intersex', u"fa'afafine", u'transgender male', u'female animal', u'male animal', u'woman', u'genderqueer', u'kathoey']]



In [18]:

    
pobs_plot.plot(kind='bar', figsize=(10,10))









    Out[18]:





<matplotlib.axes.AxesSubplot at 0x7fc4d9f927d0>



In [91]:

    
plt = pobs_plot_mf.plot(kind='bar', figsize=(12,5), cmap='Paired')
plt.set_title('Place of Birth Gender Composition by 9 world cultures.\nNo Data are items without Birth Data\nNot Easily Aggregratable means Place of Birth did Claim did not have a Country Property.(7%)')
plt.set_ylabel('Gender Composition')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)


plt = pobs_plot_nmf.plot(kind='bar', figsize=(12,5), cmap='Accent')
plt.set_title('Place of Birth Gender Composition by 9 world cultures.\nNo Data are items without Birth Data\nNot Easily Aggregratable means Place of Birth did Claim did not have a Country Property.(7%)')
plt.set_ylabel('Gender Composition')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)









    Out[91]:





<matplotlib.legend.Legend at 0x7fc4ac5825d0>



In [79]:









    Out[79]:





<matplotlib.legend.Legend at 0x7fc4ad56a710>



In [ ]:

    
pob = gender_indexes[latest_date]['place_of_births'].copy(deep=True)

Ethnic Groups



In [142]:

    
eg = gender_indexes[latest_date]['ethnic_groups'].copy(deep=True)



In [154]:

    
eg = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/ethnic_groups-index.csv')



In [155]:

    
len(eg)









    Out[155]:





683



In [160]:

    
eg=eg.ix[1:].fillna(value=0)
eg['total'] = eg.sum(axis=1)
engify_labels(eg,index=True)



In [162]:

    
eg['qid'] = eg.index



In [218]:

    
eg['ethnic_name'] = eg['qid'].apply(lambda x: english_label(x))



In [219]:

    
eg.sort('total', ascending=False).head(50)









    Out[219]:






  
    
      
      nan
      transgender female
      intersex
      fa'afafine
      transgender male
      female animal
      male animal
      woman
      genderqueer
      female
      male
      kathoey
      total
      qid
      ethinic_name
      ethnic_name
    
  
  
    
      Q539051
       116
       0
       0
       0
       0
       0
       0
       0
       0
       190
       2395
       0
       8103
        Q539051
                         Greeks
                         Greeks
    
    
      Q127885
         0
       1
       0
       0
       0
       0
       0
       0
       0
       123
       1135
       0
       3777
        Q127885
                          Serbs
                          Serbs
    
    
      Q7325
         1
       0
       0
       0
       0
       0
       0
       0
       0
        30
        157
       0
        564
          Q7325
                  Jewish people
                  Jewish people
    
    
      Q7129609
         2
       0
       0
       0
       0
       0
       0
       0
       0
       115
         57
       0
        522
       Q7129609
                 Caucasian race
                 Caucasian race
    
    
      Q161652
         0
       1
       0
       0
       0
       0
       0
       0
       0
       150
         20
       0
        513
        Q161652
                Japanese people
                Japanese people
    
    
      Q49085
         0
       0
       0
       0
       0
       0
       0
       0
       0
        72
         69
       0
        423
         Q49085
               African American
               African American
    
    
      Q235155
         0
       1
       0
       0
       0
       0
       0
       0
       0
        81
         35
       0
        351
        Q235155
                   white people
                   white people
    
    
      Q402913
         0
       0
       0
       0
       0
       0
       0
       0
       0
        28
         74
       0
        306
        Q402913
                 Bengali people
                 Bengali people
    
    
      Q187985
         0
       0
       0
       0
       0
       0
       0
       0
       0
         9
         88
       0
        291
        Q187985
                 Tibetan people
                 Tibetan people
    
    
      Q485150
         1
       0
       0
       0
       0
       0
       0
       0
       0
        21
         71
       0
        279
        Q485150
                      Romanians
                      Romanians
    
    
      Q7994501
         0
       0
       0
       0
       0
       0
       0
       0
       0
        32
         54
       0
        258
       Q7994501
                  White British
                  White British
    
    
      Q4887679
         0
       0
       0
       0
       0
       0
       0
       0
       0
         9
         71
       0
        240
       Q4887679
                 Bengali Hindus
                 Bengali Hindus
    
    
      Q42406
         0
       0
       0
       0
       0
       0
       0
       0
       0
        43
         30
       0
        219
         Q42406
                 English people
                 English people
    
    
      Q34069
         0
       0
       0
       0
       0
       0
       0
       0
       0
         3
         53
       0
        168
         Q34069
                 Ashkenazi Jews
                 Ashkenazi Jews
    
    
      Q42884
         0
       0
       0
       0
       0
       0
       0
       0
       0
        15
         39
       0
        162
         Q42884
                        Germans
                        Germans
    
    
      Q49078
         0
       0
       0
       0
       0
       0
       0
       0
       0
        22
         30
       0
        156
         Q49078
                 White American
                 White American
    
    
      Q2556103
         1
       0
       0
       0
       0
       0
       0
       0
       0
         6
         41
       0
        144
       Q2556103
                 Pashtun people
                 Pashtun people
    
    
      Q133032
         0
       0
       0
       0
       0
       0
       0
       0
       0
        14
         27
       0
        123
        Q133032
               Hungarian people
               Hungarian people
    
    
      Q42740
         1
       0
       0
       0
       0
       0
       0
       0
       0
         3
         37
       0
        123
         Q42740
                    Han Chinese
                    Han Chinese
    
    
      Q50001
         0
       0
       0
       0
       0
       0
       0
       0
       0
        26
         14
       0
        120
         Q50001
                       Italians
                       Italians
    
    
      Q1815623
         0
       0
       0
       0
       0
       0
       0
       0
       0
         0
         36
       0
        108
       Q1815623
        Sri Lankan Tamil people
        Sri Lankan Tamil people
    
    
      Q35323
         0
       0
       0
       0
       0
       0
       0
       0
       0
         7
         29
       0
        108
         Q35323
                           Arab
                           Arab
    
    
      Q241696
         0
       0
       0
       0
       0
       0
       0
       0
       0
        11
         22
       0
         99
        Q241696
                  Somali people
                  Somali people
    
    
      Q403656
         0
       0
       0
       0
       0
       0
       0
       0
       0
        16
         17
       0
         99
        Q403656
                        Baganda
                        Baganda
    
    
      Q932244
         0
       0
       0
       0
       0
       0
       0
       0
       0
         4
         28
       0
         96
        Q932244
               Sinhalese people
               Sinhalese people
    
    
      Q30
         0
       0
       0
       0
       0
       0
       0
       0
       0
         6
         25
       0
         93
            Q30
       United States of America
       United States of America
    
    
      Q49542
         0
       0
       0
       0
       0
       0
       0
       0
       0
         6
         23
       0
         87
         Q49542
                       Russians
                       Russians
    
    
      Q43103
         0
       1
       0
       0
       0
       0
       0
       0
       0
         4
         24
       0
         87
         Q43103
              European American
              European American
    
    
      Q121842
         0
       0
       0
       0
       0
       0
       0
       0
       0
        18
         10
       0
         84
        Q121842
                  French people
                  French people
    
    
      Q854323
         0
       0
       0
       0
       0
       0
       0
       0
       0
         5
         22
       0
         81
        Q854323
                 Punjabi people
                 Punjabi people
    
    
      Q84072
         0
       0
       0
       0
       0
       0
       0
       0
       0
         7
         19
       0
         78
         Q84072
                 Turkish people
                 Turkish people
    
    
      Q179248
         0
       0
       0
       0
       0
       0
       0
       0
       0
         4
         22
       0
         78
        Q179248
                      Albanians
                      Albanians
    
    
      Q1026
         0
       0
       0
       0
       0
       0
       0
       0
       0
         7
         18
       0
         75
          Q1026
                          Poles
                          Poles
    
    
      Q4172847
         0
       0
       0
       0
       0
       0
       0
       0
       0
        16
          8
       0
         72
       Q4172847
                Filipino people
                Filipino people
    
    
      Q170826
         0
       0
       0
       0
       0
       0
       0
       0
       0
        11
         12
       0
         69
        Q170826
                   Irish people
                   Irish people
    
    
      Q79797
         0
       0
       0
       0
       0
       0
       0
       0
       0
         5
         18
       0
         69
         Q79797
                      Armenians
                      Armenians
    
    
      Q581780
         0
       0
       0
       0
       0
       0
       0
       0
       0
         0
         22
       0
         66
        Q581780
                   Banu Khazraj
                   Banu Khazraj
    
    
      Q445618
         0
       0
       0
       0
       0
       0
       0
       0
       0
         1
         21
       0
         66
        Q445618
                 Banat Swabians
                 Banat Swabians
    
    
      Q170217
         1
       0
       0
       0
       0
       0
       0
       0
       0
        13
          7
       0
         63
        Q170217
                         Czechs
                         Czechs
    
    
      Q160894
         0
       0
       0
       0
       0
       0
       0
       0
       0
         9
         12
       0
         63
        Q160894
                 Spanish people
                 Spanish people
    
    
      Q483569
         0
       0
       0
       0
       0
       0
       0
       0
       0
         3
         18
       0
         63
        Q483569
                    Belarusians
                    Belarusians
    
    
      Q974693
         0
       0
       0
       0
       0
       0
       0
       0
       0
        12
          9
       0
         63
        Q974693
               Italian American
               Italian American
    
    
      Q1344183
         0
       0
       0
       0
       0
       0
       0
       0
       0
         7
         12
       0
         57
       Q1344183
               English American
               English American
    
    
      Q133255
         0
       0
       0
       0
       0
       0
       0
       0
       0
         3
         16
       0
         57
        Q133255
                     Bulgarians
                     Bulgarians
    
    
      Q181634
         0
       0
       0
       0
       0
       0
       0
       0
       0
         3
         15
       0
         54
        Q181634
                Scottish people
                Scottish people
    
    
      Q484464
         0
       0
       0
       0
       0
       0
       0
       0
       0
         9
          9
       0
         54
        Q484464
                        Koreans
                        Koreans
    
    
      Q842438
         0
       0
       0
       0
       0
       0
       0
       0
       0
         7
         10
       0
         51
        Q842438
                 British people
                 British people
    
    
      Q700469
         0
       0
       0
       0
       0
       0
       0
       0
       0
         1
         16
       0
         51
        Q700469
             Germans of Romania
             Germans of Romania
    
    
      Q6501380
         0
       0
       0
       0
       0
       0
       0
       0
       0
         5
         12
       0
         51
       Q6501380
                 Chinese people
                 Chinese people
    
    
      Q862086
         0
       0
       0
       0
       0
       0
       0
       0
       0
         9
          7
       0
         48
        Q862086
                  Indian people
                  Indian people



In [221]:

    
eg_normed = eg[[u'transgender female', u'intersex', u"fa'afafine", u'transgender male', u'female animal', u'male animal', u'woman', u'genderqueer', u'female', u'male', u'kathoey', 'total']].apply(lambda x: x/x['total'], axis=1)



In [228]:

    
eg_cut = eg[eg['total']>1]
eg_cut_normed = eg_cut[[u'transgender female', u'intersex', u"fa'afafine", u'transgender male', u'female animal', u'male animal', u'woman', u'genderqueer', u'female', u'male', u'kathoey', 'total']].apply(lambda x: x/x['total'], axis=1)



In [ ]:

    
eg_



In [177]:

    
engify_labels(eg_normed, index=True)



In [226]:

    
eg_normed.sort(columns=['female'], ascending=False)['female'].plot(kind='bar', figsize=(18,8))









    Out[226]:





<matplotlib.axes.AxesSubplot at 0x7f1990731090>



In [241]:

    
def export_for_crowd_aggregate(df, savename):
    crowd_source_export  = pandas.DataFrame()
    crowd_source_export['qid'] = df.index
    crowd_source_export['en_label'] = crowd_source_export['qid'].apply(lambda x: english_label(x).encode('utf-8'))
    crowd_source_export['aggregate_group'] = ''
    crowd_source_export.to_csv('helpers/%s_map.csv' % savename)



In [ ]:

    
export_for_crowd_aggregate(eg, 'ethnic_groups')

Citizenships



In [99]:

    
cz = gender_indexes[latest_date]['citizenships'].copy(deep=True)



In [180]:

    
cz = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/citizenships-index.csv')



In [249]:

    
cz = cz.ix[1:] #remove the inintial nan ,which are items that had no citienzship



In [250]:

    
len(cz)









    Out[250]:





732



In [ ]:

    
export_for_crowd_aggregate(cz, 'citizenships')

SiteLinkss!



In [183]:

    
sl = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/site_linkss-index.csv')



In [184]:

    
len(sl)









    Out[184]:





428

Regressions Maybe



In [57]:

    
yg.plot(kind='scatter', x='year', y='lnmale')
plt.draw()









    Out[57]:





<matplotlib.axes.AxesSubplot at 0x7f718fd5dc10>



In [108]:

    
yg_reg = yg
yg_reg['year'] = yg_reg.index



In [109]:

    
yg_reg['lnmale'] = numpy.log(yg_reg['male'])



In [123]:

    
import statsmodels.api as sm

nonnan = yg_reg.ix[start_year:1986].fillna(value=0)
model = sm.OLS(nonnan['male'],nonnan['year'])
results = model.fit()
print(results.summary())
    print(start_year, results.rsquared)









    



(None, 0.20534979805159537)
(-500, 0.21745102948866801)
(0, 0.21941326163168851)
(500, 0.22247317415119383)
(1000, 0.24992223055451823)
(1500, 0.37760491504948823)
(1800, 0.72773643874338734)
(1900, 0.95518546745325672)



In [144]:

    
rsquared_results = pandas.DataFrame(columns=['start_year','rsquared'])
for start_year in numpy.arange(-4000,1950,50):
    nm_model = sm.OLS(yg_reg.ix[start_year:1986]['nm_per'], sm.add_constant(yg_reg.ix[start_year:1986]['year']) )
    nm_results = nm_model.fit()
    rsquared_results = rsquared_results.append({'start_year':start_year, 'rsquared':nm_results.rsquared}, ignore_index=True)
    
ax = rsquared_results[rsquared_results['start_year']>500].plot(kind='line',x='start_year',y='rsquared', 
                                                                title=r'$R^2$ value for linear regression on non-male percentage')

ax.set_xlabel('starting year of regresssion untilt 1987')
ax.set_ylabel(r'$R^2$')









    Out[144]:





<matplotlib.text.Text at 0x7f719df8ef50>

This implies $y = -1.7495 + 0.001x$ with $R^2 = 0.885$

setting $y=0.5$ $\implies$ $x=2249.5$ or in the year 2250



In [ ]:

	qid	dob	dod	gender	ethnic_group	citizenship	place_of_birth	site_links	genders	ethnic_groups	citizenships	place_of_births	site_linkss
25981	Q337805	999	1062	Q6581097\|	NaN	Q29520\|	Q1209298\|	zhwiki\|wuuwiki\|zh_classicalwiki\|bowiki\|ruwiki\|...	[Q6581097]	[nan]	[Q29520]	[Q1209298]	[zhwiki, wuuwiki, zh_classicalwiki, bowiki, ru...
555620	Q15645727	999	1025	Q6581072\|	NaN	NaN	NaN	jawiki\|	[Q6581072]	[nan]	[nan]	[nan]	[jawiki]
808984	Q3012130	999	1082	Q6581072\|	NaN	NaN	NaN	jawiki\|kowiki\|frwiki\|eswiki\|cawiki\|	[Q6581072]	[nan]	[nan]	[nan]	[jawiki, kowiki, frwiki, eswiki, cawiki]
1947827	Q336180	999	1072	Q6581097\|	NaN	Q794\|	NaN	dewiki\|jawiki\|tgwiki\|fawiki\|ruwiki\|frwiki\|enwi...	[Q6581097]	[nan]	[Q794]	[nan]	[dewiki, jawiki, tgwiki, fawiki, ruwiki, frwik...
1993569	Q1093355	999	1025	Q6581072\|	NaN	NaN	NaN	zhwiki\|jawiki\|kowiki\|frwiki\|eswiki\|jawikiquote\|	[Q6581072]	[nan]	[nan]	[nan]	[zhwiki, jawiki, kowiki, frwiki, eswiki, jawik...

	nan	transgender female	female	male	total	qid	ethinic_name	ethnic_name
Q539051	116	0	190	2395	8103	Q539051	Greeks	Greeks
Q127885	0	1	123	1135	3777	Q127885	Serbs	Serbs
Q7325	1	0	30	157	564	Q7325	Jewish people	Jewish people
Q7129609	2	0	115	57	522	Q7129609	Caucasian race	Caucasian race
Q161652	0	1	150	20	513	Q161652	Japanese people	Japanese people
Q49085	0	0	72	69	423	Q49085	African American	African American
Q235155	0	1	81	35	351	Q235155	white people	white people
Q402913	0	0	28	74	306	Q402913	Bengali people	Bengali people
Q187985	0	0	9	88	291	Q187985	Tibetan people	Tibetan people
Q485150	1	0	21	71	279	Q485150	Romanians	Romanians
Q7994501	0	0	32	54	258	Q7994501	White British	White British
Q4887679	0	0	9	71	240	Q4887679	Bengali Hindus	Bengali Hindus
Q42406	0	0	43	30	219	Q42406	English people	English people
Q34069	0	0	3	53	168	Q34069	Ashkenazi Jews	Ashkenazi Jews
Q42884	0	0	15	39	162	Q42884	Germans	Germans
Q49078	0	0	22	30	156	Q49078	White American	White American
Q2556103	1	0	6	41	144	Q2556103	Pashtun people	Pashtun people
Q133032	0	0	14	27	123	Q133032	Hungarian people	Hungarian people
Q42740	1	0	3	37	123	Q42740	Han Chinese	Han Chinese
Q50001	0	0	26	14	120	Q50001	Italians	Italians
Q1815623	0	0	0	36	108	Q1815623	Sri Lankan Tamil people	Sri Lankan Tamil people
Q35323	0	0	7	29	108	Q35323	Arab	Arab
Q241696	0	0	11	22	99	Q241696	Somali people	Somali people
Q403656	0	0	16	17	99	Q403656	Baganda	Baganda
Q932244	0	0	4	28	96	Q932244	Sinhalese people	Sinhalese people
Q30	0	0	6	25	93	Q30	United States of America	United States of America
Q49542	0	0	6	23	87	Q49542	Russians	Russians
Q43103	0	1	4	24	87	Q43103	European American	European American
Q121842	0	0	18	10	84	Q121842	French people	French people
Q854323	0	0	5	22	81	Q854323	Punjabi people	Punjabi people
Q84072	0	0	7	19	78	Q84072	Turkish people	Turkish people
Q179248	0	0	4	22	78	Q179248	Albanians	Albanians
Q1026	0	0	7	18	75	Q1026	Poles	Poles
Q4172847	0	0	16	8	72	Q4172847	Filipino people	Filipino people
Q170826	0	0	11	12	69	Q170826	Irish people	Irish people
Q79797	0	0	5	18	69	Q79797	Armenians	Armenians
Q581780	0	0	0	22	66	Q581780	Banu Khazraj	Banu Khazraj
Q445618	0	0	1	21	66	Q445618	Banat Swabians	Banat Swabians
Q170217	1	0	13	7	63	Q170217	Czechs	Czechs
Q160894	0	0	9	12	63	Q160894	Spanish people	Spanish people
Q483569	0	0	3	18	63	Q483569	Belarusians	Belarusians
Q974693	0	0	12	9	63	Q974693	Italian American	Italian American
Q1344183	0	0	7	12	57	Q1344183	English American	English American
Q133255	0	0	3	16	57	Q133255	Bulgarians	Bulgarians
Q181634	0	0	3	15	54	Q181634	Scottish people	Scottish people
Q484464	0	0	9	9	54	Q484464	Koreans	Koreans
Q842438	0	0	7	10	51	Q842438	British people	British people
Q700469	0	0	1	16	51	Q700469	Germans of Romania	Germans of Romania
Q6501380	0	0	5	12	51	Q6501380	Chinese people	Chinese people
Q862086	0	0	9	7	48	Q862086	Indian people	Indian people