In [1]:
import pandas
import math
import datetime
import os
import json

import matplotlib.pyplot as plt

plt.style.use('fivethirtyeight')

%pylab inline

java_min_int = -2147483648
gender_ordered = [u'female', u'male', u'transgender female', u'intersex', u"fa'afafine", u'transgender male', u'female animal', u'male animal', u'woman', u'genderqueer', u'kathoey']

import pywikibot
#Tranforming QIDs into English labels.
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()

retrieved = dict()

def english_label(qid):
    if type(qid) is float:
        if math.isnan(qid):
            return qid
    #first see if we've done it
    try:
        return retrieved[qid]
    except KeyError:
        try:
            page = pywikibot.ItemPage(wikidata, qid)
            data = page.get()
            lab = data['labels']['en']
            retrieved[qid] = lab
            return lab
        except:
            retrieved[qid] = qid
            return qid

def engify_labels(df, index=False):
    if index:
        axis = df.index
    else:
        axis = df.columns
    qids = [str(q) for q in axis]
    labels = [english_label(qid) for qid in qids]
    axis = labels


VERBOSE:pywiki:Starting 1 threads...
Populating the interactive namespace from numpy and matplotlib

This is how you'd get a dataframe for a specific snapshot


In [28]:
df = pandas.read_csv('snapshot_data/2014-09-17/gender-index-data-2014-09-17.csv', 
                                 na_values=[java_min_int])

In [2]:
ls snapshot_data/


2014-09-17/  2014-10-13/

In [151]:
#in case these things need ungzipping
snap_folders = !ls snapshot_data/
for folder in snap_folders:
    !gunzip -k snapshot_data/$folder/*.gz


gzip: snapshot_data/2014-09-17/gender-index-data-2014-09-17.csv already exists; do you wish to overwrite (y or n)? ^C
gzip: snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv already exists; do you wish to overwrite (y or n)? ^C

In [152]:
folders = !ls snapshot_data/
locs = []
for folder in folders:
        loc = !ls snapshot_data/$folder/gender-index*.csv
        locs.append(loc[0])
print locs


['snapshot_data/2014-09-17/gender-index-data-2014-09-17.csv', 'snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv']

In [153]:
snap_dfs = {loc[-14:-4]: pandas.read_csv(loc, na_values=[java_min_int]) for loc in locs}

In [35]:
def split_column(q_str):
    if type(q_str) is float:
        if numpy.isnan(q_str):
            return [q_str] #returning this way so we can gurantee that column contains list
    if type(q_str) is str:
        qs = q_str.split('|')
        return qs[:-1] #cos the format will always end with a |

for snap, df in snap_dfs.iteritems():
    for column in ['gender', 'ethnic_group', 'citizenship', 'place_of_birth', 'site_links']:
        column_plural = column+'s'
        df[column_plural] = df[column].apply(split_column)

In [36]:
latest = snap_dfs[max(snap_dfs.keys())]
earliest = snap_dfs[min(snap_dfs.keys())]

In [37]:
latest.query('dob == 999')


Out[37]:
qid dob dod gender ethnic_group citizenship place_of_birth site_links genders ethnic_groups citizenships place_of_births site_linkss
25981 Q337805 999 1062 Q6581097| NaN Q29520| Q1209298| zhwiki|wuuwiki|zh_classicalwiki|bowiki|ruwiki|... [Q6581097] [nan] [Q29520] [Q1209298] [zhwiki, wuuwiki, zh_classicalwiki, bowiki, ru...
555620 Q15645727 999 1025 Q6581072| NaN NaN NaN jawiki| [Q6581072] [nan] [nan] [nan] [jawiki]
808984 Q3012130 999 1082 Q6581072| NaN NaN NaN jawiki|kowiki|frwiki|eswiki|cawiki| [Q6581072] [nan] [nan] [nan] [jawiki, kowiki, frwiki, eswiki, cawiki]
1947827 Q336180 999 1072 Q6581097| NaN Q794| NaN dewiki|jawiki|tgwiki|fawiki|ruwiki|frwiki|enwi... [Q6581097] [nan] [Q794] [nan] [dewiki, jawiki, tgwiki, fawiki, ruwiki, frwik...
1993569 Q1093355 999 1025 Q6581072| NaN NaN NaN zhwiki|jawiki|kowiki|frwiki|eswiki|jawikiquote| [Q6581072] [nan] [nan] [nan] [zhwiki, jawiki, kowiki, frwiki, eswiki, jawik...

In [ ]:
for df in [earliest, latest]:
    for qid in latest.query('dob == 999')['qid']:
        print 'http://wikidata.org/wiki/'+qid
    print '\n'

In [38]:
from collections import defaultdict
import time

def make_reindex(snap_df):

    def int_dict_factory():
        return defaultdict(int)

    def nan_test(v):
        try:
            if math.isnan(v):
                return True
        except TypeError:
                return False
    #abstracted: we want year-gender, but also
    #gender-ethnicity -citizenship -place of birth, site-links

    params = ['dob','dod','genders','ethnic_groups','citizenships','place_of_births','site_linkss']
    gender_param = {param:defaultdict(int_dict_factory) for param in params}

    for index, row in snap_df.iterrows():
        row_data = {p : row[p] for p in params}
        for param in params:
            gender_dict = gender_param[param]
            vrs = row_data[param]
            genders = row_data['genders']
            if not nan_test(vrs):
                if not nan_test(genders):
                    for gender in genders:
                            if type(vrs) is list:
                                for var in vrs:
                                    gender_dict[gender][var] += 1
                            else: 
                                    gender_dict[gender][vrs] +=  1
                                    
    gender_dfs = {param: pandas.DataFrame.from_dict(gender_param[param], orient='columns') for param in params}
    return gender_dfs

In [148]:
gender_indexes = {snap: make_reindex(df) for snap, df in snap_dfs.iteritems()}


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-148-4a18d1828f7e> in <module>()
----> 1 gender_indexes = {snap: make_reindex(df) for snap, df in snap_dfs.iteritems()}

NameError: name 'snap_dfs' is not defined

In [238]:
for snap, gender_dfs in gender_indexes.iteritems():
    for param, gender_df in gender_dfs.iteritems():
        print param
        engify_labels(gender_df)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-238-4c36c56aa8d6> in <module>()
----> 1 for snap, gender_dfs in gender_indexes.iteritems():
      2     for param, gender_df in gender_dfs.iteritems():
      3         print param
      4         engify_labels(gender_df)
      5 

NameError: name 'gender_indexes' is not defined

In [47]:
for snap, gender_df in gender_indexes.iteritems():
    for param, gender_df in gender_dfs.iteritems():
        property_index_dir = 'snapshot_data/%s/property_indexes' % (snap)
        if not os.path.exists(property_index_dir):
            os.makedirs(property_index_dir)
        filename = '%s/%s-index.csv' % (property_index_dir, param)
        filepoint = open(filename, 'w')
        filepoint.write(gender_df.to_csv())
        filepoint.close()

In [49]:
latest_date = max(snap_dfs.keys())
earliest_date = min(snap_dfs.keys())

In [88]:
gender_indexes[latest_date]['dob'].ix[999]


Out[88]:
nan                  NaN
transgender female   NaN
intersex             NaN
fa'afafine           NaN
transgender male     NaN
male animal          NaN
woman                NaN
genderqueer          NaN
female                 3
male                   2
kathoey              NaN
Name: 999.0, dtype: float64

In [58]:
gender_dfs['dob'].plot(kind='area',stacked=True, figsize=(24,8))


Out[58]:
<matplotlib.axes.AxesSubplot at 0x7f1c63aa9810>

In [43]:
plt = gender_dfs['dob'].ix[1800:].plot(kind='area',stacked=True, figsize=(24,8))
plt.set_title('''Wikidata Biography Gender Quantities by Year
    1800 onwards''', size=24)
plt.set_xlabel('Year', size=18)
plt.set_ylabel('Biographies', size=18)
plt.set_xlim((1800,2014))
plt.legend(title='Gender', loc=2)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-43-c0de4e952f65> in <module>()
----> 1 plt = gender_dfs['dob'].ix[1800:].plot(kind='area',stacked=True, figsize=(24,8))
      2 plt.set_title('''Wikidata Biography Gender Quantities by Year
      3     1800 onwards''', size=24)
      4 plt.set_xlabel('Year', size=18)
      5 plt.set_ylabel('Biographies', size=18)

NameError: name 'gender_dfs' is not defined

In [117]:
nonmale_early = gender_indexes[earliest_date]['dod'].copy(deep=True)
nonmale_late = gender_indexes[latest_date]['dod'].copy(deep=True)

del nonmale_early['nan']
nonmale_early['nonmale'] = nonmale_early.sum(axis=1) - nonmale_early['male']
nonmale_early['nm_per'] = (nonmale_early['nonmale'] ) / (nonmale_early['nonmale'] + nonmale_early['male'])

del nonmale_late['nan']
nonmale_late['nonmale'] = nonmale_late.sum(axis=1) - nonmale_late['male']
nonmale_late['nm_per'] = (nonmale_late['nonmale'] ) / (nonmale_late['nonmale'] + nonmale_late['male'])

In [128]:
plt = nonmale_early['nm_per'].plot(figsize(24,8), kind='line')
plt.set_xlim((-1000,2014))


Out[128]:
(-1000, 2014)

In [133]:
ma = pandas.rolling_mean(nonmale_early['nm_per'], 10)

In [134]:
plt = ma.plot()
plt.set_xlim((-1000,2014))


Out[134]:
(-1000, 2014)

In [ ]:
nonmale_late['nm_per'].plot(figsize=(24,8))
plt.set_xlim((-1000,2014))

In [67]:
gender_ordered


Out[67]:
[u'female',
 u'male',
 u'transgender female',
 u'intersex',
 u"fa'afafine",
 u'transgender male',
 u'female animal',
 u'male animal',
 u'woman',
 u'genderqueer',
 u'kathoey']

In [34]:
['Date of Birth']*2


Out[34]:
['Date of Birth', 'Date of Birth']

In [114]:
plt.style


Out[114]:
'/usr/local/lib/python2.7/dist-packages/matplotlib/style/__init__.pyc'

In [16]:
infogram = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/dob-index.csv')
infogram.fillna(0, inplace=True)
del infogram['nan']
infogram['total'] = infogram.sum(axis=1)
infogram['nonbin'] = infogram['total'] - infogram['male'] - infogram['female']
infogram['fem_per'] = infogram['female']  / (infogram['total'])
infogram['nonbin_per'] = infogram['nonbin'] / infogram['total']
rm = pandas.rolling_mean(infogram[['fem_per','nonbin_per']], 10, min_periods=10)

In [19]:
partyears = range(1800,2000,int(200/6.0))
rm.ix[partyears][['nonbin_per','fem_per']].T.to_csv('Magnus Gender analysis/infogram_dob_rm.csv')
infogram.ix[partyears][['nonbin_per','fem_per','nonbin']].T.to_csv('Magnus Gender analysis/infogram_dob.csv')

In [28]:
rm.ix[1880:1910]['nonbin_per']


Out[28]:
1880    0.000000
1881    0.000000
1882    0.000022
1883    0.000022
1884    0.000022
1885    0.000043
1886    0.000043
1887    0.000043
1888    0.000043
1889    0.000043
1890    0.000043
1891    0.000043
1892    0.000021
1893    0.000021
1894    0.000021
1895    0.000000
1896    0.000000
1897    0.000000
1898    0.000000
1899    0.000000
1900    0.000000
1901    0.000000
1902    0.000000
1903    0.000000
1904    0.000000
1905    0.000000
1906    0.000000
1907    0.000000
1908    0.000000
1909    0.000000
1910    0.000000
Name: nonbin_per, dtype: float64

In [3]:
fig, axes = pylab.subplots(nrows=2, ncols=2, sharey=True)
for ax, beginning, l, (xtext, ytext) in zip(axes.ravel(), [-4000, 1800] * 2, ['b']*2+['d']*2, [('-4000 BCE to present','Date of Birth'),('1800 CE to present',''),('','Date of Death'),('','')]):
    acro = 'do'+l
    df = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/%s-index.csv' % acro)
    del df['nan']
    df   = df.reindex_axis(gender_ordered,axis=1)
    p = df.plot(kind='area', figsize=(9,10), cmap='Accent', ax=ax, legend=False, linewidth=1)
    p.set_xlim((beginning,2014))
    #p.set_xlabel(xtext)
    p.set_ylabel(ytext)
    p.set_title(xtext, fontsize=12)

fig.legend = legend(bbox_to_anchor=(1.05, 1.5), loc=2, borderaxespad=0)
fig.suptitle('Volumes of Dates of Birth and Death in Wikidata \n by Gender \n Total and Modern Timeframes', fontsize=24)
fig.tight_layout()
subplots_adjust(hspace=0.1, top=0.82)



In [177]:
ra_len = 1

dox = pandas.DataFrame()
nonbindox = pandas.DataFrame()

for l in ['b','d']:
    acro = 'do'+l
    df = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/%s-index.csv' % acro)
    del df['nan']
    df['total'] = df.sum(axis=1)
    df['nonbin'] = df['total'] - df['male'] - df['female']
    df['fem_per'] = df['female']  / (df['total'])
    df['nonbin_per'] = df['nonbin'] / df['total']
    
    ra = pandas.rolling_mean(df['fem_per'], ra_len)
    dox[acro] = ra
    
    nonbinra = pandas.rolling_mean(df['nonbin_per'], ra_len)
    nonbindox[acro] = nonbinra

fig, (pltf, pltb) =  pylab.subplots(nrows=2, ncols=1, sharex=True, figsize=(9,6))
dox.plot(kind='line',  cmap='Paired', linewidth=2, ax=pltf)
pltf.set_xlim((1400,2014))
pltf.set_ylim((0,0.7))
pltf.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{}%'.format(int(x*100) )))
pltf.set_title('Female ratio')
pltf.legend(('Date of Birth', 'Date of Death'),loc=4, bbox_to_anchor=(1.25,-0.25))

nonbindox.plot(kind='line',  cmap='Paired', linewidth=2, ax=pltb, legend=False)
pltb.set_xlim((1400,2014))
pltb.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{}%'.format(x*100)))
pltb.set_title('Non-binary Ratio')

fig.suptitle('Composition of Wikidata Genders in Modern History', fontsize=24)
fig.subplots_adjust(top=0.87)



In [207]:



Out[207]:
<matplotlib.text.Text at 0x7f1990eb4290>

Place of Birth Ivestiagation


In [4]:
pob = gender_indexes[latest_date]['place_of_births'].copy(deep=True)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-4e0945f5cefe> in <module>()
----> 1 pob = gender_indexes[latest_date]['place_of_births'].copy(deep=True)

NameError: name 'gender_indexes' is not defined

In [70]:
len(pob)


Out[70]:
72894

72,000 "places" how, many of these are countries. that is instance of "Q6256"


In [ ]:
%timeit -n 1 -r 1
def is_or_has_country(qid):
    countries = list() #we're going to return this
    page = pywikibot.ItemPage(wikidata, qid)
    data = page.get()
    claims = data['claims']
    for pid, claimlist in claims.iteritems():
        if pid == 'P17':
            for claim in claimlist:
                countries.append(claim.target.title()) #this is part of a country
        if pid == 'P31':
            for claim in claimlist:
                    if claim.target.title() == 'Q6256':
                        countries.append(qid) #this actually is a  country

    return countries 

place_country = dict()

count=0
for place in pob.index[1:]: #1 because the first index is nan
    place_country[place] = is_or_has_country(place)
    count += 1
    if count % 100 == 0:
        print count

In [97]:
pobs = [p for p in pob.index[1:]]
json.dump(pobs, open('pobs_list.json','w'))

do some processing on wmflabs to save on bandwidth


In [5]:
pobs_map = json.load(open('helpers/pobs_map.json','r'))

What percentage of pobs are of have a country? Which have more than one country?


In [9]:
have_country = [c[0] for c in pobs_map.values() if len(c) != 0]

In [10]:
len(list(set(have_country)))


Out[10]:
235

In [14]:
len(have_country) / float(len(pobs_map.values()))


Out[14]:
0.9363862099241353

In [15]:
have_no_country = [p for p, c in  pobs_map.iteritems() if len(c) == 0]

In [20]:
len(have_country)


Out[20]:
68256

In [16]:
len(have_no_country)


Out[16]:
4637

In [19]:
for place in have_no_country[10:20]:
    print 'http://wikidata.org/wiki/'+place


http://wikidata.org/wiki/Q361099
http://wikidata.org/wiki/Q4180803
http://wikidata.org/wiki/Q579468
http://wikidata.org/wiki/Q504912
http://wikidata.org/wiki/Q1013242
http://wikidata.org/wiki/Q1958565
http://wikidata.org/wiki/Q15763
http://wikidata.org/wiki/Q1091714
http://wikidata.org/wiki/Q448469
http://wikidata.org/wiki/Q7285906

there's not an obvious easy way to programmatically determine these, would probably have to go over them by hand

the sad part is that they probaby represent minority locations (and thus pepople)


In [23]:
country_lengths = sum([len(c) for c in have_country])

In [24]:
country_lengths / float(len(have_country))


Out[24]:
1.0032524613220817

Ok, which have more than one country.


In [33]:
for place, country_list in pobs_map.iteritems():
    if len(country_list) > 1:
        if reduce(lambda a, b: a != b, country_list ):#all elements are the not  same
            pass#print ['http://wikidata.org/wiki/'+place for place in country_list]

In [34]:
import IPython.display

In [40]:
IPython.display.Image('helpers/Inglehart_Values_Map2.svg.png')


Out[40]:
  1. confcian
  2. orthodox
  3. isalmic
  4. south asia
  5. africa
  6. catholic europea
  7. protestant europe
  8. english speaking

In [3]:
coun = [c for c in pobs_map.itervalues() if c]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-03814802914c> in <module>()
----> 1 coun = [c for c in pobs_map.itervalues() if c]

NameError: name 'pobs_map' is not defined

In [ ]:
country_culture = dict()
for place, country_list in pobs_map.iteritems():
    if country_list:
        qid =  country_list[0]
        if qid in country_culture.keys():
            continue
        else:
            link = 'http://wikidata.org/wiki/%s' % qid
            print english_label(qid)
            print link
            culture_num = input('enter culture num')
            country_culture[qid] = culture_num

In [39]:
fp = open('helpers/pob_agg.txt','w')
seen = list()
for place, country_list in pobs_map.iteritems():
    if country_list:
        qid =  country_list[0]
        if qid in seen:
            continue
        else:
            enlab = english_label(qid)
            writestr = u'%s\t%s\t\n' % (qid, enlab)
            fp.write(writestr.encode('utf-8'))
            seen.append(qid)
fp.close()

In [6]:
culture_map ={1:'confucian',
                        2:'orthodox',
                        3:'islamic',
                        4:'south asia',
                        5:'africa',
                        6: 'catholic europe',
                        7: 'protestant europe',
                        8: 'english speaking',
                        9: 'latin america'}

In [53]:
cultures_df.to_csv('helpers/culture_names.csv')

In [7]:
cultures_df = pandas.DataFrame.from_csv('helpers/culture_names.csv')
cultures_df['qid'] = cultures_df.index
cultures_df['cutlure_name'] = cultures_df['culture_number'].apply(lambda x: culture_map[x])

In [8]:
pob = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/place_of_births-index.csv')
pob['qid'] = pob.index
#pob = pob.ix[1:] #remove nan row

In [9]:
qid_countryqid = json.load(open('helpers/pobs_map.json','r'))

def qid_to_country(qid):
    if type(qid) is float:
        if math.isnan(qid):
            return 'no_data'
    else:
        return qid_countryqid[qid]
    
pob['country_qid'] = pob['qid'].apply(lambda qid: qid_to_country(qid))

In [10]:
country_culture = dict(zip(cultures_df['qid'], cultures_df['culture_name']))

In [11]:
def aggregate_culture(qid_list):
    if not type(qid_list) is list and qid_list == 'no_data':
                return 'no_data'
    if len(qid_list) > 0:
        culture_name = country_culture[qid_list[0]]
        return culture_name
    else:
        return 'not_easily_aggregatable'

In [12]:
pob['culture_name'] = pob['country_qid'].apply(lambda qid_list: aggregate_culture(qid_list))

In [13]:
culture_groups = pob.groupby(by=['culture_name'])[u'transgender female', u'intersex', u"fa'afafine", u'transgender male', u'female animal', u'male animal', u'woman', u'genderqueer', u'female', u'male', u'kathoey'].sum().copy(deep=True)

In [35]:
culture_groups.to_csv('helpers/pob_plot_data_oct.csv')

In [15]:
culture_groups['total'] = culture_groups.sum(axis=1)

In [16]:
normed_pobs_agg = culture_groups.apply(lambda x: x/ float(x['total']), axis=1)

In [17]:
pobs_plot = normed_pobs_agg.sort('female')[normed_pobs_agg.columns[:-1]]
pobs_plot_mf = normed_pobs_agg.sort('female')[['male','female']]
pobs_plot_nmf = normed_pobs_agg[[u'transgender female', u'intersex', u"fa'afafine", u'transgender male', u'female animal', u'male animal', u'woman', u'genderqueer', u'kathoey']]

In [18]:
pobs_plot.plot(kind='bar', figsize=(10,10))


Out[18]:
<matplotlib.axes.AxesSubplot at 0x7fc4d9f927d0>

In [91]:
plt = pobs_plot_mf.plot(kind='bar', figsize=(12,5), cmap='Paired')
plt.set_title('Place of Birth Gender Composition by 9 world cultures.\nNo Data are items without Birth Data\nNot Easily Aggregratable means Place of Birth did Claim did not have a Country Property.(7%)')
plt.set_ylabel('Gender Composition')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)


plt = pobs_plot_nmf.plot(kind='bar', figsize=(12,5), cmap='Accent')
plt.set_title('Place of Birth Gender Composition by 9 world cultures.\nNo Data are items without Birth Data\nNot Easily Aggregratable means Place of Birth did Claim did not have a Country Property.(7%)')
plt.set_ylabel('Gender Composition')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)


Out[91]:
<matplotlib.legend.Legend at 0x7fc4ac5825d0>

In [79]:



Out[79]:
<matplotlib.legend.Legend at 0x7fc4ad56a710>

In [ ]:
pob = gender_indexes[latest_date]['place_of_births'].copy(deep=True)

Ethnic Groups


In [142]:
eg = gender_indexes[latest_date]['ethnic_groups'].copy(deep=True)

In [154]:
eg = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/ethnic_groups-index.csv')

In [155]:
len(eg)


Out[155]:
683

In [160]:
eg=eg.ix[1:].fillna(value=0)
eg['total'] = eg.sum(axis=1)
engify_labels(eg,index=True)

In [162]:
eg['qid'] = eg.index

In [218]:
eg['ethnic_name'] = eg['qid'].apply(lambda x: english_label(x))

In [219]:
eg.sort('total', ascending=False).head(50)


Out[219]:
nan transgender female intersex fa'afafine transgender male female animal male animal woman genderqueer female male kathoey total qid ethinic_name ethnic_name
Q539051 116 0 0 0 0 0 0 0 0 190 2395 0 8103 Q539051 Greeks Greeks
Q127885 0 1 0 0 0 0 0 0 0 123 1135 0 3777 Q127885 Serbs Serbs
Q7325 1 0 0 0 0 0 0 0 0 30 157 0 564 Q7325 Jewish people Jewish people
Q7129609 2 0 0 0 0 0 0 0 0 115 57 0 522 Q7129609 Caucasian race Caucasian race
Q161652 0 1 0 0 0 0 0 0 0 150 20 0 513 Q161652 Japanese people Japanese people
Q49085 0 0 0 0 0 0 0 0 0 72 69 0 423 Q49085 African American African American
Q235155 0 1 0 0 0 0 0 0 0 81 35 0 351 Q235155 white people white people
Q402913 0 0 0 0 0 0 0 0 0 28 74 0 306 Q402913 Bengali people Bengali people
Q187985 0 0 0 0 0 0 0 0 0 9 88 0 291 Q187985 Tibetan people Tibetan people
Q485150 1 0 0 0 0 0 0 0 0 21 71 0 279 Q485150 Romanians Romanians
Q7994501 0 0 0 0 0 0 0 0 0 32 54 0 258 Q7994501 White British White British
Q4887679 0 0 0 0 0 0 0 0 0 9 71 0 240 Q4887679 Bengali Hindus Bengali Hindus
Q42406 0 0 0 0 0 0 0 0 0 43 30 0 219 Q42406 English people English people
Q34069 0 0 0 0 0 0 0 0 0 3 53 0 168 Q34069 Ashkenazi Jews Ashkenazi Jews
Q42884 0 0 0 0 0 0 0 0 0 15 39 0 162 Q42884 Germans Germans
Q49078 0 0 0 0 0 0 0 0 0 22 30 0 156 Q49078 White American White American
Q2556103 1 0 0 0 0 0 0 0 0 6 41 0 144 Q2556103 Pashtun people Pashtun people
Q133032 0 0 0 0 0 0 0 0 0 14 27 0 123 Q133032 Hungarian people Hungarian people
Q42740 1 0 0 0 0 0 0 0 0 3 37 0 123 Q42740 Han Chinese Han Chinese
Q50001 0 0 0 0 0 0 0 0 0 26 14 0 120 Q50001 Italians Italians
Q1815623 0 0 0 0 0 0 0 0 0 0 36 0 108 Q1815623 Sri Lankan Tamil people Sri Lankan Tamil people
Q35323 0 0 0 0 0 0 0 0 0 7 29 0 108 Q35323 Arab Arab
Q241696 0 0 0 0 0 0 0 0 0 11 22 0 99 Q241696 Somali people Somali people
Q403656 0 0 0 0 0 0 0 0 0 16 17 0 99 Q403656 Baganda Baganda
Q932244 0 0 0 0 0 0 0 0 0 4 28 0 96 Q932244 Sinhalese people Sinhalese people
Q30 0 0 0 0 0 0 0 0 0 6 25 0 93 Q30 United States of America United States of America
Q49542 0 0 0 0 0 0 0 0 0 6 23 0 87 Q49542 Russians Russians
Q43103 0 1 0 0 0 0 0 0 0 4 24 0 87 Q43103 European American European American
Q121842 0 0 0 0 0 0 0 0 0 18 10 0 84 Q121842 French people French people
Q854323 0 0 0 0 0 0 0 0 0 5 22 0 81 Q854323 Punjabi people Punjabi people
Q84072 0 0 0 0 0 0 0 0 0 7 19 0 78 Q84072 Turkish people Turkish people
Q179248 0 0 0 0 0 0 0 0 0 4 22 0 78 Q179248 Albanians Albanians
Q1026 0 0 0 0 0 0 0 0 0 7 18 0 75 Q1026 Poles Poles
Q4172847 0 0 0 0 0 0 0 0 0 16 8 0 72 Q4172847 Filipino people Filipino people
Q170826 0 0 0 0 0 0 0 0 0 11 12 0 69 Q170826 Irish people Irish people
Q79797 0 0 0 0 0 0 0 0 0 5 18 0 69 Q79797 Armenians Armenians
Q581780 0 0 0 0 0 0 0 0 0 0 22 0 66 Q581780 Banu Khazraj Banu Khazraj
Q445618 0 0 0 0 0 0 0 0 0 1 21 0 66 Q445618 Banat Swabians Banat Swabians
Q170217 1 0 0 0 0 0 0 0 0 13 7 0 63 Q170217 Czechs Czechs
Q160894 0 0 0 0 0 0 0 0 0 9 12 0 63 Q160894 Spanish people Spanish people
Q483569 0 0 0 0 0 0 0 0 0 3 18 0 63 Q483569 Belarusians Belarusians
Q974693 0 0 0 0 0 0 0 0 0 12 9 0 63 Q974693 Italian American Italian American
Q1344183 0 0 0 0 0 0 0 0 0 7 12 0 57 Q1344183 English American English American
Q133255 0 0 0 0 0 0 0 0 0 3 16 0 57 Q133255 Bulgarians Bulgarians
Q181634 0 0 0 0 0 0 0 0 0 3 15 0 54 Q181634 Scottish people Scottish people
Q484464 0 0 0 0 0 0 0 0 0 9 9 0 54 Q484464 Koreans Koreans
Q842438 0 0 0 0 0 0 0 0 0 7 10 0 51 Q842438 British people British people
Q700469 0 0 0 0 0 0 0 0 0 1 16 0 51 Q700469 Germans of Romania Germans of Romania
Q6501380 0 0 0 0 0 0 0 0 0 5 12 0 51 Q6501380 Chinese people Chinese people
Q862086 0 0 0 0 0 0 0 0 0 9 7 0 48 Q862086 Indian people Indian people

In [221]:
eg_normed = eg[[u'transgender female', u'intersex', u"fa'afafine", u'transgender male', u'female animal', u'male animal', u'woman', u'genderqueer', u'female', u'male', u'kathoey', 'total']].apply(lambda x: x/x['total'], axis=1)

In [228]:
eg_cut = eg[eg['total']>1]
eg_cut_normed = eg_cut[[u'transgender female', u'intersex', u"fa'afafine", u'transgender male', u'female animal', u'male animal', u'woman', u'genderqueer', u'female', u'male', u'kathoey', 'total']].apply(lambda x: x/x['total'], axis=1)

In [ ]:
eg_

In [177]:
engify_labels(eg_normed, index=True)

In [226]:
eg_normed.sort(columns=['female'], ascending=False)['female'].plot(kind='bar', figsize=(18,8))


Out[226]:
<matplotlib.axes.AxesSubplot at 0x7f1990731090>

In [241]:
def export_for_crowd_aggregate(df, savename):
    crowd_source_export  = pandas.DataFrame()
    crowd_source_export['qid'] = df.index
    crowd_source_export['en_label'] = crowd_source_export['qid'].apply(lambda x: english_label(x).encode('utf-8'))
    crowd_source_export['aggregate_group'] = ''
    crowd_source_export.to_csv('helpers/%s_map.csv' % savename)

In [ ]:
export_for_crowd_aggregate(eg, 'ethnic_groups')

Citizenships


In [99]:
cz = gender_indexes[latest_date]['citizenships'].copy(deep=True)

In [180]:
cz = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/citizenships-index.csv')

In [249]:
cz = cz.ix[1:] #remove the inintial nan ,which are items that had no citienzship

In [250]:
len(cz)


Out[250]:
732

In [ ]:
export_for_crowd_aggregate(cz, 'citizenships')

SiteLinkss!


In [183]:
sl = pandas.DataFrame.from_csv('snapshot_data/2014-10-13/property_indexes/site_linkss-index.csv')

In [184]:
len(sl)


Out[184]:
428

Regressions Maybe


In [57]:
yg.plot(kind='scatter', x='year', y='lnmale')
plt.draw()


Out[57]:
<matplotlib.axes.AxesSubplot at 0x7f718fd5dc10>

In [108]:
yg_reg = yg
yg_reg['year'] = yg_reg.index

In [109]:
yg_reg['lnmale'] = numpy.log(yg_reg['male'])

In [123]:
import statsmodels.api as sm

nonnan = yg_reg.ix[start_year:1986].fillna(value=0)
model = sm.OLS(nonnan['male'],nonnan['year'])
results = model.fit()
print(results.summary())
    print(start_year, results.rsquared)


(None, 0.20534979805159537)
(-500, 0.21745102948866801)
(0, 0.21941326163168851)
(500, 0.22247317415119383)
(1000, 0.24992223055451823)
(1500, 0.37760491504948823)
(1800, 0.72773643874338734)
(1900, 0.95518546745325672)

In [144]:
rsquared_results = pandas.DataFrame(columns=['start_year','rsquared'])
for start_year in numpy.arange(-4000,1950,50):
    nm_model = sm.OLS(yg_reg.ix[start_year:1986]['nm_per'], sm.add_constant(yg_reg.ix[start_year:1986]['year']) )
    nm_results = nm_model.fit()
    rsquared_results = rsquared_results.append({'start_year':start_year, 'rsquared':nm_results.rsquared}, ignore_index=True)
    
ax = rsquared_results[rsquared_results['start_year']>500].plot(kind='line',x='start_year',y='rsquared', 
                                                                title=r'$R^2$ value for linear regression on non-male percentage')

ax.set_xlabel('starting year of regresssion untilt 1987')
ax.set_ylabel(r'$R^2$')


Out[144]:
<matplotlib.text.Text at 0x7f719df8ef50>

This implies $y = -1.7495 + 0.001x$ with $R^2 = 0.885$

setting $y=0.5$ $\implies$ $x=2249.5$ or in the year 2250


In [ ]: