In [8]:
import pandas as pd
import numpy
from collections import defaultdict
from matplotlib.pylab import style
import json
style.use('fivethirtyeight')
%pylab inline
java_min_int = -2147483648


Populating the interactive namespace from numpy and matplotlib

In [2]:
allrecs = pd.read_csv('snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv',na_values=[java_min_int])
def split_column(q_str):
    if type(q_str) is float:
        if numpy.isnan(q_str):
            return q_str 
    if type(q_str) is str:
        qs = q_str.split('|')
        return qs[:-1] #cos the format will always end with a |

for col in ['gender','site_links']:
    allrecs[col] = allrecs[col].apply(split_column)

In [3]:
allrecs.head(5)


Out[3]:
qid dob dod gender ethnic_group citizenship place_of_birth site_links
0 Q23 1732 1799 [Q6581097] NaN Q30| Q494413| [zhwiki, kywiki, euwiki, plwiki, bswiki, angwi...
1 Q42 1952 2001 [Q6581097] NaN Q145| Q350| [zhwiki, jvwiki, euwiki, plwiki, bswiki, eswik...
2 Q207 1946 NaN [Q6581097] NaN Q30| Q49145| [uzwiki, eswiki, kowikiquote, huwiki, liwikiqu...
3 Q297 NaN 1660 [Q6581097] NaN Q29| Q8717| [zhwiki, kywiki, plwiki, euwiki, bswiki, uzwik...
4 Q326 1942 NaN [Q6581097] NaN Q298|Q39| Q2887| [zhwiki, plwiki, euwiki, kowiki, frwiki, eswik...

In [4]:
lang_culture_map = pd.DataFrame.from_csv('helpers/aggregation_maps/lang_culture.csv')

def agg_culture(wikiname_list):
    cultures = set()
    if isinstance(wikiname_list, list):
        for wikiname in wikiname_list:
            parts = wikiname.split('wiki')
            if parts[1] == '': #this was a pedia
                lang_code = parts[0]
                try:
                    culture = lang_culture_map.ix[lang_code]['culture']
                    cultures.add(culture)
                except KeyError:
                    continue
    return list(cultures)

In [5]:
len(allrecs)


Out[5]:
2561999

In [6]:
allrecs['cultures'] = allrecs['site_links'].apply(agg_culture)


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-6-e25fe682b4b1> in <module>()
----> 1 allrecs['cultures'] = allrecs['site_links'].apply(agg_culture)

/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in apply(self, func, convert_dtype, args, **kwds)
   2056             values = lib.map_infer(values, lib.Timestamp)
   2057 
-> 2058         mapped = lib.map_infer(values, f, convert=convert_dtype)
   2059         if len(mapped) and isinstance(mapped[0], Series):
   2060             from pandas.core.frame import DataFrame

/usr/local/lib/python2.7/dist-packages/pandas/lib.so in pandas.lib.map_infer (pandas/lib.c:57158)()

<ipython-input-4-ad100d1b5480> in agg_culture(wikiname_list)
      9                 lang_code = parts[0]
     10                 try:
---> 11                     culture = lang_culture_map.ix[lang_code]['culture']
     12                     cultures.add(culture)
     13                 except KeyError:

/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.pyc in __getitem__(self, key)
     70             return self._getitem_tuple(key)
     71         else:
---> 72             return self._getitem_axis(key, axis=0)
     73 
     74     def _get_label(self, label, axis=0):

/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.pyc in _getitem_axis(self, key, axis)
    926                     return self._get_loc(key, axis=axis)
    927 
--> 928             return self._get_label(key, axis=axis)
    929 
    930     def _getitem_iterable(self, key, axis=0):

/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.pyc in _get_label(self, label, axis)
     86             raise IndexingError('no slices here, handle elsewhere')
     87 
---> 88         return self.obj._xs(label, axis=axis)
     89 
     90     def _get_loc(self, key, axis=0):

/usr/local/lib/python2.7/dist-packages/pandas/core/generic.pyc in xs(self, key, axis, level, copy, drop_level)
   1469 
   1470             result = Series(new_values, index=self.columns,
-> 1471                             name=self.index[loc])
   1472 
   1473         else:

/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in __init__(self, data, index, dtype, name, copy, fastpath)
    216 
    217         object.__setattr__(self, 'name', name)
--> 218         self._set_axis(0, index, fastpath=True)
    219 
    220     @classmethod

/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in _set_axis(self, axis, labels, fastpath)
    259         self._set_subtyp(is_all_dates)
    260 
--> 261         object.__setattr__(self, '_index', labels)
    262         if not fastpath:
    263             self._data.set_axis(axis, labels)

KeyboardInterrupt: 

In [ ]:
def dofd():
    return defaultdict(int)

culture_gender_dict = defaultdict(dofd)

for row in allrecs.iterrows():
    colbit = row[1]
    gender_list = colbit[3]
    if isinstance(gender_list, list):
        gender = gender_list[0]
    else:
        gender = None
    cultures = colbit[8]
    for culture in cultures:
        culture_gender_dict[culture][gender] += 1

In [ ]:
lang_cultures = pd.DataFrame.from_dict(culture_gender_dict, orient='index')

In [ ]:
lang_cultures.to_json('helpers/lang_cultures.json')

In [37]:
lang_cultures = pd.DataFrame.from_dict(json.load(open('helpers/lang_cultures.json','r')))

In [51]:
lang_cultures = lang_cultures.drop('Northern Sotho Wikipedia',axis=0)

In [55]:
import pywikibot
#Tranforming QIDs into English labels.
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()

retrieved = dict()

def english_label(qid):
    if qid:
        if type(qid) is float:
            if math.isnan(qid):
                return None
        #first see if we've done it
        try:
            return retrieved[qid]
        except KeyError:
            try:
                page = pywikibot.ItemPage(wikidata, qid)
                data = page.get()
                lab = data['labels']['en']
                retrieved[qid] = lab
                return lab
            except KeyError:
                retrieved[qid] = qid
                return qid
            except:
                return qid
    return qid

In [56]:
lang_cultures.columns = map(english_label, lang_cultures.columns)


VERBOSE:pywiki:Found 1 wikidata:wikidata processes running, including this one.

In [57]:
lang_cultures['human_total'] = lang_cultures.sum(axis=1)
lang_cultures['gendered_total'] = lang_cultures['human_total'] - lang_cultures['null']
lang_cultures['nonbin_total'] = lang_cultures['gendered_total'] - lang_cultures['female'] - lang_cultures['male']
lang_cultures['fem_per'] = lang_cultures['female'] / lang_cultures['gendered_total']
lang_cultures['nonbin_per'] = lang_cultures['nonbin_total'] / lang_cultures['gendered_total']

In [58]:
lang_cultures.fillna(0,inplace=True)

In [59]:
lang_cultures


Out[59]:
transgender female intersex fa'afafine transgender male female animal male animal woman genderqueer female male kathoey null human_total gendered_total nonbin_total fem_per nonbin_per
Africa 0 0 0 0 0 0 0 0 1888 12824 0 306 15018 14712 0 0.128331 0.000000
Catholic european 39 3 1 7 0 6 0 3 122048 712655 0 33480 868242 834762 59 0.146207 0.000071
Confucian 24 2 1 1 0 0 0 2 47355 133508 1 163710 344604 180894 31 0.261783 0.000171
Constructed 6 1 0 1 0 0 0 0 4321 27085 1 2252 33667 31415 9 0.137546 0.000286
English-speaking 99 17 1 19 0 0 0 7 186455 1016500 1 3259 1206358 1203099 144 0.154979 0.000120
Islamic 29 1 0 2 0 0 0 1 20931 100099 1 19529 140593 121064 34 0.172892 0.000281
Latin America 25 3 1 4 0 0 0 1 38195 202735 0 1979 242943 240964 34 0.158509 0.000141
Orthodox 23 1 0 1 1 2 1 1 53843 324884 1 15316 394074 378758 31 0.142157 0.000082
Protestant European 26 2 1 7 0 0 0 3 132250 683435 1 17611 833336 815725 40 0.162126 0.000049
South Asia 13 1 0 2 0 0 0 1 17000 68817 1 15180 101015 85835 18 0.198054 0.000210

In [88]:
fig, ax = plt.subplots(1,1,figsize=(8,8))
lang_cultures[['gendered_total','fem_per']].plot(kind='scatter', x='gendered_total', y='fem_per', logx=True, ax=ax, c='#74bc3a')

ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x )))
ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:,.0f}'.format(x )))

ax.set_xlim(min(lang_cultures['gendered_total']) * 0.6, max(lang_cultures['gendered_total']) *3)
ax.set_ylim(min(lang_cultures['fem_per']) * 0.85, max(lang_cultures['fem_per']) *1.15)

for label, x, y in zip(lang_cultures.index, lang_cultures['gendered_total'], lang_cultures['fem_per']):
    plt.annotate(
        label, 
        xy = (x, y), xytext = (5,2),
        textcoords = 'offset points', ha = 'left', va = 'bottom')
        #bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
        #arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))

plt.annotate("", xy=(10000,0.5), xytext=(0,0)) 
plt.title('Female % vs. Total Biographies \nLanguage Aggregated By Culture', fontsize=24)
plt.xlabel('Number of Biographies Recorded')
plt.ylabel('Composition of Biographies Which Are Female')


Out[88]:
<matplotlib.text.Text at 0x7feccbfe6a50>

In [ ]: