notebook.community

Edit and run



In [8]:

    
import pandas as pd
import numpy
from collections import defaultdict
from matplotlib.pylab import style
import json
style.use('fivethirtyeight')
%pylab inline
java_min_int = -2147483648









    



Populating the interactive namespace from numpy and matplotlib



In [2]:

    
allrecs = pd.read_csv('snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv',na_values=[java_min_int])
def split_column(q_str):
    if type(q_str) is float:
        if numpy.isnan(q_str):
            return q_str 
    if type(q_str) is str:
        qs = q_str.split('|')
        return qs[:-1] #cos the format will always end with a |

for col in ['gender','site_links']:
    allrecs[col] = allrecs[col].apply(split_column)



In [3]:

    
allrecs.head(5)









    Out[3]:






  
    
      
      qid
      dob
      dod
      gender
      ethnic_group
      citizenship
      place_of_birth
      site_links
    
  
  
    
      0
        Q23
       1732
       1799
       [Q6581097]
       NaN
            Q30|
       Q494413|
       [zhwiki, kywiki, euwiki, plwiki, bswiki, angwi...
    
    
      1
        Q42
       1952
       2001
       [Q6581097]
       NaN
           Q145|
          Q350|
       [zhwiki, jvwiki, euwiki, plwiki, bswiki, eswik...
    
    
      2
       Q207
       1946
        NaN
       [Q6581097]
       NaN
            Q30|
        Q49145|
       [uzwiki, eswiki, kowikiquote, huwiki, liwikiqu...
    
    
      3
       Q297
        NaN
       1660
       [Q6581097]
       NaN
            Q29|
         Q8717|
       [zhwiki, kywiki, plwiki, euwiki, bswiki, uzwik...
    
    
      4
       Q326
       1942
        NaN
       [Q6581097]
       NaN
       Q298|Q39|
         Q2887|
       [zhwiki, plwiki, euwiki, kowiki, frwiki, eswik...



In [4]:

    
lang_culture_map = pd.DataFrame.from_csv('helpers/aggregation_maps/lang_culture.csv')

def agg_culture(wikiname_list):
    cultures = set()
    if isinstance(wikiname_list, list):
        for wikiname in wikiname_list:
            parts = wikiname.split('wiki')
            if parts[1] == '': #this was a pedia
                lang_code = parts[0]
                try:
                    culture = lang_culture_map.ix[lang_code]['culture']
                    cultures.add(culture)
                except KeyError:
                    continue
    return list(cultures)



In [5]:

    
len(allrecs)









    Out[5]:





2561999



In [6]:

    
allrecs['cultures'] = allrecs['site_links'].apply(agg_culture)









    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-6-e25fe682b4b1> in <module>()
----> 1 allrecs['cultures'] = allrecs['site_links'].apply(agg_culture)

/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in apply(self, func, convert_dtype, args, **kwds)
   2056             values = lib.map_infer(values, lib.Timestamp)
   2057 
-> 2058         mapped = lib.map_infer(values, f, convert=convert_dtype)
   2059         if len(mapped) and isinstance(mapped[0], Series):
   2060             from pandas.core.frame import DataFrame

/usr/local/lib/python2.7/dist-packages/pandas/lib.so in pandas.lib.map_infer (pandas/lib.c:57158)()

<ipython-input-4-ad100d1b5480> in agg_culture(wikiname_list)
      9                 lang_code = parts[0]
     10                 try:
---> 11                     culture = lang_culture_map.ix[lang_code]['culture']
     12                     cultures.add(culture)
     13                 except KeyError:

/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.pyc in __getitem__(self, key)
     70             return self._getitem_tuple(key)
     71         else:
---> 72             return self._getitem_axis(key, axis=0)
     73 
     74     def _get_label(self, label, axis=0):

/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.pyc in _getitem_axis(self, key, axis)
    926                     return self._get_loc(key, axis=axis)
    927 
--> 928             return self._get_label(key, axis=axis)
    929 
    930     def _getitem_iterable(self, key, axis=0):

/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.pyc in _get_label(self, label, axis)
     86             raise IndexingError('no slices here, handle elsewhere')
     87 
---> 88         return self.obj._xs(label, axis=axis)
     89 
     90     def _get_loc(self, key, axis=0):

/usr/local/lib/python2.7/dist-packages/pandas/core/generic.pyc in xs(self, key, axis, level, copy, drop_level)
   1469 
   1470             result = Series(new_values, index=self.columns,
-> 1471                             name=self.index[loc])
   1472 
   1473         else:

/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in __init__(self, data, index, dtype, name, copy, fastpath)
    216 
    217         object.__setattr__(self, 'name', name)
--> 218         self._set_axis(0, index, fastpath=True)
    219 
    220     @classmethod

/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in _set_axis(self, axis, labels, fastpath)
    259         self._set_subtyp(is_all_dates)
    260 
--> 261         object.__setattr__(self, '_index', labels)
    262         if not fastpath:
    263             self._data.set_axis(axis, labels)

KeyboardInterrupt:



In [ ]:

    
def dofd():
    return defaultdict(int)

culture_gender_dict = defaultdict(dofd)

for row in allrecs.iterrows():
    colbit = row[1]
    gender_list = colbit[3]
    if isinstance(gender_list, list):
        gender = gender_list[0]
    else:
        gender = None
    cultures = colbit[8]
    for culture in cultures:
        culture_gender_dict[culture][gender] += 1



In [ ]:

    
lang_cultures = pd.DataFrame.from_dict(culture_gender_dict, orient='index')



In [ ]:

    
lang_cultures.to_json('helpers/lang_cultures.json')



In [37]:

    
lang_cultures = pd.DataFrame.from_dict(json.load(open('helpers/lang_cultures.json','r')))



In [51]:

    
lang_cultures = lang_cultures.drop('Northern Sotho Wikipedia',axis=0)



In [55]:

    
import pywikibot
#Tranforming QIDs into English labels.
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()

retrieved = dict()

def english_label(qid):
    if qid:
        if type(qid) is float:
            if math.isnan(qid):
                return None
        #first see if we've done it
        try:
            return retrieved[qid]
        except KeyError:
            try:
                page = pywikibot.ItemPage(wikidata, qid)
                data = page.get()
                lab = data['labels']['en']
                retrieved[qid] = lab
                return lab
            except KeyError:
                retrieved[qid] = qid
                return qid
            except:
                return qid
    return qid



In [56]:

    
lang_cultures.columns = map(english_label, lang_cultures.columns)









    



VERBOSE:pywiki:Found 1 wikidata:wikidata processes running, including this one.



In [57]:

    
lang_cultures['human_total'] = lang_cultures.sum(axis=1)
lang_cultures['gendered_total'] = lang_cultures['human_total'] - lang_cultures['null']
lang_cultures['nonbin_total'] = lang_cultures['gendered_total'] - lang_cultures['female'] - lang_cultures['male']
lang_cultures['fem_per'] = lang_cultures['female'] / lang_cultures['gendered_total']
lang_cultures['nonbin_per'] = lang_cultures['nonbin_total'] / lang_cultures['gendered_total']



In [58]:

    
lang_cultures.fillna(0,inplace=True)



In [59]:

    
lang_cultures









    Out[59]:






  
    
      
      transgender female
      intersex
      fa'afafine
      transgender male
      female animal
      male animal
      woman
      genderqueer
      female
      male
      kathoey
      null
      human_total
      gendered_total
      nonbin_total
      fem_per
      nonbin_per
    
  
  
    
      Africa
        0
        0
       0
        0
       0
       0
       0
       0
         1888
         12824
       0
          306
         15018
         14712
         0
       0.128331
       0.000000
    
    
      Catholic european
       39
        3
       1
        7
       0
       6
       0
       3
       122048
        712655
       0
        33480
        868242
        834762
        59
       0.146207
       0.000071
    
    
      Confucian
       24
        2
       1
        1
       0
       0
       0
       2
        47355
        133508
       1
       163710
        344604
        180894
        31
       0.261783
       0.000171
    
    
      Constructed
        6
        1
       0
        1
       0
       0
       0
       0
         4321
         27085
       1
         2252
         33667
         31415
         9
       0.137546
       0.000286
    
    
      English-speaking
       99
       17
       1
       19
       0
       0
       0
       7
       186455
       1016500
       1
         3259
       1206358
       1203099
       144
       0.154979
       0.000120
    
    
      Islamic
       29
        1
       0
        2
       0
       0
       0
       1
        20931
        100099
       1
        19529
        140593
        121064
        34
       0.172892
       0.000281
    
    
      Latin America
       25
        3
       1
        4
       0
       0
       0
       1
        38195
        202735
       0
         1979
        242943
        240964
        34
       0.158509
       0.000141
    
    
      Orthodox
       23
        1
       0
        1
       1
       2
       1
       1
        53843
        324884
       1
        15316
        394074
        378758
        31
       0.142157
       0.000082
    
    
      Protestant European
       26
        2
       1
        7
       0
       0
       0
       3
       132250
        683435
       1
        17611
        833336
        815725
        40
       0.162126
       0.000049
    
    
      South Asia
       13
        1
       0
        2
       0
       0
       0
       1
        17000
         68817
       1
        15180
        101015
         85835
        18
       0.198054
       0.000210



In [88]:

    
fig, ax = plt.subplots(1,1,figsize=(8,8))
lang_cultures[['gendered_total','fem_per']].plot(kind='scatter', x='gendered_total', y='fem_per', logx=True, ax=ax, c='#74bc3a')

ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x )))
ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:,.0f}'.format(x )))

ax.set_xlim(min(lang_cultures['gendered_total']) * 0.6, max(lang_cultures['gendered_total']) *3)
ax.set_ylim(min(lang_cultures['fem_per']) * 0.85, max(lang_cultures['fem_per']) *1.15)

for label, x, y in zip(lang_cultures.index, lang_cultures['gendered_total'], lang_cultures['fem_per']):
    plt.annotate(
        label, 
        xy = (x, y), xytext = (5,2),
        textcoords = 'offset points', ha = 'left', va = 'bottom')
        #bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
        #arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))

plt.annotate("", xy=(10000,0.5), xytext=(0,0)) 
plt.title('Female % vs. Total Biographies \nLanguage Aggregated By Culture', fontsize=24)
plt.xlabel('Number of Biographies Recorded')
plt.ylabel('Composition of Biographies Which Are Female')









    Out[88]:





<matplotlib.text.Text at 0x7feccbfe6a50>



In [ ]:

	qid	dob	dod	gender	ethnic_group	citizenship	place_of_birth	site_links
0	Q23	1732	1799	[Q6581097]	NaN	Q30\|	Q494413\|	[zhwiki, kywiki, euwiki, plwiki, bswiki, angwi...
1	Q42	1952	2001	[Q6581097]	NaN	Q145\|	Q350\|	[zhwiki, jvwiki, euwiki, plwiki, bswiki, eswik...
2	Q207	1946	NaN	[Q6581097]	NaN	Q30\|	Q49145\|	[uzwiki, eswiki, kowikiquote, huwiki, liwikiqu...
3	Q297	NaN	1660	[Q6581097]	NaN	Q29\|	Q8717\|	[zhwiki, kywiki, plwiki, euwiki, bswiki, uzwik...
4	Q326	1942	NaN	[Q6581097]	NaN	Q298\|Q39\|	Q2887\|	[zhwiki, plwiki, euwiki, kowiki, frwiki, eswik...

	transgender female	intersex	fa'afafine	transgender male	female animal	male animal	woman	genderqueer	female	male	kathoey	null	human_total	gendered_total	nonbin_total	fem_per	nonbin_per
Africa	0	0	0	0	0	0	0	0	1888	12824	0	306	15018	14712	0	0.128331	0.000000
Catholic european	39	3	1	7	0	6	0	3	122048	712655	0	33480	868242	834762	59	0.146207	0.000071
Confucian	24	2	1	1	0	0	0	2	47355	133508	1	163710	344604	180894	31	0.261783	0.000171
Constructed	6	1	0	1	0	0	0	0	4321	27085	1	2252	33667	31415	9	0.137546	0.000286
English-speaking	99	17	1	19	0	0	0	7	186455	1016500	1	3259	1206358	1203099	144	0.154979	0.000120
Islamic	29	1	0	2	0	0	0	1	20931	100099	1	19529	140593	121064	34	0.172892	0.000281
Latin America	25	3	1	4	0	0	0	1	38195	202735	0	1979	242943	240964	34	0.158509	0.000141
Orthodox	23	1	0	1	1	2	1	1	53843	324884	1	15316	394074	378758	31	0.142157	0.000082
Protestant European	26	2	1	7	0	0	0	3	132250	683435	1	17611	833336	815725	40	0.162126	0.000049
South Asia	13	1	0	2	0	0	0	1	17000	68817	1	15180	101015	85835	18	0.198054	0.000210