In [8]:
import pandas as pd
import numpy
from collections import defaultdict
from matplotlib.pylab import style
import json
style.use('fivethirtyeight')
%pylab inline
# Smallest 32-bit signed integer (-2**31); passed to read_csv as a missing-value marker.
java_min_int = -(2 ** 31)
In [2]:
# Load the snapshot; cells equal to java_min_int are read as NaN.
allrecs = pd.read_csv(
    'snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv',
    na_values=[java_min_int],
)
def split_column(q_str):
    """Turn a pipe-delimited cell value into a list of its parts.

    Parameters
    ----------
    q_str : str, float or list
        A string such as ``'Q1|Q2|'``, a NaN marking a missing cell, or a
        list produced by an earlier call.

    Returns
    -------
    list, float or None
        * ``str``  -> the parts between ``|`` separators. The empty piece
          after a trailing ``|`` is removed; a string without a trailing
          ``|`` keeps its last value instead of losing it.
        * NaN      -> returned unchanged so missing cells stay missing.
        * ``list`` -> returned unchanged, so applying the function to a
          column twice does not wipe it.
        * anything else -> ``None``.
    """
    # Already split: pass through so re-running the cell is harmless.
    if isinstance(q_str, list):
        return q_str
    # isinstance (not an exact type check) also accepts float subclasses
    # such as numpy.float64.
    if isinstance(q_str, float):
        if numpy.isnan(q_str):
            return q_str
        return None
    if isinstance(q_str, str):
        parts = q_str.split('|')
        # The format normally ends with a '|', which leaves one empty
        # trailing piece; only strip it when it is actually there.
        if parts[-1] == '':
            parts = parts[:-1]
        return parts
    return None
# These two columns are stored as '|'-delimited strings; convert each cell with split_column.
for list_col in ('gender', 'site_links'):
    allrecs[list_col] = allrecs[list_col].apply(split_column)
In [3]:
# Preview the first five records.
allrecs.head(n=5)
Out[3]:
In [4]:
# Language-code -> culture lookup table, indexed by the file's first column.
# DataFrame.from_csv was deprecated and later removed from pandas; read_csv
# with index_col=0 is its replacement. from_csv also tried to parse the index
# as dates, which is left out here because agg_culture looks rows up by
# language code.
lang_culture_map = pd.read_csv('helpers/aggregation_maps/lang_culture.csv', index_col=0)
def agg_culture(wikiname_list):
    """Collect the distinct cultures behind a record's site links.

    Parameters
    ----------
    wikiname_list : list of str, or anything else
        Site-link names such as ``'enwiki'``. Any non-list value (for
        example the NaN of a missing cell) yields an empty result.

    Returns
    -------
    list
        The distinct ``culture`` values found in ``lang_culture_map`` for
        the names where nothing follows the first ``'wiki'`` (the case the
        original comment calls "a pedia"). Order is not defined because
        the values are gathered in a set.
    """
    cultures = set()
    if not isinstance(wikiname_list, list):
        return []
    for wikiname in wikiname_list:
        parts = wikiname.split('wiki')
        # A name without 'wiki' splits into a single piece; indexing
        # parts[1] would raise IndexError, so skip such names.
        if len(parts) < 2 or parts[1] != '':
            continue
        lang_code = parts[0]
        try:
            # .loc replaces the removed .ix indexer for this label lookup.
            cultures.add(lang_culture_map.loc[lang_code, 'culture'])
        except KeyError:
            # Language code not present in the map: ignore this link.
            continue
    return list(cultures)
In [5]:
# Number of records loaded.
allrecs.shape[0]
Out[5]:
In [6]:
# Map every record's site links to the list of cultures they belong to.
site_link_lists = allrecs['site_links']
allrecs['cultures'] = site_link_lists.apply(agg_culture)
In [ ]:
def dofd():
    """Return a new ``defaultdict`` whose missing keys start at 0."""
    counter = defaultdict(int)
    return counter
# Tally, for every culture a record is linked to, how many records carry each
# gender value. Records without a usable gender list are counted under None.
culture_gender_dict = defaultdict(dofd)
for row in allrecs.iterrows():
    colbit = row[1]
    # Columns are read by position, as before. Position 3 is presumably the
    # 'gender' column and 8 the 'cultures' column -- TODO confirm against the
    # CSV header. .iloc states the positional intent explicitly instead of
    # relying on integer keys falling back to positions on a labelled row.
    gender_list = colbit.iloc[3]
    # An empty list has no first element; treat it like a missing gender
    # rather than raising IndexError.
    if isinstance(gender_list, list) and gender_list:
        gender = gender_list[0]
    else:
        gender = None
    cultures = colbit.iloc[8]
    for culture in cultures:
        culture_gender_dict[culture][gender] += 1
In [ ]:
# Outer keys (cultures) become rows; inner keys (gender values) become columns.
lang_cultures = pd.DataFrame.from_dict(
    culture_gender_dict,
    orient='index',
)
In [ ]:
# Write the per-culture counts to disk as JSON.
lang_cultures.to_json(path_or_buf='helpers/lang_cultures.json')
In [37]:
# Reload the saved per-culture counts. The context manager closes the file
# handle, which the bare open() call left open.
with open('helpers/lang_cultures.json', 'r') as cultures_file:
    lang_cultures = pd.DataFrame.from_dict(json.load(cultures_file))
In [51]:
# Remove the 'Northern Sotho Wikipedia' row. Filtering on the index (instead
# of drop) still works when the row is already gone, so the cell can be
# re-run without raising.
lang_cultures = lang_cultures[lang_cultures.index != 'Northern Sotho Wikipedia']
In [55]:
import pywikibot
# Transforming QIDs into English labels.
# The data repository is obtained from the English Wikipedia site object.
enwp = pywikibot.Site('en', 'wikipedia')
wikidata = enwp.data_repository()
# Cache used by english_label: maps each looked-up QID to the value returned for it.
retrieved = {}
def english_label(qid):
    """Return the English label for a Wikidata item id, with caching.

    Parameters
    ----------
    qid : str, float or None
        Item id to look up. Falsy values are returned unchanged and NaN
        yields ``None``; neither triggers a lookup.

    Returns
    -------
    The cached or freshly fetched English label. If the item has no
    English label, ``qid`` itself is returned and cached. If the lookup
    fails with any other error, ``qid`` is returned without being cached,
    so a later call tries again.
    """
    # Local import: the notebook never imports math explicitly, so the
    # original only worked if the name happened to be in the namespace.
    import math

    if not qid:
        return qid
    if isinstance(qid, float) and math.isnan(qid):
        return None
    # First see if we've done it.
    try:
        return retrieved[qid]
    except KeyError:
        pass
    try:
        page = pywikibot.ItemPage(wikidata, qid)
        data = page.get()
        lab = data['labels']['en']
    except KeyError:
        # No English label available: remember the raw id instead.
        retrieved[qid] = qid
        return qid
    except Exception:
        # Deliberately broad best-effort fallback (e.g. a key that is not a
        # valid item id), but unlike a bare `except:` it lets
        # KeyboardInterrupt and SystemExit propagate.
        return qid
    retrieved[qid] = lab
    return lab
In [56]:
# Replace each column key with the value english_label returns for it.
lang_cultures.columns = [english_label(column_key) for column_key in lang_cultures.columns]
In [57]:
# Per-culture totals and shares derived from the per-gender counts.
# Columns created by this cell are left out of the row sum, so running the
# cell again does not add earlier totals and ratios into 'human_total'.
_derived_cols = ['human_total', 'gendered_total', 'nonbin_total', 'fem_per', 'nonbin_per']
# A gender never recorded for a culture is missing rather than 0; treat it as
# 0 so one gap does not turn every derived value for that culture into NaN.
_counts = lang_cultures.loc[:, ~lang_cultures.columns.isin(_derived_cols)].fillna(0)
lang_cultures['human_total'] = _counts.sum(axis=1)
lang_cultures['gendered_total'] = lang_cultures['human_total'] - _counts['null']
lang_cultures['nonbin_total'] = lang_cultures['gendered_total'] - _counts['female'] - _counts['male']
lang_cultures['fem_per'] = _counts['female'] / lang_cultures['gendered_total']
lang_cultures['nonbin_per'] = lang_cultures['nonbin_total'] / lang_cultures['gendered_total']
In [58]:
# Replace the remaining missing values with 0, modifying the frame in place.
lang_cultures.fillna(value=0, inplace=True)
In [59]:
# Display the per-culture table, including the derived totals and shares.
lang_cultures
Out[59]:
In [88]:
# Scatter plot: one labelled point per culture, female share against the
# number of gendered biographies (log-scaled x axis).
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
totals = lang_cultures['gendered_total']
shares = lang_cultures['fem_per']
lang_cultures[['gendered_total', 'fem_per']].plot(
    kind='scatter', x='gendered_total', y='fem_per',
    logx=True, ax=ax, c='#74bc3a')

# Tick labels: percentages on y, thousands-separated integers on x.
percent_formatter = matplotlib.ticker.FuncFormatter(lambda value, pos: '{:.0%}'.format(value))
count_formatter = matplotlib.ticker.FuncFormatter(lambda value, pos: '{:,.0f}'.format(value))
ax.yaxis.set_major_formatter(percent_formatter)
ax.xaxis.set_major_formatter(count_formatter)

# Pad the axis limits around the data so the point labels have room.
ax.set_xlim(min(totals) * 0.6, max(totals) * 3)
ax.set_ylim(min(shares) * 0.85, max(shares) * 1.15)

# Label every point with its index value, offset slightly up and to the right.
for label, x, y in zip(lang_cultures.index, totals, shares):
    ax.annotate(
        label,
        xy=(x, y), xytext=(5, 2),
        textcoords='offset points', ha='left', va='bottom')
    # Optional styling left disabled by the author:
    #bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
    #arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))
ax.annotate("", xy=(10000, 0.5), xytext=(0, 0))

ax.set_title('Female % vs. Total Biographies \nLanguage Aggregated By Culture', fontsize=24)
ax.set_xlabel('Number of Biographies Recorded')
ax.set_ylabel('Composition of Biographies Which Are Female')
Out[88]:
In [ ]: