In [1]:
import json
import math
from collections import defaultdict

import numpy
import pandas as pd
from matplotlib.pylab import style
# Apply the 'fivethirtyeight' matplotlib style sheet to the figures below.
style.use('fivethirtyeight')
# NOTE(review): later cells use `plt` and `matplotlib` without importing them,
# so they presumably come from this star-import magic -- confirm before
# replacing it with explicit imports.
%pylab inline
# -2**31 (the name suggests Java's Integer.MIN_VALUE). It is handed to
# read_csv as the value to treat as missing when the snapshot is loaded.
java_min_int = -2147483648
In [106]:
import pywikibot
# Handles used for transforming QIDs into English labels.
enwp = pywikibot.Site('en', 'wikipedia')
wikidata = enwp.data_repository()
# Cache of lookups already made by english_label(): QID -> resolved label.
retrieved = {}
def english_label(qid):
    """Return the English Wikidata label for ``qid``, caching each lookup.

    Parameters
    ----------
    qid : str, float or None
        A Wikidata item id (e.g. ``'Q42'``), or a missing value.

    Returns
    -------
    object
        * ``qid`` unchanged when it is falsy (``None``, ``''``).
        * ``None`` when ``qid`` is a float NaN.
        * The English label when the item has one (cached in ``retrieved``).
        * ``qid`` itself when the item has no English label (also cached),
          or when the lookup fails for any other reason (not cached, so a
          later call retries).
    """
    # Falsy ids carry nothing to look up.
    if not qid:
        return qid
    # isinstance (rather than an exact type check) also recognises float
    # subclasses such as numpy.float64, so a NaN never reaches Wikidata.
    if isinstance(qid, float) and math.isnan(qid):
        return None
    # First see if we've done it already.
    try:
        return retrieved[qid]
    except KeyError:
        pass
    try:
        label = pywikibot.ItemPage(wikidata, qid).get()['labels']['en']
    except KeyError:
        # No English label on the item: remember that and show the raw id.
        label = qid
    except Exception:
        # Best effort for any other lookup failure: fall back to the raw id
        # without caching. KeyboardInterrupt/SystemExit are deliberately not
        # caught so a long labelling run can still be interrupted.
        return qid
    retrieved[qid] = label
    return label
# Mapping consulted by lookup_lang(); presumably wiki language code -> full
# wiki name (verify against the JSON file). The context manager closes the
# file handle as soon as the JSON has been parsed.
with open('helpers/wiki_code_map.json', 'r') as lang_map_file:
    lang_map = json.load(lang_map_file)
def lookup_lang(lang):
    """Translate a wiki code into a display name using ``lang_map``.

    Parameters
    ----------
    lang : object
        Key to look up in ``lang_map``.

    Returns
    -------
    object
        The mapped name with a trailing "Wikipedia" word (any case) removed
        and whitespace collapsed; the mapped name as-is when it does not end
        with that word; or ``lang`` unchanged when it cannot be resolved
        (unknown or unhashable key, non-string entry, blank entry).
    """
    try:
        full = lang_map[lang]
        words = full.split()
    except (KeyError, TypeError, AttributeError):
        # Unknown code or unusable map entry: show the code itself.
        return lang
    if not words:
        # A blank entry carries no name to show.
        return lang
    if words[-1].lower() == 'wikipedia':
        return ' '.join(words[:-1])
    return full
In [2]:
# Load the 2014-10-13 snapshot; cells equal to java_min_int are read as NaN.
allrecs = pd.read_csv(
    'snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv',
    na_values=[java_min_int],
)
def split_column(q_str):
    """Split a pipe-delimited cell such as ``'Q1|Q2|'`` into a list.

    Parameters
    ----------
    q_str : str, float, list or other
        Raw cell value.

    Returns
    -------
    list, float or None
        * For a string: the ``'|'``-separated pieces, without the empty
          piece produced by a trailing ``'|'``. A string with no trailing
          ``'|'`` keeps its last piece.
        * For a float NaN: the NaN itself, so missing cells stay missing.
        * For a list: the list unchanged, so applying the function a second
          time to an already-split column is harmless.
        * ``None`` for anything else.
    """
    if isinstance(q_str, list):
        return q_str
    if isinstance(q_str, float) and numpy.isnan(q_str):
        return q_str
    if isinstance(q_str, str):
        parts = q_str.split('|')
        # The format normally ends with '|', which leaves one empty piece
        # at the end; drop only that piece, never a real value.
        if parts[-1] == '':
            parts.pop()
        return parts
    return None
# Replace each pipe-delimited column with the result of split_column().
for col in ('gender', 'site_links'):
    allrecs[col] = allrecs[col].apply(split_column)
In [3]:
# First five records (head() defaults to n=5).
allrecs.head()
Out[3]:
In [15]:
# Peek at the value in column position 3 of the first record. .iloc makes the
# positional lookup explicit: an integer key on a label-indexed row is not
# reliably treated as a position by pandas.
if len(allrecs) > 0:
    print(allrecs.iloc[0, 3])
In [32]:
def makedict():
    """Return a new ``defaultdict(int)``: a counter whose unseen keys start at 0."""
    counter = defaultdict(int)
    return counter
# wiki name -> {gender -> number of records}.
# `single` counts records whose wiki list has exactly one entry; `many`
# counts every other record once per wiki in its list.
single = defaultdict(makedict)
many = defaultdict(makedict)
# Columns are read by position (3 and 7) with .iloc; presumably these are the
# 'gender' and 'site_links' columns -- verify against the CSV header.
for genders, wikiname_list in zip(allrecs.iloc[:, 3], allrecs.iloc[:, 7]):
    # Only the first listed gender is counted; a missing or empty value is
    # tallied under None.
    try:
        gender = genders[0]
    except (TypeError, IndexError, KeyError):
        gender = None
    # Records without a wiki list (e.g. NaN) contribute nothing.
    if not isinstance(wikiname_list, list):
        continue
    target = single if len(wikiname_list) == 1 else many
    for wikiname in wikiname_list:
        target[wikiname][gender] += 1
In [59]:
def makedf(indict):
    """Build a per-wiki gender table with derived totals and ratios.

    Parameters
    ----------
    indict : dict
        ``{wiki: {gender_key: count}}``; gender keys are passed through
        ``english_label`` to become the column labels.

    Returns
    -------
    pandas.DataFrame
        One row per wiki, one column per gender label, plus:

        * ``human_total``    - sum of all gender counts
        * ``gendered_total`` - ``human_total`` minus the counts held in
          columns whose label is missing (None/NaN)
        * ``nonbin_total``   - ``gendered_total`` minus female and male
        * ``fem_per``        - female / ``gendered_total``
        * ``nonbin_per``     - ``nonbin_total`` / ``gendered_total``

    Raises
    ------
    KeyError
        If no column is labelled ``'female'`` or ``'male'``.
    """
    # A wiki/gender pair that never occurred is a count of zero. Left as NaN
    # it would turn every total derived from it into NaN.
    df = pd.DataFrame.from_dict(indict, orient='index').fillna(0)
    # A concrete list: pandas does not accept a lazy map object as an index.
    df.columns = [english_label(qid) for qid in df.columns]
    # Records without a gender sit in the column(s) whose label is missing.
    # Selecting by mask also yields 0 when there is no such column.
    no_gender = df.loc[:, pd.isnull(df.columns)].sum(axis=1)
    df['human_total'] = df.sum(axis=1)
    df['gendered_total'] = df['human_total'] - no_gender
    df['nonbin_total'] = df['gendered_total'] - df['female'] - df['male']
    df['fem_per'] = df['female'] / df['gendered_total']
    df['nonbin_per'] = df['nonbin_total'] / df['gendered_total']
    return df
In [60]:
# Gender tables built from the `single` and `many` tallies respectively.
sdf, mdf = makedf(single), makedf(many)
In [61]:
# Keep only wikis present in both tables; clashing column names get the
# '_single' / '_many' suffix.
tdf = sdf.join(
    mdf,
    lsuffix='_single',
    rsuffix='_many',
    how='inner',
)
In [62]:
# Replace every remaining NaN in the joined table with 0 (modifies tdf itself).
tdf.fillna(value=0, inplace=True)
In [65]:
# Display the column labels of the joined table.
tdf.columns
Out[65]:
In [68]:
# Female ratio of the `single` table minus that of the `many` table.
tdf['fem_per_diff'] = tdf['fem_per_single'].sub(tdf['fem_per_many'])
In [121]:
# Wikis with more than 7310 gendered single-wiki records, ordered by the
# female-ratio difference (largest first). sort_values is the current API;
# DataFrame.sort no longer exists.
(
    tdf[tdf['gendered_total_single'] > 7310]
    .sort_values('fem_per_diff', ascending=False)
    [['fem_per_diff', 'fem_per_single', 'female_single',
      'gendered_total_single', 'fem_per_many']]
)
Out[121]:
In [94]:
# Scatter of female-ratio difference against the number of gendered
# single-wiki records, for wikis with more than 2500 such records.
# The same filtered frame drives the points, the axis limits and the labels,
# so every label belongs to a plotted point and the log-scaled x axis never
# receives a non-positive limit.
plotted = tdf[tdf['gendered_total_single'] > 2500]

fig, ax = plt.subplots(1, 1, figsize=(8, 8))
plotted[['gendered_total_single', 'fem_per_diff']].plot(
    kind='scatter', x='gendered_total_single', y='fem_per_diff',
    logx=True, ax=ax, c='#74bc3a')

# y ticks as whole percentages, x ticks as thousands-separated integers.
ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x)))
ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:,.0f}'.format(x)))

# Pad the x range on both sides so the labels next to the outermost points fit.
ax.set_xlim(plotted['gendered_total_single'].min() * 0.6,
            plotted['gendered_total_single'].max() * 3)

# Label each point with its wiki name, nudged up and to the right.
for label, x, y in zip(plotted.index, plotted['gendered_total_single'], plotted['fem_per_diff']):
    ax.annotate(
        label,
        xy=(x, y), xytext=(5, 2),
        textcoords='offset points', ha='left', va='bottom')

# NOTE(review): the title text contains typos ("Differnce", stray quote);
# it is left untouched here because it is rendered output.
ax.set_title('Differnce in Female Ratio, of "Language-Unique" Articles vs Others \n by Language-Unique Articles"', fontsize=24)
ax.set_xlabel('Number of Unique Biographies Recorded')
ax.set_ylabel('Composition of Biographies Which Are Female')
Out[94]:
In [137]:
# Bar chart of the female-ratio difference for wikis with more than 730
# gendered single-wiki records, ordered from lowest to highest difference.
# sort_values is the current API; DataFrame.sort no longer exists.
ptdf = tdf[tdf['gendered_total_single'] > 730].sort_values('fem_per_diff')

# Bar colour encodes the record count on a log scale relative to the largest
# wiki shown: log(count) / log(max count), which is at most 1.
maxbio = ptdf['gendered_total_single'].max()
bios_size = numpy.log(ptdf['gendered_total_single']) / numpy.log(maxbio)
my_colors = [(x / 2, x, 0.75) for x in bios_size]

fig, ax = plt.subplots(1, 1, figsize=(8, 6))
# `color` (singular) is the keyword pandas plotting accepts for bar colours.
ptdf['fem_per_diff'].plot(kind='bar', ax=ax, color=my_colors)
ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x)))

# Tick labels: text before 'wiki' in each index entry, expanded through
# lookup_lang. Built as lists from the plotted index, in bar order.
wikinames = [str(wiki).split('wiki')[0] for wiki in ptdf.index]
fullnames = [lookup_lang(name) for name in wikinames]
ax.set_xticklabels(fullnames)

ax.set_title('Difference in Female Ratio \n by language-unique and language-many articles \n by Wikipedia Language', size=24)
ax.set_ylabel('[language-unique female ratio] - [language-many female ratio]')
ax.set_xlabel('Wikipedia Language | Darker Colours indicate more absolute lanuage-unique articles')
Out[137]:
In [ ]: