In [1]:
import pandas as pd
import numpy
import json
from collections import defaultdict
%pylab inline
java_min_int = -2147483648


Populating the interactive namespace from numpy and matplotlib

In [2]:
import math
import pywikibot
#Tranforming QIDs into English labels.
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()

retrieved = dict()

def english_label(qid):
    if type(qid) is float:
        if math.isnan(qid):
            return None
    #first see if we've done it
    try:
        return retrieved[qid]
    except KeyError:
        try:
            page = pywikibot.ItemPage(wikidata, qid)
            data = page.get()
            lab = data['labels']['en']
            retrieved[qid] = lab
            return lab
        except KeyError:
            retrieved[qid] = qid
            return qid


VERBOSE:pywiki:Starting 1 threads...

In [3]:
allrecs = pd.DataFrame.from_dict(json.load(open('helpers/world_cultures_shortcut.json','r')))

In [167]:
pd.crosstab[allrecs['culture']]


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-167-c9d92ccf2fb2> in <module>()
----> 1 pd.crosstab[allrecs['culture']]

TypeError: 'function' object has no attribute '__getitem__'

In [4]:
genrecs = allrecs[allrecs['gender'].apply(lambda x: x is not None)]
genrecs['en_gender'] = genrecs['gender'].apply(english_label)


VERBOSE:pywiki:Found 1 wikidata:wikidata processes running, including this one.
-c:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [5]:
cultures = genrecs[['culture','en_gender']].groupby(by='culture')

In [6]:
# it should probably be done this way cultpiv = pd.DataFrame.pivot_table(pd.DataFrame(cultures), index='culture', values='en_gender')

In [145]:
perc_dict = defaultdict(dict)
for group, df in cultures:
    total = float(len(df))
    fem = len(df[df["en_gender"] == 'female'])
    mal = len(df[df["en_gender"] == 'male'])
    fem_per = fem / total
    nonbin_per = (total - (fem + mal)) / total
    perc_dict[group]['total'] = total
    perc_dict[group]['female %'] = fem_per
    perc_dict[group]['nonbinary %'] = nonbin_per

In [149]:
cultplotdf = pd.DataFrame.from_dict(perc_dict, orient='index').sort('female %')

In [9]:
from matplotlib.pylab import style
style.use('fivethirtyeight')

In [165]:
fig, ax = plt.subplots(1, 1, figsize=(6,6))
cultplotdf[['total','female %']].plot(kind='bar',secondary_y=['total'], ax=ax, grid=False)
ax.right_ax.set_ylabel('total biographies')
ax.set_ylabel('female %')
ax.legend(loc=2)
ax.right_ax.legend(loc=1)
ax.set_xlabel('culture')
ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x )))
ax.right_ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:,}'.format(int(x))))

fig.suptitle('Female Percentage of Biographies by Culture', fontsize=24)
fig.subplots_adjust(top=0.88)



In [166]:
fig, ax = plt.subplots(1, 1, figsize=(6,6))
cultplotdf[['total','nonbinary %']].plot(kind='bar',secondary_y=['total'], ax=ax, grid=False)
ax.right_ax.set_ylabel('total biographies')
ax.set_ylabel('nonbinary %')
ax.legend(loc=2)
ax.right_ax.legend(loc=1)
ax.set_xlabel('culture')
ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.4%}'.format(x )))
ax.right_ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:,}'.format(int(x))))

fig.suptitle('Nonbinary Percentage of Biographies by Culture', fontsize=24)
fig.subplots_adjust(top=0.88)



In [18]:
fig, axes = plt.subplots(nrows=3, ncols=1, sharex=True, figsize=(10,10))
measures = ['total', 'fem_per', 'nonbin_per']
for meas, ax in zip(measures, axes):
    cultplotdf[meas].plot(kind='bar', ax=ax, legend=False)


fig.legend = legend(bbox_to_anchor=(1.05, 1.5), loc=2, borderaxespad=0)
fig.suptitle('Volumes of Dates of Birth and Death in Wikidata \n by Gender \n Total and Modern Timeframes', fontsize=24)
fig.tight_layout()
subplots_adjust(hspace=0.1, top=0.82)



In [10]:


In [27]:
fig = plt.figure()
ax = fig.add_subplot(111)
ax2 = ax.twinx()
width =0.4

cultplotdf['total'].plot(kind='bar', ax=ax, width=width, position=1)
cultplotdf['fem_per'].plot(kind='bar', ax=ax, width=width, position=0)
ax.set_ylabel('Total Biographies')
ax2.set_ylabel('Female Percentage')


Out[27]:
<matplotlib.text.Text at 0x7ff981638210>