notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy
import json
from collections import defaultdict
%pylab inline
java_min_int = -2147483648









    



Populating the interactive namespace from numpy and matplotlib



In [2]:

    
import math
import pywikibot
#Tranforming QIDs into English labels.
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()

retrieved = dict()

def english_label(qid):
    if type(qid) is float:
        if math.isnan(qid):
            return None
    #first see if we've done it
    try:
        return retrieved[qid]
    except KeyError:
        try:
            page = pywikibot.ItemPage(wikidata, qid)
            data = page.get()
            lab = data['labels']['en']
            retrieved[qid] = lab
            return lab
        except KeyError:
            retrieved[qid] = qid
            return qid









    



VERBOSE:pywiki:Starting 1 threads...



In [3]:

    
allrecs = pd.DataFrame.from_dict(json.load(open('helpers/world_cultures_shortcut.json','r')))



In [167]:

    
pd.crosstab[allrecs['culture']]









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-167-c9d92ccf2fb2> in <module>()
----> 1 pd.crosstab[allrecs['culture']]

TypeError: 'function' object has no attribute '__getitem__'



In [4]:

    
genrecs = allrecs[allrecs['gender'].apply(lambda x: x is not None)]
genrecs['en_gender'] = genrecs['gender'].apply(english_label)









    



VERBOSE:pywiki:Found 1 wikidata:wikidata processes running, including this one.
-c:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [5]:

    
cultures = genrecs[['culture','en_gender']].groupby(by='culture')



In [6]:

    
# it should probably be done this way cultpiv = pd.DataFrame.pivot_table(pd.DataFrame(cultures), index='culture', values='en_gender')



In [145]:

    
perc_dict = defaultdict(dict)
for group, df in cultures:
    total = float(len(df))
    fem = len(df[df["en_gender"] == 'female'])
    mal = len(df[df["en_gender"] == 'male'])
    fem_per = fem / total
    nonbin_per = (total - (fem + mal)) / total
    perc_dict[group]['total'] = total
    perc_dict[group]['female %'] = fem_per
    perc_dict[group]['nonbinary %'] = nonbin_per



In [149]:

    
cultplotdf = pd.DataFrame.from_dict(perc_dict, orient='index').sort('female %')



In [9]:

    
from matplotlib.pylab import style
style.use('fivethirtyeight')



In [165]:

    
fig, ax = plt.subplots(1, 1, figsize=(6,6))
cultplotdf[['total','female %']].plot(kind='bar',secondary_y=['total'], ax=ax, grid=False)
ax.right_ax.set_ylabel('total biographies')
ax.set_ylabel('female %')
ax.legend(loc=2)
ax.right_ax.legend(loc=1)
ax.set_xlabel('culture')
ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x )))
ax.right_ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:,}'.format(int(x))))

fig.suptitle('Female Percentage of Biographies by Culture', fontsize=24)
fig.subplots_adjust(top=0.88)



In [166]:

    
fig, ax = plt.subplots(1, 1, figsize=(6,6))
cultplotdf[['total','nonbinary %']].plot(kind='bar',secondary_y=['total'], ax=ax, grid=False)
ax.right_ax.set_ylabel('total biographies')
ax.set_ylabel('nonbinary %')
ax.legend(loc=2)
ax.right_ax.legend(loc=1)
ax.set_xlabel('culture')
ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.4%}'.format(x )))
ax.right_ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:,}'.format(int(x))))

fig.suptitle('Nonbinary Percentage of Biographies by Culture', fontsize=24)
fig.subplots_adjust(top=0.88)



In [18]:

    
fig, axes = plt.subplots(nrows=3, ncols=1, sharex=True, figsize=(10,10))
measures = ['total', 'fem_per', 'nonbin_per']
for meas, ax in zip(measures, axes):
    cultplotdf[meas].plot(kind='bar', ax=ax, legend=False)


fig.legend = legend(bbox_to_anchor=(1.05, 1.5), loc=2, borderaxespad=0)
fig.suptitle('Volumes of Dates of Birth and Death in Wikidata \n by Gender \n Total and Modern Timeframes', fontsize=24)
fig.tight_layout()
subplots_adjust(hspace=0.1, top=0.82)



In [10]:



In [27]:

    
fig = plt.figure()
ax = fig.add_subplot(111)
ax2 = ax.twinx()
width =0.4

cultplotdf['total'].plot(kind='bar', ax=ax, width=width, position=1)
cultplotdf['fem_per'].plot(kind='bar', ax=ax, width=width, position=0)
ax.set_ylabel('Total Biographies')
ax2.set_ylabel('Female Percentage')









    Out[27]:





<matplotlib.text.Text at 0x7ff981638210>