In [1]:
import pandas as pd
import numpy
from collections import defaultdict
from matplotlib.pylab import style
import json
# Use the 'fivethirtyeight' matplotlib style for the figures in this notebook.
style.use('fivethirtyeight')
# NOTE(review): later cells use `plt`, `matplotlib` and `math` without importing
# them; presumably this magic injects those names -- confirm before removing it.
%pylab inline
# Sentinel handed to read_csv as `na_values`, so cells holding it load as missing.
# The value is the minimum 32-bit signed integer (-2**31).
java_min_int = -2147483648


Populating the interactive namespace from numpy and matplotlib

In [106]:
import pywikibot

# Cache of labels already looked up, keyed by QID (filled by english_label).
retrieved = {}

# Transforming QIDs into English labels: open English Wikipedia and take its
# data repository as the place to look items up.
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()

def english_label(qid):
    """Return the English label for a Wikidata QID, caching results in `retrieved`.

    Parameters
    ----------
    qid : str, float or None
        Item identifier such as ``'Q6581097'``. Falsy values (``None``, ``''``)
        are returned unchanged; a float NaN yields ``None``.

    Returns
    -------
    The English label when the item has one. If the lookup raises ``KeyError``
    (for instance no ``'en'`` entry under ``'labels'``) the QID itself is cached
    and returned. Any other ordinary error returns the QID without caching, so a
    later call retries the lookup.
    """
    # Local import: the notebook otherwise only gets `math` through %pylab.
    import math

    if not qid:
        return qid
    # isinstance (not an exact type check) so float subclasses are recognised too.
    if isinstance(qid, float) and math.isnan(qid):
        return None

    # First see if we've done it.
    try:
        return retrieved[qid]
    except KeyError:
        pass

    try:
        page = pywikibot.ItemPage(wikidata, qid)
        lab = page.get()['labels']['en']
    except KeyError:
        # Remember the miss so the same item is not looked up again.
        retrieved[qid] = qid
        return qid
    except Exception:
        # Best effort on other failures; KeyboardInterrupt/SystemExit still propagate.
        return qid

    retrieved[qid] = lab
    return lab

# Lookup table read by lookup_lang; `with` closes the file once it is parsed.
with open('helpers/wiki_code_map.json', 'r') as map_file:
    lang_map = json.load(map_file)
def lookup_lang(lang):
    """Return the display name stored in `lang_map` for a wiki language code.

    A trailing word 'Wikipedia' (any case) is stripped from the stored name,
    so 'English Wikipedia' becomes 'English'. If the code is not in the map, or
    the stored entry cannot be processed (not a string, or empty), `lang`
    itself is returned unchanged.
    """
    try:
        full = lang_map[lang]
        words = full.split()
        if words[-1].lower() == 'wikipedia':
            return ' '.join(words[:-1])
        return full
    except Exception:
        # Best-effort lookup: fall back to the raw code rather than fail the plot.
        return lang

In [2]:
# Load the 2014-10-13 snapshot; cells equal to java_min_int are read as missing.
allrecs = pd.read_csv(
    'snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv',
    na_values=[java_min_int],
)
def split_column(q_str):
    """Split a pipe-delimited cell such as ``'Q298|Q39|'`` into a list of values.

    Parameters
    ----------
    q_str : str, float or list
        Raw cell value. Strings are expected to end with a ``'|'``.

    Returns
    -------
    list
        The values between the pipes. The empty piece after a trailing ``'|'``
        is dropped; a string without a trailing pipe keeps its last value.
        A list is returned unchanged, so applying this twice to a column is
        harmless.
    float
        NaN is passed through so missing cells stay missing.
    None
        For any other input (for example a non-NaN number).
    """
    if isinstance(q_str, list):
        # Already split: keep as is so re-running the cell does not wipe the column.
        return q_str
    if isinstance(q_str, float):
        return q_str if numpy.isnan(q_str) else None
    if isinstance(q_str, str):
        parts = q_str.split('|')
        # The format normally ends with a '|', which leaves one empty last piece.
        if parts[-1] == '':
            parts.pop()
        return parts
    return None

# Replace the two pipe-delimited columns with their split_column results.
for pipe_col in ('gender', 'site_links'):
    allrecs[pipe_col] = allrecs[pipe_col].apply(split_column)

In [3]:
# Show the first five records to check the loaded and split columns.
allrecs.head(5)


Out[3]:
qid dob dod gender ethnic_group citizenship place_of_birth site_links
0 Q23 1732 1799 [Q6581097] NaN Q30| Q494413| [zhwiki, kywiki, euwiki, plwiki, bswiki, angwi...
1 Q42 1952 2001 [Q6581097] NaN Q145| Q350| [zhwiki, jvwiki, euwiki, plwiki, bswiki, eswik...
2 Q207 1946 NaN [Q6581097] NaN Q30| Q49145| [uzwiki, eswiki, kowikiquote, huwiki, liwikiqu...
3 Q297 NaN 1660 [Q6581097] NaN Q29| Q8717| [zhwiki, kywiki, plwiki, euwiki, bswiki, uzwik...
4 Q326 1942 NaN [Q6581097] NaN Q298|Q39| Q2887| [zhwiki, plwiki, euwiki, kowiki, frwiki, eswik...

In [15]:
# Sanity check: print the gender value of the first record only.
# The column is addressed by label rather than by position 3.
for _, first_record in allrecs.iterrows():
    print(first_record['gender'])
    break


['Q6581097']

In [32]:
def makedict():
    """Return a new, empty defaultdict whose missing keys start at 0."""
    counter = defaultdict(int)
    return counter

# Per-wiki gender tallies, shaped {wiki name: {gender: count}}.
# `single` counts records whose site_links list holds exactly one entry;
# `many` counts every other record once for each entry in its list.
single = defaultdict(makedict)
many = defaultdict(makedict)

# Walk the two needed columns by label instead of positional row[3] / row[7].
for genders, wikiname_list in zip(allrecs['gender'], allrecs['site_links']):
    # Only the first listed gender is used; records without one are tallied under None.
    gender = genders[0] if isinstance(genders, list) and genders else None
    if not isinstance(wikiname_list, list):
        continue  # no site links for this record
    if len(wikiname_list) == 1:
        single[wikiname_list[0]][gender] += 1
    else:
        for wikiname in wikiname_list:
            many[wikiname][gender] += 1

In [59]:
def makedf(indict):
    """Turn {wiki: {gender QID: count}} tallies into a per-wiki summary table.

    Parameters
    ----------
    indict : dict of dict
        Outer keys become the row index; inner keys are gender QIDs, with a
        missing key (None/NaN) for records that have no gender.

    Returns
    -------
    pandas.DataFrame
        One count column per gender (labelled through english_label; absent
        counts are 0) plus:
        human_total     sum of all count columns
        gendered_total  human_total minus the missing-gender count
        nonbin_total    gendered_total minus 'female' and 'male'
        fem_per         'female' / gendered_total
        nonbin_per      nonbin_total / gendered_total

    Raises
    ------
    KeyError
        If no column ends up labelled 'female' or 'male'.
    """
    df = pd.DataFrame.from_dict(indict, orient='index')
    # A real list, not map(): map() is a one-shot iterator on Python 3.
    df.columns = [english_label(col) for col in df.columns]
    # A gender never seen for a wiki has no entry; that is a count of zero, and
    # leaving it as NaN would turn the derived totals below into NaN as well.
    df = df.fillna(0)
    # Find the missing-gender column by testing the labels for null, rather than
    # relying on an index lookup of NaN matching a None label.
    no_gender = df.loc[:, pd.isnull(df.columns)].sum(axis=1)
    df['human_total'] = df.sum(axis=1)
    df['gendered_total'] = df['human_total'] - no_gender
    df['nonbin_total'] = df['gendered_total'] - df['female'] - df['male']
    df['fem_per'] = df['female'] / df['gendered_total']
    df['nonbin_per'] = df['nonbin_total'] / df['gendered_total']
    return df

In [60]:
# Summary tables built from the `single` and `many` tallies respectively.
sdf = makedf(indict=single)
mdf = makedf(indict=many)

In [61]:
# Keep only wikis present in both tables; the suffixes keep the two sets of
# identically named columns apart.
tdf = sdf.join(
    mdf,
    how='inner',
    lsuffix='_single',
    rsuffix='_many',
)

In [62]:
# Replace every missing value in the joined table with 0.
tdf = tdf.fillna(0)

In [65]:
# List the joined column names to see which ones carry the _single / _many suffixes.
tdf.columns


Out[65]:
Index([u'male_single', u'female_single', u'None_single', u'male animal_single', u'intersex_single', u'transgender female_single', u'woman', u'transgender male_single', u'genderqueer_single', u'female animal', u'human_total_single', u'gendered_total_single', u'nonbin_total_single', u'fem_per_single', u'nonbin_per_single', u'intersex_many', u'kathoey', u'transgender male_many', u'genderqueer_many', u'transgender female_many', u'None_many', u'male_many', u'female_many', u'fa'afafine', u'male animal_many', u'human_total_many', u'gendered_total_many', u'nonbin_total_many', u'fem_per_many', u'nonbin_per_many'], dtype='object')

In [68]:
# Female share among single-wiki records minus the share among the others.
tdf['fem_per_diff'] = tdf['fem_per_single'].sub(tdf['fem_per_many'])

In [121]:
# Rank the wikis with more than 7310 gendered single-wiki records by the gap.
# NOTE(review): the 7310 cut-off is unexplained here (other cells use 2500 and 730) -- confirm.
# sort_values replaces DataFrame.sort, which current pandas no longer provides.
(
    tdf[tdf['gendered_total_single'] > 7310]
    .sort_values('fem_per_diff', ascending=False)
    [['fem_per_diff', 'fem_per_single', 'female_single', 'gendered_total_single', 'fem_per_many']]
)


Out[121]:
fem_per_diff fem_per_single female_single gendered_total_single fem_per_many
jawiki 0.413549 0.603585 13434 22257 0.190037
hewiki 0.060952 0.216555 2386 11018 0.155602
dawiki 0.036127 0.181383 2233 12311 0.145256
nowiki 0.029112 0.227961 4277 18762 0.198849
svwiki 0.020537 0.213389 8772 41108 0.192852
ukwiki 0.001318 0.137714 2953 21443 0.136396
fiwiki -0.000251 0.167975 3145 18723 0.168226
dewiki -0.000691 0.149034 30335 203544 0.149725
plwiki -0.008691 0.149597 10353 69206 0.158288
frwiki -0.012200 0.142818 14247 99756 0.155018
nlwiki -0.012330 0.148959 5482 36802 0.161290
enwiki -0.015220 0.146669 80262 547233 0.161889
itwiki -0.018435 0.129315 7102 54920 0.147751
eswiki -0.021227 0.143191 7851 54829 0.164418
ptwiki -0.021357 0.143572 4678 32583 0.164929
etwiki -0.022251 0.127290 1042 8186 0.149541
cswiki -0.024574 0.136172 2453 18014 0.160746
ruwiki -0.044208 0.105331 8083 76739 0.149539

In [94]:
# Scatter of the female-share gap against the number of gendered single-wiki records.
# The same filtered subset drives the points, the axis limits and the labels, so
# no wiki is labelled without a point and the log axis never gets a 0 lower bound.
plotted = tdf[tdf['gendered_total_single'] > 2500]

fig, ax = plt.subplots(1, 1, figsize=(8, 8))
plotted.plot(kind='scatter', x='gendered_total_single', y='fem_per_diff',
             logx=True, ax=ax, c='#74bc3a')

# y ticks as whole percentages, x ticks as thousands-separated integers.
ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x)))
ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:,.0f}'.format(x)))

# Pad the x range so the labels of the outermost points fit inside the axes.
ax.set_xlim(plotted['gendered_total_single'].min() * 0.6,
            plotted['gendered_total_single'].max() * 3)

# Label each point with its wiki name, nudged slightly up and to the right.
for label, x, y in zip(plotted.index, plotted['gendered_total_single'], plotted['fem_per_diff']):
    ax.annotate(label, xy=(x, y), xytext=(5, 2),
                textcoords='offset points', ha='left', va='bottom')

ax.set_title('Difference in Female Ratio, of "Language-Unique" Articles vs Others \n by Language-Unique Articles', fontsize=24)
ax.set_xlabel('Number of Unique Biographies Recorded')
# NOTE(review): the y axis plots fem_per_diff (a difference), while this label
# describes a share -- confirm the intended wording.
ax.set_ylabel('Composition of Biographies Which Are Female')


Out[94]:
<matplotlib.text.Text at 0x7f83e125f9d0>

In [137]:
# Bar chart of the female-share gap for wikis with more than 730 gendered
# single-wiki records, ordered from the most negative gap to the most positive.
ptdf = tdf[tdf['gendered_total_single'] > 730].sort_values('fem_per_diff')

# Shade per bar: log(count) relative to log(largest count), so the largest wiki gets 1.
maxbio = ptdf['gendered_total_single'].max()
bios_size = numpy.log(ptdf['gendered_total_single']) / numpy.log(maxbio)

# NOTE(review): a larger bios_size raises the red and green channels, which looks
# lighter rather than darker -- confirm against the x-axis caption below.
my_colors = [(x / 2, x, 0.75) for x in bios_size]

fig, ax = plt.subplots(1, 1, figsize=(8, 6))
# `color` is the keyword pandas documents for bar colours.
ptdf['fem_per_diff'].plot(kind='bar', ax=ax, color=my_colors)
ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x)))

# Bars follow the index order, so build the tick labels from the index:
# 'jawiki' -> 'ja' -> display name from lookup_lang.
wikinames = [wiki.split('wiki')[0] for wiki in ptdf.index]
fullnames = [lookup_lang(name) for name in wikinames]
ax.set_xticklabels(fullnames)

ax.set_title('Difference in Female Ratio \n by language-unique and language-many articles \n by Wikipedia Language', size=24)
ax.set_ylabel('[language-unique female ratio] - [language-many female ratio]')
ax.set_xlabel('Wikipedia Language | Darker Colours indicate more absolute language-unique articles')


Out[137]:
<matplotlib.text.Text at 0x7f83cc7db590>

In [ ]: