notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy

%pylab inline
java_min_int = -2147483648









    



Populating the interactive namespace from numpy and matplotlib



In [2]:

    
allrecs = pd.read_csv('snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv',na_values=[java_min_int])



In [3]:

    
def split_column(q_str):
    if type(q_str) is float:
        if numpy.isnan(q_str):
            return q_str #returning this way so we can gurantee that column contains list
    if type(q_str) is str:
        qs = q_str.split('|')
        return qs[0] #cos the format will always end with a |



In [4]:

    
for col in ['place_of_birth','gender','citizenship','ethnic_group']:
    allrecs[col] = allrecs[col].apply(split_column)



In [5]:

    
allrecs.head(10)









    Out[5]:






  
    
      
      qid
      dob
      dod
      gender
      ethnic_group
      citizenship
      place_of_birth
      site_links
    
  
  
    
      0
        Q23
       1732
       1799
       Q6581097
       NaN
          Q30
       Q494413
       zhwiki|kywiki|euwiki|plwiki|bswiki|angwiki|uzw...
    
    
      1
        Q42
       1952
       2001
       Q6581097
       NaN
         Q145
          Q350
       zhwiki|jvwiki|euwiki|plwiki|bswiki|eswiki|tawi...
    
    
      2
       Q207
       1946
        NaN
       Q6581097
       NaN
          Q30
        Q49145
       uzwiki|eswiki|kowikiquote|huwiki|liwikiquote|p...
    
    
      3
       Q297
        NaN
       1660
       Q6581097
       NaN
          Q29
         Q8717
       zhwiki|kywiki|plwiki|euwiki|bswiki|uzwiki|eswi...
    
    
      4
       Q326
       1942
        NaN
       Q6581097
       NaN
         Q298
         Q2887
       zhwiki|plwiki|euwiki|kowiki|frwiki|eswiki|yowi...
    
    
      5
       Q368
       1915
       2006
       Q6581097
       NaN
         Q298
        Q33986
       lbwiki|zhwiki|plwiki|euwiki|bswiki|angwiki|esw...
    
    
      6
       Q377
       1882
       1942
       Q6581097
       NaN
       Q34266
       Q658871
       zhwiki|kywiki|ukwikisource|jvwiki|plwiki|euwik...
    
    
      7
       Q475
       1911
       1982
       Q6581097
       NaN
         Q298
         Q2887
       plwiki|euwiki|kowiki|frwiki|eswiki|yowiki|ocwi...
    
    
      8
       Q501
       1821
       1867
       Q6581097
       NaN
         Q142
           Q90
       zhwiki|glwikisource|plwiki|euwiki|bswiki|ptwik...
    
    
      9
       Q530
       1956
        NaN
       Q6581097
       NaN
          Q34
       Q499415
       plwiki|euwiki|frwiki|bswiki|bewiki|eswiki|ocwi...



In [6]:

    
import json
pobs_map = json.load(open('helpers/aggregation_maps/pobs_map.json','r'))
country_map = pd.DataFrame.from_csv('helpers/aggregation_maps/country_maps.csv')

ethnic_group_map = json.load(open('helpers/aggregation_maps/ethnic_groups_map.json','r'))
citizenship_map = json.load(open('helpers/aggregation_maps/citizenship_map.json','r'))









    



---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-6-4eb645f6ebfb> in <module>()
      3 country_map = pd.DataFrame.from_csv('helpers/aggregation_maps/country_maps.csv')
      4 
----> 5 ethnic_group_map = json.load(open('helpers/aggregation_maps/ethnic_groups_map.json','r'))
      6 citizenship_map = json.load(open('helpers/aggregation_maps/citizenship_map.json','r'))

IOError: [Errno 2] No such file or directory: 'helpers/aggregation_maps/ethnic_groups_map.json'



In [7]:

    
def map_pob(qid):
    if not type(qid) is str:
        return None
    else:
        country_list = pobs_map[qid]
        if len(country_list) == 0:
            return None
        else:
            country = country_list[0] #assumption
            culture = country_map.ix[country]['culture_name']
            return culture

def map_wrapper(m):
    def return_fun(qid):
        try:
            return m[qid]
        except KeyError:
            return None
    return return_fun

mismatch = pd.DataFrame()

def determine_culture(row):
    #order is important because it determines the preference we will use
    col_map_fun = zip(['ethnic_group', 'citizenship', 'place_of_birth'],
                      [map_wrapper(ethnic_group_map),map_wrapper(citizenship_map), map_pob])
    culture = None
    for col, map_fun in col_map_fun:
        guess = map_fun(row[col])
        guess = str(guess).lower()
        if (culture is not None) and (guess is not None):
            if culture != guess:
                mismatch.append(row,ignore_index=True)
        if guess:
            culture = guess
    
    return culture



In [8]:

    
allrecs['culture'] = allrecs.apply(lambda x: determine_culture(x), axis=1)



In [9]:

    
len(allrecs[allrecs['culture'].apply(lambda x: x is not None)])









    Out[9]:





2561999



In [10]:

    
import math
import pywikibot
#Tranforming QIDs into English labels.
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()

retrieved = dict()

def english_label(qid):
    if type(qid) is float:
        if math.isnan(qid):
            return None
    #first see if we've done it
    try:
        return retrieved[qid]
    except KeyError:
        try:
            page = pywikibot.ItemPage(wikidata, qid)
            data = page.get()
            lab = data['labels']['en']
            retrieved[qid] = lab
            return lab
        except KeyError:
            retrieved[qid] = qid
            return qid









    



VERBOSE:pywiki:Starting 1 threads...



In [11]:

    
english_label('Q6581097')









    



VERBOSE:pywiki:Found 1 wikidata:wikidata processes running, including this one.






    Out[11]:





u'male'



In [12]:

    
allrecs['gender_name'] = allrecs['gender'].apply(english_label)



In [13]:

    
outdf = allrecs[['gender_name','culture']]



In [29]:

    
allrecs.to_csv('helpers/allrecords_culture_mapped.csv')
outdf.to_csv('helpers/Chi_Squared_Test_Data.csv')

how many records have gender, pob and dob

Making the 3bin genpobdob



In [15]:

    
hasdob = allrecs[allrecs['dob'].apply(lambda x: not math.isnan(x))]
len(hasdob)









    Out[15]:





1484003



In [16]:

    
hasgender = hasdob[hasdob['gender'].apply(lambda x: not math.isnan(x) if type(x) is float else True)]
len(hasgender)









    Out[16]:





1470491



In [17]:

    
hascult = hasgender[hasgender['culture'].apply(lambda x: x is not None)]
len(hascult)









    Out[17]:





1470491



In [18]:

    
culture_groups = hascult.groupby('culture')



In [19]:

    
def make_perc_series(df):
    years_per = dict()
    dobs = df.groupby('dob')
    #hate to use a for loop, fixlater
    for year, group in dobs:
        nmcount = group[group['gender'] != 'Q6581097']['gender'].count()
        totalcount = group['gender'].count()
        nmper = nmcount / float(totalcount)
        years_per[year] = nmper
        perc_series = pd.TimeSeries(data=years_per)
    
    return perc_series
    
perc_dict = dict()
for name, group in culture_groups:
    perc_series = make_perc_series(group)
    perc_dict[name] = perc_series



In [26]:

    
end_year = 1988
for start_year in [-1000, 1800]:
    for ra_len in [10, 100]:
        ra_dict = dict()
        for name, series in perc_dict.iteritems():
            ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=10)

        cult_dob_per  = pd.DataFrame(ra_dict)
        plt = cult_dob_per.plot(figsize=(20,6),  cmap='Paired', linewidth=1.5)
        plt.set_xlim((start_year, end_year))
        plt.set_xticks(range(start_year, end_year,(end_year-start_year) / 15))
        plt.set_ylim((0,0.6))
        plt.set_title('Non-male percentage of Biographies by Date of Birth - %s Year Rolling Average' % str(ra_len))
        plt.legend(loc=2)

Make dobculture



In [21]:

    
dobexists = allrecs[allrecs['dob'].apply(lambda x: not math.isnan(x))]
dobcultureexists = dobexists[dobexists['culture'].apply(lambda x: x is not None)]
len(dobcultureexists)









    Out[21]:





1484003



In [22]:

    
culture_groups = dobcultureexists[['dob','culture']].groupby(by='culture')



In [23]:

    
def make_tot_series(df):
    years_tot = dict()
    dobs = df.groupby('dob')
    #hate to use a for loop, fixlater
    for year, group in dobs:
        totalcount = group['culture'].count()
        years_tot[year] = totalcount
        tot_series = pd.TimeSeries(data=years_tot)
    
    return tot_series
    
tot_dict = dict()
for name, group in culture_groups:
    tot_dict[name] = make_tot_series(group)



In [27]:

    
end_year = 1988
for start_year in [1500, 1800]:
    for ra_len in [2, 5, 10]:
        ra_dict = dict()
        for name, series in tot_dict.iteritems():
            ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=1)

        cult_dob = pd.DataFrame(ra_dict)
        plt = cult_dob.plot(figsize=(20,6),  cmap='Paired', linewidth=1.5)
        plt.set_xlim((start_year,end_year))
        plt.set_xticks(range(start_year,end_year,(end_year-start_year) / 15))
        plt.set_title('Total Biographies by Date of Birth |  %s Year Rolling Average' % str(ra_len))
        plt.legend(loc=2)



In [25]:

    
for start_year, end_year in zip([-2000, -1000], [1000,1500]):
    for ra_len in [1,2,10]:
        ra_dict = dict()
        for name, series in tot_dict.iteritems():
            ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=1)
        cult_dob = pd.DataFrame(ra_dict)
        plt = cult_dob.plot(figsize=(20,6),  cmap='Set2', linewidth=1.5)
        plt.set_ylim((0,50))
        plt.set_yscale('log')
        plt.set_xlim((start_year,end_year))
        plt.set_xticks(range(start_year,end_year,(end_year-start_year) / 15))
        plt.set_title('Total Biographies by Date of Birth |  %s Year Rolling Average' % str(ra_len))
        plt.legend(loc=2)









    



/usr/local/lib/python2.7/dist-packages/numpy/ma/core.py:3895: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")



In [ ]:

	qid	dob	dod	gender	ethnic_group	citizenship	place_of_birth	site_links
0	Q23	1732	1799	Q6581097	NaN	Q30	Q494413	zhwiki\|kywiki\|euwiki\|plwiki\|bswiki\|angwiki\|uzw...
1	Q42	1952	2001	Q6581097	NaN	Q145	Q350	zhwiki\|jvwiki\|euwiki\|plwiki\|bswiki\|eswiki\|tawi...
2	Q207	1946	NaN	Q6581097	NaN	Q30	Q49145	uzwiki\|eswiki\|kowikiquote\|huwiki\|liwikiquote\|p...
3	Q297	NaN	1660	Q6581097	NaN	Q29	Q8717	zhwiki\|kywiki\|plwiki\|euwiki\|bswiki\|uzwiki\|eswi...
4	Q326	1942	NaN	Q6581097	NaN	Q298	Q2887	zhwiki\|plwiki\|euwiki\|kowiki\|frwiki\|eswiki\|yowi...
5	Q368	1915	2006	Q6581097	NaN	Q298	Q33986	lbwiki\|zhwiki\|plwiki\|euwiki\|bswiki\|angwiki\|esw...
6	Q377	1882	1942	Q6581097	NaN	Q34266	Q658871	zhwiki\|kywiki\|ukwikisource\|jvwiki\|plwiki\|euwik...
7	Q475	1911	1982	Q6581097	NaN	Q298	Q2887	plwiki\|euwiki\|kowiki\|frwiki\|eswiki\|yowiki\|ocwi...
8	Q501	1821	1867	Q6581097	NaN	Q142	Q90	zhwiki\|glwikisource\|plwiki\|euwiki\|bswiki\|ptwik...
9	Q530	1956	NaN	Q6581097	NaN	Q34	Q499415	plwiki\|euwiki\|frwiki\|bswiki\|bewiki\|eswiki\|ocwi...