In [63]:
from __future__ import print_function, division
import csv
import os
import pickle
import pprint
import sys

import numpy as np
import pylab as pl
import scipy as sp
import scipy.stats
%pylab inline
DEBUG = False
NMC = 1000  # number of Monte Carlo draws
# Only papers with at least `maxauth` authors are analysed, and the gender
# inference is limited to the first `maxauth` authors of each paper.
maxauth = 3
# (An earlier version loaded pickled name lists from name_list/female.pkl and
# name_list/male.pkl; the CSV tables below replaced them.)


def _open_csv_for_reading(path):
    """Open ``path`` in the mode the ``csv`` module expects on this Python.

    Python 2's csv reader wants a binary file; Python 3's wants a text file
    opened with ``newline=''`` and fails on a binary one.
    """
    if sys.version_info[0] < 3:
        return open(path, 'rb')
    return open(path, 'r', newline='')


def _read_name_counts(path):
    """Read a two-column CSV file into parallel (names, counts) arrays.

    For every data row the first field is stored lower-cased as the name and
    the second field is converted to float as its count. Blank rows and rows
    whose first field starts with ``#`` are skipped.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The names and the matching counts, in file order.

    Raises
    ------
    SystemExit
        If the csv module reports a parse error; the message gives the file
        name and line number.
    """
    names = []
    counts = []
    with _open_csv_for_reading(path) as f:
        reader = csv.reader(f)
        try:
            for row in reader:
                if not row or row[0].startswith('#'):
                    continue
                names.append(row[0].lower())
                counts.append(float(row[1]))
        except csv.Error as e:
            sys.exit('file %s, line %d: %s' % (path, reader.line_num, e))
    return np.array(names), np.array(counts)


# reading in names with clear gender id
femalenames, femalecounts = _read_name_counts('namedb/female_uniq.csv')
malenames, malecounts = _read_name_counts('namedb/male_uniq.csv')

if DEBUG:
    print(femalecounts, malecounts)
In [5]:
DEBUG = True
# reads in paper list
# NOTE(security): unpickling can execute arbitrary code -- only load a
# papers_recent.pkl that comes from a trusted source.
with open('papers_recent.pkl', 'rb') as pkl_file:
    papers = pickle.load(pkl_file)
print("We have a list of %d papers." % len(papers))
if len(papers) > 0:
    print("\nThe first one looks like:")
    print(papers[0])
In [6]:
DEBUG = False


def choosegender(first, female=None, male=None):
    """Classify a first name that is listed in both the female and male tables.

    The decision uses the share of the name's recorded uses that are female,
    ``nfemale / (nfemale + nmale)``:

    * share > 0.75 -> 'f'
    * share < 0.25 -> 'm'
    * otherwise    -> 'u' (too ambiguous to call)

    Parameters
    ----------
    first : str
        First name, lower-cased like the entries of the name tables.
    female, male : (names, counts) pairs of equal-length sequences, optional
        Lookup tables to use. When omitted, the module-level
        ``femalenames``/``femalecounts`` and ``malenames``/``malecounts``
        arrays are used.

    Returns
    -------
    str
        'f', 'm' or 'u'. 'u' is also returned when the name has no recorded
        uses in either table.
    """
    fnames, fcounts = (femalenames, femalecounts) if female is None else female
    mnames, mcounts = (malenames, malecounts) if male is None else male

    # Summing over the matches yields a plain scalar whether the name occurs
    # zero, one or several times in a table.
    nfemale = float(np.sum(np.asarray(fcounts)[np.asarray(fnames) == first]))
    nmale = float(np.sum(np.asarray(mcounts)[np.asarray(mnames) == first]))
    total = nfemale + nmale
    if total <= 0:
        return 'u'

    # The thresholds are symmetric around 0.5, so they have to be applied to
    # the female share. Applying them to the female/male ratio (as before)
    # called a name 'f' from ~43% female use but 'm' only below 20%.
    femaleshare = nfemale / total
    if femaleshare > 0.75:
        return 'f'
    if femaleshare < 0.25:
        return 'm'
    return 'u'
# Per-paper results, filled only for papers whose first `maxauth` authors
# could all be assigned a gender.
paperstats = {'nauth': [], 'ncite': [], 'femaleratio': []}
tot_female = 0    # female lead authors in the fully identified papers
tot_male = 0      # male lead authors in the fully identified papers
tot_unknowns = 0  # unidentified lead authors over all analysed papers

# Hash lookups: `name in ndarray` rescans the whole array for every author.
female_lookup = set(np.asarray(femalenames).tolist())
male_lookup = set(np.asarray(malenames).tolist())

for ppr in papers:
    femalecount = 0
    malecount = 0
    unknowns = 0

    # parse paper info; the citation count may be missing from the record
    try:
        ncite = ppr['number_of_citations']
    except (KeyError, TypeError):
        ncite = float('NaN')
    nauth = len(ppr['authors'])

    # skip papers with fewer than `maxauth` authors
    if nauth < maxauth:
        continue

    # reduce to the first `maxauth` authors
    authors = ppr['authors'][:maxauth]
    for a in authors:
        # The first name is taken to be the second whitespace-separated token
        # (presumably names are stored as "Last, First ..." -- TODO confirm).
        try:
            first = a.split()[1].replace(',', '').strip().lower()
        except (IndexError, AttributeError):
            unknowns += 1
            continue

        # a token containing '.' is treated as an initial: gender unknown
        if '.' in first:
            unknowns += 1
            continue

        if DEBUG:
            print("nauth, ncite:", nauth, ncite,)
            print(first)

        is_female = first in female_lookup
        is_male = first in male_lookup
        if is_female and is_male:
            g = choosegender(first)  # name appears in both tables
        elif is_female:
            g = 'f'
        elif is_male:
            g = 'm'
        else:
            g = 'u'

        if g == 'f':
            femalecount += 1
        elif g == 'm':
            malecount += 1
        else:
            unknowns += 1

    if DEBUG:
        print("females: ", femalecount)
        print("males: ", malecount)
        print("unknowns:", unknowns)

    # Counted for every analysed paper: adding it only inside the
    # `unknowns == 0` branch below could never add anything but zero.
    tot_unknowns += unknowns

    if unknowns == 0:
        femaleratio = float(femalecount) / float(femalecount + malecount)
        tot_female += femalecount
        tot_male += malecount
        paperstats['nauth'].append(nauth)
        paperstats['ncite'].append(ncite)
        paperstats['femaleratio'].append(femaleratio)
        if DEBUG:
            print("femaleratio:", femaleratio)
In [10]:
# Distribution of the per-paper female fraction among the lead authors.
pl.figure()
pl.hist(paperstats['femaleratio'], color='SteelBlue')
pl.title("ACTUAL FEMALE RATIO IN THE FIRST 3 AUTHORS")
# Last expression of the cell: shown as the cell output.
(tot_male, tot_female, tot_unknowns)
Out[10]:
In [44]:
# Female fraction among the lead authors against the paper's author count.
pl.figure()
pl.ylabel("female ratio")
pl.xlabel("number of authors")
pl.scatter(paperstats['nauth'], paperstats['femaleratio'], alpha=0.01)
pl.title("FEMALE CONCENTRATION IN LEAD AUTHORS VS NUMBER OF AUTHORS")

Np = len(paperstats['ncite'])
print(Np, "papers with the gender of all 3 authors identified")

# Group the female ratios into bins of CITE_BIN citations: ncite[i] holds the
# ratios of the papers with CITE_BIN*i <= citations < CITE_BIN*(i+1).
# Papers whose citation count is missing (stored as NaN) cannot be binned and
# are left out; int(NaN) would otherwise raise.
CITE_BIN = 5
cited = [(c, r)
         for c, r in zip(paperstats['ncite'], paperstats['femaleratio'])
         if np.isfinite(c)]
Nc = (int(max(c for c, _ in cited) / float(CITE_BIN)) + 1) if cited else 0
ncite = [[] for _ in range(Nc)]
for c, r in cited:
    ibin = int(c / float(CITE_BIN))
    if 0 <= ibin < Nc:
        ncite[ibin].append(r)

#pl.figure()
#pl.title("FEMALE RATIO IN THE FIRST 3 AUTHORS AGAINST CITATION COUNT")
#for i in range(Nc):
#    pl.scatter([i * 5] * len(ncite[i]), ncite[i], alpha = 0.1)
#pl.ylabel("female ratio")
#pl.xlabel("citations")
In [46]:
# Female fraction among the lead authors against the paper's citation count.
plt.figure()
plt.scatter(paperstats['ncite'], paperstats['femaleratio'], alpha=0.1)
plt.xlabel("number of citations")
plt.ylabel("female ratio")
plt.show()
In [13]:
def pickAndDel(indx, picks):
    """Remove the element at position ``indx`` from ``picks`` and return it.

    ``picks`` is modified in place, so it is one element shorter afterwards.
    """
    return picks.pop(indx)
# Null model: the tot_female female lead authors are placed at random, without
# replacement, among the maxauth * Np lead-author slots (maxauth per paper).
# fameleFracRand[i, j] is the resulting female fraction of paper j in draw i.
MC_SEED = 42  # fixed seed so the simulated draws are reproducible
rng = np.random.RandomState(MC_SEED)

fameleFracRand = np.zeros((NMC, Np))
if Np > 0:
    n_slots = maxauth * Np
    for i in range(NMC):
        # Slot k belongs to paper k % Np, exactly like drawing from the list
        # range(Np) repeated maxauth times.
        femaleRand = rng.choice(n_slots, size=tot_female, replace=False) % Np
        fameleFracRand[i] = np.bincount(femaleRand, minlength=Np) / float(maxauth)
In [91]:
# Overlay a few simulated draws on the observed distribution. Only the first
# draw carries the legend label, so every draw is plotted exactly once and the
# legend still shows a single "Simulated" entry.
NSHOW = 10
for i in range(min(NSHOW, len(fameleFracRand))):
    pl.hist(fameleFracRand[i], color='IndianRed', alpha=0.3,
            label="Simulated" if i == 0 else "_nolegend_")
pl.hist(paperstats['femaleratio'], color='SteelBlue', alpha=0.7,
        label="True")
pl.xlabel("Fraction of female authors")
pl.legend()
Out[91]:
In [28]:
# ratios[i, b]: number of papers of Monte Carlo draw i whose female fraction
# falls in bin b. The bin range is pinned to [0, 1] so that every draw uses
# the same bins even if it happens to contain no paper at 0 or at 1.
NBINS = 10
ratios = np.zeros((NMC, NBINS))
for i in range(NMC):
    ratios[i] = np.histogram(fameleFracRand[i], bins=NBINS,
                             range=(0.0, 1.0))[0]

# statratios[b] = (mean, standard deviation) of the bin-b count over the draws
statratios = np.column_stack((ratios.mean(axis=0), ratios.std(axis=0)))
statratios.T
Out[28]:
In [109]:
# Observed histogram with the Monte Carlo expectation on top. Both use 10 bins
# over the fixed range [0, 1], so the bin centres do not depend on the values
# that happen to occur in one particular simulated draw.
NBINS = 10
y = pl.hist(paperstats['femaleratio'], bins=NBINS, range=(0.0, 1.0),
            color='SteelBlue', alpha=0.5, label="True fractions")
edges = np.asarray(y[1])
x = 0.5 * (edges[1:] + edges[:-1])  # bin centres

# With maxauth = 3 the fraction can only be 0, 1/3, 2/3 or 1, i.e. bins
# 0, 3, 6 and 9 -- hence every third bin. The error bar adds, in quadrature,
# the Monte Carlo scatter and the observed count of the bin.
pl.errorbar(x[::3], statratios[::3, 0],
            yerr=np.sqrt(statratios[::3, 1] ** 2 + y[0][::3]),
            fmt='.', color='IndianRed', label="MC simulated fractions")
pl.legend(fontsize=10)
pl.xlabel("Female fraction in lead 3 authors")
pl.ylabel("Number of papers")
pl.savefig("ADSgenderclustering.png")
In [61]:
# KS test: observed per-paper female fractions against the pooled simulated
# per-paper fractions. Both samples must be fractions; statratios[:, 0] holds
# mean histogram bin counts and is not comparable with them.
sp.stats.ks_2samp(paperstats['femaleratio'], fameleFracRand.ravel())
Out[61]:
In [62]:
# AD test: observed per-paper female fractions against the pooled simulated
# per-paper fractions. Both samples must be fractions; statratios[:, 0] holds
# mean histogram bin counts and is not comparable with them.
sp.stats.anderson_ksamp([paperstats['femaleratio'], fameleFracRand.ravel()])
Out[62]:
In [70]:
# Two-sample tests of the observed per-paper female fractions against each
# Monte Carlo draw. The simulated sample is fameleFracRand[i] (per-paper
# fractions); ratios[i] holds histogram bin counts and is not comparable.
observed = np.asarray(paperstats['femaleratio'], dtype=float)
ks = np.zeros(NMC)  # element [1] of the ks_2samp result, per draw
ad = np.zeros(NMC)  # element [2] of the anderson_ksamp result, per draw
for i in range(NMC):
    ks[i] = sp.stats.ks_2samp(observed, fameleFracRand[i])[1]
    ad[i] = sp.stats.anderson_ksamp([observed, fameleFracRand[i]])[2]
In [82]:
# Summarise the per-draw test results and compare their means with the
# 0.003 threshold used for the verdict.
ks_mean, ks_std = ks.mean(), ks.std()
ad_mean, ad_std = ad.mean(), ad.std()
print("KS mean, std:", ks_mean, ks_std)
print("AD mean, std:", ad_mean, ad_std)
null_rejected = (ks_mean < 0.003) and (ad_mean < 0.003)
print(r"Null Rejected at > 3 Sigma!" if null_rejected else "Null not rejected")
In [ ]: