In [1]:
# Notebook environment setup. The star imports stay because later cells call
# numpy/pylab names (isnan, nansum, percentile, ...) unqualified.
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from pylab import *
from numpy import *
import tabular
from sklearn import svm
In [2]:
import tabular as tb
# Load the discussion-participant dataset from CSV into a tabular array.
X = tb.tabarray(SVfile = 'edge.csv')
In [3]:
X.shape
Out[3]:
In [4]:
fieldnames = X.metadata['names']
In [29]:
print(fieldnames)
In [6]:
# Hypothesis 1a: Higher status participants are more verbose than are lower status participants.
# Hypothesis 1b: Higher status participants use more dominant language than do lower status participants.
# Hypothesis 1c: Male participants are more verbose than are female participants.
# Hypothesis 1d: Male participants use more dominant language than do female participants.
# Hypothesis 2a: Gender and status interact, such that high status is a better predictor of verbosity for male scientists than for female scientists.
# Hypothesis 2b: Gender and status interact, such that high status is a better predictor of dominant language among male scientists than among female scientists.
# Hypothesis 3a: Very low and very high status participants are the least likely to be verbose.
# Hypothesis 3b: Very low and very high status participants are the least likely to use dominant language.
# Hypothesis 4: Female participation correlates with the number of females in the discussion.
# Hypothesis 5a: The effect of gender on verbosity will be stronger in live speech than in written text.
# Hypothesis 5b: The effect of gender on use of dominant language will be stronger in live speech than in written text.
# variables of interest = ['status', 'verbosity', 'gender', 'linguistic dominance', 'modality', 'discussion index']
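A sketch mapping the constructs above to candidate columns. Only the status and verbosity columns are confirmed by later cells in this notebook; the gender, dominance, modality, and discussion-index names are assumptions about edge.csv.
In [ ]:
# Hypothesized construct-to-column mapping; the flagged entries are guesses.
construct_columns = {
    'status': ['H_Index', 'i10_Index', 'Citations_Year', 'Citations_Cumulative',
               'AcademicHierarchyStrict', 'PreviousCitations'],
    'verbosity': ['WC'],                       # word count (used below)
    'gender': ['Gender'],                      # assumed column name
    'linguistic dominance': ['Dominance'],     # assumed column name
    'modality': ['Modality'],                  # assumed: live speech vs. text
    'discussion index': ['DiscussionIndex'],   # assumed column name
}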
In [39]:
# Hypothesis 1a: Higher status participants are more verbose than are lower status participants.
## Status markers
# 'H_Index',
# 'i10_Index',
# 'Citations_Year',
# 'Citations_Cumulative',
# 'AcademicHierarchyStrict',
# 'PreviousCitations'
## Word count
# 'WC'
def get_nonnan_indices(a):
    # Indices of the non-NaN entries, plus the fraction of entries that are NaN.
    inds = np.where(~np.isnan(a))[0]
    prop_nans = 1.0 - len(inds) / float(len(a))
    return inds, prop_nans
h_ind, i10_ind, cityear_ind, citcumul_ind = map(
    get_nonnan_indices,
    [X['H_Index'], X['i10_Index'], X['Citations_Year'], X['Citations_Cumulative']])
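A quick audit of missingness in each status marker, using the helper above; each call returns (non-NaN indices, NaN fraction).
In [ ]:
# Print the fraction of missing values per candidate status marker.
for name in ['H_Index', 'i10_Index', 'Citations_Year', 'Citations_Cumulative']:
    _, prop = get_nonnan_indices(X[name])
    print('%s: %.1f%% NaN' % (name, 100 * prop))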
In [51]:
fig = plt.figure(figsize=(12, 5))
ax = fig.add_subplot(1, 2, 1)
plt.scatter(X['Citations_Year'], X['WC'])   # citations/year vs. word count
ax = fig.add_subplot(1, 2, 2)
plt.scatter(X['Citations_Year'], X['WPS'])  # citations/year vs. words per sentence
Out[51]:
In [63]:
h = plt.hist(X['WC'],300)
plt.xlim([0, 1000])
Out[63]:
In [79]:
logcityear, loghind, logcitcumul = map(
    np.log, [X['Citations_Year'], X['H_Index'], X['Citations_Cumulative']])
status = X['Citations_Year'] + X['H_Index'] + X['Citations_Cumulative']
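The raw sum above is dominated by Citations_Cumulative, whose scale dwarfs H_Index. A minimal equal-weight alternative, assuming the three markers are positive (so their logs are finite): z-score each log-transformed marker before summing.
In [ ]:
# Equal-weight composite status: z-score each log marker, then sum.
def zscore(a):
    return (a - np.nanmean(a)) / np.nanstd(a)

status_z = zscore(logcityear) + zscore(loghind) + zscore(logcitcumul)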
In [76]:
plt.scatter(status,np.log(X['WC']))
plt.ylabel('log (word count)')
plt.xlabel('status score')
Out[76]:
In [17]:
A = X['Citations_Cumulative']
cityear = A[~np.isnan(A)]  # NB: despite the name, this holds cumulative citations
In [23]:
h = plt.hist(cityear,100)
In [25]:
# lower status cutoff
np.percentile(cityear,33)
Out[25]:
In [26]:
# medium status cutoff
np.percentile(cityear,66)
Out[26]:
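The two cutoffs split cumulative citations into rough thirds; np.digitize turns them into an explicit three-level code.
In [ ]:
# 0 = low, 1 = medium, 2 = high cumulative-citation status.
cuts = [np.percentile(cityear, 33), np.percentile(cityear, 66)]
status_level = np.digitize(cityear, cuts)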
In [34]:
phd = nansum(X['HavePhD']) / float(len(X['HavePhD']))  # proportion of participants with a PhD
usrank = X['USA_Ranking_Dif']
In [40]:
def remove_nans(a):
    # Drop NaN entries so histograms and percentiles behave.
    return a[~np.isnan(a)]

h = plt.hist(remove_nans(usrank), 50)
In [56]:
# score people as 1 for belonging to the first bin, 0 for any other bin
ranks = remove_nans(X['Workplace_SR_Bin'])
h = plt.hist(ranks)
In [57]:
# score people as 1 for belonging to the first bin, 0 for any other bin
ranks = remove_nans(X['PhD_Institution_SR_Bin'])
h = plt.hist(ranks)
In [59]:
# Career stage: award 1 point for being a professor or chaired professor (AcademicHierarchyStrict of 5 or 6).
career_stage = remove_nans(X['AcademicHierarchyStrict'])
h = plt.hist(career_stage)
In [66]:
def _normalize(x):
    # Z-score: subtract the mean, divide by the standard deviation.
    x = x - x.mean(0)
    x = x / x.std(0)
    return x
verbosity = _normalize(X['WC'])
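If WC contains NaNs, mean() and std() return NaN and the normalization above collapses to all-NaN. A NaN-aware variant, in case that applies to this column:
In [ ]:
# Same z-score, but ignore NaN entries when estimating the mean and std.
def _normalize_nan(x):
    return (x - np.nanmean(x)) / np.nanstd(x)

verbosity = _normalize_nan(X['WC'])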
In [82]:
career_status = X['AcademicHierarchyStrict'] + X['PhD_Institution_SR_Bin'] + X['Workplace_SR_Bin'] + X['HavePhD']
pub_status = X['Citations_Cumulative']
c1 = (X['AcademicHierarchyStrict'] > 4) * 1.0
c2 = (X['PhD_Institution_SR_Bin'] == 1) * 1.0
c3 = (X['Workplace_SR_Bin'] == 1) * 1.0
c4 = (X['HavePhD'] == 1) * 1.0
# 33rd/66th percentiles: np.percentile takes q in [0, 100], so 1/3.0 would be the
# 0.33rd percentile. NaNs are dropped first so the percentile itself is not NaN.
citcumul = remove_nans(X['Citations_Cumulative'])
c5 = (X['Citations_Cumulative'] > np.percentile(citcumul, 100 / 3.0)) * 1.0
c6 = (X['Citations_Cumulative'] > np.percentile(citcumul, 200 / 3.0)) * 1.0
c = c1 + c2 + c3 + c4 + c5 + c6
In [84]:
plt.hist(c,6)
Out[84]:
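With the composite score c and normalized verbosity in hand, one first-pass check of Hypothesis 1a is their correlation; a proper regression with controls should follow, but this is a minimal sketch.
In [ ]:
# Correlate composite status with verbosity, dropping rows where either is NaN.
from scipy import stats
ok = ~np.isnan(c) & ~np.isnan(verbosity)
r, p = stats.pearsonr(c[ok], verbosity[ok])
print('r = %.3f, p = %.3g' % (r, p))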
In [88]:
# Jordan: 15:02, Thu Aug 27 2015: In the current analysis, anybody for whom we are missing data is awarded no points for that category.
# This biases the score towards low status. What we really want is for people with missing data to be regarded as
# being of average status.
sum(np.isnan(X['Workplace_SR_Bin']))
Out[88]:
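A sketch of the fix Jordan describes: score each criterion as before, but give people with missing data that criterion's average score instead of zero. Shown for c1-c4; the citation-percentile criteria c5 and c6 extend the same way.
In [ ]:
# Mean-impute each 0/1 criterion so missing data reads as average status.
def score_with_mean_imputation(col, criterion):
    s = criterion(col) * 1.0
    s[np.isnan(col)] = s[~np.isnan(col)].mean()
    return s

c1m = score_with_mean_imputation(X['AcademicHierarchyStrict'], lambda v: v > 4)
c2m = score_with_mean_imputation(X['PhD_Institution_SR_Bin'], lambda v: v == 1)
c3m = score_with_mean_imputation(X['Workplace_SR_Bin'], lambda v: v == 1)
c4m = score_with_mean_imputation(X['HavePhD'], lambda v: v == 1)
c_imputed = c1m + c2m + c3m + c4m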
In [ ]:
# Scratch note on nested 'that' parsing:
# "That the 'that' that that student wrote was a 'that' and not a 'which' is the reason he failed."