In [1]:
import numpy
import matplotlib
from matplotlib import pylab, mlab, pyplot
np = numpy
plt = pyplot

from IPython.core.pylabtools import figsize, getfigs

%matplotlib inline
from pylab import *
from numpy import *

import tabular
from sklearn import svm

In [2]:
import tabular as tb
X = tb.tabarray(SVfile = 'edge.csv')


Inferring delimiter to be ','
Setting metadata attribute from dialect delimiter to equal specified value: ','
Inferring names from the last header line (line 1 ).

In [3]:
X.shape


Out[3]:
(7975,)

In [4]:
fieldnames = X.metadata['names']

In [29]:
print fieldnames


['Year', 'Title', 'Link', 'Type', 'ThreadId', 'Male_Contributions', 'Female_Contributions', 'FemaleParticipation', 'NumberofAuthorContributions', 'DebateSize', 'Live', 'UniqueContributors', 'UniqueMaleContributors', 'UniqueFemaleContributors', 'UniqueFemaleParticipation', 'Id', 'Id_num', 'Role', 'TwoAuthors', 'Name', 'Male', 'Female', 'Academic', 'Limited_Information', 'Job_Title', 'Job_Title_S', 'Job_Title_S_num', 'Department', 'Department_S', 'Department_S_num', 'Discipline', 'Workplace', 'HavePhD', 'PhD_Field', 'PhD_Year', 'PreviousContributions', 'ContributionsThisYear', 'ThreadsThisYear', 'PreviousThreads', 'AuthorAndCommenter', 'PhD_Institution', 'Years_from_PhD', 'PhD_Institution_SR', 'PhD_Institution_SR_Bin', 'Workplace_SR', 'Workplace_SR_Bin', 'SR_Ranking_Dif', 'PhD_Institution_US_IR', 'PhD_Institution_US_IR_Bin', 'Workplace_US_IR', 'Workplace_US_IR_Bin', 'USA_I_Ranking_Dif', 'PhD_Institution_US', 'PhD_Institution_US_Bin', 'Workplace_US', 'Workplace_US_Bin', 'USA_Ranking_Dif', 'Total_Citations', 'H_Index', 'i10_Index', 'Citations_Year', 'Citations_Cumulative', 'AcademicHierarchyStrict', 'PreviousCitations', 'ContributionsbyAuthor', 'dummy_Natural Sciences', 'dummy_Social Sciences', 'dummy_Professions', 'dummy_Humanities', 'dummy_Formal Sciences', 'dummy_Physics', 'dummy_Anthropology', 'dummy_Earth Sciences', 'dummy_Biology', 'dummy_Psychology', 'dummy_Journalism, media studies and communication', 'dummy_Medicine', 'dummy_Philosophy', 'dummy_Space Sciences', 'dummy_Linguistics', 'dummy_Computer Sciences', 'dummy_Engineering', 'dummy_Arts', 'dummy_Business', 'dummy_Environmental Studies and Forrestry', 'dummy_Sociology', 'dummy_Mathematics', 'dummy_Asian Studies', 'dummy_Education', 'dummy_Political Science', 'dummy_Economics', 'dummy_Systems', 'dummy_History', 'dummy_Musics', 'dummy_Chemistry', 'dummy_Archeology', 'dummy_Architecture and Design', 'dummy_Law', 'dummy_Zoology', 'dummy_Literature', 'dummy_Divinity', 'Order', 'Text', 'Number_Characters', 'WC', 'WPS', 'Sixltr', 'Dic', 'Numerals', 'funct', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they', 'ipron', 'article', 'verb', 'auxverb', 'past', 'present', 'future', 'adverb', 'preps', 'conj', 'negate', 'quant', 'number', 'swear', 'social', 'family', 'friend', 'humans', 'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad', 'cogmech', 'insight', 'cause', 'discrep', 'tentat', 'certain', 'inhib', 'incl', 'excl', 'percept', 'see', 'hear', 'feel', 'bio', 'body', 'health', 'sexual', 'ingest', 'relativ', 'motion', 'space', 'time', 'work', 'achieve', 'leisure', 'home', 'money', 'relig', 'death', 'assent', 'nonfl', 'filler', 'Period', 'Comma', 'Colon', 'SemiC', 'QMark', 'Exclam', 'Dash', 'Quote', 'Apostro', 'Parenth', 'OtherP', 'AllPct']

In [5]:

Testing the stated hypotheses...


In [6]:
# Hypothesis 1a: Higher status participants are more verbose than are lower status participants.
# Hypothesis 1b: Higher status participants use more dominant language than do lower status participants.
# Hypothesis 1c: Male participants are more verbose than are female participants.
# Hypothesis 1d: Male participants use more dominant language than do female participants.
# Hypothesis 2a: Gender and status interact, such that high status is a better predictor of verbosity for male scientists than for female scientists. 
# Hypothesis 2b: Gender and status interact, such that high status is a better predictor of dominant language among male scientists than among female scientists. 
# Hypothesis 3a: Very low and very high status participants are the least likely to be verbose.
# Hypothesis 3b: Very low and very high status participants are the least likely to use dominant language.
# **Hypothesis 4: Female participation correlates with the number of females in the discussion.
# Hypothesis 5a: The effect of gender on verbosity will be stronger in live speech than in written text.
# Hypothesis 5b: The effect of gender on use of dominant language will be stronger in live speech than in written text.

# variables of interest = [‘status’, ‘verbosity’, ‘gender’, ‘linguistic dominance’, ‘modality’, ‘discussion index’]

In [39]:
# Hypothesis 1a: Higher status participants are more verbose than are lower status participants.

## Status markers
# 'H_Index',
#  'i10_Index',
#  'Citations_Year',
#  'Citations_Cumulative',
#  'AcademicHierarchyStrict',
#  'PreviousCitations'

## Word count
# 'WC'

def get_nonnan_indices(a):
    inds = [isnan(i) for i in a if isnan(i)==False]
    prop_nans = sum(inds)/len(a)
    return inds,prop_nans

h_ind,i10_ind,cityear_ind,citcumul_ind = map(get_nonnan_indices,[X['H_Index'],X['i10_Index'],X['Citations_Year'],X['Citations_Cumulative']])

In [51]:
fig = plt.figure(figsize=(12,5))
ax = fig.add_subplot(1,2,1)
plt.scatter(X['Citations_Year'],X['WC'])
ax = fig.add_subplot(1,2,2)
plt.scatter(X['Citations_Year'],X['WPS'])


Out[51]:
<matplotlib.collections.PathCollection at 0x10dc70c90>

In [63]:
h = plt.hist(X['WC'],300)
plt.xlim([0, 1000])


Out[63]:
(0, 1000)

In [79]:
logcityear,loghind,logcitcumul = map(np.log())
status = X['Citations_Year']+X['H_Index']+X['Citations_Cumulative']

In [76]:
plt.scatter(status,np.log(X['WC']))
plt.ylabel('log (word count)')
plt.xlabel('status score')


Out[76]:
<matplotlib.text.Text at 0x10f33cdd0>

In [17]:
A = X['Citations_Cumulative']
cityear = A[~np.isnan(A)]

In [23]:
h = plt.hist(cityear,100)



In [25]:
# lower status cutoff
np.percentile(cityear,33)


Out[25]:
4923.0

In [26]:
# medium status cutoff
np.percentile(cityear,66)


Out[26]:
19077.0

In [34]:
phd = nansum(X['HavePhD'])/len(X['HavePhD'])
usrank = X['USA_Ranking_Dif']

In [40]:
def remove_nans(a):
    return a[~np.isnan(a)]

h = plt.hist(remove_nans(usrank),50)



In [56]:
# score people as 1 for belonging to the first bin, 0 for any other bin
ranks = remove_nans(X['Workplace_SR_Bin'])
h = plt.hist(ranks)



In [57]:
# score people as 1 for belonging to the first bin, 0 for any other bin
ranks = remove_nans(X['PhD_Institution_SR_Bin'])
h = plt.hist(ranks)



In [59]:
# career stage = award +1 for being a chaired professor or professor (that is, 5 or 6)
career_stage = remove_nans(X['AcademicHierarchyStrict'])
h = plt.hist(career_stage)



In [66]:
def _normalize(x):   
    x = x - x.mean(0)
    x = x/x.std(0)
    return x

verbosity = _normalize(X['WC'])

In [82]:
career_status = X['AcademicHierarchyStrict'] + X['PhD_Institution_SR_Bin'] + X['Workplace_SR_Bin'] + X['HavePhD'] 
pub_status = X['Citations_Cumulative']

c1 = (X['AcademicHierarchyStrict'] > 4) * 1.0 
c2 = (X['PhD_Institution_SR_Bin'] == 1) * 1.0
c3 = (X['Workplace_SR_Bin'] == 1) * 1.0
c4 = (X['HavePhD'] == 1) * 1.0

c5 = (X['Citations_Cumulative'] > np.percentile(X['Citations_Cumulative'],1/3.0)) * 1.0
c6 = (X['Citations_Cumulative'] > np.percentile(X['Citations_Cumulative'],2/3.0)) * 1.0

c = c1 + c2 + c3 + c4 + c5 + c6


-c:9: RuntimeWarning: invalid value encountered in greater
-c:10: RuntimeWarning: invalid value encountered in greater

In [84]:
plt.hist(c,6)


Out[84]:
(array([ 2227.,   440.,  1313.,   767.,  1885.,  1343.]),
 array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.]),
 <a list of 6 Patch objects>)

In [88]:
# Jordan: 15:02, Thu Aug 27 2015: In the current analysis, anybody from whom we are missing data is awarded no points for that category.
# This is biased towards low status. What we really want is for the people from whom we are missing data to be regarded as being
# of average status. 
sum(np.isnan(X['Workplace_SR_Bin']))


Out[88]:
3721

In [ ]:
# Theodore thought that that that that 
# What I'm telling you is that // that (~b/c) (that)--> "that" that (that)--> student wrote was a 'that' and not a 'which' is the reason he failed.