notebook.community

Edit and run



In [37]:

    
import pandas as pd
import nltk
from collections import Counter
from nltk.corpus import brown
from nltk import UnigramTagger as ut
import matplotlib.pyplot as plt



In [3]:

    
post23 = pd.read_csv('./Oct15_post23bio.tsv', sep='\t')



In [4]:

    
pre23 = pd.read_csv('./Oct15_pre23bio.tsv', sep='\t')



In [7]:

    
bio = pd.concat([pre23, post23], ignore_index=True)



In [7]:

    
# bio.to_csv('./Oct15_bio.tsv', sep='\t', index=False)



In [8]:

    
bio.head()









    Out[8]:







  
    
      
      docid
      charname
      charid
      chargender
      authgender
      pubdate
      words
    
  
  
    
      0
      yale.39002005473021
      BaronShibusawa
      yale.39002005473021|BaronShibusawa
      m
      m
      1920
      agent-is agent-has agent-given agent-does agen...
    
    
      1
      uc2.ark+=13960=t3222sv4b
      LordSalisbury
      uc2.ark+=13960=t3222sv4b|LordSalisbury
      m
      f
      1907
      agent-wrote agent-says agent-succeeded agent-o...
    
    
      2
      uc2.ark+=13960=t3222sv4b
      Nicholas
      uc2.ark+=13960=t3222sv4b|Nicholas
      m
      f
      1907
      agent-liked agent-says agent-was agent-worked ...
    
    
      3
      uc2.ark+=13960=t3222sv4b
      Mr.GathorneHardy
      uc2.ark+=13960=t3222sv4b|Mr.GathorneHardy
      m
      f
      1907
      agent-put agent-made agent-were agent-recognis...
    
    
      4
      uc2.ark+=13960=t3222sv4b
      Wilberforce
      uc2.ark+=13960=t3222sv4b|Wilberforce
      m
      f
      1907
      agent-held agent-foresaw agent-had poss-life p...



In [9]:

    
print(post23.shape)
print(pre23.shape)
print(bio.shape)









    



(515872, 7)
(855434, 7)
(1371306, 7)



In [10]:

    
pd.set_option('max_colwidth', 500)
bio.words[:5]









    Out[10]:





0                                                                                                                       agent-is agent-has agent-given agent-does agent-takes agent-explained agent-trying agent-thinks agent-get agent-have agent-sent agent-feared poss-theory poss-money poss-influenza mod-confucianist mod-desirous
1                                    agent-wrote agent-says agent-succeeded agent-offered agent-wrote agent-made agent-returned agent-ceased agent-selected agent-see poss-brother poss-government poss-offer poss-cousin poss-post poss-title poss-government poss-instructions patient-chosen patient-informing said-peace said-honour
2                                                                                                                                                                                                         agent-liked agent-says agent-was agent-worked poss-sister poss-letters poss-family poss-sister said-liked said-seen said-heard
3                                                                                                                                                         agent-put agent-made agent-were agent-recognised agent-considered agent-maintained poss-reply patient-furnished patient-entered patient-raised patient-replaced said-emergency
4    agent-held agent-foresaw agent-had poss-life poss-estates poss-mind mod-promoter said-hearts said-retire said-often said-littleness said-common said-greatness said-god said-presence said-mercy said-grow said-day said-day said-nobleness said-aim said-quick said-steadiness said-purpose said-greatness said-conduct said-makes
Name: words, dtype: object



In [11]:

    
# tuples (tag, word)
# [tuple(item.split('-')) for sublist in bio.words[:5].str.split() for item in sublist]

# just words
# [item.split('-')[1] for sublist in bio.words[:5].str.split() for item in sublist]

# just tags
# [item.split('-')[0] for sublist in bio.words[:5].str.split() for item in sublist]

# one big dict
# for k, v in [tuple(item.split('-')) for sublist in bio.words[:5].str.split() for item in sublist]:
#     d[k].append(v)

from collections import defaultdict

def pos_to_dict(words_str):
    d = defaultdict( list )
    try:
        for k, v in [ tuple(item.split('-')) for item in words_str.split() ]:
            d[k].append(v)
    except (ValueError):
        # where the word has multiple hyphens, split off only the first one as the pos key
        for k, v in [ (item.split('-')[0], item[len(item.split('-')[0])+1:]) for item in words_str.split() ]:
            d[k].append(v)
    except (AttributeError, TypeError):
        return words_str
    return d

bio['words'] = bio['words'].apply(pos_to_dict)



In [13]:

    
def count_pos(pos_dict):
    d = { k:len(v) for k, v in pos_dict.items() }
    return d

bio['count_pos'] = bio['words'].apply(count_pos)



In [20]:

    
bio.head(10)









    Out[20]:







  
    
      
      docid
      charname
      charid
      chargender
      authgender
      pubdate
      words
      count_pos
    
  
  
    
      0
      yale.39002005473021
      BaronShibusawa
      yale.39002005473021|BaronShibusawa
      m
      m
      1920
      {'agent': ['is', 'has', 'given', 'does', 'takes', 'explained', 'trying', 'thinks', 'get', 'have', 'sent', 'feared'], 'poss': ['theory', 'money', 'influenza'], 'mod': ['confucianist', 'desirous']}
      {'agent': 12, 'poss': 3, 'mod': 2}
    
    
      1
      uc2.ark+=13960=t3222sv4b
      LordSalisbury
      uc2.ark+=13960=t3222sv4b|LordSalisbury
      m
      f
      1907
      {'agent': ['wrote', 'says', 'succeeded', 'offered', 'wrote', 'made', 'returned', 'ceased', 'selected', 'see'], 'poss': ['brother', 'government', 'offer', 'cousin', 'post', 'title', 'government', 'instructions'], 'patient': ['chosen', 'informing'], 'said': ['peace', 'honour']}
      {'agent': 10, 'poss': 8, 'patient': 2, 'said': 2}
    
    
      2
      uc2.ark+=13960=t3222sv4b
      Nicholas
      uc2.ark+=13960=t3222sv4b|Nicholas
      m
      f
      1907
      {'agent': ['liked', 'says', 'was', 'worked'], 'poss': ['sister', 'letters', 'family', 'sister'], 'said': ['liked', 'seen', 'heard']}
      {'agent': 4, 'poss': 4, 'said': 3}
    
    
      3
      uc2.ark+=13960=t3222sv4b
      Mr.GathorneHardy
      uc2.ark+=13960=t3222sv4b|Mr.GathorneHardy
      m
      f
      1907
      {'agent': ['put', 'made', 'were', 'recognised', 'considered', 'maintained'], 'poss': ['reply'], 'patient': ['furnished', 'entered', 'raised', 'replaced'], 'said': ['emergency']}
      {'agent': 6, 'poss': 1, 'patient': 4, 'said': 1}
    
    
      4
      uc2.ark+=13960=t3222sv4b
      Wilberforce
      uc2.ark+=13960=t3222sv4b|Wilberforce
      m
      f
      1907
      {'agent': ['held', 'foresaw', 'had'], 'poss': ['life', 'estates', 'mind'], 'mod': ['promoter'], 'said': ['hearts', 'retire', 'often', 'littleness', 'common', 'greatness', 'god', 'presence', 'mercy', 'grow', 'day', 'day', 'nobleness', 'aim', 'quick', 'steadiness', 'purpose', 'greatness', 'conduct', 'makes']}
      {'agent': 3, 'poss': 3, 'mod': 1, 'said': 20}
    
    
      5
      uc2.ark+=13960=t3222sv4b
      LordBeaconsfield
      uc2.ark+=13960=t3222sv4b|LordBeaconsfield
      m
      f
      1907
      {'agent': ['shown', 'offered', 'making', 'tendered', 'wrote', 'looked', 'shown', 'offered', 'making', 'tendered', 'wrote', 'looked'], 'poss': ['judgement', 'accepting', 'resignation', 'successor', 'gladstone', 'engagement', 'life', 'host', 'party', 'judgement', 'accepting', 'resignation', 'successor', 'gladstone', 'engagement', 'life', 'host', 'party'], 'patient': ['summoned', 'showed', 'summoned', 'showed'], 'said': ['obliged', 'forego', 'honour', 'pleasure', 'guest', 'obliged', 'forego', '...
      {'agent': 12, 'poss': 18, 'patient': 4, 'said': 20}
    
    
      6
      uc2.ark+=13960=t3222sv4b
      Bob
      uc2.ark+=13960=t3222sv4b|Bob
      m
      f
      1907
      {'agent': ['looking', 'brings', 'has', 'seems', 'approve', 'carried', 'was', 'make', 'looked', 'came', 'wished', 'won', 'disarmed', 'do', 'get', 'got', 'bought', 'done', 'be'], 'poss': ['career', 'forehead', 'mouth', 'movements', 'companions', 'family', 'beauty', 'sergeant', 'father', 'permission'], 'mod': ['slender', 'graceful', 'quiet', 'sarcastic', 'calm', 'person'], 'patient': ['love', 'saw', 'marked', 'saves', 'carried', 'thanking', 'assuring']}
      {'agent': 19, 'poss': 10, 'mod': 6, 'patient': 7}
    
    
      7
      uc2.ark+=13960=t3222sv4b
      Earl
      uc2.ark+=13960=t3222sv4b|Earl
      m
      f
      1907
      {'agent': ['sold', 'came', 'resigned', 'pass'], 'poss': ['marriage', 'mines', 'brother', 'history', 'crimea'], 'said': ['history', 'lives', 'lindsays', 'christian', 'art']}
      {'agent': 4, 'poss': 5, 'said': 5}
    
    
      8
      uc2.ark+=13960=t3222sv4b
      Committee
      uc2.ark+=13960=t3222sv4b|Committee
      u
      f
      1907
      {'agent': ['made', 'decided', 'directed', 'consider', 'attributed', 'is'], 'patient': ['had'], 'said': ['terms', 'conditions', 'service', 'army']}
      {'agent': 6, 'patient': 1, 'said': 4}
    
    
      9
      uc2.ark+=13960=t3222sv4b
      FrederickStephenson
      uc2.ark+=13960=t3222sv4b|FrederickStephenson
      m
      f
      1907
      {'agent': ['join', 'took', 'invalided', 'was', 'have', 'pitched', 'loved', 'coming'], 'poss': ['officers', 'senior', 'friend', 'regiment', 'father', 'regiment', 'father', 'country', 'career', 'richmond'], 'mod': ['adjutant'], 'patient': ['telling', 'include', 'invalided', 'exhausted', 'welcome']}
      {'agent': 8, 'poss': 10, 'mod': 1, 'patient': 5}

Questions:

are the relative frequency of verbs associated with male characters higher?
- verbs: agent (object), patient (passive)
- nouns: mod, poss
for feminine and masuline characters are there different ratios for those roles
- (agent-patient of verb), (poss-mod of noun)
'up, down, away, there, again'
- poss role? or mod role?



In [21]:

    
from collections import Counter

def get_counts(df):
    c = Counter()
    for idx, row in df['count_pos'].items():
        c.update(row)
    return c



In [27]:

    
all_counts = get_counts(bio)



In [22]:

    
male_char_counts = get_counts(bio.loc[bio.chargender == 'm',:])
female_char_counts = get_counts(bio.loc[bio.chargender == 'f',:])



In [33]:

    
male_auth_counts = get_counts(bio.loc[bio.authgender == 'm',:])
female_auth_counts = get_counts(bio.loc[bio.authgender == 'f',:])



In [27]:

    
male_char_counts









    Out[27]:





Counter({'agent': 31365088,
         'lll>': 1,
         'mod': 2593052,
         'patient': 6141017,
         'poss': 21493348,
         'said': 28788655,
         'tho': 1,
         'to': 1})

Is the relative frequency of verbs associated with male characters higher?



In [154]:

    
print('relative % of verbs associated with male characters', (male_char_counts['agent'] 
                                                              + male_char_counts['patient']) 
      / (male_char_counts['agent'] 
         + male_char_counts['patient'] 
         + male_char_counts['mod'] 
         + male_char_counts['poss'] 
         + male_char_counts['said'])
     )
print('relative % of verbs associated with female characters', (female_char_counts['agent'] 
                                                                + female_char_counts['patient']) 
      / (female_char_counts['agent'] 
         + female_char_counts['patient'] 
         + female_char_counts['mod'] 
         + female_char_counts['poss'] 
         + female_char_counts['said'])
     )









    



relative % of verbs associated with male characters 0.41497702618554577
relative % of verbs associated with female characters 0.4330731659372056

Percentage of verbs that are active



In [158]:

    
print('% of verbs describing men that are active verbs', (male_char_counts['agent'] 
                                                          / (male_char_counts['agent'] 
                                                             + male_char_counts['patient'])
                                                         ))

print('% of verbs describing women that are active verbs', (female_char_counts['agent'] 
                                                            / (female_char_counts['agent'] 
                                                               + female_char_counts['patient'])
                                                           ))









    



% of verbs describing men that are active verbs 0.8362662025288949
% of verbs describing women that are active verbs 0.8437637681994732

Percentage of nouns that are possessive



In [159]:

    
print('% of nouns describing men that are possessive nouns', (male_char_counts['poss'] 
                                                              / (male_char_counts['poss'] 
                                                                 + male_char_counts['mod'])
                                                             ))

print('% of nouns describing women that are possessive nouns', (female_char_counts['poss'] 
                                                                / (female_char_counts['poss'] 
                                                                   + female_char_counts['mod'])
                                                               ))









    



% of nouns describing men that are possessive nouns 0.8923437292413976
% of nouns describing women that are possessive nouns 0.8992377751419598

Is the relative frequency of dialogue words associated with male characters higher?



In [160]:

    
print('relative % of dialog words assocatied with male characters', (male_char_counts['said']) 
      / (male_char_counts['agent'] 
         + male_char_counts['patient'] 
         + male_char_counts['mod'] 
         + male_char_counts['poss'] 
         + male_char_counts['said'])
     )
print('relative % of dialog words associated with female characters', (female_char_counts['said']) 
      / (female_char_counts['agent'] 
         + female_char_counts['patient'] 
         + female_char_counts['mod'] 
         + female_char_counts['poss'] 
         + female_char_counts['said'] )
     )









    



relative % of dialog words assocatied with male characters 0.3185249558646957
relative % of dialog words associated with female characters 0.28209319761216356

what are the verbs being used most by woman? (authored)
- verbs: agent (object), patient (passive)
- nouns: mod, poss

plot by year:
- % of verbs describing women that are active verbs
- % of verbs describing men that are active verbs

raw data of character sizes
- plotting trend over time

path, '/projects/ischoolichass/ichass/usesofscale/chardata'



In [31]:

    
all_by_year = bio.groupby('pubdate').apply(get_counts)









    Out[31]:





pubdate
0                           {'agent': 1185, 'poss': 838, 'mod': 60, 'patient': 179, 'said': 1864}
1701                         {'agent': 909, 'poss': 374, 'mod': 52, 'patient': 155, 'said': 1329}
1705                           {'agent': 452, 'poss': 423, 'patient': 96, 'said': 525, 'mod': 29}
1706                            {'agent': 110, 'poss': 174, 'patient': 38, 'mod': 10, 'said': 35}
1714                       {'agent': 1930, 'patient': 314, 'poss': 849, 'mod': 108, 'said': 2045}
1715                                                                      {'agent': 7, 'poss': 6}
1717                         {'agent': 571, 'patient': 108, 'said': 1401, 'poss': 277, 'mod': 37}
1719                              {'agent': 97, 'poss': 92, 'mod': 5, 'patient': 29, 'said': 143}
1720                            {'agent': 113, 'poss': 127, 'mod': 8, 'patient': 26, 'said': 405}
1726                                             {'agent': 8, 'poss': 14, 'mod': 1, 'patient': 1}
1732                           {'agent': 213, 'poss': 266, 'patient': 74, 'said': 508, 'mod': 20}
1734                          {'said': 989, 'agent': 421, 'poss': 351, 'mod': 18, 'patient': 123}
1737                          {'agent': 707, 'poss': 694, 'mod': 87, 'patient': 145, 'said': 995}
1739                                            {'agent': 29, 'poss': 23, 'mod': 4, 'patient': 9}
1740                            {'agent': 102, 'poss': 107, 'mod': 21, 'patient': 15, 'said': 33}
1741                      {'agent': 3015, 'poss': 2462, 'mod': 202, 'patient': 913, 'said': 7780}
1742                     {'agent': 5428, 'poss': 4301, 'mod': 308, 'patient': 1589, 'said': 8067}
1744                          {'agent': 404, 'said': 942, 'poss': 324, 'patient': 106, 'mod': 16}
1747                    {'agent': 2648, 'poss': 2218, 'patient': 1167, 'said': 15091, 'mod': 398}
1750                     {'agent': 4735, 'poss': 5068, 'patient': 1624, 'mod': 416, 'said': 7600}
1752                         {'agent': 517, 'poss': 604, 'mod': 53, 'patient': 183, 'said': 1444}
1753                      {'agent': 1395, 'poss': 1344, 'mod': 155, 'patient': 404, 'said': 1912}
1755                         {'agent': 884, 'poss': 728, 'mod': 83, 'patient': 342, 'said': 6982}
1758                       {'agent': 1079, 'poss': 1150, 'patient': 247, 'said': 3419, 'mod': 95}
1759                    {'agent': 5721, 'poss': 3462, 'patient': 1424, 'said': 11070, 'mod': 453}
1760                  {'agent': 11699, 'poss': 9874, 'patient': 4127, 'said': 21079, 'mod': 1035}
1761                    {'agent': 5395, 'poss': 4718, 'mod': 419, 'patient': 1623, 'said': 10857}
1762                          {'poss': 482, 'mod': 38, 'patient': 157, 'said': 655, 'agent': 472}
1763                       {'agent': 2420, 'poss': 1995, 'mod': 71, 'patient': 678, 'said': 3980}
1765                      {'agent': 1292, 'poss': 1343, 'mod': 126, 'patient': 310, 'said': 4032}
                                                  ...                                            
1971             {'agent': 165263, 'poss': 102694, 'said': 85866, 'mod': 11209, 'patient': 26963}
1972    {'agent': 161857, 'poss': 105793, 'patient': 29691, 'said': 115904, 'mod': 11954, '>': 1}
1973            {'agent': 180395, 'poss': 108139, 'patient': 32386, 'said': 155848, 'mod': 13197}
1974            {'agent': 225386, 'poss': 121866, 'mod': 15756, 'patient': 36533, 'said': 123364}
1975            {'agent': 236514, 'poss': 127547, 'mod': 16191, 'said': 121237, 'patient': 37926}
1976            {'agent': 205248, 'poss': 133360, 'mod': 14460, 'patient': 35124, 'said': 114669}
1977            {'agent': 184325, 'poss': 114338, 'mod': 15420, 'patient': 34020, 'said': 119863}
1978            {'agent': 234620, 'said': 151980, 'poss': 132521, 'mod': 17619, 'patient': 40843}
1979            {'agent': 223429, 'poss': 127621, 'mod': 16164, 'patient': 37166, 'said': 124367}
1980            {'agent': 299229, 'poss': 169112, 'mod': 20386, 'patient': 46375, 'said': 164779}
1981            {'agent': 315960, 'poss': 181482, 'mod': 21413, 'patient': 52658, 'said': 170879}
1982            {'said': 176319, 'agent': 313079, 'poss': 176355, 'patient': 54209, 'mod': 23234}
1983            {'agent': 254504, 'poss': 143528, 'mod': 17741, 'patient': 40341, 'said': 150401}
1984            {'agent': 323121, 'poss': 186573, 'mod': 21480, 'patient': 52097, 'said': 158617}
1985            {'agent': 297170, 'poss': 172334, 'patient': 54608, 'said': 129542, 'mod': 22140}
1986            {'agent': 323468, 'poss': 167686, 'patient': 51599, 'mod': 23125, 'said': 180932}
1987            {'agent': 437540, 'poss': 226284, 'mod': 28584, 'patient': 69036, 'said': 238660}
1988            {'agent': 301625, 'poss': 171784, 'patient': 51816, 'mod': 21825, 'said': 178108}
1989            {'agent': 309961, 'poss': 193101, 'mod': 21354, 'patient': 51279, 'said': 182316}
1990            {'said': 207188, 'agent': 382923, 'poss': 207035, 'mod': 27552, 'patient': 63843}
1991            {'agent': 295611, 'poss': 166381, 'mod': 20862, 'patient': 48286, 'said': 163215}
1992            {'agent': 326284, 'poss': 219485, 'mod': 23485, 'patient': 55475, 'said': 211180}
1993            {'agent': 316279, 'poss': 171811, 'said': 194424, 'mod': 22048, 'patient': 51458}
1994            {'agent': 253294, 'poss': 136254, 'mod': 18343, 'patient': 42168, 'said': 145083}
1995            {'agent': 358969, 'said': 208970, 'poss': 198972, 'patient': 60806, 'mod': 24534}
1996            {'agent': 310821, 'said': 180674, 'poss': 175598, 'mod': 21787, 'patient': 52090}
1997            {'agent': 318360, 'poss': 171647, 'mod': 22947, 'patient': 52884, 'said': 190157}
1998            {'agent': 208509, 'poss': 120908, 'mod': 14876, 'patient': 35543, 'said': 123207}
1999            {'agent': 179989, 'poss': 105199, 'mod': 13342, 'patient': 33167, 'said': 117516}
2000            {'agent': 202388, 'poss': 118253, 'said': 104637, 'mod': 15929, 'patient': 36824}
Length: 264, dtype: object



In [35]:

    
type(all_by_year)









    Out[35]:





pandas.core.series.Series



In [46]:

    
# restructure dictionary Counter object into Pandas DataFrame
all_by_year = all_by_year.apply(pd.Series).fillna(0)



In [51]:

    
# drop messy error columns
all_by_year = all_by_year.drop(['>', 'lll>', 'tho', 'to'], axis=1)



In [55]:

    
all_by_year.head()



In [57]:

    
# drop year 0??
# this must mean the pubdate is unknown
all_by_year = all_by_year.drop(0)
all_by_year.head()

Plotting raw word counts associated with characters by year



In [102]:

    
plt.figure(figsize=(15,10))
plt.plot(all_by_year.index, all_by_year.sum(axis=1), label='all words', color='darkred')
plt.title('Word counts for all biography characters')
plt.legend()
plt.show()



In [142]:

    
plt.figure(figsize=(15,10))
plt.plot(all_by_year.index, all_by_year['poss'], label='noun-posessive', color='mediumblue')
plt.plot(all_by_year.index, all_by_year['mod'], label='noun-modifier', color='skyblue')
plt.plot(all_by_year.index, all_by_year['agent'], label='verb-active', color='seagreen')
plt.plot(all_by_year.index, all_by_year['patient'], label='verb-passive', color='lightgreen')
plt.plot(all_by_year.index, all_by_year['said'], label='dialogue', color='dimgrey')
plt.title('Word counts by linguistic category for all biography characters')
plt.legend()
plt.savefig('./raw_word_counts')
plt.show()

plotting relative frequencies by gender

plot by year:
- % of verbs describing women that are active verbs
- % of verbs describing men that are active verbs



In [103]:

    
all_by_gender_yr = bio.groupby(['pubdate', 'chargender']).apply(get_counts)



In [104]:

    
all_by_gender_yr.head()









    Out[104]:





pubdate  chargender
0        f                                    {'agent': 15, 'poss': 7, 'mod': 3, 'said': 37}
         m             {'agent': 1169, 'poss': 828, 'mod': 57, 'patient': 179, 'said': 1784}
         u                                               {'poss': 3, 'said': 43, 'agent': 1}
1701     m               {'agent': 883, 'poss': 370, 'mod': 49, 'patient': 144, 'said': 713}
         u                    {'agent': 26, 'patient': 11, 'said': 616, 'mod': 3, 'poss': 4}
dtype: object



In [105]:

    
all_by_gender_yr = all_by_gender_yr.apply(pd.Series).fillna(0)
all_by_gender_yr.head()



In [109]:

    
# drop messy error columns
all_by_gender_yr = all_by_gender_yr.drop(['>', 'lll>', 'tho', 'to'], axis=1)
all_by_gender_yr = all_by_gender_yr.drop(0)
all_by_gender_yr.head()



In [162]:

    
all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'm', 
].head()



In [166]:

    
prcnt_dialogue_m = all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'm', 'said'
] / all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'm'].sum(axis=1)

prcnt_dialogue_f = all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'f', 'said'
] / all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'f'].sum(axis=1)



In [169]:

    
plt.figure(figsize=(15,10))
plt.plot(prcnt_dialogue_m.index.get_level_values('pubdate'), 
         prcnt_dialogue_m, label='Male - % words that are dialogue', color='mediumblue')
plt.plot(prcnt_dialogue_f.index.get_level_values('pubdate'), 
         prcnt_dialogue_f, label='Female - % words that are dialogue', color='darkred')
plt.title('Percent of words associated with male vs female characters that are dialogue')
plt.legend()
plt.savefig('./prcnt_dialogue')
plt.show()



In [132]:

    
prcnt_active_f = all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'f', 'agent'
] / (all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'f', 'agent'
] + all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'f', 'patient'
])



In [133]:

    
prcnt_active_m = all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'm', 'agent'
] / (all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'm', 'agent'
] + all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'm', 'patient'
])



In [141]:

    
plt.figure(figsize=(15,10))
plt.plot(prcnt_active_f.index.get_level_values('pubdate'), 
         prcnt_active_f,
         label='Female - % active verbs', color='darkred')
plt.plot(prcnt_active_m.index.get_level_values('pubdate'), 
         prcnt_active_m,
         label='Male - % active verbs', color='mediumblue')
plt.title('Percentage of verbs that are active verbs for male vs female characters')
plt.legend()
plt.savefig('./prcnt_active_verbs')
plt.show()



In [149]:

    
prcnt_poss_f = all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'f', 'poss'
] / (all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'f', 'poss'
] + all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'f', 'mod'
])

prcnt_poss_m = all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'm', 'poss'
] / (all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'm', 'poss'
] + all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'm', 'mod'
])



In [151]:

    
plt.figure(figsize=(15,10))
plt.plot(prcnt_poss_f.index.get_level_values('pubdate'), 
         prcnt_poss_f,
         label='Female - % possessive nouns', color='darkred')
plt.plot(prcnt_poss_m.index.get_level_values('pubdate'), 
         prcnt_poss_m,
         label='Male - % possessive nouns', color='mediumblue')
plt.title('Percentage of nouns that are possessive nouns for male vs female characters')
plt.legend()
plt.savefig('./prcnt_poss_nouns')
plt.show()



In [ ]:



In [ ]:



In [ ]:

Get the raw counts of present vs past tense, in bio

import and train the Unigram Tagger

- this POS tagger is meant to work with words out of context
- we have "agent" vs "patient" tags but not "past-tense" vs "present-tense"



In [ ]:

    
brown_sents = brown.tagged_sents()
# Split the data into train and test sets.
train = int(len(brown_sents)*90/100) # use 90% for training
# Trains the tagger
uni_tag = ut(brown_sents[:train]) # this will take some time, ~1-2 mins
# evaluate on 10%
uni_tag.evaluate(brown_sents[train+1:]) # will also take ~1-2 mins
# Tags a random sentence
#uni_tag.tag("this is a foo bar sentence .".split())



In [ ]:

    
def get_past_tense(words):
    tagged = uni_tag.tag(words)
    tagged = [list(tup) for tup in tagged]
    print(tagged)
    for word_tag in tagged:
        if word_tag[0].startswith('was-'):
            word_tag[1] = 'VBN'
    past_words = [item[0] for item in tagged if item[1] in ['VBN', 'VBD']]
    return past_words

def get_present_tense(words):
    tagged = uni_tag.tag(words)
    tagged = [list(tup) for tup in tagged]
    present_words = [item[0] for item in tagged if item[1] in ['VB','VBZ', 'VBG', 'VBP']]
    return present_words



In [ ]:

    
[tuple(item.split('-')) for sublist in post23.words[:5].str.split() for item in sublist]



In [ ]:

    
# flat_list = [item for sublist in l for item in sublist]
pre_words = [word for sublist in pre23['words'] for word in sublist.split()]



In [ ]:

    
pre_words[:10]



In [ ]:

    
post_words = [word for sublist in post23['words'] for word in sublist.split()]



In [ ]:

    
print(len(post_words))
print(len(pre_words))



In [ ]:

    
counts = Counter(post_words + pre_words)



In [ ]:

    
len(bio_words)



In [ ]:

    
len(postbio_words)



In [ ]:

    
Counter(words)

	agent	mod	patient	poss	said
pubdate
0	1185.0	60.0	179.0	838.0	1864.0
1701	909.0	52.0	155.0	374.0	1329.0
1705	452.0	29.0	96.0	423.0	525.0
1706	110.0	10.0	38.0	174.0	35.0
1714	1930.0	108.0	314.0	849.0	2045.0

	agent	mod	patient	poss	said
pubdate
1701	909.0	52.0	155.0	374.0	1329.0
1705	452.0	29.0	96.0	423.0	525.0
1706	110.0	10.0	38.0	174.0	35.0
1714	1930.0	108.0	314.0	849.0	2045.0
1715	7.0	0.0	0.0	6.0	0.0

		>	agent	lll>	mod	patient	poss	said	tho	to
pubdate	chargender
0	f	0.0	15.0	0.0	3.0	0.0	7.0	37.0	0.0	0.0
	m	0.0	1169.0	0.0	57.0	179.0	828.0	1784.0	0.0	0.0
	u	0.0	1.0	0.0	0.0	0.0	3.0	43.0	0.0	0.0
1701	m	0.0	883.0	0.0	49.0	144.0	370.0	713.0	0.0	0.0
1701	u	0.0	26.0	0.0	3.0	11.0	4.0	616.0	0.0	0.0

		agent	mod	patient	poss	said
pubdate	chargender
1701	m	883.0	49.0	144.0	370.0	713.0
1701	u	26.0	3.0	11.0	4.0	616.0
1705	m	436.0	29.0	90.0	423.0	239.0
1705	u	16.0	0.0	6.0	0.0	286.0
1706	f	18.0	0.0	7.0	32.0	0.0

		agent	mod	patient	poss	said
pubdate	chargender
1701	m	883.0	49.0	144.0	370.0	713.0
1705	m	436.0	29.0	90.0	423.0	239.0
1706	m	92.0	10.0	31.0	142.0	35.0
1714	m	1912.0	106.0	281.0	841.0	1937.0
1715	m	7.0	0.0	0.0	6.0	0.0

	docid	charname	charid	chargender	authgender	pubdate	words
0	yale.39002005473021	BaronShibusawa	yale.39002005473021\|BaronShibusawa	m	m	1920	agent-is agent-has agent-given agent-does agen...
1	uc2.ark+=13960=t3222sv4b	LordSalisbury	uc2.ark+=13960=t3222sv4b\|LordSalisbury	m	f	1907	agent-wrote agent-says agent-succeeded agent-o...
2	uc2.ark+=13960=t3222sv4b	Nicholas	uc2.ark+=13960=t3222sv4b\|Nicholas	m	f	1907	agent-liked agent-says agent-was agent-worked ...
3	uc2.ark+=13960=t3222sv4b	Mr.GathorneHardy	uc2.ark+=13960=t3222sv4b\|Mr.GathorneHardy	m	f	1907	agent-put agent-made agent-were agent-recognis...
4	uc2.ark+=13960=t3222sv4b	Wilberforce	uc2.ark+=13960=t3222sv4b\|Wilberforce	m	f	1907	agent-held agent-foresaw agent-had poss-life p...