In [37]:
import pandas as pd
import nltk
from collections import Counter
from nltk.corpus import brown
from nltk import UnigramTagger as ut
import matplotlib.pyplot as plt

In [3]:
post23 = pd.read_csv('./Oct15_post23bio.tsv', sep='\t')

In [4]:
pre23 = pd.read_csv('./Oct15_pre23bio.tsv', sep='\t')

In [7]:
bio = pd.concat([pre23, post23], ignore_index=True)

In [7]:
# bio.to_csv('./Oct15_bio.tsv', sep='\t', index=False)

In [8]:
bio.head()


Out[8]:
docid charname charid chargender authgender pubdate words
0 yale.39002005473021 BaronShibusawa yale.39002005473021|BaronShibusawa m m 1920 agent-is agent-has agent-given agent-does agen...
1 uc2.ark+=13960=t3222sv4b LordSalisbury uc2.ark+=13960=t3222sv4b|LordSalisbury m f 1907 agent-wrote agent-says agent-succeeded agent-o...
2 uc2.ark+=13960=t3222sv4b Nicholas uc2.ark+=13960=t3222sv4b|Nicholas m f 1907 agent-liked agent-says agent-was agent-worked ...
3 uc2.ark+=13960=t3222sv4b Mr.GathorneHardy uc2.ark+=13960=t3222sv4b|Mr.GathorneHardy m f 1907 agent-put agent-made agent-were agent-recognis...
4 uc2.ark+=13960=t3222sv4b Wilberforce uc2.ark+=13960=t3222sv4b|Wilberforce m f 1907 agent-held agent-foresaw agent-had poss-life p...

In [9]:
print(post23.shape)
print(pre23.shape)
print(bio.shape)


(515872, 7)
(855434, 7)
(1371306, 7)

In [10]:
pd.set_option('max_colwidth', 500)
bio.words[:5]


Out[10]:
0                                                                                                                       agent-is agent-has agent-given agent-does agent-takes agent-explained agent-trying agent-thinks agent-get agent-have agent-sent agent-feared poss-theory poss-money poss-influenza mod-confucianist mod-desirous
1                                    agent-wrote agent-says agent-succeeded agent-offered agent-wrote agent-made agent-returned agent-ceased agent-selected agent-see poss-brother poss-government poss-offer poss-cousin poss-post poss-title poss-government poss-instructions patient-chosen patient-informing said-peace said-honour
2                                                                                                                                                                                                         agent-liked agent-says agent-was agent-worked poss-sister poss-letters poss-family poss-sister said-liked said-seen said-heard
3                                                                                                                                                         agent-put agent-made agent-were agent-recognised agent-considered agent-maintained poss-reply patient-furnished patient-entered patient-raised patient-replaced said-emergency
4    agent-held agent-foresaw agent-had poss-life poss-estates poss-mind mod-promoter said-hearts said-retire said-often said-littleness said-common said-greatness said-god said-presence said-mercy said-grow said-day said-day said-nobleness said-aim said-quick said-steadiness said-purpose said-greatness said-conduct said-makes
Name: words, dtype: object

In [11]:
# tuples (tag, word)
# [tuple(item.split('-')) for sublist in bio.words[:5].str.split() for item in sublist]

# just words
# [item.split('-')[1] for sublist in bio.words[:5].str.split() for item in sublist]

# just tags
# [item.split('-')[0] for sublist in bio.words[:5].str.split() for item in sublist]

# one big dict
# for k, v in [tuple(item.split('-')) for sublist in bio.words[:5].str.split() for item in sublist]:
#     d[k].append(v)

from collections import defaultdict

def pos_to_dict(words_str):
    d = defaultdict( list )
    try:
        for k, v in [ tuple(item.split('-')) for item in words_str.split() ]:
            d[k].append(v)
    except (ValueError):
        # where the word has multiple hyphens, split off only the first one as the pos key
        for k, v in [ (item.split('-')[0], item[len(item.split('-')[0])+1:]) for item in words_str.split() ]:
            d[k].append(v)
    except (AttributeError, TypeError):
        return words_str
    return d

bio['words'] = bio['words'].apply(pos_to_dict)

In [13]:
def count_pos(pos_dict):
    d = { k:len(v) for k, v in pos_dict.items() }
    return d

bio['count_pos'] = bio['words'].apply(count_pos)

In [20]:
bio.head(10)


Out[20]:
docid charname charid chargender authgender pubdate words count_pos
0 yale.39002005473021 BaronShibusawa yale.39002005473021|BaronShibusawa m m 1920 {'agent': ['is', 'has', 'given', 'does', 'takes', 'explained', 'trying', 'thinks', 'get', 'have', 'sent', 'feared'], 'poss': ['theory', 'money', 'influenza'], 'mod': ['confucianist', 'desirous']} {'agent': 12, 'poss': 3, 'mod': 2}
1 uc2.ark+=13960=t3222sv4b LordSalisbury uc2.ark+=13960=t3222sv4b|LordSalisbury m f 1907 {'agent': ['wrote', 'says', 'succeeded', 'offered', 'wrote', 'made', 'returned', 'ceased', 'selected', 'see'], 'poss': ['brother', 'government', 'offer', 'cousin', 'post', 'title', 'government', 'instructions'], 'patient': ['chosen', 'informing'], 'said': ['peace', 'honour']} {'agent': 10, 'poss': 8, 'patient': 2, 'said': 2}
2 uc2.ark+=13960=t3222sv4b Nicholas uc2.ark+=13960=t3222sv4b|Nicholas m f 1907 {'agent': ['liked', 'says', 'was', 'worked'], 'poss': ['sister', 'letters', 'family', 'sister'], 'said': ['liked', 'seen', 'heard']} {'agent': 4, 'poss': 4, 'said': 3}
3 uc2.ark+=13960=t3222sv4b Mr.GathorneHardy uc2.ark+=13960=t3222sv4b|Mr.GathorneHardy m f 1907 {'agent': ['put', 'made', 'were', 'recognised', 'considered', 'maintained'], 'poss': ['reply'], 'patient': ['furnished', 'entered', 'raised', 'replaced'], 'said': ['emergency']} {'agent': 6, 'poss': 1, 'patient': 4, 'said': 1}
4 uc2.ark+=13960=t3222sv4b Wilberforce uc2.ark+=13960=t3222sv4b|Wilberforce m f 1907 {'agent': ['held', 'foresaw', 'had'], 'poss': ['life', 'estates', 'mind'], 'mod': ['promoter'], 'said': ['hearts', 'retire', 'often', 'littleness', 'common', 'greatness', 'god', 'presence', 'mercy', 'grow', 'day', 'day', 'nobleness', 'aim', 'quick', 'steadiness', 'purpose', 'greatness', 'conduct', 'makes']} {'agent': 3, 'poss': 3, 'mod': 1, 'said': 20}
5 uc2.ark+=13960=t3222sv4b LordBeaconsfield uc2.ark+=13960=t3222sv4b|LordBeaconsfield m f 1907 {'agent': ['shown', 'offered', 'making', 'tendered', 'wrote', 'looked', 'shown', 'offered', 'making', 'tendered', 'wrote', 'looked'], 'poss': ['judgement', 'accepting', 'resignation', 'successor', 'gladstone', 'engagement', 'life', 'host', 'party', 'judgement', 'accepting', 'resignation', 'successor', 'gladstone', 'engagement', 'life', 'host', 'party'], 'patient': ['summoned', 'showed', 'summoned', 'showed'], 'said': ['obliged', 'forego', 'honour', 'pleasure', 'guest', 'obliged', 'forego', '... {'agent': 12, 'poss': 18, 'patient': 4, 'said': 20}
6 uc2.ark+=13960=t3222sv4b Bob uc2.ark+=13960=t3222sv4b|Bob m f 1907 {'agent': ['looking', 'brings', 'has', 'seems', 'approve', 'carried', 'was', 'make', 'looked', 'came', 'wished', 'won', 'disarmed', 'do', 'get', 'got', 'bought', 'done', 'be'], 'poss': ['career', 'forehead', 'mouth', 'movements', 'companions', 'family', 'beauty', 'sergeant', 'father', 'permission'], 'mod': ['slender', 'graceful', 'quiet', 'sarcastic', 'calm', 'person'], 'patient': ['love', 'saw', 'marked', 'saves', 'carried', 'thanking', 'assuring']} {'agent': 19, 'poss': 10, 'mod': 6, 'patient': 7}
7 uc2.ark+=13960=t3222sv4b Earl uc2.ark+=13960=t3222sv4b|Earl m f 1907 {'agent': ['sold', 'came', 'resigned', 'pass'], 'poss': ['marriage', 'mines', 'brother', 'history', 'crimea'], 'said': ['history', 'lives', 'lindsays', 'christian', 'art']} {'agent': 4, 'poss': 5, 'said': 5}
8 uc2.ark+=13960=t3222sv4b Committee uc2.ark+=13960=t3222sv4b|Committee u f 1907 {'agent': ['made', 'decided', 'directed', 'consider', 'attributed', 'is'], 'patient': ['had'], 'said': ['terms', 'conditions', 'service', 'army']} {'agent': 6, 'patient': 1, 'said': 4}
9 uc2.ark+=13960=t3222sv4b FrederickStephenson uc2.ark+=13960=t3222sv4b|FrederickStephenson m f 1907 {'agent': ['join', 'took', 'invalided', 'was', 'have', 'pitched', 'loved', 'coming'], 'poss': ['officers', 'senior', 'friend', 'regiment', 'father', 'regiment', 'father', 'country', 'career', 'richmond'], 'mod': ['adjutant'], 'patient': ['telling', 'include', 'invalided', 'exhausted', 'welcome']} {'agent': 8, 'poss': 10, 'mod': 1, 'patient': 5}

Questions:

  • are the relative frequency of verbs associated with male characters higher?
    • verbs: agent (object), patient (passive)
    • nouns: mod, poss
  • for feminine and masuline characters are there different ratios for those roles
    • (agent-patient of verb), (poss-mod of noun)
  • 'up, down, away, there, again'
    • poss role? or mod role?

In [21]:
from collections import Counter

def get_counts(df):
    c = Counter()
    for idx, row in df['count_pos'].items():
        c.update(row)
    return c

In [27]:
all_counts = get_counts(bio)

In [22]:
male_char_counts = get_counts(bio.loc[bio.chargender == 'm',:])
female_char_counts = get_counts(bio.loc[bio.chargender == 'f',:])

In [33]:
male_auth_counts = get_counts(bio.loc[bio.authgender == 'm',:])
female_auth_counts = get_counts(bio.loc[bio.authgender == 'f',:])

In [27]:
male_char_counts


Out[27]:
Counter({'agent': 31365088,
         'lll>': 1,
         'mod': 2593052,
         'patient': 6141017,
         'poss': 21493348,
         'said': 28788655,
         'tho': 1,
         'to': 1})

Is the relative frequency of verbs associated with male characters higher?


In [154]:
print('relative % of verbs associated with male characters', (male_char_counts['agent'] 
                                                              + male_char_counts['patient']) 
      / (male_char_counts['agent'] 
         + male_char_counts['patient'] 
         + male_char_counts['mod'] 
         + male_char_counts['poss'] 
         + male_char_counts['said'])
     )
print('relative % of verbs associated with female characters', (female_char_counts['agent'] 
                                                                + female_char_counts['patient']) 
      / (female_char_counts['agent'] 
         + female_char_counts['patient'] 
         + female_char_counts['mod'] 
         + female_char_counts['poss'] 
         + female_char_counts['said'])
     )


relative % of verbs associated with male characters 0.41497702618554577
relative % of verbs associated with female characters 0.4330731659372056

Percentage of verbs that are active


In [158]:
print('% of verbs describing men that are active verbs', (male_char_counts['agent'] 
                                                          / (male_char_counts['agent'] 
                                                             + male_char_counts['patient'])
                                                         ))

print('% of verbs describing women that are active verbs', (female_char_counts['agent'] 
                                                            / (female_char_counts['agent'] 
                                                               + female_char_counts['patient'])
                                                           ))


% of verbs describing men that are active verbs 0.8362662025288949
% of verbs describing women that are active verbs 0.8437637681994732

Percentage of nouns that are possessive


In [159]:
print('% of nouns describing men that are possessive nouns', (male_char_counts['poss'] 
                                                              / (male_char_counts['poss'] 
                                                                 + male_char_counts['mod'])
                                                             ))

print('% of nouns describing women that are possessive nouns', (female_char_counts['poss'] 
                                                                / (female_char_counts['poss'] 
                                                                   + female_char_counts['mod'])
                                                               ))


% of nouns describing men that are possessive nouns 0.8923437292413976
% of nouns describing women that are possessive nouns 0.8992377751419598

Is the relative frequency of dialogue words associated with male characters higher?


In [160]:
print('relative % of dialog words assocatied with male characters', (male_char_counts['said']) 
      / (male_char_counts['agent'] 
         + male_char_counts['patient'] 
         + male_char_counts['mod'] 
         + male_char_counts['poss'] 
         + male_char_counts['said'])
     )
print('relative % of dialog words associated with female characters', (female_char_counts['said']) 
      / (female_char_counts['agent'] 
         + female_char_counts['patient'] 
         + female_char_counts['mod'] 
         + female_char_counts['poss'] 
         + female_char_counts['said'] )
     )


relative % of dialog words assocatied with male characters 0.3185249558646957
relative % of dialog words associated with female characters 0.28209319761216356
  • what are the verbs being used most by woman? (authored)
    • verbs: agent (object), patient (passive)
    • nouns: mod, poss
  • plot by year:
    • % of verbs describing women that are active verbs
    • % of verbs describing men that are active verbs
  • raw data of character sizes
    • plotting trend over time

path, '/projects/ischoolichass/ichass/usesofscale/chardata'


In [31]:
all_by_year = bio.groupby('pubdate').apply(get_counts)


Out[31]:
pubdate
0                           {'agent': 1185, 'poss': 838, 'mod': 60, 'patient': 179, 'said': 1864}
1701                         {'agent': 909, 'poss': 374, 'mod': 52, 'patient': 155, 'said': 1329}
1705                           {'agent': 452, 'poss': 423, 'patient': 96, 'said': 525, 'mod': 29}
1706                            {'agent': 110, 'poss': 174, 'patient': 38, 'mod': 10, 'said': 35}
1714                       {'agent': 1930, 'patient': 314, 'poss': 849, 'mod': 108, 'said': 2045}
1715                                                                      {'agent': 7, 'poss': 6}
1717                         {'agent': 571, 'patient': 108, 'said': 1401, 'poss': 277, 'mod': 37}
1719                              {'agent': 97, 'poss': 92, 'mod': 5, 'patient': 29, 'said': 143}
1720                            {'agent': 113, 'poss': 127, 'mod': 8, 'patient': 26, 'said': 405}
1726                                             {'agent': 8, 'poss': 14, 'mod': 1, 'patient': 1}
1732                           {'agent': 213, 'poss': 266, 'patient': 74, 'said': 508, 'mod': 20}
1734                          {'said': 989, 'agent': 421, 'poss': 351, 'mod': 18, 'patient': 123}
1737                          {'agent': 707, 'poss': 694, 'mod': 87, 'patient': 145, 'said': 995}
1739                                            {'agent': 29, 'poss': 23, 'mod': 4, 'patient': 9}
1740                            {'agent': 102, 'poss': 107, 'mod': 21, 'patient': 15, 'said': 33}
1741                      {'agent': 3015, 'poss': 2462, 'mod': 202, 'patient': 913, 'said': 7780}
1742                     {'agent': 5428, 'poss': 4301, 'mod': 308, 'patient': 1589, 'said': 8067}
1744                          {'agent': 404, 'said': 942, 'poss': 324, 'patient': 106, 'mod': 16}
1747                    {'agent': 2648, 'poss': 2218, 'patient': 1167, 'said': 15091, 'mod': 398}
1750                     {'agent': 4735, 'poss': 5068, 'patient': 1624, 'mod': 416, 'said': 7600}
1752                         {'agent': 517, 'poss': 604, 'mod': 53, 'patient': 183, 'said': 1444}
1753                      {'agent': 1395, 'poss': 1344, 'mod': 155, 'patient': 404, 'said': 1912}
1755                         {'agent': 884, 'poss': 728, 'mod': 83, 'patient': 342, 'said': 6982}
1758                       {'agent': 1079, 'poss': 1150, 'patient': 247, 'said': 3419, 'mod': 95}
1759                    {'agent': 5721, 'poss': 3462, 'patient': 1424, 'said': 11070, 'mod': 453}
1760                  {'agent': 11699, 'poss': 9874, 'patient': 4127, 'said': 21079, 'mod': 1035}
1761                    {'agent': 5395, 'poss': 4718, 'mod': 419, 'patient': 1623, 'said': 10857}
1762                          {'poss': 482, 'mod': 38, 'patient': 157, 'said': 655, 'agent': 472}
1763                       {'agent': 2420, 'poss': 1995, 'mod': 71, 'patient': 678, 'said': 3980}
1765                      {'agent': 1292, 'poss': 1343, 'mod': 126, 'patient': 310, 'said': 4032}
                                                  ...                                            
1971             {'agent': 165263, 'poss': 102694, 'said': 85866, 'mod': 11209, 'patient': 26963}
1972    {'agent': 161857, 'poss': 105793, 'patient': 29691, 'said': 115904, 'mod': 11954, '>': 1}
1973            {'agent': 180395, 'poss': 108139, 'patient': 32386, 'said': 155848, 'mod': 13197}
1974            {'agent': 225386, 'poss': 121866, 'mod': 15756, 'patient': 36533, 'said': 123364}
1975            {'agent': 236514, 'poss': 127547, 'mod': 16191, 'said': 121237, 'patient': 37926}
1976            {'agent': 205248, 'poss': 133360, 'mod': 14460, 'patient': 35124, 'said': 114669}
1977            {'agent': 184325, 'poss': 114338, 'mod': 15420, 'patient': 34020, 'said': 119863}
1978            {'agent': 234620, 'said': 151980, 'poss': 132521, 'mod': 17619, 'patient': 40843}
1979            {'agent': 223429, 'poss': 127621, 'mod': 16164, 'patient': 37166, 'said': 124367}
1980            {'agent': 299229, 'poss': 169112, 'mod': 20386, 'patient': 46375, 'said': 164779}
1981            {'agent': 315960, 'poss': 181482, 'mod': 21413, 'patient': 52658, 'said': 170879}
1982            {'said': 176319, 'agent': 313079, 'poss': 176355, 'patient': 54209, 'mod': 23234}
1983            {'agent': 254504, 'poss': 143528, 'mod': 17741, 'patient': 40341, 'said': 150401}
1984            {'agent': 323121, 'poss': 186573, 'mod': 21480, 'patient': 52097, 'said': 158617}
1985            {'agent': 297170, 'poss': 172334, 'patient': 54608, 'said': 129542, 'mod': 22140}
1986            {'agent': 323468, 'poss': 167686, 'patient': 51599, 'mod': 23125, 'said': 180932}
1987            {'agent': 437540, 'poss': 226284, 'mod': 28584, 'patient': 69036, 'said': 238660}
1988            {'agent': 301625, 'poss': 171784, 'patient': 51816, 'mod': 21825, 'said': 178108}
1989            {'agent': 309961, 'poss': 193101, 'mod': 21354, 'patient': 51279, 'said': 182316}
1990            {'said': 207188, 'agent': 382923, 'poss': 207035, 'mod': 27552, 'patient': 63843}
1991            {'agent': 295611, 'poss': 166381, 'mod': 20862, 'patient': 48286, 'said': 163215}
1992            {'agent': 326284, 'poss': 219485, 'mod': 23485, 'patient': 55475, 'said': 211180}
1993            {'agent': 316279, 'poss': 171811, 'said': 194424, 'mod': 22048, 'patient': 51458}
1994            {'agent': 253294, 'poss': 136254, 'mod': 18343, 'patient': 42168, 'said': 145083}
1995            {'agent': 358969, 'said': 208970, 'poss': 198972, 'patient': 60806, 'mod': 24534}
1996            {'agent': 310821, 'said': 180674, 'poss': 175598, 'mod': 21787, 'patient': 52090}
1997            {'agent': 318360, 'poss': 171647, 'mod': 22947, 'patient': 52884, 'said': 190157}
1998            {'agent': 208509, 'poss': 120908, 'mod': 14876, 'patient': 35543, 'said': 123207}
1999            {'agent': 179989, 'poss': 105199, 'mod': 13342, 'patient': 33167, 'said': 117516}
2000            {'agent': 202388, 'poss': 118253, 'said': 104637, 'mod': 15929, 'patient': 36824}
Length: 264, dtype: object

In [35]:
type(all_by_year)


Out[35]:
pandas.core.series.Series

In [46]:
# restructure dictionary Counter object into Pandas DataFrame
all_by_year = all_by_year.apply(pd.Series).fillna(0)

In [51]:
# drop messy error columns
all_by_year = all_by_year.drop(['>', 'lll>', 'tho', 'to'], axis=1)

In [55]:
all_by_year.head()


Out[55]:
agent mod patient poss said
pubdate
0 1185.0 60.0 179.0 838.0 1864.0
1701 909.0 52.0 155.0 374.0 1329.0
1705 452.0 29.0 96.0 423.0 525.0
1706 110.0 10.0 38.0 174.0 35.0
1714 1930.0 108.0 314.0 849.0 2045.0

In [57]:
# drop year 0??
# this must mean the pubdate is unknown
all_by_year = all_by_year.drop(0)
all_by_year.head()


Out[57]:
agent mod patient poss said
pubdate
1701 909.0 52.0 155.0 374.0 1329.0
1705 452.0 29.0 96.0 423.0 525.0
1706 110.0 10.0 38.0 174.0 35.0
1714 1930.0 108.0 314.0 849.0 2045.0
1715 7.0 0.0 0.0 6.0 0.0

Plotting raw word counts associated with characters by year


In [102]:
plt.figure(figsize=(15,10))
plt.plot(all_by_year.index, all_by_year.sum(axis=1), label='all words', color='darkred')
plt.title('Word counts for all biography characters')
plt.legend()
plt.show()



In [142]:
plt.figure(figsize=(15,10))
plt.plot(all_by_year.index, all_by_year['poss'], label='noun-posessive', color='mediumblue')
plt.plot(all_by_year.index, all_by_year['mod'], label='noun-modifier', color='skyblue')
plt.plot(all_by_year.index, all_by_year['agent'], label='verb-active', color='seagreen')
plt.plot(all_by_year.index, all_by_year['patient'], label='verb-passive', color='lightgreen')
plt.plot(all_by_year.index, all_by_year['said'], label='dialogue', color='dimgrey')
plt.title('Word counts by linguistic category for all biography characters')
plt.legend()
plt.savefig('./raw_word_counts')
plt.show()


plotting relative frequencies by gender

  • plot by year:
    • % of verbs describing women that are active verbs
    • % of verbs describing men that are active verbs

In [103]:
all_by_gender_yr = bio.groupby(['pubdate', 'chargender']).apply(get_counts)

In [104]:
all_by_gender_yr.head()


Out[104]:
pubdate  chargender
0        f                                    {'agent': 15, 'poss': 7, 'mod': 3, 'said': 37}
         m             {'agent': 1169, 'poss': 828, 'mod': 57, 'patient': 179, 'said': 1784}
         u                                               {'poss': 3, 'said': 43, 'agent': 1}
1701     m               {'agent': 883, 'poss': 370, 'mod': 49, 'patient': 144, 'said': 713}
         u                    {'agent': 26, 'patient': 11, 'said': 616, 'mod': 3, 'poss': 4}
dtype: object

In [105]:
all_by_gender_yr = all_by_gender_yr.apply(pd.Series).fillna(0)
all_by_gender_yr.head()


Out[105]:
> agent lll> mod patient poss said tho to
pubdate chargender
0 f 0.0 15.0 0.0 3.0 0.0 7.0 37.0 0.0 0.0
m 0.0 1169.0 0.0 57.0 179.0 828.0 1784.0 0.0 0.0
u 0.0 1.0 0.0 0.0 0.0 3.0 43.0 0.0 0.0
1701 m 0.0 883.0 0.0 49.0 144.0 370.0 713.0 0.0 0.0
u 0.0 26.0 0.0 3.0 11.0 4.0 616.0 0.0 0.0

In [109]:
# drop messy error columns
all_by_gender_yr = all_by_gender_yr.drop(['>', 'lll>', 'tho', 'to'], axis=1)
all_by_gender_yr = all_by_gender_yr.drop(0)
all_by_gender_yr.head()


Out[109]:
agent mod patient poss said
pubdate chargender
1701 m 883.0 49.0 144.0 370.0 713.0
u 26.0 3.0 11.0 4.0 616.0
1705 m 436.0 29.0 90.0 423.0 239.0
u 16.0 0.0 6.0 0.0 286.0
1706 f 18.0 0.0 7.0 32.0 0.0

In [162]:
all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'm', 
].head()


Out[162]:
agent mod patient poss said
pubdate chargender
1701 m 883.0 49.0 144.0 370.0 713.0
1705 m 436.0 29.0 90.0 423.0 239.0
1706 m 92.0 10.0 31.0 142.0 35.0
1714 m 1912.0 106.0 281.0 841.0 1937.0
1715 m 7.0 0.0 0.0 6.0 0.0

In [166]:
prcnt_dialogue_m = all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'm', 'said'
] / all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'm'].sum(axis=1)

prcnt_dialogue_f = all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'f', 'said'
] / all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'f'].sum(axis=1)

In [169]:
plt.figure(figsize=(15,10))
plt.plot(prcnt_dialogue_m.index.get_level_values('pubdate'), 
         prcnt_dialogue_m, label='Male - % words that are dialogue', color='mediumblue')
plt.plot(prcnt_dialogue_f.index.get_level_values('pubdate'), 
         prcnt_dialogue_f, label='Female - % words that are dialogue', color='darkred')
plt.title('Percent of words associated with male vs female characters that are dialogue')
plt.legend()
plt.savefig('./prcnt_dialogue')
plt.show()



In [132]:
prcnt_active_f = all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'f', 'agent'
] / (all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'f', 'agent'
] + all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'f', 'patient'
])

In [133]:
prcnt_active_m = all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'm', 'agent'
] / (all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'm', 'agent'
] + all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'm', 'patient'
])

In [141]:
plt.figure(figsize=(15,10))
plt.plot(prcnt_active_f.index.get_level_values('pubdate'), 
         prcnt_active_f,
         label='Female - % active verbs', color='darkred')
plt.plot(prcnt_active_m.index.get_level_values('pubdate'), 
         prcnt_active_m,
         label='Male - % active verbs', color='mediumblue')
plt.title('Percentage of verbs that are active verbs for male vs female characters')
plt.legend()
plt.savefig('./prcnt_active_verbs')
plt.show()



In [149]:
prcnt_poss_f = all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'f', 'poss'
] / (all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'f', 'poss'
] + all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'f', 'mod'
])

prcnt_poss_m = all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'm', 'poss'
] / (all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'm', 'poss'
] + all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'm', 'mod'
])

In [151]:
plt.figure(figsize=(15,10))
plt.plot(prcnt_poss_f.index.get_level_values('pubdate'), 
         prcnt_poss_f,
         label='Female - % possessive nouns', color='darkred')
plt.plot(prcnt_poss_m.index.get_level_values('pubdate'), 
         prcnt_poss_m,
         label='Male - % possessive nouns', color='mediumblue')
plt.title('Percentage of nouns that are possessive nouns for male vs female characters')
plt.legend()
plt.savefig('./prcnt_poss_nouns')
plt.show()



In [ ]:


In [ ]:


In [ ]:

Get the raw counts of present vs past tense, in bio

import and train the Unigram Tagger

- this POS tagger is meant to work with words out of context
- we have "agent" vs "patient" tags but not "past-tense" vs "present-tense"

In [ ]:
brown_sents = brown.tagged_sents()
# Split the data into train and test sets.
train = int(len(brown_sents)*90/100) # use 90% for training
# Trains the tagger
uni_tag = ut(brown_sents[:train]) # this will take some time, ~1-2 mins
# evaluate on 10%
uni_tag.evaluate(brown_sents[train+1:]) # will also take ~1-2 mins
# Tags a random sentence
#uni_tag.tag("this is a foo bar sentence .".split())

In [ ]:
def get_past_tense(words):
    tagged = uni_tag.tag(words)
    tagged = [list(tup) for tup in tagged]
    print(tagged)
    for word_tag in tagged:
        if word_tag[0].startswith('was-'):
            word_tag[1] = 'VBN'
    past_words = [item[0] for item in tagged if item[1] in ['VBN', 'VBD']]
    return past_words

def get_present_tense(words):
    tagged = uni_tag.tag(words)
    tagged = [list(tup) for tup in tagged]
    present_words = [item[0] for item in tagged if item[1] in ['VB','VBZ', 'VBG', 'VBP']]
    return present_words

In [ ]:
[tuple(item.split('-')) for sublist in post23.words[:5].str.split() for item in sublist]

In [ ]:
# flat_list = [item for sublist in l for item in sublist]
pre_words = [word for sublist in pre23['words'] for word in sublist.split()]

In [ ]:
pre_words[:10]

In [ ]:
post_words = [word for sublist in post23['words'] for word in sublist.split()]

In [ ]:
print(len(post_words))
print(len(pre_words))

In [ ]:
counts = Counter(post_words + pre_words)

In [ ]:
len(bio_words)

In [ ]:
len(postbio_words)

In [ ]:
Counter(words)