In [37]:
import pandas as pd
import nltk
from collections import Counter
from nltk.corpus import brown
from nltk import UnigramTagger as ut
import matplotlib.pyplot as plt
In [3]:
post23 = pd.read_csv('./Oct15_post23bio.tsv', sep='\t')
In [4]:
pre23 = pd.read_csv('./Oct15_pre23bio.tsv', sep='\t')
In [7]:
bio = pd.concat([pre23, post23], ignore_index=True)
In [7]:
# bio.to_csv('./Oct15_bio.tsv', sep='\t', index=False)
In [8]:
bio.head()
Out[8]:
In [9]:
print(post23.shape)
print(pre23.shape)
print(bio.shape)
In [10]:
pd.set_option('max_colwidth', 500)
bio.words[:5]
Out[10]:
In [11]:
# tuples (tag, word)
# [tuple(item.split('-')) for sublist in bio.words[:5].str.split() for item in sublist]
# just words
# [item.split('-')[1] for sublist in bio.words[:5].str.split() for item in sublist]
# just tags
# [item.split('-')[0] for sublist in bio.words[:5].str.split() for item in sublist]
# one big dict
# for k, v in [tuple(item.split('-')) for sublist in bio.words[:5].str.split() for item in sublist]:
# d[k].append(v)
from collections import defaultdict
def pos_to_dict(words_str):
    """Group the hyphen-joined tokens of ``words_str`` by their prefix.

    Every whitespace-separated token is split at its FIRST hyphen: the text
    before it is the dictionary key and everything after it (including any
    further hyphens) is appended to that key's list. A token that contains
    no hyphen is stored under itself with an empty-string value.

    Parameters
    ----------
    words_str : str
        Whitespace-separated ``key-value`` tokens.

    Returns
    -------
    collections.defaultdict
        ``defaultdict(list)`` mapping key -> values in order of appearance.
        If ``words_str`` is not a string (for example a missing value), it
        is returned unchanged.
    """
    if not isinstance(words_str, str):
        return words_str
    d = defaultdict(list)
    for item in words_str.split():
        # partition() cuts at the first hyphen only, so one pass handles
        # multi-hyphen tokens. The earlier try/fallback design re-ran the
        # whole string after a failure and stored the leading tokens twice.
        key, _, value = item.partition('-')
        d[key].append(value)
    return d
bio['words'] = bio['words'].apply(pos_to_dict)
In [13]:
def count_pos(pos_dict):
    """Return the number of values stored under each key of ``pos_dict``.

    Parameters
    ----------
    pos_dict : mapping of key -> sized collection
        Typically the output of ``pos_to_dict``.

    Returns
    -------
    dict
        ``{key: len(values)}`` for every key. An empty dict is returned when
        ``pos_dict`` is not a mapping (``pos_to_dict`` hands non-string
        input back unchanged, so such values can reach this function).
    """
    if not hasattr(pos_dict, 'items'):
        return {}
    return {k: len(v) for k, v in pos_dict.items()}
bio['count_pos'] = bio['words'].apply(count_pos)
In [20]:
bio.head(10)
Out[20]:
In [21]:
from collections import Counter
def get_counts(df):
    """Add up the per-row count dictionaries in ``df['count_pos']``.

    Returns a ``Counter`` whose value for each key is the sum of that key's
    counts over all rows of ``df``.
    """
    totals = Counter()
    for row_counts in df['count_pos']:
        totals.update(row_counts)
    return totals
In [27]:
all_counts = get_counts(bio)
In [22]:
# Category totals restricted to rows whose chargender is 'm' / 'f'.
male_char_counts = get_counts(bio[bio['chargender'] == 'm'])
female_char_counts = get_counts(bio[bio['chargender'] == 'f'])
In [33]:
# Category totals restricted to rows whose authgender is 'm' / 'f'.
male_auth_counts = get_counts(bio[bio['authgender'] == 'm'])
female_auth_counts = get_counts(bio[bio['authgender'] == 'f'])
In [27]:
male_char_counts
Out[27]:
In [154]:
# Share of the five category counts made up by 'agent' + 'patient', per
# character gender. The printed value is a ratio (it is not multiplied by 100).
print('relative % of verbs associated with male characters',
      sum(male_char_counts[tag] for tag in ('agent', 'patient'))
      / sum(male_char_counts[tag] for tag in ('agent', 'patient', 'mod', 'poss', 'said')))
print('relative % of verbs associated with female characters',
      sum(female_char_counts[tag] for tag in ('agent', 'patient'))
      / sum(female_char_counts[tag] for tag in ('agent', 'patient', 'mod', 'poss', 'said')))
In [158]:
# 'agent' count as a share of 'agent' + 'patient', per character gender.
# The printed value is a ratio (it is not multiplied by 100).
print('% of verbs describing men that are active verbs',
      male_char_counts['agent']
      / sum(male_char_counts[tag] for tag in ('agent', 'patient')))
print('% of verbs describing women that are active verbs',
      female_char_counts['agent']
      / sum(female_char_counts[tag] for tag in ('agent', 'patient')))
In [159]:
# 'poss' count as a share of 'poss' + 'mod', per character gender.
# The printed value is a ratio (it is not multiplied by 100).
print('% of nouns describing men that are possessive nouns',
      male_char_counts['poss']
      / sum(male_char_counts[tag] for tag in ('poss', 'mod')))
print('% of nouns describing women that are possessive nouns',
      female_char_counts['poss']
      / sum(female_char_counts[tag] for tag in ('poss', 'mod')))
In [160]:
# 'said' count as a share of the five category counts, per character gender.
# The printed value is a ratio (it is not multiplied by 100).
# The male label previously read 'assocatied'; it now matches the female label.
print('relative % of dialog words associated with male characters',
      male_char_counts['said']
      / sum(male_char_counts[tag] for tag in ('agent', 'patient', 'mod', 'poss', 'said')))
print('relative % of dialog words associated with female characters',
      female_char_counts['said']
      / sum(female_char_counts[tag] for tag in ('agent', 'patient', 'mod', 'poss', 'said')))
path, '/projects/ischoolichass/ichass/usesofscale/chardata'
In [31]:
all_by_year = bio.groupby('pubdate').apply(get_counts)
Out[31]:
In [35]:
type(all_by_year)
Out[35]:
In [46]:
# restructure dictionary Counter object into Pandas DataFrame
all_by_year = all_by_year.apply(pd.Series).fillna(0)
In [51]:
# drop messy error columns
# NOTE(review): these labels look like keys produced from tokens that contain
# no '-' (pos_to_dict stores such a token under itself) — confirm against the data.
# With the default errors='raise', drop() fails with KeyError if any of these
# labels is missing from the columns.
all_by_year = all_by_year.drop(['>', 'lll>', 'tho', 'to'], axis=1)
In [55]:
all_by_year.head()
Out[55]:
In [57]:
# drop year 0??
# this must mean the pubdate is unknown
all_by_year = all_by_year.drop(0)
all_by_year.head()
Out[57]:
In [102]:
# Row-wise sum over every column of all_by_year, plotted against its index.
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(all_by_year.index, all_by_year.sum(axis=1), label='all words', color='darkred')
ax.set_title('Word counts for all biography characters')
ax.legend()
plt.show()
In [142]:
# Raw counts for each of the five category columns on one figure, saved to
# ./raw_word_counts before being shown.
# The first legend label previously read 'noun-posessive' (misspelled).
plt.figure(figsize=(15, 10))
for column, label, color in [
    ('poss', 'noun-possessive', 'mediumblue'),
    ('mod', 'noun-modifier', 'skyblue'),
    ('agent', 'verb-active', 'seagreen'),
    ('patient', 'verb-passive', 'lightgreen'),
    ('said', 'dialogue', 'dimgrey'),
]:
    plt.plot(all_by_year.index, all_by_year[column], label=label, color=color)
plt.title('Word counts by linguistic category for all biography characters')
plt.legend()
plt.savefig('./raw_word_counts')
plt.show()
In [103]:
all_by_gender_yr = bio.groupby(['pubdate', 'chargender']).apply(get_counts)
In [104]:
all_by_gender_yr.head()
Out[104]:
In [105]:
all_by_gender_yr = all_by_gender_yr.apply(pd.Series).fillna(0)
all_by_gender_yr.head()
Out[105]:
In [109]:
# drop messy error columns
# NOTE(review): these labels look like keys produced from tokens that contain
# no '-' (pos_to_dict stores such a token under itself) — confirm against the data.
all_by_gender_yr = all_by_gender_yr.drop(['>', 'lll>', 'tho', 'to'], axis=1)
# Remove the rows labelled 0 in the index (axis=0 is drop()'s default).
# NOTE(review): presumably pubdate 0 marks an unknown publication year — confirm.
all_by_gender_yr = all_by_gender_yr.drop(0)
all_by_gender_yr.head()
Out[109]:
In [162]:
all_by_gender_yr.loc[
all_by_gender_yr.index.get_level_values('chargender') == 'm',
].head()
Out[162]:
In [166]:
# For each gender, the 'said' column divided by the row-wise sum of all columns.
gender_level = all_by_gender_yr.index.get_level_values('chargender')
male_rows = all_by_gender_yr.loc[gender_level == 'm']
female_rows = all_by_gender_yr.loc[gender_level == 'f']
prcnt_dialogue_m = male_rows['said'] / male_rows.sum(axis=1)
prcnt_dialogue_f = female_rows['said'] / female_rows.sum(axis=1)
In [169]:
# Male and female dialogue shares against pubdate; saved to ./prcnt_dialogue.
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(
    prcnt_dialogue_m.index.get_level_values('pubdate'),
    prcnt_dialogue_m,
    label='Male - % words that are dialogue',
    color='mediumblue',
)
ax.plot(
    prcnt_dialogue_f.index.get_level_values('pubdate'),
    prcnt_dialogue_f,
    label='Female - % words that are dialogue',
    color='darkred',
)
ax.set_title('Percent of words associated with male vs female characters that are dialogue')
ax.legend()
fig.savefig('./prcnt_dialogue')
plt.show()
In [132]:
# Female rows: 'agent' divided by 'agent' + 'patient'.
female_rows = all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'f'
]
prcnt_active_f = female_rows['agent'] / (female_rows['agent'] + female_rows['patient'])
In [133]:
# Male rows: 'agent' divided by 'agent' + 'patient'.
male_rows = all_by_gender_yr.loc[
    all_by_gender_yr.index.get_level_values('chargender') == 'm'
]
prcnt_active_m = male_rows['agent'] / (male_rows['agent'] + male_rows['patient'])
In [141]:
# Female and male active-verb shares against pubdate; saved to ./prcnt_active_verbs.
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(
    prcnt_active_f.index.get_level_values('pubdate'),
    prcnt_active_f,
    label='Female - % active verbs',
    color='darkred',
)
ax.plot(
    prcnt_active_m.index.get_level_values('pubdate'),
    prcnt_active_m,
    label='Male - % active verbs',
    color='mediumblue',
)
ax.set_title('Percentage of verbs that are active verbs for male vs female characters')
ax.legend()
fig.savefig('./prcnt_active_verbs')
plt.show()
In [149]:
# For each gender: 'poss' divided by 'poss' + 'mod'.
gender_level = all_by_gender_yr.index.get_level_values('chargender')
female_rows = all_by_gender_yr.loc[gender_level == 'f']
male_rows = all_by_gender_yr.loc[gender_level == 'm']
prcnt_poss_f = female_rows['poss'] / (female_rows['poss'] + female_rows['mod'])
prcnt_poss_m = male_rows['poss'] / (male_rows['poss'] + male_rows['mod'])
In [151]:
# Female and male possessive-noun shares against pubdate; saved to ./prcnt_poss_nouns.
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(
    prcnt_poss_f.index.get_level_values('pubdate'),
    prcnt_poss_f,
    label='Female - % possessive nouns',
    color='darkred',
)
ax.plot(
    prcnt_poss_m.index.get_level_values('pubdate'),
    prcnt_poss_m,
    label='Male - % possessive nouns',
    color='mediumblue',
)
ax.set_title('Percentage of nouns that are possessive nouns for male vs female characters')
ax.legend()
fig.savefig('./prcnt_poss_nouns')
plt.show()
In [ ]:
In [ ]:
In [ ]:
In [ ]:
brown_sents = brown.tagged_sents()
# Split the data into train and test sets.
train = int(len(brown_sents) * 90 / 100)  # index of the first held-out sentence (90% used for training)
# Trains the tagger
uni_tag = ut(brown_sents[:train])  # this will take some time, ~1-2 mins
# Evaluate on the remaining 10%. The slice starts at `train` (not `train + 1`)
# so the sentence at index `train` is no longer left out of both sets.
# NOTE(review): recent NLTK releases expose this as `accuracy` and deprecate
# `evaluate` — confirm against the installed version.
uni_tag.evaluate(brown_sents[train:])  # will also take ~1-2 mins
# Tags a random sentence
#uni_tag.tag("this is a foo bar sentence .".split())
In [ ]:
def get_past_tense(words, tagger=None):
    """Return the words whose tag is ``'VBN'`` or ``'VBD'``.

    Any word that starts with ``'was-'`` is always included: its tag is
    treated as ``'VBN'`` regardless of what the tagger assigned.

    Parameters
    ----------
    words : list of str
        Tokens to tag.
    tagger : object with a ``tag(words)`` method, optional
        Tagger returning ``(word, tag)`` pairs. Defaults to the
        notebook-level ``uni_tag``.

    Returns
    -------
    list of str
        The selected words, in input order.
    """
    if tagger is None:
        tagger = uni_tag
    past_tags = ('VBN', 'VBD')
    past_words = []
    for word, tag in tagger.tag(words):
        if word.startswith('was-') or tag in past_tags:
            past_words.append(word)
    return past_words
def get_present_tense(words, tagger=None):
    """Return the words whose tag is ``'VB'``, ``'VBZ'``, ``'VBG'`` or ``'VBP'``.

    Parameters
    ----------
    words : list of str
        Tokens to tag.
    tagger : object with a ``tag(words)`` method, optional
        Tagger returning ``(word, tag)`` pairs. Defaults to the
        notebook-level ``uni_tag``.

    Returns
    -------
    list of str
        The selected words, in input order.
    """
    if tagger is None:
        tagger = uni_tag
    present_tags = ('VB', 'VBZ', 'VBG', 'VBP')
    return [word for word, tag in tagger.tag(words) if tag in present_tags]
In [ ]:
[tuple(item.split('-')) for sublist in post23.words[:5].str.split() for item in sublist]
In [ ]:
# flat_list = [item for sublist in l for item in sublist]
pre_words = [word for sublist in pre23['words'] for word in sublist.split()]
In [ ]:
pre_words[:10]
In [ ]:
post_words = [word for sublist in post23['words'] for word in sublist.split()]
In [ ]:
print(len(post_words))
print(len(pre_words))
In [ ]:
counts = Counter(post_words + pre_words)
In [ ]:
# NOTE(review): `bio_words` is not assigned anywhere in the visible notebook, so
# this raises NameError on a fresh kernel unless it is defined elsewhere — confirm
# which variable was intended.
len(bio_words)
In [ ]:
# NOTE(review): `postbio_words` is not assigned anywhere in the visible notebook, so
# this raises NameError on a fresh kernel unless it is defined elsewhere — confirm
# which variable was intended.
len(postbio_words)
In [ ]:
# NOTE(review): `words` is not assigned anywhere in the visible notebook, so this
# raises NameError on a fresh kernel unless it is defined elsewhere — confirm
# which variable was intended.
Counter(words)