In [3]:
from collections import Counter
import pandas as pd
%matplotlib inline
from pylab import rcParams
from bs4 import BeautifulSoup
import textacy
rcParams['figure.figsize'] = 10, 4
import matplotlib.pyplot as plt
plt.style.use('ggplot')
In [4]:
import spacy
nlp = spacy.load('en')
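Note: spacy.load('en') assumes the English model has already been downloaded and linked (e.g. with python -m spacy download en in the spaCy release used here).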
In [7]:
def proportionWithTag(doc, tag):
    """ Returns the proportion of words in the document that have a certain POS tag.
    If given a list of tags instead of a single tag, returns the proportion of words
    in the document that have any of those tags. """
    totalWords = len(doc)
    if type(tag) == list:
        wordsWithTag = [word for word in doc if word.tag_ in tag]
    else:
        wordsWithTag = [word for word in doc if word.tag_ == tag]
    return len(wordsWithTag)/totalWords

def proportionWithLemma(doc, lemma):
    """ Returns the proportion of words in the document that have a certain lemma. """
    totalWords = len(doc)
    wordsWithLemma = [word for word in doc if word.lemma_ == lemma]
    return len(wordsWithLemma)/totalWords
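A quick sanity check of these helpers; a minimal sketch, assuming the nlp pipeline loaded above (the example sentence is arbitrary):
In [ ]:
doc = nlp("Dorothea was reading, and her sister was thinking.")
# len(doc) counts every token, punctuation included, so the proportions are per token.
proportionWithTag(doc, ['VBG', 'VBD']), proportionWithLemma(doc, 'be')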
In [8]:
def beProportion(doc):
    """ Returns the proportion of words that are verb forms of "be". """
    totalWords = len(doc)
    bes = [word for word in doc if word.lemma_ == 'be' and word.tag_ in verbtags]  # verbtags is defined in the next cell
    return len(bes)/totalWords
From the Penn Treebank table: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
In [9]:
presentVerbTags = ['VB', 'VBG', 'VBP', 'VBZ']
verbtags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
In [13]:
tagDict = {"CC": "Coordinating conjunction",
           "DT": "Determiner",
           "EX": "Existential there",
           "IN": "Preposition or subordinating conjunction",
           "JJ": "Adjective",
           "JJR": "Adjective, comparative",
           "JJS": "Adjective, superlative",
           "MD": "Modal",
           "NN": "Noun, singular or mass",
           "NNS": "Noun, plural",
           "NNP": "Proper noun, singular",
           "NNPS": "Proper noun, plural",
           "PDT": "Predeterminer",
           "POS": "Possessive ending",
           "PRP": "Personal pronoun",
           "PRP$": "Possessive pronoun",
           "RB": "Adverb",
           "RBR": "Adverb, comparative",
           "RBS": "Adverb, superlative",
           "RP": "Particle",
           "TO": "to",
           "UH": "Interjection",
           "VB": "Verb, base form",
           "VBD": "Verb, past tense",
           "VBG": "Verb, gerund or present participle",
           "VBN": "Verb, past participle",
           "VBP": "Verb, non-3rd person singular present",
           "VBZ": "Verb, 3rd person singular present",
           "WDT": "Wh-determiner",
           "WP": "Wh-pronoun",
           "WP$": "Possessive wh-pronoun",
           "WRB": "Wh-adverb"}
In [14]:
tagset = list(tagDict.keys())
In [15]:
def compareTags(a, b, tagset):
    """ Compares the proportion of each POS tag in two documents and returns
    the relative difference of b with respect to a. """
    proportionsDict = {}
    for tag in tagset:
        proportionsDict[tag] = [proportionWithTag(x, tag) for x in [a, b]]
    df = pd.DataFrame(proportionsDict).T
    df['factor'] = (df[1]/df[0]) - 1
    return df['factor']
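The factor column measures relative over- or under-representation of each tag in the second document: if adjectives make up 5% of a and 6% of b, the factor is (0.06/0.05) - 1 = 0.2, i.e. adjectives are 20% more frequent in b. A hypothetical check of that arithmetic:
In [ ]:
# 6% vs. 5% adjectives -> 20% over-representation in the second document.
(0.06 / 0.05) - 1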
In [17]:
def compareLemmas(a, b, lemmas):
    """ Compares the proportions of the given lemmas in two documents and plots the ratios. """
    proportionsDict = {}
    for lemma in lemmas:
        proportionsDict[lemma] = [proportionWithLemma(x, lemma) for x in [a, b]]
    df = pd.DataFrame(proportionsDict).T
    df['factor'] = df[1]/df[0]
    df['factor'].plot(kind="bar")
In [19]:
# Read annotated edition of Middlemarch
with open('e2/annotated.html') as f:
annotated = f.read()
In [20]:
# Parse
soup = BeautifulSoup(annotated, 'lxml')
levels = [soup.find_all('span', 'c-'+str(level)) for level in range(17)]
# Remove HTML tags.
cleanLevels = []
for level in levels:
    cleanLevel = [quote.get_text() for quote in level]
    cleanLevels.append(cleanLevel)
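Here levels[n] holds the spans tagged with class c-n, i.e. passages quoted n times, and cleanLevels holds just their text (so cleanLevels[0] is the unquoted text).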
In [21]:
def getQuotes(cleanLevels, n):
    """ Gets the text of all spans quoted at least n times, joined into one string. """
    quotedGroups = cleanLevels[n:]
    quoted = []
    for group in quotedGroups:
        quoted.extend(group)
    quoted = ' '.join(quoted)
    return quoted
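For instance, getQuotes(cleanLevels, 1) reproduces the quotes string built below, since level 0 holds the unquoted text.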
In [22]:
tagset.remove('WP$')
In [23]:
unquoted = cleanLevels[0]
quotedGroups = cleanLevels[1:]
quoted = []
for group in quotedGroups:
    quoted.extend(group)
quotes = ' '.join(quoted)
nonquotes = ' '.join(unquoted)
In [24]:
quotes5 = getQuotes(cleanLevels, 5)
quotes8 = getQuotes(cleanLevels, 8)
In [ ]:
nonquotesS, quotesS, quotes5S, quotes8S = nlp(nonquotes), nlp(quotes), nlp(quotes5), nlp(quotes8)
In [ ]:
factor_all = compareTags(nonquotesS, quotesS, tagset)
factor_mid = compareTags(nonquotesS, quotes5S, tagset)
factor_high = compareTags(nonquotesS, quotes8S, tagset)
In [ ]:
rcParams['figure.figsize'] = 8, 5
df = pd.DataFrame([factor_all, factor_high], index=['', ''])
ax = df.T.plot(subplots=True, kind='bar')
ax[0].set_title('(a)')
ax[1].set_title('(b)')
# ax[2].set_title('(c)')
In [ ]:
tagDict['UH']
In [ ]:
# A list of plural nouns from highly quoted text.
[w for w in quotes8S if w.tag_ == 'NNS']
In [288]:
[w for w in quotesS if w.tag_ == "NNPS"]
Out[288]:
In [89]:
def compareAllLemmas(a, b):
    """ Counts the lemmas in two parsed documents and returns the counts as Series. """
    lemmasA = pd.Series(Counter([word.lemma_ for word in a]))
    lemmasB = pd.Series(Counter([word.lemma_ for word in b]))
    # df = pd.concat([lemmasA, lemmasB], axis=1).fillna(0)
    return lemmasA, lemmasB
In [91]:
sa, sb = compareAllLemmas(nonquotesS, quotes8S)
In [100]:
df = pd.concat([sa, sb], axis=1).fillna(0)
In [123]:
# quotesBag = quotes8.to_bag_of_terms(lemmatize=True, weighting='freq', as_strings=True)
def bagOfTerms(doc):
    """ Returns a textacy document's bag of terms (lemmatized, frequency-weighted) as a Series. """
    bag = doc.to_bag_of_terms(lemmatize=True, weighting='freq', as_strings=True)
    return pd.Series(bag)
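With weighting='freq', textacy normalizes the term counts by document length (at least in the textacy release assumed here), so bags built from texts of different sizes can be compared directly.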
In [152]:
nonquotesDoc = textacy.Doc(nonquotes)
quotesDoc = textacy.Doc(quotes)
quotes5Doc = textacy.Doc(quotes5)
quotes8Doc = textacy.Doc(quotes8)
In [153]:
nonquotesBag = bagOfTerms(nonquotesDoc)
quotesBag = bagOfTerms(quotesDoc)
quotes5Bag = bagOfTerms(quotes5Doc)
quotes8Bag = bagOfTerms(quotes8Doc)
In [167]:
def compareAllLemmas(a, b):
    """ Compares two bags of terms: positive values are terms relatively more
    frequent in a, negative values are terms more frequent in b. """
    df = pd.concat([a, b], axis=1).fillna(0)
    delta = df[0] - df[1]
    return delta.sort_values()
In [172]:
df = compareAllLemmas(quotesBag, nonquotesBag)
In [173]:
df
Out[173]: