In [3]:
    
from collections import Counter
import pandas as pd
%matplotlib inline
from pylab import rcParams
from bs4 import BeautifulSoup
import textacy
# Default figure size for the notebook's plots.
rcParams['figure.figsize'] = 10, 4
import matplotlib.pyplot as plt
plt.style.use('ggplot')
    
In [4]:
    
import spacy
# NOTE(review): the 'en' shortcut only works in spaCy 1.x/2.x; spaCy 3.x
# requires the full model name, e.g. spacy.load('en_core_web_sm') — confirm
# the pinned spaCy version.
nlp = spacy.load('en')
    
In [7]:
    
def proportionWithTag(doc, tag):
    """ Returns the proportion of words in the document that have a certain POS tag.
    If given a collection of tags (list, tuple, or set) instead of a single tag,
    returns the proportion of words in the document that have any of those tags.

    Returns 0.0 for an empty document instead of raising ZeroDivisionError.
    """
    totalWords = len(doc)
    if totalWords == 0:
        return 0.0
    # isinstance also accepts tuples, sets, and list subclasses,
    # unlike the original `type(tag) == list` check.
    if isinstance(tag, (list, tuple, set)):
        wordsWithTag = [word for word in doc if word.tag_ in tag]
    else:
        wordsWithTag = [word for word in doc if word.tag_ == tag]
    return len(wordsWithTag) / totalWords
def proportionWithLemma(doc, lemma):
    """ Returns the proportion of words in the document whose lemma equals `lemma`.

    Returns 0.0 for an empty document instead of raising ZeroDivisionError.
    """
    totalWords = len(doc)
    if totalWords == 0:
        return 0.0
    wordsWithLemma = [word for word in doc if word.lemma_ == lemma]
    return len(wordsWithLemma) / totalWords
    
In [8]:
    
def beProportion(doc, tags=('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')):
    """ Returns the proportion of words in the document that are a form of the
    verb "to be" (lemma 'be' carrying a verbal POS tag).

    `tags` defaults to the Penn Treebank verb tags. Previously this read the
    module-level `verbtags`, which is defined in a *later* cell (In [9]) — a
    hidden-state bug that breaks Restart & Run All. Returns 0.0 for an empty
    document instead of raising ZeroDivisionError.
    """
    totalWords = len(doc)
    if totalWords == 0:
        return 0.0
    bes = [word for word in doc if word.lemma_ == 'be' and word.tag_ in tags]
    return len(bes) / totalWords
    
From the Penn Treebank table: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
In [9]:
    
# Penn Treebank tags for present-tense / base verb forms.
presentVerbTags = ['VB', 'VBG', 'VBP', 'VBZ']
# All Penn Treebank verb tags, including past tense and past participle.
verbtags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    
In [13]:
    
# Penn Treebank POS tag -> human-readable description, from the table linked
# above (Penn Treebank project, UPenn).
tagDict = {"CC": "Coordinating conjunction",
"DT": "Determiner",
"EX": "Existential there",
"IN": "Preposition or subordinating conjunction",
"JJ": "Adjective",
"JJR": "Adjective, comparative",
"JJS": "Adjective, superlative",
"MD": "Modal",
"NN": "Noun, singular or mass",
"NNS": "Noun, plural",
"NNP": "Proper noun, singular",
"NNPS": "Proper noun, plural",
"PDT": "Predeterminer",
"POS": "Possessive ending",
"PRP": "Personal pronoun",
"PRP$": "Possessive pronoun",
"RB": "Adverb",
"RBR": "Adverb, comparative",
"RBS": "Adverb, superlative",
"RP": "Particle",
"TO": "to",
"UH": "Interjection",
"VB": "Verb, base form",
"VBD": "Verb, past tense",
"VBG": "Verb, gerund or present participle",
"VBN": "Verb, past participle",
"VBP": "Verb, non-3rd person singular present",
"VBZ": "Verb, 3rd person singular present",
"WDT": "Wh-determiner",
"WP": "Wh-pronoun",
"WP$": "Possessive wh-pronoun",
"WRB": "Wh-adverb"}
    
In [14]:
    
# The full list of tags to compare across text groups.
tagset = list(tagDict.keys())
    
In [15]:
    
def compareTags(a, b, tagset):
    """ For each POS tag, compute how much more (or less) frequent the tag is
    in document `b` relative to document `a`, as (freq_b / freq_a) - 1.
    Returns a Series indexed by tag. """
    rows = {tag: [proportionWithTag(doc, tag) for doc in (a, b)] for tag in tagset}
    df = pd.DataFrame(rows).T
    df['factor'] = (df[1] / df[0]) - 1
    return df['factor']
    
In [17]:
    
def compareLemmas(a, b, lemmas):
    """ For each lemma, compute the ratio of its frequency in document `b` vs
    document `a` and plot the ratios as a bar chart.

    Returns the factor Series, for consistency with compareTags (the original
    returned None, so the computed values could not be reused). """
    proportionsDict = {}
    for lemma in lemmas: 
        proportionsDict[lemma] = [proportionWithLemma(x, lemma) for x in [a, b]]
    df = pd.DataFrame(proportionsDict).T
    df['factor'] = df[1]/df[0]
    df['factor'].plot(kind="bar")
    return df['factor']
    
In [19]:
    
# Read annotated edition of Middlemarch
with open('e2/annotated.html') as f: 
    annotated = f.read()
    
In [20]:
    
# Parse the annotated HTML. Spans carry a class "c-<n>"; level n is gathered
# into its own group (presumably n = number of times quoted — see getQuotes).
soup = BeautifulSoup(annotated, 'lxml')
levels = [soup.find_all('span', 'c-' + str(level)) for level in range(17)]
# Strip the HTML tags, keeping only the text of each quote.
cleanLevels = [[quote.get_text() for quote in level] for level in levels]
    
In [21]:
    
def getQuotes(cleanLevels, n): 
    """ Get quotes from list, according to n times they are quoted.
    Flattens every group from level n upward and joins the quotes into a
    single space-separated string. """
    return ' '.join(quote for group in cleanLevels[n:] for quote in group)
    
In [22]:
    
# Drop the possessive wh-pronoun tag from the comparison set — presumably it
# is absent from one of the text groups, which would make the frequency ratio
# divide by zero. TODO confirm.
tagset.remove('WP$')
    
In [23]:
    
# Split the text into the unquoted portion (level 0) and everything that was
# quoted at least once (levels 1+), each joined into one long string.
unquoted = cleanLevels[0]
quotedGroups = cleanLevels[1:]
quoted = [quote for group in quotedGroups for quote in group]
quotes = ' '.join(quoted)
nonquotes = ' '.join(unquoted)
    
In [24]:
    
# Text quoted at least 5 times, and at least 8 times.
quotes5 = getQuotes(cleanLevels, 5)
quotes8 = getQuotes(cleanLevels, 8)
    
In [ ]:
    
# Parse each text group with spaCy; the *S variables are parsed docs, the
# originals are raw strings.
nonquotesS, quotesS, quotes5S, quotes8S = nlp(nonquotes), nlp(quotes), nlp(quotes5), nlp(quotes8)
    
In [ ]:
    
# Relative tag-frequency factors vs. the unquoted text, for: all quoted text,
# text quoted >= 5 times, and text quoted >= 8 times.
factor_all = compareTags(nonquotesS, quotesS, tagset)
factor_mid = compareTags(nonquotesS, quotes5S, tagset)
factor_high = compareTags(nonquotesS, quotes8S, tagset)
    
In [ ]:
    
rcParams['figure.figsize'] = 8, 5
# NOTE(review): the row labels are empty strings; naming them (e.g. 'all
# quoted', 'quoted >= 8x') would make the subplots self-describing.
df = pd.DataFrame([factor_all, factor_high], index=['', ''])
ax = df.T.plot(subplots=True, kind='bar')
ax[0].set_title('(a)')
ax[1].set_title('(b)')
# ax[2].set_title('(c)')
    
In [ ]:
    
# Look up the 'UH' tag: "Interjection".
tagDict['UH']
    
In [ ]:
    
# A list of plural nouns from highly quoted text. 
# NOTE(review): this displays the entire list; consider slicing (e.g. [:20])
# to keep the output manageable.
[w for w in quotes8S if w.tag_ == 'NNS']
    
In [288]:
    
# Plural proper nouns in quoted text.
# NOTE(review): the next cell (In [291]) is an exact duplicate of this one.
[w for w in quotesS if w.tag_ == "NNPS"]
    
    Out[288]:
In [291]:
    
# NOTE(review): exact duplicate of the previous cell (In [288]) — delete one.
[w for w in quotesS if w.tag_ == "NNPS"]
    
    Out[291]:
In [89]:
    
def compareAllLemmas(a, b): 
    """ Count lemma frequencies in two parsed documents and return them as a
    pair of Series (counts for `a`, counts for `b`).
    NOTE(review): a later cell (In [167]) redefines compareAllLemmas with a
    different signature, silently shadowing this version. """
    countsA = pd.Series(Counter(word.lemma_ for word in a))
    countsB = pd.Series(Counter(word.lemma_ for word in b))
    return countsA, countsB
    
In [91]:
    
# BUG FIX: the original passed the raw strings (nonquotes, quotes8); iterating
# a string yields single characters, which have no .lemma_, so the call raised
# AttributeError. compareAllLemmas needs the parsed spaCy docs from In [ ]
# (nonquotesS / quotes8S).
sa, sb = compareAllLemmas(nonquotesS, quotes8S)
    
In [100]:
    
# Align the two lemma-count Series on their union of lemmas; lemmas absent
# from one text get count 0.
df = pd.concat([sa, sb], axis=1).fillna(0)
    
In [123]:
    
def bagOfTerms(doc): 
    """ Convert a textacy Doc into a Series mapping each lemmatized term to
    its relative frequency in the document. """
    terms = doc.to_bag_of_terms(lemmatize=True, weighting='freq', as_strings=True)
    return pd.Series(terms)
    
In [152]:
    
# Wrap each text group in a textacy Doc so to_bag_of_terms can be used.
# NOTE(review): textacy.Doc and Doc.to_bag_of_terms belong to older textacy
# releases; newer versions use textacy.make_spacy_doc — confirm the pinned
# textacy version.
nonquotesDoc = textacy.Doc(nonquotes)
quotesDoc = textacy.Doc(quotes)
quotes5Doc = textacy.Doc(quotes5)
quotes8Doc = textacy.Doc(quotes8)
    
In [153]:
    
# Relative term-frequency bags for each text group.
nonquotesBag = bagOfTerms(nonquotesDoc)
quotesBag = bagOfTerms(quotesDoc)
quotes5Bag = bagOfTerms(quotes5Doc)
quotes8Bag = bagOfTerms(quotes8Doc)
    
In [167]:
    
def compareAllLemmas(a, b): 
    """ Compares two bags of terms. Returns each term's difference in relative
    frequency (`a` minus `b`), sorted ascending; terms missing from one bag
    count as frequency 0.
    NOTE(review): shadows the earlier compareAllLemmas (In [89]), which had a
    different signature — the duplicate name is confusing and worth renaming. """
    merged = pd.concat([a, b], axis=1).fillna(0)
    delta = merged[0] - merged[1]
    return delta.sort_values()
    
In [172]:
    
# Per-lemma frequency difference: quoted minus unquoted. Positive values are
# lemmas overrepresented in quoted text; negative, in unquoted text.
df = compareAllLemmas(quotesBag, nonquotesBag)
    
In [173]:
    
# Display the sorted differences (most unquoted-skewed lemmas first, since
# compareAllLemmas sorts ascending).
df
    
    Out[173]:
In [ ]: