Exploratory analysis of quoted speech


In [3]:
from collections import Counter
import pandas as pd
%matplotlib inline
from pylab import rcParams
from bs4 import BeautifulSoup
import textacy
rcParams['figure.figsize'] = 10, 4
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [4]:
# Load the English spaCy model used for all parsing below.
# NOTE(review): the bare 'en' shortcut only resolves under spaCy v1/v2;
# spaCy v3+ requires the full package name, e.g. spacy.load('en_core_web_sm')
# — confirm which version this notebook targets.
import spacy
nlp = spacy.load('en')

In [7]:
def proportionWithTag(doc, tag):
    """ Returns the proportion of words in the document that have a certain POS tag.
    If given a collection of tags (list, tuple, or set) instead of a single tag,
    returns the proportion of words in the document that have any of those tags.

    Parameters:
        doc: iterable of tokens exposing a .tag_ attribute (e.g. a spaCy Doc).
        tag: a POS tag string, or a list/tuple/set of tag strings.

    Returns:
        float in [0, 1]; 0.0 for an empty document (avoids ZeroDivisionError).
    """
    totalWords = len(doc)
    if totalWords == 0:
        return 0.0
    if isinstance(tag, (list, tuple, set)):
        tags = set(tag)  # set for O(1) membership tests
        wordsWithTag = [word for word in doc if word.tag_ in tags]
    else:
        wordsWithTag = [word for word in doc if word.tag_ == tag]
    return len(wordsWithTag) / totalWords

def proportionWithLemma(doc, lemma): 
    """ Returns the proportion of words in the document whose lemma is `lemma`.

    Parameters:
        doc: iterable of tokens exposing a .lemma_ attribute (e.g. a spaCy Doc).
        lemma: lemma string to count, e.g. 'be'.

    Returns:
        float in [0, 1]; 0.0 for an empty document (avoids ZeroDivisionError).
    """
    totalWords = len(doc)
    if totalWords == 0:
        return 0.0
    wordsWithLemma = [word for word in doc if word.lemma_ == lemma]
    return len(wordsWithLemma) / totalWords

In [8]:
def beProportion(doc, tags=None): 
    """ Returns the proportion of words in the document that are verbal forms
    of the lemma "be".

    Parameters:
        doc: iterable of tokens exposing .lemma_ and .tag_ (e.g. a spaCy Doc).
        tags: collection of POS tags counted as verbal. Defaults to the
              module-level `verbtags` list — which is defined in a LATER cell
              (In[9]), so with the default argument that cell must be run
              before this function is called.

    Returns:
        float in [0, 1]; 0.0 for an empty document (avoids ZeroDivisionError).
    """
    if tags is None:
        tags = verbtags  # falls back to the notebook-level verb-tag list
    totalWords = len(doc)
    if totalWords == 0:
        return 0.0
    bes = [word for word in doc if word.lemma_ == 'be' and word.tag_ in tags]
    return len(bes) / totalWords

From the Penn Treebank table: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

  1. VB Verb, base form
  2. VBD Verb, past tense
  3. VBG Verb, gerund or present participle
  4. VBN Verb, past participle
  5. VBP Verb, non-3rd person singular present
  6. VBZ Verb, 3rd person singular present

In [9]:
# Penn Treebank verb tags (see table above); the present-tense subset simply
# excludes the two past forms (VBD past tense, VBN past participle).
verbtags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
presentVerbTags = [tag for tag in verbtags if tag not in ('VBD', 'VBN')]

In [13]:
# Penn Treebank POS tag -> human-readable description, transcribed from the
# table linked above. Used for labeling results and building `tagset`.
tagDict = {"CC": "Coordinating conjunction",
"DT": "Determiner",
"EX": "Existential there",
"IN": "Preposition or subordinating conjunction",
"JJ": "Adjective",
"JJR": "Adjective, comparative",
"JJS": "Adjective, superlative",
"MD": "Modal",
"NN": "Noun, singular or mass",
"NNS": "Noun, plural",
"NNP": "Proper noun, singular",
"NNPS": "Proper noun, plural",
"PDT": "Predeterminer",
"POS": "Possessive ending",
"PRP": "Personal pronoun",
"PRP$": "Possessive pronoun",
"RB": "Adverb",
"RBR": "Adverb, comparative",
"RBS": "Adverb, superlative",
"RP": "Particle",
"TO": "to",
"UH": "Interjection",
"VB": "Verb, base form",
"VBD": "Verb, past tense",
"VBG": "Verb, gerund or present participle",
"VBN": "Verb, past participle",
"VBP": "Verb, non-3rd person singular present",
"VBZ": "Verb, 3rd person singular present",
"WDT": "Wh-determiner",
"WP": "Wh-pronoun",
"WP$": "Possessive wh-pronoun",
"WRB": "Wh-adverb"}

In [14]:
tagset = list(tagDict.keys())

In [15]:
def compareTags(a, b, tagset):
    """ For each tag in `tagset`, computes the relative change in tag
    frequency from document `a` to document `b`: (freq_b / freq_a) - 1.
    Returns a Series of those factors indexed by tag. """
    proportions = {tag: [proportionWithTag(doc, tag) for doc in [a, b]]
                   for tag in tagset}
    table = pd.DataFrame(proportions).T
    # column 0 holds a's proportions, column 1 holds b's
    table['factor'] = (table[1] / table[0]) - 1
    return table['factor']

In [17]:
def compareLemmas(a, b, lemmas):
    """ Plots (as a bar chart) the ratio of each lemma's relative frequency
    in document `b` to its relative frequency in document `a`.

    Parameters:
        a, b: parsed documents (iterables of tokens with .lemma_).
        lemmas: list of lemma strings to compare.

    Returns:
        pd.Series of b/a frequency ratios indexed by lemma. (Previously the
        function returned None; returning the series is backward-compatible
        and makes it consistent with compareTags above.)
    """
    proportionsDict = {}
    for lemma in lemmas: 
        proportionsDict[lemma] = [proportionWithLemma(x, lemma) for x in [a, b]]
    df = pd.DataFrame(proportionsDict).T
    # column 0 holds a's proportions, column 1 holds b's
    df['factor'] = df[1]/df[0]
    df['factor'].plot(kind="bar")
    return df['factor']

In [19]:
# Read annotated edition of Middlemarch
# (relative path — the notebook must be run from its own directory).
with open('e2/annotated.html') as f: 
    annotated = f.read()

In [20]:
# Parse the annotated HTML.
soup = BeautifulSoup(annotated, 'lxml')

# One list of matched <span class="c-N"> elements per quotation level 0-16.
levels = [soup.find_all('span', 'c-' + str(level)) for level in range(17)]

# Remove HTML tags, keeping only the text of each span.
cleanLevels = [[span.get_text() for span in level] for level in levels]

In [21]:
def getQuotes(cleanLevels, n): 
    """ Get quotes from list, according to n times they are quoted: flattens
    all levels from `n` upward and joins them into a single space-separated
    string. Returns '' when n is past the last level. """
    return ' '.join(quote for group in cleanLevels[n:] for quote in group)

In [22]:
tagset.remove('WP$')

In [23]:
# Level 0 is text never quoted; everything above level 0 counts as quoted.
unquoted = cleanLevels[0]
quotedGroups = cleanLevels[1:]
# Flatten the quoted levels, then join each corpus into a single string.
quoted = [quote for group in quotedGroups for quote in group]
quotes = ' '.join(quoted)
nonquotes = ' '.join(unquoted)

In [24]:
# Text quoted at least 5 times, and at least 8 times, respectively.
quotes5 = getQuotes(cleanLevels, 5)
quotes8 = getQuotes(cleanLevels, 8)

In [ ]:
nonquotesS, quotesS, quotes5S, quotes8S = nlp(nonquotes), nlp(quotes), nlp(quotes5), nlp(quotes8)

In [ ]:
# Relative change in POS-tag frequency from unquoted to quoted text,
# at increasing quotation thresholds (any / 5+ / 8+ quotations).
factor_all = compareTags(nonquotesS, quotesS, tagset)
factor_mid = compareTags(nonquotesS, quotes5S, tagset)
factor_high = compareTags(nonquotesS, quotes8S, tagset)

In [ ]:
rcParams['figure.figsize'] = 8, 5
# Two stacked bar panels: all quoted text vs. highly quoted text.
# Empty strings as the index presumably blank the subplot legend labels;
# the panels are titled (a)/(b) below instead.
df = pd.DataFrame([factor_all, factor_high], index=['', ''])
ax = df.T.plot(subplots=True, kind='bar')
ax[0].set_title('(a)')
ax[1].set_title('(b)')
# ax[2].set_title('(c)')

In [ ]:
tagDict['UH']

In [ ]:
# A list of plural nouns (tag NNS) from the most highly quoted text (8+ quotations).
[w for w in quotes8S if w.tag_ == 'NNS']

In [288]:
[w for w in quotesS if w.tag_ == "NNPS"]


Out[288]:
[Sages,
 Mythologies,
 Girls,
 Physicians,
 Middlemarchers,
 Israelites,
 Saints,
 devour-,
 Elizabethans,
 Mythologies,
 Hobbes,
 Raffles,
 Rights,
 Christians,
 Lords,
 Stars,
 Heavens,
 Stoics,
 Alexandrians,
 Garths,
 Apostles,
 Lords,
 Characters,
 Apostles,
 Moors,
 Germans]

In [291]:
[w for w in quotesS if w.tag_ == "NNPS"]


Out[291]:
[Sages,
 Mythologies,
 Girls,
 Physicians,
 Middlemarchers,
 Israelites,
 Saints,
 devour-,
 Elizabethans,
 Mythologies,
 Hobbes,
 Raffles,
 Rights,
 Christians,
 Lords,
 Stars,
 Heavens,
 Stoics,
 Alexandrians,
 Garths,
 Apostles,
 Lords,
 Characters,
 Apostles,
 Moors,
 Germans]

In [89]:
def compareAllLemmas(a, b): 
    """ Returns a pair of Series of per-lemma counts for two parsed documents
    (iterables of tokens exposing .lemma_).

    NOTE(review): this definition is shadowed by the later compareAllLemmas
    (cell In[167]), which operates on bags of terms instead — consider renaming
    one of them to avoid the silent override.
    """
    lemmasA = pd.Series(Counter([word.lemma_ for word in a]))
    lemmasB = pd.Series(Counter([word.lemma_ for word in b]))
#     df = pd.concat([lemmasA, lemmasB], axis=1).fillna(0)
    return lemmasA, lemmasB

In [91]:
sa, sb = compareAllLemmas(nonquotes, quotes8)

In [100]:
df = pd.concat([sa, sb], axis=1).fillna(0)

In [123]:
# quotesBag = quotes8.to_bag_of_terms(lemmatize=True, weighting='freq', as_strings=True)

def bagOfTerms(doc): 
    """ Returns a term-frequency bag for a textacy Doc as a pandas Series
    indexed by term string.

    NOTE(review): to_bag_of_terms with these keyword arguments is the legacy
    textacy API — confirm against the installed textacy version.
    """
    bag = doc.to_bag_of_terms(lemmatize=True, weighting='freq', as_strings=True)
    return pd.Series(bag)

In [152]:
# Wrap each corpus string in a textacy Doc (re-parses the text; slow).
# NOTE(review): textacy.Doc is the legacy constructor; newer textacy versions
# use textacy.make_spacy_doc — confirm installed version.
nonquotesDoc = textacy.Doc(nonquotes)
quotesDoc = textacy.Doc(quotes)
quotes5Doc = textacy.Doc(quotes5)
quotes8Doc = textacy.Doc(quotes8)

In [153]:
# Term-frequency bags for each corpus, one per quotation threshold.
corpusDocs = (nonquotesDoc, quotesDoc, quotes5Doc, quotes8Doc)
nonquotesBag, quotesBag, quotes5Bag, quotes8Bag = map(bagOfTerms, corpusDocs)

In [167]:
def compareAllLemmas(a, b): 
    """ Compares two bags of terms (Series of term -> frequency). Aligns them
    (terms missing from one bag count as 0), subtracts b's frequencies from
    a's, and returns the differences sorted ascending: the most b-heavy terms
    come first, the most a-heavy terms last. """
    combined = pd.concat([a, b], axis=1).fillna(0)
    delta = combined.iloc[:, 0] - combined.iloc[:, 1]
    return delta.sort_values()

In [172]:
df = compareAllLemmas(quotesBag, nonquotesBag)

In [173]:
df


Out[173]:
say             -0.003908
mr.             -0.002463
lydgate         -0.002194
fred            -0.001954
bulstrode       -0.001712
mary            -0.001311
rosamond        -0.001070
mrs.            -0.001058
celia           -0.001023
not             -0.000978
garth           -0.000973
farebrother     -0.000968
's              -0.000907
james           -0.000898
brooke          -0.000878
vincy           -0.000618
'               -0.000607
ladislaw        -0.000601
casaubon        -0.000560
raffles         -0.000553
middlemarch     -0.000552
sir             -0.000537
wish            -0.000512
speak           -0.000475
come            -0.000474
go              -0.000473
featherstone    -0.000470
shall           -0.000469
caleb           -0.000446
tell            -0.000437
                   ...   
marriage         0.000245
large            0.000263
deep             0.000265
little           0.000266
heart            0.000268
mind             0.000268
self             0.000269
struggle         0.000279
inward           0.000287
small            0.000294
people           0.000296
young            0.000304
lot              0.000308
human            0.000324
english          0.000336
sort             0.000337
history          0.000337
live             0.000369
new              0.000374
man              0.000375
nature           0.000377
light            0.000384
great            0.000391
soul             0.000397
consciousness    0.000479
world            0.000611
love             0.000640
like             0.000742
woman            0.000817
life             0.001291
dtype: float64

In [ ]: