In [4]:
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import textacy

%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = 10, 4
In [5]:
import spacy
nlp = spacy.load('en')
Tag descriptions are taken from the Penn Treebank POS table: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
In [6]:
tagDict = {"CC": "Coordinating conjunction",
"DT": "Determiner",
"EX": "Existential there",
"IN": "Preposition or subordinating conjunction",
"JJ": "Adjective",
"JJR": "Adjective, comparative",
"JJS": "Adjective, superlative",
"MD": "Modal",
"NN": "Noun, singular or mass",
"NNS": "Noun, plural",
"NNP": "Proper noun, singular",
"NNPS": "Proper noun, plural",
"PDT": "Predeterminer",
"POS": "Possessive ending",
"PRP": "Personal pronoun",
"PRP$": "Possessive pronoun",
"RB": "Adverb",
"RBR": "Adverb, comparative",
"RBS": "Adverb, superlative",
"RP": "Particle",
"TO": "to",
"UH": "Interjection",
"VB": "Verb, base form",
"VBD": "Verb, past tense",
"VBG": "Verb, gerund or present participle",
"VBN": "Verb, past participle",
"VBP": "Verb, non-3rd person singular present",
"VBZ": "Verb, 3rd person singular present",
"WDT": "Wh-determiner",
"WP": "Wh-pronoun",
"WP$": "Possessive wh-pronoun",
"WRB": "Wh-adverb"}
In [7]:
jstorDF = pd.read_json('../txt/e2a.json')
bpoDF = pd.read_json('../txt/e4.json')
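Everything below assumes that each row's 'Locations in A' field holds a list of [start, end] character-offset pairs into Middlemarch. A quick peek to confirm that assumed structure (a sketch; the column name comes from the matching step upstream):
In [ ]:
# Inspect a few rows of the assumed offset-pair structure.
jstorDF['Locations in A'].head()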
In [8]:
with open('../middlemarch.txt') as f:
    mm = f.read()
In [9]:
mmLength = len(mm)
In [10]:
def tallyQuotes(df, textLength):
    """ Given a DataFrame whose 'Locations in A' column holds the
    character offsets of matched quotes, tally the number of quotes
    covering each character position in the text. """
    locs = df['Locations in A'].values
    tally = np.zeros(textLength)  # One counter per character in the text.
    for locSet in locs:
        for loc in locSet:
            for i in range(loc[0], loc[1]+1):  # Offsets are treated as inclusive.
                tally[i] += 1
    return tally
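A toy example may make the tally logic concrete: two overlapping quotes should yield a count of 2 exactly where they overlap. (A sketch with made-up offsets, not real data.)
In [ ]:
# Two hypothetical "articles": one quoting characters 0-4, one quoting 3-6.
toyDF = pd.DataFrame({'Locations in A': [[[0, 4]], [[3, 6]]]})
tallyQuotes(toyDF, 10)  # Expect counts of 2 at offsets 3-4, 1 elsewhere in range, 0 beyond.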
In [11]:
jstorTally = tallyQuotes(jstorDF, mmLength)
bpoTally = tallyQuotes(bpoDF, mmLength)
In [12]:
jstorTally.max(), bpoTally.max()
Out[12]:
In [13]:
def getText(indices):
    """ Reconstructs the segments of Middlemarch at the given character
    indices, inserting a space wherever the indices jump, i.e., between
    separate quoted passages. """
    text = ""
    for i in range(len(indices)):
        if i > 0 and indices[i] - indices[i-1] == 1:  # Contiguous with the previous character.
            text += mm[indices[i]]
        else:
            text += ' ' + mm[indices[i]]  # Put a space between segments.
    return text
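And a quick check of getText on hand-picked indices: contiguous runs come back as uninterrupted text, and each jump in the indices becomes a single space. (A sketch; any index array into the novel would do.)
In [ ]:
# Characters 0-3 of the novel, then a jump to characters 10-12:
# the two runs should come back separated by a single space.
getText(np.array([0, 1, 2, 3, 10, 11, 12]))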
In [14]:
def segment(tally, cutoff=4):
    """ Divides a tally into three parts: nonquotes, moderately quoted
    passages (quoted fewer than `cutoff` times), and highly quoted
    passages (quoted at least `cutoff` times).
    Returns a list of three SpaCy docs. """
    nonQuotedIndices = np.where(tally == 0)[0]
    quotedIndices = np.where((tally > 0) & (tally < cutoff))[0]
    highlyQuotedIndices = np.where(tally >= cutoff)[0]
    texts = [getText(indices) for indices in [nonQuotedIndices, quotedIndices, highlyQuotedIndices]]
    docs = [nlp(text) for text in texts]
    return docs
In [15]:
def POSSignature(doc):
    """ Gets the POS proportions for a document. """
    tags = [w.tag_ for w in doc]
    count = pd.Series(Counter(tags)) / len(doc)
    return count
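A quick illustration on a throwaway sentence (a sketch; the exact tags and proportions depend on the loaded spaCy model):
In [ ]:
# POS tag proportions for a short example sentence.
POSSignature(nlp('Dorothea read the letter slowly.'))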
In [16]:
jstorDocs = segment(jstorTally)
bpoDocs = segment(bpoTally)
In [17]:
# Token counts for nonquotes, moderately quoted passages, and highly quoted passages
# in the JSTOR and BPO docs.
[len(group) for group in jstorDocs], [len(group) for group in bpoDocs]
Out[17]:
In [18]:
jstorPOS = [POSSignature(doc) for doc in jstorDocs]
bpoPOS = [POSSignature(doc) for doc in bpoDocs]
In [19]:
labels = ['JSTOR-Nonquotes', 'JSTOR-Quotes', 'JSTOR-FreqQuotes', 'BPO-Nonquotes', 'BPO-Quotes', 'BPO-FreqQuotes']
posDF = pd.DataFrame(jstorPOS + bpoPOS,
index=labels).fillna(0)
In [20]:
posDF.T.plot(kind='bar', figsize=(16,6))
Out[20]:
In [21]:
tagList = ['IN', 'JJ', 'JJR', 'JJS', 'NN', 'NNPS', 'NNS', 'POS', 'PRP', 'WP$']
In [22]:
posDF[tagList].T.plot(kind='bar', figsize=(16,6))
Out[22]:
In [23]:
posDF[['JJR', 'JJS']].T.plot(kind='bar', figsize=(16,6))
Out[23]:
In [24]:
(posDF.loc['JSTOR-Quotes'] - posDF.loc['JSTOR-Nonquotes']).plot(kind='bar', figsize=(16,6))
Out[24]:
In [28]:
ax = (posDF.loc['JSTOR-Quotes'] - posDF.loc['JSTOR-Nonquotes']).plot(kind='bar', figsize=(16,6))
fig = ax.get_figure()
fig.savefig('pos-tags.png', bbox_inches='tight', dpi=300)
In [138]:
(posDF.loc['JSTOR-FreqQuotes'] - posDF.loc['JSTOR-Nonquotes']).plot(kind='bar', figsize=(16,6))
Out[138]:
In [139]:
(posDF.loc['BPO-Quotes'] - posDF.loc['BPO-Nonquotes']).plot(kind='bar', figsize=(16,6))
Out[139]:
In [140]:
(posDF.loc['BPO-FreqQuotes'] - posDF.loc['BPO-Nonquotes']).plot(kind='bar', figsize=(16,6))
Out[140]:
In [168]:
allDocs = jstorDocs + bpoDocs
In [182]:
def bagOfTerms(doc):
    """ Computes normalized term (lemma) frequencies for a document. """
    tdoc = textacy.Doc(doc)
    bag = tdoc.to_bag_of_terms(as_strings=True, lemmatize=True, weighting='freq')
    return pd.Series(bag)
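The textacy.Doc wrapper and its weighting argument belong to an older textacy API. In case that API is unavailable, a rough stand-in can be built from plain spaCy and Counter; the hypothetical bagOfLemmas below counts normalized unigram lemma frequencies only, which may differ slightly from textacy's broader notion of "terms" (which can include ngrams and entities):
In [ ]:
# Rough stand-in for to_bag_of_terms: normalized unigram lemma frequencies.
def bagOfLemmas(doc):
    lemmas = [w.lemma_ for w in doc if not w.is_punct and not w.is_space]
    counts = Counter(lemmas)
    total = sum(counts.values())
    return pd.Series({term: n / total for term, n in counts.items()})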
In [184]:
docTerms = [bagOfTerms(doc) for doc in allDocs]
In [188]:
df = pd.DataFrame(docTerms, index=labels).fillna(0)
In [193]:
(df.loc['JSTOR-Quotes'] - df.loc['JSTOR-Nonquotes']).sort_values()
Out[193]:
In [194]:
(df.loc['JSTOR-FreqQuotes'] - df.loc['JSTOR-Nonquotes']).sort_values()
Out[194]:
In [195]:
(df.loc['BPO-Quotes'] - df.loc['BPO-Nonquotes']).sort_values()
Out[195]:
In [196]:
(df.loc['BPO-FreqQuotes'] - df.loc['BPO-Nonquotes']).sort_values()
Out[196]:
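Since sort_values sorts ascending, the head of each difference series holds the terms most characteristic of nonquotes and the tail the terms most characteristic of quoted passages; head and tail pull out just those extremes (a sketch):
In [ ]:
diff = (df.loc['BPO-FreqQuotes'] - df.loc['BPO-Nonquotes']).sort_values()
diff.head(10), diff.tail(10)  # Most nonquote-leaning and most quote-leaning terms.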