In [4]:
    
from collections import Counter
import pandas as pd
import numpy as np
%matplotlib inline
from pylab import rcParams
import textacy
rcParams['figure.figsize'] = 10, 4
import matplotlib.pyplot as plt
plt.style.use('ggplot')
    
In [5]:
    
import spacy
# Load the English pipeline. NOTE(review): the 'en' shortcut only works on
# older spaCy versions with a linked model — newer releases need an explicit
# package name such as 'en_core_web_sm'; confirm the environment's spaCy version.
nlp = spacy.load('en')
    
From the Penn Treebank table: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
In [6]:
    
# Map from Penn Treebank POS tag codes to human-readable descriptions,
# transcribed from the tagset table linked above. Used to interpret the
# .tag_ values that spaCy assigns (see POSSignature below).
tagDict = {"CC": "Coordinating conjunction",
"DT": "Determiner",
"EX": "Existential there",
"IN": "Preposition or subordinating conjunction",
"JJ": "Adjective",
"JJR": "Adjective, comparative",
"JJS": "Adjective, superlative",
"MD": "Modal",
"NN": "Noun, singular or mass",
"NNS": "Noun, plural",
"NNP": "Proper noun, singular",
"NNPS": "Proper noun, plural",
"PDT": "Predeterminer",
"POS": "Possessive ending",
"PRP": "Personal pronoun",
"PRP$": "Possessive pronoun",
"RB": "Adverb",
"RBR": "Adverb, comparative",
"RBS": "Adverb, superlative",
"RP": "Particle",
"TO": "to",
"UH": "Interjection",
"VB": "Verb, base form",
"VBD": "Verb, past tense",
"VBG": "Verb, gerund or present participle",
"VBN": "Verb, past participle",
"VBP": "Verb, non-3rd person singular present",
"VBZ": "Verb, 3rd person singular present",
"WDT": "Wh-determiner",
"WP": "Wh-pronoun",
"WP$": "Possessive wh-pronoun",
"WRB": "Wh-adverb"}
    
In [7]:
    
# Matched-quotation data for two corpora: JSTOR articles and (presumably)
# British Periodicals Online — verify what e2a/e4 denote against the pipeline
# that produced these JSON files. Each frame has a 'Locations in A' column of
# character-offset spans (consumed by tallyQuotes below).
jstorDF = pd.read_json('../txt/e2a.json')
bpoDF = pd.read_json('../txt/e4.json')
    
In [8]:
    
# Read the full text of Middlemarch. The original left the file handle open
# (open(...).read() with no close); the with-block closes it deterministically.
with open('../middlemarch.txt') as f:
    mm = f.read()
    
In [9]:
    
# Character count of the novel; used to size the per-character tally arrays.
mmLength = len(mm)
    
In [10]:
    
def tallyQuotes(df, textLength):
    """ Given a DataFrame containing matched Locations in A,
    i.e. character offsets of quotes, tally these for each character in the text.

    Parameters:
        df: DataFrame with a 'Locations in A' column, where each entry is a
            list of [start, end] character-offset pairs (end is inclusive).
        textLength: length of the target text in characters.

    Returns:
        A numpy float array of length textLength whose i-th entry is the
        number of quote spans covering character i.
    """
    tally = np.zeros(textLength)  # One counter per character of the text.
    for locSet in df['Locations in A'].values:
        for loc in locSet:
            # Vectorized slice increment replaces the original per-character
            # Python loop; end offset is inclusive, hence loc[1] + 1.
            # (A slice also clips at textLength instead of raising IndexError
            # on a span that runs past the end of the text.)
            tally[loc[0]:loc[1] + 1] += 1
    return tally
    
In [11]:
    
# Per-character quote-coverage counts for each corpus.
jstorTally = tallyQuotes(jstorDF, mmLength)
bpoTally = tallyQuotes(bpoDF, mmLength)
    
In [12]:
    
# Peak number of overlapping quotes on any single character, per corpus.
jstorTally.max(), bpoTally.max()
    
    Out[12]:
In [13]:
    
def getText(tally, text=None):
    """ Gets segments from Middlemarch from the tally.

    `tally` is (despite the name) an array of character offsets into the text,
    as produced by np.where() in segment(). Characters at contiguous offsets
    are joined into unbroken runs; a single space separates non-contiguous
    runs so distinct passages don't fuse into one token.

    `text` defaults to the module-level `mm` (the full novel) for backward
    compatibility with existing callers, but may be passed explicitly.

    BUG FIX: the original compared the first offset against tally[-1]
    (Python wrap-around indexing at i=0), which almost always prepended a
    spurious leading space; the first character is now emitted without one.
    Also handles an empty offset array (returns "").
    """
    if text is None:
        text = mm
    pieces = []
    prev = None
    for idx in tally:
        if prev is not None and idx - prev != 1:
            pieces.append(' ')  # Gap between separate quoted passages.
        pieces.append(text[idx])
        prev = idx
    return ''.join(pieces)
    
In [14]:
    
def segment(tally, cutoff=4):
    """ Divides a tally into three parts: nonquotes,
    moderately quoted passages, and highly quoted passages.
    Returns a list of three SpaCy docs.

    `tally` is a per-character quote count (see tallyQuotes); characters with
    a count of at least `cutoff` are treated as "highly quoted".
    """
    nonQuotedIndices = np.where(tally == 0)[0]
    # BUG FIX: the original tested the global `jstorTally < cutoff` here
    # regardless of which tally was passed in, so segmenting bpoTally silently
    # mixed in JSTOR data. Compare against the argument instead.
    quotedIndices = np.where((tally > 0) & (tally < cutoff))[0]
    highlyQuotedIndices = np.where(tally >= cutoff)[0]
    # Each index array is turned back into text, then parsed by spaCy.
    texts = [getText(indices) for indices in [nonQuotedIndices, quotedIndices, highlyQuotedIndices]]
    docs = [nlp(text) for text in texts]
    return docs
    
In [15]:
    
def POSSignature(doc):
    """ Gets the POS proportions for a document.

    Counts each token's fine-grained Penn Treebank tag (.tag_) and divides by
    the total token count, returning a pandas Series of proportions keyed by tag.
    """
    tagCounts = Counter(token.tag_ for token in doc)
    return pd.Series(tagCounts) / len(doc)
    
In [16]:
    
# Segment each corpus into [nonquoted, quoted, highly quoted] SpaCy docs.
jstorDocs = segment(jstorTally)
bpoDocs = segment(bpoTally)
    
In [17]:
    
# Token counts for nonquotes, moderately quoted passages, and highly quoted
# passages, for the JSTOR and BPO docs (len() of a SpaCy doc counts tokens).
[len(group) for group in jstorDocs], [len(group) for group in bpoDocs]
    
    Out[17]:
In [18]:
    
# POS-tag proportion signatures for each of the six segments.
jstorPOS = [POSSignature(doc) for doc in jstorDocs]
bpoPOS = [POSSignature(doc) for doc in bpoDocs]
    
In [19]:
    
# One row per segment; tags absent from a segment become 0 rather than NaN.
labels = ['JSTOR-Nonquotes', 'JSTOR-Quotes', 'JSTOR-FreqQuotes', 'BPO-Nonquotes', 'BPO-Quotes', 'BPO-FreqQuotes']
posDF = pd.DataFrame(jstorPOS + bpoPOS, 
             index=labels).fillna(0)
    
In [20]:
    
# Compare POS-tag proportions across all six segments.
posDF.T.plot(kind='bar', figsize=(16,6))
    
    Out[20]:
    
In [21]:
    
# Subset of tags to examine more closely (see tagDict for expansions).
tagList = ['IN', 'JJ', 'JJR', 'JJS', 'NN', 'NNPS', 'NNS', 'POS', 'PRP', 'WP$']
    
In [22]:
    
# Zoom in on the selected tags only.
posDF[tagList].T.plot(kind='bar', figsize=(16,6))
    
    Out[22]:
    
In [23]:
    
# Comparative (JJR) vs. superlative (JJS) adjective proportions per segment.
posDF[['JJR', 'JJS']].T.plot(kind='bar', figsize=(16,6))
    
    Out[23]:
    
In [24]:
    
# Per-tag difference: JSTOR quoted passages minus nonquoted passages.
(posDF.loc['JSTOR-Quotes'] - posDF.loc['JSTOR-Nonquotes']).plot(kind='bar', figsize=(16,6))
    
    Out[24]:
    
In [28]:
    
# Same JSTOR quotes-vs-nonquotes delta as above, saved to disk for publication.
ax = (posDF.loc['JSTOR-Quotes'] - posDF.loc['JSTOR-Nonquotes']).plot(kind='bar', figsize=(16,6))
fig = ax.get_figure()
# BUG FIX: the keyword is bbox_inches, not bboxinches — the typo meant the
# tight bounding box was never applied to the saved figure.
fig.savefig('pos-tags.png', bbox_inches='tight', dpi=300)
    
    
In [138]:
    
# Per-tag difference: JSTOR highly quoted passages minus nonquoted passages.
(posDF.loc['JSTOR-FreqQuotes'] - posDF.loc['JSTOR-Nonquotes']).plot(kind='bar', figsize=(16,6))
    
    Out[138]:
    
In [139]:
    
# Per-tag difference: BPO quoted passages minus nonquoted passages.
(posDF.loc['BPO-Quotes'] - posDF.loc['BPO-Nonquotes']).plot(kind='bar', figsize=(16,6))
    
    Out[139]:
    
In [140]:
    
# Per-tag difference: BPO highly quoted passages minus nonquoted passages.
(posDF.loc['BPO-FreqQuotes'] - posDF.loc['BPO-Nonquotes']).plot(kind='bar', figsize=(16,6))
    
    Out[140]:
    
In [168]:
    
# All six SpaCy docs, in the same order as `labels`.
allDocs = jstorDocs + bpoDocs
    
In [182]:
    
def bagOfTerms(text):
    """ Compute lemmatized term frequencies for a text with textacy and
    return them as a pandas Series keyed by term string. """
    termWeights = textacy.Doc(text).to_bag_of_terms(
        as_strings=True, lemmatize=True, weighting='freq')
    return pd.Series(termWeights)
    
In [184]:
    
# Bag-of-terms frequency Series for each of the six segments.
docTerms = [bagOfTerms(doc) for doc in allDocs]
    
In [188]:
    
# Term-frequency matrix: one row per segment, one column per term (0 if absent).
df = pd.DataFrame(docTerms, index=labels).fillna(0)
    
In [193]:
    
# Terms most over-/under-represented in JSTOR quotes vs. nonquotes.
(df.loc['JSTOR-Quotes'] - df.loc['JSTOR-Nonquotes']).sort_values()
    
    Out[193]:
In [194]:
    
# Terms most over-/under-represented in highly quoted JSTOR passages vs. nonquotes.
(df.loc['JSTOR-FreqQuotes'] - df.loc['JSTOR-Nonquotes']).sort_values()
    
    Out[194]:
In [195]:
    
# Terms most over-/under-represented in BPO quotes vs. nonquotes.
(df.loc['BPO-Quotes'] - df.loc['BPO-Nonquotes']).sort_values()
    
    Out[195]:
In [196]:
    
# Terms most over-/under-represented in highly quoted BPO passages vs. nonquotes.
(df.loc['BPO-FreqQuotes'] - df.loc['BPO-Nonquotes']).sort_values()
    
    Out[196]:
In [ ]: