Analysis of quoted language, v2

How is quoted language different form nonquoted language?


In [4]:
from collections import Counter
import pandas as pd
import numpy as np
%matplotlib inline
from pylab import rcParams
import textacy
rcParams['figure.figsize'] = 10, 4
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [5]:
import spacy
nlp = spacy.load('en')

From the Penn Treebank table: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

  1. VB Verb, base form
  2. VBD Verb, past tense
  3. VBG Verb, gerund or present participle
  4. VBN Verb, past participle
  5. VBP Verb, non-3rd person singular present
  6. VBZ Verb, 3rd person singular present

In [6]:
tagDict = {"CC": "Coordinating conjunction",
"DT": "Determiner",
"EX": "Existential there",
"IN": "Preposition or subordinating conjunction",
"JJ": "Adjective",
"JJR": "Adjective, comparative",
"JJS": "Adjective, superlative",
"MD": "Modal",
"NN": "Noun, singular or mass",
"NNS": "Noun, plural",
"NNP": "Proper noun, singular",
"NNPS": "Proper noun, plural",
"PDT": "Predeterminer",
"POS": "Possessive ending",
"PRP": "Personal pronoun",
"PRP$": "Possessive pronoun",
"RB": "Adverb",
"RBR": "Adverb, comparative",
"RBS": "Adverb, superlative",
"RP": "Particle",
"TO": "to",
"UH": "Interjection",
"VB": "Verb, base form",
"VBD": "Verb, past tense",
"VBG": "Verb, gerund or present participle",
"VBN": "Verb, past participle",
"VBP": "Verb, non-3rd person singular present",
"VBZ": "Verb, 3rd person singular present",
"WDT": "Wh-determiner",
"WP": "Wh-pronoun",
"WP$": "Possessive wh-pronoun",
"WRB": "Wh-adverb"}

In [7]:
jstorDF = pd.read_json('../txt/e2a.json')
bpoDF = pd.read_json('../txt/e4.json')

In [8]:
mm = open('../middlemarch.txt').read()

In [9]:
mmLength = len(mm)

In [10]:
def tallyQuotes(df, textLength):
    """ Given a DataFrame containing matched Locations in A, 
    i.e. character offsets of quotes, tally these for each character in the text. """
    locs = df['Locations in A'].values
    tally = np.zeros(textLength) # Create a blank tally. 
    for locSet in locs: 
        for loc in locSet: 
            for i in range(loc[0], loc[1]+1): 
                tally[i] += 1
    return tally

In [11]:
jstorTally = tallyQuotes(jstorDF, mmLength)
bpoTally = tallyQuotes(bpoDF, mmLength)

In [12]:
jstorTally.max(), bpoTally.max()


Out[12]:
(30.0, 30.0)

In [13]:
def getText(tally): 
    """ Gets segements from Middlemarch from the tally. """
    text = ""
    for i in range(len(tally)): 
        if tally[i] - tally[i-1] == 1: # We're on a roll
            text += (mm[tally[i]])
        else: 
            text += (' ' + mm[tally[i]]) # Put spaces between quotes
    return text

In [14]:
def segment(tally, cutoff=4): 
    """ Divides a tally into three parts: nonquotes,  
    moderaly quoted passages, and highly quoted passages. 
    Returns a list of three SpaCy docs. """
    nonQuotedIndices = np.where(tally == 0)[0]
    quotedIndices = np.where((tally > 0) & (jstorTally < cutoff))[0]
    highlyQuotedIndices = np.where(tally >= cutoff)[0]
    texts = [getText(text) for text in [nonQuotedIndices, quotedIndices, highlyQuotedIndices]]
    docs = [nlp(text) for text in texts]
    return docs

In [15]:
def POSSignature(doc): 
    """ Gets the POS proportions for a document. """
    tags = [w.tag_ for w in doc]
    count = pd.Series(Counter(tags))/len(doc)
    return count

In [16]:
jstorDocs = segment(jstorTally)
bpoDocs = segment(bpoTally)

In [17]:
# Wordcounts for nonquotes, moderately quoted passages, and highly quoted passages
# for JSTOR and BPO docs. 
[len(group) for group in jstorDocs], [len(group) for group in bpoDocs]


Out[17]:
([352071, 54828, 6671], [345468, 61838, 7842])

In [18]:
jstorPOS = [POSSignature(doc) for doc in jstorDocs]
bpoPOS = [POSSignature(doc) for doc in bpoDocs]

In [19]:
labels = ['JSTOR-Nonquotes', 'JSTOR-Quotes', 'JSTOR-FreqQuotes', 'BPO-Nonquotes', 'BPO-Quotes', 'BPO-FreqQuotes']
posDF = pd.DataFrame(jstorPOS + bpoPOS, 
             index=labels).fillna(0)

In [20]:
posDF.T.plot(kind='bar', figsize=(16,6))


Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f27da8142e8>

In [21]:
tagList = ['IN', 'JJ', 'JJR', 'JJS', 'NN', 'NNPS', 'NNS', 'POS', 'PRP', 'WP$']

In [22]:
posDF[tagList].T.plot(kind='bar', figsize=(16,6))


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f27da6c30f0>

In [23]:
posDF[['JJR', 'JJS']].T.plot(kind='bar', figsize=(16,6))


Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f27335c2898>

In [24]:
(posDF.loc['JSTOR-Quotes'] - posDF.loc['JSTOR-Nonquotes']).plot(kind='bar', figsize=(16,6))


Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f273334a588>

In [28]:
ax = (posDF.loc['JSTOR-Quotes'] - posDF.loc['JSTOR-Nonquotes']).plot(kind='bar', figsize=(16,6))
fig = ax.get_figure()
fig.savefig('pos-tags.png', bboxinches='tight', dpi=300)



In [138]:
(posDF.loc['JSTOR-FreqQuotes'] - posDF.loc['JSTOR-Nonquotes']).plot(kind='bar', figsize=(16,6))


Out[138]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fcc84ab8898>

In [139]:
(posDF.loc['BPO-Quotes'] - posDF.loc['BPO-Nonquotes']).plot(kind='bar', figsize=(16,6))


Out[139]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fcc847632b0>

In [140]:
(posDF.loc['BPO-FreqQuotes'] - posDF.loc['BPO-Nonquotes']).plot(kind='bar', figsize=(16,6))


Out[140]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fcc845cdd30>

MDWs


In [168]:
allDocs = jstorDocs + bpoDocs

In [182]:
def bagOfTerms(text): 
    doc = textacy.Doc(text)
    bag = doc.to_bag_of_terms(as_strings=True, lemmatize=True, weighting='freq')
    return pd.Series(bag)

In [184]:
docTerms = [bagOfTerms(doc) for doc in allDocs]

In [188]:
df = pd.DataFrame(docTerms, index=labels).fillna(0)

In [193]:
(df.loc['JSTOR-Quotes'] - df.loc['JSTOR-Nonquotes']).sort_values()


Out[193]:
say             -0.003610
mr.             -0.002072
fred            -0.001886
bulstrode       -0.001726
lydgate         -0.001587
mary            -0.001194
garth           -0.001148
celia           -0.000989
james           -0.000984
not             -0.000958
mrs.            -0.000949
farebrother     -0.000755
vincy           -0.000669
rosamond        -0.000663
brooke          -0.000621
'               -0.000612
sir             -0.000537
raffles         -0.000531
wish            -0.000521
go              -0.000509
tell            -0.000486
ladislaw        -0.000475
come            -0.000475
shall           -0.000454
caleb           -0.000453
hear            -0.000418
sir james       -0.000413
middlemarch     -0.000409
lowick          -0.000394
speak           -0.000392
                   ...   
fall             0.000259
face             0.000260
paris            0.000260
struggle         0.000263
lie              0.000264
desire           0.000265
english          0.000267
touch            0.000275
lot              0.000276
human            0.000277
inward           0.000279
feel             0.000281
see              0.000291
self             0.000292
mind             0.000295
high             0.000298
history          0.000301
nature           0.000307
live             0.000311
deep             0.000311
sort             0.000345
little           0.000351
consciousness    0.000363
soul             0.000388
world            0.000488
love             0.000606
dorothea         0.000667
woman            0.000780
like             0.000791
life             0.001161
dtype: float64

In [194]:
(df.loc['JSTOR-FreqQuotes'] - df.loc['JSTOR-Nonquotes']).sort_values()


Out[194]:
say             -0.006397
lydgate         -0.004499
mr.             -0.003395
bulstrode       -0.002948
fred            -0.002781
rosamond        -0.002616
mary            -0.001792
not             -0.001615
know            -0.001411
's              -0.001404
mrs.            -0.001375
james           -0.001368
garth           -0.001308
go              -0.001281
farebrother     -0.001211
good            -0.001175
middlemarch     -0.001157
come            -0.001084
'               -0.000995
sir             -0.000938
vincy           -0.000880
ladislaw        -0.000831
tell            -0.000820
speak           -0.000749
shall           -0.000706
mean            -0.000706
think           -0.000696
caleb           -0.000686
dorothea        -0.000682
say mr.         -0.000679
                   ...   
human            0.000663
emotion          0.000666
thought          0.000673
rome             0.000679
struggle         0.000685
ardent           0.000700
marriage         0.000704
small            0.000727
form             0.000733
action           0.000759
certain          0.000770
large            0.000770
heart            0.000787
girl             0.000790
act              0.000801
sort             0.000803
christian        0.000833
lie              0.000900
theresa          0.000901
long             0.000944
knowledge        0.000971
eye              0.001024
live             0.001042
consciousness    0.001054
great            0.001080
new              0.001134
woman            0.001154
world            0.001883
light            0.002183
life             0.002417
dtype: float64

In [195]:
(df.loc['BPO-Quotes'] - df.loc['BPO-Nonquotes']).sort_values()


Out[195]:
mr.             -0.003859
's              -0.002553
fred            -0.002487
say             -0.002419
casaubon        -0.002100
mrs.            -0.001329
garth           -0.001287
james           -0.001282
not             -0.001220
brooke          -0.001156
farebrother     -0.001091
bulstrode       -0.001026
vincy           -0.001010
lydgate         -0.001003
dorothea        -0.000960
ladislaw        -0.000921
caleb           -0.000802
mary            -0.000735
rosamond        -0.000662
cadwallader     -0.000645
lowick          -0.000640
mr. casaubon    -0.000631
think           -0.000625
wish            -0.000616
sir james       -0.000538
come            -0.000513
thing           -0.000506
mr. brooke      -0.000497
say mr.         -0.000492
father          -0.000485
                   ...   
d                0.000333
wake             0.000333
nature           0.000333
labor            0.000338
year             0.000339
wilberforce      0.000343
death            0.000346
hard             0.000359
condition        0.000369
fall             0.000377
woman            0.000390
cry              0.000402
marriage         0.000402
hand             0.000403
belief           0.000432
laure            0.000445
consciousness    0.000455
prayer           0.000496
s                0.000516
abel             0.000519
soul             0.000532
key              0.000582
sit              0.000583
time             0.000596
live             0.000600
                 0.000615
old              0.000616
paris            0.000625
e                0.001433
life             0.001534
dtype: float64

In [196]:
(df.loc['BPO-FreqQuotes'] - df.loc['BPO-Nonquotes']).sort_values()


Out[196]:
say                      -0.006101
mr.                      -0.004911
's                       -0.004774
lydgate                  -0.004568
dorothea                 -0.004117
casaubon                 -0.003035
bulstrode                -0.002721
know                     -0.002612
fred                     -0.002604
rosamond                 -0.002533
good                     -0.002381
think                    -0.002320
come                     -0.002091
like                     -0.001924
brooke                   -0.001916
look                     -0.001832
mary                     -0.001787
mrs.                     -0.001739
go                       -0.001650
little                   -0.001528
not                      -0.001513
garth                    -0.001345
thing                    -0.001325
celia                    -0.001302
james                    -0.001282
tell                     -0.001203
farebrother              -0.001150
vincy                    -0.001127
middlemarch              -0.001102
ladislaw                 -0.001096
                            ...   
life of mistake           0.001684
certain spiritual         0.001684
light and tangled         0.001684
fashion the nature        0.001684
three hundred year ago    0.001684
level of feminine         0.001684
world                     0.002913
bear                      0.003040
order                     0.003068
tipton                    0.003116
nature                    0.003116
self                      0.003123
power                     0.003134
desire                    0.003172
common                    0.003263
include                   0.003304
ideal                     0.003314
passionate                0.003321
agreement                 0.003347
theresa                   0.003357
epic                      0.003362
spanish                   0.003362
indefiniteness            0.003364
epic life                 0.003367
woman                     0.004446
light                     0.004629
social                    0.004982
theresas                  0.005051
belief                    0.006640
life                      0.009238
dtype: float64

In [ ]: