In [42]:
import spacy
import textacy
import nltk
import numpy as np
import pandas as pd
from collections import Counter
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
In [2]:
# Read '../middlemarch.txt' into memory as one string. Later cells slice `mm`
# by offset, so the text is kept exactly as read (no stripping or
# normalisation). The context manager closes the file handle once the read
# finishes instead of leaving it open for the life of the kernel.
with open('../middlemarch.txt') as middlemarchFile:
    mm = middlemarchFile.read()
In [3]:
# Load '../txt/e2a.json' into a DataFrame. Later cells read its 'journal'
# and 'Locations in A' columns.
# NOTE: the name `df` is rebound to a different DataFrame further down (the
# Specialists/Nonspecialists table), so re-running the groupby cell after
# that point will not operate on this data.
df = pd.read_json('../txt/e2a.json')
In [4]:
# Group the rows by their 'journal' value; the groups are iterated below as
# (journal name, sub-frame) pairs.
grouped = df.groupby('journal')
In [5]:
def getText(ranges, source=None):
    """Return the slices of the source text addressed by ``ranges``.

    Parameters
    ----------
    ranges : iterable of iterables
        Each element is a collection of text ranges. A text range is an
        indexable sequence whose first two items are used as the start and
        end offsets of a slice. Ranges with fewer than two items are skipped.
    source : str, optional
        The text to slice. When omitted, the notebook-level ``mm`` string is
        used, so existing one-argument calls behave as before.

    Returns
    -------
    list of str
        One ``source[start:end]`` slice per usable range, in input order.
    """
    if source is None:
        # Fall back to the text loaded at the top of the notebook.
        source = mm
    return [
        source[textRange[0]:textRange[1]]
        for rangeSet in ranges
        for textRange in rangeSet
        if len(textRange) > 1
    ]
In [6]:
# Map each journal name to the array of its 'Locations in A' values.
journalDict = {
    name: rows['Locations in A'].values
    for name, rows in grouped
}
In [7]:
# Resolve every journal's location ranges to the list of text slices they
# address.
textDict = {
    journal: getText(locations)
    for journal, locations in journalDict.items()
}
In [8]:
# Take the specialist journal's passages out of textDict (whatever remains
# there is treated as non-specialist below), join them with spaces and
# flatten newlines to spaces.
# NOTE: pop() mutates textDict, so running this cell a second time without
# rebuilding textDict raises KeyError.
specialistText = ' '.join(
    textDict.pop('George Eliot - George Henry Lewes Studies')
).replace('\n', ' ')
In [9]:
# Join each remaining journal's passages with spaces, join the per-journal
# strings with spaces, then flatten newlines to spaces. This relies on the
# specialist journal having already been popped from textDict.
nonSpecialistText = ' '.join(map(' '.join, textDict.values())).replace('\n', ' ')
In [10]:
# Wrap both corpora in textacy Doc objects (specialist first, then
# non-specialist).
st, nst = (
    textacy.Doc(rawText)
    for rawText in (specialistText, nonSpecialistText)
)
In [11]:
# Build a bag of terms for each Doc with identical settings: terms as
# strings, weighted with textacy's 'freq' scheme.
stBag, nstBag = (
    doc.to_bag_of_terms(as_strings=True, weighting='freq')
    for doc in (st, nst)
)
In [12]:
# Convert both bags to pandas Series so they can be aligned and compared by
# term.
stArray, nstArray = pd.Series(stBag), pd.Series(nstBag)
In [59]:
# Per-term weight difference between the two groups, sorted ascending.
# Negative values are distinctive of nonspecialists.
# Positive values are distinctive of specialists.
# A term absent from one bag is treated as weight 0 on that side *before*
# subtracting. Subtracting first and filling NaN afterwards would report 0
# for every term used by only one group, hiding the most distinctive terms.
stArray.sub(nstArray, fill_value=0).sort_values()
Out[59]:
In [47]:
def POSRepresentation(text):
    """Flatten ``text.pos_tagged_text`` into one list of tags.

    ``text.pos_tagged_text`` is iterated as groups of pairs; element ``[1]``
    of every pair is collected, in order.
    """
    tags = []
    for group in text.pos_tagged_text:
        tags.extend(pair[1] for pair in group)
    return tags
def tagRepresentation(text):
    """Return the ``tag_`` attribute of every item in ``text.spacy_doc``, in order."""
    tags = []
    for token in text.spacy_doc:
        tags.append(token.tag_)
    return tags
def POSStats(text):
    """Return the relative frequency of each tag in ``text``.

    The tag sequence comes from ``tagRepresentation`` (``POSRepresentation``
    is the alternative sequence defined alongside it). The result is a pandas
    Series indexed by tag, holding that tag's count divided by the total
    number of tags.
    """
    tags = pd.Series(tagRepresentation(text))
    return tags.value_counts() / len(tags)
# Tag-frequency profiles for the specialist and non-specialist corpora.
stPOS, nstPOS = POSStats(st), POSStats(nst)
In [53]:
# One row per tag, one column per group. A tag that never occurs in one of
# the groups has frequency 0 there, so fill the alignment gaps with 0 rather
# than leaving NaN (which would propagate into any later column arithmetic).
# NOTE: this rebinds `df`, which earlier held the data loaded from JSON.
df = pd.DataFrame(
    [stPOS, nstPOS],
    index=['Specialists', 'Nonspecialists'],
).T.fillna(0)
In [54]:
# Side-by-side bars of each tag's relative frequency in the two groups.
df.plot.bar(figsize=(16, 6))
Out[54]:
In [58]:
# Per-tag frequency difference (Specialists minus Nonspecialists); positive
# bars mark tags relatively more frequent among specialists. A tag missing
# from one column is treated as frequency 0 so its bar shows the full
# difference instead of disappearing as NaN.
df['Specialists'].sub(df['Nonspecialists'], fill_value=0).plot(kind='bar', figsize=(16, 6))
Out[58]:
In [ ]: