In [1]:
import spacy
import textacy
import pandas as pd
import re
from collections import Counter
In [2]:
# Read the whole novel into a single string.
# The encoding is pinned because this notebook indexes `mm` by character
# offset (regex match spans and the offsets loaded from JSON); a
# locale-dependent default decoding could shift those offsets between machines.
# NOTE(review): assumes the text file is UTF-8 — confirm against the source file.
with open('../middlemarch.txt', encoding='utf-8') as f:
    mm = f.read()
In [3]:
# Find every stretch of quoted speech in the novel. A match opens with a
# double quote immediately followed by an ASCII letter and runs lazily to the
# next double quote or to a blank line (two consecutive newlines), whichever
# comes first. re.DOTALL lets `.` cross line breaks.
dialogueObjs = [
    match
    for match in re.finditer('"([A-Za-z].*?("|\n\n))', mm, re.DOTALL)
]
In [4]:
# Text of each whole match (opening quote mark included), in document order.
dialogue = [match.group() for match in dialogueObjs]
In [8]:
# Total number of characters covered by dialogue matches; the length of each
# match is the distance between its end and start offsets.
totalDialogueChars = sum(match.end() - match.start() for match in dialogueObjs)
totalDialogueChars
Out[8]:
In [9]:
# Length of the full novel text in characters; used as the denominator when
# computing what share of the novel is dialogue.
totalTextChars = len(mm)
totalTextChars
Out[9]:
In [27]:
# Share of the novel's characters that fall inside matched dialogue, expressed
# as a percentage (character count ratio, not word or sentence count).
percentDialogue = (totalDialogueChars / totalTextChars) * 100
percentDialogue
Out[27]:
In [11]:
# (start, end) character offsets of every dialogue match within `mm`.
dialogueLocs = [(match.start(), match.end()) for match in dialogueObjs]
In [13]:
# Load the JSON file into a DataFrame. Its 'Locations in A' column is read
# further down and treated as character offsets into `mm`.
# NOTE(review): presumably one row per quoting document — confirm against the data.
df = pd.read_json('../txt/e4.json')
In [14]:
# Extract the 'Locations in A' column as an array. The loop that consumes it
# iterates each entry as a collection of locations and indexes every location
# with [0] and [1], using them as slice bounds into `mm`.
allLocs = df['Locations in A'].values
In [15]:
# Expand each dialogue span into the individual character offsets it covers
# (one list per span), then flatten those into a single list holding every
# offset at which dialogue occurs.
dialogueListList = [list(range(start, end)) for start, end in dialogueLocs]
dialogueList = [offset for span in dialogueListList for offset in span]
In [16]:
# Flag each critical quotation by whether its start or its end offset lands
# on a character offset that belongs to dialogue.
#
# `dialogueList` holds one entry per dialogue character, so testing `in`
# against the list rescans it for every quotation. Build a set once and test
# membership against that instead; the flags produced are the same.
dialogueOffsets = set(dialogueList)

inDialogue = []      # one 1/0 flag per quotation location, in iteration order
dialogueQuotes = []  # text of each quotation flagged as dialogue
for locSet in allLocs:
    for loc in locSet:
        # NOTE(review): loc[1] is used as an exclusive slice end just below, so
        # a quotation ending exactly where a dialogue span begins is also
        # flagged here — confirm whether that is intended.
        if loc[0] in dialogueOffsets or loc[1] in dialogueOffsets:
            inDialogue.append(1)
            dialogueQuotes.append(mm[loc[0]:loc[1]])
        else:
            inDialogue.append(0)
In [17]:
# Tally the flags: key 1 counts quotations that touch dialogue, key 0 counts
# those that do not.
dialogueCount = Counter()
dialogueCount.update(inDialogue)
dialogueCount
Out[17]:
In [18]:
# Number of quotation locations examined: the non-dialogue (0) and dialogue (1)
# tallies added together.
totalQuotations = sum(dialogueCount[flag] for flag in (0, 1))
totalQuotations
Out[18]:
In [25]:
# Percentage of critical quotations flagged as dialogue.
# Unadjusted for non-quotes.
percentDialogueInQuotes = dialogueCount[1] / totalQuotations * 100
percentDialogueInQuotes
Out[25]:
In [28]:
# Report the headline comparison: how often critics quote dialogue versus how
# much of the novel is dialogue.
print(
    'Of %s critical quotations, %s, or %s percent, are of dialogue. The novel is about %s percent dialogue.'
    % (
        totalQuotations,
        dialogueCount[1],
        percentDialogueInQuotes,
        percentDialogue,
    )
)
In [ ]: