This notebook revisits some of the literary historical trends found by Franco Moretti in his article "Style, Inc." (Critical Inquiry, 36.1 (2009), 134-158). See especially his Figures 1 (p 135) and 18 (p 155).
Note that the dataset used in this notebook is not Moretti's bibliography of novels, but Hathi Trust's catalog of fiction texts (https://sharc.hathitrust.org/genre).
In [ ]:
%pylab inline
from datascience import *
In [ ]:
# Load the Hathi Trust fiction metadata (one row per catalogued volume)
metadata_tb = Table.read_table("fiction_metadata.csv")
In [ ]:
# Peek at the table's columns and first rows
metadata_tb
In [ ]:
# Remove rows that contain duplicate titles, keeping each title's first occurrence.
# A set gives O(1) membership checks while we collect the row indexes to keep.
titles = set()
indexes = []
for i, title in enumerate(metadata_tb['title']):
    if title not in titles:
        indexes.append(i)
        titles.add(title)
singlevol_tb = metadata_tb.take(indexes)
In [ ]:
# Inspect annual distribution of books: histogram of the 'date' column
singlevol_tb.hist('date')
In [ ]:
# Restrict to Moretti's date range: keep only rows strictly between 1750 and 1850
in_range = (singlevol_tb['date'] > 1750) & (singlevol_tb['date'] < 1850)
singlevol_tb = singlevol_tb.where(in_range)
In [ ]:
# EX. Plot the distribution of page counts ('totalpages').
# Should we remove any entries from our metadata? Why or why not?
# EX. Plot the distribution of confidence values that given texts are fiction ('prob80precise').
# Should we remove any entries from our metadata? Why or why not?
In [ ]:
import numpy as np
In [ ]:
# Keep only the columns needed for the title-length analysis below
singlevol_tb = singlevol_tb.select(['title', 'date'])
In [ ]:
# Add a 'title_len' column: the number of whitespace-separated words in each title
title_tokens = [title.split() for title in singlevol_tb['title']]
title_length = [len(tokens) for tokens in title_tokens]
singlevol_tb['title_len'] = title_length
In [ ]:
# Confirm the new 'title_len' column looks right
singlevol_tb
In [ ]:
# Determine average title length per year: group() collapses rows sharing a
# 'date' and applies np.mean to the other columns (yielding 'title_len mean')
mean_table = singlevol_tb.group('date', collect=np.mean)
In [ ]:
# Inspect the per-year averages
mean_table
In [ ]:
# Average title length by year -- compare with Moretti's Figure 1
mean_table.scatter('date','title_len mean')
In [ ]:
# Does the pattern hold when we treat individual titles as data points?
# One point per title rather than one per year
singlevol_tb.scatter('date', 'title_len')
In [ ]:
# Same scatter with a fitted trend line overlaid
singlevol_tb.scatter('date', 'title_len', fit_line=True)
In [ ]:
# EX. Moretti also produces graphs for the median and standard deviation
# of title lengths by year. Create graphs that represent these data.
In [ ]:
import re
In [ ]:
# Example from previous lesson: print every word ending in "-ing".
# "with" guarantees the file handle is closed when the loop finishes
# (the original left it open).
with open('lecture notes 09-22-15.txt') as notes:
    for line in notes:
        for word in line.split():
            if word.endswith('ing'):
                print(word)
In [ ]:
# Reproduced using regex: "ing$" anchors the pattern to the end of the word.
# As above, "with" closes the file handle automatically.
with open('lecture notes 09-22-15.txt') as notes:
    for line in notes:
        for word in line.split():
            if re.search(r'ing$', word):  # only change from above
                print(word)
In [ ]:
# EX. Remove the "$" from the code above. How does it change the output? Why?
In [ ]:
# A match: 'Having' ends in "ing", so search() returns a Match object
word = 'Having'
re.search(r'ing$', word)
In [ ]:
# No match: search() returns None, so the notebook displays nothing
word = 'Ideas'
re.search(r'ing$', word)
In [ ]:
# Read the lecture notes once and keep the split word list around,
# so later cells don't have to reopen the file each time
with open('lecture notes 09-22-15.txt') as notes_file:
    lec_notes = notes_file.read()
word_list = lec_notes.split()
In [ ]:
# Exactly five characters with "t" in the middle: "^" and "$" anchor the
# start and end, and each "." matches any single character
[word for word in word_list if re.search(r'^..t..$', word)]
In [ ]:
# EX. What do you think the "^" and "." metacharacters do in the code?
In [ ]:
# Words starting with "a" and containing a later "t"; ".*" matches any run
# of characters (including none) between them
[word for word in word_list if re.search(r'^a.*t', word)]
In [ ]:
# EX. What do you think the "*" metacharacter does in the code?
In [ ]:
# The trailing backslash escapes the newline, so this is ONE continuous
# string literal (no newline -- and no space -- between "tapping," and "As")
poe = "While I nodded, nearly napping, suddenly there came a tapping,\
As of someone gently rapping, rapping at my chamber door."
In [ ]:
# Every seven-character match ending in "apping": the "." captures the
# varying first letter along with the rest
re.findall(r'.apping', poe)
In [ ]:
# Lookahead (?=...) requires "apping" to follow but keeps it out of the
# match, so only the single preceding character is returned
re.findall(r'.(?=apping)', poe)
In [ ]:
# Lookbehind (?<=...) requires "ly " before the match without including it
re.findall(r"(?<=ly ).apping", poe)
In [ ]:
# Combining lookbehind and lookahead isolates just the one varying character
re.findall(r"(?<=ly ).(?=apping)", poe)
In [ ]:
# EX. Find a list of "-apping" words that are followed by a comma in the line from Poe
# -- but make sure the comma doesn't appear in your list entries!
In [ ]:
def istheXofY(text):
    """Return True when *text* is a short title of the form "The X of Y".

    A title qualifies when (case-insensitively) it contains the pattern
    "the ... of ..." and is at most four words long, e.g.
    "The Castle of Otranto".
    """
    # Cheap length test first; "is not None" replaces the unidiomatic "!= None"
    return len(text.split()) <= 4 and re.search(r'the .* of .*', text.lower()) is not None
In [ ]:
# Sanity checks: a genuine four-word "The X of Y" title, then a long sentence
print(istheXofY('The Castle of Otronto'))
print(istheXofY('The Castle in which there are some people of Otronto and other places'))
In [ ]:
# Graph the frequency of "The X of Y" titles per decade.
# Averaging the boolean column gives the *share* of matching titles, and
# integer floor-division buckets each year into its decade.
singlevol_tb['theXofY'] = singlevol_tb.apply(istheXofY, 'title')
singlevol_tb['decade'] = singlevol_tb['date']//10*10
singlevol_tb.group('decade', collect=np.mean).scatter('decade', 'theXofY mean')
In [ ]:
# Create table containing only "The X of Y" titles: where() keeps rows whose
# boolean 'theXofY' flag is truthy, then the now-constant flag column is dropped
theXofY_tb = singlevol_tb.where('theXofY').drop('theXofY')
In [ ]:
def gettheX(text):
    """Return the X from a lowercased "the X of Y" title."""
    # The lookbehind/lookahead keep "the " and " of " out of the match;
    # the greedy ".*" stretches to the *last* " of " in the title.
    return re.findall(r'(?<=the ).*(?= of )', text.lower())[0]
def gettheY(text):
    """Return the Y from a lowercased "the X of Y" title."""
    # findall scans left to right, so [0] is everything after the *first* " of "
    return re.findall(r'(?<= of ).*', text.lower())[0]
In [ ]:
# The greedy ".*" in gettheX stretches to the LAST " of ", while gettheY's
# findall returns text after the FIRST " of " -- compare the two outputs
print(gettheX('The Castle of Otronto'))
print(gettheY('The Castle of Otronto'))
print()
print(gettheX('The castle in which there are some people of Otronto and other places'))
print(gettheY('The castle in which there are some people of Otronto and other places'))
In [ ]:
# Create new columns containing only the X and the Y from each title
theXofY_tb['theX'] = theXofY_tb.apply(gettheX, 'title')
theXofY_tb['ofY'] = theXofY_tb.apply(gettheY, 'title')
In [ ]:
# Inspect the extracted 'theX' and 'ofY' columns
theXofY_tb
In [ ]:
# Counter tallies how often each X term and each Y term appears across titles
from collections import Counter
Xs = Counter(theXofY_tb['theX'])
Ys = Counter(theXofY_tb['ofY'])
In [ ]:
# The ten most frequent X terms with their counts
Xs.most_common(10)
In [ ]:
# The ten most frequent Y terms with their counts
Ys.most_common(10)
In [ ]:
# EX. In Moretti's study, he gives examples of titles using the formula "The X of Y"
# with lengths of up to seven words. If we tweak our function istheXofY() to allow
# for longer titles, how does this change our findings? Why?