This notebook revisits some of the literary historical trends found by Franco Moretti in his article "Style, Inc." (Critical Inquiry, 36.1 (2009), 134-158). See especially his Figures 1 (p 135) and 18 (p 155).

Note that the dataset used in this notebook is not Moretti's bibliography of novels, but Hathi Trust's catalog of fiction texts (https://sharc.hathitrust.org/genre).

Metadata

  • Inspecting & Cleaning
  • Trends
  • Detecting Word Patterns

  • Intro to Regex
  • A Fortunate Formula
  • Inspecting & Cleaning

    
    
    In [ ]:
    %pylab inline
    from datascience import *
    
    
    
    In [ ]:
    metadata_tb = Table.read_table("fiction_metadata.csv")
    
    
    
    In [ ]:
    metadata_tb
    
    
    
    In [ ]:
    # Remove rows that contain duplicate titles (the first row seen for a title is kept)
    # A set remembers which titles have already appeared and makes the membership check fast
    titles = set()
    indexes = []
    for i, title in enumerate(metadata_tb['title']):
        if title in titles:
            # already kept an earlier row with this title
            continue
        titles.add(title)
        indexes.append(i)
    singlevol_tb = metadata_tb.take(indexes)
    
    
    
    In [ ]:
    # Inspect annual distribution of books
    singlevol_tb.hist('date')
    
    
    
    In [ ]:
    # Limit to Moretti's date range
    # NOTE(review): both comparisons are strict, so the years 1750 and 1850 themselves
    # are excluded. Moretti's study is described as covering 1740-1850 -- confirm that
    # this narrower window is intended.
    year = singlevol_tb['date']
    date_mask = (1750 < year) & (year < 1850)
    singlevol_tb = singlevol_tb.where(date_mask)
    
    
    
    In [ ]:
    # EX. Plot the distribution of page counts ('totalpages').
    #     Should we remove any entries from our metadata? Why or why not?
    
    # EX. Plot the distribution of confidence values that given texts are fiction ('prob80precise').
    #     Should we remove any entries from our metadata? Why or why not?
    

    Trends

    
    
    In [ ]:
    import numpy as np
    
    
    
    In [ ]:
    singlevol_tb = singlevol_tb.select(['title', 'date'])
    
    
    
    In [ ]:
    # Determine length of each title, measured in whitespace-separated words
    title_tokens = []
    title_length = []
    for title in singlevol_tb['title']:
        words = title.split()
        title_tokens.append(words)
        title_length.append(len(words))
    singlevol_tb['title_len'] = title_length
    
    
    
    In [ ]:
    singlevol_tb
    
    
    
    In [ ]:
    # Determine average title length per year
    # Rows are grouped by 'date' and np.mean is used to collapse each group;
    # the averaged length ends up in the column 'title_len mean', which the
    # scatter plot below reads.
    mean_table = singlevol_tb.group('date', collect=np.mean)
    
    
    
    In [ ]:
    mean_table
    
    
    
    In [ ]:
    mean_table.scatter('date','title_len mean')
    
    
    
    In [ ]:
    # Does the pattern hold when we treat individual titles as data points?
    singlevol_tb.scatter('date', 'title_len')
    
    
    
    In [ ]:
    singlevol_tb.scatter('date', 'title_len', fit_line=True)
    
    
    
    In [ ]:
    # EX. Moretti also produces graphs for the median and standard deviation
    #     of title lengths by year. Create graphs that represent these data.
    

    Intro to Regex (Regular Expressions)

    
    
    In [ ]:
    import re
    
    
    
    In [ ]:
    # Example from previous lesson
    # Print every word in the lecture notes that ends in "ing".
    # Opening the file in a `with` block closes it as soon as the loop is done,
    # instead of leaving the handle open for the rest of the session.
    with open('lecture notes 09-22-15.txt') as notes_file:
        for line in notes_file:
            for word in line.split():
                if word.endswith('ing'):
                    print(word)
    
    
    
    In [ ]:
    # Reproduced using regex
    # Print every word in the lecture notes that ends in "ing".
    # Opening the file in a `with` block closes it as soon as the loop is done,
    # instead of leaving the handle open for the rest of the session.
    with open('lecture notes 09-22-15.txt') as notes_file:
        for line in notes_file:
            for word in line.split():
                if re.search(r'ing$', word):  # the regex takes the place of word.endswith('ing')
                    print(word)
    
    
    
    In [ ]:
    # EX. Remove the "$" from the code above. How does it change the output? Why?
    
    
    
    In [ ]:
    word = 'Having'
    re.search(r'ing$', word)
    
    
    
    In [ ]:
    word = 'Ideas'
    re.search(r'ing$', word)
    
    
    
    In [ ]:
    # Read the lecture notes once and keep their words in a list,
    # so later cells don't have to open the file again
    with open('lecture notes 09-22-15.txt') as notes_file:
        lec_notes = notes_file.read()
    # split() with no argument breaks the text on any run of whitespace
    word_list = lec_notes.split()
    
    
    
    In [ ]:
    [word for word in word_list if re.search(r'^..t..$', word)]
    
    
    
    In [ ]:
    # EX. What do you think the "^" and "." metacharacters do in the code?
    
    
    
    In [ ]:
    [word for word in word_list if re.search(r'^a.*t', word)]
    
    
    
    In [ ]:
    # EX. What do you think the "*" metacharacter does in the code?
    
    
    
    In [ ]:
    # Adjacent string literals inside parentheses are joined into one string.
    # Unlike a backslash continuation inside the quotes, this keeps the source
    # line's indentation out of the text and lets us put the space after
    # "tapping," explicitly.
    poe = ("While I nodded, nearly napping, suddenly there came a tapping, "
           "As of someone gently rapping, rapping at my chamber door.")
    
    
    
    In [ ]:
    re.findall(r'.apping', poe)
    
    
    
    In [ ]:
    # (?=apping) is a lookahead: "apping" has to come next, but it is not part of
    # the match, so only the single character in front of it is returned
    re.findall(r'.(?=apping)', poe)
    
    
    
    In [ ]:
    # (?<=ly ) is a lookbehind: the match has to be preceded by "ly ",
    # but those characters are not included in what is returned
    re.findall(r"(?<=ly ).apping", poe)
    
    
    
    In [ ]:
    # Lookbehind and lookahead together: return only the single character that
    # sits between a preceding "ly " and a following "apping"
    re.findall(r"(?<=ly ).(?=apping)", poe)
    
    
    
    In [ ]:
    # EX. Find a list of "-apping" words that are followed by a comma in the line from Poe
    #     -- but make sure the comma doesn't appear in your list entries!
    

    A Fortunate Formula

    
    
    In [ ]:
    def istheXofY(text, max_words=4):
        """Return True when a title fits the short "The X of Y" formula.

        The title is lower-cased and searched for the word "the", followed by
        some text, then " of " and the remainder. A word boundary in front of
        "the" keeps words that merely end in those letters (e.g. "Breathe")
        from being mistaken for the article.

        Parameters
        ----------
        text : str
            The title to test.
        max_words : int, optional
            Longest title, counted in whitespace-separated words, that is
            still accepted. Defaults to 4, the shortest form of the formula.

        Returns
        -------
        bool
            True if the title is short enough and contains the pattern.
        """
        # Cheap length check first; titles that are too long never match.
        if len(text.split()) > max_words:
            return False
        return re.search(r'\bthe .* of .*', text.lower()) is not None
    
    
    
    In [ ]:
    print(istheXofY('The Castle of Otronto'))
    print(istheXofY('The Castle in which there are some people of Otronto and other places'))
    
    
    
    In [ ]:
    # Graph the frequency of "The X of Y" titles per decade
    # Flag each title, then floor its year to the start of the decade (e.g. 1797 -> 1790)
    singlevol_tb['theXofY'] = singlevol_tb.apply(istheXofY, 'title')
    singlevol_tb['decade'] = (singlevol_tb['date'] // 10) * 10
    # Averaging the True/False flag within a decade gives the share of matching titles
    decade_means = singlevol_tb.group('decade', collect=np.mean)
    decade_means.scatter('decade', 'theXofY mean')
    
    
    
    In [ ]:
    # Create table containing only "The X of Y" titles
    # The 'theXofY' flag column is used as the row filter and then dropped,
    # since it has served its purpose once the rows are selected.
    theXofY_tb = singlevol_tb.where('theXofY').drop('theXofY')
    
    
    
    In [ ]:
    def gettheX(text):
        """Return the "X" part of a "The X of Y" title, in lower case.

        X is whatever sits between "the " and " of " in the lower-cased title.
        Raises IndexError when the title does not contain that pattern.
        """
        lowered = text.lower()
        # Lookbehind/lookahead keep "the " and " of " out of the returned text
        found = re.findall(r'(?<=the ).*(?= of )', lowered)
        return found[0]
    
    def gettheY(text):
        """Return the "Y" part of a "The X of Y" title, in lower case.

        Y is everything that follows " of " in the lower-cased title.
        Raises IndexError when the title does not contain " of ".
        """
        lowered = text.lower()
        # The lookbehind keeps " of " itself out of the returned text
        found = re.findall(r'(?<= of ).*', lowered)
        return found[0]
    
    
    
    In [ ]:
    print(gettheX('The Castle of Otronto'))
    print(gettheY('The Castle of Otronto'))
    print()
    print(gettheX('The castle in which there are some people of Otronto and other places'))
    print(gettheY('The castle in which there are some people of Otronto and other places'))
    
    
    
    In [ ]:
    # Create new columns containing only the X and the Y from each title
    theXofY_tb['theX'] = theXofY_tb.apply(gettheX, 'title')
    theXofY_tb['ofY'] = theXofY_tb.apply(gettheY, 'title')
    
    
    
    In [ ]:
    theXofY_tb
    
    
    
    In [ ]:
    from collections import Counter
    
    Xs = Counter(theXofY_tb['theX'])
    Ys = Counter(theXofY_tb['ofY'])
    
    
    
    In [ ]:
    Xs.most_common(10)
    
    
    
    In [ ]:
    Ys.most_common(10)
    
    
    
    In [ ]:
    # EX. In Moretti's study, he gives examples of titles using the formula "The X of Y"
    #     with lengths of up to seven words. If we tweak our function istheXofY() to allow
    #     for longer titles, how does this change our findings? Why?