In [ ]:
%%capture
!rm -rf data/
!unzip data.zip -d data
from datascience import *
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
plt.style.use('ggplot')
This notebook is designed to reproduce several findings from Ted Underwood and Jordan Sellers's article "How Quickly Do Literary Standards Change?" (2015 draft, forthcoming in Modern Language Quarterly). See especially Fig. 1 and the reported classifier accuracy (p. 8).
Underwood and Sellers have made their corpus of poems and their code available here: https://github.com/tedunderwood/paceofchange
In [ ]:
metadata_tb = Table.read_table('data/poemeta.csv', keep_default_na=False)
metadata_tb.show(5)
The recept column above records each text's reception status. Like Underwood and Sellers, we want to look only at the reviewed and random categories:
In [ ]:
reception_mask = (metadata_tb['recept']=='reviewed') | (metadata_tb['recept']=='random')
clf_tb = metadata_tb.where(reception_mask)
clf_tb.show(5)
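As a quick sanity check, we might count how many texts fall into each reception category using the Table's group method:
In [ ]:
clf_tb.group('recept')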
Great, now we've successfully subsetted our data!
Now we're going to import term frequencies by text. This script is specifically tailored to the format of textual data made available by HathiTrust: a series of spreadsheets, each containing book-level term frequencies.
Each spreadsheet will become a row in our Document-Term Matrix.
In [ ]:
# Create list that will contain a series of dictionaries
freqdict_list = []

# Iterate through texts in our spreadsheet
for _id in clf_tb['docid']:

    # Each text will have its own dictionary
    # Keys are terms and values are frequencies
    termfreq_dict = {}

    # Open the given text's spreadsheet
    with open('data/poems/' + _id + '.poe.tsv', encoding='utf-8') as f:
        filelines = f.readlines()

    # Each line in the spreadsheet contains a unique term and its frequency
    for line in filelines:
        termfreq = line.split('\t')

        # 'If' condition throws out junk lines in the spreadsheet
        if len(termfreq) != 2:
            continue
        term, freq = termfreq[0], int(termfreq[1])
        if len(term) > 0 and term[0].isalpha():
            # Create new entry in text's dictionary for the term
            termfreq_dict[term] = freq

    freqdict_list.append(termfreq_dict)
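Before vectorizing, we can spot-check what we've collected: one dictionary per text, mapping terms to frequencies.
In [ ]:
# Number of texts collected, and a few sample term-frequency pairs from the first one
print(len(freqdict_list))
list(freqdict_list[0].items())[:5]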
We can use sklearn's DictVectorizer to turn this into a DTM:
In [ ]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer()
dtm = dv.fit_transform(freqdict_list)
term_list = dv.feature_names_
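If DictVectorizer is new to you, a toy example (with invented words and counts) shows what it does: it aligns the dictionaries into shared columns, filling in zeros wherever a term is absent from a document.
In [ ]:
# Toy illustration only; 'rose', 'thorn', and 'dew' are made-up terms
toy_dv = DictVectorizer()
toy_dtm = toy_dv.fit_transform([{'rose': 2, 'thorn': 1}, {'rose': 1, 'dew': 3}])
print(toy_dv.feature_names_)
print(toy_dtm.toarray())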
In [ ]:
dtm
In [ ]:
dtm_df = pd.DataFrame(dtm.toarray(), columns = term_list)
dtm_df.head()
Let's add in the docid as the index instead of just a counter for the row:
In [ ]:
dtm_df.set_index(clf_tb['docid'], inplace=True)
dtm_df.head()
Underwood and Sellers create a unique model for each author in the corpus: each model sets that author's texts aside from the training set and is then used to predict whether those texts were likely to be reviewed. Below we create a list of authors and an "empty" array in which to record the predicted probabilities.
In [ ]:
authors = list(set(clf_tb['author']))
probabilities = np.zeros([len(clf_tb['docid'])])
Now we'll set up the regression model with sklearn.
Underwood and Sellers regularize the model to prevent overfitting, a major concern when observing thousands of variables. Note that in sklearn, the constant 'C' is the inverse of the regularization strength, so the very small value below imposes strong regularization.
In [ ]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C = 0.00007)
We'll then write a function, set_author_aside, that sifts the target author's works out of the dataset. Recall that Underwood and Sellers trained a separate model for each author, with that author's texts held out of training.
In [ ]:
def set_author_aside(author, tb, df):
    '''
    Set aside each author's texts from training set
    '''
    train_ids = tb.where(tb['author']!=author).column('docid')
    test_ids = tb.where(tb['author']==author).column('docid')
    train_df_ = df.loc[train_ids]
    test_df_ = df.loc[test_ids]
    train_targets_ = tb.where(tb['author']!=author)['recept']=='reviewed'
    return train_df_, test_df_, train_targets_
We also need to extract the most common words, ranked by document frequency, for each model. The function below, top_vocab_by_docfreq, will do this on each pass through the loop as we create a new model:
In [ ]:
def top_vocab_by_docfreq(df, num_words):
    '''
    Retrieve the most common words (by document frequency) for a given model
    '''
    docfreq_df = df > 0
    wordcolumn_sums = docfreq_df.sum()
    words_by_freq = wordcolumn_sums.sort_values(ascending=False)
    top_words = words_by_freq[:num_words]
    top_words_list = top_words.index.tolist()
    return top_words_list
We then need to normalize the frequencies:
In [ ]:
def normalize_model(train_df_, test_df_, vocabulary):
    '''
    Normalize the model's term frequencies and put them into standard units
    '''
    # Select columns for only the most common words
    train_df_ = train_df_[vocabulary]
    test_df_ = test_df_[vocabulary]

    # Normalize each value by the sum of all values in its row
    train_df_ = train_df_.apply(lambda x: x/sum(x), axis=1)
    test_df_ = test_df_.apply(lambda x: x/sum(x), axis=1)

    # Get mean and stdev for each column of the training set
    train_mean = np.mean(train_df_)
    train_std = np.std(train_df_)

    # Transform each value to standard units for its column
    train_df_ = (train_df_ - train_mean) / train_std
    test_df_ = (test_df_ - train_mean) / train_std

    return train_df_, test_df_
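Note that the test set is standardized using the training set's mean and standard deviation, so no information about the held-out texts leaks into the model. To make "standard units" concrete, here is a toy example with invented values; after the transformation, each column has mean 0 and standard deviation 1. (Pandas' std defaults to ddof=1, so we pass ddof=0 to match np.std above.)
In [ ]:
# Toy data; column means are (2, 30) and the transform recenters/rescales each column
toy_df = pd.DataFrame({'rose': [1., 2., 3.], 'thorn': [10., 20., 60.]})
(toy_df - toy_df.mean()) / toy_df.std(ddof=0)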
In [ ]:
import time
start = time.time()

# Time a single model (just one held-out author) to estimate the full run
for author in authors[:1]:
    # Set aside each author's texts from training set
    train_df, test_df, train_targets = set_author_aside(author, clf_tb, dtm_df)

    # Retrieve the most common words (by document frequency) for a given model
    vocab_list = top_vocab_by_docfreq(train_df, 3200)

    # Normalize the model's term frequencies and put them into standard units
    train_df, test_df = normalize_model(train_df, test_df, vocab_list)

    # Learn the Logistic Regression over our model
    clf.fit(train_df, train_targets)

    # Some authors have more than one text in the corpus, so we retrieve all
    for _id in test_df.index.tolist():
        # Make prediction whether text was reviewed
        text = test_df.loc[_id]
        probability = clf.predict_proba([text])[0][1]

        # Record predictions in same order as the metadata spreadsheet
        _index = list(clf_tb.column('docid')).index(_id)
        probabilities[_index] = probability

end = time.time()
print(end - start)
We only trained one model above. So how long would this take for all authors? The estimate below is in minutes:
In [ ]:
len(authors) * (end-start) / 60
In [ ]:
len(term_list)
Let's read in a preprocessed vocabulary file, which contains only the words that will actually be used in classification. This list was created ahead of time by iterating through each model and collecting the words that appeared in it.
In [ ]:
import pickle
with open('data/preprocessed_vocab.pickle', 'rb') as f:
    pp_vocab = pickle.load(f)
In [ ]:
len(pp_vocab)
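Before subsetting, it's worth confirming that every word in the preprocessed vocabulary appears as a column in our DTM; if any were missing, the column selection below would fail. We'd expect this to print 0:
In [ ]:
# Words in pp_vocab that are absent from the DTM's vocabulary
print(len(set(pp_vocab) - set(term_list)))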
Now let's select only columns for words in our pre-processed vocabulary. This will make our computation more efficient later:
In [ ]:
dtm_df = dtm_df[pp_vocab]
We'll use the unique IDs from our metadata to keep track of each text:
In [ ]:
dtm_df.set_index(clf_tb['docid'], inplace=True)
dtm_df.head()
In [ ]:
# Create new DataFrame that simply lists whether a term appears in
# each document, so that we don't have to repeat this process evey iteration
term_in_doc_df = dtm_df>0
In [ ]:
term_in_doc_df
In [ ]:
# Re-write the model-building function
def set_author_aside(author, tb, dtm_df_, dfreq_df_):
    train_ids = tb.where(tb['author']!=author).column('docid')
    test_ids = tb.where(tb['author']==author).column('docid')
    train_df_ = dtm_df_.loc[train_ids]
    dfreq_df_ = dfreq_df_.loc[train_ids]  # Include only term_in_doc values for texts in training set
    test_df_ = dtm_df_.loc[test_ids]
    train_targets_ = tb.where(tb['author']!=author)['recept']=='reviewed'
    return train_df_, test_df_, train_targets_, dfreq_df_

# Re-write our vocabulary selection function
def top_vocab_by_docfreq(df, num_words):
    # Removed the test of whether a term is in a given document (i.e. df>0),
    # since term_in_doc_df precomputes it
    wordcolumn_sums = df.sum()
    words_by_freq = wordcolumn_sums.sort_values(ascending=False)
    top_words = words_by_freq[:num_words]
    top_words_list = top_words.index.tolist()
    return top_words_list
In [ ]:
# Parallel processing means running our script on multiple cores simultaneously.
# This can be used in situations where we might otherwise use a 'for' loop
# (when it doesn't matter what order we go through the list of values!)
clf = LogisticRegression(C = 0.00007)

def master_function(author):
    # Note: Our only input is the name of the author.
    # Remember that we had iterated over the list of authors previously.
    train_df, test_df, train_targets, dfreq_df = set_author_aside(author, clf_tb, dtm_df, term_in_doc_df)
    vocab_list = top_vocab_by_docfreq(dfreq_df, 3200)
    train_df, test_df = normalize_model(train_df, test_df, vocab_list)
    clf.fit(train_df, train_targets)

    # Create a list of each text's probability of review AND its index in the metadata table
    index_probability_tuples = []
    for _id in test_df.index.tolist():
        text = test_df.loc[_id]
        probability = clf.predict_proba([text])[0][1]
        _index = list(clf_tb.column('docid')).index(_id)
        index_probability_tuples.append((_index, probability))

    return index_probability_tuples
In [ ]:
# Multiprocessing enables Python to parallelize
import multiprocessing
# Return number of cores
multiprocessing.cpu_count()
In [ ]:
# By default, the Pool contains one worker for each core
# Since we are working on a shared server, we'll set the number
# of workers to 4.
pool = multiprocessing.Pool(4, maxtasksperchild=1)
In [ ]:
# Efficiently applies the master_function() to our list of authors
# Returns a list where each entry is an item returned by the function
# Timing the process again
start = time.time()
output = pool.map(master_function, authors)
end = time.time()
print(end-start)
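When we're done with the pool, it's good practice to shut its workers down so they don't linger on the shared server:
In [ ]:
pool.close()  # stop accepting new tasks
pool.join()   # wait for the workers to exit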
In [ ]:
output[:10]
In [ ]:
# In this case, each element in output is itself a list ('index_probability_tuples'),
# the length of which is the number of texts by a given author. We'll flatten it for
# ease of use.
flat_output = [tup for lst in output for tup in lst]
flat_output[:10]
In [ ]:
# Use the indices returned with the output to arrange probabilities properly
probabilities = np.zeros([len(clf_tb['docid'])])
for tup in flat_output:
    probabilities[tup[0]] = tup[1]
clf_tb['P(reviewed)'] = probabilities
In [ ]:
clf_tb.select(['docid', 'firstpub','author', 'title', 'recept', 'P(reviewed)'])
In [ ]:
# Visualize the probability each text was reviewed
colors = ['r' if recept=='reviewed' else 'b' for recept in clf_tb['recept']]
clf_tb.scatter('firstpub', 'P(reviewed)', c=colors, fit_line=True)
In [ ]:
# Does the Logistic Regression Model think it's likely each book was reviewed?
predictions = probabilities>0.5
predictions
In [ ]:
from sklearn.metrics import accuracy_score
# Create a Boolean array where True indicates a reviewed book and False indicates not
targets = clf_tb['recept']=='reviewed'
# sklearn's metrics expect the argument order (y_true, y_pred)
print(accuracy_score(targets, predictions))
In [ ]:
# Note: We often also report the F1-score, the harmonic mean of precision and
# recall: it balances how many of our predicted 'reviewed' texts really were
# reviewed against how many of the truly reviewed texts we managed to find.
from sklearn.metrics import f1_score
print(f1_score(targets, predictions))
In [ ]:
## EX. Change the regularization parameter ('C') in our Logistic Regression function.
## How does this change the classifier's accuracy?
## EX. Reduce the size of the vocabulary used for classification. How does accuracy change?
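As a starting point for the first exercise, here is a minimal sketch that re-runs the leave-one-author-out loop for a few settings of C. It assumes a fork-based platform (e.g. Linux), so that workers spawned after we reassign clf inherit the updated model; it is also slow, since each setting repeats the full pool.map above.
In [ ]:
# Sketch only: sweep a few C values and compare accuracy at the 50% threshold.
# Assumes fork-based multiprocessing so workers see the reassigned global clf.
for C_value in [0.00001, 0.00007, 0.0005]:
    clf = LogisticRegression(C=C_value)
    with multiprocessing.Pool(4, maxtasksperchild=1) as sweep_pool:
        sweep_output = sweep_pool.map(master_function, authors)
    sweep_probs = np.zeros(len(clf_tb['docid']))
    for _index, probability in [tup for lst in sweep_output for tup in lst]:
        sweep_probs[_index] = probability
    print(C_value, accuracy_score(targets, sweep_probs > 0.5))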
In [ ]:
## Q. Are there cases when we might not want to set the classification threshold
## to 50% likelihood? How certain are we that a 51% probability is meaningfully
## different from a 49% probability?
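One way to build intuition for this question is to vary the threshold and watch how accuracy responds, using the probabilities and targets computed above:
In [ ]:
# Accuracy at a range of classification thresholds
for threshold in [0.3, 0.4, 0.5, 0.6, 0.7]:
    print(threshold, accuracy_score(targets, probabilities > threshold))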
In [ ]:
# Train model using full set of 'reviewed' and 'random' texts
# Use this to predict the probability that other prestigious texts
# (i.e. ones that we haven't trained on) might have been reviewed
# ...if they had been published! The new texts include, for example,
# Emily Dickinson and Gerard Manley Hopkins.
In [ ]:
# Re-run script from scratch
%matplotlib inline
from datascience import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
plt.style.use('ggplot')

corpus_path = 'data/poems/'
In [ ]:
# Read metadata from spreadsheet
metadata_tb = Table.read_table('data/poemeta.csv', keep_default_na=False)
In [ ]:
# We'll copy just our new texts into a separate table as well, for later
canon_tb = metadata_tb.where('recept','addcanon')
In [ ]:
# Read term frequencies from files
freqdict_list = []

# Iterate through texts in our spreadsheet
for _id in metadata_tb['docid']:

    # Each text will have its own dictionary
    # Keys are terms and values are frequencies
    termfreq_dict = {}

    # Open the given text's spreadsheet
    with open(corpus_path + _id + '.poe.tsv', encoding='utf-8') as file_in:
        filelines = file_in.readlines()

    # Each line in the spreadsheet contains a unique term and its frequency
    for line in filelines:
        termfreq = line.split('\t')

        # 'If' condition throws out junk lines in the spreadsheet
        if len(termfreq) != 2:
            continue
        term, freq = termfreq[0], int(termfreq[1])
        if len(term) > 0 and term[0].isalpha():
            # Create new entry in text's dictionary for the term
            termfreq_dict[term] = freq

    freqdict_list.append(termfreq_dict)
In [ ]:
# Create the Document-Term-Matrix
dv = DictVectorizer()
dtm = dv.fit_transform(freqdict_list)
term_list = dv.feature_names_
In [ ]:
# Place the DTM into a Pandas DataFrame for further manipulation
dtm_df = pd.DataFrame(dtm.toarray(), columns = term_list)
dtm_df.set_index(metadata_tb['docid'], inplace=True)
In [ ]:
# These are feature-selection functions like the ones we originally defined,
# not their efficiency-minded counterparts, since we only train once

# Set aside the canonic texts from the training set
def set_canon_aside(tb, df):
    train_ids = tb.where(tb['recept']!='addcanon').column('docid')
    classify_ids = tb.where(tb['recept']=='addcanon').column('docid')
    train_df_ = df.loc[train_ids]
    classify_df_ = df.loc[classify_ids]
    train_targets_ = tb.where(tb['recept']!='addcanon')['recept']=='reviewed'
    return train_df_, classify_df_, train_targets_

# Retrieve the most common words (by document frequency) for a given model
def top_vocab_by_docfreq(df, num_words):
    docfreq_df = df > 0
    wordcolumn_sums = docfreq_df.sum()
    words_by_freq = wordcolumn_sums.sort_values(ascending=False)
    top_words = words_by_freq[:num_words]
    top_words_list = top_words.index.tolist()
    return top_words_list

# Normalize the model's term frequencies and put them into standard units
def normalize_model(train_df_, classify_df_, vocabulary):
    # Select columns for only the most common words
    train_df_ = train_df_[vocabulary]
    classify_df_ = classify_df_[vocabulary]

    # Normalize each value by the sum of all values in its row
    train_df_ = train_df_.apply(lambda x: x/sum(x), axis=1)
    classify_df_ = classify_df_.apply(lambda x: x/sum(x), axis=1)

    # Get mean and stdev for each column of the training set
    train_mean = np.mean(train_df_)
    train_std = np.std(train_df_)

    # Transform each value to standard units for its column
    train_df_ = (train_df_ - train_mean) / train_std
    classify_df_ = (classify_df_ - train_mean) / train_std

    return train_df_, classify_df_
In [ ]:
# Train our Logistic Regression Model
clf = LogisticRegression(C = 0.00007)
model_df, classify_df, model_targets = set_canon_aside(metadata_tb, dtm_df)
vocab_list = top_vocab_by_docfreq(model_df, 3200)
model_df, classify_df = normalize_model(model_df, classify_df, vocab_list)
clf.fit(model_df, model_targets)
In [ ]:
# Predict whether our new prestigious texts might have been reviewed
probabilities = np.zeros([len(canon_tb.column('docid'))])
for _id in classify_df.index.tolist():
    text = classify_df.loc[_id]
    probability = clf.predict_proba([text])[0][1]
    _index = list(canon_tb.column('docid')).index(_id)
    probabilities[_index] = probability
In [ ]:
# Add this probability as a new column to our table of canonic texts
canon_tb['P(reviewed)'] = probabilities
In [ ]:
# Visualize
canon_tb.scatter('firstpub','P(reviewed)', fit_line=True)
In [ ]:
## Q. Two of the prestigious texts are assigned less than 50% probability
## that they were reviewed. How do we make sense of that?