In [1]:
import numpy as np
import numpy.linalg as LA
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
In [2]:
# Load the raw transcript file. The context manager closes the handle as soon
# as the lines are read, instead of leaving it open for the life of the kernel.
with open('State of the Union Addresses 1970-2016.txt') as f:
    lines = f.readlines()
# Joining with a space puts a blank after every line ending, so words on
# adjacent lines stay separated once the line endings are removed below.
bigline = " ".join(lines)
# '***' delimits the addresses; whatever precedes the first delimiter is
# dropped by the [1:] slice below.
stars = bigline.split('***')
# Normalise CRLF to LF before splitting into lines. Splitting on '\r\n' alone
# only works when the file is read without newline translation; with
# universal-newline reading (the Python 3 default) no '\r\n' survives, every
# segment stays a one-element list, and the s[2] lookup below raises IndexError.
splits = [s.replace('\r\n', '\n').split('\n') for s in stars[1:]]
# Line positions 2, 3 and 4 of each segment become the title, president and
# date columns; every remaining line is concatenated into the speech body.
tups = [(s[2], s[3], s[4], "".join(s[5:])) for s in splits]
df = pd.DataFrame(tups)
In [3]:
# Give the frame's columns descriptive names (assumes df has exactly four
# columns, in this order).
column_names = ['title', 'pres', 'date', 'speech']
df.columns = column_names
# The year is taken as the second comma-separated field of the date string;
# any whitespace around it is kept as-is.
date_parts = df['date'].str.split(',', expand=True)
df['year'] = date_parts[1]
In [4]:
# Fit a TF-IDF model over every speech, dropping English stop words.
tfidf_orig = TfidfVectorizer(stop_words='english')
tfidf = tfidf_orig.fit_transform(df['speech'])

# Pairwise dot products between the TF-IDF rows. These equal cosine
# similarities only if the vectorizer L2-normalises each row (presumably the
# scikit-learn default -- confirm if the vectorizer settings are changed).
similarity_matrix = linear_kernel(tfidf, tfidf)

# NOTE(review): argsort() on the 2-D matrix sorts within each row, and the
# [:-5:-4] slice (start at the last row, step -4, stop before row -5) keeps
# only the last row. The result is therefore the ascending similarity order
# for the final speech alone, not a ranking of the most related speeches.
# Left unchanged because the intended ranking is not clear from this cell.
related_docs_indices = similarity_matrix.argsort()[:-5:-4]

# Label both axes with "<president><year>" so the matrix can be plotted with
# readable tick labels.
speech_labels = df['pres'] + df['year']
cosine_similarities = pd.DataFrame(
    similarity_matrix,
    index=speech_labels,
    columns=speech_labels,
)
In [17]:
# Create the figure and the single axes the heatmap is drawn on.
f, ax = plt.subplots(figsize=(9, 9))

# Diverging colormap. It is built here but not passed to the heatmap call
# below, so the plot uses whatever colormap seaborn picks by default.
cmap = sns.diverging_palette(250, 10, n=3, as_cmap=True)

# Plot only the sub-matrix from position 178 onward along both axes.
recent_block = cosine_similarities.iloc[178:, 178:]
colorbar_options = {"shrink": .5}
h = sns.heatmap(
    recent_block,
    square=True,
    linewidths=0,
    cbar_kws=colorbar_options,
    ax=ax,
)

# Vertical x tick labels and horizontal y tick labels.
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.show()