In [255]:
import nltk
import pickle
import pandas as pd
import re
import matplotlib
import pylab as plt
In [330]:
from nltk.corpus import stopwords
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
# Cleaning and arranging SOTU
# Speeches are separated by '***'; within each speech, split lines 2-4 hold
# the title, president, and date, and lines 5+ are the speech body.
with open('State of the Union Addresses 1970-2016_edited.txt') as f:
    bigline = " ".join(f.readlines())
stars = bigline.split('***')
splits = [s.split('\n') for s in stars[1:]]
# NOTE(review): the original filtered `splits` against stopwords.words('english'),
# but elements of `splits` are *lists of lines*, never equal to a stopword
# string, so the filter was a no-op; it is dropped here (behavior unchanged).
# Stopword removal is done later by CountVectorizer(stop_words='english').
tups = [(s[2].strip(), s[3].strip(), s[4].strip(), "".join(s[5:])) for s in splits]
speech_df = pd.DataFrame(tups)
In [340]:
# Overall SOTU: corpus-wide word frequencies over every speech body
# (column 3), plotted for the 20 most frequent non-stopwords.
count_vect = CountVectorizer(stop_words='english')
X = count_vect.fit_transform(speech_df[3])  # fit + transform in one pass
# Column-wise sums of the document-term matrix give per-word corpus counts.
# get_feature_names() was removed in scikit-learn 1.2; use the
# get_feature_names_out() replacement.
freq = zip(count_vect.get_feature_names_out(), np.asarray(X.sum(axis=0)).ravel())
df = pd.DataFrame(freq, columns=['word', 'count'])
# DataFrame.sort() was removed in pandas 0.20; sort_values is the replacement.
result_overall = df.sort_values('count', ascending=False)
top_20 = result_overall.head(20)
top_20.plot(x='word', y='count', kind='bar')
plt.show()
In [257]:
# Violence incident descriptions, one free-text description per row.
# Source: http://www.johnstonsarchive.net/terrorism/wrjp255a.html
events = "violence_2.csv"
tags = ['description']
# The CSV has no header row, so supply the column name ourselves.
event_df = pd.read_csv(events, names=tags, header=None)
In [258]:
# Vectorize the violence descriptions and find the 10 most frequent words.
count_vect = CountVectorizer(stop_words='english')
X = count_vect.fit_transform(event_df['description'])  # fit + transform in one pass
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() instead.
freq = zip(count_vect.get_feature_names_out(), np.asarray(X.sum(axis=0)).ravel())
df = pd.DataFrame(freq, columns=['word', 'count'])
# DataFrame.sort() was removed in pandas 0.20; sort_values is the replacement.
result = df.sort_values('count', ascending=False)
top_10 = result.head(10)
In [269]:
# Data set with type of event and year, pivoted to a year x type count table.
events_year = "Events by year.csv"
tags_year = ['year', 'type']
# BUG FIX: header=True is not a valid value for read_csv's `header` parameter
# (pandas expects an int, list of ints, 'infer', or None); the intended
# "skip the file's own header row and use our names" behavior is header=0.
event_year_df = pd.read_csv(events_year, header=0, names=tags_year)
# Years parse as floats (e.g. '1970.0'); drop the trailing '.0' so the year
# string can align with the 4-character speech years later.
event_year_df['year_string'] = event_year_df['year'].astype(str).str[:-2]
# aggfunc=len counts incidents per (year, type) cell; missing combos become 0.
event_pivot = pd.pivot_table(event_year_df, index='year_string', columns='type', aggfunc=len, fill_value=0)
In [260]:
# Per-speech word counts: one (word, count, Pres, Date) row per distinct
# word per speech, accumulated over every speech in speech_df.
columns = ['word', 'count', 'Pres', 'Date']
per_speech_frames = []
for i in range(len(speech_df)):
    # Naive whitespace tokenization of the lower-cased speech body (column 3).
    words = speech_df.loc[i, 3].lower().split()
    df_speech_words = pd.DataFrame(Counter(words).items(), columns=['word', 'count'])
    df_speech_words['Pres'] = speech_df.loc[i, 1]
    df_speech_words['Date'] = speech_df.loc[i, 2]
    per_speech_frames.append(df_speech_words)
# BUG FIX: DataFrame.append() was removed in pandas 2.0, and appending inside
# the loop was quadratic; collect the frames and concatenate once instead.
if per_speech_frames:
    df_words = pd.concat(per_speech_frames, ignore_index=True)
else:
    df_words = pd.DataFrame(columns=columns)
In [261]:
# Join the top-10 violence words onto the per-speech word counts; the inner
# join keeps only rows whose word appears in the top-10 list.
combined_result = top_10.merge(df_words, how='inner', on='word')
# The Date field ends with the four-digit year; slice it out for grouping.
combined_result['year'] = combined_result['Date'].str[-4:]
In [262]:
pivoted = pd.pivot_table(combined_result, values='count_y', columns='word', index='year')
In [314]:
# Combining speech and events: bar chart of per-year 'shot' mentions,
# overlaid with line plots of incident counts by event category.
results_to_plot = pd.concat([pivoted, event_pivot], axis=1).fillna(0)

ax = results_to_plot['shot'].plot(kind="bar", legend=True)
plt.xticks(rotation=90)
ax.set_xlabel('Year')
ax.set_ylabel('count')

# Event-count columns come from event_pivot, whose pivot left a
# ('year', <event type>) tuple label on each column.
event_series = [
    ('TER-islm', '-b', 'Islamic Terrorism'),
    ('TER-right', '-r', 'Rightist Terrorism'),
    ('TER-left', '-g', 'Leftist Terrorism'),
    ('CRI', '-c', 'Criminal Incident'),
    ('TER-natl', '-m', 'Nationalist Terrorism'),
]
for event_type, line_style, label in event_series:
    plt.plot(ax.get_xticks(), results_to_plot['year', event_type], line_style, label=label)
plt.legend(loc='upper left')
plt.show()