In [255]:
import nltk
import pickle
import pandas as pd
import re
import matplotlib
import pylab as plt

In [330]:
from nltk.corpus import stopwords
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

# Cleaning and arranging SOTU
# Speeches are separated by '***'; within each chunk the first few lines are
# metadata followed by the speech body.
with open('State of the Union Addresses 1970-2016_edited.txt') as f:
    bigline = " ".join(f.readlines())
stars = bigline.split('***')
splits = [s.split('\n') for s in stars[1:]]

# NOTE(review): the original "filtered_words" comprehension compared each
# *list of lines* (not individual words) against the stopword list, so it
# never removed anything. It has been dropped as a no-op; stopword removal
# is actually performed later by CountVectorizer(stop_words='english').

# Per later usage: column 1 = president, column 2 = date, column 3 = speech
# text; column 0 is presumably the title — TODO confirm against the file.
tups = [(s[2].strip(), s[3].strip(), s[4].strip(), "".join(s[5:])) for s in splits]
speech_df = pd.DataFrame(tups)

In [340]:
# overall Sotu

# Word frequencies across all speeches combined (column 3 = speech text).
count_vect = CountVectorizer(stop_words='english')
# fit_transform is equivalent to fit(...) followed by transform(...).
X = count_vect.fit_transform(speech_df[3])

# Total count per vocabulary term, summed over all speeches.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
freq = list(zip(count_vect.get_feature_names_out(), np.asarray(X.sum(axis=0)).ravel()))
df = pd.DataFrame(freq, columns=['word', 'count'])

# DataFrame.sort() was removed in pandas 0.20; sort_values is the replacement.
result_overall = df.sort_values('count', ascending=False)

top_20 = result_overall.head(20)

top_20.plot(x='word', y='count', kind='bar')

plt.show()

In [257]:
# Cleaning Violence data
# from http://www.johnstonsarchive.net/terrorism/wrjp255a.html

# Single-column CSV with no header row: each line is the free-text
# description of one violent event.
events = "violence_2.csv"
tags = ['description']
event_df = pd.read_csv(events, names=tags, header=None)

In [258]:
# vectorizing violence and finding top 10

# Word frequencies across all event descriptions.
count_vect = CountVectorizer(stop_words='english')

# fit_transform combines the separate fit/transform calls.
X = count_vect.fit_transform(event_df['description'])

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
freq = list(zip(count_vect.get_feature_names_out(), np.asarray(X.sum(axis=0)).ravel()))

df = pd.DataFrame(freq, columns=['word', 'count'])

# DataFrame.sort() was removed in pandas 0.20; sort_values is the replacement.
result = df.sort_values('count', ascending=False)
top_10 = result.head(10)

In [269]:
# Data set with type of event and year

events_year = "Events by year.csv"
tags_year = ['year', 'type']
# BUG FIX: header=True is not a valid value for read_csv (it expects an int
# row index) and raises in modern pandas. header=0 skips the file's own
# header row and applies our column names instead.
event_year_df = pd.read_csv(events_year, header=0, names=tags_year)

# Years apparently render with a trailing ".0" (float dtype, e.g. missing
# values force float) — str[:-2] strips it. TODO confirm against the file.
event_year_df['year_string'] = event_year_df['year'].astype(str).str[:-2]

# Count of events per (year, type): aggfunc=len counts rows in each cell.
# Note the resulting columns are a MultiIndex of ('year', <type code>).
event_pivot = pd.pivot_table(event_year_df, index='year_string', columns='type', aggfunc=len, fill_value=0)

In [260]:
# vectorizing SOTU and finding the count of words in each

# Per-speech word counts, tagged with the president (col 1) and date (col 2).
# DataFrame.append() was removed in pandas 2.0, and appending inside a loop
# is quadratic anyway; collect the per-speech frames and concatenate once.
columns = ['word', 'count', 'Pres', 'Date']

per_speech = []
for i in range(len(speech_df)):
    words = speech_df.loc[i, 3].lower().split()
    df_speech_words = pd.DataFrame(list(Counter(words).items()), columns=['word', 'count'])
    df_speech_words['Pres'] = speech_df.loc[i, 1]
    df_speech_words['Date'] = speech_df.loc[i, 2]
    per_speech.append(df_speech_words)

# Guard the empty case: pd.concat raises on an empty list.
df_words = pd.concat(per_speech, ignore_index=True) if per_speech else pd.DataFrame(columns=columns)

In [261]:
# Restrict the per-speech word counts to the ten most frequent violence
# terms, then derive a year column from the last four characters of Date.
combined_result = top_10.merge(df_words, how='inner', on='word')
combined_result['year'] = combined_result['Date'].str.slice(-4)

In [262]:
# Year-by-word matrix: mean speech count of each top violence term per year.
pivoted = combined_result.pivot_table(index='year', columns='word', values='count_y')

In [314]:
# Combining speech and events

results_to_plot = pd.concat([pivoted, event_pivot], axis=1).fillna(0)

# Bar chart: how often "shot" appears in each year's speech.
ax = results_to_plot['shot'].plot(kind="bar", legend=True)
plt.xticks(rotation=90)
ax.set_xlabel('Year')
ax.set_ylabel('count')

# Overlay event counts per category as lines on the same axes.
# event_pivot contributed MultiIndex columns of the form ('year', <type>).
series_specs = [
    ('TER-islm', '-b', 'Islamic Terrorism'),
    ('TER-right', '-r', 'Rightist Terrorism'),
    ('TER-left', '-g', 'Leftist Terrorism'),
    ('CRI', '-c', 'Criminal Incident'),
    ('TER-natl', '-m', 'Nationalist Terrorism'),
]
for event_type, fmt, label in series_specs:
    plt.plot(ax.get_xticks(), results_to_plot['year', event_type], fmt, label=label)
plt.legend(loc='upper left')

plt.show()