In [477]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

In [478]:
#load/create df
# Read the raw transcript file. The context manager guarantees the handle is
# closed (the previous bare open() never closed it).
with open('State+of+the+Union+Addresses+1970-2016.txt') as f:
    lines = f.readlines()

# readlines() keeps each line's trailing '\n', so joining with " " puts a
# single space in front of every line after the first. That leading space is
# preserved on purpose: the fixed character offsets used in the clean-up cell
# are computed against this exact layout.
bigline = " ".join(lines)

# Records are delimited by '***'; the text before the first delimiter is dropped.
stars = bigline.split('***')
splits = [s.split('\n') for s in stars[1:]]

# Lines 2, 3 and 4 of each record become the first three columns; every
# remaining line is concatenated (no separator) into the fourth column.
tups = [(s[2], s[3], s[4], "".join(s[5:])) for s in splits]
speech_df = pd.DataFrame(tups)

In [479]:
#dirty reformatting
speech_df.columns = ['Type', 'Name', 'Date', 'Speech']
speech_df = speech_df.drop('Type', axis=1)

# Keep only the records from position 178 onward and renumber them from 0.
speech_df = speech_df.iloc[178:].reset_index(drop=True)

# From row 36 onward the parsed fields are shifted by one: the value in 'Date'
# is copied into 'Name', and the date is cut from the front of 'Speech'.
# NOTE(review): presumably those records have one header line fewer in the
# source file -- confirm against the raw text.
# .loc is used throughout instead of chained indexing (df[col][row] = ...),
# which may write to a temporary copy and raises SettingWithCopyWarning.
speech_df.loc[36:, 'Name'] = speech_df.loc[36:, 'Date']

# Number of leading characters of 'Speech' that hold the date for each shifted
# row. Rows 38 and 42 need one extra character; every other row uses 17.
date_widths = {row: 17 for row in range(36, 46)}
date_widths[38] = 18
date_widths[42] = 18

for row in sorted(date_widths):
    width = date_widths[row]
    raw_text = speech_df.loc[row, 'Speech']
    speech_df.loc[row, 'Date'] = raw_text[:width]
    # Skip the date plus the single character that follows it.
    speech_df.loc[row, 'Speech'] = raw_text[width + 1:]

In [480]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts for every speech, with English stop words removed.
count_vect = CountVectorizer(stop_words='english')
# fit_transform learns the vocabulary and builds the document-term matrix in
# one pass (fit followed by transform tokenizes the corpus twice).
X = count_vect.fit_transform(speech_df['Speech'])

# Newer scikit-learn releases expose get_feature_names_out() in place of
# get_feature_names(); use whichever one this installation provides.
_feature_names = (getattr(count_vect, 'get_feature_names_out', None)
                  or count_vect.get_feature_names)

# Column sums of X are the corpus-wide totals per vocabulary word. zip() is
# materialised so the pairs are a real list on Python 3 as well as Python 2.
freq = list(zip(_feature_names(), np.asarray(X.sum(axis=0)).ravel()))
df = pd.DataFrame(freq, columns=['word', 'count'])

In [481]:
#top words in entire corpus
# DataFrame.sort(columns=...) is deprecated (see the FutureWarning this cell
# used to emit); sort_values(by=...) is the supported spelling.
df.sort_values(by='count', ascending=False)


C:\Users\Michael\Anaconda\lib\site-packages\IPython\kernel\__main__.py:2: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  from IPython.kernel.zmq import kernelapp as app
Out[481]:
word count
675 america 1199
7243 people 1101
6678 new 1030
11112 world 868
11159 year 806
11164 years 792
676 american 770
2231 congress 761
4537 government 687
11098 work 613
677 americans 599
6119 make 576
769 applause 567
6612 nation 555
10182 time 528
4842 help 519
5867 let 492
2461 country 467
9562 states 441
8948 security 434
4041 federal 430
5717 know 417
10760 ve 411
10228 tonight 410
5604 jobs 400
3403 economy 399
6642 need 397
1835 children 394
9993 tax 388
4805 health 384
... ... ...
5453 interview 1
5451 intervene 1
5450 intertwined 1
5447 interrupted 1
5445 interpreter 1
5441 internationalized 1
5478 inventories 1
5480 inventory 1
5483 investigated 1
5484 investigating 1
5555 italians 1
5554 istanbul 1
5546 isolationist 1
5537 islamist 1
5533 isaiah 1
5531 irrigation 1
5528 irresponsibility 1
5527 irresistible 1
5525 irony 1
5524 irons 1
5523 ironic 1
5521 iron 1
5515 iranians 1
5508 invulnerability 1
5502 invoke 1
5500 invites 1
5496 invincible 1
5495 invigorate 1
5492 investor 1
11197 zooming 1

11198 rows × 2 columns


In [483]:
# Stock-return table, read from a local CSV file.
# Data source: http://pages.stern.nyu.edu/~adamodar/New_Home_Page/datafile/histretSP.html
returns = pd.read_csv(filepath_or_buffer='returns.csv')

In [484]:
#reformat to only include years: 69-87, 89-91, 93-00, 00-15
# Positional row ranges of `returns` to keep, in order. The last two ranges
# overlap at row 31, so that row appears twice in the result, matching the
# "93-00, 00-15" note above.
# NOTE(review): presumably the repeat lines the table up with two speeches
# that share one year's return -- confirm.
row_ranges = [slice(0, 19), slice(20, 23), slice(24, 32), slice(31, None)]
new_ret = pd.concat([returns.iloc[rows] for rows in row_ranges])
new_ret = new_ret.reset_index(drop=True)

# Strip the '%' characters from the ends of each value in one vectorised step
# and write the whole column back. The previous element-by-element chained
# assignment (new_ret['S&P 500'][i] = ...) raised SettingWithCopyWarning and
# is not guaranteed to modify new_ret at all.
new_ret['S&P 500'] = new_ret['S&P 500'].str.strip('%')


C:\Users\Michael\Anaconda\lib\site-packages\IPython\kernel\__main__.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [485]:
# Attach the S&P 500 column to the speech table as floating-point numbers.
# The assignment aligns the two objects on their index labels.
speech_df['S&P_ret'] = new_ret['S&P 500'].astype(np.float64)

In [486]:
#create wordcounts
# For every speech, store a list of (vocabulary word, count in that speech)
# pairs, in vocabulary order.

# Newer scikit-learn releases expose get_feature_names_out() in place of
# get_feature_names(); use whichever one this installation provides.
_feature_names = (getattr(count_vect, 'get_feature_names_out', None)
                  or count_vect.get_feature_names)
# The vocabulary is the same for every row, so fetch it once, not per speech.
vocab = list(_feature_names())

# The column is built in full and assigned once, instead of writing through
# speech_df['word_counts'].iloc[i] (chained assignment, which may target a
# temporary copy). Wrapping the rows in a Series keeps each cell a plain list,
# and list(zip(...)) keeps the pairs indexable on Python 3 as well.
speech_df['word_counts'] = pd.Series(
    [list(zip(vocab, np.asarray(X[i].sum(axis=0)).ravel()))
     for i in range(len(speech_df))],
    index=speech_df.index,
)

In [488]:
# Word stems related to the economy. Each stem is matched as a substring of a
# vocabulary word, so one stem can cover several words.
word_list = ['econom', 'tax', 'spend', 'budget', 'business', 
             'job', 'wealth', 'poor', 'recession', 'depression', 
             'income', 'deficit', 'expand']

# idx[k] lists the row positions in df whose 'word' contains word_list[k].
idx = [
    [pos for pos, vocab_word in enumerate(df['word']) if stem in vocab_word]
    for stem in word_list
]

# One placeholder column per stem, initialised to the empty string.
for stem in word_list:
    speech_df[stem] = ""

In [546]:
#count words in column
# For each stem, add up the per-speech counts of every vocabulary word whose
# position was collected in `idx`.
# The previous version read a variable named `counts` that no cell in this
# notebook defines, so it failed on a fresh kernel; `idx` is the structure
# built alongside word_list.
# NOTE(review): confirm `counts` was an earlier name for `idx`.
for stem, positions in zip(word_list, idx):
    # speech[p] is the (word, count) pair at vocabulary position p. Starting
    # the sum at 0.0 keeps the totals floating point, as before.
    # Each column is assigned whole rather than cell by cell through .iloc
    # (chained assignment, which may write to a temporary copy).
    speech_df[stem] = [
        sum((speech[p][1] for p in positions), 0.0)
        for speech in speech_df['word_counts']
    ]

In [547]:
# Display the whole speech table: name, date, text, S&P return, the per-speech
# (word, count) lists, and one count column per stem in word_list.
speech_df


Out[547]:
Name Date Speech S&P_ret word_counts econom tax spend budget business job wealth poor recession depression income deficit expand
0 Richard Nixon January 22, 1970 Mr. Speaker, Mr. President, my colleagues in... -8.24 [(00, 0), (000, 0), (02, 0), (03, 0), (04, 0),... 7 1 5 7 1 2 2 1 0 0 2 1 3
1 Richard Nixon January 22, 1971 Mr. Speaker, Mr. President, my colleagues in... 3.56 [(00, 0), (000, 1), (02, 0), (03, 0), (04, 0),... 9 4 2 8 3 4 1 0 0 0 2 0 2
2 Richard Nixon January 20, 1972 Mr. Speaker, Mr. President, my colleagues in... 14.22 [(00, 0), (000, 1), (02, 0), (03, 0), (04, 0),... 1 10 4 4 1 5 1 1 0 0 4 0 2
3 Richard Nixon February 2, 1973 To the Congress of the United States: The t... 18.76 [(00, 0), (000, 0), (02, 0), (03, 0), (04, 0),... 4 9 6 0 1 4 0 0 0 0 1 0 1
4 Richard Nixon January 30, 1974 Mr. Speaker, Mr. President, my colleagues in... -14.31 [(00, 0), (000, 2), (02, 0), (03, 0), (04, 0),... 6 6 2 1 0 6 0 1 3 0 3 0 4
5 Gerald R. Ford January 15, 1975 Mr. Speaker, Mr. Vice President, Members of ... -25.90 [(00, 0), (000, 4), (02, 0), (03, 0), (04, 0),... 18 29 11 7 6 6 0 1 1 0 10 3 0
6 Gerald R. Ford January 19, 1976 Mr. Speaker, Mr. Vice President, Members of ... 37.00 [(00, 0), (000, 5), (02, 0), (03, 0), (04, 0),... 18 19 6 14 5 18 0 3 4 0 6 0 1
7 Gerald R. Ford January 12, 1977 Mr. Speaker, Mr. Vice President, Members of ... 23.83 [(00, 0), (000, 1), (02, 0), (03, 0), (04, 0),... 15 7 4 4 0 8 0 2 4 0 1 0 3
8 Jimmy Carter January 19, 1978 Two years ago today we had the first caucus ... -6.98 [(00, 0), (000, 0), (02, 0), (03, 0), (04, 0),... 26 18 4 7 8 14 0 1 0 1 3 4 1
9 Jimmy Carter January 25, 1979 Tonight I want to examine in a broad sense t... 6.51 [(00, 0), (000, 1), (02, 0), (03, 0), (04, 0),... 14 1 3 7 2 6 0 2 3 0 3 3 1
10 Jimmy Carter January 21, 1980 This last few months has not been an easy ti... 18.52 [(00, 0), (000, 0), (02, 0), (03, 0), (04, 0),... 8 0 0 3 2 4 0 0 0 0 0 1 2
11 Jimmy Carter January 16, 1981 To the Congress of the United States: The S... 31.74 [(00, 0), (000, 21), (02, 0), (03, 0), (04, 0)... 131 27 17 41 30 28 3 10 4 0 26 8 29
12 Ronald Reagan January 26, 1982 Mr. Speaker, Mr. President, distinguished Me... -4.70 [(00, 0), (000, 3), (02, 0), (03, 0), (04, 0),... 20 29 13 8 7 8 1 2 6 0 1 12 0
13 Ronald Reagan January 25, 1983 Mr. Speaker, Mr. President, distinguished Me... 20.42 [(00, 0), (000, 0), (02, 0), (03, 1), (04, 0),... 26 14 17 7 3 15 0 0 2 0 3 12 1
14 Ronald Reagan January 25, 1984 Mr. Speaker, Mr. President, distinguished Me... 22.34 [(00, 0), (000, 2), (02, 1), (03, 0), (04, 0),... 16 21 14 11 4 5 0 0 0 0 3 8 2
15 Ronald Reagan February 6, 1985 Mr. Speaker, Mr. President, distinguished Me... 6.15 [(00, 0), (000, 0), (02, 0), (03, 0), (04, 0),... 22 23 8 5 1 10 1 0 0 0 5 3 3
16 Ronald Reagan February 4, 1986 Mr. Speaker, Mr. President, distinguished Me... 31.24 [(00, 0), (000, 1), (02, 0), (03, 0), (04, 1),... 8 12 4 11 0 2 0 2 0 0 0 5 1
17 Ronald Reagan January 27, 1987 Mr. Speaker, Mr. President, distinguished Me... 18.49 [(00, 0), (000, 3), (02, 0), (03, 1), (04, 0),... 4 3 4 9 0 4 0 0 0 1 0 4 3
18 Ronald Reagan January 25, 1988 Mr. Speaker, Mr. President, and distinguishe... 5.81 [(00, 0), (000, 1), (02, 0), (03, 0), (04, 0),... 13 1 15 17 0 5 1 3 0 0 1 8 4
19 George H.W. Bush January 31, 1990 Tonight, I come not to speak about the "Stat... 31.48 [(00, 0), (000, 1), (02, 0), (03, 0), (04, 0),... 4 2 4 8 1 3 1 0 0 0 0 3 3
20 George H.W. Bush January 29, 1991 Mr. President, Mr. Speaker, members of the U... -3.06 [(00, 0), (000, 0), (02, 0), (03, 0), (04, 0),... 11 3 4 7 2 7 1 0 1 0 0 0 1
21 George H.W. Bush January 28, 1992 Mr. Speaker, Mr. President, distinguished me... 30.23 [(00, 0), (000, 1), (02, 0), (03, 0), (04, 0),... 14 21 7 6 4 11 1 1 0 0 4 1 0
22 William J. Clinton January 25, 1994 Mr. Speaker, Mr. President, members of the 1... 9.97 [(00, 0), (000, 3), (02, 0), (03, 0), (04, 0),... 14 13 10 7 10 30 3 1 0 0 8 12 5
23 William J. Clinton January 24, 1995 Mr. President, Mr. Speaker, members of the 1... 1.33 [(00, 0), (000, 10), (02, 0), (03, 0), (04, 0)... 18 20 10 11 11 19 1 3 0 1 7 9 6
24 William J. Clinton January 23, 1996 Mr. Speaker, Mr. Vice President, members of ... 37.20 [(00, 0), (000, 7), (02, 0), (03, 0), (04, 0),... 8 11 2 9 12 15 0 2 0 0 1 6 4
25 William J. Clinton February 4, 1997 Mr. Speaker, Mr. Vice President, members of ... 22.68 [(00, 0), (000, 7), (02, 0), (03, 0), (04, 0),... 16 10 2 17 7 14 0 2 0 0 1 1 10
26 William J. Clinton January 27, 1998 Mr. Speaker, Mr. Vice President, members of ... 33.10 [(00, 2), (000, 8), (02, 0), (03, 0), (04, 0),... 17 16 3 9 6 16 1 4 1 0 6 8 4
27 William J. Clinton January 19, 1999 Mr. Speaker, Mr. Vice President, members of ... 28.34 [(00, 0), (000, 12), (02, 0), (03, 0), (04, 0)... 15 11 2 16 5 9 2 3 2 1 4 2 9
28 William J. Clinton January 27, 2000 Mr. Speaker, Mr. Vice President, Members of ... 20.89 [(00, 0), (000, 18), (02, 0), (03, 0), (04, 0)... 24 20 1 12 10 11 2 4 0 0 8 2 9
29 George W. Bush February 27, 2001 Mr. Speaker, Mr. Vice President, members of ... -9.03 [(00, 0), (000, 5), (02, 0), (03, 0), (04, 0),... 6 30 14 26 2 4 1 1 0 0 10 1 1
30 George W. Bush September 20, 2001 Mr. Speaker, Mr. President Pro Tempore, memb... -9.03 [(00, 0), (000, 0), (02, 0), (03, 0), (04, 0),... 3 0 0 0 0 0 0 0 0 0 0 0 1
31 George W. Bush January 29, 2002 Thank you very much. Mr. Speaker, Vice Presi... -11.85 [(00, 0), (000, 3), (02, 0), (03, 0), (04, 0),... 7 7 4 7 1 13 0 0 3 0 0 1 3
32 George W. Bush January 28, 2003 Mr. Speaker, Vice President Cheney, Members ... -21.97 [(00, 0), (000, 10), (02, 0), (03, 0), (04, 0)... 13 13 4 4 3 4 1 0 2 0 7 1 2
33 George W. Bush January 20, 2004 Mr. Speaker, Vice President Cheney, members ... 28.36 [(00, 0), (000, 2), (02, 0), (03, 0), (04, 0),... 17 21 2 4 7 13 0 0 1 0 4 1 4
34 George W. Bush February 2, 2005 Mr. Speaker, Vice President Cheney, members ... 10.74 [(00, 0), (000, 1), (02, 0), (03, 0), (04, 0),... 14 11 3 3 4 7 1 1 1 0 3 1 6
35 George W. Bush January 31, 2006 Thank you all. Mr. Speaker, Vice President C... 4.83 [(00, 0), (000, 3), (02, 0), (03, 0), (04, 0),... 23 10 6 3 5 7 0 2 0 0 0 2 1
36 George W. Bush January 23, 2007 Thank you very much. And tonight, I have a hi... 15.61 [(00, 0), (000, 8), (02, 0), (03, 0), (04, 0),... 8 10 3 4 5 6 1 2 0 0 4 3 3
37 George W. Bush January 28, 2008 THE PRESIDENT: Madam Speaker, Vice President ... 5.48 [(00, 0), (000, 3), (02, 0), (03, 0), (04, 0),... 9 16 5 3 6 6 0 2 0 0 1 1 6
38 Barack Obama February 24, 2009 Madam Speaker, Mr. Vice President, Members of... -36.55 [(00, 0), (000, 2), (02, 0), (03, 0), (04, 0),... 30 16 7 13 14 19 3 0 6 1 0 9 5
39 Barack Obama January 27, 2010 Madam Speaker, Vice President Biden, Members... 25.94 [(00, 0), (000, 3), (02, 0), (03, 0), (04, 0),... 18 23 7 6 30 29 1 0 6 3 5 14 3
40 Barack Obama January 25, 2011 Mr. Speaker, Mr. Vice President, Members of ... 14.82 [(00, 0), (000, 8), (02, 0), (03, 0), (04, 0),... 9 14 13 2 20 31 1 0 4 0 1 11 0
41 Barack Obama January 24, 2012 Mr. Speaker, Mr. Vice President, Members of ... 2.10 [(00, 0), (000, 6), (02, 0), (03, 0), (04, 0),... 14 34 6 4 24 42 2 1 2 2 3 7 2
42 Barack Obama February 12, 2013 Please, everybody, have a seat. Mr. Speaker,... 15.89 [(00, 0), (000, 5), (02, 0), (03, 0), (04, 0),... 20 17 5 4 14 47 4 1 1 0 3 11 4
43 Barack Obama January 28, 2014 The President. Mr. Speaker, Mr. Vice Preside... 32.15 [(00, 0), (000, 4), (02, 0), (03, 0), (04, 0),... 17 10 1 4 23 39 2 1 1 0 2 4 4
44 Barack Obama January 20, 2015 The President. Mr. Speaker, Mr. Vice Preside... 13.52 [(00, 0), (000, 6), (02, 0), (03, 0), (04, 0),... 29 7 3 2 13 28 1 0 2 0 2 3 1
45 Barack Obama January 12, 2016 Thank you. Mr. Speaker, Mr. Vice President, ... 1.36 [(00, 0), (000, 3), (02, 0), (03, 0), (04, 0),... 22 5 1 2 9 20 2 0 1 1 4 1 2

In [548]:
# Numeric table for the correlation analysis: the per-stem count columns cast
# to float, with the S&P return appended as the last column.
corr_df = (
    speech_df[word_list]
    .astype(float)
    .assign(**{'S&P_returns': speech_df['S&P_ret']})
)

In [549]:
# Pairwise Pearson correlation between every stem count and the S&P return.
corr_df.corr(method='pearson')


Out[549]:
econom tax spend budget business job wealth poor recession depression income deficit expand S&P_returns
econom 1.000000 0.360545 0.405620 0.666590 0.551643 0.344599 0.389797 0.725681 0.327362 0.027306 0.773112 0.272253 0.808817 0.134699
tax 0.360545 1.000000 0.502747 0.335185 0.482476 0.370167 0.206664 0.273462 0.259708 0.272922 0.532114 0.392023 0.174127 -0.015745
spend 0.405620 0.502747 1.000000 0.487212 0.201357 0.124941 0.127073 0.254297 0.236574 -0.024904 0.430133 0.593890 0.167582 -0.088051
budget 0.666590 0.335185 0.487212 1.000000 0.244018 0.034271 0.268370 0.722511 0.128636 -0.015741 0.677992 0.177492 0.712505 0.156450
business 0.551643 0.482476 0.201357 0.244018 1.000000 0.823798 0.518045 0.316367 0.439961 0.516208 0.424980 0.562949 0.438289 0.187612
job 0.344599 0.370167 0.124941 0.034271 0.823798 1.000000 0.604438 0.123464 0.327553 0.362661 0.229716 0.556960 0.219260 0.195923
wealth 0.389797 0.206664 0.127073 0.268370 0.518045 0.604438 1.000000 0.254956 0.162901 0.181312 0.377103 0.387227 0.411764 -0.062959
poor 0.725681 0.273462 0.254297 0.722511 0.316367 0.123464 0.254956 1.000000 0.122899 -0.112624 0.696532 0.136011 0.791331 0.321000
recession 0.327362 0.259708 0.236574 0.128636 0.439961 0.327553 0.162901 0.122899 1.000000 0.335608 0.128528 0.386472 0.120915 -0.138673
depression 0.027306 0.272922 -0.024904 -0.015741 0.516208 0.362661 0.181312 -0.112624 0.335608 1.000000 0.002217 0.360489 0.002133 -0.062665
income 0.773112 0.532114 0.430133 0.677992 0.424980 0.229716 0.377103 0.696532 0.128528 0.002217 1.000000 0.161173 0.696932 0.079284
deficit 0.272253 0.392023 0.593890 0.177492 0.562949 0.556960 0.387227 0.136011 0.386472 0.360489 0.161173 1.000000 0.114328 0.097495
expand 0.808817 0.174127 0.167582 0.712505 0.438289 0.219260 0.411764 0.791331 0.120915 0.002133 0.696932 0.114328 1.000000 0.243703
S&P_returns 0.134699 -0.015745 -0.088051 0.156450 0.187612 0.195923 -0.062959 0.321000 -0.138673 -0.062665 0.079284 0.097495 0.243703 1.000000

In [518]:
# Scatter-plot matrix over every pair of columns in corr_df.
g = sb.PairGrid(corr_df)
g.map(plt.scatter)
# plt.savefig writes pyplot's current figure, so this must run in the same
# cell, right after the grid is drawn.
plt.savefig('corr.png')