In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#from ggplot import *
from pandas import Series, DataFrame, Panel
import seaborn as sns
import statsmodels.tsa.api
import datetime
import pandas.io.data
from pandas.tools.plotting import *
%pylab qt
In [2]:
# date converter function for pandas time series: copies the index into a
# regular (non-index) 'Date' column so plotting libraries such as ggplot
# can use it, then resets to a default integer index.
def dateConvert(df):
    """Add the index of `df` as a 'Date' column and reset the index.

    Mutates `df` in place by adding the 'Date' column, then returns a new
    frame whose index has been reset to the default RangeIndex.
    """
    dt = df.index
    df['Date'] = dt
    # BUG FIX: reset_index is not in-place by default; the original called
    # it and threw the result away, so the index was never actually reset.
    df = df.reset_index(drop=True)
    return df
In [3]:
pylab.rcParams['figure.figsize'] = (14, 14.0)
In [3]:
In [3]:
In [3]:
In [3]:
Here we read the data from our CSV file into a data frame, parsing the date column as dates and using it as the index.
In [4]:
tweet_data = pd.read_csv('tweet_sentiment_scores.csv', parse_dates=[0], index_col=['time'])
In [5]:
# sanity check -- show the column names that were loaded
tweet_data.columns
Out[5]:
In [6]:
# Peek at the LSI topic words for one day.
# print() call form works under both Python 2 and 3 (the bare print
# statement fails on any modern kernel).
words = tweet_data.lsi_words
print(words['2014-10-20'])
In [7]:
# Split one day's topic-word string on ']' to inspect its pieces.
# print() call form for Python 2/3 compatibility.
word2 = words['2014-10-31'][0]
word2 = word2.split(']')
print(word2)
In [7]:
In [7]:
In [7]:
In [7]:
In [7]:
In [8]:
#tweet_data.total_tweets.rolplot()
In [8]:
In [8]:
In [8]:
# Add the non-indexed Date column and keep only the columns needed for
# ggplot-style melting. The original had a bare tweet_dates.head() in the
# middle of the cell -- a no-op, since only a cell's last expression is
# displayed -- so it is removed.
tweet_dates = dateConvert(tweet_data)
tweet_dates = tweet_dates[['Date','lsi_score','lda_score']]
In [9]:
# Reshape to long format for ggplot-style plotting: one row per
# (Date, score-type) pair, then give the melted columns clearer names.
data = (
    pd.melt(tweet_dates, id_vars='Date')
    .dropna()
    .rename(columns={'variable': 'index', 'value': 'score'})
)
In [10]:
# .info() prints its own summary and returns None -- wrapping it in print
# added a stray "None" line, so call it bare.
data.info()
data.head()
Out[10]:
In [10]:
In [10]:
In [10]:
In [10]:
In [10]:
In [10]:
In [11]:
In [12]:
rets = df.pct_change()
In [12]:
In [13]:
# Pairwise correlation matrix of the hourly returns; displayed as the
# cell's last expression.
corr = rets.corr()
corr
In [14]:
# Heatmap of the correlation matrix computed in the previous cell.
plt.imshow(corr, cmap = 'hot', interpolation='none')
plt.colorbar()
# One tick per variable, labelled with the column names.
plt.xticks(range(len(corr)), corr.columns)
plt.yticks(range(len(corr)), corr.columns)
plt.show()
In [14]:
In [14]:
In [14]:
In [14]:
In [15]:
plt.figure()
Out[15]:
In [16]:
# Lag plot of the LSI score to check hour-to-hour serial dependence.
# lag_plot comes from the pandas.tools.plotting star import above
# (moved to pandas.plotting in modern pandas); show() comes from %pylab.
lag_plot(tweet_data.lsi_score)
show()
In [17]:
# Same lag plot for the LDA score.
lag_plot(tweet_data.lda_score)
show()
In [18]:
# Autocorrelation of the LSI score across all lags.
autocorrelation_plot(tweet_data.lsi_score)
show()
In [14]:
autocorrelation_plot(tweet_data.lda_score)
Out[14]:
In [19]:
# bootstrap_plot moved from pandas.tools.plotting to pandas.plotting
# (pandas.tools was removed in pandas 0.25); try the modern location first
# and fall back so the cell still runs on the old environment.
try:
    from pandas.plotting import bootstrap_plot
except ImportError:
    from pandas.tools.plotting import bootstrap_plot
# Bootstrap resampling plot of the LSI score distribution.
bootstrap_plot(tweet_data.lsi_score, size=50, samples=300, color='grey')
Out[19]:
In [11]:
# 20-hour rolling mean +/- one rolling std dev band for both scores.
# pd.rolling_mean / pd.rolling_std were removed in pandas 1.0; the
# .rolling() accessor is the equivalent (same window/min_periods defaults).
mas = tweet_data.lsi_score.rolling(20).mean()
mstd = tweet_data.lsi_score.rolling(20).std()
mad = tweet_data.lda_score.rolling(20).mean()
madst = tweet_data.lda_score.rolling(20).std()
#plt.plot(tweet_data.index, tweet_data.lsi_score, 'k')
# Label the lines so plt.legend() has artists to show (the original
# called legend() with no labels, which renders an empty legend).
plt.plot(mas.index, mas, label='LSI, 20-hour rolling mean')
plt.fill_between(mstd.index, mas-mstd, mas+mstd, color = 'b', alpha = 0.1)
plt.plot(mad.index, mad, label='LDA, 20-hour rolling mean')
plt.fill_between(madst.index, mad-madst, mad+madst, color = 'g', alpha = 0.1)
plt.xticks(rotation=50)
plt.legend()
# plt.show() instead of the bare show() injected by %pylab.
plt.show()
In [ ]:
In [12]:
# Load stock quotes; merge the Date and Time columns into one 'Timestamp'
# column while parsing, and use it as the index.
stock_data = pd.read_csv(
    '../data/stocks.csv',
    parse_dates={'Timestamp': ['Date', 'Time']},
    index_col='Timestamp',
)
For our purposes, we only want to use the hourly closing price in our analysis. We may return in the future and use other information from our stock data.
In [13]:
stock_data = stock_data[['Stockid','Close']]
In [13]:
In [14]:
dateConvert(stock_data);
In [14]:
Here we reshape the data into a 'wide' format using pandas' `pivot` function.
In [15]:
# Wide format: one column per Stockid, one row per Date, values = Close.
stock_data_pivot = stock_data.pivot(index = 'Date', columns='Stockid', values='Close')
stock_data_pivot.head()
Out[15]:
In [16]:
stock_data_pivot.plot()
Out[16]:
In [17]:
# Join the pivoted stock prices with the tweet-sentiment frame on the
# shared date index.
# NOTE(review): `test` is a vague name for this joined frame, but several
# later cells reference it, so it is kept.
test = stock_data_pivot
test = test.join(tweet_data)
This shows a comparison of the LDA and LSI scores with two rolling means (10-hour and 24-hour windows).
In [18]:
# 10h and 24h rolling mean / std dev for the LSI and LDA scores.
# pd.rolling_mean / pd.rolling_std were removed in pandas 1.0; use the
# .rolling() accessor (same window semantics).
mas10 = tweet_data.lsi_score.rolling(10).mean()
mstd10 = tweet_data.lsi_score.rolling(10).std()
mad10 = tweet_data.lda_score.rolling(10).mean()
madst10 = tweet_data.lda_score.rolling(10).std()
mas24 = tweet_data.lsi_score.rolling(24).mean()
mstd24 = tweet_data.lsi_score.rolling(24).std()
mad24 = tweet_data.lda_score.rolling(24).mean()
madst24 = tweet_data.lda_score.rolling(24).std()
# plot the mean with a half-std-dev band for the 10h rolling mean
plt.plot(mas10.index, mas10, label='LSI, 10-hour Rolling Mean')
plt.fill_between(mstd10.index, mas10-(mstd10*.5), mas10+(mstd10*.5), color = 'b', alpha = 0.1)
plt.plot(mad10.index, mad10, label='LDA, 10-hour Rolling Mean')
plt.fill_between(madst10.index, mad10-(0.5*madst10), mad10+(madst10* 0.5), color = 'g', alpha = 0.1)
# plot mean with half-std-dev band for the 24h rolling mean
plt.plot(mas24.index, mas24, label='LSI, 24-hour Rolling Mean')
plt.fill_between(mstd24.index, mas24-(mstd24*.5), mas24+(mstd24*.5), color = 'r', alpha = 0.1)
plt.plot(mad24.index, mad24, label='LDA, 24-hour Rolling Mean')
plt.fill_between(madst24.index, mad24-(0.5*madst24), mad24+(madst24* 0.5), color = 'k', alpha = 0.1)
# plot options
plt.xticks(rotation=50)
plt.legend(loc = 2, prop={'size':'20'})
plt.title('Comparison of LSI and LDA scores for tweet data', size=24)
plt.tick_params(labelsize=14)
For presentation purposes, we'll just use the LSI scores.
In [19]:
# LSI-only version of the previous figure; reuses mas10/mstd10/mas24/mstd24
# computed in the cell above (hidden-state dependency: that cell must run
# first).
# plot the mean / std dev for 10h rolling mean
plt.plot(mas10.index, mas10, label='LSI, 10-hour Rolling Mean', color = 'b')
plt.fill_between(mstd10.index, mas10-(mstd10*.5), mas10+(mstd10*.5), color = 'b', alpha = 0.1)
# plot mean/std dev for 24h rolling mean
plt.plot(mas24.index, mas24, label='LSI, 24-hour Rolling Mean', color = 'r')
plt.fill_between(mstd24.index, mas24-(mstd24*.5), mas24+(mstd24*.5), color = 'r', alpha = 0.1)
# plot options
plt.xticks(rotation=50)
plt.legend(loc = 2, prop={'size':'20'})
plt.title('LSI score per hour, rolling means', size=24)
plt.tick_params(labelsize=14)
In [28]:
# List the joined frame's columns, then keep the tickers plus the tweet
# features. print() call form for Python 2/3 compatibility.
print(test.keys())
# NOTE(review): testa is not used by any later cell shown here.
testa = test[['aapl', 'amzn', 'fb', 'goog', 'googl', 'msft', 'twtr', 'total_tweets', 'lsi_score', 'lda_score']]
In [29]:
# Scaling helpers. (These imports belong in the import cell at the top of
# the notebook; kept here so this cell stays self-contained.)
from sklearn.preprocessing import scale
from sklearn import preprocessing
from pandas import DataFrame

# PEP 8: prefer a def over assigning a lambda to a name -- same callable
# interface, better tracebacks.
def zscore(x):
    """Standardize x to zero mean and unit (sample) standard deviation."""
    return (x - x.mean()) / x.std()
In [30]:
tweet_data.lsi_score.count()
Out[30]:
In [31]:
# z-score the LSI series and the log of tweets-per-hour so the two are
# comparable on one axis.
z_tweets = scale(tweet_data.lsi_score)
z_tweets = DataFrame(z_tweets, index=tweet_data.index)
# `log` previously came from the %pylab star import; use np.log explicitly.
z_tweets['tph'] = scale(np.log(tweet_data.total_tweets))
z_tweets.columns = ['lsi_score','tweets_per_hour']
z_tweets.plot()
Out[31]:
In [32]:
# Fill gaps in the amzn series by linear interpolation; the counts before
# and after show how many missing values were filled.
# print() call form for Python 2/3 compatibility.
print(stock_data_pivot.amzn.count())
stock_data_pivot.amzn = stock_data_pivot.amzn.interpolate()
print(stock_data_pivot.amzn.count())
In [33]:
# z-score every stock column, keeping the original timestamp index.
z_stock = DataFrame(scale(stock_data_pivot), index=stock_data_pivot.index)
In [46]:
# Name the z-scored columns by ticker, then keep a subset (googl is
# dropped -- presumably redundant with goog; confirm intent).
z_stock.columns = ['aapl', 'amzn', 'fb', 'goog', 'googl', 'msft', 'twtr']
z_stock = z_stock[['aapl', 'amzn', 'fb', 'goog', 'msft', 'twtr']]
# pd.rolling_mean was removed in pandas 1.0 -> .rolling().mean().
z_stock.rolling(24).mean().plot()
Out[46]:
In [47]:
# Join the z-scored stocks with the z-scored tweet features, report the
# LSI coverage, and interpolate remaining gaps.
test = z_stock
test = test.join(z_tweets)
print(test.lsi_score.count())
test = test.interpolate()
# `test` was interpolated on the line above; the original's second
# .interpolate() before .plot() was a redundant no-op.
test.plot()
Out[47]:
In [ ]:
#test.pct_change().plot()
In [52]:
# A window of 1 makes the rolling mean the identity -- this just plots the
# raw tweets-per-hour series. (pd.rolling_mean removed in pandas 1.0.)
tweet_data.total_tweets.rolling(1).mean().plot()
plt.tick_params(labelsize=14)
In [55]:
# Column means and standard deviations of the tweet frame.
# print() call form for Python 2/3 compatibility.
print(tweet_data.mean())
print(tweet_data.std())
In [ ]:
In [56]:
test.lsi_score.interpolate(method='polynomial', order=2).pct_change().plot()
Out[56]:
In [57]:
# NOTE(review): datetime is already imported in the top import cell; the
# `dt` alias is re-imported here so this cell is self-contained.
import datetime as dt
# searchsorted returns the positions where these dates would be inserted
# into the (sorted) index -- i.e. integer offsets, not labels.
start = test.index.searchsorted(dt.datetime(2014,10,10))
end = test.index.searchsorted(dt.datetime(2014,11,18))
In [130]:
small_range = test.ix[start:end]
In [131]:
# Overlay the (interpolated) LSI score with Google's z-scored close over
# the restricted date range.
small_range.lsi_score.interpolate().plot(label = 'lsi')
small_range.goog.plot(label='GOOG')
plt.legend()
plt.tick_params(labelsize=14)
In [60]:
# 60-hour rolling mean of every column in the restricted range.
# pd.rolling_mean removed in pandas 1.0 -> .rolling().mean().
mvavg = small_range.rolling(60).mean()
#pd.rolling_mean(small_range.goog, 60).plot(label = 'rmean')
mvavg.plot()
plt.legend()
plt.tick_params(labelsize=14)
In [62]:
model = statsmodels.tsa.api.VAR(small_range.interpolate(), missing = 'drop')
In [63]:
model.select_order(5)
Out[63]:
In [64]:
results = model.fit(5,ic='aic')
In [65]:
results.summary()
Out[65]:
In [240]:
results.plot()
In [239]:
results.plot_acorr()
Impulse response analysis allows us to examine the impact of a 'shock' to one variable on the other variables in our model.
In [249]:
# Impulse responses over 2 periods; orth=False uses non-orthogonalized
# shocks, and we focus on shocks to lsi_score.
irf = results.irf(2)
irf.plot(impulse='lsi_score', orth=False)
plt.suptitle('Impulse Responses', fontsize = 20)
Out[249]:
In [67]:
In [68]:
# Forecast error variance decomposition over a 10-step horizon.
fevd = results.fevd(10)
fevd.plot()
In [250]:
# Granger-causality test of lsi_score against each variable in the model
# at the 10% level (F-test). The notebook export lost the loop body's
# indentation; restored here. NOTE(review): older statsmodels printed the
# result table by default; newer versions return a result object that
# would need to be printed/collected explicitly -- confirm the version.
for var in results.names:
    results.test_causality(var, ['lsi_score'], signif=0.1, kind='f')
In [69]:
results.plotsim()
In [ ]:
In [ ]:
In [ ]:
In [70]:
# NOTE(review): duplicate of the earlier correlation heatmap cell; `corr`
# was computed much earlier from `rets` and may be stale at this point.
plt.imshow(corr, cmap = 'hot', interpolation='none')
plt.colorbar()
plt.xticks(range(len(corr)), corr.columns)
plt.yticks(range(len(corr)), corr.columns)
plt.show()
In [179]:
# Smooth every series with a 24h rolling mean after interpolating gaps.
# The original repeated pd.rolling_mean (removed in pandas 1.0) once per
# column; a loop over the columns fixes the API and the copy-paste.
smooth_lsi = small_range.interpolate()
for _col in ['lsi_score', 'aapl', 'amzn', 'fb', 'goog', 'msft', 'twtr']:
    smooth_lsi[_col] = smooth_lsi[_col].rolling(24).mean()
#smooth_lsi.lsi_score.plot()
Out[179]:
In [164]:
In [186]:
# Two-panel comparison of the smoothed LSI score against the smoothed
# stock series. The original passed the return value of Series.plot()
# into ax.plot()/fig.add_subplot() -- a bug, since .plot() returns an
# Axes object, not plottable data. Plot onto explicit axes instead.
fig = plt.figure()
ax1 = fig.add_subplot(211)
smooth_lsi.lsi_score.plot(ax=ax1, label='LSI, 24h rolling mean', color='#5e402e')
smooth_lsi.aapl.plot(ax=ax1, label='Apple, 24h rolling mean', color='#519e8b')
ax1.legend()
ax1.tick_params(labelsize=14)
ax2 = fig.add_subplot(212)
smooth_lsi.fb.plot(ax=ax2, label='Facebook, 24h rolling mean', color='#6574b3')
smooth_lsi.goog.plot(ax=ax2, label='Google, 24h rolling mean', color='#bb5868')
smooth_lsi.twtr.plot(ax=ax2, label='Twitter, 24h rolling mean', color='#5da148')
ax2.legend()
ax2.tick_params(labelsize=14)
plt.show()
In [181]:
# pd.rolling_mean removed in pandas 1.0 -> .rolling().mean().
# NOTE: the first call smooths the whole smooth_lsi frame (all columns),
# so the single label/color applies to every line it draws; also these
# columns were already 24h-smoothed above, so this compounds the smoothing.
smooth_lsi.rolling(24).mean().plot(label = 'LSI, 24h rolling mean', color = '#5e402e')
small_range.fb.rolling(24).mean().plot(label='Facebook, 24h rolling mean', color = '#6574b3')
small_range.goog.rolling(24).mean().plot(label='Google, 24h rolling mean', color = '#bb5868')
small_range.twtr.rolling(24).mean().plot(label='Twitter, 24h rolling mean', color = '#5da148')
plt.legend()
plt.tick_params(labelsize = 14)
In [238]:
#smooth_lsi = small_range.lsi_score.interpolate()
#smooth_lsi = pd.rolling_mean(smooth_lsi, 24)
figure(1)
subplot(211)
pd.rolling_mean(small_range.lsi_score, 12).plot(label = 'LSI, 12h rolling mean', color = '#5e402e', linewidth = 2.5)
pd.rolling_mean(small_range.aapl, 24).plot(label='Apple, 24h rolling mean', color = '#519e8b', linewidth = 2)
pd.rolling_mean(small_range.amzn, 24).plot(label='Amazon, 24h rolling mean', color = '#be5fc1', linewidth = 2)
pd.rolling_mean(small_range.fb, 24).plot(label='Facebook, 24h rolling mean', color = '#c8803a', linewidth = 2)
plt.xlabel('')
plt.ylabel('z-scored value')
#fig = plt.figure()
#ax = fig.add_subplot(111)
#ax.grid(True)
#ax.set_xticklabels([])
plt.tick_params(labelsize = 14)
plt.legend()
subplot(212)
pd.rolling_mean(small_range.lsi_score, 12).plot(label = 'LSI, 12h rolling mean', color = '#5e402e', linewidth = 2.5)
#pd.rolling_mean(small_range.lsi_score, 2).plot(label = 'LSI, 24h rolling mean', color = '#5e402e')
pd.rolling_mean(small_range.goog, 24).plot(label='Google, 24h rolling mean', color = '#bb5868', linewidth = 2)
pd.rolling_mean(small_range.msft, 24).plot(label='Microsoft, 24h rolling mean', color = '#6574b3', linewidth = 2)
pd.rolling_mean(small_range.twtr, 24).plot(label='Twitter, 24h rolling mean', color = '#5da148', linewidth = 2)
plt.legend()
plt.ylabel('z-scored value')
plt.tick_params(labelsize = 14)
plt.show()
In [237]:
#smooth_lsi = small_range.lsi_score.interpolate()
#smooth_lsi = pd.rolling_mean(smooth_lsi, 24)
figure(1)
subplot(211)
pd.rolling_mean(small_range.lsi_score, 1).plot(label = 'LSI, 12h rolling mean', color = '#5e402e', linewidth = 2.5)
pd.rolling_mean(small_range.aapl, 4).plot(label='Apple, 24h rolling mean', color = '#519e8b', linewidth = 2)
pd.rolling_mean(small_range.amzn, 4).plot(label='Amazon, 24h rolling mean', color = '#be5fc1', linewidth = 2)
pd.rolling_mean(small_range.fb, 4).plot(label='Facebook, 24h rolling mean', color = '#c8803a', linewidth = 2)
plt.xlabel('')
plt.ylabel('z-scored value')
plt.title('Detail of October 28th - October 30th ', fontsize = 22)
#fig = plt.figure()
#ax = fig.add_subplot(111)
#ax.grid(True)
#ax.set_xticklabels([])
plt.tick_params(labelsize = 14)
plt.legend()
subplot(212)
pd.rolling_mean(small_range.lsi_score, 1).plot(label = 'LSI, 12h rolling mean', color = '#5e402e', linewidth = 2.5)
#pd.rolling_mean(small_range.lsi_score, 2).plot(label = 'LSI, 24h rolling mean', color = '#5e402e')
pd.rolling_mean(small_range.goog, 4).plot(label='Google, 24h rolling mean', color = '#bb5868', linewidth = 2)
pd.rolling_mean(small_range.msft, 4).plot(label='Microsoft, 24h rolling mean', color = '#6574b3', linewidth = 2)
pd.rolling_mean(small_range.twtr, 4).plot(label='Twitter, 24h rolling mean', color = '#5da148', linewidth = 2)
plt.legend()
plt.ylabel('z-scored value')
plt.tick_params(labelsize = 14)
plt.show()
In [88]:
# pd.rolling_mean removed in pandas 1.0 -> .rolling().mean().
# Label fixed: the second series is msft, but the original said 'GOOG'.
smooth_lsi.rolling(24).mean().plot(label = 'LSI, 24h rolling mean')
small_range.msft.rolling(24).mean().plot(label='Microsoft, 24h rolling mean')
plt.legend()
plt.tick_params(labelsize=14)
In [89]:
# Smoothed LSI vs Twitter close.
# pd.rolling_mean removed in pandas 1.0 -> .rolling().mean().
smooth_lsi.rolling(24).mean().plot(label = 'LSI, 24h rolling mean')
small_range.twtr.rolling(24).mean().plot(label='Twitter, 24h rolling mean')
plt.legend()
plt.tick_params(labelsize=14)
In [ ]: