Model

Amy Ma



In [1]:

    
import pandas as pd
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

from sklearn.decomposition import FastICA
import sklearn.linear_model as lm
import statsmodels.api as sm

import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn import metrics,preprocessing,cross_validation
from statsmodels.tsa.stattools import adfuller
from pandas.tools.plotting import autocorrelation_plot
from sklearn.ensemble import GradientBoostingRegressor









    



Using TensorFlow backend.

Introduction

Functions



In [2]:

    
# define a function to import datasets
'''
Define a function to read all the dataset:
    1.source = the data folder
    2.dataname = the file name
    3.date = the date column
''' 
def read_adj(source,dataname,date):
    # define a function to parse the date
    dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d')
    # give path
    path = source+dataname
    # read and save the data 
    data = pd.read_csv(path,parse_dates=[date], date_parser=dateparse) 
    data = data.set_index(date)
    # sort data by datatime index
    data = data.sort_index()
    print "Basic information of "+dataname+":"
    print data.head() # to see the data
    print "*" *100
    print data.describe() # summary the data
    return data
def read_tweet(source,dataname,date):
    # define a function to parse the date
    dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d')
    # give path
    path = source+dataname
    # read and save the data 
    data = pd.read_csv(path,parse_dates=[date], date_parser=dateparse) 
    #data = data.set_index(date)
    # sort data by datatime index
    data = data.sort_index()
    print "Basic information of "+dataname+":"
    print data.head() # to see the data
    print "*" *100
    print data.describe() # summary the data
    print "*" *100
    return data



In [3]:

    
# define a function to give daily returns
'''
This function will take a DataFrame with adjuseted close price, 
and return a DataFrame with daily returns.
For dsk, we also account for corporate action
1. adj_close = adjusted close price
2. ca = corporation action
'''
def get_returns(adj_close, ca=None):
    if ca is None:
        # color is not None
        return adj_close.diff()/adj_close.shift(1)
    else: 
        return (adj_close.diff()+ca)/adj_close.shift(1)
def merge_tweet(data1,data2,label):
    data2 = pd.DataFrame(data1).merge(data2, how='left', right_index=True, left_index=True)
    data2.fillna(0,inplace=True)# fill na=0
    data2 = data2[label]
    return pd.DataFrame(data2)



In [4]:

    
# define a function to plot the returns
'''
This function will make a gerneral plot for returns
1. returns1,returns2 = company's returns
2. width,height = the size of the graph
'''
def plot_return(returns1,name1,returns2,name2,width,height,color1,color2):
    fig = plt.figure()
    fig.set_size_inches(width,height)
    # put together
    ax1 = fig.add_subplot(211)
    ax1.set_title('Combined plot', fontsize=20)
    ax1.plot(returns1, color1)
    ax1.plot(returns2, color2)
    # separate
    ax2 = fig.add_subplot(223)
    ax2.set_title(name1, fontsize=20)
    ax2.plot(returns1, color1)

    ax3 = fig.add_subplot(224)
    ax3.set_title(name2, fontsize=20)
    ax3.plot(returns2, color2)
    #fig, ax = plt.subplots()
    #fig.set_size_inches(width,height)
    #plt.plot(cab_returns,color=color)

def plot_tweets(data1,data2,return1,return2,label):
    a = scale_01(data1)
    b = scale_01(data2)
    fig = plt.figure()
    fig.set_size_inches(18,15)
    # put together
    ax1 = fig.add_subplot(221)
    title = 'Returns vs. ' +label+'(CAB)'
    ax1.set_title(title, fontsize=20)
    ax1.plot(cab, "black")
    ax1.plot(a,"blue")
    # separate
    ax2 = fig.add_subplot(222)
    title = 'Returns vs. ' +label+'(DKS)'
    ax2.set_title(title, fontsize=20)
    ax2.plot(cab, "black")
    ax2.plot(b,"green")



In [5]:

    
def get_ICA(data,n_comps=3,plot=False):
    #result = pd.concat(frame, axis=1)
    #result = data
    #X=result.values
    X = data
    #X = (X-X.mean())#/X.std(ddof=0)
    ica = FastICA(n_components=n_comps)
    S_ = ica.fit_transform(X)  # Reconstruct signals
    A_ = ica.mixing_  # Get estimated mixing matrix
    ICs = pd.DataFrame({"c1":S_[:,0],'c2':S_[:,1],'c3':S_[:,2]})
    # Make plots to see the signal
    if plot==True:
        fig=plt.figure()
        fig.set_size_inches(18,16)
        models = [S_]
        names = ['ICA recovered signals']
        colors = range(0,n_comps)
        for ii, (model, name) in enumerate(zip(models, names), 1):
            plt.subplot(2, 1, ii)
            plt.title(name)
            for sig, color in zip(model.T, colors):
                plt.plot(sig, color=color)
        plt.subplots_adjust(0.09, 0.04, 0.94, 0.94, 0.26, 0.46)
        plt.show()
    return ICs



In [6]:

    
def crosscorr(data1, data2, start, end, name):
    dict_for_df = {}
    for i in range(start,end):   
        lag = i
        corr = data1.corr(data2.shift(lag)) 
        dict_for_df[i] = [corr]     # store in a dict
    df = pd.DataFrame(dict_for_df).T # after the loop convert the dict to a dataframe
    df.columns = name
    # make a plot 
    plt.plot(df)
    plt.show()
    return df

def leadlag_corr_ic(data1,data2,start,end,threshold):
    df = data1.reset_index(drop=True)
    # component1
    cr1 = crosscorr(df,data2["c1"],0,30,["corr1"])
    print ("For compenent1 use lag = ")
    print cr1[cr1["corr1"]>threshold]
    # component2
    cr2 = crosscorr(df,data2["c2"],0,30,["corr2"])
    #cr2 = cr2.reindex(cr2.abs().sort_values(["corr2"],inplace=False,ascending= False).index).T.iloc[0,0:n_show]
    print "For compenent2 use lag = " 
    print cr2[cr2["corr2"]>threshold]
    # component3
    cr3 = crosscorr(df,data2["c3"],0,30,["corr3"])
    #cr3 = cr3.reindex(cr3.abs().sort_values(["corr3"],inplace=False,ascending= False).index).T.iloc[0,0:n_show]
    print "For compenent3 use lag = " 
    print cr3[cr3["corr3"]>threshold]
    c1 = cr1[cr1["corr1"]>threshold].index.values
    c2 = cr2[cr2["corr2"]>threshold].index.values
    c3 = cr3[cr3["corr3"]>threshold].index.values
    return c1,c2,c3

def leadlag_corr(data1,data2_frame,start,end,threshold):
    df = data1.reset_index(drop=True)
    data2 = pd.concat(data2_frame, axis=1).reset_index(drop=True)
#     lag = []
#     def print_leadlag(col):
#         data = crosscorr(df,data2.iloc[:,col],0,30,[col])
#         print "For "+col+ "use lag = " 
#         print data[data[label.abs()]>threshold]
#         corr= data[data[label.abs()]>threshold].index.values
#         return corr
#     for i in range(1,data2.shape[1]+1):
#         lag.append(print_leadlag(col="data"))
# data1
    cr1 = crosscorr(df,data2.iloc[:,0],0,30,["corr1"])
    print ("For dataset1 use lag = ")
    print cr1[cr1["corr1"].abs()>threshold]
    # data2
    cr2 = crosscorr(df,data2.iloc[:,1],0,30,["corr2"])
    #cr2 = cr2.reindex(cr2.abs().sort_values(["corr2"],inplace=False,ascending= False).index).T.iloc[0,0:n_show]
    print "For dataset2 use lag = " 
    print cr2[cr2["corr2"].abs()>threshold]
    # data3
    cr3 = crosscorr(df,data2.iloc[:,2],0,30,["corr3"])
    #cr3 = cr3.reindex(cr3.abs().sort_values(["corr3"],inplace=False,ascending= False).index).T.iloc[0,0:n_show]
    print "For dataset3 use lag = " 
    print cr3[cr3["corr3"].abs()>threshold]
    # data4
    cr4 = crosscorr(df,data2.iloc[:,3],0,30,["corr4"])
    print "For dataset4 use lag = " 
    print cr4[cr4["corr4"].abs()>threshold]
    # data5
    cr5 = crosscorr(df,data2.iloc[:,4],0,30,["corr5"])
    print "For dataset5 use lag = " 
    print cr5[cr5["corr5"].abs()>threshold]
#     # data6
#     cr6 = crosscorr(df,data2.iloc[:,4],0,30,["corr6"])
#     print "For dataset6 use lag = " 
#     print cr6[cr6["corr6"].abs()>threshold]
#     # data7
#     cr7 = crosscorr(df,data2.iloc[:,4],0,30,["corr7"])
#     print "For dataset7 use lag = " 
#     print cr7[cr7["corr7"].abs()>threshold]
#     # data8
#     cr8 = crosscorr(df,data2.iloc[:,4],0,30,["corr8"])
#     print "For dataset8 use lag = " 
#     print cr8[cr8["corr8"].abs()>threshold]
    c1 = cr1[cr1["corr1"].abs()>threshold].index.values
    c2 = cr2[cr2["corr2"].abs()>threshold].index.values
    c3 = cr3[cr3["corr3"].abs()>threshold].index.values
    c4 = cr4[cr4["corr4"].abs()>threshold].index.values
    c5 = cr5[cr5["corr5"].abs()>threshold].index.values
#     c6 = cr6[cr6["corr6"].abs()>threshold].index.values
#     c7 = cr7[cr7["corr7"].abs()>threshold].index.values
#     c8 = cr8[cr8["corr8"].abs()>threshold].index.values
    return c1,c2,c3,c4,c5#,c6,c7,c8



In [7]:

    
def split_trn_tst(data,porp):
    train_size = int(len(data)*porp)
    test_size = len(data)-train_size
    train = data.iloc[0:train_size]
    test = data.iloc[train_size:len(data)] 
    print(len(train), len(test))
    return train,test
# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=1):
	dataX, dataY = [], []
	for i in range(len(dataset)-look_back-1):
		a = dataset[i:(i+look_back), 0]
		dataX.append(a)
		dataY.append(dataset[i + look_back, 0])
	return np.array(dataX), np.array(dataY)
def leadlag_feature(dataset,look_back):
    dataset = dataset.values.reshape(len(dataset),1)
    trainX,trainY=create_dataset(dataset,look_back)
    trainX= np.reshape(trainX, (trainX.shape[0], trainX.shape[1]))
    trainX= pd.DataFrame(trainX)
    frame=[trainX.max(axis=1), trainX.min(axis=1), trainX.mean(axis=1), trainX.median(axis=1)]
    others_feature = reduce(lambda left,right: pd.concat([left.reset_index(drop=True), 
                                                      right.reset_index(drop=True)],axis=1),frame)
    return others_feature



In [8]:

    
def scale_01(data):
    return (data-data.min())/(data.max()-data.min())
def nminmax_tsttrn(seed,data,split):    
    np.random.seed(seed)
    # Change into arrays
    dataset = data.values
    # normalize the dataset
    scaler = MinMaxScaler(feature_range=(0, 1)) # or 0,1
    dataset = scaler.fit_transform(dataset).reshape(len(data),1)
    # split into train and test sets
    train_size = int(len(dataset) * split)
    test_size = len(dataset) - train_size
    train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
    print(len(train), len(test))
    return train, test 
def get_wmdata(seed,data,split,look_back):
    train, test = nminmax_tsttrn(seed,data,split)
    trainX, trainY = create_dataset(train, look_back)
    testX, testY = create_dataset(test, look_back)
    return trainX,trainY,testX,testY

def wmd_ica(seed, data, split, look_back, n_component):
    trainX,trainY,testX,testY = get_wmdata(seed,data,split,look_back=10)
    trainX_ica=get_ICA(trainX,n_component)
    testX_ica=get_ICA(testX,n_component)
    return np.array(trainX_ica),trainY,np.array(testX_ica),testY 
def Rmse(model, X, Y):
    return math.sqrt(mean_squared_error(Y, model.predict(X)))



In [9]:

    
def plot_lm(datay,trainPredict,testPredict):
    # shift train predictions for plotting
    trainPredictPlot = np.empty_like(datay)
    trainPredictPlot[:] = np.nan
    trainPredictPlot[0:len(trainPredict)] = trainPredict
    # shift test predictions for plotting
    testPredictPlot = np.empty_like(datay)
    testPredictPlot[:] = np.nan
    testPredictPlot[len(trainPredict):] = testPredict
    # plot baseline and predictions
    fig = plt.figure()
    fig.set_size_inches(18,16)
    plt.plot(cab_others["returns"].values)
    plt.plot(trainPredictPlot)
    plt.plot(testPredictPlot)
    plt.show()



In [10]:

    
def test_stationarity(timeseries):
    
    #Determing rolling statistics
    rolmean = pd.rolling_mean(timeseries, window=10)
    rolstd = pd.rolling_std(timeseries, window=10)

    #Plot rolling statistics:
    orig = plt.plot(timeseries, color='blue',label='Original')
    mean = plt.plot(rolmean, color='red', label='Rolling Mean')
    std = plt.plot(rolstd, color='black', label = 'Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)
    
    #Perform Dickey-Fuller test:
    print 'Results of Dickey-Fuller Test:'
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value
    print dfoutput



In [11]:

    
def cv_lstm(model,data,k_fold,**extraKwArgs):
    train_score = [0]*(k_fold-1)
    test_score = [0]*(k_fold-1)
    # always make sure input is [y,x]
    if len(data)> 1:
        data = map(pd.DataFrame,data)
        data = pd.concat(data, axis=1)
    else: 
        data = data
    L = int(len(data)/k_fold)
    data = np.array(data)
    for i in range(1,k_fold):
        
        ind = i*(L+1)
        ind2 = (i+1)*(L+1)
        
        trainX = data[0:ind,1:]
        trainY = data[0:ind,0]
        validX= data[ind:ind2,1:]
        validY = data[ind:ind2,0]
        train_score[i-1],test_score[i-1] = model(trainX, trainY,validX,validY,**extraKwArgs)[2]
    print('Train Score: %.5f RMSE' % ((np.array(train_score)).mean()))
    print('Test Score: %.5f RMSE' % ((np.array(test_score)).mean()))

def cv_ridge(data,k_fold,**extraKwArgs):
    train_score = [0]*(k_fold-1)
    test_score = [0]*(k_fold-1)
    # always make sure input is [y,x]
    if len([data])> 1:
        data = map(pd.DataFrame,data)
        data = pd.concat(data, axis=1)
    else: 
        data = data
    L = int(len(data)/k_fold)
    data = np.array(data)
    for i in range(1,k_fold):       
        ind = i*(L+1)
        ind2 = (i+1)*(L+1)
        trainX = data[0:ind,1:]
        trainY = data[0:ind,0]
        validX= data[ind:ind2,1:]
        validY = data[ind:ind2,0]
        model = lm.Ridge(**extraKwArgs)
        model.fit(trainX,trainY)

        train_score[i-1] = Rmse(model,trainX, trainY )
        test_score[i-1] = Rmse(model,validX,validY)
        rmse_trn = (np.array(train_score)).mean()
        rmse_tst = (np.array(test_score)).mean()
    print('Train Score: %.5f RMSE' % ((np.array(train_score)).mean()))
    print('Test Score: %.5f RMSE' % ((np.array(test_score)).mean()))
    return rmse_trn,rmse_tst



In [12]:

    
### LSTM model
# ------------ Model 1 ------------- #
def LSM(dataset,seed=10,look_back=10,split=0.7,n_loop=100,layer2=4,show_log = 1,batch_size = 1,cv=False):
    np.random.seed(7)
    # load the dataset
    dataset = dataset.values
    # normalize the dataset
    scaler = MinMaxScaler(feature_range=(0, 1))
    dataset = scaler.fit_transform(dataset).reshape(len(dataset),1)
    # split into train and test sets
    train_size = int(len(dataset) * split)
    test_size = len(dataset) - train_size
    train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]
    # reshape into X=t and Y=t+1
    trainX, trainY = create_dataset(train, look_back)
    testX, testY = create_dataset(test, look_back)
    # reshape input to be [samples, time steps, features]
    trainX = np.reshape(trainX, (trainX.shape[0], trainX.shape[1], 1))
    testX = np.reshape(testX, (testX.shape[0], testX.shape[1], 1))
    # create and fit the LSTM network
    batch_size = 1
    model = Sequential()
    model.add(LSTM(layer2, batch_input_shape=(batch_size, look_back, 1), stateful=True))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    for i in range(n_loop):
        model.fit(trainX, trainY, nb_epoch=1, batch_size=batch_size, verbose=show_log, shuffle=False)
        model.reset_states()
    # make predictions
    trainPredict = model.predict(trainX, batch_size=batch_size)
    model.reset_states()
    testPredict = model.predict(testX, batch_size=batch_size)
    # invert predictions
    trainPredict = scaler.inverse_transform(trainPredict)
    trainY = scaler.inverse_transform([trainY])
    testPredict = scaler.inverse_transform(testPredict)
    testY = scaler.inverse_transform([testY])
    # calculate root mean squared error
    trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
    testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
    if cv == False:
        print('Train Score: %.5f RMSE' % (trainScore))
        print('Test Score: %.5f RMSE' % (testScore))
        # shift train predictions for plotting
        trainPredictPlot = np.empty_like(dataset)
        trainPredictPlot[:, :] = np.nan
        trainPredictPlot[look_back:len(trainPredict)+look_back, :] = trainPredict
        # shift test predictions for plotting
        testPredictPlot = np.empty_like(dataset)
        testPredictPlot[:, :] = np.nan
        testPredictPlot[len(trainPredict)+(look_back*2)+1:(len(dataset)-1), :] = testPredict
    # plot baseline and predictions
        fig = plt.figure()
        fig.set_size_inches(18,16)
        plt.plot(scaler.inverse_transform(dataset))
        plt.plot(trainPredictPlot)
        plt.plot(testPredictPlot)
        plt.show()
        return trainPredict,testPredict
    else:
        return trainScore,testScore
    
# ------------ Model 2 ------------- #
def LSTM(dataset,seed=10,look_back=10,split=0.7,n_loop=100,layer2=4,batch =1,show_log = 1,cv=False):
    np.random.seed(7)
    # load the dataset
    dataset = dataset.values
    # normalize the dataset
    scaler = MinMaxScaler(feature_range=(0, 1))
    dataset = scaler.fit_transform(dataset).reshape(len(dataset),1)
    # split into train and test sets
    train_size = int(len(dataset) * split)
    test_size = len(dataset) - train_size
    train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]
    # reshape into X=t and Y=t+1
    trainX, trainY = create_dataset(train, look_back)
    testX, testY = create_dataset(test, look_back)
    # reshape input to be [samples, time steps, features]
    trainX = np.reshape(trainX, (trainX.shape[0], trainX.shape[1], 1))
    testX = np.reshape(testX, (testX.shape[0], testX.shape[1], 1))
    # create and fit the LSTM network
    batch_size = batch
    model = Sequential()
    model.add(LSTM(layer2, batch_input_shape=(batch_size, look_back, 1), stateful=True, return_sequences=True))
    model.add(LSTM(layer2, batch_input_shape=(batch_size, look_back, 1), stateful=True))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    for i in range(n_loop):
        model.fit(trainX, trainY, nb_epoch=1, batch_size=batch_size, verbose=show_log, shuffle=False)
        model.reset_states()
    # make predictions
    trainPredict = model.predict(trainX, batch_size=batch_size)
    model.reset_states()
    testPredict = model.predict(testX, batch_size=batch_size)
    # invert predictions
    trainPredict = scaler.inverse_transform(trainPredict)
    trainY = scaler.inverse_transform([trainY])
    testPredict = scaler.inverse_transform(testPredict)
    testY = scaler.inverse_transform([testY])
    # calculate root mean squared error
    trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
    testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
    if cv == False:
        print('Train Score: %.5f RMSE' % (trainScore))
        print('Test Score: %.5f RMSE' % (testScore))
        # shift train predictions for plotting
        trainPredictPlot = np.empty_like(dataset)
        trainPredictPlot[:, :] = np.nan
        trainPredictPlot[look_back:len(trainPredict)+look_back, :] = trainPredict
        # shift test predictions for plotting
        testPredictPlot = np.empty_like(dataset)
        testPredictPlot[:, :] = np.nan
        testPredictPlot[len(trainPredict)+(look_back*2)+1:(len(dataset)-1), :] = testPredict
    # plot baseline and predictions
        fig = plt.figure()
        fig.set_size_inches(18,16)
        plt.plot(scaler.inverse_transform(dataset))
        plt.plot(trainPredictPlot)
        plt.plot(testPredictPlot)
        plt.show()
        return trainPredict,testPredict
    else:
        return trainScore,testScore
    
# ------------ Model 3 ------------- #
def LSTM_ica(dataset,seed=10,look_back=10,n_component=3,split=0.7,layer2=4,n_loop=100,show_log = 1,batch_size = 1,cv=False):  
    np.random.seed(seed)
    # Change into arrays
    dataset = dataset.values
    # normalize the dataset
    scaler = MinMaxScaler(feature_range=(0, 1)) # or 0,1
    dataset = scaler.fit_transform(dataset).reshape(len(dataset),1)
    # split into train and test sets
    train_size = int(len(dataset) * split)
    test_size = len(dataset) - train_size
    train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
    print(len(train), len(test))
    trainX, trainY = create_dataset(train, look_back)
    testX, testY = create_dataset(test, look_back)
    # get ICA for all the lookback
    trainX = np.array(get_ICA(trainX,n_component))
    testX = np.array(get_ICA(testX,n_component))
    # reshape input to be [samples, time steps, features]
    trainX = np.reshape(trainX, (trainX.shape[0], trainX.shape[1], 1))
    testX = np.reshape(testX, (testX.shape[0], testX.shape[1], 1))
    # create and fit the LSTM network
    model = Sequential()
    model.add(LSTM(layer2, batch_input_shape=(batch_size, n_component, 1), stateful=True))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    for i in range(n_loop):
        model.fit(trainX, trainY, nb_epoch=1, batch_size=batch_size, verbose=show_log, shuffle=False)
        model.reset_states()
    # make predictions
    trainPredict = model.predict(trainX, batch_size=batch_size)
    model.reset_states()
    testPredict = model.predict(testX, batch_size=batch_size)
    # invert predictions
    trainPredict = scaler.inverse_transform(trainPredict)
    trainY = scaler.inverse_transform([trainY])
    testPredict = scaler.inverse_transform(testPredict)
    testY = scaler.inverse_transform([testY])
    # calculate root mean squared error
    trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
    testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
    if cv == False:
        print('Train Score: %.5f RMSE' % (trainScore))
        print('Test Score: %.5f RMSE' % (testScore))
        predict =pd.DataFrame(np.concatenate([trainPredict,testPredict]))
    # plot baseline and predictions
        fig = plt.figure()
        fig.set_size_inches(18,16)
        plt.plot(scaler.inverse_transform(dataset))
        plt.plot(Predict)
        plt.show()
        return trainPredict,testPredict
    else:
        return trainScore,testScore
# ------------ Model 4 ------------- #
def LSTM_others(dataset,seed=10,n_features=10,split=0.7,layer2=4,n_loop=100,show_log = 1,batch_size = 1,cv=False):  
    np.random.seed(seed)
    # Change into arrays
    dataset = dataset.values
    shape = dataset.shape
    print shape
    # normalize the dataset
    scaler = MinMaxScaler(feature_range=(0, 1)) # or 0,1
    dataset = scaler.fit_transform(dataset).reshape(shape)
    # split into train and test sets
    train_size = int(len(dataset) * split)
    test_size = len(dataset) - train_size
    train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
    print(len(train), len(test))
    trainX = train[:,1:]
    trainY = train[:,0]
    testX = test[:,1:]
    testY = test[:,0]
    print trainX.shape
    # reshape input to be [samples, time steps, features]
    trainX = np.reshape(trainX, (trainX.shape[0], trainX.shape[1], 1))
    testX = np.reshape(testX, (testX.shape[0], testX.shape[1], 1))
    # create and fit the LSTM network
    model = Sequential()
    model.add(LSTM(layer2, batch_input_shape=(batch_size,n_features , 1), stateful=True))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    for i in range(n_loop):
        model.fit(trainX, trainY, nb_epoch=1, batch_size=batch_size, verbose=show_log, shuffle=False)
        model.reset_states()
    # make predictions
    trainPredict = model.predict(trainX, batch_size=batch_size)
    model.reset_states()
    testPredict = model.predict(testX, batch_size=batch_size)
    # invert predictions
    trainPredict = scaler.inverse_transform(trainPredict)
    trainY = scaler.inverse_transform([trainY])
    testPredict = scaler.inverse_transform(testPredict)
    testY = scaler.inverse_transform([testY])
    # calculate root mean squared error
    trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
    testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
    if cv == False:
        print('Train Score: %.5f RMSE' % (trainScore))
        print('Test Score: %.5f RMSE' % (testScore))
        # shift train predictions for plotting
        trainPredictPlot = np.empty_like(dataset)
        trainPredictPlot[:, :] = np.nan
        trainPredictPlot[:len(trainPredict)+1, :] = trainPredict
        # shift test predictions for plotting
        testPredictPlot = np.empty_like(dataset)
        testPredictPlot[:, :] = np.nan
        testPredictPlot[len(trainPredict)+1:(len(dataset)-1), :] = testPredict
    # plot baseline and predictions
        fig = plt.figure()
        fig.set_size_inches(18,16)
        plt.plot(scaler.inverse_transform(dataset))
        plt.plot(trainPredictPlot)
        plt.plot(testPredictPlot)
        plt.show()
        return trainPredict,testPredict
    else:
        return trainScore,testScore



In [13]:

    
def blending(data,split,**extraKwArgs):
    '''
    need to define learning_rate,
    max_depth,random_state
    '''
    train, test = split_trn_tst(data,split)
    train, test = map(np.array,[train,test])
    X_train , X_test = train[:,1:], test[:,1:]
    y_train, y_test = train[:,0], test[:,0]
    est = GradientBoostingRegressor(n_estimators=20,loss='ls',**extraKwArgs).fit(X_train, y_train)
    print math.sqrt(mean_squared_error(y_test, est.predict(X_test)))
    return est.predict(X_train),est.predict(X_test)

Data preparation

1. Stcok market data



In [14]:

    
# read adjusted close price 
cab_adj = read_adj("Data/StockData/","CAB_2011_2016.csv","Date")
dks_adj = read_adj('Data/StockData/','DKS_2011_2016.csv',"Date")
sp500_adj = read_adj('Data/StockData/','SP500_2011_2016.csv',"Date")
#ust_adj = read_adj("Data/StockData/","UST_2011_2016.csv","Date")
hibb_adj = read_adj('Data/StockData/','HIBB_2011_2016.csv',"Date")
cab_peps = read_adj("Data/StockData/","CAB_2011_2016_PEPS.csv","Date")
dks_peps = read_adj('Data/StockData/','DKS_2011_2016_PEPS.csv',"Date")
# read corporate action
dks_ca = read_adj('Data/StockData/','dks_ca.csv',"Before_Ex_Date")
# merge data by date 
dks_adj = dks_adj.merge(dks_ca, how='outer', right_index=True, left_index=True)
dks_adj["Gross"].fillna(0,inplace=True)# fill na=0









    



Basic information of CAB_2011_2016.csv:
                 Open       High        Low      Close  Volume  Adj Close
Date                                                                     
2011-10-10  22.030001  23.430000  21.969999  23.360001  534900  23.360001
2011-10-11  23.040001  23.209999  22.570000  23.110001  323400  23.110001
2011-10-12  23.160000  24.000000  22.549999  23.570000  520700  23.570000
2011-10-13  23.370001  23.719999  22.799999  23.160000  226400  23.160000
2011-10-14  23.549999  23.730000  23.100000  23.690001  312400  23.690001
****************************************************************************************************
              Open         High          Low        Close        Volume  \
count  1258.000000  1258.000000  1258.000000  1258.000000  1.258000e+03   
mean     51.160851    51.796828    50.527440    51.188625  8.663331e+05   
std      11.598876    11.653256    11.530282    11.581525  1.039083e+06   
min      22.030001    22.350000    21.510000    21.719999  1.188000e+05   
25%      45.214999    45.832501    44.585001    45.142500  4.670500e+05   
50%      51.580002    52.084999    51.050001    51.560000  6.601000e+05   
75%      60.740002    61.265000    59.860001    60.707499  9.312000e+05   
max      71.769997    72.540001    71.320000    71.709999  1.985460e+07   

         Adj Close  
count  1258.000000  
mean     51.188625  
std      11.581525  
min      21.719999  
25%      45.142500  
50%      51.560000  
75%      60.707499  
max      71.709999  
Basic information of DKS_2011_2016.csv:
                 Open       High        Low      Close   Volume  Adj Close
Date                                                                      
2011-10-10  34.950001  35.759998  34.919998  35.669998   902500  33.457192
2011-10-11  35.349998  35.450001  34.790001  35.150002  1421400  32.969454
2011-10-12  35.389999  36.299999  35.090000  35.689999  1084200  33.475952
2011-10-13  36.259998  36.980000  35.340000  36.049999  1232100  33.813619
2011-10-14  36.529999  37.240002  35.380001  37.209999  1767800  34.901658
****************************************************************************************************
              Open         High          Low        Close        Volume  \
count  1258.000000  1258.000000  1258.000000  1258.000000  1.258000e+03   
mean     48.517043    49.043132    47.983808    48.520493  1.661620e+06   
std       5.682248     5.661853     5.685976     5.660839  1.339297e+06   
min      33.439999    34.459999    33.419998    34.240002  2.257000e+05   
25%      44.932500    45.490002    44.529999    44.964999  1.029775e+06   
50%      49.279999    49.820000    48.759998    49.299999  1.332150e+06   
75%      52.290001    52.709999    51.759998    52.287501  1.884200e+06   
max      61.599998    61.939999    61.060001    61.590000  2.298370e+07   

         Adj Close  
count  1258.000000  
mean     47.167740  
std       5.664285  
min      32.904567  
25%      43.696340  
50%      47.561726  
75%      50.728573  
max      61.590000  
Basic information of SP500_2011_2016.csv:
                   Open         High          Low        Close      Volume  \
Date                                                                         
2011-10-10  1158.150024  1194.910034  1158.150024  1194.890015  4446800000   
2011-10-11  1194.599976  1199.239990  1187.300049  1195.540039  4424500000   
2011-10-12  1196.189941  1220.250000  1196.189941  1207.250000  5355360000   
2011-10-13  1206.959961  1207.459961  1190.579956  1203.660034  4436270000   
2011-10-14  1205.650024  1224.609985  1205.650024  1224.579956  4116690000   

              Adj Close  
Date                     
2011-10-10  1194.890015  
2011-10-11  1195.540039  
2011-10-12  1207.250000  
2011-10-13  1203.660034  
2011-10-14  1224.579956  
****************************************************************************************************
              Open         High          Low        Close        Volume  \
count  1258.000000  1258.000000  1258.000000  1258.000000  1.258000e+03   
mean   1778.084721  1786.762226  1768.980392  1778.768537  3.591461e+09   
std     297.466069   297.740380   296.557592   297.134179  6.870853e+08   
min    1158.150024  1172.660034  1158.150024  1158.670044  5.362000e+08   
25%    1467.519989  1472.412537  1461.705017  1471.022553  3.188635e+09   
50%    1867.380005  1875.965027  1854.505005  1867.065002  3.524105e+09   
75%    2052.169983  2064.132446  2042.119964  2052.127502  3.936755e+09   
max    2187.810059  2193.810059  2186.800049  2190.149902  7.597450e+09   

         Adj Close  
count  1258.000000  
mean   1778.768537  
std     297.134179  
min    1158.670044  
25%    1471.022553  
50%    1867.065002  
75%    2052.127502  
max    2190.149902  
Basic information of HIBB_2011_2016.csv:
                 Open       High        Low      Close  Volume  Adj Close
Date                                                                     
2011-10-10  36.820000  37.320000  36.169998  37.060001  269100  37.060001
2011-10-11  36.750000  37.900002  36.330002  37.750000  269000  37.750000
2011-10-12  37.990002  38.570000  37.250000  38.310001  424700  38.310001
2011-10-13  38.009998  38.369999  37.240002  38.259998  311800  38.259998
2011-10-14  38.630001  38.980000  37.770000  38.959999  170300  38.959999
****************************************************************************************************
              Open         High          Low        Close        Volume  \
count  1258.000000  1258.000000  1258.000000  1258.000000  1.258000e+03   
mean     48.697735    49.328593    48.069316    48.706153  3.550426e+05   
std       9.127066     9.178047     9.083108     9.124431  2.931716e+05   
min      27.719999    29.450001    27.580000    28.650000  5.260000e+04   
25%      42.230000    42.765002    41.572501    42.047501  2.218750e+05   
50%      50.029999    50.680000    49.485001    50.160000  2.942500e+05   
75%      55.585000    56.270000    55.000000    55.737500  3.944000e+05   
max      67.839996    68.309998    67.040001    67.730003  6.236800e+06   

         Adj Close  
count  1258.000000  
mean     48.706153  
std       9.124431  
min      28.650000  
25%      42.047501  
50%      50.160000  
75%      55.737500  
max      67.730003  
Basic information of CAB_2011_2016_PEPS.csv:
            Price to Earnings Ratio  Price to Sales Ratio
Date                                                     
2006-08-28                   16.897                0.6736
2006-08-29                   17.060                0.6801
2006-08-30                   17.302                0.6897
2006-08-31                   17.310                0.6901
2006-09-01                   17.233                0.6870
****************************************************************************************************
       Price to Earnings Ratio  Price to Sales Ratio
count              2549.000000           2549.000000
mean                 17.177009              0.766450
std                   5.068869              0.347915
min                   3.504000              0.113900
25%                  13.194000              0.438300
50%                  18.065000              0.760900
75%                  20.857000              1.038400
max                  34.275000              1.525300
Basic information of DKS_2011_2016_PEPS.csv:
            Price to Earnings Ratio  Price to Sales Ratio
Date                                                     
2006-08-28                   23.028                0.7398
2006-08-29                   23.056                0.7407
2006-08-30                   23.309                0.7488
2006-08-31                   23.219                0.7459
2006-09-01                   23.376                0.7510
****************************************************************************************************
       Price to Earnings Ratio  Price to Sales Ratio
count              2298.000000           2298.000000
mean                 20.657124              0.866390
std                   4.519458              0.191226
min                   7.648000              0.263200
25%                  17.948500              0.754950
50%                  20.760500              0.891100
75%                  23.797750              1.000650
max                  30.321000              1.189000
Basic information of dks_ca.csv:
               Effective_Date Record_Date    Pay_Date  Frequency  Gross  \
Before_Ex_Date                                                            
2011-12-02         2011-12-05  2011-12-07  2011-12-28          1   0.50   
2012-02-28         2012-02-29  2012-03-02  2012-03-30          4   0.13   
2012-05-29         2012-05-30  2012-06-01  2012-06-29          4   0.13   
2012-08-28         2012-08-29  2012-08-31  2012-09-28          4   0.13   
2012-11-27         2012-11-28  2012-11-30  2012-12-28          4   0.13   

                                     Description  
Before_Ex_Date                                    
2011-12-02        Dividend amount:0.5 per share.  
2012-02-28      Dividend amount:0.125 per share.  
2012-05-29      Dividend amount:0.125 per share.  
2012-08-28      Dividend amount:0.125 per share.  
2012-11-27      Dividend amount:0.125 per share.  
****************************************************************************************************
       Frequency      Gross
count  21.000000  21.000000
mean    3.666667   0.253810
std     1.064581   0.409102
min     0.000000   0.130000
25%     4.000000   0.130000
50%     4.000000   0.140000
75%     4.000000   0.150000
max     4.000000   2.000000



In [15]:

    
# calculate returns 
cab_returns = get_returns(cab_adj["Adj Close"])
sp500_returns = get_returns(sp500_adj["Adj Close"])
hibb_returns = get_returns(hibb_adj["Adj Close"])
#ust_returns = get_returns(ust_adj["Adj Close"])
dks_returns = get_returns(dks_adj["Adj Close"],ca=dks_adj["Gross"])
# print dsk_returns.loc['2016-09-06'] to check the adjusted return by using corporate action
## Subsetting and arrange the data
cab = cab_returns["2011-10-11":]
dks = dks_returns["2011-10-11":]
hibb = hibb_returns["2011-10-11":]
sp500 = sp500_returns["2011-10-11":]
cab_pe = cab_peps["Price to Earnings Ratio"]["2011-10-11":"2016-10-07"]
cab_ps = cab_peps["Price to Sales Ratio"]["2011-10-11":"2016-10-07"]
dks_pe = dks_peps["Price to Earnings Ratio"]["2011-10-11":"2016-10-07"]
dks_ps = dks_peps["Price to Sales Ratio"]["2011-10-11":"2016-10-07"]
## Subsetting and arrange the data
cab = cab_returns["2011-10-11":]
dks = dks_returns["2011-10-11":]
hibb = hibb_returns["2011-10-11":]
sp500 = sp500_returns["2011-10-11":]
cab_pe = cab_peps["Price to Earnings Ratio"]["2011-10-11":"2016-10-07"]
cab_ps = cab_peps["Price to Sales Ratio"]["2011-10-11":"2016-10-07"]
dks_pe = dks_peps["Price to Earnings Ratio"]["2011-10-11":"2016-10-07"]
dks_ps = dks_peps["Price to Sales Ratio"]["2011-10-11":"2016-10-07"]
# save data
cab.to_csv("output/cab.csv", sep=',')
dks.to_csv("output/dks.csv", sep=',')



In [16]:

    
# Data perpartion
# TwoYear = ["2011-10-11":"2013-10-10"]
# ThirdYear = ["2013-10-11":"2014-10-10"]
# ThreeYear = ["2011-10-11":"2014-10-10"]
# ForthYear = ["2014-10-11":"2015-10-10"]
# FifithYear = ["2015-10-11":]
## Input: 
## 1. two years of stocks
cab_2 = cab_returns["2011-10-11":"2013-10-10"]
dks_2 = dks_returns["2011-10-11":"2013-10-10"]
hibb_2 = hibb_returns["2011-10-11":"2013-10-10"]
sp500_2 = sp500_returns["2011-10-11":"2013-10-10"]
cab_pe_2 = cab_peps["Price to Earnings Ratio"]["2011-10-11":"2013-10-10"]
cab_ps_2 = cab_peps["Price to Sales Ratio"]["2011-10-11":"2013-10-10"]
dks_pe_2 = dks_peps["Price to Earnings Ratio"]["2011-10-11":"2013-10-10"]
dks_ps_2 = dks_peps["Price to Sales Ratio"]["2011-10-11":"2013-10-10"]
## 2. three years of stocks
cab_3 = cab_returns["2011-10-11":"2014-10-10"]
dks_3 = dks_returns["2011-10-11":"2014-10-10"]
hibb_3 = hibb_returns["2011-10-11":"2014-10-10"]
sp500_3 = sp500_returns["2011-10-11":"2014-10-10"]
cab_pe_3 = cab_peps["Price to Earnings Ratio"]["2011-10-11":"2014-10-10"]
cab_ps_3 = cab_peps["Price to Sales Ratio"]["2011-10-11":"2014-10-10"]
dks_pe_3 = dks_peps["Price to Earnings Ratio"]["2011-10-11":"2014-10-10"]
dks_ps_3 = dks_peps["Price to Sales Ratio"]["2011-10-11":"2014-10-10"]
## 3. four years of stocks
cab_4 = cab_returns["2011-10-11":"2015-10-10"]
dks_4 = dks_returns["2011-10-11":"2015-10-10"]
hibb_4 = hibb_returns["2011-10-11":"2015-10-10"]
sp500_4 = sp500_returns["2011-10-11":"2015-10-10"]
cab_pe_4 = cab_peps["Price to Earnings Ratio"]["2011-10-11":"2015-10-10"]
cab_ps_4 = cab_peps["Price to Sales Ratio"]["2011-10-11":"2015-10-10"]
dks_pe_4 = dks_peps["Price to Earnings Ratio"]["2011-10-11":"2015-10-10"]
dks_ps_4 = dks_peps["Price to Sales Ratio"]["2011-10-11":"2015-10-10"]

## 4. the 3rd year of stocka
cab_3rd = cab_returns["2013-10-11":"2014-10-10"]
dks_3rd = dks_returns["2013-10-11":"2014-10-10"]
hibb_3rd = hibb_returns["2013-10-11":"2014-10-10"]
sp500_3rd = sp500_returns["2013-10-11":"2014-10-10"]
cab_pe_3rd = cab_peps["Price to Earnings Ratio"]["2013-10-11":"2014-10-10"]
cab_ps_3rd = cab_peps["Price to Sales Ratio"]["2013-10-11":"2014-10-10"]
dks_pe_3rd = dks_peps["Price to Earnings Ratio"]["2013-10-11":"2014-10-10"]
dks_ps_3rd = dks_peps["Price to Sales Ratio"]["2013-10-11":"2014-10-10"]
## 5. the 4th year of stocka
cab_4th = cab_returns["2014-10-11":"2015-10-10"]
dks_4th = dks_returns["2014-10-11":"2015-10-10"]
hibb_4th = hibb_returns["2014-10-11":"2015-10-10"]
sp500_4th = sp500_returns["2014-10-11":"2015-10-10"]
cab_pe_4th = cab_peps["Price to Earnings Ratio"]["2014-10-11":"2015-10-10"]
cab_ps_4th = cab_peps["Price to Sales Ratio"]["2014-10-11":"2015-10-10"]
dks_pe_4th = dks_peps["Price to Earnings Ratio"]["2014-10-11":"2015-10-10"]
dks_ps_4th = dks_peps["Price to Sales Ratio"]["2014-10-11":"2015-10-10"]
## 5. the 5th year of stocka
cab_5th = cab_returns["2015-10-11":]
dks_5th = dks_returns["2015-10-11":]
hibb_5th = hibb_returns["2015-10-11":]
sp500_5th = sp500_returns["2015-10-11":]
cab_pe_5th = cab_peps["Price to Earnings Ratio"]["2015-10-11":"2016-10-07"]
cab_ps_5th = cab_peps["Price to Sales Ratio"]["2015-10-11":"2016-10-07"]
dks_pe_5th = dks_peps["Price to Earnings Ratio"]["2015-10-11":"2016-10-07"]
dks_ps_5th = dks_peps["Price to Sales Ratio"]["2015-10-11":"2016-10-07"]

frame_2=[cab_2,dks_2,hibb_2,sp500_2]
frame_3=[cab_3,dks_3,hibb_3,sp500_3]
frame_4=[cab_4,dks_4,hibb_4,sp500_4]



In [17]:

    
# tweets data
cab_tweets = read_tweet("Data/SocialMediaData/","CAB_oldtweets.csv","date")
dks_tweets = read_tweet("Data/SocialMediaData/","DKS_oldtweets.csv","date")

cab_tnum = cab_tweets.groupby(['date'])[['username']].count()
cab_tret = cab_tweets.groupby(['date'])[['retweets']].sum()
cab_tfav = cab_tweets.groupby(['date'])[['favorites']].sum()
cab_ttim = cab_tweets.groupby(['time'])[['username']].count()

dks_tnum = dks_tweets.groupby(['date'])[['username']].count()
dks_tret = dks_tweets.groupby(['date'])[['retweets']].sum()
dks_tfav = dks_tweets.groupby(['date'])[['favorites']].sum()
dks_ttim = dks_tweets.groupby(['time'])[['username']].count()
## merge
cab_tnum = merge_tweet(cab,cab_tnum,'username')
cab_tret = merge_tweet(cab,cab_tret,'retweets')
cab_tfav = merge_tweet(cab,cab_tfav,'favorites')


dks_tnum = merge_tweet(dks,dks_tnum,'username')
dks_tret = merge_tweet(dks,dks_tret,'retweets')
dks_tfav = merge_tweet(dks,dks_tfav,'favorites')

cab_tnum_4 = cab_tnum["2011-10-11":"2015-10-10"]
cab_tret_4 = cab_tret["2011-10-11":"2015-10-10"]
cab_tfav_4 = cab_tfav["2011-10-11":"2015-10-10"]
dks_tnum_4 = dks_tnum["2011-10-11":"2015-10-10"]
dks_tret_4= dks_tret["2011-10-11":"2015-10-10"]
dks_tfav_4= dks_tfav["2011-10-11":"2015-10-10"]

# tweets and mood 
cab_mood = read_tweet("Data/SocialMediaData/","cab_sentiment.csv","date")
dks_mood = read_tweet("Data/SocialMediaData/","dks_sentiment.csv","date")

cab_mood = cab_mood.groupby(['date'])[['mood']].mean()
dks_mood = dks_mood.groupby(['date'])[['mood']].mean()

## merge
cab_mood = merge_tweet(cab,cab_mood,'mood')
dks_mood = merge_tweet(dks,dks_mood,'mood')

cab_mood_4 = cab_mood["2011-10-11":"2015-10-10"]
dks_mood_4 = dks_mood["2011-10-11":"2015-10-10"]









    



Basic information of CAB_oldtweets.csv:
  username       date  time  retweets  favorites  \
0  Cabelas 2016-10-06    16        10         35   
1  Cabelas 2016-10-06     9         0          1   
2  Cabelas 2016-10-06     8         0          2   
3  Cabelas 2016-10-06     6         0          0   
4  Cabelas 2016-10-06     6         0          0   

                                                text       mentions hashtags  \
0  Get $20 in Cabela's Bucks when you buy select ...  @thenorthface      NaN   
1                               @_Mark_Freeman Oh no            NaN      NaN   
2                              @Michael_Shaw Michael            NaN      NaN   
3                                @likeMike69 Hi Mike            NaN      NaN   
4  @22Aimless We expect to run the Cabela's CLUB ...     @22Aimless      NaN   

                                           permalink  n_link  
0  https://twitter.com/Cabelas/status/78413489266...       1  
1                                                NaN       0  
2                                                NaN       0  
3                                                NaN       0  
4  https://twitter.com/Cabelas/status/78397493640...       1  
****************************************************************************************************
               time      retweets     favorites        n_link
count  17800.000000  17800.000000  17800.000000  17800.000000
mean      13.647247      6.842191     13.350169      0.750674
std        5.266237    128.786975     80.169663      0.432635
min        0.000000      0.000000      0.000000      0.000000
25%       11.000000      0.000000      0.000000      1.000000
50%       14.000000      0.000000      1.000000      1.000000
75%       17.000000      1.000000      3.000000      1.000000
max       23.000000  12421.000000   5284.000000      1.000000
****************************************************************************************************
Basic information of DKS_oldtweets.csv:
  username       date  time  retweets  favorites  \
0    DICKS 2016-10-06    19         1          6   
1    DICKS 2016-10-06    16         0          0   
2    DICKS 2016-10-06    16         0          0   
3    DICKS 2016-10-06    12         5          6   
4    DICKS 2016-10-06     8         0          0   

                                                text     mentions  \
0  In college they called him the Comeback Kid." ...          NaN   
1  @ShaunyWony We apologize. We_��ve worked with ...  @ShaunyWony   
2  @djess1995 We apologize. We_��ve worked with o...   @djess1995   
3  Show us your old football jersey for a chance ...          NaN   
4  @SpringHSLibrary You just need to use your has...          NaN   

        hashtags                                          permalink  n_link  
0  #JerseyReport  https://twitter.com/DICKS/status/7841792813626...       1  
1            NaN  https://twitter.com/DICKS/status/7841278507397...       1  
2            NaN  https://twitter.com/DICKS/status/7841275900512...       1  
3            NaN                                                NaN       0  
4            NaN                                                NaN       0  
****************************************************************************************************
               time      retweets     favorites        n_link
count  26480.000000  26480.000000  26480.000000  26480.000000
mean      14.874887      9.899245     10.331571      0.651473
std        4.674028    120.982739     81.000073      0.476513
min        0.000000      0.000000      0.000000      0.000000
25%       12.000000      0.000000      0.000000      0.000000
50%       15.000000      0.000000      0.000000      1.000000
75%       18.000000      2.000000      1.000000      1.000000
max       23.000000  10501.000000   4884.000000      1.000000
****************************************************************************************************
Basic information of cab_sentiment.csv:
        date    mood
0 2016-10-06  0.2170
1 2016-10-06 -0.1420
2 2016-10-06 -0.0435
3 2016-10-06 -0.0570
4 2016-10-06  0.1510
****************************************************************************************************
               mood
count  17800.000000
mean       0.049501
std        0.079509
min       -0.214000
25%       -0.002632
50%        0.050050
75%        0.104000
max        0.352000
****************************************************************************************************
Basic information of dks_sentiment.csv:
        date   mood
0 2016-10-06  0.475
1 2016-10-06  0.916
2 2016-10-06  0.774
3 2016-10-06  0.925
4 2016-10-06  0.786
****************************************************************************************************
               mood
count  26480.000000
mean       0.874939
std        0.099903
min        0.137000
25%        0.812000
50%        0.883000
75%        0.963000
max        1.000000
****************************************************************************************************

Basic visualizations

1. Stock Market



In [18]:

    
# Make plots to see returns
plot_return(cab_returns,"Cabela's Return",dks_returns,"Dick's Return",18.5,10.5,"blue","green")



In [19]:

    
plot_return(cab_returns,"Cabela's Return",sp500_returns,"SP500's Return",18.5,10.5,"blue","red")



In [20]:

    
plot_return(dks_returns,"Dick's Return",sp500_returns,"SP500's Return",18.5,10.5,"green","red")

Returns vs. Tweets

The black line is the returns, and all the tweets related data has been scaled into [0,1] to see the trends



In [21]:

    
plot_tweets(cab_tnum,dks_tnum,cab,dks,"Num. of tweets")



In [22]:

    
plot_tweets(cab_tret,dks_tret,cab,dks,"Num. of retweets")



In [23]:

    
plot_tweets(cab_tfav,dks_tfav,cab,dks,"Num. of favorites")

Feature Generation

1.Leadlag correlation data (one vs. others)

cab_other, cab_other1(with tweets)
dks_other, dks_other1(with tweets) #### 2.Leadlag correlation data (one vs. others) with features
cab_feature
dks_feature #### 3.Window method data (one vs. itself)
window=1:trainX,trainY,testX,testY
window=10:trainX10,trainY10,testX10,testY10
window=10 + ICA=3:trainX10_ica3,trainY10,testX10_ica3,testY10

1.Leadlag correlation data



In [24]:

    
# Leadlag correlation data
print "*"*100
print "For CAB:"

## Calculate the leadlag correlation with 4 years data
print "Calculate the leadlag correlations"
leadlag_corr(cab_4,[dks_4,hibb_4,sp500_4,cab_pe_4,cab_ps_4],0,30,0.06)
print "*"*100

## Put data together
cab_index = cab.index.values[24:]
#cab = cab.reset_index(drop=True)
frame=[cab,dks.shift(24),# dks.shift(8), dks.shift(10),dks.shift(24),
       hibb.shift(22), # hibb.shift(24),
       cab_pe.shift(3),cab_pe.shift(1),cab_pe.shift(2),cab_pe.shift(4),cab_pe.shift(5),cab_pe.shift(6),
       cab_ps.shift(1),cab_ps.shift(2),cab_ps.shift(3),cab_ps.shift(4),cab_ps.shift(5)]
cab_others = reduce(lambda left,right: pd.concat([left.reset_index(drop=True), right.reset_index(drop=True)],axis=1),frame)
cab_others = cab_others[24:]
cab_others = cab_others.set_index(cab_index)
cab_others.columns = ['returns', 'dks_24',"hibb_22","pe_1","pe_2","pe_3","pe_4","pe_5","pe_6",
                      "ps_1","ps_2","ps_3","ps_4","ps_5"]
print cab_others.shape

cab_others.to_csv("output/cabothers.csv", sep=';', encoding='utf-8')









    



****************************************************************************************************
For CAB:
Calculate the leadlag correlations






    












    



For dataset1 use lag = 
       corr1
0   0.426338
8   0.086763
24  0.076371






    












    



For dataset2 use lag = 
       corr2
0   0.366627
14 -0.060110
26  0.060374






    












    



For dataset3 use lag = 
      corr3
0  0.438557
4 -0.086215






    












    



For dataset4 use lag = 
       corr4
1  -0.085198
2  -0.085974
3  -0.090637
4  -0.088776
5  -0.085847
6  -0.074881
7  -0.072058
8  -0.065780
9  -0.072013
10 -0.063145
11 -0.064902
12 -0.062286
13 -0.061377
14 -0.062793
15 -0.062214
16 -0.063844
17 -0.064252
18 -0.063446
19 -0.066722
20 -0.068209
22 -0.063097
23 -0.060923
27 -0.060702
28 -0.068965
29 -0.068491






    












    



For dataset5 use lag = 
       corr5
1  -0.081401
2  -0.081928
3  -0.085286
4  -0.082607
5  -0.081901
6  -0.073076
7  -0.070226
8  -0.065460
9  -0.068803
10 -0.060368
11 -0.062864
13 -0.060454
14 -0.062410
15 -0.061199
16 -0.062677
17 -0.060357
18 -0.060875
19 -0.062087
20 -0.061913
22 -0.060694
28 -0.064951
29 -0.064543
****************************************************************************************************
(1233, 14)



In [25]:

    
print "*"*100
print "For DKS:"
## Calculate the leadlag correlation with 4 years data
print "Calculate the leadlag correlations"
leadlag_corr(dks_4,[cab_4,hibb_4,sp500_4,dks_pe_4,dks_ps_4],0,30,0.06)
print "*"*100

## Put data together
dks_index = dks.index.values[29:]
frame=[dks,cab.shift(2), cab.shift(29),\
       hibb.shift(2), hibb.shift(29),\
       sp500.shift(2), sp500.shift(29),dks_ps.shift(3)]

dks_others = reduce(lambda left,right: pd.concat([left, right],axis=1),frame)
dks_others = dks_others[29:]
#dks_others = dks_others.set_index(dks_index)

dks_others.columns = ['returns', 'cab_2','cab_29','hibb_2',"hibb_29",\
                      "sp500_2","sp500_29",'dks_ps_3']
dks_others.to_csv("output/dksothers.csv", sep=';', encoding='utf-8')









    



****************************************************************************************************
For DKS:
Calculate the leadlag correlations






    












    



For dataset1 use lag = 
       corr1
0   0.426338
24  0.071839






    












    



For dataset2 use lag = 
       corr2
0   0.515865
9   0.060867
19 -0.069590






    












    



For dataset3 use lag = 
      corr3
0  0.501081






    












    



For dataset4 use lag = 
      corr4
0  0.094383






    












    



For dataset5 use lag = 
      corr5
0  0.075153
1 -0.081737
2 -0.068285
5 -0.062168
****************************************************************************************************



In [26]:

    
# ## adding twitter data
# leadlag_corr(cab_4,[dks_4,hibb_4,sp500_4,cab_pe_4,cab_ps_4,cab_tnum_4,cab_tret_4,cab_tfav_4,cab_mood_4],0,30,0.07)
cab_index = dks.index.values[27:]
frame=[cab,dks.shift(24),# dks.shift(8), dks.shift(10),dks.shift(24),
       hibb.shift(22), # hibb.shift(24),
       cab_pe.shift(3),cab_pe.shift(1),cab_pe.shift(2),cab_pe.shift(4),cab_pe.shift(5),cab_pe.shift(6),
       cab_ps.shift(1),cab_ps.shift(2),cab_ps.shift(3),cab_ps.shift(4),cab_ps.shift(5),
       cab_tnum.shift(19),
       cab_tret.shift(2),cab_tret.shift(15),cab_tret.shift(23),
       cab_tfav.shift(2),cab_tfav.shift(13),cab_tfav.shift(15),cab_tfav.shift(23),
       cab_mood.shift(18),cab_mood.shift(19),cab_mood.shift(26),cab_mood.shift(27)]
cab_others1 = reduce(lambda left,right: pd.concat([left.reset_index(drop=True), right.reset_index(drop=True)],axis=1),frame)
cab_others1 = cab_others1[27:]
cab_others1 = cab_others1.set_index(cab_index)
cab_others1.columns = ['returns', 'dks_24',"hibb_22","pe_3","pe_1","pe_2","pe_4","pe_5","pe_6",\
                      "ps_1","ps_2","ps_3","ps_4","ps_5","num_19","ret_2","ret_15","ret_23","fav_2","fav_13","fav_15","fav_23",
                      "mood_18","mood_19","mood_26","mood_27"]
cab_others1.shape
cab_others1.to_csv("output/cabotherstweets.csv", sep=',', encoding='utf-8')



In [27]:

    
### Adding tweet data
# print "*"*100
# print "For DKS:"
# ## Calculate the leadlag correlation with 4 years data
# print "Calculate the leadlag correlations"
# leadlag_corr(dks_4,[cab_4,hibb_4,sp500_4,dks_pe_4,dks_ps_4,dks_tnum_4,dks_tret_4,dks_tfav_4],0,30,0.07)
# print "*"*100

# ## Put data together
dks_index = dks.index.values[29:]
frame=[dks,cab.shift(2), cab.shift(29),\
       hibb.shift(2), hibb.shift(29),\
       sp500.shift(2), sp500.shift(29),dks_ps.shift(3),\
       dks_tnum.shift(1),dks_tret.shift(1),dks_tfav.shift(1),dks_mood.shift(25)]

dks_others1 = reduce(lambda left,right: pd.concat([left, right],axis=1),frame)
dks_others1= dks_others1[29:]
#dks_others = dks_others.set_index(dks_index)

dks_others1.columns = ['returns', 'cab_2','cab_29','hibb_2',"hibb_29",\
                      "sp500_2","sp500_29","dks_ps_3","num_1","ret_1","fav_1","mood_25"]
dks_others1.to_csv("output/dksotherstweet.csv", sep=',', encoding='utf-8')

2. Leadlag correlation data with features



In [28]:

    
### For CAB ###
dks_feature = leadlag_feature(dks,look_back=24)
pe_feature = leadlag_feature(cab_pe,look_back=6)[18:]
ps_feature = leadlag_feature(cab_ps,look_back=6)[18:]
tnum_feature = leadlag_feature(cab_tnum,look_back=19)[5:]
tret_feature = leadlag_feature(cab_tret,look_back=23)[1:]
tfav_feature = leadlag_feature(cab_tfav,look_back=23)[1:]
mood_feature = leadlag_feature(cab_mood,look_back=23)[1:]
frame=[cab[25:],dks_feature,pe_feature,ps_feature,tnum_feature,tret_feature,tfav_feature,mood_feature]
cab_feature = reduce(lambda left,right: pd.concat([left.reset_index(drop=True), 
                                                      right.reset_index(drop=True)],axis=1),frame)
cab_feature.columns = ['returns', 'dks_max',"dks_min","dks_mean","dks_med",
                       'pe_max',"pe_min","pe_mean","pe_med",
                      'ps_max',"ps_min","ps_mean","ps_med",
                      'tn_max',"tn_min","tn_mean","tn_med",
                      'tr_max',"tr_min","tr_mean","tr_med",
                      'tf_max',"tf_min","tf_mean","tf_med",
                      'md_max',"md_min","md_mean","md_med"]
cab_feature.to_csv("output/cab_feature.csv", sep=',', encoding='utf-8')



In [29]:

    
### For DKS ###
cab_feature = leadlag_feature(cab,look_back=24)
hibb_feature = leadlag_feature(hibb,look_back=19)[5:]
ps_feature = leadlag_feature(dks_ps,look_back=5)[21:]
tnum_feature = leadlag_feature(cab_tnum,look_back=16)[8:]
tfav_feature = leadlag_feature(cab_tfav,look_back=22)[2:]
frame=[dks[25:],cab_feature,hibb_feature,ps_feature,tnum_feature,tfav_feature]
dks_feature = reduce(lambda left,right: pd.concat([left.reset_index(drop=True), 
                                                      right.reset_index(drop=True)],axis=1),frame)
dks_feature.columns = ['returns', 'dks_max',"dks_min","dks_mean","dks_med",
                       'hibb_max',"hibb_min","hibb_mean","hibb_med",
                      'ps_max',"ps_min","ps_mean","ps_med",
                      'tn_max',"tn_min","tn_mean","tn_med",
                      'tf_max',"tf_min","tf_mean","tf_med"]
dks_feature.to_csv("output/dks_feature.csv", sep=',', encoding='utf-8')

3. Window method data



In [30]:

    
# with only t and (t-1) values
trainX,trainY,testX,testY = get_wmdata(seed=7,data=cab,split=0.7,look_back=1)
# with only t ...(t-10) values
trainX10,trainY10,testX10,testY10 = get_wmdata(seed=7,data=cab,split=0.7,look_back=10)









    



(879, 378)
(879, 378)






    



/Users/yujingma/anaconda/lib/python2.7/site-packages/sklearn/preprocessing/data.py:324: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
/Users/yujingma/anaconda/lib/python2.7/site-packages/sklearn/preprocessing/data.py:359: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
/Users/yujingma/anaconda/lib/python2.7/site-packages/sklearn/preprocessing/data.py:324: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
/Users/yujingma/anaconda/lib/python2.7/site-packages/sklearn/preprocessing/data.py:359: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)



In [31]:

    
### 3.ICA for window method data
# n_component=3 for window size 10
trainX10_ica3,trainY10,testX10_ica3,testY10 = wmd_ica(10, cab, split=0.7, look_back=10, n_component=3)









    



(879, 378)






    



/Users/yujingma/anaconda/lib/python2.7/site-packages/sklearn/preprocessing/data.py:324: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
/Users/yujingma/anaconda/lib/python2.7/site-packages/sklearn/preprocessing/data.py:359: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
/Users/yujingma/anaconda/lib/python2.7/site-packages/scipy/linalg/basic.py:884: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
  warnings.warn(mesg, RuntimeWarning)

Model Fittings

Feature Extraction



In [52]:

    
#get_ICA(cab_feature,n_comps=3,plot=True)
X = cab_others
n_comps=3
#X = (X-X.mean())#/X.std(ddof=0)
# Compute ICA
ica = FastICA(n_components=3)
S_ = ica.fit_transform(X)  # Reconstruct signals
A_ = ica.mixing_  # Get estimated mixing matrix
ICs = pd.DataFrame({"c1":S_[:,0],'c2':S_[:,1],'c3':S_[:,2]})
    # Make plots to see the signal

fig=plt.figure()
fig.set_size_inches(18,16)
models = [S_]
names = ['ICA recovered signals']
colors = ['red', 'steelblue', 'orange']

for ii, (model, name) in enumerate(zip(models, names), 1):
    plt.subplot(4, 1, ii)
    plt.title(name)
    for sig, color in zip(model.T, colors):
        plt.plot(sig, color=color)

plt.subplots_adjust(0.09, 0.04, 0.94, 0.94, 0.26, 0.46)
plt.show()

1. Linear regression (Leadlag correlation data )



In [260]:

    
##### For CAB #####
## Separate and Standaridize

# split into train and test sets

train,test = split_trn_tst(cab_others,porp=0.7)

X_trn = train[['dks_24',"hibb_22","pe_1","pe_2","pe_3","pe_4","pe_5","pe_6"]]
                      #"ps_1","ps_2","ps_3","ps_4","ps_5"]] # input varialbe
Y_trn = train['returns'] # output varialbe
X_tst = test[['dks_24',"hibb_22","pe_1","pe_2","pe_3","pe_4","pe_5","pe_6"]]
                      #"ps_1","ps_2","ps_3","ps_4","ps_5"]] # input varialbe
Y_tst = test['returns']


# standardize the data
X_trn = (X_trn-X_trn.mean())#/X_trn.std(ddof=0)
X_tst = (X_tst-X_trn.mean())#/X_trn.std(ddof=0)
Y_trn = (Y_trn-Y_trn.mean())#/Y_trn.std(ddof=0)
Y_tst = (Y_tst-Y_trn.mean())#/Y_trn.std(ddof=0)

## Fitting a linear regression model
print "Fitting a linear regression model with all the shifts"
ols = lm.LinearRegression()
ols.fit(X_trn, Y_trn)
print "Intercept: %f" %ols.intercept_
print "Coefficients: %s" %str(ols.coef_)


#model = sm.OLS(Y_trn, X_trn)
#results = model.fit()
#print results.summary()
# save the prediction
lm_cabs = ols.predict(X_tst)-ols.predict(X_tst).mean() # adjusted the shift of the prediction
lm_cabn = ols.predict(X_trn)

# training and test errors
print "R^2 for training set:",
print ols.score(X_trn, Y_trn)
print "RMSE: %.5f" % math.sqrt(mean_squared_error(Y_trn, lm_cabn))

print "R^2 for test set:",
print ols.score(X_tst, Y_tst)
print "RMSE: %.5f" % math.sqrt(mean_squared_error(Y_tst, lm_cabs))

print "*"*100
# Make plots
plot_lm(cab_others["returns"],lm_cabn,lm_cabs)









    



(863, 370)
Fitting a linear regression model with all the shifts
Intercept: -0.000000
Coefficients: [ 0.09355299 -0.05850729 -0.00086127  0.00133558 -0.00101822 -0.00120562
 -0.00088618  0.00201807]
R^2 for training set: 0.0219864890835
RMSE: 0.02140
R^2 for test set: -0.209518825255
RMSE: 0.02478
****************************************************************************************************



In [261]:

    
##### For DKS #####
## Separate and Standaridize

# split into train and test sets
train,test = split_trn_tst(dks_others,porp=0.7)

X_trn1 = train[ ['cab_2','cab_29','hibb_2',"hibb_29",\
                      "sp500_2","sp500_29",'dks_ps_3']] # input varialbe
Y_trn1 = train['returns'] # output varialbe
X_tst1 = test[ ['cab_2','cab_29','hibb_2',"hibb_29",\
                      "sp500_2","sp500_29",'dks_ps_3']] # input varialbe
Y_tst1 = test['returns']

# standardize the data
X_trn1 = (X_trn1-X_trn1.mean())#/X_trn.std(ddof=0)
X_tst1 = (X_tst1-X_trn1.mean())#/X_trn.std(ddof=0)
Y_trn1 = (Y_trn1-Y_trn1.mean())#/Y_trn.std(ddof=0)
Y_tst1 = (Y_tst1-Y_trn1.mean())#/Y_trn.std(ddof=0)

## Fitting a linear regression model
print "Fitting a linear regression model with all the shifts"
ols = lm.LinearRegression()
ols.fit(X_trn1, Y_trn1)
print "Intercept: %f" %ols.intercept_
print "Coefficients: %s" %str(ols.coef_)


#model2 = sm.OLS(Y_trn, X_trn)
#results = model2.fit()
#print results.summary()

# save the prediction
lm_dkst = ols.predict(X_tst1)-ols.predict(X_tst1).mean() # adjusted the shift of the prediction
lm_dksn = ols.predict(X_trn1)
# training and test errors
print "R^2 for training set:",
print ols.score(X_trn1, Y_trn1)
print "RMSE: %.5f" % math.sqrt(mean_squared_error(Y_trn1, lm_dksn))
print '-'*100

print "R^2 for test set:",
print ols.score(X_tst1, Y_tst1)
print "RMSE: %.5f" % math.sqrt(mean_squared_error(Y_tst1, lm_dkst))

print "*"*100
                               
# Make plots
plot_lm(dks_others["returns"],lm_dksn,lm_dkst)









    



(859, 369)
Fitting a linear regression model with all the shifts
Intercept: 0.000000
Coefficients: [ 0.00371677  0.04965778 -0.10294097  0.02550683  0.08703417  0.02043865
 -0.01200454]
R^2 for training set: 0.0180601412105
RMSE: 0.01712
----------------------------------------------------------------------------------------------------
R^2 for test set: -0.262948309705
RMSE: 0.01975
****************************************************************************************************

Note: cab_others and cab_others1 are both tried, cab_others shows better performance

2. Ridge regression with Cross Validation



In [206]:

    
##### For CAB #####
C_list = np.arange(0,30000, 10000)
index = C_list
rmse = pd.DataFrame(columns=['train_rmse','test_rmse'],\
                    index = index)
rmse = rmse.fillna(0)
for c in C_list:
    print " C = "+ str(c)
    r_trn,r_tst = cv_ridge(cab_others,k_fold=5,alpha=c)
    rmse.loc[c] = [r_trn,r_tst]
c_= rmse["test_rmse"].idxmin()
print "*"* 80
print c_
plt.plot(rmse["test_rmse"])









    



 C = 0
Train Score: 0.02298 RMSE
Test Score: 0.02184 RMSE
 C = 10000
Train Score: 0.02333 RMSE
Test Score: 0.02160 RMSE
 C = 20000
Train Score: 0.02333 RMSE
Test Score: 0.02160 RMSE
********************************************************************************
10000






    Out[206]:





[<matplotlib.lines.Line2D at 0x11d4f1250>]



In [207]:

    
print "Fitting a linear regression model with all the shifts"
rg = lm.Ridge(alpha = c_)
rg.fit(X_trn, Y_trn)
print "Intercept: %f" %rg.intercept_
print "Coefficients: %s" %str(rg.coef_)
# save the prediction
rg_cabt = rg.predict(X_tst)-rg.predict(X_tst).mean() # adjusted the shift of the prediction
rg_cabn = rg.predict(X_trn)
# training and test errors
print "R^2 for training set:",
print rg.score(X_trn, Y_trn)
print "RMSE: %.5f" % math.sqrt(mean_squared_error(Y_trn, rg_cabn))

print "R^2 for test set:",
print rg.score(X_tst, Y_tst)
print "RMSE: %.5f" % math.sqrt(mean_squared_error(Y_tst, rg_cabt))
print "with alpah = "+ str(c_)
print "*"*100
# Make plots
#plot_lm(cab_others["returns"],rg_cabn,rg_cabt)









    



Fitting a linear regression model with all the shifts
Intercept: -0.000000
Coefficients: [  2.68964194e-06  -1.69906950e-06  -1.17835682e-04  -7.66946245e-05
  -1.06423965e-04  -1.12568130e-04  -8.49485821e-05  -4.18315138e-05]
R^2 for training set: 0.00881960765486
RMSE: 0.02155
R^2 for test set: -0.166816655887
RMSE: 0.02483
with alpah = 10000
****************************************************************************************************



In [208]:

    
df =pd.DataFrame(np.concatenate([rg_cabn,rg_cabt]))
df.columns =["rg_cab"]
df.to_csv("output/rg_cab.csv", sep=',')



In [209]:

    
##### For DKS #####
C_list = np.linspace(0, 101, num =50)
index = C_list
rmse = pd.DataFrame(columns=['train_rmse','test_rmse'],\
                    index = index)
rmse = rmse.fillna(0)
for c in C_list:
    print " C = "+ str(c)
    r_trn,r_tst = cv_ridge(dks_others,k_fold=5,alpha=c)
    rmse.loc[c] = [r_trn,r_tst]
c_= rmse["test_rmse"].idxmin()
print "*"* 80
print c_
plt.plot(rmse["test_rmse"])









    



 C = 0.0
Train Score: 0.01699 RMSE
Test Score: 0.01806 RMSE
 C = 2.0612244898
Train Score: 0.01721 RMSE
Test Score: 0.01778 RMSE
 C = 4.12244897959
Train Score: 0.01723 RMSE
Test Score: 0.01776 RMSE
 C = 6.18367346939
Train Score: 0.01724 RMSE
Test Score: 0.01775 RMSE
 C = 8.24489795918
Train Score: 0.01725 RMSE
Test Score: 0.01775 RMSE
 C = 10.306122449
Train Score: 0.01725 RMSE
Test Score: 0.01775 RMSE
 C = 12.3673469388
Train Score: 0.01726 RMSE
Test Score: 0.01775 RMSE
 C = 14.4285714286
Train Score: 0.01726 RMSE
Test Score: 0.01774 RMSE
 C = 16.4897959184
Train Score: 0.01726 RMSE
Test Score: 0.01774 RMSE
 C = 18.5510204082
Train Score: 0.01726 RMSE
Test Score: 0.01774 RMSE
 C = 20.612244898
Train Score: 0.01727 RMSE
Test Score: 0.01774 RMSE
 C = 22.6734693878
Train Score: 0.01727 RMSE
Test Score: 0.01774 RMSE
 C = 24.7346938776
Train Score: 0.01727 RMSE
Test Score: 0.01774 RMSE
 C = 26.7959183673
Train Score: 0.01727 RMSE
Test Score: 0.01774 RMSE
 C = 28.8571428571
Train Score: 0.01727 RMSE
Test Score: 0.01775 RMSE
 C = 30.9183673469
Train Score: 0.01727 RMSE
Test Score: 0.01775 RMSE
 C = 32.9795918367
Train Score: 0.01727 RMSE
Test Score: 0.01775 RMSE
 C = 35.0408163265
Train Score: 0.01727 RMSE
Test Score: 0.01775 RMSE
 C = 37.1020408163
Train Score: 0.01727 RMSE
Test Score: 0.01775 RMSE
 C = 39.1632653061
Train Score: 0.01727 RMSE
Test Score: 0.01775 RMSE
 C = 41.2244897959
Train Score: 0.01727 RMSE
Test Score: 0.01775 RMSE
 C = 43.2857142857
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 45.3469387755
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 47.4081632653
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 49.4693877551
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 51.5306122449
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 53.5918367347
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 55.6530612245
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 57.7142857143
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 59.7755102041
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 61.8367346939
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 63.8979591837
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 65.9591836735
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 68.0204081633
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 70.0816326531
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 72.1428571429
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 74.2040816327
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 76.2653061224
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 78.3265306122
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 80.387755102
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 82.4489795918
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 84.5102040816
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 86.5714285714
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 88.6326530612
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 90.693877551
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 92.7551020408
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 94.8163265306
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 96.8775510204
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 98.9387755102
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
 C = 101.0
Train Score: 0.01728 RMSE
Test Score: 0.01775 RMSE
********************************************************************************
18.5510204082






    Out[209]:





[<matplotlib.lines.Line2D at 0x11da9b0d0>]



In [210]:

    
print "Fitting a linear regression model with all the shifts"
rg = lm.Ridge(alpha = c_)
rg.fit(X_trn1, Y_trn1)
print "Intercept: %f" %rg.intercept_
print "Coefficients: %s" %str(rg.coef_)
# save the prediction
rg_dkst = rg.predict(X_tst1)-rg.predict(X_tst1).mean() # adjusted the shift of the prediction
rg_dksn = rg.predict(X_trn1)
# training and test errors
print "R^2 for training set:",
print rg.score(X_trn1, Y_trn1)
print "RMSE: %.5f" % math.sqrt(mean_squared_error(Y_trn1, rg_dksn))

print "R^2 for test set:",
print rg.score(X_tst1, Y_tst1)
print "RMSE: %.5f" % math.sqrt(mean_squared_error(Y_tst1, rg_dkst))

print "*"*100
# Make plots
#plot_lm(cab_others["returns"],rg_cabn,rg_cabt)









    



Fitting a linear regression model with all the shifts
Intercept: 0.000000
Coefficients: [ -1.59648089e-04   1.20788472e-03  -9.34056188e-04   6.56405135e-04
   1.43229221e-06   3.17741351e-04  -3.82674606e-03]
R^2 for training set: 0.00305882411741
RMSE: 0.01725
R^2 for test set: -0.0243558706403
RMSE: 0.01944
****************************************************************************************************



In [510]:

    
df =pd.DataFrame(np.concatenate([rg_dksn,rg_dkst]))
df.columns =["rg_dks"]
df.to_csv("output/rg_dks.csv", sep=',')



In [273]:

    
cab_feature









    Out[273]:






  
    
      
      returns
      dks_max
      dks_min
      dks_mean
      dks_med
      pe_max
      pe_min
      pe_mean
      pe_med
      ps_max
      ...
      tr_mean
      tr_med
      tf_max
      tf_min
      tf_mean
      tf_med
      md_max
      md_min
      md_mean
      md_med
    
  
  
    
      0
      -0.016949
      0.038632
      -0.033736
      0.004966
      5.686946e-03
      13.121
      12.404
      12.842667
      12.8715
      0.6542
      ...
      3.347826
      0.0
      4.0
      0.0
      0.521739
      0.0
      0.173000
      -0.061800
      0.039278
      0.000000
    
    
      1
      -0.033699
      0.038632
      -0.033736
      0.005146
      5.686946e-03
      13.121
      12.404
      12.902333
      13.0480
      0.6542
      ...
      3.521739
      0.0
      4.0
      0.0
      0.565217
      0.0
      0.173000
      -0.061800
      0.044341
      0.000000
    
    
      2
      -0.021492
      0.046235
      -0.033736
      0.006433
      5.686946e-03
      13.121
      12.404
      12.885500
      12.9975
      0.6542
      ...
      3.782609
      0.0
      4.0
      0.0
      0.565217
      0.0
      0.172000
      -0.061800
      0.038398
      0.000000
    
    
      3
      -0.004973
      0.046235
      -0.033736
      0.004815
      2.146015e-03
      13.111
      12.404
      12.774500
      12.7855
      0.6537
      ...
      3.304348
      0.0
      4.0
      0.0
      0.478261
      0.0
      0.172000
      -0.061800
      0.040482
      0.000000
    
    
      4
      -0.021241
      0.046235
      -0.033736
      0.003029
      8.927312e-04
      13.111
      12.187
      12.738333
      12.7855
      0.6537
      ...
      3.347826
      0.0
      4.0
      0.0
      0.478261
      0.0
      0.172000
      -0.061800
      0.047135
      0.014500
    
    
      5
      0.005106
      0.046235
      -0.033736
      0.004416
      1.009494e-03
      13.111
      12.126
      12.645667
      12.6720
      0.6537
      ...
      3.347826
      0.0
      4.0
      0.0
      0.478261
      0.0
      0.172000
      -0.061800
      0.047135
      0.014500
    
    
      6
      -0.059695
      0.046235
      -0.033736
      0.003052
      3.858948e-04
      13.111
      11.869
      12.439500
      12.3210
      0.6537
      ...
      3.521739
      0.0
      4.0
      0.0
      0.565217
      0.0
      0.172000
      -0.061800
      0.047160
      0.014500
    
    
      7
      -0.022062
      0.046235
      -0.033736
      0.002758
      2.513289e-04
      12.889
      11.869
      12.242500
      12.1565
      0.6427
      ...
      3.173913
      0.0
      4.0
      0.0
      0.608696
      0.0
      0.172000
      0.000000
      0.050181
      0.014500
    
    
      8
      0.048343
      0.046235
      -0.043016
      0.000596
      -2.512027e-04
      12.455
      11.217
      11.963833
      12.0275
      0.6210
      ...
      3.217391
      1.0
      4.0
      0.0
      0.652174
      0.0
      0.172000
      0.000000
      0.051911
      0.021695
    
    
      9
      -0.010979
      0.046235
      -0.043016
      -0.001211
      -5.028478e-04
      12.187
      10.970
      11.716333
      11.8990
      0.6077
      ...
      3.521739
      1.0
      4.0
      0.0
      0.782609
      0.0
      0.172000
      0.000000
      0.048126
      0.021695
    
    
      10
      0.046625
      0.046235
      -0.043016
      -0.000341
      -5.028478e-04
      12.126
      10.970
      11.601833
      11.6845
      0.6046
      ...
      3.608696
      1.0
      4.0
      0.0
      0.782609
      0.0
      0.172000
      0.000000
      0.051994
      0.035950
    
    
      11
      0.023335
      0.046235
      -0.043016
      0.000467
      -5.028478e-04
      11.929
      10.970
      11.476500
      11.4370
      0.5948
      ...
      3.652174
      1.0
      4.0
      0.0
      0.782609
      0.0
      0.172000
      0.000000
      0.057298
      0.036300
    
    
      12
      0.007463
      0.046235
      -0.043016
      0.001310
      -5.028478e-04
      11.929
      10.970
      11.482333
      11.4370
      0.5948
      ...
      2.913043
      1.0
      4.0
      0.0
      0.652174
      0.0
      0.172000
      0.000000
      0.054800
      0.035950
    
    
      13
      0.018930
      0.046235
      -0.043016
      -0.000607
      -1.158374e-03
      12.182
      10.970
      11.524500
      11.4370
      0.6074
      ...
      2.782609
      1.0
      3.0
      0.0
      0.478261
      0.0
      0.172000
      0.000000
      0.048105
      0.021695
    
    
      14
      0.004039
      0.046235
      -0.043016
      0.000921
      -5.028478e-04
      12.273
      10.970
      11.700500
      11.7020
      0.6119
      ...
      2.347826
      1.0
      3.0
      0.0
      0.478261
      0.0
      0.172000
      -0.012350
      0.041350
      0.007680
    
    
      15
      -0.008447
      0.046235
      -0.043016
      0.002230
      1.262690e-07
      12.505
      11.374
      11.956333
      12.0430
      0.6235
      ...
      2.347826
      1.0
      3.0
      0.0
      0.478261
      0.0
      0.172000
      -0.012350
      0.041350
      0.007680
    
    
      16
      -0.027586
      0.046235
      -0.043016
      0.002418
      1.262690e-07
      12.556
      11.374
      12.132333
      12.2275
      0.6260
      ...
      2.521739
      1.0
      3.0
      0.0
      0.521739
      0.0
      0.172000
      -0.012350
      0.042537
      0.021695
    
    
      17
      0.052149
      0.046235
      -0.043016
      0.001445
      -5.028478e-04
      12.556
      11.904
      12.311500
      12.3610
      0.6260
      ...
      2.608696
      1.0
      3.0
      0.0
      0.521739
      0.0
      0.172000
      -0.012350
      0.045876
      0.027300
    
    
      18
      -0.018636
      0.046235
      -0.043016
      0.000035
      -1.158374e-03
      12.556
      12.106
      12.345167
      12.3610
      0.6260
      ...
      3.304348
      1.0
      4.0
      0.0
      0.695652
      0.0
      0.172000
      -0.012350
      0.050789
      0.035950
    
    
      19
      -0.012121
      0.046235
      -0.043016
      0.000493
      -1.158374e-03
      12.737
      12.106
      12.437667
      12.4770
      0.6351
      ...
      3.478261
      2.0
      4.0
      0.0
      0.695652
      0.0
      0.172000
      -0.037700
      0.049150
      0.035950
    
    
      20
      -0.021677
      0.046235
      -0.043016
      -0.000384
      -4.299549e-03
      12.737
      12.106
      12.475500
      12.5025
      0.6351
      ...
      3.652174
      2.0
      4.0
      0.0
      0.695652
      0.0
      0.153000
      -0.037700
      0.044683
      0.035950
    
    
      21
      -0.001672
      0.046235
      -0.053225
      -0.002580
      -7.049828e-03
      12.737
      12.106
      12.449333
      12.4745
      0.6351
      ...
      3.739130
      2.0
      4.0
      0.0
      0.739130
      0.0
      0.153000
      -0.037700
      0.047900
      0.035950
    
    
      22
      0.028895
      0.046235
      -0.053225
      -0.002179
      -7.049828e-03
      12.737
      12.081
      12.370167
      12.3985
      0.6351
      ...
      3.521739
      2.0
      4.0
      0.0
      0.739130
      0.0
      0.153000
      -0.037700
      0.048322
      0.035950
    
    
      23
      -0.010582
      0.046235
      -0.053225
      -0.002501
      -7.049828e-03
      12.737
      12.061
      12.305500
      12.2270
      0.6351
      ...
      3.521739
      2.0
      4.0
      0.0
      0.739130
      0.0
      0.153000
      -0.037700
      0.051731
      0.036300
    
    
      24
      0.053476
      0.046235
      -0.053225
      -0.002710
      -7.049828e-03
      12.737
      12.061
      12.356000
      12.3785
      0.6351
      ...
      3.652174
      2.0
      4.0
      0.0
      0.739130
      0.0
      0.153000
      -0.037700
      0.050764
      0.036300
    
    
      25
      0.004686
      0.046235
      -0.053225
      -0.003190
      -7.049828e-03
      12.500
      12.061
      12.279500
      12.3130
      0.6233
      ...
      3.391304
      2.0
      4.0
      0.0
      0.695652
      0.0
      0.153000
      -0.037700
      0.048303
      0.035950
    
    
      26
      -0.001166
      0.034853
      -0.053225
      -0.003742
      -7.049828e-03
      12.934
      12.061
      12.351833
      12.3130
      0.6449
      ...
      3.478261
      2.0
      4.0
      0.0
      0.739130
      0.0
      0.153000
      -0.037700
      0.045790
      0.035950
    
    
      27
      0.001945
      0.034853
      -0.053225
      -0.002477
      -4.299549e-03
      12.995
      12.061
      12.459667
      12.3435
      0.6479
      ...
      3.608696
      2.0
      4.0
      0.0
      0.782609
      0.0
      0.134000
      -0.075000
      0.035877
      0.031400
    
    
      28
      0.002718
      0.034853
      -0.053225
      -0.002349
      -4.299549e-03
      12.995
      12.061
      12.609500
      12.6715
      0.6479
      ...
      3.608696
      2.0
      4.0
      0.0
      0.782609
      0.0
      0.134000
      -0.075000
      0.035695
      0.031400
    
    
      29
      -0.017428
      0.034853
      -0.053225
      -0.002336
      -4.299549e-03
      13.005
      12.278
      12.766833
      12.9570
      0.6485
      ...
      3.434783
      2.0
      4.0
      0.0
      0.695652
      0.0
      0.134000
      -0.075000
      0.035670
      0.031400
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      1202
      -0.008434
      0.070505
      -0.019231
      0.006944
      3.517806e-03
      19.102
      18.939
      19.023833
      19.0360
      0.8353
      ...
      37.695652
      16.0
      806.0
      5.0
      130.608696
      76.0
      0.084625
      0.000601
      0.043454
      0.043935
    
    
      1203
      0.000202
      0.070505
      -0.019231
      0.006552
      2.652204e-03
      19.102
      18.864
      19.001833
      19.0095
      0.8353
      ...
      40.434783
      16.0
      806.0
      5.0
      134.695652
      76.0
      0.084625
      0.000601
      0.044490
      0.043935
    
    
      1204
      -0.006681
      0.070505
      -0.019231
      0.006414
      2.652204e-03
      19.087
      18.705
      18.935667
      18.9410
      0.8346
      ...
      40.913043
      16.0
      806.0
      5.0
      139.130435
      77.0
      0.084625
      0.000601
      0.043665
      0.043935
    
    
      1205
      -0.001427
      0.070505
      -0.019231
      0.006565
      2.652204e-03
      19.076
      18.705
      18.872500
      18.9015
      0.8341
      ...
      42.043478
      16.0
      806.0
      5.0
      144.869565
      77.0
      0.084625
      0.000601
      0.041017
      0.043440
    
    
      1206
      0.004899
      0.070505
      -0.019231
      0.006758
      2.652204e-03
      18.943
      18.583
      18.790333
      18.7860
      0.8284
      ...
      41.913043
      16.0
      806.0
      5.0
      144.695652
      77.0
      0.084625
      0.000601
      0.039896
      0.043440
    
    
      1207
      -0.002234
      0.070505
      -0.019231
      0.006151
      2.214395e-03
      18.943
      18.557
      18.726667
      18.7065
      0.8284
      ...
      42.086957
      16.0
      806.0
      5.0
      144.608696
      77.0
      0.084625
      0.000601
      0.039886
      0.043440
    
    
      1208
      0.007125
      0.070505
      -0.019231
      0.005843
      2.214395e-03
      18.864
      18.557
      18.677500
      18.6765
      0.8249
      ...
      42.652174
      16.0
      806.0
      5.0
      146.695652
      77.0
      0.084625
      0.000601
      0.039681
      0.043440
    
    
      1209
      -0.010916
      0.070505
      -0.019231
      0.006370
      2.652204e-03
      18.708
      18.557
      18.634500
      18.6270
      0.8181
      ...
      41.826087
      15.0
      806.0
      5.0
      143.000000
      76.0
      0.113860
      0.000601
      0.042742
      0.043935
    
    
      1210
      -0.005109
      0.070505
      -0.019231
      0.007224
      3.082898e-03
      18.739
      18.557
      18.640167
      18.6270
      0.8194
      ...
      42.260870
      16.0
      806.0
      5.0
      143.956522
      77.0
      0.113860
      0.000601
      0.042439
      0.043935
    
    
      1211
      -0.008833
      0.070505
      -0.019231
      0.006102
      2.652204e-03
      18.739
      18.534
      18.611167
      18.5945
      0.8194
      ...
      41.652174
      15.0
      806.0
      5.0
      141.434783
      76.0
      0.113860
      0.000601
      0.039250
      0.041478
    
    
      1212
      0.008912
      0.070505
      -0.019231
      0.006214
      2.652204e-03
      18.739
      18.439
      18.587167
      18.5815
      0.8194
      ...
      41.956522
      15.0
      806.0
      5.0
      142.739130
      76.0
      0.113860
      0.000601
      0.038814
      0.041478
    
    
      1213
      -0.003287
      0.070505
      -0.019231
      0.004376
      2.214395e-03
      18.739
      18.277
      18.540500
      18.5700
      0.8194
      ...
      40.391304
      15.0
      806.0
      5.0
      135.869565
      71.0
      0.113860
      0.000601
      0.039933
      0.041478
    
    
      1214
      0.000618
      0.070505
      -0.019231
      0.004947
      2.214395e-03
      18.739
      18.277
      18.505667
      18.4865
      0.8194
      ...
      41.739130
      17.0
      806.0
      5.0
      141.826087
      71.0
      0.113860
      0.000601
      0.039996
      0.041478
    
    
      1215
      0.007415
      0.070505
      -0.029838
      0.003797
      2.214395e-03
      18.739
      18.277
      18.467833
      18.4390
      0.8194
      ...
      39.782609
      15.0
      806.0
      5.0
      136.826087
      65.0
      0.113860
      -0.001712
      0.037918
      0.039241
    
    
      1216
      -0.000818
      0.070505
      -0.029838
      0.004041
      2.652204e-03
      18.534
      18.277
      18.409667
      18.4145
      0.8105
      ...
      37.478261
      12.0
      806.0
      5.0
      130.826087
      63.0
      0.113860
      -0.001712
      0.037869
      0.039241
    
    
      1217
      0.026397
      0.070505
      -0.029838
      0.004558
      2.652204e-03
      18.527
      18.277
      18.408500
      18.4145
      0.8101
      ...
      37.304348
      12.0
      806.0
      5.0
      131.304348
      63.0
      0.113860
      -0.001712
      0.038539
      0.039241
    
    
      1218
      -0.008174
      0.070505
      -0.029838
      0.004514
      2.485632e-03
      18.527
      18.277
      18.420500
      18.4145
      0.8101
      ...
      36.217391
      11.0
      806.0
      5.0
      126.043478
      63.0
      0.113860
      -0.001712
      0.038906
      0.039241
    
    
      1219
      0.006633
      0.070505
      -0.029838
      0.004661
      2.645089e-03
      19.000
      18.379
      18.541000
      18.4750
      0.8308
      ...
      35.652174
      11.0
      806.0
      5.0
      125.000000
      63.0
      0.113860
      -0.001712
      0.039032
      0.039241
    
    
      1220
      0.022164
      0.029156
      -0.029838
      0.001702
      1.686586e-03
      19.000
      18.379
      18.608667
      18.5190
      0.8308
      ...
      32.956522
      11.0
      806.0
      5.0
      114.739130
      63.0
      0.113860
      -0.001712
      0.041121
      0.041478
    
    
      1221
      0.013675
      0.029156
      -0.029838
      0.002997
      2.645089e-03
      19.000
      18.390
      18.707167
      18.6860
      0.8308
      ...
      17.347826
      11.0
      329.0
      5.0
      83.086957
      63.0
      0.137027
      -0.001712
      0.044488
      0.041478
    
    
      1222
      0.006938
      0.029156
      -0.029838
      0.002488
      1.686586e-03
      19.390
      18.511
      18.873833
      18.9075
      0.8479
      ...
      21.478261
      11.0
      329.0
      5.0
      91.130435
      63.0
      0.137027
      -0.001712
      0.044738
      0.043983
    
    
      1223
      0.002679
      0.029156
      -0.029838
      0.001450
      1.231595e-03
      19.655
      18.511
      19.061833
      18.9850
      0.8595
      ...
      22.130435
      12.0
      329.0
      5.0
      91.304348
      63.0
      0.137027
      -0.005180
      0.043529
      0.043983
    
    
      1224
      -0.000573
      0.029156
      -0.029838
      0.001071
      1.231595e-03
      19.792
      18.845
      19.275333
      19.1950
      0.8655
      ...
      22.304348
      12.0
      329.0
      17.0
      92.782609
      63.0
      0.137027
      -0.005180
      0.044018
      0.043983
    
    
      1225
      0.022727
      0.029156
      -0.029838
      0.001376
      1.686586e-03
      19.845
      18.845
      19.416167
      19.5225
      0.8678
      ...
      25.043478
      15.0
      329.0
      17.0
      104.695652
      63.0
      0.137027
      -0.005180
      0.042725
      0.041478
    
    
      1226
      0.025770
      0.029156
      -0.029838
      0.000664
      1.686586e-03
      19.845
      18.970
      19.580833
      19.7235
      0.8678
      ...
      20.521739
      12.0
      319.0
      3.0
      90.521739
      63.0
      0.137027
      -0.005180
      0.043019
      0.043983
    
    
      1227
      0.150191
      0.029156
      -0.043294
      -0.001225
      1.231595e-03
      20.284
      19.390
      19.799833
      19.8125
      0.8870
      ...
      21.304348
      15.0
      319.0
      3.0
      90.260870
      63.0
      0.137027
      -0.005180
      0.041532
      0.036681
    
    
      1228
      0.002849
      0.029156
      -0.043294
      -0.001051
      1.988474e-03
      20.807
      19.655
      20.036000
      19.8390
      0.9098
      ...
      19.913043
      13.0
      319.0
      3.0
      83.478261
      62.0
      0.137027
      -0.005180
      0.044877
      0.043983
    
    
      1229
      0.000316
      0.029156
      -0.043294
      -0.001972
      1.231595e-03
      23.932
      19.792
      20.748833
      20.0645
      1.0465
      ...
      24.260870
      15.0
      319.0
      3.0
      85.130435
      62.0
      0.183000
      -0.005180
      0.052206
      0.044994
    
    
      1230
      -0.003629
      0.029156
      -0.043294
      -0.001687
      1.467251e-03
      24.000
      19.833
      21.450167
      20.5455
      1.0495
      ...
      25.260870
      15.0
      319.0
      3.0
      89.391304
      63.0
      0.183000
      -0.005180
      0.049275
      0.043983
    
    
      1231
      -0.014252
      0.029156
      -0.043294
      -0.001032
      2.130377e-03
      24.008
      19.833
      22.144000
      22.3695
      1.0498
      ...
      24.652174
      15.0
      319.0
      3.0
      86.782609
      62.0
      0.183000
      -0.005180
      0.051106
      0.044994
    
  

1232 rows × 29 columns



In [314]:

    
# ##### For CAB #####
dataset = cab_feature
split = 0.7
shape = dataset.shape
dataset = dataset.values

scaler = MinMaxScaler(feature_range=(0, 1)) # or 0,1
dataset = scaler.fit_transform(dataset).reshape(shape)
# split into train and test sets
train_size = int(len(dataset) * split)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size], dataset[train_size:len(dataset)]

trainX = train[:,1:]
trainY = train[:,0]
testX = test[:,1:]
testY = test[:,0]

C_list = np.arange(0,100000, 10000)
index = C_list
rmse = pd.DataFrame(columns=['train_rmse','test_rmse'],\
                    index = index)
rmse = rmse.fillna(0)
for c in C_list:
    print " C = "+ str(c)
    r_trn,r_tst = cv_ridge(train,k_fold=5,alpha=c)
    rmse.loc[c] = [r_trn,r_tst]
c_= rmse["test_rmse"].idxmin()
print "*"* 80
print c_
plt.plot(rmse["test_rmse"])

print "Fitting a linear regression model with all the shifts"
rg = lm.Ridge(alpha = c_)
rg.fit(trainX, trainY)
print "Intercept: %f" %rg.intercept_
print "Coefficients: %s" %str(rg.coef_)
# save the prediction
testPredict = rg.predict(testX) 
trainPredict = rg.predict(trainX)

trainPredict = scaler.inverse_transform(trainPredict)
trainY = scaler.inverse_transform([trainY])
testPredict = scaler.inverse_transform(testPredict)
testY = scaler.inverse_transform([testY])
# training and test errors
print "R^2 for training set:",
print rg.score(trainX, trainY)
print "RMSE: %.5f" % math.sqrt(mean_squared_error(trainY, trainPredict))

print "R^2 for test set:",
print rg.score(testX, testY)
print "RMSE: %.5f" % math.sqrt(mean_squared_error(testY, testPredict))
print "with alpah = "+ str(c_)
print "*"*100



In [315]:

    
##### For DKS #####
dataset = dks_feature
split = 0.7
shape = dataset.shape
dataset = dataset.values

scaler = MinMaxScaler(feature_range=(0, 1)) # or 0,1
dataset = scaler.fit_transform(dataset).reshape(shape)
# split into train and test sets
train_size = int(len(dataset) * split)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size], dataset[train_size:len(dataset)]

trainX = train[:,1:]
trainY = train[:,0]
testX = test[:,1:]
testY = test[:,0]

C_list = np.arange(0,100000, 10000)
index = C_list
rmse = pd.DataFrame(columns=['train_rmse','test_rmse'],\
                    index = index)
rmse = rmse.fillna(0)
for c in C_list:
    print " C = "+ str(c)
    r_trn,r_tst = cv_ridge(train,k_fold=5,alpha=c)
    rmse.loc[c] = [r_trn,r_tst]
c_= rmse["test_rmse"].idxmin()
print "*"* 80
print c_
plt.plot(rmse["test_rmse"])

print "Fitting a linear regression model with all the shifts"
rg = lm.Ridge(alpha = c_)
rg.fit(trainX, trainY)
print "Intercept: %f" %rg.intercept_
print "Coefficients: %s" %str(rg.coef_)
# save the prediction
testPredict = rg.predict(testX) 
trainPredict = rg.predict(trainX)

# training and test errors
print "R^2 for training set:",
print rg.score(trainX, trainY)
print "RMSE: %.5f" % math.sqrt(mean_squared_error(trainY, trainPredict))

print "R^2 for test set:",
print rg.score(testX, testY)
print "RMSE: %.5f" % math.sqrt(mean_squared_error(testY, testPredict))
print "with alpah = "+ str(c_)
print "*"*100

3. AR model



In [423]:

    
##### For CAB #####
test_stationarity(cab)









    



/Users/yujingma/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:4: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=10,center=False).mean()
/Users/yujingma/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:5: FutureWarning: pd.rolling_std is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=10,center=False).std()






    












    



Results of Dickey-Fuller Test:
Test Statistic                -1.856349e+01
p-value                        2.086195e-30
#Lags Used                     3.000000e+00
Number of Observations Used    1.253000e+03
Critical Value (5%)           -2.863849e+00
Critical Value (1%)           -3.435580e+00
Critical Value (10%)          -2.568000e+00
dtype: float64



In [430]:

    
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(cab.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(cab, lags=40, ax=ax2)



In [431]:

    
cab1 = cab
cab1 = (cab1 - cab1.mean()) / (cab1.std())
plt.acorr(cab1,maxlags = len(cab1) -1, linestyle = "solid", usevlines = False, marker='')
plt.show()
autocorrelation_plot(cab1)
plt.show()



In [493]:

    
## according the plots, try ar(4,0)
arma_mod40 = sm.tsa.ARMA(cab, (4,0)).fit()
print arma_mod40.params
print arma_mod40.aic, arma_mod40.bic, arma_mod40.hqic
## check residual
print "*"*50
resid40 = arma_mod40.resid
print  "check residual"
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(resid40.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(resid40, lags=40, ax=ax2)
print "*"*50
print  "Prediction for the 5th year"
length = cab.shape[0]
predict_cab40 = arma_mod40.predict(7*length/10,length-1, dynamic=True)
cab_test = cab[(7*length/10):]
#print predict_cab40
#print cab_test
print "RMSE: %.5f" % math.sqrt(mean_squared_error(cab_test, predict_cab40))









    



const              0.001038
ar.L1.Adj Close    0.032064
ar.L2.Adj Close    0.007998
ar.L3.Adj Close    0.018676
ar.L4.Adj Close   -0.074131
dtype: float64
-5936.45429633 -5905.63539708 -5924.87165505
**************************************************
check residual
**************************************************
Prediction for the 5th year
RMSE: 0.02466



In [316]:

    
##### For DKS #####
test_stationarity(dks)









    



/Users/yujingma/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:4: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=10,center=False).mean()
/Users/yujingma/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:5: FutureWarning: pd.rolling_std is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=10,center=False).std()






    












    



Results of Dickey-Fuller Test:
Test Statistic                  -37.591524
p-value                           0.000000
#Lags Used                        0.000000
Number of Observations Used    1256.000000
Critical Value (5%)              -2.863844
Critical Value (1%)              -3.435567
Critical Value (10%)             -2.567997
dtype: float64



In [317]:

    
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(dks.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(dks, lags=40, ax=ax2)



In [319]:

    
dks1 = dks
dks1 = (dks1 - dks1.mean()) / (dks1.std())
plt.acorr(dks1,maxlags = len(dks1) -1, linestyle = "solid", usevlines = False, marker='')
plt.show()
autocorrelation_plot(dks1)
plt.show()



In [320]:

    
## according the plots, try ar(17,0)
arma_mod70 = sm.tsa.ARMA(dks, (17,0)).fit()
print arma_mod70.params
print arma_mod70.aic, arma_mod70.bic, arma_mod70.hqic
## check residual
print "*"*50
resid70 = arma_mod70.resid
print  "check residual"
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(resid70.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(resid70, lags=40, ax=ax2)
print "*"*50
print  "Prediction for the 5th year"
length = dks.shape[0]
predict_dks71 = arma_mod70.predict(7*length/10,length-1, dynamic=True)
dks_test = dks[(7*length/10):]

print "RMSE: %.5f" % math.sqrt(mean_squared_error(dks_test, predict_dks71))









    



/Users/yujingma/anaconda/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)






    



const       0.000696
ar.L1.y    -0.053160
ar.L2.y    -0.031886
ar.L3.y     0.000039
ar.L4.y     0.022337
ar.L5.y    -0.027724
ar.L6.y     0.006904
ar.L7.y    -0.046411
ar.L8.y    -0.007571
ar.L9.y     0.036364
ar.L10.y    0.031015
ar.L11.y   -0.027093
ar.L12.y    0.005629
ar.L13.y   -0.018688
ar.L14.y   -0.019820
ar.L15.y    0.024211
ar.L16.y   -0.053132
ar.L17.y    0.101446
dtype: float64
-6520.68894946 -6423.0957685 -6484.01058539
**************************************************
check residual
**************************************************
Prediction for the 5th year






    



/Users/yujingma/anaconda/lib/python2.7/site-packages/statsmodels/base/data.py:503: FutureWarning: TimeSeries is deprecated. Please use Series
  return TimeSeries(result, index=self.predict_dates)






    



RMSE: 0.01936

4. LSTM

Model1: window method with one layer



In [132]:

    
### Model1 with window size but no ICA 
## According to AR, Window size= 4 
lstm_np1,lstm_tp1=LSM(cab,seed=10,look_back=4,split=0.7,n_loop=50,layer2=4,show_log = 1,batch_size = 1,cv=False)
#Train Score: 0.02176 RMSE
#Test Score: 0.02454 RMSE
# save data
#df = pd.DataFrame([np.concatenate([lstm_np1,lstm_tp1])].reshape())
df =pd.DataFrame(np.concatenate([lstm_np1,lstm_tp1]))
df.columns =["lstm_cab1"]
df.to_csv("output/lstm_cab1.csv", sep=',')
# Train Score: 0.02184 RMSE
#Test Score: 0.02477 RMSE









    



/Users/yujingma/anaconda/lib/python2.7/site-packages/sklearn/preprocessing/data.py:324: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
/Users/yujingma/anaconda/lib/python2.7/site-packages/sklearn/preprocessing/data.py:359: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)






    



Epoch 1/1
874/874 [==============================] - 21s - loss: 0.0069    
Epoch 1/1
874/874 [==============================] - 21s - loss: 0.0042    
Epoch 1/1
874/874 [==============================] - 20s - loss: 0.0042    
Epoch 1/1
874/874 [==============================] - 20s - loss: 0.0042    
Epoch 1/1
874/874 [==============================] - 19s - loss: 0.0042    
Epoch 1/1
874/874 [==============================] - 20s - loss: 0.0042    
Epoch 1/1
874/874 [==============================] - 20s - loss: 0.0042    
Epoch 1/1
874/874 [==============================] - 22s - loss: 0.0042    
Epoch 1/1
874/874 [==============================] - 20s - loss: 0.0042    
Epoch 1/1
874/874 [==============================] - 21s - loss: 0.0042    
Epoch 1/1
874/874 [==============================] - 22s - loss: 0.0042    
Epoch 1/1
874/874 [==============================] - 21s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 21s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 21s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 21s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 20s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 23s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 22s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 22s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 22s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 22s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 21s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 22s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 22s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 22s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 25s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 26s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 27s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 28s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 29s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 23s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 21s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 27s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 28s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 28s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 21s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 21s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 21s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 22s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 25s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 28s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 29s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 23s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 21s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 22s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 21s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 22s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 21s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 21s - loss: 0.0041    
Epoch 1/1
874/874 [==============================] - 21s - loss: 0.0041    
Train Score: 0.02184 RMSE
Test Score: 0.02477 RMSE



In [25]:

    
lstm_np1,lstm_tp1=LSM(cab,seed=10,look_back=10,split=0.7,n_loop=100,layer2=4,show_log = 1,batch_size = 1,cv=False)
#Train Score: 0.02176 RMSE
#Test Score: 0.02454 RMSE
# save data
#df = pd.DataFrame([np.concatenate([lstm_np1,lstm_tp1])].reshape())
df =pd.DataFrame(np.concatenate([lstm_np1,lstm_tp1]))
df.columns =["lstm_cab1.1"]
df.to_csv("output/lstm_cab1_1.csv", sep=',')









    



/Users/yujingma/anaconda/lib/python2.7/site-packages/sklearn/preprocessing/data.py:324: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
/Users/yujingma/anaconda/lib/python2.7/site-packages/sklearn/preprocessing/data.py:359: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)






    



Epoch 1/1
868/868 [==============================] - 38s - loss: 0.0064    
Epoch 1/1
868/868 [==============================] - 39s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 39s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 39s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 39s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 39s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 36s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 36s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 39s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 39s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 38s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 37s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 36s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 35s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 34s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 37s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 42s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 41s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 39s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 40s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 38s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 34s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 34s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 34s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 34s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 36s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 38s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 40s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 40s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 40s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 37s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 36s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 37s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 37s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 36s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 36s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 37s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 37s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 39s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 40s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 38s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 37s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 40s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 38s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 40s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 40s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 40s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 39s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 39s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 39s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 39s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 40s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 39s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 42s - loss: 0.0041    
Epoch 1/1
868/868 [==============================] - 39s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 38s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 38s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 38s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 39s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 37s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 37s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 37s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 38s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 38s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 37s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 37s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 37s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 39s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 39s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 37s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 38s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 40s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 40s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 38s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 40s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 40s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 42s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 41s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 43s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 42s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 42s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 41s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 41s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 41s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 41s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 41s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 41s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 41s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 42s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 41s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 40s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 38s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 39s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 40s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 40s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 40s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 40s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 38s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 40s - loss: 0.0040    
Epoch 1/1
868/868 [==============================] - 40s - loss: 0.0040    
Train Score: 0.02176 RMSE
Test Score: 0.02454 RMSE



In [27]:

    
## According to AR, Window size= 4 
lstm_np2,lstm_tp2=LSM(dks,seed=10,look_back=17,split=0.7,n_loop=30,layer2=4,show_log = 1,batch_size = 1,cv=False)
df =pd.DataFrame(np.concatenate([lstm_np2,lstm_tp2]))
df.columns =["lstm_dks1"]
df.to_csv("output/lstm_dks1.csv", sep=',')
#Train Score: 0.01738 RMSE
#Test Score: 0.01997 RMSE









    



/Users/yujingma/anaconda/lib/python2.7/site-packages/sklearn/preprocessing/data.py:324: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
/Users/yujingma/anaconda/lib/python2.7/site-packages/sklearn/preprocessing/data.py:359: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)






    



Epoch 1/1
863/863 [==============================] - 57s - loss: 0.0077    
Epoch 1/1
863/863 [==============================] - 58s - loss: 0.0035    
Epoch 1/1
863/863 [==============================] - 58s - loss: 0.0035    
Epoch 1/1
863/863 [==============================] - 58s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 58s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 57s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 58s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 61s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 60s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 58s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 58s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 62s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 60s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 59s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 59s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 59s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 59s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 60s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 60s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 60s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 60s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 60s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 60s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 60s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 60s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 60s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 60s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 60s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 60s - loss: 0.0034    
Epoch 1/1
863/863 [==============================] - 59s - loss: 0.0034    
Train Score: 0.01738 RMSE
Test Score: 0.01997 RMSE

Model2: window method with two layers



In [210]:

    
lstm_np3,lstm_tp3=LSTM(cab,seed=10,look_back=4,split=0.7,n_loop=30,layer2=4,show_log = 1,batch = 1,cv=False)
#Train Score: 0.02176 RMSE
#Test Score: 0.02454 RMSE
# save data
#df = pd.DataFrame([np.concatenate([lstm_np1,lstm_tp1])].reshape())
df =pd.DataFrame(np.concatenate([lstm_np3,lstm_tp3]))
df.columns =["lstm_cab2"]
df.to_csv("output/lstm_cab2.csv", sep=',')



In [211]:

    
lstm_np4,lstm_tp4=LSTM(dks,seed=10,look_back=17,split=0.7,n_loop=100,layer2=4,show_log = 1,batch = 1,cv=False)
#Train Score: 0.02176 RMSE
#Test Score: 0.02454 RMSE
# save data
#df = pd.DataFrame([np.concatenate([lstm_np1,lstm_tp1])].reshape())
df =pd.DataFrame(np.concatenate([lstm_np4,lstm_tp4]))
df.columns =["lstm_dks2"]
df.to_csv("output/lstm_dks2.csv", sep=',')

Model3: window method with two layers + ICA



In [213]:

    
lstm_np5,lstm_tp5=LSTM_ica(cab,seed=10,look_back=10,n_component=3,layer2=5,split=0.7,n_loop=50,show_log = 1,cv=False)
df =pd.DataFrame(np.concatenate([lstm_np5,lstm_tp5]))
df.columns =["lstm_ica_cab"]
df.to_csv("output/lstm_cab3.csv", sep=',')



In [214]:

    
lstm_np6,lstm_tp6=LSTM_ica(dks,seed=10,look_back=17,n_component=8,layer2=5,split=0.7,n_loop=50,show_log = 1,cv=False)
df = pd.DataFrame(np.concatenate([lstm_np6,lstm_tp6]))
df.columns =["lstm_ica_dks"]
df.to_csv("output/lstm_dks3.csv", sep=',')

Model4: Leadlag correlation with two layers + IC



In [216]:

    
cab_others1 = pd.read_csv("output/cabotherstweets.csv",index_col=0)[3:]
lstm_np7,lstm_tp7=LSTM_others(cab_others1,seed=10,n_features=(cab_others1.shape[1]-1),layer2=5,split=0.7,n_loop=50,show_log = 1,cv=False)
df =pd.DataFrame(np.concatenate([lstm_np7,lstm_tp7]))
df.columns =["lstm_othert_cab"]
df.to_csv("output/lstm_cab4.csv", sep=',')



In [220]:

    
lstm_np8,lstm_tp8=LSTM_others(dks_others1,seed=10,n_features=(dks_others1.shape[1]-1),layer2=5,split=0.7,n_loop=50,show_log = 1,cv=False)
df =pd.DataFrame(np.concatenate([lstm_np8,lstm_tp8]))
df.columns =["lstm_othert_dks"]
df.to_csv("output/lstm_dks4.csv", sep=',')

Cross-validation process



In [202]:

    
## split the data into 5 parts
## if data is training and test together, we need to concate 
def cv_lstm(model,data,k_fold,split,**extraKwArgs):
    total = int(len(data) * split)
    trntotal = data[0:total]
    train_score = [0]*(k_fold-1)
    test_score = [0]*(k_fold-1)
    L = int(len(data)/k_fold)
    for i in range(1,k_fold):
        ind = i*(L+1)
        subset = data[0:ind]
        train_score[i-1],test_score[i-1] = model(dataset=subset,**extraKwArgs)
    print('Train Score: %.2f RMSE' % ((numpy.array(train_score)).mean()))
    print('Test Score: %.2f RMSE' % ((numpy.array(test_score)).mean()))



In [219]:

    
for size in range(1,20):
    print "Batch = "+ str(size)
    cv_lstm(LSTM,data=cab,k_fold=5,batch_size =size,n_loop=10,split=0.7,cv=True)

4.1 Used LSTM models



In [ ]:

    
# Simple model with lookback=1
# default is very simple RNN layers=[1,4,1]
def LSTM_1(trainX,trainY,testX,testY,dataset=cab,layers=[1,4,1],lookback=1,epoch=100,batch=1):
    scaler = MinMaxScaler(feature_range=(0, 1))
    dataset = dataset.values.reshape(len(dataset),1)
    trainX,trainY,testX,testY = map(np.array,(trainX,trainY,testX,testY))
    look_back=lookback
    # prepare data
    # reshape input to be [samples, time steps, features]
    trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
    testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))
    # create and fit the LSTM network
    model = Sequential()
    layers = layers
    model.add(LSTM(layers[1], input_dim=layers[0]))
    model.add(Dense(layers[2])) #a Dense layer ( = feedforward)
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(trainX, trainY, nb_epoch=epoch, batch_size=batch, verbose=2)
    # make predictions
    trainPredict = model.predict(trainX)
    testPredict = model.predict(testX)
    # invert predictions
    trainPredict = scaler.inverse_transform(trainPredict)
    trainY = scaler.inverse_transform([trainY])
    testPredict = scaler.inverse_transform(testPredict)
    testY = scaler.inverse_transform([testY])

    # calculate root mean squared error
    trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
    print('Train Score: %.2f RMSE' % (trainScore))
    testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
    print('Test Score: %.2f RMSE' % (testScore))

    # shift train predictions for plotting
    trainPredictPlot = np.empty_like(dataset.values)
    trainPredictPlot[:, :] = np.nan
    trainPredictPlot[look_back:len(trainPredict)+look_back, :] = trainPredict
    # shift test predictions for plotting
    testPredictPlot = np.empty_like(dataset)
    testPredictPlot[:, :] = np.nan
    testPredictPlot[len(trainPredict)+(look_back*2)+1:len(dataset)-1, :] = testPredict
    # plot baseline and predictions
    fig = plt.figure()
    fig.set_size_inches(18,16)
    plt.plot(scaler.inverse_transform(dataset))
    plt.plot(trainPredictPlot)
    plt.plot(testPredictPlot)
    plt.show()

    return trainPredict,testPredict
## Use window method data
#trainX,trainY,testX,testY = get_wmdata(seed=7,data=cab,split=0.7,look_back=1)
#trainX,trainY,testX,testY = get_wmdata(seed=7,data=cab,split=0.7,look_back=10)
############# With default layers #############################
# First model split=0.7,epoch=100,batch_size=1
# LSM1_1_trnpred,LSM1_1_tstpred = LSTM_1(trainX,trainY,testX,testY,layers=[1,4,1],epoch=100,batch=1)
    # Train Score: 0.02 RMSE
    # Test Score: 0.02 RMSE
# Second model split=0.7,epoch=100,batch_size=2
#LSM1_2_trnpred,LSM1_2_tstpred = LSTM_1(trainX,trainY,testX,testY,layers=[1,4,1],epoch=100,batch=2)
    # Train Score: 0.02 RMSE
    # Test Score: 0.02 RMSE
# Third model split=0.7,epoch=30,batch_size=2

############# With a hidden layer of sizes 50 #################
#LSM1_3_trnpred,LSM1_3_tstpred = LSTM_1(trainX,trainY,testX,testY,layers=[1,4,1],epoch=100,batch=2)
    # Train Score: 0.02 RMSE
    # Test Score: 0.02 RMSE

############# With a window method ################# 
#LSM1_4_trnpred,LSM1_4_tstpred = LSTM_1(trainX10,trainY10,testX10,testY10,dataset=cab,layers=[10,50,1],epoch=100,batch=1)
    # Train Score: 0.02 RMSE
    # Test Score: 0.03 RMSE
    # But this one capture the small pikes
## Use window method and Ica data
# trainX10_ica3,trainY10,testX10_ica3,testY10 = wmd_ica(10, cab, split=0.7, look_back=10, n_component=3)
# LSM1_5_trnpred,LSM1_5_tstpred = LSTM_1(trainX10_ica3,trainY10,testX10_ica3,testY10,lookback=10,layers=[3,50,1],epoch=100,batch=1)
     #Train Score: 0.02 RMSE
    #Test Score: 0.02 RMSE



In [ ]:

    
# A model with window method and time steps
# Use the past observations them as time steps of the one input feature
# default is very simple RNN layers=[1,4,1]
def LSTM_2(trainX,trainY,testX,testY,dataset = cab, layers=[1,4,1],epoch=100,batch=1):
    scaler = MinMaxScaler(feature_range=(0, 1))
    dataset = dataset.values.reshape(len(dataset),1)
    trainX,trainY,testX,testY = map(np.array,(trainX,trainY,testX,testY))
    # reshape input to be [samples, time steps, features], set the columns to be the time steps dimension 
    trainX = np.reshape(trainX, (trainX.shape[0], trainX.shape[1], 1))
    testX = np.reshape(testX, (testX.shape[0], testX.shape[1], 1))
    # create and fit the LSTM network
    model = Sequential()
    layers = layers
    look_back = layers[0]
    model.add(LSTM(layers[1], input_dim=layers[0]))
    model.add(Dense(layers[2])) #a Dense layer ( = feedforward)
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(trainX, trainY, nb_epoch=epoch, batch_size=batch, verbose=2)
    # make predictions
    trainPredict = model.predict(trainX)
    testPredict = model.predict(testX)
    # invert predictions
    trainPredict = scaler.inverse_transform(trainPredict)
    trainY = scaler.inverse_transform([trainY])
    testPredict = scaler.inverse_transform(testPredict)
    testY = scaler.inverse_transform([testY])
    # calculate root mean squared error
    trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
    print('Train Score: %.2f RMSE' % (trainScore))
    testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
    print('Test Score: %.2f RMSE' % (testScore))
    # shift train predictions for plotting
    trainPredictPlot = np.empty_like(dataset)
    trainPredictPlot[:, :] = np.nan
    trainPredictPlot[look_back:len(trainPredict)+look_back, :] = trainPredict
    # shift test predictions for plotting
    testPredictPlot = np.empty_like(dataset.values)
    testPredictPlot[:, :] = np.nan
    testPredictPlot[len(trainPredict)+(look_back*2)+1:len(dataset)-1, :] = testPredict
    # plot baseline and predictions
    fig = plot.figure()
    fig.set_size_inch(18,16)
    plt.plot(scaler.inverse_transform(dataset))
    plt.plot(trainPredictPlot)
    plt.plot(testPredictPlot)
    plt.show()
    
    return trainPredict,testPredict
## Use window method data
#LSM2_1_trnpred,LSM2_1_tstpred = LSTM_2(trainX,trainY,testX,testY,layers=[1,4,1],epoch=1,batch=1)
#Train Score: 0.02 RMSE
#Test Score: 0.02 RMSE
## Use window method and Ica data
#LSM2_2_trnpred,LSM2_2_tstpred = LSTM_2(trainX10_ica3,trainY10,testX10_ica3,testY10,layers=[3,50,1],epoch=100,batch=1)
#Train Score: 0.02 RMSE
#Test Score: 0.02 RMSE

Blending model via Gradient Boosting



In [229]:

    
cab_predictors = pd.read_csv("output/cab_prediction.csv")
cab_trn_pred, cab_test_pred = blending(cab_predictors,0.7,learning_rate=0.1,max_depth=15,random_state=45)



In [235]:

    
dks_predictors = pd.read_csv("output/dks_prediction.csv")
dks_trn_pred, dks_test_pred = blending(dks_predictors,0.7,learning_rate=0.1,max_depth=15,random_state=45)



In [ ]:

	returns	dks_max	dks_min	dks_mean	dks_med	pe_max	pe_min	pe_mean	pe_med	ps_max	...	tr_mean	tr_med	tf_max	tf_min	tf_mean	tf_med	md_max	md_min	md_mean	md_med
0	-0.016949	0.038632	-0.033736	0.004966	5.686946e-03	13.121	12.404	12.842667	12.8715	0.6542	...	3.347826	0.0	4.0	0.0	0.521739	0.0	0.173000	-0.061800	0.039278	0.000000
1	-0.033699	0.038632	-0.033736	0.005146	5.686946e-03	13.121	12.404	12.902333	13.0480	0.6542	...	3.521739	0.0	4.0	0.0	0.565217	0.0	0.173000	-0.061800	0.044341	0.000000
2	-0.021492	0.046235	-0.033736	0.006433	5.686946e-03	13.121	12.404	12.885500	12.9975	0.6542	...	3.782609	0.0	4.0	0.0	0.565217	0.0	0.172000	-0.061800	0.038398	0.000000
3	-0.004973	0.046235	-0.033736	0.004815	2.146015e-03	13.111	12.404	12.774500	12.7855	0.6537	...	3.304348	0.0	4.0	0.0	0.478261	0.0	0.172000	-0.061800	0.040482	0.000000
4	-0.021241	0.046235	-0.033736	0.003029	8.927312e-04	13.111	12.187	12.738333	12.7855	0.6537	...	3.347826	0.0	4.0	0.0	0.478261	0.0	0.172000	-0.061800	0.047135	0.014500
5	0.005106	0.046235	-0.033736	0.004416	1.009494e-03	13.111	12.126	12.645667	12.6720	0.6537	...	3.347826	0.0	4.0	0.0	0.478261	0.0	0.172000	-0.061800	0.047135	0.014500
6	-0.059695	0.046235	-0.033736	0.003052	3.858948e-04	13.111	11.869	12.439500	12.3210	0.6537	...	3.521739	0.0	4.0	0.0	0.565217	0.0	0.172000	-0.061800	0.047160	0.014500
7	-0.022062	0.046235	-0.033736	0.002758	2.513289e-04	12.889	11.869	12.242500	12.1565	0.6427	...	3.173913	0.0	4.0	0.0	0.608696	0.0	0.172000	0.000000	0.050181	0.014500
8	0.048343	0.046235	-0.043016	0.000596	-2.512027e-04	12.455	11.217	11.963833	12.0275	0.6210	...	3.217391	1.0	4.0	0.0	0.652174	0.0	0.172000	0.000000	0.051911	0.021695
9	-0.010979	0.046235	-0.043016	-0.001211	-5.028478e-04	12.187	10.970	11.716333	11.8990	0.6077	...	3.521739	1.0	4.0	0.0	0.782609	0.0	0.172000	0.000000	0.048126	0.021695
10	0.046625	0.046235	-0.043016	-0.000341	-5.028478e-04	12.126	10.970	11.601833	11.6845	0.6046	...	3.608696	1.0	4.0	0.0	0.782609	0.0	0.172000	0.000000	0.051994	0.035950
11	0.023335	0.046235	-0.043016	0.000467	-5.028478e-04	11.929	10.970	11.476500	11.4370	0.5948	...	3.652174	1.0	4.0	0.0	0.782609	0.0	0.172000	0.000000	0.057298	0.036300
12	0.007463	0.046235	-0.043016	0.001310	-5.028478e-04	11.929	10.970	11.482333	11.4370	0.5948	...	2.913043	1.0	4.0	0.0	0.652174	0.0	0.172000	0.000000	0.054800	0.035950
13	0.018930	0.046235	-0.043016	-0.000607	-1.158374e-03	12.182	10.970	11.524500	11.4370	0.6074	...	2.782609	1.0	3.0	0.0	0.478261	0.0	0.172000	0.000000	0.048105	0.021695
14	0.004039	0.046235	-0.043016	0.000921	-5.028478e-04	12.273	10.970	11.700500	11.7020	0.6119	...	2.347826	1.0	3.0	0.0	0.478261	0.0	0.172000	-0.012350	0.041350	0.007680
15	-0.008447	0.046235	-0.043016	0.002230	1.262690e-07	12.505	11.374	11.956333	12.0430	0.6235	...	2.347826	1.0	3.0	0.0	0.478261	0.0	0.172000	-0.012350	0.041350	0.007680
16	-0.027586	0.046235	-0.043016	0.002418	1.262690e-07	12.556	11.374	12.132333	12.2275	0.6260	...	2.521739	1.0	3.0	0.0	0.521739	0.0	0.172000	-0.012350	0.042537	0.021695
17	0.052149	0.046235	-0.043016	0.001445	-5.028478e-04	12.556	11.904	12.311500	12.3610	0.6260	...	2.608696	1.0	3.0	0.0	0.521739	0.0	0.172000	-0.012350	0.045876	0.027300
18	-0.018636	0.046235	-0.043016	0.000035	-1.158374e-03	12.556	12.106	12.345167	12.3610	0.6260	...	3.304348	1.0	4.0	0.0	0.695652	0.0	0.172000	-0.012350	0.050789	0.035950
19	-0.012121	0.046235	-0.043016	0.000493	-1.158374e-03	12.737	12.106	12.437667	12.4770	0.6351	...	3.478261	2.0	4.0	0.0	0.695652	0.0	0.172000	-0.037700	0.049150	0.035950
20	-0.021677	0.046235	-0.043016	-0.000384	-4.299549e-03	12.737	12.106	12.475500	12.5025	0.6351	...	3.652174	2.0	4.0	0.0	0.695652	0.0	0.153000	-0.037700	0.044683	0.035950
21	-0.001672	0.046235	-0.053225	-0.002580	-7.049828e-03	12.737	12.106	12.449333	12.4745	0.6351	...	3.739130	2.0	4.0	0.0	0.739130	0.0	0.153000	-0.037700	0.047900	0.035950
22	0.028895	0.046235	-0.053225	-0.002179	-7.049828e-03	12.737	12.081	12.370167	12.3985	0.6351	...	3.521739	2.0	4.0	0.0	0.739130	0.0	0.153000	-0.037700	0.048322	0.035950
23	-0.010582	0.046235	-0.053225	-0.002501	-7.049828e-03	12.737	12.061	12.305500	12.2270	0.6351	...	3.521739	2.0	4.0	0.0	0.739130	0.0	0.153000	-0.037700	0.051731	0.036300
24	0.053476	0.046235	-0.053225	-0.002710	-7.049828e-03	12.737	12.061	12.356000	12.3785	0.6351	...	3.652174	2.0	4.0	0.0	0.739130	0.0	0.153000	-0.037700	0.050764	0.036300
25	0.004686	0.046235	-0.053225	-0.003190	-7.049828e-03	12.500	12.061	12.279500	12.3130	0.6233	...	3.391304	2.0	4.0	0.0	0.695652	0.0	0.153000	-0.037700	0.048303	0.035950
26	-0.001166	0.034853	-0.053225	-0.003742	-7.049828e-03	12.934	12.061	12.351833	12.3130	0.6449	...	3.478261	2.0	4.0	0.0	0.739130	0.0	0.153000	-0.037700	0.045790	0.035950
27	0.001945	0.034853	-0.053225	-0.002477	-4.299549e-03	12.995	12.061	12.459667	12.3435	0.6479	...	3.608696	2.0	4.0	0.0	0.782609	0.0	0.134000	-0.075000	0.035877	0.031400
28	0.002718	0.034853	-0.053225	-0.002349	-4.299549e-03	12.995	12.061	12.609500	12.6715	0.6479	...	3.608696	2.0	4.0	0.0	0.782609	0.0	0.134000	-0.075000	0.035695	0.031400
29	-0.017428	0.034853	-0.053225	-0.002336	-4.299549e-03	13.005	12.278	12.766833	12.9570	0.6485	...	3.434783	2.0	4.0	0.0	0.695652	0.0	0.134000	-0.075000	0.035670	0.031400
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1202	-0.008434	0.070505	-0.019231	0.006944	3.517806e-03	19.102	18.939	19.023833	19.0360	0.8353	...	37.695652	16.0	806.0	5.0	130.608696	76.0	0.084625	0.000601	0.043454	0.043935
1203	0.000202	0.070505	-0.019231	0.006552	2.652204e-03	19.102	18.864	19.001833	19.0095	0.8353	...	40.434783	16.0	806.0	5.0	134.695652	76.0	0.084625	0.000601	0.044490	0.043935
1204	-0.006681	0.070505	-0.019231	0.006414	2.652204e-03	19.087	18.705	18.935667	18.9410	0.8346	...	40.913043	16.0	806.0	5.0	139.130435	77.0	0.084625	0.000601	0.043665	0.043935
1205	-0.001427	0.070505	-0.019231	0.006565	2.652204e-03	19.076	18.705	18.872500	18.9015	0.8341	...	42.043478	16.0	806.0	5.0	144.869565	77.0	0.084625	0.000601	0.041017	0.043440
1206	0.004899	0.070505	-0.019231	0.006758	2.652204e-03	18.943	18.583	18.790333	18.7860	0.8284	...	41.913043	16.0	806.0	5.0	144.695652	77.0	0.084625	0.000601	0.039896	0.043440
1207	-0.002234	0.070505	-0.019231	0.006151	2.214395e-03	18.943	18.557	18.726667	18.7065	0.8284	...	42.086957	16.0	806.0	5.0	144.608696	77.0	0.084625	0.000601	0.039886	0.043440
1208	0.007125	0.070505	-0.019231	0.005843	2.214395e-03	18.864	18.557	18.677500	18.6765	0.8249	...	42.652174	16.0	806.0	5.0	146.695652	77.0	0.084625	0.000601	0.039681	0.043440
1209	-0.010916	0.070505	-0.019231	0.006370	2.652204e-03	18.708	18.557	18.634500	18.6270	0.8181	...	41.826087	15.0	806.0	5.0	143.000000	76.0	0.113860	0.000601	0.042742	0.043935
1210	-0.005109	0.070505	-0.019231	0.007224	3.082898e-03	18.739	18.557	18.640167	18.6270	0.8194	...	42.260870	16.0	806.0	5.0	143.956522	77.0	0.113860	0.000601	0.042439	0.043935
1211	-0.008833	0.070505	-0.019231	0.006102	2.652204e-03	18.739	18.534	18.611167	18.5945	0.8194	...	41.652174	15.0	806.0	5.0	141.434783	76.0	0.113860	0.000601	0.039250	0.041478
1212	0.008912	0.070505	-0.019231	0.006214	2.652204e-03	18.739	18.439	18.587167	18.5815	0.8194	...	41.956522	15.0	806.0	5.0	142.739130	76.0	0.113860	0.000601	0.038814	0.041478
1213	-0.003287	0.070505	-0.019231	0.004376	2.214395e-03	18.739	18.277	18.540500	18.5700	0.8194	...	40.391304	15.0	806.0	5.0	135.869565	71.0	0.113860	0.000601	0.039933	0.041478
1214	0.000618	0.070505	-0.019231	0.004947	2.214395e-03	18.739	18.277	18.505667	18.4865	0.8194	...	41.739130	17.0	806.0	5.0	141.826087	71.0	0.113860	0.000601	0.039996	0.041478
1215	0.007415	0.070505	-0.029838	0.003797	2.214395e-03	18.739	18.277	18.467833	18.4390	0.8194	...	39.782609	15.0	806.0	5.0	136.826087	65.0	0.113860	-0.001712	0.037918	0.039241
1216	-0.000818	0.070505	-0.029838	0.004041	2.652204e-03	18.534	18.277	18.409667	18.4145	0.8105	...	37.478261	12.0	806.0	5.0	130.826087	63.0	0.113860	-0.001712	0.037869	0.039241
1217	0.026397	0.070505	-0.029838	0.004558	2.652204e-03	18.527	18.277	18.408500	18.4145	0.8101	...	37.304348	12.0	806.0	5.0	131.304348	63.0	0.113860	-0.001712	0.038539	0.039241
1218	-0.008174	0.070505	-0.029838	0.004514	2.485632e-03	18.527	18.277	18.420500	18.4145	0.8101	...	36.217391	11.0	806.0	5.0	126.043478	63.0	0.113860	-0.001712	0.038906	0.039241
1219	0.006633	0.070505	-0.029838	0.004661	2.645089e-03	19.000	18.379	18.541000	18.4750	0.8308	...	35.652174	11.0	806.0	5.0	125.000000	63.0	0.113860	-0.001712	0.039032	0.039241
1220	0.022164	0.029156	-0.029838	0.001702	1.686586e-03	19.000	18.379	18.608667	18.5190	0.8308	...	32.956522	11.0	806.0	5.0	114.739130	63.0	0.113860	-0.001712	0.041121	0.041478
1221	0.013675	0.029156	-0.029838	0.002997	2.645089e-03	19.000	18.390	18.707167	18.6860	0.8308	...	17.347826	11.0	329.0	5.0	83.086957	63.0	0.137027	-0.001712	0.044488	0.041478
1222	0.006938	0.029156	-0.029838	0.002488	1.686586e-03	19.390	18.511	18.873833	18.9075	0.8479	...	21.478261	11.0	329.0	5.0	91.130435	63.0	0.137027	-0.001712	0.044738	0.043983
1223	0.002679	0.029156	-0.029838	0.001450	1.231595e-03	19.655	18.511	19.061833	18.9850	0.8595	...	22.130435	12.0	329.0	5.0	91.304348	63.0	0.137027	-0.005180	0.043529	0.043983
1224	-0.000573	0.029156	-0.029838	0.001071	1.231595e-03	19.792	18.845	19.275333	19.1950	0.8655	...	22.304348	12.0	329.0	17.0	92.782609	63.0	0.137027	-0.005180	0.044018	0.043983
1225	0.022727	0.029156	-0.029838	0.001376	1.686586e-03	19.845	18.845	19.416167	19.5225	0.8678	...	25.043478	15.0	329.0	17.0	104.695652	63.0	0.137027	-0.005180	0.042725	0.041478
1226	0.025770	0.029156	-0.029838	0.000664	1.686586e-03	19.845	18.970	19.580833	19.7235	0.8678	...	20.521739	12.0	319.0	3.0	90.521739	63.0	0.137027	-0.005180	0.043019	0.043983
1227	0.150191	0.029156	-0.043294	-0.001225	1.231595e-03	20.284	19.390	19.799833	19.8125	0.8870	...	21.304348	15.0	319.0	3.0	90.260870	63.0	0.137027	-0.005180	0.041532	0.036681
1228	0.002849	0.029156	-0.043294	-0.001051	1.988474e-03	20.807	19.655	20.036000	19.8390	0.9098	...	19.913043	13.0	319.0	3.0	83.478261	62.0	0.137027	-0.005180	0.044877	0.043983
1229	0.000316	0.029156	-0.043294	-0.001972	1.231595e-03	23.932	19.792	20.748833	20.0645	1.0465	...	24.260870	15.0	319.0	3.0	85.130435	62.0	0.183000	-0.005180	0.052206	0.044994
1230	-0.003629	0.029156	-0.043294	-0.001687	1.467251e-03	24.000	19.833	21.450167	20.5455	1.0495	...	25.260870	15.0	319.0	3.0	89.391304	63.0	0.183000	-0.005180	0.049275	0.043983
1231	-0.014252	0.029156	-0.043294	-0.001032	2.130377e-03	24.008	19.833	22.144000	22.3695	1.0498	...	24.652174	15.0	319.0	3.0	86.782609	62.0	0.183000	-0.005180	0.051106	0.044994

Model

Amy Ma

Introduction

Functions

Data preparation

1. Stcok market data

2. Social media data

Basic visualizations

1. Stock Market

Returns vs. Tweets

Feature Generation

1.Leadlag correlation data (one vs. others)

1.Leadlag correlation data

2. Leadlag correlation data with features

3. Window method data

Model Fittings

Feature Extraction

1. Linear regression (Leadlag correlation data )

2. Ridge regression with Cross Validation

3. AR model

4. LSTM

Model1: window method with one layer

Model2: window method with two layers

Model3: window method with two layers + ICA

Model4: Leadlag correlation with two layers + IC

Cross-validation process

4.1 Used LSTM models

Blending model via Gradient Boosting