In [1]:
    
import pandas as pd
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
from sklearn.decomposition import FastICA
import sklearn.linear_model as lm
import statsmodels.api as sm
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn import metrics,preprocessing,cross_validation
from statsmodels.tsa.stattools import adfuller
from pandas.tools.plotting import autocorrelation_plot
from sklearn.ensemble import GradientBoostingRegressor
    
    
In [2]:
    
# define a function to import datasets
'''
Define a function to read a dataset:
    1. source = the data folder
    2. dataname = the file name
    3. date = the date column
'''
def read_adj(source,dataname,date):
    # define a function to parse the date
    dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d')
    # give path
    path = source+dataname
    # read and save the data 
    data = pd.read_csv(path,parse_dates=[date], date_parser=dateparse) 
    data = data.set_index(date)
    # sort data by datetime index
    data = data.sort_index()
    print "Basic information of "+dataname+":"
    print data.head() # to see the data
    print "*" *100
    print data.describe() # summarize the data
    return data
def read_tweet(source,dataname,date):
    # define a function to parse the date
    dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d')
    # give path
    path = source+dataname
    # read and save the data 
    data = pd.read_csv(path,parse_dates=[date], date_parser=dateparse) 
    #data = data.set_index(date)
    # sort data by datetime index
    data = data.sort_index()
    print "Basic information of "+dataname+":"
    print data.head() # to see the data
    print "*" *100
    print data.describe() # summarize the data
    print "*" *100
    return data
    
In [3]:
    
# define a function to give daily returns
'''
This function takes a DataFrame of adjusted close prices
and returns a DataFrame of daily returns.
For DKS we also account for corporate actions.
1. adj_close = adjusted close price
2. ca = corporate action
'''
def get_returns(adj_close, ca=None):
    if ca is None:
        # simple return: (P_t - P_{t-1}) / P_{t-1}
        return adj_close.diff()/adj_close.shift(1)
    else: 
        return (adj_close.diff()+ca)/adj_close.shift(1)
def merge_tweet(data1,data2,label):
    data2 = pd.DataFrame(data1).merge(data2, how='left', right_index=True, left_index=True)
    data2.fillna(0,inplace=True)# fill na=0
    data2 = data2[label]
    return pd.DataFrame(data2)
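
# A quick illustration of get_returns on toy data (the one-day payout below
# is a made-up example, not from the real corporate-action file): with the
# adjustment, day 3's return becomes (101 - 102 + 1)/102 = 0 instead of
# (101 - 102)/102.
prices = pd.Series([100.0, 102.0, 101.0, 103.0])
payout = pd.Series([0.0, 0.0, 1.0, 0.0])
print get_returns(prices)             # plain daily returns
print get_returns(prices, ca=payout)  # day-3 return adjusted by the payout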
    
In [4]:
    
# define a function to plot the returns
'''
This function makes a general plot of returns:
1. returns1, returns2 = each company's returns
2. width, height = the size of the figure
'''
def plot_return(returns1,name1,returns2,name2,width,height,color1,color2):
    fig = plt.figure()
    fig.set_size_inches(width,height)
    # put together
    ax1 = fig.add_subplot(211)
    ax1.set_title('Combined plot', fontsize=20)
    ax1.plot(returns1, color1)
    ax1.plot(returns2, color2)
    # separate
    ax2 = fig.add_subplot(223)
    ax2.set_title(name1, fontsize=20)
    ax2.plot(returns1, color1)
    ax3 = fig.add_subplot(224)
    ax3.set_title(name2, fontsize=20)
    ax3.plot(returns2, color2)
    #fig, ax = plt.subplots()
    #fig.set_size_inches(width,height)
    #plt.plot(cab_returns,color=color)
def plot_tweets(data1,data2,return1,return2,label):
    a = scale_01(data1)
    b = scale_01(data2)
    fig = plt.figure()
    fig.set_size_inches(18,15)
    # CAB panel
    ax1 = fig.add_subplot(221)
    title = 'Returns vs. ' +label+' (CAB)'
    ax1.set_title(title, fontsize=20)
    ax1.plot(return1, "black")
    ax1.plot(a,"blue")
    # DKS panel
    ax2 = fig.add_subplot(222)
    title = 'Returns vs. ' +label+' (DKS)'
    ax2.set_title(title, fontsize=20)
    ax2.plot(return2, "black")
    ax2.plot(b,"green")
    
In [5]:
    
def get_ICA(data,n_comps=3,plot=False):
    X = np.asarray(data)
    ica = FastICA(n_components=n_comps)
    S_ = ica.fit_transform(X)  # reconstructed source signals
    A_ = ica.mixing_  # estimated mixing matrix
    # one column per component (works for any n_comps, not just 3)
    ICs = pd.DataFrame(dict(("c%d" % (i+1), S_[:, i]) for i in range(n_comps)))
    # Make plots to see the signal
    if plot==True:
        fig=plt.figure()
        fig.set_size_inches(18,16)
        models = [S_]
        names = ['ICA recovered signals']
        colors = sns.color_palette(n_colors=n_comps)  # one color per component
        for ii, (model, name) in enumerate(zip(models, names), 1):
            plt.subplot(2, 1, ii)
            plt.title(name)
            for sig, color in zip(model.T, colors):
                plt.plot(sig, color=color)
        plt.subplots_adjust(0.09, 0.04, 0.94, 0.94, 0.26, 0.46)
        plt.show()
    return ICs
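
# Sanity check on synthetic data (illustrative only, not part of the
# pipeline): mix three independent heavy-tailed sources with a random
# matrix and confirm get_ICA recovers three components of the right shape.
rng = np.random.RandomState(0)
S = rng.standard_t(3, size=(500, 3))   # three independent sources
A = rng.rand(3, 3)                     # random mixing matrix
demo_ics = get_ICA(np.dot(S, A.T), n_comps=3)
print demo_ics.shape                   # (500, 3)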
    
In [6]:
    
def crosscorr(data1, data2, start, end, name):
    dict_for_df = {}
    for i in range(start,end):
        lag = i
        # lag i correlates data1 at time t with data2 at time t-i,
        # i.e. a peak at lag i means data2 leads data1 by i days
        corr = data1.corr(data2.shift(lag))
        dict_for_df[i] = [corr]     # store in a dict
    df = pd.DataFrame(dict_for_df).T # after the loop convert the dict to a dataframe
    df.columns = name
    # make a plot 
    plt.plot(df)
    plt.show()
    return df
def leadlag_corr_ic(data1,data2,start,end,threshold):
    df = data1.reset_index(drop=True)
    # component1
    cr1 = crosscorr(df,data2["c1"],start,end,["corr1"])
    print "For component1 use lag = "
    print cr1[cr1["corr1"]>threshold]
    # component2
    cr2 = crosscorr(df,data2["c2"],start,end,["corr2"])
    print "For component2 use lag = "
    print cr2[cr2["corr2"]>threshold]
    # component3
    cr3 = crosscorr(df,data2["c3"],start,end,["corr3"])
    print "For component3 use lag = "
    print cr3[cr3["corr3"]>threshold]
    c1 = cr1[cr1["corr1"]>threshold].index.values
    c2 = cr2[cr2["corr2"]>threshold].index.values
    c3 = cr3[cr3["corr3"]>threshold].index.values
    return c1,c2,c3
def leadlag_corr(data1,data2_frame,start,end,threshold):
    # generalized over any number of candidate predictors, which also covers
    # the 8- and 9-series calls that were previously commented out; the
    # start/end arguments are now honored instead of a hard-coded 0..30
    df = data1.reset_index(drop=True)
    data2 = pd.concat(data2_frame, axis=1).reset_index(drop=True)
    lags = []
    for i in range(data2.shape[1]):
        label = "corr%d" % (i+1)
        cr = crosscorr(df, data2.iloc[:, i], start, end, [label])
        print "For dataset%d use lag = " % (i+1)
        print cr[cr[label].abs() > threshold]
        lags.append(cr[cr[label].abs() > threshold].index.values)
    return tuple(lags)
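
# Hedged usage sketch for crosscorr on toy data: if y runs five days ahead
# of x, shifting y back by five days realigns it with x, so the
# cross-correlation should peak at lag 5.
x = pd.Series(np.random.randn(300))
y = x.shift(-5).fillna(0)              # y leads x by five days
cc = crosscorr(x, y, 0, 10, ["corr"])
print cc["corr"].idxmax()              # expect 5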
    
In [7]:
    
def split_trn_tst(data,prop):
    train_size = int(len(data)*prop)
    test_size = len(data)-train_size
    train = data.iloc[0:train_size]
    test = data.iloc[train_size:len(data)]
    print(len(train), len(test))
    return train,test
# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back), 0]
        dataX.append(a)
        dataY.append(dataset[i + look_back, 0])
    return np.array(dataX), np.array(dataY)
def leadlag_feature(dataset,look_back):
    dataset = dataset.values.reshape(len(dataset),1)
    trainX,trainY=create_dataset(dataset,look_back)
    trainX= np.reshape(trainX, (trainX.shape[0], trainX.shape[1]))
    trainX= pd.DataFrame(trainX)
    frame=[trainX.max(axis=1), trainX.min(axis=1), trainX.mean(axis=1), trainX.median(axis=1)]
    others_feature = reduce(lambda left,right: pd.concat([left.reset_index(drop=True), 
                                                      right.reset_index(drop=True)],axis=1),frame)
    return others_feature
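
# Shape check for the window method: with look_back=3 a length-10 series
# yields 10 - 3 - 1 = 6 supervised pairs, each X row holding the previous
# three values.
demo = np.arange(10, dtype=float).reshape(10, 1)
dX, dY = create_dataset(demo, look_back=3)
print dX.shape, dY.shape               # (6, 3) (6,)
print dX[0], dY[0]                     # [ 0.  1.  2.] 3.0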
    
In [8]:
    
def scale_01(data):
    return (data-data.min())/(data.max()-data.min())
def nminmax_tsttrn(seed,data,split):    
    np.random.seed(seed)
    # Change into arrays
    dataset = data.values
    # normalize the dataset
    scaler = MinMaxScaler(feature_range=(0, 1)) # scale to [0, 1]
    dataset = scaler.fit_transform(dataset).reshape(len(data),1)
    # split into train and test sets
    train_size = int(len(dataset) * split)
    test_size = len(dataset) - train_size
    train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
    print(len(train), len(test))
    return train, test 
def get_wmdata(seed,data,split,look_back):
    train, test = nminmax_tsttrn(seed,data,split)
    trainX, trainY = create_dataset(train, look_back)
    testX, testY = create_dataset(test, look_back)
    return trainX,trainY,testX,testY
def wmd_ica(seed, data, split, look_back, n_component):
    trainX,trainY,testX,testY = get_wmdata(seed,data,split,look_back=look_back)
    trainX_ica=get_ICA(trainX,n_component)
    testX_ica=get_ICA(testX,n_component)
    return np.array(trainX_ica),trainY,np.array(testX_ica),testY 
def Rmse(model, X, Y):
    return math.sqrt(mean_squared_error(Y, model.predict(X)))
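
# Tiny check that scale_01 maps a series onto [0, 1] (illustrative only).
print scale_01(pd.Series([3.0, 7.0, 5.0])).tolist()   # [0.0, 1.0, 0.5]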
    
In [9]:
    
def plot_lm(datay,trainPredict,testPredict):
    # shift train predictions for plotting
    trainPredictPlot = np.empty_like(datay)
    trainPredictPlot[:] = np.nan
    trainPredictPlot[0:len(trainPredict)] = trainPredict
    # shift test predictions for plotting
    testPredictPlot = np.empty_like(datay)
    testPredictPlot[:] = np.nan
    testPredictPlot[len(trainPredict):] = testPredict
    # plot baseline and predictions
    fig = plt.figure()
    fig.set_size_inches(18,16)
    plt.plot(datay.values)
    plt.plot(trainPredictPlot)
    plt.plot(testPredictPlot)
    plt.show()
    
In [10]:
    
def test_stationarity(timeseries):
    
    #Determing rolling statistics
    rolmean = timeseries.rolling(window=10).mean()
    rolstd = timeseries.rolling(window=10).std()
    #Plot rolling statistics:
    orig = plt.plot(timeseries, color='blue',label='Original')
    mean = plt.plot(rolmean, color='red', label='Rolling Mean')
    std = plt.plot(rolstd, color='black', label = 'Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)
    
    #Perform Dickey-Fuller test:
    print 'Results of Dickey-Fuller Test:'
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value
    print dfoutput
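
# For reference, the decision rule behind the printout above, wrapped in a
# small hypothetical helper (not used elsewhere in this notebook): treat
# the series as stationary when the ADF p-value falls below alpha.
def is_stationary(timeseries, alpha=0.05):
    # adfuller returns (statistic, p-value, ...)
    return adfuller(timeseries, autolag='AIC')[1] < alpha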
    
In [11]:
    
def cv_lstm(model,data,k_fold,**extraKwArgs):
    train_score = [0]*(k_fold-1)
    test_score = [0]*(k_fold-1)
    # always make sure input is [y, x]
    if isinstance(data, (list, tuple)):
        data = pd.concat(map(pd.DataFrame, data), axis=1)
    L = int(len(data)/k_fold)
    data = np.array(data)
    for i in range(1,k_fold):
        ind = i*(L+1)
        ind2 = (i+1)*(L+1)
        trainX = data[0:ind,1:]
        trainY = data[0:ind,0]
        validX = data[ind:ind2,1:]
        validY = data[ind:ind2,0]
        train_score[i-1],test_score[i-1] = model(trainX, trainY,validX,validY,**extraKwArgs)
    print('Train Score: %.5f RMSE' % ((np.array(train_score)).mean()))
    print('Test Score: %.5f RMSE' % ((np.array(test_score)).mean()))
def cv_ridge(data,k_fold,**extraKwArgs):
    train_score = [0]*(k_fold-1)
    test_score = [0]*(k_fold-1)
    # always make sure input is [y,x]
    if isinstance(data, (list, tuple)):
        data = pd.concat(map(pd.DataFrame, data), axis=1)
    L = int(len(data)/k_fold)
    data = np.array(data)
    for i in range(1,k_fold):       
        ind = i*(L+1)
        ind2 = (i+1)*(L+1)
        trainX = data[0:ind,1:]
        trainY = data[0:ind,0]
        validX= data[ind:ind2,1:]
        validY = data[ind:ind2,0]
        model = lm.Ridge(**extraKwArgs)
        model.fit(trainX,trainY)
        train_score[i-1] = Rmse(model,trainX, trainY )
        test_score[i-1] = Rmse(model,validX,validY)
        rmse_trn = (np.array(train_score)).mean()
        rmse_tst = (np.array(test_score)).mean()
    print('Train Score: %.5f RMSE' % ((np.array(train_score)).mean()))
    print('Test Score: %.5f RMSE' % ((np.array(test_score)).mean()))
    return rmse_trn,rmse_tst
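
# Hedged usage sketch for cv_ridge on a toy frame; by the convention above,
# column 0 is the target and the remaining columns are the features.
toy = pd.DataFrame(np.random.randn(100, 4))
r_trn, r_tst = cv_ridge(toy, k_fold=5, alpha=1.0)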
    
In [12]:
    
### LSTM model
# ------------ Model 1 ------------- #
def LSM(dataset,seed=10,look_back=10,split=0.7,n_loop=100,layer2=4,show_log = 1,batch_size = 1,cv=False):
    np.random.seed(seed)
    # load the dataset
    dataset = dataset.values
    # normalize the dataset
    scaler = MinMaxScaler(feature_range=(0, 1))
    dataset = scaler.fit_transform(dataset).reshape(len(dataset),1)
    # split into train and test sets
    train_size = int(len(dataset) * split)
    test_size = len(dataset) - train_size
    train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]
    # reshape into X=t and Y=t+1
    trainX, trainY = create_dataset(train, look_back)
    testX, testY = create_dataset(test, look_back)
    # reshape input to be [samples, time steps, features]
    trainX = np.reshape(trainX, (trainX.shape[0], trainX.shape[1], 1))
    testX = np.reshape(testX, (testX.shape[0], testX.shape[1], 1))
    # create and fit the LSTM network (batch_size comes from the argument)
    model = Sequential()
    model.add(LSTM(layer2, batch_input_shape=(batch_size, look_back, 1), stateful=True))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    for i in range(n_loop):
        model.fit(trainX, trainY, nb_epoch=1, batch_size=batch_size, verbose=show_log, shuffle=False)
        model.reset_states()
    # make predictions
    trainPredict = model.predict(trainX, batch_size=batch_size)
    model.reset_states()
    testPredict = model.predict(testX, batch_size=batch_size)
    # invert predictions
    trainPredict = scaler.inverse_transform(trainPredict)
    trainY = scaler.inverse_transform([trainY])
    testPredict = scaler.inverse_transform(testPredict)
    testY = scaler.inverse_transform([testY])
    # calculate root mean squared error
    trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
    testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
    if cv == False:
        print('Train Score: %.5f RMSE' % (trainScore))
        print('Test Score: %.5f RMSE' % (testScore))
        # shift train predictions for plotting
        trainPredictPlot = np.empty_like(dataset)
        trainPredictPlot[:, :] = np.nan
        trainPredictPlot[look_back:len(trainPredict)+look_back, :] = trainPredict
        # shift test predictions for plotting
        testPredictPlot = np.empty_like(dataset)
        testPredictPlot[:, :] = np.nan
        testPredictPlot[len(trainPredict)+(look_back*2)+1:(len(dataset)-1), :] = testPredict
    # plot baseline and predictions
        fig = plt.figure()
        fig.set_size_inches(18,16)
        plt.plot(scaler.inverse_transform(dataset))
        plt.plot(trainPredictPlot)
        plt.plot(testPredictPlot)
        plt.show()
        return trainPredict,testPredict
    else:
        return trainScore,testScore
    
# ------------ Model 2 ------------- #
# named LSTM2 so it does not shadow (and recursively call) the keras LSTM layer
def LSTM2(dataset,seed=10,look_back=10,split=0.7,n_loop=100,layer2=4,batch =1,show_log = 1,cv=False):
    np.random.seed(seed)
    # load the dataset
    dataset = dataset.values
    # normalize the dataset
    scaler = MinMaxScaler(feature_range=(0, 1))
    dataset = scaler.fit_transform(dataset).reshape(len(dataset),1)
    # split into train and test sets
    train_size = int(len(dataset) * split)
    test_size = len(dataset) - train_size
    train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]
    # reshape into X=t and Y=t+1
    trainX, trainY = create_dataset(train, look_back)
    testX, testY = create_dataset(test, look_back)
    # reshape input to be [samples, time steps, features]
    trainX = np.reshape(trainX, (trainX.shape[0], trainX.shape[1], 1))
    testX = np.reshape(testX, (testX.shape[0], testX.shape[1], 1))
    # create and fit the LSTM network
    batch_size = batch
    model = Sequential()
    model.add(LSTM(layer2, batch_input_shape=(batch_size, look_back, 1), stateful=True, return_sequences=True))
    model.add(LSTM(layer2, batch_input_shape=(batch_size, look_back, 1), stateful=True))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    for i in range(n_loop):
        model.fit(trainX, trainY, nb_epoch=1, batch_size=batch_size, verbose=show_log, shuffle=False)
        model.reset_states()
    # make predictions
    trainPredict = model.predict(trainX, batch_size=batch_size)
    model.reset_states()
    testPredict = model.predict(testX, batch_size=batch_size)
    # invert predictions
    trainPredict = scaler.inverse_transform(trainPredict)
    trainY = scaler.inverse_transform([trainY])
    testPredict = scaler.inverse_transform(testPredict)
    testY = scaler.inverse_transform([testY])
    # calculate root mean squared error
    trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
    testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
    if cv == False:
        print('Train Score: %.5f RMSE' % (trainScore))
        print('Test Score: %.5f RMSE' % (testScore))
        # shift train predictions for plotting
        trainPredictPlot = np.empty_like(dataset)
        trainPredictPlot[:, :] = np.nan
        trainPredictPlot[look_back:len(trainPredict)+look_back, :] = trainPredict
        # shift test predictions for plotting
        testPredictPlot = np.empty_like(dataset)
        testPredictPlot[:, :] = np.nan
        testPredictPlot[len(trainPredict)+(look_back*2)+1:(len(dataset)-1), :] = testPredict
    # plot baseline and predictions
        fig = plt.figure()
        fig.set_size_inches(18,16)
        plt.plot(scaler.inverse_transform(dataset))
        plt.plot(trainPredictPlot)
        plt.plot(testPredictPlot)
        plt.show()
        return trainPredict,testPredict
    else:
        return trainScore,testScore
    
# ------------ Model 3 ------------- #
def LSTM_ica(dataset,seed=10,look_back=10,n_component=3,split=0.7,layer2=4,n_loop=100,show_log = 1,batch_size = 1,cv=False):  
    np.random.seed(seed)
    # Change into arrays
    dataset = dataset.values
    # normalize the dataset
    scaler = MinMaxScaler(feature_range=(0, 1)) # or 0,1
    dataset = scaler.fit_transform(dataset).reshape(len(dataset),1)
    # split into train and test sets
    train_size = int(len(dataset) * split)
    test_size = len(dataset) - train_size
    train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
    print(len(train), len(test))
    trainX, trainY = create_dataset(train, look_back)
    testX, testY = create_dataset(test, look_back)
    # get ICA for all the lookback
    trainX = np.array(get_ICA(trainX,n_component))
    testX = np.array(get_ICA(testX,n_component))
    # reshape input to be [samples, time steps, features]
    trainX = np.reshape(trainX, (trainX.shape[0], trainX.shape[1], 1))
    testX = np.reshape(testX, (testX.shape[0], testX.shape[1], 1))
    # create and fit the LSTM network
    model = Sequential()
    model.add(LSTM(layer2, batch_input_shape=(batch_size, n_component, 1), stateful=True))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    for i in range(n_loop):
        model.fit(trainX, trainY, nb_epoch=1, batch_size=batch_size, verbose=show_log, shuffle=False)
        model.reset_states()
    # make predictions
    trainPredict = model.predict(trainX, batch_size=batch_size)
    model.reset_states()
    testPredict = model.predict(testX, batch_size=batch_size)
    # invert predictions
    trainPredict = scaler.inverse_transform(trainPredict)
    trainY = scaler.inverse_transform([trainY])
    testPredict = scaler.inverse_transform(testPredict)
    testY = scaler.inverse_transform([testY])
    # calculate root mean squared error
    trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
    testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
    if cv == False:
        print('Train Score: %.5f RMSE' % (trainScore))
        print('Test Score: %.5f RMSE' % (testScore))
        predict = pd.DataFrame(np.concatenate([trainPredict,testPredict]))
    # plot baseline and predictions
        fig = plt.figure()
        fig.set_size_inches(18,16)
        plt.plot(scaler.inverse_transform(dataset))
        plt.plot(predict)
        plt.show()
        return trainPredict,testPredict
    else:
        return trainScore,testScore
# ------------ Model 4 ------------- #
def LSTM_others(dataset,seed=10,n_features=10,split=0.7,layer2=4,n_loop=100,show_log = 1,batch_size = 1,cv=False):  
    np.random.seed(seed)
    # Change into arrays
    dataset = dataset.values
    shape = dataset.shape
    print shape
    # normalize the dataset; keep a separate scaler for the target column
    # so predictions can be inverted back to the original scale
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler_y = MinMaxScaler(feature_range=(0, 1))
    scaler_y.fit(dataset[:, 0].reshape(-1, 1))
    dataset = scaler.fit_transform(dataset).reshape(shape)
    # split into train and test sets
    train_size = int(len(dataset) * split)
    test_size = len(dataset) - train_size
    train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
    print(len(train), len(test))
    trainX = train[:,1:]
    trainY = train[:,0]
    testX = test[:,1:]
    testY = test[:,0]
    print trainX.shape
    # reshape input to be [samples, time steps, features]
    trainX = np.reshape(trainX, (trainX.shape[0], trainX.shape[1], 1))
    testX = np.reshape(testX, (testX.shape[0], testX.shape[1], 1))
    # create and fit the LSTM network
    model = Sequential()
    model.add(LSTM(layer2, batch_input_shape=(batch_size,n_features , 1), stateful=True))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    for i in range(n_loop):
        model.fit(trainX, trainY, nb_epoch=1, batch_size=batch_size, verbose=show_log, shuffle=False)
        model.reset_states()
    # make predictions
    trainPredict = model.predict(trainX, batch_size=batch_size)
    model.reset_states()
    testPredict = model.predict(testX, batch_size=batch_size)
    # invert predictions (target column only)
    trainPredict = scaler_y.inverse_transform(trainPredict)
    trainY = scaler_y.inverse_transform([trainY])
    testPredict = scaler_y.inverse_transform(testPredict)
    testY = scaler_y.inverse_transform([testY])
    # calculate root mean squared error
    trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
    testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
    if cv == False:
        print('Train Score: %.5f RMSE' % (trainScore))
        print('Test Score: %.5f RMSE' % (testScore))
        # shift train predictions for plotting
        trainPredictPlot = np.empty_like(dataset)
        trainPredictPlot[:, :] = np.nan
        trainPredictPlot[:len(trainPredict), :] = trainPredict
        # shift test predictions for plotting
        testPredictPlot = np.empty_like(dataset)
        testPredictPlot[:, :] = np.nan
        testPredictPlot[len(trainPredict):, :] = testPredict
    # plot baseline and predictions
        fig = plt.figure()
        fig.set_size_inches(18,16)
        plt.plot(scaler.inverse_transform(dataset))
        plt.plot(trainPredictPlot)
        plt.plot(testPredictPlot)
        plt.show()
        return trainPredict,testPredict
    else:
        return trainScore,testScore
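
# The training loop shared by the four models above, shown in isolation
# (a minimal sketch against the keras 1.x API this notebook uses): a
# stateful LSTM keeps its cell state across batches, so the state must be
# reset by hand at each epoch boundary instead of relying on fit's own
# epoch loop.
def fit_stateful(model, X, Y, n_loop, batch_size=1):
    for _ in range(n_loop):
        model.fit(X, Y, nb_epoch=1, batch_size=batch_size,
                  verbose=0, shuffle=False)   # one pass, in time order
        model.reset_states()                  # clear state between epochs
    return model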
    
In [13]:
    
def blending(data,split,**extraKwArgs):
    '''
    need to define learning_rate,
    max_depth,random_state
    '''
    train, test = split_trn_tst(data,split)
    train, test = map(np.array,[train,test])
    X_train , X_test = train[:,1:], test[:,1:]
    y_train, y_test = train[:,0], test[:,0]
    est = GradientBoostingRegressor(n_estimators=20,loss='ls',**extraKwArgs).fit(X_train, y_train)
    print math.sqrt(mean_squared_error(y_test, est.predict(X_test)))
    return est.predict(X_train),est.predict(X_test)
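
# Hedged usage sketch for blending on a toy frame standing in for
# cab_others (first column is the target, per split_trn_tst's convention).
toy = pd.DataFrame(np.random.randn(200, 5))
trn_pred, tst_pred = blending(toy, split=0.7, learning_rate=0.1,
                              max_depth=2, random_state=0)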
    
In [14]:
    
# read adjusted close price 
cab_adj = read_adj("Data/StockData/","CAB_2011_2016.csv","Date")
dks_adj = read_adj('Data/StockData/','DKS_2011_2016.csv',"Date")
sp500_adj = read_adj('Data/StockData/','SP500_2011_2016.csv',"Date")
#ust_adj = read_adj("Data/StockData/","UST_2011_2016.csv","Date")
hibb_adj = read_adj('Data/StockData/','HIBB_2011_2016.csv',"Date")
cab_peps = read_adj("Data/StockData/","CAB_2011_2016_PEPS.csv","Date")
dks_peps = read_adj('Data/StockData/','DKS_2011_2016_PEPS.csv',"Date")
# read corporate action
dks_ca = read_adj('Data/StockData/','dks_ca.csv',"Before_Ex_Date")
# merge data by date 
dks_adj = dks_adj.merge(dks_ca, how='outer', right_index=True, left_index=True)
dks_adj["Gross"].fillna(0,inplace=True)# fill na=0
    
    
In [15]:
    
# calculate returns 
cab_returns = get_returns(cab_adj["Adj Close"])
sp500_returns = get_returns(sp500_adj["Adj Close"])
hibb_returns = get_returns(hibb_adj["Adj Close"])
#ust_returns = get_returns(ust_adj["Adj Close"])
dks_returns = get_returns(dks_adj["Adj Close"],ca=dks_adj["Gross"])
# print dks_returns.loc['2016-09-06'] to check the return adjusted by the corporate action
## Subsetting and arrange the data
cab = cab_returns["2011-10-11":]
dks = dks_returns["2011-10-11":]
hibb = hibb_returns["2011-10-11":]
sp500 = sp500_returns["2011-10-11":]
cab_pe = cab_peps["Price to Earnings Ratio"]["2011-10-11":"2016-10-07"]
cab_ps = cab_peps["Price to Sales Ratio"]["2011-10-11":"2016-10-07"]
dks_pe = dks_peps["Price to Earnings Ratio"]["2011-10-11":"2016-10-07"]
dks_ps = dks_peps["Price to Sales Ratio"]["2011-10-11":"2016-10-07"]
# save data
cab.to_csv("output/cab.csv", sep=',')
dks.to_csv("output/dks.csv", sep=',')
    
In [16]:
    
# Data preparation
# TwoYear = ["2011-10-11":"2013-10-10"]
# ThirdYear = ["2013-10-11":"2014-10-10"]
# ThreeYear = ["2011-10-11":"2014-10-10"]
# FourthYear = ["2014-10-11":"2015-10-10"]
# FifthYear = ["2015-10-11":]
## Input: 
## 1. two years of stocks
cab_2 = cab_returns["2011-10-11":"2013-10-10"]
dks_2 = dks_returns["2011-10-11":"2013-10-10"]
hibb_2 = hibb_returns["2011-10-11":"2013-10-10"]
sp500_2 = sp500_returns["2011-10-11":"2013-10-10"]
cab_pe_2 = cab_peps["Price to Earnings Ratio"]["2011-10-11":"2013-10-10"]
cab_ps_2 = cab_peps["Price to Sales Ratio"]["2011-10-11":"2013-10-10"]
dks_pe_2 = dks_peps["Price to Earnings Ratio"]["2011-10-11":"2013-10-10"]
dks_ps_2 = dks_peps["Price to Sales Ratio"]["2011-10-11":"2013-10-10"]
## 2. three years of stocks
cab_3 = cab_returns["2011-10-11":"2014-10-10"]
dks_3 = dks_returns["2011-10-11":"2014-10-10"]
hibb_3 = hibb_returns["2011-10-11":"2014-10-10"]
sp500_3 = sp500_returns["2011-10-11":"2014-10-10"]
cab_pe_3 = cab_peps["Price to Earnings Ratio"]["2011-10-11":"2014-10-10"]
cab_ps_3 = cab_peps["Price to Sales Ratio"]["2011-10-11":"2014-10-10"]
dks_pe_3 = dks_peps["Price to Earnings Ratio"]["2011-10-11":"2014-10-10"]
dks_ps_3 = dks_peps["Price to Sales Ratio"]["2011-10-11":"2014-10-10"]
## 3. four years of stocks
cab_4 = cab_returns["2011-10-11":"2015-10-10"]
dks_4 = dks_returns["2011-10-11":"2015-10-10"]
hibb_4 = hibb_returns["2011-10-11":"2015-10-10"]
sp500_4 = sp500_returns["2011-10-11":"2015-10-10"]
cab_pe_4 = cab_peps["Price to Earnings Ratio"]["2011-10-11":"2015-10-10"]
cab_ps_4 = cab_peps["Price to Sales Ratio"]["2011-10-11":"2015-10-10"]
dks_pe_4 = dks_peps["Price to Earnings Ratio"]["2011-10-11":"2015-10-10"]
dks_ps_4 = dks_peps["Price to Sales Ratio"]["2011-10-11":"2015-10-10"]
## 4. the 3rd year of stocks
cab_3rd = cab_returns["2013-10-11":"2014-10-10"]
dks_3rd = dks_returns["2013-10-11":"2014-10-10"]
hibb_3rd = hibb_returns["2013-10-11":"2014-10-10"]
sp500_3rd = sp500_returns["2013-10-11":"2014-10-10"]
cab_pe_3rd = cab_peps["Price to Earnings Ratio"]["2013-10-11":"2014-10-10"]
cab_ps_3rd = cab_peps["Price to Sales Ratio"]["2013-10-11":"2014-10-10"]
dks_pe_3rd = dks_peps["Price to Earnings Ratio"]["2013-10-11":"2014-10-10"]
dks_ps_3rd = dks_peps["Price to Sales Ratio"]["2013-10-11":"2014-10-10"]
## 5. the 4th year of stocks
cab_4th = cab_returns["2014-10-11":"2015-10-10"]
dks_4th = dks_returns["2014-10-11":"2015-10-10"]
hibb_4th = hibb_returns["2014-10-11":"2015-10-10"]
sp500_4th = sp500_returns["2014-10-11":"2015-10-10"]
cab_pe_4th = cab_peps["Price to Earnings Ratio"]["2014-10-11":"2015-10-10"]
cab_ps_4th = cab_peps["Price to Sales Ratio"]["2014-10-11":"2015-10-10"]
dks_pe_4th = dks_peps["Price to Earnings Ratio"]["2014-10-11":"2015-10-10"]
dks_ps_4th = dks_peps["Price to Sales Ratio"]["2014-10-11":"2015-10-10"]
## 6. the 5th year of stocks
cab_5th = cab_returns["2015-10-11":]
dks_5th = dks_returns["2015-10-11":]
hibb_5th = hibb_returns["2015-10-11":]
sp500_5th = sp500_returns["2015-10-11":]
cab_pe_5th = cab_peps["Price to Earnings Ratio"]["2015-10-11":"2016-10-07"]
cab_ps_5th = cab_peps["Price to Sales Ratio"]["2015-10-11":"2016-10-07"]
dks_pe_5th = dks_peps["Price to Earnings Ratio"]["2015-10-11":"2016-10-07"]
dks_ps_5th = dks_peps["Price to Sales Ratio"]["2015-10-11":"2016-10-07"]
frame_2=[cab_2,dks_2,hibb_2,sp500_2]
frame_3=[cab_3,dks_3,hibb_3,sp500_3]
frame_4=[cab_4,dks_4,hibb_4,sp500_4]
    
In [17]:
    
# tweets data
cab_tweets = read_tweet("Data/SocialMediaData/","CAB_oldtweets.csv","date")
dks_tweets = read_tweet("Data/SocialMediaData/","DKS_oldtweets.csv","date")
cab_tnum = cab_tweets.groupby(['date'])[['username']].count()
cab_tret = cab_tweets.groupby(['date'])[['retweets']].sum()
cab_tfav = cab_tweets.groupby(['date'])[['favorites']].sum()
cab_ttim = cab_tweets.groupby(['time'])[['username']].count()
dks_tnum = dks_tweets.groupby(['date'])[['username']].count()
dks_tret = dks_tweets.groupby(['date'])[['retweets']].sum()
dks_tfav = dks_tweets.groupby(['date'])[['favorites']].sum()
dks_ttim = dks_tweets.groupby(['time'])[['username']].count()
## merge
cab_tnum = merge_tweet(cab,cab_tnum,'username')
cab_tret = merge_tweet(cab,cab_tret,'retweets')
cab_tfav = merge_tweet(cab,cab_tfav,'favorites')
dks_tnum = merge_tweet(dks,dks_tnum,'username')
dks_tret = merge_tweet(dks,dks_tret,'retweets')
dks_tfav = merge_tweet(dks,dks_tfav,'favorites')
cab_tnum_4 = cab_tnum["2011-10-11":"2015-10-10"]
cab_tret_4 = cab_tret["2011-10-11":"2015-10-10"]
cab_tfav_4 = cab_tfav["2011-10-11":"2015-10-10"]
dks_tnum_4 = dks_tnum["2011-10-11":"2015-10-10"]
dks_tret_4= dks_tret["2011-10-11":"2015-10-10"]
dks_tfav_4= dks_tfav["2011-10-11":"2015-10-10"]
# tweets and mood 
cab_mood = read_tweet("Data/SocialMediaData/","cab_sentiment.csv","date")
dks_mood = read_tweet("Data/SocialMediaData/","dks_sentiment.csv","date")
cab_mood = cab_mood.groupby(['date'])[['mood']].mean()
dks_mood = dks_mood.groupby(['date'])[['mood']].mean()
## merge
cab_mood = merge_tweet(cab,cab_mood,'mood')
dks_mood = merge_tweet(dks,dks_mood,'mood')
cab_mood_4 = cab_mood["2011-10-11":"2015-10-10"]
dks_mood_4 = dks_mood["2011-10-11":"2015-10-10"]
    
    
In [18]:
    
# Make plots to see returns
plot_return(cab_returns,"Cabela's Return",dks_returns,"Dick's Return",18.5,10.5,"blue","green")
    
    
In [19]:
    
plot_return(cab_returns,"Cabela's Return",sp500_returns,"SP500's Return",18.5,10.5,"blue","red")
    
    
In [20]:
    
plot_return(dks_returns,"Dick's Return",sp500_returns,"SP500's Return",18.5,10.5,"green","red")
    
    
In [21]:
    
plot_tweets(cab_tnum,dks_tnum,cab,dks,"Num. of tweets")
    
    
In [22]:
    
plot_tweets(cab_tret,dks_tret,cab,dks,"Num. of retweets")
    
    
In [23]:
    
plot_tweets(cab_tfav,dks_tfav,cab,dks,"Num. of favorites")
    
    
In [24]:
    
# Leadlag correlation data
print "*"*100
print "For CAB:"
## Calculate the leadlag correlation with 4 years data
print "Calculate the leadlag correlations"
leadlag_corr(cab_4,[dks_4,hibb_4,sp500_4,cab_pe_4,cab_ps_4],0,30,0.06)
print "*"*100
## Put data together
cab_index = cab.index.values[24:]
#cab = cab.reset_index(drop=True)
frame=[cab,dks.shift(24),# dks.shift(8), dks.shift(10),dks.shift(24),
       hibb.shift(22), # hibb.shift(24),
       cab_pe.shift(3),cab_pe.shift(1),cab_pe.shift(2),cab_pe.shift(4),cab_pe.shift(5),cab_pe.shift(6),
       cab_ps.shift(1),cab_ps.shift(2),cab_ps.shift(3),cab_ps.shift(4),cab_ps.shift(5)]
cab_others = reduce(lambda left,right: pd.concat([left.reset_index(drop=True), right.reset_index(drop=True)],axis=1),frame)
cab_others = cab_others[24:]
cab_others = cab_others.set_index(cab_index)
cab_others.columns = ['returns', 'dks_24',"hibb_22","pe_3","pe_1","pe_2","pe_4","pe_5","pe_6",
                      "ps_1","ps_2","ps_3","ps_4","ps_5"]
print cab_others.shape
cab_others.to_csv("output/cabothers.csv", sep=';', encoding='utf-8')
    
    
    
    
    
    
    
    
    
    
    
    
In [25]:
    
print "*"*100
print "For DKS:"
## Calculate the leadlag correlation with 4 years data
print "Calculate the leadlag correlations"
leadlag_corr(dks_4,[cab_4,hibb_4,sp500_4,dks_pe_4,dks_ps_4],0,30,0.06)
print "*"*100
## Put data together
dks_index = dks.index.values[29:]
frame=[dks,cab.shift(2), cab.shift(29),\
       hibb.shift(2), hibb.shift(29),\
       sp500.shift(2), sp500.shift(29),dks_ps.shift(3)]
dks_others = reduce(lambda left,right: pd.concat([left, right],axis=1),frame)
dks_others = dks_others[29:]
#dks_others = dks_others.set_index(dks_index)
dks_others.columns = ['returns', 'cab_2','cab_29','hibb_2',"hibb_29",\
                      "sp500_2","sp500_29",'dks_ps_3']
dks_others.to_csv("output/dksothers.csv", sep=';', encoding='utf-8')
    
    
    
    
    
    
    
    
    
    
    
    
In [26]:
    
# ## adding twitter data
# leadlag_corr(cab_4,[dks_4,hibb_4,sp500_4,cab_pe_4,cab_ps_4,cab_tnum_4,cab_tret_4,cab_tfav_4,cab_mood_4],0,30,0.07)
cab_index = cab.index.values[27:]
frame=[cab,dks.shift(24),# dks.shift(8), dks.shift(10),dks.shift(24),
       hibb.shift(22), # hibb.shift(24),
       cab_pe.shift(3),cab_pe.shift(1),cab_pe.shift(2),cab_pe.shift(4),cab_pe.shift(5),cab_pe.shift(6),
       cab_ps.shift(1),cab_ps.shift(2),cab_ps.shift(3),cab_ps.shift(4),cab_ps.shift(5),
       cab_tnum.shift(19),
       cab_tret.shift(2),cab_tret.shift(15),cab_tret.shift(23),
       cab_tfav.shift(2),cab_tfav.shift(13),cab_tfav.shift(15),cab_tfav.shift(23),
       cab_mood.shift(18),cab_mood.shift(19),cab_mood.shift(26),cab_mood.shift(27)]
cab_others1 = reduce(lambda left,right: pd.concat([left.reset_index(drop=True), right.reset_index(drop=True)],axis=1),frame)
cab_others1 = cab_others1[27:]
cab_others1 = cab_others1.set_index(cab_index)
cab_others1.columns = ['returns', 'dks_24',"hibb_22","pe_3","pe_1","pe_2","pe_4","pe_5","pe_6",\
                      "ps_1","ps_2","ps_3","ps_4","ps_5","num_19","ret_2","ret_15","ret_23","fav_2","fav_13","fav_15","fav_23",
                      "mood_18","mood_19","mood_26","mood_27"]
cab_others1.shape
cab_others1.to_csv("output/cabotherstweets.csv", sep=',', encoding='utf-8')
    
In [27]:
    
### Adding tweet data
# print "*"*100
# print "For DKS:"
# ## Calculate the leadlag correlation with 4 years data
# print "Calculate the leadlag correlations"
# leadlag_corr(dks_4,[cab_4,hibb_4,sp500_4,dks_pe_4,dks_ps_4,dks_tnum_4,dks_tret_4,dks_tfav_4],0,30,0.07)
# print "*"*100
# ## Put data together
dks_index = dks.index.values[29:]
frame=[dks,cab.shift(2), cab.shift(29),\
       hibb.shift(2), hibb.shift(29),\
       sp500.shift(2), sp500.shift(29),dks_ps.shift(3),\
       dks_tnum.shift(1),dks_tret.shift(1),dks_tfav.shift(1),dks_mood.shift(25)]
dks_others1 = reduce(lambda left,right: pd.concat([left, right],axis=1),frame)
dks_others1= dks_others1[29:]
#dks_others = dks_others.set_index(dks_index)
dks_others1.columns = ['returns', 'cab_2','cab_29','hibb_2',"hibb_29",\
                      "sp500_2","sp500_29","dks_ps_3","num_1","ret_1","fav_1","mood_25"]
dks_others1.to_csv("output/dksotherstweet.csv", sep=',', encoding='utf-8')
    
In [28]:
    
### For CAB ###
dks_feature = leadlag_feature(dks,look_back=24)
pe_feature = leadlag_feature(cab_pe,look_back=6)[18:]
ps_feature = leadlag_feature(cab_ps,look_back=6)[18:]
tnum_feature = leadlag_feature(cab_tnum,look_back=19)[5:]
tret_feature = leadlag_feature(cab_tret,look_back=23)[1:]
tfav_feature = leadlag_feature(cab_tfav,look_back=23)[1:]
mood_feature = leadlag_feature(cab_mood,look_back=23)[1:]
frame=[cab[25:],dks_feature,pe_feature,ps_feature,tnum_feature,tret_feature,tfav_feature,mood_feature]
cab_feature = reduce(lambda left,right: pd.concat([left.reset_index(drop=True), 
                                                      right.reset_index(drop=True)],axis=1),frame)
cab_feature.columns = ['returns', 'dks_max',"dks_min","dks_mean","dks_med",
                       'pe_max',"pe_min","pe_mean","pe_med",
                      'ps_max',"ps_min","ps_mean","ps_med",
                      'tn_max',"tn_min","tn_mean","tn_med",
                      'tr_max',"tr_min","tr_mean","tr_med",
                      'tf_max',"tf_min","tf_mean","tf_med",
                      'md_max',"md_min","md_mean","md_med"]
cab_feature.to_csv("output/cab_feature.csv", sep=',', encoding='utf-8')
    
In [29]:
    
### For DKS ###
cab_block = leadlag_feature(cab,look_back=24)  # do not overwrite cab_feature from the cell above
hibb_feature = leadlag_feature(hibb,look_back=19)[5:]
ps_feature = leadlag_feature(dks_ps,look_back=5)[21:]
tnum_feature = leadlag_feature(cab_tnum,look_back=16)[8:]
tfav_feature = leadlag_feature(cab_tfav,look_back=22)[2:]
frame=[dks[25:],cab_block,hibb_feature,ps_feature,tnum_feature,tfav_feature]
dks_feature = reduce(lambda left,right: pd.concat([left.reset_index(drop=True), 
                                                      right.reset_index(drop=True)],axis=1),frame)
dks_feature.columns = ['returns', 'cab_max',"cab_min","cab_mean","cab_med",
                       'hibb_max',"hibb_min","hibb_mean","hibb_med",
                      'ps_max',"ps_min","ps_mean","ps_med",
                      'tn_max',"tn_min","tn_mean","tn_med",
                      'tf_max',"tf_min","tf_mean","tf_med"]
dks_feature.to_csv("output/dks_feature.csv", sep=',', encoding='utf-8')
    
In [30]:
    
# with only t and (t-1) values
trainX,trainY,testX,testY = get_wmdata(seed=7,data=cab,split=0.7,look_back=1)
# with only t ...(t-10) values
trainX10,trainY10,testX10,testY10 = get_wmdata(seed=7,data=cab,split=0.7,look_back=10)
    
    
    
In [31]:
    
### 3.ICA for window method data
# n_component=3 for window size 10
trainX10_ica3,trainY10,testX10_ica3,testY10 = wmd_ica(10, cab, split=0.7, look_back=10, n_component=3)
    
    
    
In [52]:
    
#get_ICA(cab_feature,n_comps=3,plot=True)
X = cab_others
n_comps=3
#X = (X-X.mean())#/X.std(ddof=0)
# Compute ICA
ica = FastICA(n_components=3)
S_ = ica.fit_transform(X)  # Reconstruct signals
A_ = ica.mixing_  # Get estimated mixing matrix
ICs = pd.DataFrame({"c1":S_[:,0],'c2':S_[:,1],'c3':S_[:,2]})
    # Make plots to see the signal
fig=plt.figure()
fig.set_size_inches(18,16)
models = [S_]
names = ['ICA recovered signals']
colors = ['red', 'steelblue', 'orange']
for ii, (model, name) in enumerate(zip(models, names), 1):
    plt.subplot(4, 1, ii)
    plt.title(name)
    for sig, color in zip(model.T, colors):
        plt.plot(sig, color=color)
plt.subplots_adjust(0.09, 0.04, 0.94, 0.94, 0.26, 0.46)
plt.show()
    
    
In [260]:
    
##### For CAB #####
## Separate and Standaridize
# split into train and test sets
train,test = split_trn_tst(cab_others,prop=0.7)
X_trn = train[['dks_24',"hibb_22","pe_1","pe_2","pe_3","pe_4","pe_5","pe_6"]]
                      #"ps_1","ps_2","ps_3","ps_4","ps_5"]] # input variables
Y_trn = train['returns'] # output variable
X_tst = test[['dks_24',"hibb_22","pe_1","pe_2","pe_3","pe_4","pe_5","pe_6"]]
                      #"ps_1","ps_2","ps_3","ps_4","ps_5"]] # input variables
Y_tst = test['returns']
# center the data, computing the training means before centering
# so the test set is shifted by the same amount
X_mean, Y_mean = X_trn.mean(), Y_trn.mean()
X_trn = X_trn - X_mean #/X_trn.std(ddof=0)
X_tst = X_tst - X_mean
Y_trn = Y_trn - Y_mean
Y_tst = Y_tst - Y_mean
## Fitting a linear regression model
print "Fitting a linear regression model with all the shifts"
ols = lm.LinearRegression()
ols.fit(X_trn, Y_trn)
print "Intercept: %f" %ols.intercept_
print "Coefficients: %s" %str(ols.coef_)
#model = sm.OLS(Y_trn, X_trn)
#results = model.fit()
#print results.summary()
# save the prediction
lm_cabs = ols.predict(X_tst)-ols.predict(X_tst).mean() # adjusted the shift of the prediction
lm_cabn = ols.predict(X_trn)
# training and test errors
print "R^2 for training set:",
print ols.score(X_trn, Y_trn)
print "RMSE: %.5f" % math.sqrt(mean_squared_error(Y_trn, lm_cabn))
print "R^2 for test set:",
print ols.score(X_tst, Y_tst)
print "RMSE: %.5f" % math.sqrt(mean_squared_error(Y_tst, lm_cabs))
print "*"*100
# Make plots
plot_lm(cab_others["returns"],lm_cabn,lm_cabs)
    
    
    
In [261]:
    
##### For DKS #####
## Separate and Standaridize
# split into train and test sets
train,test = split_trn_tst(dks_others,prop=0.7)
X_trn1 = train[ ['cab_2','cab_29','hibb_2',"hibb_29",\
                      "sp500_2","sp500_29",'dks_ps_3']] # input variables
Y_trn1 = train['returns'] # output variable
X_tst1 = test[ ['cab_2','cab_29','hibb_2',"hibb_29",\
                      "sp500_2","sp500_29",'dks_ps_3']] # input variables
Y_tst1 = test['returns']
# center the data, computing the training means before centering
X_mean1, Y_mean1 = X_trn1.mean(), Y_trn1.mean()
X_trn1 = X_trn1 - X_mean1
X_tst1 = X_tst1 - X_mean1
Y_trn1 = Y_trn1 - Y_mean1
Y_tst1 = Y_tst1 - Y_mean1
## Fitting a linear regression model
print "Fitting a linear regression model with all the shifts"
ols = lm.LinearRegression()
ols.fit(X_trn1, Y_trn1)
print "Intercept: %f" %ols.intercept_
print "Coefficients: %s" %str(ols.coef_)
#model2 = sm.OLS(Y_trn, X_trn)
#results = model2.fit()
#print results.summary()
# save the prediction
lm_dkst = ols.predict(X_tst1)-ols.predict(X_tst1).mean() # adjusted the shift of the prediction
lm_dksn = ols.predict(X_trn1)
# training and test errors
print "R^2 for training set:",
print ols.score(X_trn1, Y_trn1)
print "RMSE: %.5f" % math.sqrt(mean_squared_error(Y_trn1, lm_dksn))
print '-'*100
print "R^2 for test set:",
print ols.score(X_tst1, Y_tst1)
print "RMSE: %.5f" % math.sqrt(mean_squared_error(Y_tst1, lm_dkst))
print "*"*100
                               
# Make plots
plot_lm(dks_others["returns"],lm_dksn,lm_dkst)
    
    
    
Note: both cab_others and cab_others1 were tried; cab_others gives the better performance.
In [206]:
    
##### For CAB #####
C_list = np.arange(0,30000, 10000)
index = C_list
rmse = pd.DataFrame(columns=['train_rmse','test_rmse'],\
                    index = index)
rmse = rmse.fillna(0)
for c in C_list:
    print " C = "+ str(c)
    r_trn,r_tst = cv_ridge(cab_others,k_fold=5,alpha=c)
    rmse.loc[c] = [r_trn,r_tst]
c_= rmse["test_rmse"].idxmin()
print "*"* 80
print c_
plt.plot(rmse["test_rmse"])
    
    
    
In [207]:
    
print "Fitting a linear regression model with all the shifts"
rg = lm.Ridge(alpha = c_)
rg.fit(X_trn, Y_trn)
print "Intercept: %f" %rg.intercept_
print "Coefficients: %s" %str(rg.coef_)
# save the prediction
rg_cabt = rg.predict(X_tst)-rg.predict(X_tst).mean() # adjusted the shift of the prediction
rg_cabn = rg.predict(X_trn)
# training and test errors
print "R^2 for training set:",
print rg.score(X_trn, Y_trn)
print "RMSE: %.5f" % math.sqrt(mean_squared_error(Y_trn, rg_cabn))
print "R^2 for test set:",
print rg.score(X_tst, Y_tst)
print "RMSE: %.5f" % math.sqrt(mean_squared_error(Y_tst, rg_cabt))
print "with alpah = "+ str(c_)
print "*"*100
# Make plots
#plot_lm(cab_others["returns"],rg_cabn,rg_cabt)
    
    
In [208]:
    
df =pd.DataFrame(np.concatenate([rg_cabn,rg_cabt]))
df.columns =["rg_cab"]
df.to_csv("output/rg_cab.csv", sep=',')
    
In [209]:
    
##### For DKS #####
C_list = np.linspace(0, 101, num =50)
index = C_list
rmse = pd.DataFrame(columns=['train_rmse','test_rmse'],\
                    index = index)
rmse = rmse.fillna(0)
for c in C_list:
    print " C = "+ str(c)
    r_trn,r_tst = cv_ridge(dks_others,k_fold=5,alpha=c)
    rmse.loc[c] = [r_trn,r_tst]
c_= rmse["test_rmse"].idxmin()
print "*"* 80
print c_
plt.plot(rmse["test_rmse"])
    
    
    
In [210]:
    
print "Fitting a linear regression model with all the shifts"
rg = lm.Ridge(alpha = c_)
rg.fit(X_trn1, Y_trn1)
print "Intercept: %f" %rg.intercept_
print "Coefficients: %s" %str(rg.coef_)
# save the prediction
rg_dkst = rg.predict(X_tst1)-rg.predict(X_tst1).mean() # adjusted the shift of the prediction
rg_dksn = rg.predict(X_trn1)
# training and test errors
print "R^2 for training set:",
print rg.score(X_trn1, Y_trn1)
print "RMSE: %.5f" % math.sqrt(mean_squared_error(Y_trn1, rg_dksn))
print "R^2 for test set:",
print rg.score(X_tst1, Y_tst1)
print "RMSE: %.5f" % math.sqrt(mean_squared_error(Y_tst1, rg_dkst))
print "*"*100
# Make plots
#plot_lm(cab_others["returns"],rg_cabn,rg_cabt)
    
    
In [510]:
    
df =pd.DataFrame(np.concatenate([rg_dksn,rg_dkst]))
df.columns =["rg_dks"]
df.to_csv("output/rg_dks.csv", sep=',')
    
In [273]:
    
cab_feature
    
In [314]:
    
# ##### For CAB #####
dataset = cab_feature
split = 0.7
shape = dataset.shape
dataset = dataset.values
scaler = MinMaxScaler(feature_range=(0, 1)) # or 0,1
dataset = scaler.fit_transform(dataset).reshape(shape)
# split into train and test sets
train_size = int(len(dataset) * split)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
trainX = train[:,1:]
trainY = train[:,0]
testX = test[:,1:]
testY = test[:,0]
C_list = np.arange(0,100000, 10000)
index = C_list
rmse = pd.DataFrame(columns=['train_rmse','test_rmse'],\
                    index = index)
rmse = rmse.fillna(0)
for c in C_list:
    print " C = "+ str(c)
    r_trn,r_tst = cv_ridge(train,k_fold=5,alpha=c)
    rmse.loc[c] = [r_trn,r_tst]
c_= rmse["test_rmse"].idxmin()
print "*"* 80
print c_
plt.plot(rmse["test_rmse"])
print "Fitting a linear regression model with all the shifts"
rg = lm.Ridge(alpha = c_)
rg.fit(trainX, trainY)
print "Intercept: %f" %rg.intercept_
print "Coefficients: %s" %str(rg.coef_)
# save the prediction; scores are reported on the scaled data, mirroring
# the DKS block below (the full-frame scaler cannot invert a 1-column prediction)
testPredict = rg.predict(testX)
trainPredict = rg.predict(trainX)
# training and test errors
print "R^2 for training set:",
print rg.score(trainX, trainY)
print "RMSE: %.5f" % math.sqrt(mean_squared_error(trainY, trainPredict))
print "R^2 for test set:",
print rg.score(testX, testY)
print "RMSE: %.5f" % math.sqrt(mean_squared_error(testY, testPredict))
print "with alpah = "+ str(c_)
print "*"*100
    
In [315]:
    
##### For DKS #####
dataset = dks_feature
split = 0.7
shape = dataset.shape
dataset = dataset.values
scaler = MinMaxScaler(feature_range=(0, 1)) # or 0,1
dataset = scaler.fit_transform(dataset).reshape(shape)
# split into train and test sets
train_size = int(len(dataset) * split)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
trainX = train[:,1:]
trainY = train[:,0]
testX = test[:,1:]
testY = test[:,0]
C_list = np.arange(0,100000, 10000)
index = C_list
rmse = pd.DataFrame(columns=['train_rmse','test_rmse'],\
                    index = index)
rmse = rmse.fillna(0)
for c in C_list:
    print " C = "+ str(c)
    r_trn,r_tst = cv_ridge(train,k_fold=5,alpha=c)
    rmse.loc[c] = [r_trn,r_tst]
c_= rmse["test_rmse"].idxmin()
print "*"* 80
print c_
plt.plot(rmse["test_rmse"])
print "Fitting a linear regression model with all the shifts"
rg = lm.Ridge(alpha = c_)
rg.fit(trainX, trainY)
print "Intercept: %f" %rg.intercept_
print "Coefficients: %s" %str(rg.coef_)
# save the prediction
testPredict = rg.predict(testX) 
trainPredict = rg.predict(trainX)
# training and test errors
print "R^2 for training set:",
print rg.score(trainX, trainY)
print "RMSE: %.5f" % math.sqrt(mean_squared_error(trainY, trainPredict))
print "R^2 for test set:",
print rg.score(testX, testY)
print "RMSE: %.5f" % math.sqrt(mean_squared_error(testY, testPredict))
print "with alpah = "+ str(c_)
print "*"*100
    
In [423]:
    
##### For CAB #####
test_stationarity(cab)
    
    
    
    
In [430]:
    
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(cab.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(cab, lags=40, ax=ax2)
    
    
In [431]:
    
cab1 = cab
cab1 = (cab1 - cab1.mean()) / (cab1.std())
plt.acorr(cab1,maxlags = len(cab1) -1, linestyle = "solid", usevlines = False, marker='')
plt.show()
autocorrelation_plot(cab1)
plt.show()
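
# Cross-check on the order read off the plots: statsmodels can also pick
# the AR order by information criterion (a sketch; this search can be slow
# on long series).
print sm.tsa.stattools.arma_order_select_ic(cab.values, max_ar=6,
                                            max_ma=0, ic='aic').aic_min_order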
    
    
    
In [493]:
    
## according to the plots, try ARMA(4,0)
arma_mod40 = sm.tsa.ARMA(cab, (4,0)).fit()
print arma_mod40.params
print arma_mod40.aic, arma_mod40.bic, arma_mod40.hqic
## check residual
print "*"*50
resid40 = arma_mod40.resid
print  "check residual"
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(resid40.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(resid40, lags=40, ax=ax2)
print "*"*50
print  "Prediction for the 5th year"
length = cab.shape[0]
predict_cab40 = arma_mod40.predict(7*length/10,length-1, dynamic=True)
cab_test = cab[(7*length/10):]
#print predict_cab40
#print cab_test
print "RMSE: %.5f" % math.sqrt(mean_squared_error(cab_test, predict_cab40))
    
    
    
In [316]:
    
##### For DKS #####
test_stationarity(dks)
    
    
    
    
In [317]:
    
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(dks.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(dks, lags=40, ax=ax2)
    
    
In [319]:
    
dks1 = dks
dks1 = (dks1 - dks1.mean()) / (dks1.std())
plt.acorr(dks1,maxlags = len(dks1) -1, linestyle = "solid", usevlines = False, marker='')
plt.show()
autocorrelation_plot(dks1)
plt.show()
    
    
    
In [320]:
    
## according to the plots, try ARMA(17,0)
arma_mod70 = sm.tsa.ARMA(dks, (17,0)).fit()
print arma_mod70.params
print arma_mod70.aic, arma_mod70.bic, arma_mod70.hqic
## check residual
print "*"*50
resid70 = arma_mod70.resid
print  "check residual"
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(resid70.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(resid70, lags=40, ax=ax2)
print "*"*50
print  "Prediction for the 5th year"
length = dks.shape[0]
predict_dks71 = arma_mod70.predict(7*length/10,length-1, dynamic=True)
dks_test = dks[(7*length/10):]
print "RMSE: %.5f" % math.sqrt(mean_squared_error(dks_test, predict_dks71))
    
    
    
    
    
    
In [132]:
    
### Model1 with window size but no ICA 
## According to AR, Window size= 4 
lstm_np1,lstm_tp1=LSM(cab,seed=10,look_back=4,split=0.7,n_loop=50,layer2=4,show_log = 1,batch_size = 1,cv=False)
#Train Score: 0.02176 RMSE
#Test Score: 0.02454 RMSE
# save data
#df = pd.DataFrame([np.concatenate([lstm_np1,lstm_tp1])].reshape())
df =pd.DataFrame(np.concatenate([lstm_np1,lstm_tp1]))
df.columns =["lstm_cab1"]
df.to_csv("output/lstm_cab1.csv", sep=',')
# Train Score: 0.02184 RMSE
#Test Score: 0.02477 RMSE
    
    
    
    
In [25]:
    
lstm_np1,lstm_tp1=LSM(cab,seed=10,look_back=10,split=0.7,n_loop=100,layer2=4,show_log = 1,batch_size = 1,cv=False)
#Train Score: 0.02176 RMSE
#Test Score: 0.02454 RMSE
# save data
#df = pd.DataFrame([np.concatenate([lstm_np1,lstm_tp1])].reshape())
df =pd.DataFrame(np.concatenate([lstm_np1,lstm_tp1]))
df.columns =["lstm_cab1.1"]
df.to_csv("output/lstm_cab1_1.csv", sep=',')
    
    
    
    
In [27]:
    
## According to AR, window size = 17
lstm_np2,lstm_tp2=LSM(dks,seed=10,look_back=17,split=0.7,n_loop=30,layer2=4,show_log = 1,batch_size = 1,cv=False)
df =pd.DataFrame(np.concatenate([lstm_np2,lstm_tp2]))
df.columns =["lstm_dks1"]
df.to_csv("output/lstm_dks1.csv", sep=',')
#Train Score: 0.01738 RMSE
#Test Score: 0.01997 RMSE
    
    
    
    
In [210]:
    
lstm_np3,lstm_tp3=LSTM2(cab,seed=10,look_back=4,split=0.7,n_loop=30,layer2=4,show_log = 1,batch = 1,cv=False)
#Train Score: 0.02176 RMSE
#Test Score: 0.02454 RMSE
# save data
#df = pd.DataFrame([np.concatenate([lstm_np1,lstm_tp1])].reshape())
df =pd.DataFrame(np.concatenate([lstm_np3,lstm_tp3]))
df.columns =["lstm_cab2"]
df.to_csv("output/lstm_cab2.csv", sep=',')
    
In [211]:
    
lstm_np4,lstm_tp4=LSTM2(dks,seed=10,look_back=17,split=0.7,n_loop=100,layer2=4,show_log = 1,batch = 1,cv=False)
#Train Score: 0.02176 RMSE
#Test Score: 0.02454 RMSE
# save data
#df = pd.DataFrame([np.concatenate([lstm_np1,lstm_tp1])].reshape())
df =pd.DataFrame(np.concatenate([lstm_np4,lstm_tp4]))
df.columns =["lstm_dks2"]
df.to_csv("output/lstm_dks2.csv", sep=',')
    
In [213]:
    
lstm_np5,lstm_tp5=LSTM_ica(cab,seed=10,look_back=10,n_component=3,layer2=5,split=0.7,n_loop=50,show_log = 1,cv=False)
df =pd.DataFrame(np.concatenate([lstm_np5,lstm_tp5]))
df.columns =["lstm_ica_cab"]
df.to_csv("output/lstm_cab3.csv", sep=',')
    
In [214]:
    
lstm_np6,lstm_tp6=LSTM_ica(dks,seed=10,look_back=17,n_component=8,layer2=5,split=0.7,n_loop=50,show_log = 1,cv=False)
df = pd.DataFrame(np.concatenate([lstm_np6,lstm_tp6]))
df.columns =["lstm_ica_dks"]
df.to_csv("output/lstm_dks3.csv", sep=',')
    
In [216]:
    
cab_others1 = pd.read_csv("output/cabotherstweets.csv",index_col=0)[3:]
lstm_np7,lstm_tp7=LSTM_others(cab_others1,seed=10,n_features=(cab_others1.shape[1]-1),layer2=5,split=0.7,n_loop=50,show_log = 1,cv=False)
df =pd.DataFrame(np.concatenate([lstm_np7,lstm_tp7]))
df.columns =["lstm_othert_cab"]
df.to_csv("output/lstm_cab4.csv", sep=',')
    
In [220]:
    
lstm_np8,lstm_tp8=LSTM_others(dks_others1,seed=10,n_features=(dks_others1.shape[1]-1),layer2=5,split=0.7,n_loop=50,show_log = 1,cv=False)
df =pd.DataFrame(np.concatenate([lstm_np8,lstm_tp8]))
df.columns =["lstm_othert_dks"]
df.to_csv("output/lstm_dks4.csv", sep=',')
    
In [202]:
    
## Walk-forward cross-validation: refit the model on k_fold-1 expanding
## prefixes of the series and average the fold scores
## (if the data arrives as separate train and test sets, concatenate them first)
def cv_lstm(model,data,k_fold,split,**extraKwArgs):
    train_score = [0]*(k_fold-1)
    test_score = [0]*(k_fold-1)
    L = int(len(data)/k_fold)
    for i in range(1,k_fold):
        ind = i*(L+1)   # fold i trains/tests on the first i*(L+1) observations
        subset = data[0:ind]
        # forward `split` so each fold divides its subset into train and test
        train_score[i-1],test_score[i-1] = model(dataset=subset,split=split,**extraKwArgs)
    print('Train Score: %.2f RMSE' % ((np.array(train_score)).mean()))
    print('Test Score: %.2f RMSE' % ((np.array(test_score)).mean()))
    
In [219]:
    
# grid over batch sizes with walk-forward CV
for size in range(1,20):
    print "Batch = "+ str(size)
    cv_lstm(LSM,data=cab,k_fold=5,batch_size=size,n_loop=10,split=0.7,cv=True)
    
In [ ]:
    
# Simple model with lookback=1
# default is a very simple network, layers=[1,4,1]
def LSTM_1(trainX,trainY,testX,testY,dataset=cab,layers=[1,4,1],lookback=1,epoch=100,batch=1):
    # fit the scaler on the series so predictions can be mapped back via inverse_transform
    scaler = MinMaxScaler(feature_range=(0, 1))
    dataset = scaler.fit_transform(dataset.values.reshape(len(dataset),1))
    trainX,trainY,testX,testY = map(np.array,(trainX,trainY,testX,testY))
    look_back=lookback
    # prepare data
    # reshape input to be [samples, time steps, features]
    trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
    testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))
    # create and fit the LSTM network
    model = Sequential()
    model.add(LSTM(layers[1], input_dim=layers[0]))
    model.add(Dense(layers[2])) #a Dense layer ( = feedforward)
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(trainX, trainY, nb_epoch=epoch, batch_size=batch, verbose=2)
    # make predictions
    trainPredict = model.predict(trainX)
    testPredict = model.predict(testX)
    # invert predictions
    trainPredict = scaler.inverse_transform(trainPredict)
    trainY = scaler.inverse_transform([trainY])
    testPredict = scaler.inverse_transform(testPredict)
    testY = scaler.inverse_transform([testY])
    # calculate root mean squared error
    trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
    print('Train Score: %.2f RMSE' % (trainScore))
    testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
    print('Test Score: %.2f RMSE' % (testScore))
    # shift train predictions for plotting
    trainPredictPlot = np.empty_like(dataset)
    trainPredictPlot[:, :] = np.nan
    trainPredictPlot[look_back:len(trainPredict)+look_back, :] = trainPredict
    # shift test predictions for plotting
    testPredictPlot = np.empty_like(dataset)
    testPredictPlot[:, :] = np.nan
    testPredictPlot[len(trainPredict)+(look_back*2)+1:len(dataset)-1, :] = testPredict
    # plot baseline and predictions
    fig = plt.figure()
    fig.set_size_inches(18,16)
    plt.plot(scaler.inverse_transform(dataset))
    plt.plot(trainPredictPlot)
    plt.plot(testPredictPlot)
    plt.show()
    return trainPredict,testPredict
## Use window method data
#trainX,trainY,testX,testY = get_wmdata(seed=7,data=cab,split=0.7,look_back=1)
#trainX,trainY,testX,testY = get_wmdata(seed=7,data=cab,split=0.7,look_back=10)
############# With default layers #############################
# First model split=0.7,epoch=100,batch_size=1
# LSM1_1_trnpred,LSM1_1_tstpred = LSTM_1(trainX,trainY,testX,testY,layers=[1,4,1],epoch=100,batch=1)
    # Train Score: 0.02 RMSE
    # Test Score: 0.02 RMSE
# Second model split=0.7,epoch=100,batch_size=2
#LSM1_2_trnpred,LSM1_2_tstpred = LSTM_1(trainX,trainY,testX,testY,layers=[1,4,1],epoch=100,batch=2)
    # Train Score: 0.02 RMSE
    # Test Score: 0.02 RMSE
# Third model split=0.7,epoch=30,batch_size=2
#LSM1_3_trnpred,LSM1_3_tstpred = LSTM_1(trainX,trainY,testX,testY,layers=[1,4,1],epoch=30,batch=2)
    # Train Score: 0.02 RMSE
    # Test Score: 0.02 RMSE
############# With a window method and a hidden layer of size 50 #################
#LSM1_4_trnpred,LSM1_4_tstpred = LSTM_1(trainX10,trainY10,testX10,testY10,dataset=cab,layers=[10,50,1],epoch=100,batch=1)
    # Train Score: 0.02 RMSE
    # Test Score: 0.03 RMSE
    # But this one captures the small spikes
## Use window method and ICA data
# trainX10_ica3,trainY10,testX10_ica3,testY10 = wmd_ica(10, cab, split=0.7, look_back=10, n_component=3)
# LSM1_5_trnpred,LSM1_5_tstpred = LSTM_1(trainX10_ica3,trainY10,testX10_ica3,testY10,lookback=10,layers=[3,50,1],epoch=100,batch=1)
    # Train Score: 0.02 RMSE
    # Test Score: 0.02 RMSE
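## For reference, the window-method transform that get_wmdata is assumed to
## apply (a minimal sketch; the real get_wmdata is defined earlier in the notebook):
def make_windows(series, look_back=1):
    # inputs are sliding windows of length look_back, target is the next value
    dataX, dataY = [], []
    for i in range(len(series) - look_back):
        dataX.append(series[i:i + look_back])
        dataY.append(series[i + look_back])
    return np.array(dataX), np.array(dataY)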
    
In [ ]:
    
# A model with the window method and time steps:
# the past observations are fed as time steps of a single input feature
# default is a very simple network, layers=[1,4,1]
def LSTM_2(trainX,trainY,testX,testY,dataset = cab, layers=[1,4,1],epoch=100,batch=1):
    # fit the scaler on the series so predictions can be mapped back via inverse_transform
    scaler = MinMaxScaler(feature_range=(0, 1))
    dataset = scaler.fit_transform(dataset.values.reshape(len(dataset),1))
    trainX,trainY,testX,testY = map(np.array,(trainX,trainY,testX,testY))
    # reshape input to be [samples, time steps, features], set the columns to be the time steps dimension 
    trainX = np.reshape(trainX, (trainX.shape[0], trainX.shape[1], 1))
    testX = np.reshape(testX, (testX.shape[0], testX.shape[1], 1))
    # create and fit the LSTM network
    model = Sequential()
    look_back = layers[0]
    # input is (time steps = look_back, features = 1), matching the reshape above
    model.add(LSTM(layers[1], input_shape=(layers[0], 1)))
    model.add(Dense(layers[2])) #a Dense layer ( = feedforward)
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(trainX, trainY, nb_epoch=epoch, batch_size=batch, verbose=2)
    # make predictions
    trainPredict = model.predict(trainX)
    testPredict = model.predict(testX)
    # invert predictions
    trainPredict = scaler.inverse_transform(trainPredict)
    trainY = scaler.inverse_transform([trainY])
    testPredict = scaler.inverse_transform(testPredict)
    testY = scaler.inverse_transform([testY])
    # calculate root mean squared error
    trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
    print('Train Score: %.2f RMSE' % (trainScore))
    testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
    print('Test Score: %.2f RMSE' % (testScore))
    # shift train predictions for plotting
    trainPredictPlot = np.empty_like(dataset)
    trainPredictPlot[:, :] = np.nan
    trainPredictPlot[look_back:len(trainPredict)+look_back, :] = trainPredict
    # shift test predictions for plotting
    testPredictPlot = np.empty_like(dataset)
    testPredictPlot[:, :] = np.nan
    testPredictPlot[len(trainPredict)+(look_back*2)+1:len(dataset)-1, :] = testPredict
    # plot baseline and predictions
    fig = plt.figure()
    fig.set_size_inches(18,16)
    plt.plot(scaler.inverse_transform(dataset))
    plt.plot(trainPredictPlot)
    plt.plot(testPredictPlot)
    plt.show()
    
    return trainPredict,testPredict
## Use window method data
#LSM2_1_trnpred,LSM2_1_tstpred = LSTM_2(trainX,trainY,testX,testY,layers=[1,4,1],epoch=1,batch=1)
#Train Score: 0.02 RMSE
#Test Score: 0.02 RMSE
## Use window method and ICA data
#LSM2_2_trnpred,LSM2_2_tstpred = LSTM_2(trainX10_ica3,trainY10,testX10_ica3,testY10,layers=[3,50,1],epoch=100,batch=1)
#Train Score: 0.02 RMSE
#Test Score: 0.02 RMSE
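## The only substantive difference from LSTM_1 is the input layout; a tiny
## numpy check on a toy array makes the two reshapes concrete:
x = np.arange(12).reshape(3, 4)              # 3 samples, window of length 4
print np.reshape(x, (3, 1, 4)).shape         # LSTM_1 layout: 1 time step, 4 features
print np.reshape(x, (3, 4, 1)).shape         # LSTM_2 layout: 4 time steps, 1 feature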
    
In [229]:
    
# blend the saved per-model predictions with gradient boosting
cab_predictors = pd.read_csv("output/cab_prediction.csv")
cab_trn_pred, cab_test_pred = blending(cab_predictors,0.7,learning_rate=0.1,max_depth=15,random_state=45)
    
In [235]:
    
dks_predictors = pd.read_csv("output/dks_prediction.csv")
dks_trn_pred, dks_test_pred = blending(dks_predictors,0.7,learning_rate=0.1,max_depth=15,random_state=45)
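## blending() is defined earlier in the notebook; a minimal sketch of the idea,
## assuming the CSV stacks each model's predictions as columns with the realised
## returns in the last column (hypothetical layout and helper name):
def blending_sketch(predictors, split, **gbr_kwargs):
    X = predictors.iloc[:, :-1].values   # level-0 model predictions
    y = predictors.iloc[:, -1].values    # realised returns
    n_train = int(len(y) * split)
    gbr = GradientBoostingRegressor(**gbr_kwargs)
    gbr.fit(X[:n_train], y[:n_train])    # fit the blender on the training window only
    return gbr.predict(X[:n_train]), gbr.predict(X[n_train:])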
    
In [ ]: