In [232]:
### ref reg_SVM_logit_LSTM.ipynb

In [1]:
import numpy as np
import pandas as pd
import math

from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.svm import SVC

df = pd.DataFrame()
pdf= pd.DataFrame()

def get_csv_pd(path):
    #spy_pd=pd.read_csv('C:\\Users\Michal\Dropbox\IB_data\SPY.csv',sep=' ',names=['askPrice','askSize','bidPrice','bidSize'],index_col=0,parse_dates=True)
    #spy_pd=pd.read_csv(path+'\SPY.csv',sep=',',names=['askPrice','askSize','bidPrice','bidSize'],index_col=0,parse_dates=True)
    spy_pd=pd.read_csv(path,sep=',',dtype={'askPrice':np.float32,'askSize':np.float32,
                                           'bidPrice':np.float32,'bidSize':np.float32},index_col=0,parse_dates=True)
    return spy_pd

def get_csv_pd_notime(path):
    #spy_pd=pd.read_csv('C:\\Users\Michal\Dropbox\IB_data\SPY.csv',sep=' ',names=['askPrice','askSize','bidPrice','bidSize'],index_col=0,parse_dates=True)
    #spy_pd=pd.read_csv(path+'\SPY.csv',sep=',',names=['askPrice','askSize','bidPrice','bidSize'],index_col=0,parse_dates=True)
    spy_pd = pd.read_csv(path, usecols=['askPrice','askSize','bidPrice','bidSize'], engine='python', skipfooter=3)
    return spy_pd
def preprocessing_df(df):
    df.bidPrice=df.bidPrice.replace(to_replace=0, method='ffill')
    df.bidSize=df.bidSize.replace(to_replace=0, method='ffill')
    df.askPrice=df.askPrice.replace(to_replace=0, method='ffill')
    df.askSize=df.askSize.replace(to_replace=0, method='ffill')
    df['Close']=(df.bidPrice+df.askPrice)/2
    df['price']=(df.bidPrice*df.bidSize+df.askPrice*df.askSize)/(df.bidSize+df.askSize)
    #velP=np.where(df.Close>df.Close.shift(60),1,0)
    #velN=np.where(df.Close<df.Close.shift(60),-1,0)
    #U=np.where(df.Close>df.price.rolling(60).max(),1,0)
    #D=np.where(df.Close<df.price.rolling(60).max(),-1,0)
    #df['U']= np.where(velP*U==1,1,0)
    #df['D']= np.where(velN*D==1,-1,0)
    #df['U']= np.where(velP==1,1,0)
    #df['D']= np.where(velN==1,-1,0)
    df['U']= np.where(df.Close>df.price,1,0)
    df['D']= np.where(df.Close<df.price,-1,0)
    df['log']=np.log(df.Close)
    #df['logDiff'] = df.log-df.log.rolling(60).mean()# almost 1 min
    df['logDiff'] = df.log-df.log.shift(60)# almost 1 min
    df['sigma']=df.log.rolling(60).std()
    return df

from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_model import ARIMAResults
 
# monkey patch around bug in ARIMA class
def __getnewargs__(self):
	return ((self.endog),(self.k_lags, self.k_diff, self.k_ma))
ARIMA.__getnewargs__ = __getnewargs__


def ARIMA_df(df):
    data=df.logDiff.dropna()
    model = ARIMA(data, order=(2,1,2))  # tested from ARIMA.ipynb
    #predictions = model.fit(disp=0).predict()
    predictions =model.fit(disp=0).fittedvalues
    # save model
    model.fit().save('sevennine_arima.pkl')
    df['pr_arima']=np.exp(predictions+df.log.rolling(60).mean())
    return df

# Import a Kalman filter and other useful libraries
from pykalman import KalmanFilter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import poly1d

def kalman_ma(data):
    x=data.price.tail(60)
    y=data.Close.tail(60)
    # Construct a Kalman filter
    kf = KalmanFilter(transition_matrices = [1],
                  observation_matrices = [1],
                  initial_state_mean = 246,
                  initial_state_covariance = 1,
                  observation_covariance=1,
                  transition_covariance=.01)

    # Use the observed values of the price to get a rolling mean
    state_means, _ = kf.filter(x.values)
    state_means = pd.Series(state_means.flatten(), index=x.index)
    data['km']=state_means
    return data


/home/octo/anaconda2/envs/carnd-term1/lib/python3.5/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools

In [2]:
#path = 'C:\\Users\Michal\Dropbox\IB_data'
#path = 'C:\\Users\Michal\Desktop'+ '\SPY14Aug17.csv'
path = '/home/octo/Dropbox'+ '/SPY15Aug17.csv'
df1=get_csv_pd_notime(path)
df=get_csv_pd(path)
df=df[500:15500]
df=preprocessing_df(df)

In [3]:
df=ARIMA_df(df)

In [4]:
df.tail()


Out[4]:
askPrice askSize bidPrice bidSize Close price U D log logDiff sigma pr_arima
2017-08-14 11:24:53.702647 246.529999 49.0 246.520004 47.0 246.524994 246.525101 0 -1 5.507463 0.000122 0.000037 246.521014
2017-08-14 11:24:53.889651 246.529999 64.0 246.520004 47.0 246.524994 246.525757 0 -1 5.507463 0.000122 0.000034 246.521514
2017-08-14 11:24:55.384690 246.529999 64.0 246.520004 54.0 246.524994 246.525421 0 -1 5.507463 0.000122 0.000031 246.522015
2017-08-14 11:24:55.577695 246.529999 58.0 246.520004 54.0 246.524994 246.525177 0 -1 5.507463 0.000122 0.000027 246.522514
2017-08-14 11:24:55.769700 246.529999 49.0 246.520004 54.0 246.524994 246.524765 1 0 5.507463 0.000102 0.000025 246.522931

In [ ]:
### load model
#loaded = ARIMAResults.load('sevennine_arima.pkl')

In [143]:
df=kalman_ma(df)

In [144]:
import pickle
#from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [145]:
# saving linear model
df=df.dropna()
X=df[['askPrice','askSize','bidPrice','bidSize','Close','pr_arima','U','D','sigma']]
y=df[['logDiff']]
regr = linear_model.LinearRegression()
regr_model=regr.fit(X,y)
regr_model = pickle.dumps(regr_model)
# Fit regression model
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.9) #kernel='linear' #kernel='poly'
svr_model = svr_rbf.fit(X, y)
svr_model = pickle.dumps(svr_model)


/home/octo/anaconda2/envs/carnd-term1/lib/python3.5/site-packages/sklearn/utils/validation.py:526: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)

In [146]:
# saving logistics and SVC model
df=df.dropna()
X=df[['askPrice','askSize','bidPrice','bidSize','Close','price','pr_arima','sigma']]
y1=df[['U']]
y2=df[['D']]

svm = SVC(kernel='linear')
lm = linear_model.LogisticRegression(C=1e4)
svm_model_up= svm.fit(X,y1)
svm_model_up = pickle.dumps(svm_model_up)
lm_model_up= lm.fit(X, y1)
lm_model_up = pickle.dumps(lm_model_up)
svm_model_dn= svm.fit(X, y2)
svm_model_dn = pickle.dumps(svm_model_dn)
lm_model_dn= lm.fit(X, y2)
lm_model_dn = pickle.dumps(lm_model_dn)


/home/octo/anaconda2/envs/carnd-term1/lib/python3.5/site-packages/sklearn/utils/validation.py:526: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)

In [147]:
#loading regression model, first save the model
svr_model = pickle.loads(svr_model)
regr_model = pickle.loads(regr_model)

#loading classification model, first save the model
svm_model_up = pickle.loads(svm_model_up)
svm_model_dn = pickle.loads(svm_model_dn)
lm_model_up = pickle.loads(lm_model_up)
lm_model_dn = pickle.loads(lm_model_dn)

In [148]:
def strat_lr(data):
    data=data.tail(60).dropna()
    X=data[['askPrice','askSize','bidPrice','bidSize','Close','pr_arima','U','D','sigma']]
    y=data[['logDiff']]
    predict_regr=regr_model.predict(X)
    predict_svr=svr_model.predict(X)
    dt=data[['Close']]
    dt['predict_regr']=predict_regr
    dt['predict_svr']=predict_svr
        
    pdf=data
    pdf['pREG']=np.exp(dt.predict_regr+data.log.shift(59))
    pdf['pSVR']=np.exp(dt.predict_regr+data.log.shift(59))
         
    #dt=data[['price','predict']]
    return pdf

In [149]:
df=strat_lr(df)


/home/octo/anaconda2/envs/carnd-term1/lib/python3.5/site-packages/ipykernel/__main__.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/octo/anaconda2/envs/carnd-term1/lib/python3.5/site-packages/ipykernel/__main__.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [150]:
def classification_up_dn(data):
    X=data[['askPrice','askSize','bidPrice','bidSize','Close','price','pr_arima','sigma']]
    y1=data[['U']]
    y2=data[['D']]
    pr_df=data.tail(60)
    predict_svm_up=svm_model_up.predict(X.tail(60))
    pr_df['predict_svm_up']=predict_svm_up
    predict_lm_up=lm_model_up.predict(X.tail(60))
    pr_df['predict_lm_up']=predict_lm_up
    predict_svm_dn=svm_model_dn.predict(X.tail(60))
    pr_df['predict_svm_dn']=predict_svm_dn
    predict_lm_dn=lm_model_dn.predict(X.tail(60))
    pr_df['predict_lm_dn']=predict_lm_dn
    pr_df['predict_svm']=pr_df.predict_svm_up+pr_df.predict_svm_dn
    pr_df['predict_lm']=pr_df.predict_lm_up+pr_df.predict_lm_dn
    return pr_df

In [151]:
df=classification_up_dn(df)


/home/octo/anaconda2/envs/carnd-term1/lib/python3.5/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/octo/anaconda2/envs/carnd-term1/lib/python3.5/site-packages/ipykernel/__main__.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/octo/anaconda2/envs/carnd-term1/lib/python3.5/site-packages/ipykernel/__main__.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/octo/anaconda2/envs/carnd-term1/lib/python3.5/site-packages/ipykernel/__main__.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/octo/anaconda2/envs/carnd-term1/lib/python3.5/site-packages/ipykernel/__main__.py:14: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/octo/anaconda2/envs/carnd-term1/lib/python3.5/site-packages/ipykernel/__main__.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [163]:
df['predict_svm']=df.predict_svm_up+df.predict_svm_dn
df['predict_lm']=df.predict_lm_up+df.predict_lm_dn
df['UD']=np.where(df.predict_svm+df.predict_lm>0,1,np.where(df.predict_svm+df.predict_lm,-1,0))

LSTM


In [165]:
import numpy
import matplotlib.pyplot as plt
import pandas
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import numpy
import matplotlib.pyplot as plt
import pandas
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error


Using TensorFlow backend.

In [189]:
# Newly loading for longdataset

In [190]:
#path = 'C:\\Users\Michal\Dropbox\IB_data'
#path = 'C:\\Users\Michal\Desktop'+ '\SPY14Aug17.csv'
path = '/home/octo/Dropbox'+ '/SPY15Aug17.csv'
df1=get_csv_pd_notime(path)
df=get_csv_pd(path)
df=df[500:45500]
df=preprocessing_df(df)

In [212]:
df=ARIMA_df(df)

In [217]:
df['spread']=df.Close-df.pr_arima

In [220]:
dataset= df[['askPrice','askSize','bidPrice','bidSize','Close','spread','pr_arima','sigma']]
dataset=dataset.dropna()
dataset=dataset.values
dataset = dataset.astype('float32')

In [221]:
len(dataset)


Out[221]:
44939

In [222]:
# fix random seed for reproducibility
numpy.random.seed(7)

In [223]:
# normalize the dataset
scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(dataset)

In [224]:
# split into train and test sets
train_size = int(len(dataset) * 0.80)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]
print(len(train), len(test))


35951 8988

In [226]:
# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back), 0]
        b = dataset[i:(i+look_back), 1]
        c = dataset[i:(i+look_back), 2]
        d = dataset[i:(i+look_back), 3]
        e=  dataset[i:(i+look_back), 4]
        f = dataset[i:(i+look_back), 5]
        g=  dataset[i:(i+look_back), 6]
        h=  dataset[i:(i+look_back), 7]
        dataX.append(np.c_[a,b,c,d,f,g,h])
        #dataX.append(b)
        #dataX.append(c)
        #dataX.append(d)
        #dataX.append(e)
        #dataX.concatenate((a,bT,cT,dT,eT),axis=1)
        dataY.append(dataset[i + look_back,4])
    return np.array(dataX), np.array(dataY)

In [227]:
# reshape into X=t and Y=t+1
look_back = 3
trainX, trainY = create_dataset(train,look_back)
testX, testY = create_dataset(test,look_back)

In [228]:
# reshape input to be [samples, time steps, features]
trainX = numpy.reshape(trainX, (trainX.shape[0],trainX.shape[1],trainX.shape[2]))
testX = numpy.reshape(testX, (testX.shape[0],testX.shape[1],testX.shape[2]))

In [229]:
epochs=3
batch_size=25

In [230]:
# create and fit the LSTM network
model = Sequential()
model.add(LSTM(4, input_shape=(look_back,7)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(trainX, trainY, epochs, batch_size, verbose=2)


Epoch 1/25
24s - loss: 0.0018
Epoch 2/25
24s - loss: 1.8888e-05
Epoch 3/25
24s - loss: 1.5021e-05
Epoch 4/25
27s - loss: 1.3375e-05
Epoch 5/25
29s - loss: 1.2357e-05
Epoch 6/25
29s - loss: 1.1432e-05
Epoch 7/25
33s - loss: 1.0977e-05
Epoch 8/25
31s - loss: 1.0838e-05
Epoch 9/25
31s - loss: 1.0328e-05
Epoch 10/25
31s - loss: 9.9659e-06
Epoch 11/25
31s - loss: 9.9224e-06
Epoch 12/25
31s - loss: 9.6704e-06
Epoch 13/25
31s - loss: 9.4364e-06
Epoch 14/25
31s - loss: 9.5454e-06
Epoch 15/25
31s - loss: 9.2212e-06
Epoch 16/25
31s - loss: 9.1803e-06
Epoch 17/25
31s - loss: 9.2058e-06
Epoch 18/25
31s - loss: 9.0244e-06
Epoch 19/25
31s - loss: 8.9191e-06
Epoch 20/25
32s - loss: 8.6847e-06
Epoch 21/25
32s - loss: 8.7769e-06
Epoch 22/25
32s - loss: 8.6767e-06
Epoch 23/25
35s - loss: 8.6899e-06
Epoch 24/25
32s - loss: 8.6722e-06
Epoch 25/25
35s - loss: 8.5339e-06
Out[230]:
<keras.callbacks.History at 0x7fa6ce5fbcf8>

In [231]:
model.save("sevensep.h5")

In [ ]: