In [232]:
### ref reg_SVM_logit_LSTM.ipynb
In [1]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.svm import SVC
df = pd.DataFrame()
pdf= pd.DataFrame()
def get_csv_pd(path):
    """Read a time-indexed quote CSV into a DataFrame of float32 columns.

    Expects columns askPrice/askSize/bidPrice/bidSize with a timestamp
    in the first column, which becomes the (parsed) index.
    """
    quote_dtypes = {'askPrice': np.float32, 'askSize': np.float32,
                    'bidPrice': np.float32, 'bidSize': np.float32}
    return pd.read_csv(path, sep=',', dtype=quote_dtypes,
                       index_col=0, parse_dates=True)
def get_csv_pd_notime(path):
    """Read only the four quote columns, ignoring the timestamp column.

    Drops the final 3 rows (possibly truncated/partial lines at the end
    of a live capture); the python engine is required for skipfooter.
    """
    wanted = ['askPrice', 'askSize', 'bidPrice', 'bidSize']
    frame = pd.read_csv(path, usecols=wanted, engine='python', skipfooter=3)
    return frame
def preprocessing_df(df):
    """Clean the raw quote frame and derive mid/microprice features in place.

    Adds columns:
      Close   - mid price (bid+ask)/2
      price   - size-weighted microprice
      U / D   - direction flags: mid above (+1) / below (-1) microprice
      log     - log mid price
      logDiff - 60-tick log return (~1 minute)
      sigma   - rolling 60-tick std of log price (~1-minute realised vol)
    Returns the same (mutated) frame.
    """
    # Zero prices/sizes are feed glitches: carry the last non-zero value
    # forward; leading zeros (nothing to carry) are left as-is. This is
    # the behaviour of the deprecated Series.replace(0, method='ffill'),
    # re-expressed with mask/ffill so it works on modern pandas.
    for col in ('bidPrice', 'bidSize', 'askPrice', 'askSize'):
        s = df[col]
        df[col] = s.mask(s == 0).ffill().fillna(s)
    df['Close'] = (df.bidPrice + df.askPrice) / 2
    # Size-weighted microprice; NaN if both sizes are zero on a row.
    df['price'] = (df.bidPrice * df.bidSize + df.askPrice * df.askSize) / (df.bidSize + df.askSize)
    df['U'] = np.where(df.Close > df.price, 1, 0)
    df['D'] = np.where(df.Close < df.price, -1, 0)
    df['log'] = np.log(df.Close)
    df['logDiff'] = df.log - df.log.shift(60)  # almost 1 min
    df['sigma'] = df.log.rolling(60).std()
    return df
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_model import ARIMAResults
# monkey patch around bug in ARIMA class
def __getnewargs__(self):
    # Pickle support: rebuild the ARIMA from its endog data and (p, d, q).
    order = (self.k_lags, self.k_diff, self.k_ma)
    return (self.endog, order)

# Monkey-patch the method onto ARIMA to work around its pickling bug.
ARIMA.__getnewargs__ = __getnewargs__
def ARIMA_df(df):
    """Fit ARIMA(2,1,2) to logDiff and add an in-sample price forecast.

    Adds 'pr_arima' = exp(fitted logDiff + rolling-60 mean of log price)
    and persists the fitted model to 'sevennine_arima.pkl'.
    Returns the same (mutated) frame.
    """
    data = df.logDiff.dropna()
    model = ARIMA(data, order=(2, 1, 2))  # order tested in ARIMA.ipynb
    # Fit once and reuse the result. The original code fitted twice —
    # once for fittedvalues and again (without disp=0, printing convergence
    # output) just to save — doubling an expensive estimation.
    fit_res = model.fit(disp=0)
    predictions = fit_res.fittedvalues
    fit_res.save('sevennine_arima.pkl')  # save model for later reuse
    # NOTE(review): logDiff is log - log.shift(60), but the price is
    # reconstructed with the rolling-60 *mean* (cf. the commented-out
    # alternative definition above) — confirm the intended baseline.
    df['pr_arima'] = np.exp(predictions + df.log.rolling(60).mean())
    return df
# Import a Kalman filter and other useful libraries
from pykalman import KalmanFilter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import poly1d
def kalman_ma(data):
    """Kalman-smooth the last 60 microprice ticks into a 'km' column.

    Only the tail-60 rows receive a value; earlier rows are NaN.
    Returns the same (mutated) frame.
    """
    x = data.price.tail(60)
    # (Removed an unused `y = data.Close.tail(60)` from the original.)
    # Scalar random-walk state-space model: identity transition and
    # observation, small process noise => behaves like a rolling mean.
    kf = KalmanFilter(transition_matrices=[1],
                      observation_matrices=[1],
                      initial_state_mean=246,  # rough SPY price level — TODO derive from data
                      initial_state_covariance=1,
                      observation_covariance=1,
                      transition_covariance=.01)
    # Use the observed values of the price to get a rolling mean
    state_means, _ = kf.filter(x.values)
    data['km'] = pd.Series(state_means.flatten(), index=x.index)
    return data
In [2]:
#path = 'C:\\Users\Michal\Dropbox\IB_data'
#path = 'C:\\Users\Michal\Desktop'+ '\SPY14Aug17.csv'
# Hardcoded absolute local path — adjust per machine (Windows variants above).
path = '/home/octo/Dropbox'+ '/SPY15Aug17.csv'
df1=get_csv_pd_notime(path)
df=get_csv_pd(path)
# Skip the first 500 warm-up ticks; cap at 15k rows for this run.
df=df[500:15500]
df=preprocessing_df(df)
In [3]:
df=ARIMA_df(df)
In [4]:
df.tail()
Out[4]:
In [ ]:
### load model
#loaded = ARIMAResults.load('sevennine_arima.pkl')
In [143]:
df=kalman_ma(df)
In [144]:
import pickle
#from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
In [145]:
# --- Train the regression models and serialise them in memory via pickle ---
# saving linear model
df=df.dropna()
# Feature matrix: raw quotes, mid, ARIMA forecast, direction flags, vol.
X=df[['askPrice','askSize','bidPrice','bidSize','Close','pr_arima','U','D','sigma']]
# NOTE(review): y is a 1-column DataFrame, not a Series — sklearn may warn
# about a column-vector y; presumably harmless here, but verify.
y=df[['logDiff']]
regr = linear_model.LinearRegression()
regr_model=regr.fit(X,y)
regr_model = pickle.dumps(regr_model)  # serialised bytes; unpickled in a later cell
# Fit regression model
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.9) #kernel='linear' #kernel='poly'
svr_model = svr_rbf.fit(X, y)
svr_model = pickle.dumps(svr_model)
In [146]:
# saving logistics and SVC model
df=df.dropna()
X=df[['askPrice','askSize','bidPrice','bidSize','Close','price','pr_arima','sigma']]
y1=df[['U']]  # up flag (0/1)
y2=df[['D']]  # down flag (0/-1)
svm = SVC(kernel='linear')
lm = linear_model.LogisticRegression(C=1e4)
# NOTE: the SAME svm/lm estimator objects are refit below for the "down"
# targets. This only works because pickle.dumps snapshots each fitted state
# BEFORE the refit — do not reorder these lines.
svm_model_up= svm.fit(X,y1)
svm_model_up = pickle.dumps(svm_model_up)
lm_model_up= lm.fit(X, y1)
lm_model_up = pickle.dumps(lm_model_up)
svm_model_dn= svm.fit(X, y2)
svm_model_dn = pickle.dumps(svm_model_dn)
lm_model_dn= lm.fit(X, y2)
lm_model_dn = pickle.dumps(lm_model_dn)
In [147]:
# Rehydrate the estimators from the pickled bytes produced above;
# after this the *_model names are fitted sklearn objects again.
#loading regression model, first save the model
svr_model = pickle.loads(svr_model)
regr_model = pickle.loads(regr_model)
#loading classification model, first save the model
svm_model_up = pickle.loads(svm_model_up)
svm_model_dn = pickle.loads(svm_model_dn)
lm_model_up = pickle.loads(lm_model_up)
lm_model_dn = pickle.loads(lm_model_dn)
In [148]:
def strat_lr(data):
    """Predict the 60-tick log return on the last 60 rows, map back to price.

    Adds 'pREG' (linear regression) and 'pSVR' (SVR) price forecasts to the
    tail-60 slice and returns it. Relies on the module-level unpickled
    estimators regr_model / svr_model.
    """
    data = data.tail(60).dropna()
    X = data[['askPrice', 'askSize', 'bidPrice', 'bidSize', 'Close',
              'pr_arima', 'U', 'D', 'sigma']]
    predict_regr = regr_model.predict(X)
    predict_svr = svr_model.predict(X)
    dt = data[['Close']]
    dt['predict_regr'] = predict_regr
    dt['predict_svr'] = predict_svr
    pdf = data
    # price forecast = exp(predicted logDiff + log price ~60 ticks back).
    # NOTE(review): shift(59) inside a 60-row window leaves only the last
    # row non-NaN — confirm that is intended.
    pdf['pREG'] = np.exp(dt.predict_regr + data.log.shift(59))
    # BUG FIX: pSVR previously reused dt.predict_regr, making both forecast
    # columns identical; it now uses the SVR predictions.
    pdf['pSVR'] = np.exp(dt.predict_svr + data.log.shift(59))
    return pdf
In [149]:
df=strat_lr(df)
In [150]:
def classification_up_dn(data):
    """Score the last 60 rows with the four up/down classifiers.

    Adds each model's predictions plus the combined 'predict_svm' and
    'predict_lm' columns (up flag + down flag). Relies on the module-level
    unpickled models svm_model_up/dn and lm_model_up/dn. Returns the
    tail-60 slice with the new columns.
    """
    feature_cols = ['askPrice', 'askSize', 'bidPrice', 'bidSize',
                    'Close', 'price', 'pr_arima', 'sigma']
    X_tail = data[feature_cols].tail(60)
    pr_df = data.tail(60)
    pr_df['predict_svm_up'] = svm_model_up.predict(X_tail)
    pr_df['predict_lm_up'] = lm_model_up.predict(X_tail)
    pr_df['predict_svm_dn'] = svm_model_dn.predict(X_tail)
    pr_df['predict_lm_dn'] = lm_model_dn.predict(X_tail)
    # Net signals: +1 (up only), -1 (down only), 0 (neither or both).
    pr_df['predict_svm'] = pr_df.predict_svm_up + pr_df.predict_svm_dn
    pr_df['predict_lm'] = pr_df.predict_lm_up + pr_df.predict_lm_dn
    return pr_df
In [151]:
df=classification_up_dn(df)
In [163]:
# Combined direction signal across both classifier families:
# +1 when the summed votes are net positive, -1 when net negative, else 0.
df['predict_svm'] = df.predict_svm_up + df.predict_svm_dn
df['predict_lm'] = df.predict_lm_up + df.predict_lm_dn
signal = df.predict_svm + df.predict_lm
# Clarity fix: the inner np.where previously used the raw sum as a truth
# value; since the outer branch already handled sums > 0, any remaining
# non-zero sum is negative, so 'signal < 0' is equivalent and explicit.
df['UD'] = np.where(signal > 0, 1, np.where(signal < 0, -1, 0))
In [165]:
import numpy
import matplotlib.pyplot as plt
import pandas
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import numpy
import matplotlib.pyplot as plt
import pandas
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
In [189]:
# Newly loading for longdataset
In [190]:
#path = 'C:\\Users\Michal\Dropbox\IB_data'
#path = 'C:\\Users\Michal\Desktop'+ '\SPY14Aug17.csv'
# Reload the same file with a longer slice (45k rows) for LSTM training.
path = '/home/octo/Dropbox'+ '/SPY15Aug17.csv'
df1=get_csv_pd_notime(path)
df=get_csv_pd(path)
df=df[500:45500]
df=preprocessing_df(df)
In [212]:
df=ARIMA_df(df)
In [217]:
# ARIMA residual: actual mid minus the in-sample ARIMA price forecast.
df['spread']=df.Close-df.pr_arima
In [220]:
# LSTM design matrix: 8 columns; 'Close' (column index 4) is the target
# that create_dataset() later excludes from the features.
dataset= df[['askPrice','askSize','bidPrice','bidSize','Close','spread','pr_arima','sigma']]
dataset=dataset.dropna()
dataset=dataset.values
dataset = dataset.astype('float32')
In [221]:
len(dataset)
Out[221]:
In [222]:
# fix random seed for reproducibility
# NOTE: seeds numpy only — Keras/TF weight init and GPU ops may still vary.
numpy.random.seed(7)
In [223]:
# normalize the dataset
# NOTE(review): scaler is fit on the FULL dataset before the train/test
# split below — test-set statistics leak into training. Fit on train only.
scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(dataset)
In [224]:
# split into train and test sets
# Chronological 80/20 split (no shuffle) — correct for time-series data.
train_size = int(len(dataset) * 0.80)
test_size = len(dataset) - train_size  # unused below; kept for symmetry
train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]
print(len(train), len(test))
In [226]:
# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=1):
    """Build sliding-window samples for the LSTM.

    For each window of `look_back` consecutive rows, the features are
    columns 0-3, 5, 6, 7 (quotes, spread, pr_arima, sigma); column 4
    (Close) is excluded from X and used as the target one row after the
    window.

    Returns (X, y) with X of shape (n, look_back, 7) and y of shape (n,).
    """
    feature_cols = [0, 1, 2, 3, 5, 6, 7]  # all columns except Close (4)
    dataX, dataY = [], []
    # The extra -1 drops one trailing sample; kept from the original
    # tutorial-style loop bound for identical behaviour.
    for i in range(len(dataset) - look_back - 1):
        # Fancy indexing replaces the original's eight per-column slices
        # (one of which, the Close slice, was computed and never used).
        dataX.append(dataset[i:i + look_back, feature_cols])
        dataY.append(dataset[i + look_back, 4])
    return np.array(dataX), np.array(dataY)
In [227]:
# reshape into X=t and Y=t+1
look_back = 3
trainX, trainY = create_dataset(train,look_back)
testX, testY = create_dataset(test,look_back)
In [228]:
# reshape input to be [samples, time steps, features]
# NOTE: these reshapes are no-ops — create_dataset already returns 3-D
# arrays of exactly this shape; kept for parity with the source tutorial.
trainX = numpy.reshape(trainX, (trainX.shape[0],trainX.shape[1],trainX.shape[2]))
testX = numpy.reshape(testX, (testX.shape[0],testX.shape[1],testX.shape[2]))
In [229]:
# Training hyperparameters for the LSTM fit below.
epochs=3
batch_size=25
In [230]:
# create and fit the LSTM network
model = Sequential()
model.add(LSTM(4, input_shape=(look_back, 7)))  # 7 features per timestep
model.add(Dense(1))  # single regression output (scaled Close)
model.compile(loss='mean_squared_error', optimizer='adam')
# BUG FIX: Keras Model.fit's positional parameters after (x, y) are
# batch_size THEN epochs, so the original bare positionals trained for
# 25 epochs with batch size 3. Keyword arguments make it correct.
model.fit(trainX, trainY, epochs=epochs, batch_size=batch_size, verbose=2)
Out[230]:
In [231]:
# Persist the trained LSTM (HDF5 format) for reuse in other notebooks.
model.save("sevensep.h5")
In [ ]: