In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import csv
import glob
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_model import ARIMAResults
import pickle
#from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
In [129]:
import time
import numpy
import pandas
import math
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers.core import Dense, Activation, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
In [2]:
# loading csv file
def get_csv_pd(path):
    #spy_pd=pd.read_csv('C:\\Users\Michal\Dropbox\IB_data\SPY.csv',sep=' ',names=['askPrice','askSize','bidPrice','bidSize'],index_col=0,parse_dates=True)
    #spy_pd=pd.read_csv(path+'\SPY.csv',sep=',',names=['askPrice','askSize','bidPrice','bidSize'],index_col=0,parse_dates=True)
    spy_pd=pd.read_csv(path,sep=',',dtype={'askPrice':np.float32,'askSize':np.float32,
                                           'bidPrice':np.float32,'bidSize':np.float32},index_col=0,parse_dates=True)
    #spy_pd = pd.read_csv(path, usecols=['askPrice','askSize','bidPrice','bidSize'], engine='python', skipfooter=3)
    return spy_pd
'''
def get_csv_pd_notime(path):
    #spy_pd=pd.read_csv('C:\\Users\Michal\Dropbox\IB_data\SPY.csv',sep=' ',names=['askPrice','askSize','bidPrice','bidSize'],index_col=0,parse_dates=True)
    #spy_pd=pd.read_csv(path+'\SPY.csv',sep=',',names=['askPrice','askSize','bidPrice','bidSize'],index_col=0,parse_dates=True)
    spy_pd = pd.read_csv(path, usecols=['askPrice','askSize','bidPrice','bidSize'], engine='python', skipfooter=3)
    return spy_pd
'''
# clean raw quotes and build mid, vwap, spread, momentum, return and volatility features
def preprocessing(df):
    # forward-fill zero quotes and sizes
    df.bidPrice=df.loc[:,'bidPrice'].replace(to_replace=0, method='ffill')
    df.bidSize=df.loc[:,'bidSize'].replace(to_replace=0, method='ffill')
    df.askPrice=df.loc[:,'askPrice'].replace(to_replace=0, method='ffill')
    df.askSize=df.loc[:,'askSize'].replace(to_replace=0, method='ffill')
    df=df.dropna()
    # drop prices more than one standard deviation below the mean (to exclude 0)
    df=df[df['bidPrice']>df.bidPrice.mean()-df.bidPrice.std()]
    df=df[df['askPrice']>df.askPrice.mean()-df.askPrice.std()]
    df['mid']=(df.askPrice+df.bidPrice)/2
    df['vwap']=((df.loc[:,'bidPrice']*df.loc[:,'bidSize'])+(df.loc[:,'askPrice']*df.loc[:,'askSize']))/(df.loc[:,'bidSize']+df.loc[:,'askSize'])
    df['spread']=df.vwap-(df.askPrice+df.bidPrice)/2
    df['v']=(df.askPrice+df.bidPrice)/2-((df.askPrice+df.bidPrice)/2).shift(60)
    df['return']=(df.askPrice/df.bidPrice.shift(1))-1
    df['sigma']=df.spread.rolling(60).std()
    return df
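# Worked example of the core features (hypothetical quote, not taken from the SPY file):
#   bidPrice=100.00, bidSize=200, askPrice=100.02, askSize=100
#   mid    = (100.02 + 100.00)/2                    = 100.01
#   vwap   = (100.00*200 + 100.02*100)/(200 + 100)  ~ 100.0067
#   spread = vwap - mid                             ~ -0.0033  (size-weighted price below mid: the book is bid-heavy)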
'''
def normalise(df,window_length=60):
    dfn=(df-df.rolling(window_length).min())/(df.rolling(window_length).max()-df.rolling(window_length).min())
    return dfn
def de_normalise(data,df,window_length=60):
    dn=(df*(data.rolling(window_length).max()-data.rolling(window_length).min()))+data.rolling(window_length).min()
    return dn
def normalise_z(df,window_length=12):
    dfn=(df-df.rolling(window_length).mean())/(df.rolling(window_length).std())
    return dfn
'''
# ratio normalisation: divide each feature by its value window_length ticks earlier
def normalise(df,window_length=60):
    data=df[['askPrice','askSize','bidPrice','bidSize','vwap','spread','v','return','sigma']]
    dfn=data/data.shift(window_length)
    return dfn
# inverse of the ratio normalisation; needs the original (un-normalised) frame df
def de_normalise(dfn,df,window_length=60):
    data=df[['askPrice','askSize','bidPrice','bidSize','vwap','spread','v','return','sigma']]
    data=dfn*data.shift(window_length)
    return data
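# Round-trip sketch of the ratio normalisation above (window shortened to 2 ticks for readability):
#   s                         = [100.0, 101.0, 102.0, 103.0]
#   s/s.shift(2)              -> [nan, nan, 1.0200, 1.0198]   (normalise)
#   (s/s.shift(2))*s.shift(2) -> [nan, nan, 102.0, 103.0]     (de_normalise recovers the original)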
#https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]
# monkey patch around bug in ARIMA class
def __getnewargs__(self):
    return ((self.endog),(self.k_lags, self.k_diff, self.k_ma))
ARIMA.__getnewargs__ = __getnewargs__
# log-transform vwap and mid and difference them over 60 ticks for the ARIMA models
def arima_processing(df):
    #data=df[['vwap','mid']]
    df=df.dropna()
    df['Lvwap']=np.log(df.vwap)
    df['Lmid']=np.log(df.mid)
    df['LDvwap']=df.Lvwap-df.Lvwap.shift(60)
    df['LDmid']=df.Lmid-df.Lmid.shift(60)
    df=df.dropna()
    return df
def ARIMA_saving(data):
    data=data.dropna()
    data1=data.LDvwap
    data2=data.LDmid
    model_vwap = ARIMA(data1,order=(2,1,2)) # order tested from ARIMA.ipynb
    #predictions = model.fit(disp=0).predict()
    fitted_vwap = model_vwap.fit(disp=0)
    predictions_vwap = fitted_vwap.fittedvalues
    # save the fitted model (reuse the same fit rather than refitting)
    fitted_vwap.save('vwap_arima.pkl')
    vwap_arima=np.exp(predictions_vwap+data.Lvwap.shift(60))
    model_mid = ARIMA(data2,order=(2,1,2)) # order tested from ARIMA.ipynb
    #predictions = model.fit(disp=0).predict()
    fitted_mid = model_mid.fit(disp=0)
    predictions_mid = fitted_mid.fittedvalues
    # save the fitted model
    fitted_mid.save('mid_arima.pkl')
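In [ ]:
# The __getnewargs__ monkey patch above exists so the fitted ARIMA results can be pickled to disk
# and reloaded later with ARIMAResults.load. Minimal sketch of that round trip on a synthetic
# random-walk series (illustration only, not the SPY data and not used elsewhere in this notebook):
rw = pd.Series(np.cumsum(np.random.randn(500)))
demo_fit = ARIMA(rw, order=(2,1,2)).fit(disp=0)
demo_fit.save('demo_arima.pkl')                    # same mechanism as vwap_arima.pkl / mid_arima.pkl
demo_loaded = ARIMAResults.load('demo_arima.pkl')
demo_loaded.fittedvalues.tail()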
In [3]:
filename = '/home/octo/Dropbox'+ '/SPY7Dec.csv'
In [8]:
data=get_csv_pd(filename)
data=preprocessing(data)
data=data.dropna()
data=arima_processing(data)
data=data.dropna()
ARIMA_saving(data)
In [9]:
data.head()
Out[9]:
In [10]:
data=get_csv_pd(filename)
data=preprocessing(data)
#data=normalise(data)
data=data.dropna()
In [11]:
data.tail()
Out[11]:
In [12]:
df=data
X=df[['vwap','v','return','sigma']]
y=df.mid
In [13]:
regr = linear_model.LinearRegression()
regr_model=regr.fit(X,y)
# save the model to disk
filename_rgr = 'rgr.sav'
pickle.dump(regr_model, open(filename_rgr, 'wb'))
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.9) #kernel='linear' #kernel='poly'
svr_model = svr_rbf.fit(X, y)
# save the model to disk
filename_svr = 'svr.sav'
pickle.dump(svr_model, open(filename_svr, 'wb'))
In [14]:
### Model is already saved from "/Dropbox/DataScience/ARIMA_model_saving.ipynb". Here loaded and added to "df_ml"
def ARIMA_(data):
    ### load model
    data=data.dropna()
    predictions_mid=ARIMA_mid(data.LDmid)
    predictions_vwap=ARIMA_vwap(data.LDvwap)
    vwap_arima=np.exp(predictions_vwap+data.Lvwap.shift(60))
    mid_arima=np.exp(predictions_mid+data.Lmid.shift(60))
    df_ml['arima']=data.mid+vwap_arima-mid_arima
def ARIMA_mid(data):
    ### load model
    mid_arima_loaded = ARIMAResults.load('mid_arima.pkl')
    predictions_mid = mid_arima_loaded.predict()
    return predictions_mid
def ARIMA_vwap(data):
    ### load model
    vwap_arima_loaded = ARIMAResults.load('vwap_arima.pkl')
    predictions_vwap = vwap_arima_loaded.predict()
    return predictions_vwap
#### KALMAN moving average
##KF moving average
#https://github.com/pykalman/pykalman
# Import a Kalman filter and other useful libraries
from pykalman import KalmanFilter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import poly1d
def kalman_ma(data):
    #x=data.mid
    x=data.mid
    # Construct a Kalman filter
    kf = KalmanFilter(transition_matrices = [1],
                      observation_matrices = [1],
                      initial_state_mean = 248,
                      initial_state_covariance = 1,
                      observation_covariance=1,
                      transition_covariance=.01)
    # Use the observed values of the price to get a rolling mean
    state_means, _ = kf.filter(x.values)
    state_means = pd.Series(state_means.flatten(), index=x.index)
    df_ml['km']=state_means
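# For intuition only: with scalar transition/observation matrices of 1, the filter above reduces
# to the simple recursion below. Illustrative sketch, not a replacement for pykalman, and not
# called anywhere in this notebook.
def kalman_ma_manual(x, init_mean=248.0, init_cov=1.0, obs_cov=1.0, trans_cov=0.01):
    means = []
    m, P = init_mean, init_cov
    for z in x:
        P = P + trans_cov        # predict: state variance grows by the transition noise
        K = P / (P + obs_cov)    # Kalman gain
        m = m + K * (z - m)      # update: pull the filtered mean toward the observation
        P = (1 - K) * P          # update: shrink the state variance
        means.append(m)
    return pd.Series(means, index=x.index)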
### Linear Regression, sklearn, svm:SVR,linear_model
import pickle
#from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
## loading model saved from /Dropbox/DataScience/REG_model_saving.ipynb
filename_rgr = 'rgr.sav'
filename_svr = 'svr.sav'
# load the model from disk
loaded_rgr_model = pickle.load(open(filename_rgr, 'rb'))
loaded_svr_model = pickle.load(open(filename_svr, 'rb'))
def strat_lr(data):
    # no normalization
    data=data.dropna()
    X=data[['vwap','v','return','sigma']]
    #y=df[['mid']]
    predict_regr=loaded_rgr_model.predict(X)
    predict_svr=loaded_svr_model.predict(X)
    df_ml['REG']=predict_regr
    df_ml['SVR']=predict_svr
## strat_lr(data,dfn) and the lines below are needed only for normalized data
#df_ml['predict_regr']=predict_regr
#df_ml['predict_svr']=predict_svr
#df_ml['REG']=de_normalise(data.mid,df.predict_regr)
#df_ml['SVR']=de_normalise(data.mid,df.predict_svr)
In [69]:
df_ml=pd.DataFrame()
data=get_csv_pd(filename)
data=preprocessing(data)
data=data.dropna()
df_arima=arima_processing(data)
df_ml['mid']=data.mid
df_ml['vwap']=data.vwap
ARIMA_(df_arima)
kalman_ma(data)
strat_lr(data)
In [100]:
df_ml.dropna().head()
Out[100]:
In [71]:
len(df_ml)
Out[71]:
In [72]:
df_ml=df_ml.dropna()
data_cl=data.tail(len(df_ml))
data_cl['U']=np.where(df_ml.mid>df_ml.km,1,0)
data_cl['D']=np.where(df_ml.mid<df_ml.km,-1,0)
data_cl=data_cl.dropna()
In [90]:
data_cl.head()
Out[90]:
In [74]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline
In [89]:
data_cl['U'].plot(kind='hist', grid=True, title='up')
Out[89]:
In [88]:
data_cl['D'].plot(kind='hist', grid=True, title='down')
Out[88]:
In [78]:
X=data_cl[['askPrice','askSize','bidPrice','bidSize','spread','v','return','sigma']]
y1=data_cl.U
y2=data_cl.D
# use separate estimator instances for the up and down models;
# fit() returns self, so reusing one instance would make the saved "up" and "down" models identical
svm_up = SVC(kernel='linear')
svm_dn = SVC(kernel='linear')
lm_up = linear_model.LogisticRegression(C=1e4)
lm_dn = linear_model.LogisticRegression(C=1e4)
In [79]:
svm_model_up=svm_up.fit(X,y1)
lm_model_up=lm_up.fit(X,y1)
svm_model_dn=svm_dn.fit(X,y2)
lm_model_dn=lm_dn.fit(X,y2)
In [80]:
# save the model to disk
filename_svm_model_up = 'svm_model_up.sav'
filename_lm_model_up = 'lm_model_up.sav'
filename_svm_model_dn = 'svm_model_dn.sav'
filename_lm_model_dn = 'lm_model_dn.sav'
pickle.dump(svm_model_up, open(filename_svm_model_up, 'wb'))
pickle.dump(lm_model_up, open(filename_lm_model_up, 'wb'))
pickle.dump(svm_model_dn, open(filename_svm_model_dn, 'wb'))
pickle.dump(lm_model_dn, open(filename_lm_model_dn , 'wb'))
In [91]:
#### loading classification model from /Dropbox/DataScience/ML_20Sep
filename_svm_model_up = 'svm_model_up.sav'
filename_lm_model_up = 'lm_model_up.sav'
filename_svm_model_dn = 'svm_model_dn.sav'
filename_lm_model_dn = 'lm_model_dn.sav'
# load the model from disk
loaded_svm_up_model = pickle.load(open(filename_svm_model_up, 'rb'))
loaded_lm_up_model = pickle.load(open(filename_lm_model_up, 'rb'))
loaded_svm_dn_model = pickle.load(open(filename_svm_model_dn, 'rb'))
loaded_lm_dn_model = pickle.load(open(filename_lm_model_dn, 'rb'))
def classification_up_dn(data):
    X=data[['askPrice','askSize','bidPrice','bidSize','spread','v','return','sigma']]
    predict_svm_up=loaded_svm_up_model.predict(X)
    predict_lm_up=loaded_lm_up_model.predict(X)
    predict_svm_dn=loaded_svm_dn_model.predict(X)
    predict_lm_dn=loaded_lm_dn_model.predict(X)
    X['predict_svm_up']=predict_svm_up
    X['predict_lm_up']=predict_lm_up
    X['predict_svm_dn']=predict_svm_dn
    X['predict_lm_dn']=predict_lm_dn
    #X['predict_svm']=X.predict_svm_up+X.predict_svm_dn
    #X['predict_lm']=X.predict_lm_up+X.predict_lm_dn
    # flag up if either classifier predicts up, down if either predicts down
    X['U']=np.where(np.logical_or(X.predict_svm_up==1,X.predict_lm_up==1),1,0)
    X['D']=np.where(np.logical_or(X.predict_svm_dn==-1,X.predict_lm_dn==-1),-1,0)
    X['UD']=X.U+X.D
    return X
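In [ ]:
# Toy illustration (made-up predictions, not model output) of the OR-combination above:
# U fires when either up-classifier predicts 1, D when either down-classifier predicts -1,
# and UD is their sum (up only -> 1, down only -> -1, both or neither -> 0).
toy = pd.DataFrame({'predict_svm_up':[1,0,0], 'predict_lm_up':[0,0,1],
                    'predict_svm_dn':[0,-1,0], 'predict_lm_dn':[0,0,-1]})
toy['U'] = np.where(np.logical_or(toy.predict_svm_up==1, toy.predict_lm_up==1), 1, 0)
toy['D'] = np.where(np.logical_or(toy.predict_svm_dn==-1, toy.predict_lm_dn==-1), -1, 0)
toy['UD'] = toy.U + toy.D
toy[['U','D','UD']]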
In [93]:
class_data=classification_up_dn(data_cl)
In [99]:
data_cl.tail()
Out[99]:
In [97]:
class_data['predict_svm_up'].plot(kind='hist', grid=True, title='up')
Out[97]:
In [104]:
df_ml=pd.DataFrame()
data=get_csv_pd(filename)
data=preprocessing(data)
data=data.dropna()
df_arima=arima_processing(data)
df_ml['mid']=data.mid
df_ml['vwap']=data.vwap
ARIMA_(df_arima)
kalman_ma(data)
strat_lr(data)
df_ml=df_ml.dropna()
data_cl=data.tail(len(df_ml))
data_cl['U']=np.where(df_ml.mid>df_ml.km,1,0)
data_cl['D']=np.where(df_ml.mid<df_ml.km,-1,0)
data_cl=data_cl.dropna()
df_ml['UD']=data_cl.U+data_cl.D
In [105]:
df_ml.head()
Out[105]:
In [106]:
df_ml['UD'].plot(kind='hist', grid=True, title='up-down')
Out[106]:
In [114]:
### LSTM
#df.loc[:, cols].prod(axis=1)
def lstm_processing(df):
    df=df.dropna()
    df_price=df[['mid','vwap','arima','km','REG','SVR']]
    #normalization
    dfn=np.log(df_price)
    #dfn['UD']=df.UD
    return dfn
# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back), 0]
        b = dataset[i:(i+look_back), 1]
        c = dataset[i:(i+look_back), 2]
        d = dataset[i:(i+look_back), 3]
        e = dataset[i:(i+look_back), 4]
        f = dataset[i:(i+look_back), 5]
        #g = dataset[i:(i+look_back), 6]
        #dataX.append(np.c_[b,c,d,e,f,g])
        dataX.append(np.c_[b,c,d,e,f])
        #dataX.append(b)
        #dataX.append(c)
        #dataX.append(d)
        #dataX.append(e)
        #dataX.concatenate((a,bT,cT,dT,eT),axis=1)
        dataY.append(a)
    return np.array(dataX), np.array(dataY)
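In [ ]:
# Toy run of create_dataset() on a small 6-column array of arbitrary values, to make the
# shapes concrete: X is [samples, time steps, features] built from columns 1-5, and each
# target row is a look_back-long window of column 0 (log mid).
toy = np.arange(60, dtype='float32').reshape(10, 6)
tx, ty = create_dataset(toy, look_back=3)
tx.shape, ty.shape   # expected: (6, 3, 5) and (6, 3)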
In [115]:
#normalization
df_lstm=lstm_processing(df_ml)
df_lstm=df_lstm.dropna()
dataset=df_lstm.values
dataset = dataset.astype('float32')
# reshape into X=t and Y=t+1
look_back = 3
X_,Y_ = create_dataset(dataset,look_back)
# reshape input to be [samples, time steps, features]
X_ = numpy.reshape(X_, (X_.shape[0],X_.shape[1],X_.shape[2]))
In [116]:
df_lstm.head()
Out[116]:
In [126]:
'''
# create and fit the LSTM network
model = Sequential()
model.add(LSTM(4, input_shape=(look_back,5)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_,Y_, epochs, batch_size, verbose=2)
'''
Out[126]:
In [127]:
def build_model(layers):
    model = Sequential()
    model.add(LSTM(
        input_dim=layers[0],
        output_dim=layers[1],
        return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(
        layers[2],
        return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(
        output_dim=layers[3]))
    model.add(Activation("linear"))
    start = time.time()
    model.compile(loss="mse", optimizer="rmsprop")
    print("Compilation Time : ", time.time() - start)
    return model
In [130]:
epochs=10
batch_size=50
layers=[5, 50, 100,3]
# create and fit the LSTM network
model = build_model(layers)
In [131]:
model.fit(X_, Y_, epochs=epochs, batch_size=batch_size, verbose=2)
Out[131]:
In [132]:
model.save("12Dec.h5")
In [138]:
def plot_results_multiple(predicted_data, true_data, prediction_len):
    fig = plt.figure(facecolor='white')
    ax = fig.add_subplot(111)
    ax.plot(true_data, label='True Data')
    # Pad the list of predictions to shift it in the graph to its correct start
    for i, data in enumerate(predicted_data):
        padding = [None for p in range(i * prediction_len)]
        plt.plot(padding + list(data), label='Prediction')
    plt.legend()
    plt.show()
In [134]:
Predict = model.predict(X_)
#testPredict = model.predict(testX)
In [142]:
'''
# shift train predictions for plotting
trainPredictPlot = numpy.empty_like(dataset)
trainPredictPlot[:, :] = numpy.nan
trainPredictPlot[look_back:len(trainPredict)+look_back, :] = trainPredict
# shift test predictions for plotting
testPredictPlot = numpy.empty_like(dataset)
testPredictPlot[:, :] = numpy.nan
testPredictPlot[len(trainPredict)+(look_back*2)+1:len(dataset)-1, :] = testPredict
# plot baseline and predictions
#plt.plot(scaler.inverse_transform(dataset))
plt.plot(dataset[:,0])
#plt.plot(trainPredictPlot)
plt.plot(testPredictPlot)
plt.show()
'''
# shift train predictions for plotting
PredictPlot = numpy.empty_like(dataset)
PredictPlot[:, :] = numpy.nan
PredictPlot[look_back:len(Predict)+look_back,3:6] = Predict
plt.plot(dataset[:,0])
#plt.plot(trainPredictPlot)
plt.plot(PredictPlot)
plt.show()
In [ ]:
plot_results_multiple(Predict,Y_,10000)
In [ ]: