In [1]:
import csv
import glob
import os

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
# Location of the quote CSV. The directory can be overridden with the
# SPY_DATA_DIR environment variable so the notebook does not have to be edited
# to run on another machine; the default is the original hard-coded location.
DATA_DIR = os.environ.get('SPY_DATA_DIR', '/home/octo/Dropbox')
filename = os.path.join(DATA_DIR, 'SPY4Aug17.csv')
In [2]:
# loading csv file
def get_csv_pd(path):
    """Read a comma-separated quote file into a DataFrame.

    The first column of the file becomes the index and is parsed as dates.
    The columns askPrice, askSize, bidPrice and bidSize are read as float32.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file.
    """
    quote_dtypes = {
        column: np.float32
        for column in ('askPrice', 'askSize', 'bidPrice', 'bidSize')
    }
    return pd.read_csv(
        path,
        sep=',',
        dtype=quote_dtypes,
        index_col=0,
        parse_dates=True,
    )
def get_csv_pd_notime(path):
    """Read only the four quote columns of a CSV file, ignoring its last 3 lines.

    No index column is set and no dates are parsed, so the result carries the
    default integer index.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Frame holding the askPrice, askSize, bidPrice and bidSize columns.
    """
    wanted = ['askPrice', 'askSize', 'bidPrice', 'bidSize']
    # skipfooter requires the pure-Python parser engine.
    quotes = pd.read_csv(path, usecols=wanted, skipfooter=3, engine='python')
    return quotes
def preprocessing(df, window_length=60):
    """Clean raw quote data and append the derived feature columns.

    Processing steps, in order:

    1. Zeros in bidPrice, bidSize, askPrice and askSize are treated as missing
       and forward-filled from the previous row.
    2. Rows that still contain NaN in any column are dropped (this includes
       leading zeros that had no earlier value to fill from).
    3. Rows whose bidPrice is not above ``mean - std`` of bidPrice are dropped,
       then the same filter is applied to askPrice on the remaining rows.
    4. The columns mid, vwap, spread, v, return and sigma are added.

    The input frame is not modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns bidPrice, bidSize, askPrice, askSize.
    window_length : int, default 60
        Number of rows used for the lagged mid difference ``v`` and for the
        rolling standard deviation ``sigma``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned rows and the extra feature columns.
        The first ``window_length`` rows of ``v`` and the first
        ``window_length - 1`` rows of ``sigma`` are NaN.
    """
    quote_columns = ['bidPrice', 'bidSize', 'askPrice', 'askSize']

    # Work on a copy so the caller's frame is left untouched.
    cleaned = df.copy()
    for column in quote_columns:
        series = cleaned[column]
        # mask + ffill replaces Series.replace(..., method='ffill'), whose
        # `method` argument is deprecated in recent pandas releases.
        cleaned[column] = series.mask(series == 0).ffill()
    cleaned = cleaned.dropna()

    # Crude lower-bound outlier filter: keep rows above mean - 1 std.
    # The ask filter uses statistics of the rows that survived the bid filter.
    cleaned = cleaned[cleaned['bidPrice'] > cleaned['bidPrice'].mean() - cleaned['bidPrice'].std()]
    cleaned = cleaned[cleaned['askPrice'] > cleaned['askPrice'].mean() - cleaned['askPrice'].std()]
    # Explicit copy so the column assignments below write to an independent
    # frame rather than to a slice (avoids SettingWithCopyWarning).
    cleaned = cleaned.copy()

    bid_price = cleaned['bidPrice']
    ask_price = cleaned['askPrice']
    bid_size = cleaned['bidSize']
    ask_size = cleaned['askSize']
    mid = (ask_price + bid_price) / 2

    cleaned['mid'] = mid
    # Size-weighted average of the bid and ask prices.
    cleaned['vwap'] = ((bid_price * bid_size) + (ask_price * ask_size)) / (bid_size + ask_size)
    cleaned['spread'] = cleaned['vwap'] - mid
    # Change of the mid price over the last `window_length` rows.
    cleaned['v'] = mid - mid.shift(window_length)
    # NOTE(review): this divides the current ask by the previous bid, mixing
    # the two sides of the book — confirm this is the intended return.
    cleaned['return'] = (ask_price / bid_price.shift(1)) - 1
    cleaned['sigma'] = cleaned['spread'].rolling(window_length).std()
    return cleaned
def normalise(df, window_length=60):
    """Rescale every value by the min/max of its trailing rolling window.

    Each value becomes ``(value - window_min) / (window_max - window_min)``,
    where the window covers the current row and the ``window_length - 1`` rows
    before it. Rows without a full window come out as NaN.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Data to rescale.
    window_length : int, default 60
        Size of the rolling window in rows.

    Returns
    -------
    Same type as ``df``
        The rescaled data.
    """
    window = df.rolling(window_length)
    lowest = window.min()
    highest = window.max()
    value_range = highest - lowest
    return (df - lowest) / value_range
def de_normalise(df, window_length=60, reference=None):
    """Map rolling min/max-scaled values back to their original scale.

    Computes ``df * (window_max - window_min) + window_min``. To undo a
    rolling min/max scaling, the window extremes must come from the data in
    its original scale, so pass that data as ``reference``.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Scaled values to convert back.
    window_length : int, default 60
        Size of the rolling window in rows; must match the one used for scaling.
    reference : pandas.DataFrame or pandas.Series, optional
        Original-scale data from which the rolling minimum and maximum are
        taken. It should share its index (and columns) with ``df`` so the
        arithmetic aligns row by row. When omitted, the extremes are computed
        from ``df`` itself, which keeps the previous behaviour but does not
        recover the original values.

    Returns
    -------
    Same type as ``df``
        The rescaled data; rows without a full window are NaN.
    """
    source = df if reference is None else reference
    window = source.rolling(window_length)
    lowest = window.min()
    highest = window.max()
    return (df * (highest - lowest)) + lowest
In [3]:
# Load the quotes, clean them and derive features, apply the rolling min/max
# scaling, then discard rows that contain NaN.
data = (
    get_csv_pd(filename)
    .pipe(preprocessing)
    .pipe(normalise)
    .dropna()
)
In [4]:
# Show the first rows of the processed frame as a quick sanity check.
data.head()
Out[4]:
In [5]:
import pickle
#from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
In [9]:
# Build the training set from the last 20000 rows of `data`:
# X holds the nine feature columns, y is the `mid` column used as the target.
df = data.tail(20000)
feature_columns = [
    'askPrice', 'askSize', 'bidPrice', 'bidSize',
    'vwap', 'spread', 'v', 'return', 'sigma',
]
X = df[feature_columns]
y = df['mid']
In [10]:
# Fit a linear regression of y on X.
regr = linear_model.LinearRegression()
regr_model = regr.fit(X, y)
# Save the fitted model to disk. The context manager guarantees the file is
# flushed and closed once the pickle has been written.
filename_rgr = 'rgr.sav'
with open(filename_rgr, 'wb') as model_file:
    pickle.dump(regr_model, model_file)
In [11]:
# Fit a support vector regressor with an RBF kernel on the same X and y.
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.9) #kernel='linear' #kernel='poly'
svr_model = svr_rbf.fit(X, y)
# Save the fitted model to disk. The context manager guarantees the file is
# flushed and closed once the pickle has been written.
filename_svr = 'svr.sav'
with open(filename_svr, 'wb') as model_file:
    pickle.dump(svr_model, model_file)
In [12]:
##### Model is saved from ML_20SEP
In [ ]: