In [3]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import numpy
import matplotlib.pyplot as plt
import pandas
import math

In [4]:
import csv
import glob

In [5]:
# Theano loaded using: conda install m2w64-toolchain

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import pandas as pd
import time
import os
import sys
from collections import deque
from collections import OrderedDict
import copy
import random
import pdb
import logging

import theano
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.ifelse import ifelse
from theano import config

import tensorflow as tf

import warnings
from numpy import newaxis
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

In [6]:
# load a tick CSV (timestamp index; askPrice/askSize/bidPrice/bidSize columns)
def get_csv_pd(path):
    spy_pd = pd.read_csv(path, sep=',',
                         dtype={'askPrice': np.float32, 'askSize': np.float32,
                                'bidPrice': np.float32, 'bidSize': np.float32},
                         index_col=0, parse_dates=True)
    return spy_pd

def BA(df):
    # zero prices/sizes are missing quotes: forward-fill them with the last valid value
    df['bidPrice'] = df['bidPrice'].replace(to_replace=0, method='ffill')
    df['bidSize'] = df['bidSize'].replace(to_replace=0, method='ffill')
    df['askPrice'] = df['askPrice'].replace(to_replace=0, method='ffill')
    df['askSize'] = df['askSize'].replace(to_replace=0, method='ffill')
    df = df.dropna()
    return df

def preprocessing(df):
    df = df.dropna()
    # drop outlier rows (e.g. stray zero quotes) more than one std below the mean
    df = df[df['bidPrice'] > df.bidPrice.mean() - df.bidPrice.std()]
    df = df[df['askPrice'] > df.askPrice.mean() - df.askPrice.std()]
    df['mid'] = (df.askPrice + df.bidPrice) / 2
    # size-weighted quote price
    df['vwap'] = ((df.bidPrice * df.bidSize) + (df.askPrice * df.askSize)) / (df.bidSize + df.askSize)
    df['spread'] = df.vwap - df.mid
    # 60-tick mid-price change (velocity)
    df['v'] = df.mid - df.mid.shift(60)
    # 12-tick change relative to the 60-tick change (momentum); 0 where either is flat
    df['mom'] = np.where(np.logical_and((df.mid - df.mid.shift(12)) != 0, df.v != 0),
                         (df.mid - df.mid.shift(12)) / df.v, 0)
    df['return'] = (df.askPrice / df.bidPrice.shift(1)) - 1
    df['sigma'] = df.spread.rolling(60).std()
    df['high'] = df.askPrice.rolling(5).max()
    df['low'] = df.bidPrice.rolling(5).min()
    return df
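
A quick worked check of the mid/vwap/spread definitions, using the last quote from the Out[15] table further down (a sketch added for illustration, not part of the original run):

# one-row sanity check of mid, vwap and spread
bid, bid_sz, ask, ask_sz = 268.369995, 157.0, 268.380005, 41.0
mid = (ask + bid) / 2                                     # 268.375
vwap = (bid * bid_sz + ask * ask_sz) / (bid_sz + ask_sz)  # ~268.372068
spread = vwap - mid                                       # ~-0.00293: quotes are bid-heavy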

def normalise(dfn, window_length=60):
    # scale price-level features by the mid price window_length ticks earlier,
    # so they hover around 1 regardless of the absolute price level
    # (the original hardcoded 60 and ignored the window_length parameter)
    dfn['midN'] = dfn.mid / dfn.mid.shift(window_length)
    dfn['vwapN'] = dfn.vwap / dfn.mid.shift(window_length)
    dfn['HL'] = (dfn.high - dfn.low) / dfn.mid.shift(window_length)
    dfn = dfn[['midN', 'vwapN', 'HL', 'spread', 'v', 'mom', 'return', 'sigma']]
    return dfn
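
To make the scaling concrete, a toy sketch (illustrative only) showing that midN is just the gross return of mid over the window:

# toy example with a 2-tick window: values stay near 1 for slow-moving prices
s = pd.Series([100.0, 101.0, 102.0, 103.0])
print(s / s.shift(2))   # NaN, NaN, 1.0200, 1.0198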

In [7]:
filename = '/home/octo/Dropbox/SPY18Dec.csv'

In [8]:
data=get_csv_pd(filename)

In [9]:
data.tail()


Out[9]:
askPrice askSize bidPrice bidSize
2017-12-19 09:24:27.494064 268.380005 52.0 268.369995 132.0
2017-12-19 09:24:30.229113 268.380005 50.0 268.369995 132.0
2017-12-19 09:24:33.545713 268.380005 50.0 268.369995 152.0
2017-12-19 09:24:37.270081 268.380005 41.0 268.369995 152.0
2017-12-19 09:24:40.471072 268.380005 41.0 268.369995 157.0

In [10]:
baset=BA(data)

In [11]:
baset.tail()


Out[11]:
askPrice askSize bidPrice bidSize
2017-12-19 09:24:27.494064 268.380005 52.0 268.369995 132.0
2017-12-19 09:24:30.229113 268.380005 50.0 268.369995 132.0
2017-12-19 09:24:33.545713 268.380005 50.0 268.369995 152.0
2017-12-19 09:24:37.270081 268.380005 41.0 268.369995 152.0
2017-12-19 09:24:40.471072 268.380005 41.0 268.369995 157.0

In [14]:
data_=preprocessing(baset)

In [15]:
data_.tail()


Out[15]:
askPrice askSize bidPrice bidSize mid vwap spread v mom return sigma high low
2017-12-19 09:24:27.494064 268.380005 52.0 268.369995 132.0 268.375 268.372833 -0.002167 -0.01001 1.0 0.000037 0.002835 268.380005 268.369995
2017-12-19 09:24:30.229113 268.380005 50.0 268.369995 132.0 268.375 268.372742 -0.002258 -0.01001 0.5 0.000037 0.002810 268.380005 268.369995
2017-12-19 09:24:33.545713 268.380005 50.0 268.369995 152.0 268.375 268.372467 -0.002533 -0.01001 0.5 0.000037 0.002789 268.380005 268.369995
2017-12-19 09:24:37.270081 268.380005 41.0 268.369995 152.0 268.375 268.372131 -0.002869 -0.01001 0.0 0.000037 0.002772 268.380005 268.369995
2017-12-19 09:24:40.471072 268.380005 41.0 268.369995 157.0 268.375 268.372070 -0.002930 -0.01001 0.0 0.000037 0.002745 268.380005 268.369995

In [16]:
dfn=normalise(data_).dropna()

In [17]:
data = data_.dropna()
data = data[['mid', 'vwap', 'spread', 'v', 'mom', 'return', 'sigma', 'high', 'low']]

In [18]:
data.tail()


Out[18]:
mid vwap spread v mom return sigma high low
2017-12-19 09:24:27.494064 268.375 268.372833 -0.002167 -0.01001 1.0 0.000037 0.002835 268.380005 268.369995
2017-12-19 09:24:30.229113 268.375 268.372742 -0.002258 -0.01001 0.5 0.000037 0.002810 268.380005 268.369995
2017-12-19 09:24:33.545713 268.375 268.372467 -0.002533 -0.01001 0.5 0.000037 0.002789 268.380005 268.369995
2017-12-19 09:24:37.270081 268.375 268.372131 -0.002869 -0.01001 0.0 0.000037 0.002772 268.380005 268.369995
2017-12-19 09:24:40.471072 268.375 268.372070 -0.002930 -0.01001 0.0 0.000037 0.002745 268.380005 268.369995

In [19]:
dfn.tail()


Out[19]:
midN vwapN HL spread v mom return sigma
2017-12-19 09:24:27.494064 0.999963 0.999955 0.000037 -0.002167 -0.01001 1.0 0.000037 0.002835
2017-12-19 09:24:30.229113 0.999963 0.999954 0.000037 -0.002258 -0.01001 0.5 0.000037 0.002810
2017-12-19 09:24:33.545713 0.999963 0.999953 0.000037 -0.002533 -0.01001 0.5 0.000037 0.002789
2017-12-19 09:24:37.270081 0.999963 0.999952 0.000037 -0.002869 -0.01001 0.0 0.000037 0.002772
2017-12-19 09:24:40.471072 0.999963 0.999952 0.000037 -0.002930 -0.01001 0.0 0.000037 0.002745

In [20]:
plt.plot(dfn.midN)
plt.show()



In [21]:
dataset = dfn.values
dataset = dataset.astype('float32')

In [22]:
# fix random seed for reproducibility
numpy.random.seed(7)

In [23]:
# split into train and test sets (chronological, unshuffled, as time-series data requires)
train_size = int(len(dataset) * 0.80)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]
print(len(train), len(test))


30352 7588
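
(Sanity check: len(dataset) = 30352 + 7588 = 37940 rows, and int(37940 * 0.80) = 30352, so the 80/20 split is exact.)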

In [24]:
# convert an array of values into a dataset matrix:
# X = feature columns 1..7 (vwapN, HL, spread, v, mom, return, sigma) over a look_back window,
# Y = target column 0 (midN) over the same window
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back - 1):
        dataX.append(dataset[i:(i + look_back), 1:8])
        dataY.append(dataset[i:(i + look_back), 0])
    return numpy.array(dataX), numpy.array(dataY)
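
A minimal shape check of create_dataset (a sketch for illustration, assuming the 8-column layout of dfn):

# 10 rows x 8 columns, look_back=1 -> len - look_back - 1 = 8 samples
toy = numpy.arange(80, dtype='float32').reshape(10, 8)
X, Y = create_dataset(toy, look_back=1)
print(X.shape, Y.shape)   # (8, 1, 7) (8, 1)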

In [25]:
# build supervised samples: X = feature window, Y = midN over the same window
look_back = 1
trainX, trainY = create_dataset(train, look_back)
testX, testY = create_dataset(test, look_back)

In [26]:
# trainX/testX already have shape [samples, time steps, features]; this reshape is a no-op
trainX = numpy.reshape(trainX, (trainX.shape[0], trainX.shape[1], trainX.shape[2]))
testX = numpy.reshape(testX, (testX.shape[0], testX.shape[1], testX.shape[2]))

In [27]:
epochs=3
batch_size=3

In [28]:
# create and fit the LSTM network
model = Sequential()
model.add(LSTM(3, input_shape=(look_back, 7)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
# pass batch_size and nb_epoch by keyword: passed positionally, Keras reads the
# third argument as batch_size, so epochs and batch_size silently swap
model.fit(trainX, trainY, batch_size=batch_size, nb_epoch=epochs, verbose=2)


Epoch 1/3
13s - loss: 0.0175
Epoch 2/3
13s - loss: 2.4827e-05
Epoch 3/3
12s - loss: 9.7398e-05
Out[28]:
<keras.callbacks.History at 0x7f1fd47c4cc0>

In [29]:
# make predictions
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)

In [30]:
trainPredict.shape


Out[30]:
(30350, 1)
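
(30350 = len(train) - look_back - 1 = 30352 - 2: create_dataset leaves the last two rows out when building windows.)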

In [31]:
trainPredict[3:]


Out[31]:
array([[ 0.99937874],
       [ 0.99938196],
       [ 0.99937814],
       ..., 
       [ 0.99939966],
       [ 0.99940145],
       [ 0.99941647]], dtype=float32)
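
The mean_squared_error and math imports at the top are never used in the original run; a sketch of how they could score these predictions on the normalised scale:

# RMSE of the normalised mid-price predictions (sketch, not part of the original run)
trainScore = math.sqrt(mean_squared_error(trainY[:, 0], trainPredict[:, 0]))
testScore = math.sqrt(mean_squared_error(testY[:, 0], testPredict[:, 0]))
print('Train RMSE: %.6f, Test RMSE: %.6f' % (trainScore, testScore))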

In [32]:
# shift train predictions for plotting
trainPredictPlot = numpy.empty_like(dataset)
trainPredictPlot[:, :] = numpy.nan
trainPredictPlot[look_back:len(trainPredict)+look_back, :] = trainPredict
# shift test predictions for plotting
testPredictPlot = numpy.empty_like(dataset)
testPredictPlot[:, :] = numpy.nan
testPredictPlot[len(trainPredict)+(look_back*2)+1:len(dataset)-1, :] = testPredict
# plot baseline and predictions
#plt.plot(scaler.inverse_transform(dataset))
plt.plot(dataset[:,0])
plt.plot(trainPredictPlot)
plt.plot(testPredictPlot)
plt.show()



In [ ]:


In [59]:
def build_model(layers):
    # layers = [input features, LSTM-1 units, LSTM-2 units, output units]
    # (input_dim/output_dim are the Keras 1.x argument names used throughout this notebook)
    model = Sequential()

    model.add(LSTM(
        input_dim=layers[0],
        output_dim=layers[1],
        return_sequences=True))
    model.add(Dropout(0.2))

    model.add(LSTM(
        layers[2],
        return_sequences=False))
    model.add(Dropout(0.2))

    model.add(Dense(
        output_dim=layers[3]))
    model.add(Activation("linear"))

    start = time.time()
    model.compile(loss="mse", optimizer="rmsprop")
    print("Compilation Time : ", time.time() - start)
    return model
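
For reference, a sketch of the same stack with Keras 2 argument names (an assumed newer API, not what this notebook ran):

# equivalent architecture in Keras 2 style (sketch; the notebook itself uses Keras 1.x)
def build_model_k2(layers):
    model = Sequential()
    model.add(LSTM(layers[1], input_shape=(None, layers[0]), return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(layers[2], return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(layers[3], activation='linear'))
    model.compile(loss='mse', optimizer='rmsprop')
    return model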

In [64]:
model = build_model([7, 50, 100, 1])


Compilation Time :  0.030323028564453125

In [65]:
model.fit(trainX, trainY, batch_size=512, nb_epoch=3, validation_split=0.05)


Train on 16951 samples, validate on 893 samples
Epoch 1/3
16951/16951 [==============================] - 2s - loss: 0.4174 - val_loss: 0.0480
Epoch 2/3
16951/16951 [==============================] - 0s - loss: 0.0171 - val_loss: 5.7629e-05
Epoch 3/3
16951/16951 [==============================] - 0s - loss: 0.0083 - val_loss: 2.1168e-04
Out[65]:
<keras.callbacks.History at 0x7efe3720b978>
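
(The 16951 + 893 = 17844 samples here no longer match the 30350-row trainX built above; the shape (17844, 1, 7) in the ValueError further down suggests trainX was rebuilt from a smaller dataset in unshown cells between In [32] and In [59].)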

In [66]:
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)

In [67]:
def plot_results_multiple(predicted_data, true_data, prediction_len):
    fig = plt.figure(facecolor='white')
    ax = fig.add_subplot(111)
    ax.plot(true_data, label='True Data')
    # pad each prediction sequence with Nones so it starts at its correct x position
    # (the original computed padding but never used it when plotting)
    for i, data in enumerate(predicted_data):
        padding = [None for p in range(i * prediction_len)]
        plt.plot(padding + list(data), label='Prediction')
        plt.legend()
    plt.show()

In [68]:
# shift train predictions for plotting
trainPredictPlot = numpy.empty_like(dataset)
trainPredictPlot[:, :] = numpy.nan
trainPredictPlot[look_back:len(trainPredict)+look_back, :] = trainPredict
# shift test predictions for plotting
testPredictPlot = numpy.empty_like(dataset)
testPredictPlot[:, :] = numpy.nan
testPredictPlot[len(trainPredict)+(look_back*2)+1:len(dataset)-1, :] = testPredict
# plot baseline and predictions
#plt.plot(scaler.inverse_transform(dataset))
plt.plot(dataset[:,0])
#plt.plot(trainPredictPlot)
plt.plot(testPredictPlot)
plt.show()



In [72]:
plot_results_multiple(testPredict,testY,10000)


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-72-2b5077904fb8> in <module>()
----> 1 plot_results_multiple(testPredict,testY,10000)

<ipython-input-70-99104dfe57d9> in plot_results_multiple(predicted_data, true_data, prediction_len)
      5     #Pad the list of predictions to shift it in the graph to it's correct start
      6     for i, data in enumerate(predicted_data):
----> 7         padding = [None for p in range(i * prediction_len)]
      8         plt.plot(data, label='Prediction')
      9         plt.legend()

<ipython-input-70-99104dfe57d9> in <listcomp>(.0)
      5     #Pad the list of predictions to shift it in the graph to it's correct start
      6     for i, data in enumerate(predicted_data):
----> 7         padding = [None for p in range(i * prediction_len)]
      8         plt.plot(data, label='Prediction')
      9         plt.legend()

KeyboardInterrupt: 

(The call hangs because testPredict has one row per tick, so the loop runs ~7,586 times, allocating a padding list of i * 10000 Nones on each pass; plot_results_multiple expects a short list of multi-step prediction sequences, not per-tick one-step output.)

In [77]:
# create model
model = Sequential()
model.add(Dense(3, input_dim=7, init='uniform', activation='relu'))
model.add(Dense(3, init='uniform', activation='relu'))
model.add(Dense(1, init='uniform', activation='sigmoid'))
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the model
model.fit(trainX, trainY, nb_epoch=7, batch_size=10)
# evaluate the model
scores = model.evaluate(testX, testY)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-77-ef8fa457be71> in <module>()
      7 model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
      8 # Fit the model
----> 9 model.fit(trainX,trainY, nb_epoch=7, batch_size=10)
     10 # evaluate the model
     11 scores = model.evaluate(testX,testY)

/home/octo/anaconda2/envs/carnd-term1/lib/python3.5/site-packages/keras/models.py in fit(self, x, y, batch_size, nb_epoch, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, **kwargs)
    669                               class_weight=class_weight,
    670                               sample_weight=sample_weight,
--> 671                               initial_epoch=initial_epoch)
    672 
    673     def evaluate(self, x, y, batch_size=32, verbose=1,

/home/octo/anaconda2/envs/carnd-term1/lib/python3.5/site-packages/keras/engine/training.py in fit(self, x, y, batch_size, nb_epoch, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch)
   1076             class_weight=class_weight,
   1077             check_batch_axis=False,
-> 1078             batch_size=batch_size)
   1079         # prepare validation data
   1080         if validation_data:

/home/octo/anaconda2/envs/carnd-term1/lib/python3.5/site-packages/keras/engine/training.py in _standardize_user_data(self, x, y, sample_weight, class_weight, check_batch_axis, batch_size)
    989                                    self.internal_input_shapes,
    990                                    check_batch_axis=False,
--> 991                                    exception_prefix='model input')
    992         y = standardize_input_data(y, self.output_names,
    993                                    output_shapes,

/home/octo/anaconda2/envs/carnd-term1/lib/python3.5/site-packages/keras/engine/training.py in standardize_input_data(data, names, shapes, check_batch_axis, exception_prefix)
    110                                  ' to have ' + str(len(shapes[i])) +
    111                                  ' dimensions, but got array with shape ' +
--> 112                                  str(array.shape))
    113             for j, (dim, ref_dim) in enumerate(zip(array.shape, shapes[i])):
    114                 if not j and not check_batch_axis:

ValueError: Error when checking model input: expected dense_input_5 to have 2 dimensions, but got array with shape (17844, 1, 7)

(A Dense-only network expects 2-D input, but trainX still carries the [samples, time steps, features] shape built for the LSTMs; the singleton time-step axis has to be flattened away first.)
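
A sketch of one way to fix it (illustrative assumptions: flatten the window axis, and since trainY is a normalised price near 1.0 rather than a binary label, swap the sigmoid/binary_crossentropy head for a linear/MSE one):

# flatten (samples, 1, 7) windows to (samples, 7) for the Dense layers
trainX2d = trainX.reshape(trainX.shape[0], -1)
testX2d = testX.reshape(testX.shape[0], -1)
model = Sequential()
model.add(Dense(3, input_dim=7, init='uniform', activation='relu'))
model.add(Dense(3, init='uniform', activation='relu'))
# trainY is a normalised price, so regress with a linear head and MSE
model.add(Dense(1, init='uniform', activation='linear'))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(trainX2d, trainY, nb_epoch=7, batch_size=10)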

In [ ]:

