BS (buy/sell) can be predicted with 85% accuracy
In [1]:
    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import glob
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_model import ARIMAResults
import pickle
#from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
    
    
In [2]:
    
# loading csv file
def get_csv_pd(path):
    #spy_pd=pd.read_csv('C:\\Users\Michal\Dropbox\IB_data\SPY.csv',sep=' ',names=['askPrice','askSize','bidPrice','bidSize'],index_col=0,parse_dates=True)
    #spy_pd=pd.read_csv(path+'\SPY.csv',sep=',',names=['askPrice','askSize','bidPrice','bidSize'],index_col=0,parse_dates=True)
    spy_pd=pd.read_csv(path,sep=',',dtype={'askPrice':np.float32,'askSize':np.float32,
                                           'bidPrice':np.float32,'bidSize':np.float32},index_col=0,parse_dates=True)
    #spy_pd = pd.read_csv(path, usecols=['askPrice','askSize','bidPrice','bidSize'], engine='python', skipfooter=3)
    return spy_pd
def preprocessing(df):
    df.bidPrice=df.loc[:,'bidPrice'].replace(to_replace=0, method='ffill')
    df.bidSize=df.loc[:,'bidSize'].replace(to_replace=0, method='ffill')
    df.askPrice=df.loc[:,'askPrice'].replace(to_replace=0, method='ffill')
    df.askSize=df.loc[:,'askSize'].replace(to_replace=0, method='ffill')
    df=df.dropna()
    # drop remaining zero/outlier prices (anything more than one std below the mean)
    df=df[df['bidPrice']>df.bidPrice.mean()-df.bidPrice.std()]
    df=df[df['askPrice']>df.askPrice.mean()-df.askPrice.std()]
    df['mid']=(df.askPrice+df.bidPrice)/2  # mid price
    df['vwap']=((df.loc[:,'bidPrice']*df.loc[:,'bidSize'])+(df.loc[:,'askPrice']*df.loc[:,'askSize']))/(df.loc[:,'bidSize']+df.loc[:,'askSize'])  # size-weighted quote price
    df['spread']=df.vwap-df.mid  # vwap minus mid
    df['v']=(df.mid-df.mid.shift(60))  # 60-tick price change ("velocity")
    df['mom']=np.where(np.logical_and((df.mid-df.mid.shift(12))!=0,df.v!=0),(df.mid-df.mid.shift(12))/df.v,0)  # 12-tick move scaled by velocity
    df['return']=(df.askPrice/df.bidPrice.shift(1))-1  # one-tick ask/bid return
    #df['ret'] = np.log(df.Close/df.Close.shift(1))
    df['sigma']=df.spread.rolling(60).std()  # rolling volatility of the spread
    #df['sigma']=df.Close.rolling(5).std()
    df['high']=df.askPrice.rolling(5).max()  # rolling 5-tick high of the ask
    df['low']=df.bidPrice.rolling(5).min()  # rolling 5-tick low of the bid
    
    #df['mom']=np.where(np.logical_and(df.vel_c==1,df.Close>df.price),1,np.where(np.logical_and(df.vel_c==-1,df.Close<df.price),-1,0))
    #flagD=np.logical_and(np.logical_and(df.Close.shift(10)<df.Close.shift(15),df.Close.shift(15)< df.Close.shift(20)),df.Close< df.Close.shift(10))
    #flagU=np.logical_and(np.logical_and(df.Close.shift(15)>df.Close.shift(20),df.Close.shift(10)> df.Close.shift(15)),df.Close> df.Close.shift(10))
    #df['UD']= np.where(flagU,-1,np.where(flagD,1,0))
    
    #df['P']=(df.High+df.Low+df.Close)/3
    #df['UT']=(pd.rolling_max(df.High,60)+pd.rolling_max(df.P+df.High-df.Low,60))*0.5
    #df['DT']=(pd.rolling_min(df.Low,60)+pd.rolling_min(df.P+df.High-df.Low,60))*0.5
    #df['BA']=np.where(df.Close<=df.DT,-1,np.where(df.Close>=df.UT,1,0))# below or above
    return df
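
A minimal, hypothetical smoke test of preprocessing on synthetic quotes (the toy frame and its column names are assumptions, chosen to mimic the CSV schema above); the rolling/shift features need at least ~60 rows before they stop being NaN:

# hypothetical synthetic top-of-book quotes, one row per second
rng = pd.date_range('2017-12-07 09:30:00', periods=200, freq='S')
mid0 = 248.0 + np.random.randn(200).cumsum() * 0.01
toy = pd.DataFrame({'askPrice': mid0 + 0.01,
                    'askSize': np.random.randint(1, 50, 200).astype(np.float32),
                    'bidPrice': mid0 - 0.01,
                    'bidSize': np.random.randint(1, 50, 200).astype(np.float32)},
                   index=rng)
toy = preprocessing(toy)
print(toy[['mid', 'vwap', 'spread', 'v', 'mom', 'return', 'sigma']].dropna().head())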
    
In [ ]:
    
'''
def normalise(df,window_length=60):
    dfn=(df-df.rolling(window_length).min())/(df.rolling(window_length).max()-df.rolling(window_length).min())
    return dfn
def de_normalise(data,df,window_length=60):
    dn=(df*(data.rolling(window_length).max()-data.rolling(window_length).min()))+data.rolling(window_length).min()
    return dn
#https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]
##### ARIMA        
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_model import ARIMAResults
        
###ARIMA preprocessing
def arima_processing(df):
    #data=df[['vwap','mid']]
    df=df.dropna()
    df['Lvwap']=np.log(df.vwap)
    df['Lmid']=np.log(df.mid)
    df['LDvwap']=df.Lvwap-df.Lvwap.shift(60)
    df['LDmid']=df.Lmid-df.Lmid.shift(60)
    df=df.dropna()
    return df   
###Model is already saved from "/Dropbox/DataScience/ARIMA_model_saving.ipynb". Here loaded and added to "df_ml"
def ARIMA_(data):
    ### load model
    data=data.dropna()
    predictions_mid=ARIMA_mid(data.LDmid)
    predictions_vwap=ARIMA_vwap(data.LDvwap) 
    vwap_arima=np.exp(predictions_vwap+data.Lvwap.shift(60))
    mid_arima=np.exp(predictions_mid+data.Lmid.shift(60))
    df_ml['arima']=data.mid+vwap_arima-mid_arima
    
def ARIMA_mid(data):
    ### load model
    mid_arima_loaded = ARIMAResults.load('mid_arima.pkl')
    predictions_mid = mid_arima_loaded.predict()
    return predictions_mid
def ARIMA_vwap(data):
    ### load model
    vwap_arima_loaded = ARIMAResults.load('vwap_arima.pkl')
    predictions_vwap = vwap_arima_loaded.predict()
    return predictions_vwap
#### KALMAN moving average
##KF moving average
#https://github.com/pykalman/pykalman
# Import a Kalman filter and other useful libraries
from pykalman import KalmanFilter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import poly1d
def kalman_ma(data):
    #x=data.mid
    x=data.mid
    # Construct a Kalman filter
    kf = KalmanFilter(transition_matrices = [1],
                  observation_matrices = [1],
                  initial_state_mean = 248,
                  initial_state_covariance = 1,
                  observation_covariance=1,
                  transition_covariance=.01)
    # Use the observed values of the price to get a rolling mean
    state_means, _ = kf.filter(x.values)
    state_means = pd.Series(state_means.flatten(), index=x.index)
    df_ml['km']=state_means
### Linear Regression, sklearn, svm:SVR,linear_model
import pickle
#from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
## loading model saved from /Dropbox/DataScience/REG_model_saving.ipynb
filename_rgr = 'rgr.sav'
filename_svr = 'svr.sav'
# load the model from disk
loaded_rgr_model = pickle.load(open(filename_rgr, 'rb'))
loaded_svr_model = pickle.load(open(filename_svr, 'rb'))
def strat_lr(data,df):
    df=df.dropna()
    data=data.dropna()
    X=df[['askPrice','askSize','bidPrice','bidSize','vwap','spread','v','return','sigma']]
    y=df.mid
    predict_regr=loaded_rgr_model.predict(X)
    predict_svr=loaded_svr_model.predict(X)
    df['predict_regr']=predict_regr
    df['predict_svr']=predict_svr
    df_ml['REG']=de_normalise(data.mid,df.predict_regr)
    df_ml['SVR']=de_normalise(data.mid,df.predict_svr)
    
#### loading classification model from /Dropbox/DataScience/ML_20Sep
filename_svm_model_up = 'svm_model_up.sav'
filename_lm_model_up = 'lm_model_up.sav'
filename_svm_model_dn = 'svm_model_dn.sav'
filename_lm_model_dn = 'lm_model_dn.sav'
# load the model from disk
loaded_svm_up_model = pickle.load(open(filename_svm_model_up, 'rb'))
loaded_lm_up_model = pickle.load(open(filename_lm_model_up, 'rb'))
loaded_svm_dn_model = pickle.load(open(filename_svm_model_dn, 'rb'))
loaded_lm_dn_model = pickle.load(open(filename_lm_model_dn, 'rb'))
def classification_up_dn(data):
    X=data[['askPrice','askSize','bidPrice','bidSize','vwap','spread','v','return','sigma']]
    y1=data.U
    y2=data.D
    
    
    predict_svm_up=loaded_svm_up_model.predict(X)
    predict_lm_up=loaded_lm_up_model.predict(X)
    predict_svm_dn=loaded_svm_dn_model.predict(X)
    predict_lm_dn=loaded_lm_dn_model.predict(X)
    
    data['predict_svm_up']=predict_svm_up
    data['predict_lm_up']=predict_lm_up
    data['predict_svm_dn']=predict_svm_dn
    data['predict_lm_dn']=predict_lm_dn
    
    data['predict_svm']=data.predict_svm_up+data.predict_svm_dn
    data['predict_lm']=data.predict_lm_up+data.predict_lm_dn
    
    data['UD']=np.where(np.logical_and(data.predict_svm>0,data.predict_lm>0),1,np.where(np.logical_and(data.predict_svm<0,data.predict_lm<0),-1,0))  
       
    df_ml['UD']=data.UD
### LSTM
#df.loc[:, cols].prod(axis=1)
def lstm_processing(df):
    df=df.dropna()
    df_price=df[['mid','vwap','arima','km','REG','SVR']]
    #normalization
    dfn=normalise(df_price,12)
    dfn['UD']=df.UD
    return dfn
import numpy
import matplotlib.pyplot as plt
import pandas
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import numpy
import matplotlib.pyplot as plt
import pandas
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from keras.models import load_model
model = load_model('21sep.h5')
# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back), 0]
        b = dataset[i:(i+look_back), 1]
        c = dataset[i:(i+look_back), 2]
        d = dataset[i:(i+look_back), 3]
        e=  dataset[i:(i+look_back), 4]
        f = dataset[i:(i+look_back), 5]
        g=  dataset[i:(i+look_back), 6]
        dataX.append(np.c_[b,c,d,e,f,g])
        #dataX.append(b)
        #dataX.append(c)
        #dataX.append(d)
        #dataX.append(e)
        #dataX.concatenate((a,bT,cT,dT,eT),axis=1)
        dataY.append(dataset[i + look_back,0])
    return np.array(dataX), np.array(dataY)
def strat_LSTM(df_ml):
    
    #normalization
    df_lstm=lstm_processing(df_ml)
    df_lstm=df_lstm.dropna()
    dataset=df_lstm.values
    dataset = dataset.astype('float32')
    # reshape into X=t and Y=t+1
    look_back = 3
    X_,Y_ = create_dataset(dataset,look_back)
    
    # reshape input to be [samples, time steps, features]
    X_ = numpy.reshape(X_, (X_.shape[0],X_.shape[1],X_.shape[2]))
    # make predictions
    predict = model.predict(X_)
    df_lstm=df_lstm.tail(len(predict))
    df_lstm['LSTM']=predict
    #LSTM=(df_lstm.LSTM*(df_ml.mid.rolling(60).max()-df_ml.midClose.rolling(60).min()))+df_LSTM.Close.rolling(60).min()
    LSTM=de_normalise(df_ml.mid,df_lstm.LSTM,window_length=12)
    df_lstm['pred']=LSTM
    df_lstm=df_lstm.dropna()
    df_lstm=df_lstm.tail(len(df_ml))
    df_ml['LSTM']=df_lstm.pred
    '''
    
In [ ]:
    
'''
#https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]
'''
    
In [3]:
    
filename = '/home/octo/Dropbox'+ '/SPY7Dec.csv'
    
In [4]:
    
data=get_csv_pd(filename)
    
In [5]:
    
data=preprocessing(data)
    
In [6]:
    
df=data.dropna()
df=df[['mid','vwap','spread','v','mom','return','sigma','high','low',]]
    
In [7]:
    
# split into train and test sets
train_size = int(len(df) * 0.80)
test_size = len(df) - train_size
train= df[0:train_size]
test= df[train_size:len(df)]
print(len(train), len(test))
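
The split above is chronological (no shuffling), which is appropriate for tick data. As an optional check, scikit-learn's TimeSeriesSplit gives a walk-forward version of the same idea; a hedged sketch (the number of splits is arbitrary):

from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits=5)
for fold, (tr_idx, te_idx) in enumerate(tscv.split(df)):
    print(fold, len(tr_idx), len(te_idx))  # growing train window, fixed-size test window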
    
    
In [8]:
    
train_X=train[['mid','vwap','spread','v','return','sigma','high','low',]]
train_y=train['mom']
test_X=test[['mid','vwap','spread','v','return','sigma','high','low',]]
test_y=test['mom']
    
In [9]:
    
train_X.head()
    
    Out[9]:
In [10]:
    
test_y.head()
    
    Out[10]:
In [11]:
    
from sklearn import linear_model
regr = linear_model.LinearRegression()
#regr.fit(X.tail(20),y.tail(20))
#predict=regr.predict(X.tail(5))
regr.fit(train_X,train_y)
predict=regr.predict(test_X)
#X=X.dropna()
#y=y.dropna()
#y[y == inf] = 0
dt=test[['mid']]
dt['predict']=predict
#dt['predict']=dt.mid+dt.mid*dt.predict
# mom was defined as (mid - mid.shift(12)) / v, so the predicted mom is mapped back to a price level:
dt['predict']=dt.predict*test.v+test.mid.shift(12)
pdf=test
pdf['pREG']=dt.predict
    
    
In [12]:
    
pdf.tail()
    
    Out[12]:
In [13]:
    
from sklearn.svm import SVR
# Fit regression model
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.9) #kernel='linear' #kernel='poly'
predict_svr = svr_rbf.fit(train_X,train_y).predict(test_X)
dt1=test[['mid']]
dt1['predict']=predict_svr
dt1['predict']=dt1.predict*test.v+test.mid.shift(12)
pdf['pSVR']=dt1.predict
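
A quick hedged check of how far the reconstructed prices sit from the actual mid on the test set (mean_squared_error is imported here just for this sketch; rows made NaN by the shift are dropped first):

from sklearn.metrics import mean_squared_error

chk = pdf[['mid', 'pREG', 'pSVR']].dropna()
print('REG MSE:', mean_squared_error(chk['mid'], chk['pREG']))
print('SVR MSE:', mean_squared_error(chk['mid'], chk['pSVR']))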
    
    
In [14]:
    
pdf.dropna().head()
    
    Out[14]:
In [15]:
    
pdf[['mid','high','low','pREG','pSVR']].tail(300).plot(figsize=(15,9))
#df[['Volume']].tail(5000).plot(figsize=(15,9))
#data[['AvgVolume']].tail(5000).plot(figsize=(15,9))
plt.show()
    
    
In [16]:
    
# look at the results
plt.scatter(pdf['mid'],test_y, c='k', label='data')
# plt.hold() was removed from matplotlib; axes hold by default
plt.plot(pdf['pREG'],test_y, c='g', label='pREG')
#plt.plot(pdf['pSVR'], y, c='g', label='pSVR')
plt.xlabel('data')
plt.ylabel('target')
plt.title('Regression predictions vs. target')
plt.legend()
plt.show()
    
    
In [17]:
    
# look at the results
plt.scatter(pdf['mid'],pdf['pREG'], c='k', label='pREG')
plt.plot(pdf['mid'],pdf['pSVR'], c='g', label='pSVR')
plt.plot(pdf['mid'], pdf['high'], c='b', label='high')
plt.xlabel('mid')
plt.ylabel('prediction')
plt.title('Predicted vs. actual mid')
plt.legend()
plt.show()
    
    
    https://www.quantopian.com/posts/some-code-from-ernie-chans-new-book-implemented-in-python
    http://auquan.com/cointegration-stationarity/
    https://www.analyticsvidhya.com/blog/2016/02/time-series-forecasting-codes-python/
    http://machinelearningmastery.com/time-series-forecast-case-study-python-monthly-armed-robberies-boston/
    https://www.quantstart.com/articles/Basics-of-Statistical-Mean-Reversion-Testing-Part-II
    https://datascience.ibm.com/exchange/public/entry/view/815137c868b916821dec777bdc23013c
    http://machinelearningmastery.com/time-series-data-stationary-python/
In [18]:
    
X=df[['mid','vwap','spread','v','return','sigma','high','low',]]
y=df['mom']
    
In [19]:
    
len(df)
    
    Out[19]:
In [20]:
    
from sklearn import linear_model
regr = linear_model.LinearRegression()
#regr.fit(X.tail(20),y.tail(20))
#predict=regr.predict(X.tail(5))
regr.fit(X.dropna(),y.dropna())
predict=regr.predict(X)
#X=X.dropna()
#y=y.dropna()
#y[y == inf] = 0
dt=df[['mid']]
dt['predict']=predict
#dt['predict']=dt.mid+dt.mid*dt.predict
dt['predict']=dt.predict*df.v+df.mid.shift(12)
classify_df=df
classify_df['pREG']=dt.predict
from sklearn.svm import SVR
# Fit regression model
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.9) #kernel='linear' #kernel='poly'
predict_svr = svr_rbf.fit(X, y).predict(X)
dt1=df[['mid']]
dt1['predict']=predict_svr
dt1['predict']=dt1.predict*df.v+df.mid.shift(12)
classify_df['pSVR']=dt1.predict
    
    
In [21]:
    
def classification(df):
    mid1=(df.high+df.low)/2
    #flagUD=np.where(np.logical_and(df.mid>df.pREG,df.mid>df.pSVR),1,np.where(np.logical_and(df.mid<df.pREG,df.mid<df.pSVR),-1,0))
    #df['UD']= np.where(np.logical_and(df.mid>mid1,flagUD==1),1,np.where(np.logical_and(df.mid<mid1,flagUD==-1),-1,0))
    flagUD=np.where(np.logical_and(df.mid>df.pREG,df.mid>df.pSVR),1,np.where(np.logical_and(df.mid<df.pREG,df.mid<df.pSVR),-1,0))
    UD= np.where(np.logical_and(df.mid>mid1,flagUD==1),1,np.where(np.logical_and(df.mid<mid1,flagUD==-1),-1,0))
    df['U']= np.where(UD==1,1,0)
    df['D']= np.where(UD==-1,-1,0)
    df['UD']=df.U+df.D
    return df
    
In [22]:
    
data_class=classification(classify_df)
data_class=data_class.dropna()
df=df.dropna()
# both df and data_class have U,D,UD
    
In [23]:
    
data_class.head()
    
    Out[23]:
In [24]:
    
df.head()
    
    Out[24]:
In [25]:
    
# split into train and test sets
train_size = int(len(data_class) * 0.80)
test_size = len(data_class) - train_size
train= data_class[0:train_size]
test= data_class[train_size:len(data_class)]
print(len(train), len(test))
    
    
In [26]:
    
train_X=train[['mid','vwap','spread','v','return','sigma','high','low','mom','pREG','pSVR']]
train_y=train['UD']
test_X=test[['mid','vwap','spread','v','return','sigma','high','low','mom','pREG','pSVR']]
test_y=test['UD']
train_U=train['U']
test_U=test['U']
train_D=train['D']
test_D=test['D']
    
In [27]:
    
print(len(train_U), len(test_U))
    
    
In [28]:
    
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
    
In [29]:
    
model = LogisticRegression()
model.fit(train_X,train_U)
print(model)
    
    
In [30]:
    
# make predictions
expected =test_U
predicted = model.predict(test_X)
    
In [31]:
    
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
    
    
    
In [32]:
    
model = LogisticRegression()
model.fit(train_X,train_D)
print(model)
    
    
In [33]:
    
# make predictions
expected =test_D
predicted = model.predict(test_X)
    
In [34]:
    
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
    
    
    
In [35]:
    
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
    
In [36]:
    
# fit a CART model to the data
model = DecisionTreeClassifier()
model.fit(train_X,train_U)
print(model)
    
    
In [37]:
    
# make predictions
expected =test_U
predicted = model.predict(test_X)
    
In [38]:
    
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
    
    
In [39]:
    
accuracy =model.score(test_X,test_U) 
accuracy
    
    Out[39]:
In [40]:
    
# fit a CART model to the data
model = DecisionTreeClassifier()
model.fit(train_X,train_D)
print(model)
    
    
In [41]:
    
# make predictions
expected =test_D
predicted = model.predict(test_X)
    
In [42]:
    
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
    
    
In [43]:
    
accuracy =model.score(test_X,test_D) 
accuracy
    
    Out[43]:
The random forest algorithm can find nonlinearities in data that a linear regression wouldn’t be able to pick up on.
In [44]:
    
# Import the random forest model.
from sklearn.ensemble import RandomForestRegressor
    
In [45]:
    
# Initialize the model with some parameters.
model = RandomForestRegressor(n_estimators=100, min_samples_leaf=10, random_state=1)
# Fit the model to the data.
model.fit(train_X,train_U)
print(model)
    
    
In [46]:
    
# Make predictions.
expected=test_U
predicted = model.predict(test_X)
    
In [47]:
    
# note: for RandomForestRegressor, .score() returns R^2, not classification accuracy
r2 = model.score(test_X,test_U)
r2
    
    Out[47]:
In [48]:
    
from sklearn.ensemble import RandomForestClassifier
    
In [49]:
    
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
clf.fit(train_X,train_U)
accuracy = clf.score(test_X,test_U)
print(accuracy)
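
To get a rough sense of which engineered features the forest leans on, a hedged sketch using the clf fitted in the cell above:

importances = pd.Series(clf.feature_importances_, index=train_X.columns)
print(importances.sort_values(ascending=False))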
    
    
In [50]:
    
from sklearn import neighbors
    
In [51]:
    
clf = neighbors.KNeighborsClassifier()
clf.fit(train_X,train_U)
    
    Out[51]:
In [52]:
    
accuracy = clf.score(test_X,test_U)
print(accuracy)
    
    
In [53]:
    
from sklearn.ensemble import AdaBoostClassifier
    
In [54]:
    
clf = AdaBoostClassifier()
clf.fit(train_X,train_U)
accuracy = clf.score(test_X,test_U)
print(accuracy)
    
    
In [55]:
    
from sklearn.ensemble import GradientBoostingClassifier
    
In [56]:
    
clf = GradientBoostingClassifier(n_estimators=100)
clf.fit(train_X,train_U)
accuracy = clf.score(test_X,test_U)
print(accuracy)
    
    
In [57]:
    
# sklearn.qda was removed in newer scikit-learn; QuadraticDiscriminantAnalysis is the current name
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
    
    
In [58]:
    
clf = QDA()
clf.fit(train_X,train_U)
accuracy = clf.score(test_X,test_U)
print(accuracy)
    
    
    
SVM (Support Vector Machines) is one of the most popular machine learning algorithms, used mainly for classification problems. Like logistic regression, SVM supports multi-class classification via the one-vs-all (one-vs-rest) method, as sketched below.
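
A hedged sketch of the one-vs-all idea on the three-class UD label, using scikit-learn's OneVsRestClassifier wrapper around the same SVC estimator (nothing here is tuned):

from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

ovr = OneVsRestClassifier(SVC())
ovr.fit(train_X, train_y)   # train_y is the three-class UD label (-1/0/1)
print(ovr.score(test_X, test_y))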
In [59]:
    
from sklearn import metrics
from sklearn.svm import SVC
    
In [60]:
    
# fit a SVM model to the data
model = SVC()
model.fit(train_X,train_U)
print(model)
    
    
In [61]:
    
# Make predictions.
expected=test_U
predicted = model.predict(test_X)
    
In [62]:
    
accuracy =model.score(test_X,test_U) 
accuracy
    
    Out[62]:
In [63]:
    
# fit a SVM model to the data
model = SVC()
model.fit(train_X,train_D)
print(model)
    
    
In [64]:
    
# Make predictions.
expected=test_D
predicted = model.predict(test_X)
    
In [65]:
    
accuracy =model.score(test_X,test_D) 
accuracy
    
    Out[65]:
In [ ]:
    
# optional: persist the fitted classifier (assumes `savemodel`, `fout` and `clf` are defined)
from datetime import datetime
import pickle

if savemodel:
    fname_out = '{}-{}.pickle'.format(fout, datetime.now())
    with open(fname_out, 'wb') as f:
        pickle.dump(clf, f, -1)
    
In [66]:
    
# plotting
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline
#plt.rcParams['figure.figsize'] = 8,6
    
In [67]:
    
test.boxplot(column='v')
    
    Out[67]:
    
In [68]:
    
test.boxplot(by='v')
plt.ylim(245,248)
    
    Out[68]:
    
In [69]:
    
test.boxplot(by='UD')
    
    Out[69]:
    
In [70]:
    
#some descriptive statistics
test.describe()
    
    Out[70]:
In [71]:
    
test['v'].plot(kind='hist', grid=True, title='velocity')
    
    Out[71]:
    
In [72]:
    
test['UD'].plot(kind='hist', grid=True, title='up-down')
    
    Out[72]:
    
In [73]:
    
test['v'].plot(kind='line', grid=True, title='velocity')
    
    Out[73]:
    
In [74]:
    
test['UD'].plot(kind='line', grid=True, title='up-down')
    
    Out[74]:
    
In [75]:
    
# 12-, 60- and 360-tick rolling means of the mid price (roughly 1, 5 and 30 minutes at this sampling rate)
spy_12 = test['mid'].rolling(window=12).mean()
spy_60 = test['mid'].rolling(window=60).mean()
spy_360 = test['mid'].rolling(window=360).mean()
fig = plt.figure()
fig.autofmt_xdate()
ax = fig.add_subplot(1,1,1)
ax.plot(test.index,test['mid'], label='SPY mid')
ax.plot(spy_12.index, spy_12, label='1 min rolling')
ax.plot(spy_60.index, spy_60, label='5 min rolling')
ax.plot(spy_360.index,spy_360, label='30 min rolling')
ax.grid()
ax.legend(loc=2)
ax.set_xlabel('Time')
plt.title('SPY mid & rolling averages')
plt.show()
    
    
In [76]:
    
#frequency
round(test['mom']).value_counts()
    
    Out[76]:
In [77]:
    
round(test['vwap'],1).hist(bins=50)
    
    Out[77]:
    
In [78]:
    
test.boxplot(column='mid')
    
    Out[78]:
    
In [79]:
    
#df for datascience
#signal=df.DataFrame(data=df.mid)
signal=df
#df['time']=df.index.strftime('%H:%M:%S')
time=signal.index.strftime('%H:%M:%S')
    
In [80]:
    
P=(signal.high+signal.low+signal.mid)/3
signal['UT']=(P+signal.high.rolling(60).max()-signal.low.rolling(60).max())
signal['DT']=(P-signal.high.rolling(60).min()+signal.low.rolling(60).min())
signal['BS']=np.where(signal.mid<=signal.DT,"B",np.where(signal.mid>=signal.UT,"S","H"))
signal=signal.dropna()
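
A rough, hedged sanity check (not part of the original signal construction): how often does the mid actually move in the labelled direction over the following 60 ticks? The 60-tick horizon is an assumption.

chk = signal[['mid','BS']].copy()
chk['fwd'] = chk.mid.shift(-60) - chk.mid   # forward 60-tick mid move
chk = chk.dropna()
hit = np.where(chk.BS == 'B', chk.fwd > 0, np.where(chk.BS == 'S', chk.fwd < 0, np.nan))
print('directional hit rate of B/S labels:', pd.Series(hit).dropna().mean())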
    
    
In [81]:
    
signal.head()
    
    Out[81]:
In [82]:
    
df[['UT','DT','mid','high','low','pREG','pSVR']].tail(100).plot(figsize=(16, 10))
plt.show()
    
    
In [83]:
    
signal.boxplot(column='mid',by ='BS')
    
    Out[83]:
    
In [84]:
    
temp1 = round(signal['UD']).value_counts(ascending=True)
# UD is already numeric (-1/0/1), so its per-class mean can be taken directly
temp2 = signal.pivot_table(values='UD', index=['BS'], aggfunc=np.mean)
    
In [85]:
    
print ('Frequency table for UD:')
print (temp1)
print ('\nMean UD per BS class:')
print (temp2.tail())
    
    
In [86]:
    
temp3 = pd.crosstab(round(signal['UD']),signal['BS'])
temp3.plot(kind='bar', stacked=True, color=['red','grey','blue'], grid=False)  # one colour per BS class (B, H, S)
    
    Out[86]:
    
In [87]:
    
# number of missing values in each column as isnull() returns 1, if the value is null.
signal.apply(lambda x: sum(x.isnull()),axis=0)
    
    Out[87]:
In [88]:
    
signal['BS'].value_counts()
    
    Out[88]:
In [89]:
    
signal['UD'].value_counts()
    
    Out[89]:
In [90]:
    
table = signal.pivot_table(values='v', index='BS' ,columns='UD', aggfunc=np.median)
print(table)
    
    
In [91]:
    
#Boolean indexing
signal.loc[(signal['v']<0) & (signal["BS"]=="B") & (signal["DT"]>signal["mid"]), ['mid',"spread","BS","DT"]].head()
    
    Out[91]:
In [101]:
    
train_X.head()
    
    Out[101]:
In [93]:
    
# Create first network with Keras
from keras.models import Sequential
from keras.layers import Dense
import numpy
    
    
In [102]:
    
# create model (Keras 2 argument names: kernel_initializer / epochs)
model = Sequential()
model.add(Dense(12, input_dim=11, kernel_initializer='uniform', activation='relu'))
model.add(Dense(8, kernel_initializer='uniform', activation='relu'))
model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the model
model.fit(train_X, train_U, epochs=11, batch_size=10)
# evaluate the model
scores = model.evaluate(test_X, test_U)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    
    
    
In [ ]: