BS (buy/sell) can be predicted with 85% accuracy
In [1]:
    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import glob
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_model import ARIMAResults
import pickle
#from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
    
    
In [2]:
    
# loading csv file
def get_csv_pd(path):
    #spy_pd=pd.read_csv('C:\\Users\Michal\Dropbox\IB_data\SPY.csv',sep=' ',names=['askPrice','askSize','bidPrice','bidSize'],index_col=0,parse_dates=True)
    #spy_pd=pd.read_csv(path+'\SPY.csv',sep=',',names=['askPrice','askSize','bidPrice','bidSize'],index_col=0,parse_dates=True)
    spy_pd=pd.read_csv(path,sep=',',dtype={'askPrice':np.float32,'askSize':np.float32,
                                           'bidPrice':np.float32,'bidSize':np.float32},index_col=0,parse_dates=True)
    #spy_pd = pd.read_csv(path, usecols=['askPrice','askSize','bidPrice','bidSize'], engine='python', skipfooter=3)
    return spy_pd
def preprocessing(df):
    df.bidPrice=df.loc[:,'bidPrice'].replace(to_replace=0, method='ffill')
    df.bidSize=df.loc[:,'bidSize'].replace(to_replace=0, method='ffill')
    df.askPrice=df.loc[:,'askPrice'].replace(to_replace=0, method='ffill')
    df.askSize=df.loc[:,'askSize'].replace(to_replace=0, method='ffill')
    df=df.dropna()
    # drop remaining zero/outlier prices (anything more than one std below the mean)
    df=df[df['bidPrice']>df.bidPrice.mean()-df.bidPrice.std()]
    df=df[df['askPrice']>df.askPrice.mean()-df.askPrice.std()]
    df['mid']=(df.askPrice+df.bidPrice)/2  # mid price
    df['vwap']=((df.loc[:,'bidPrice']*df.loc[:,'bidSize'])+(df.loc[:,'askPrice']*df.loc[:,'askSize']))/(df.loc[:,'bidSize']+df.loc[:,'askSize'])  # size-weighted quote price
    df['spread']=df.vwap-df.mid  # vwap minus mid
    df['v']=(df.mid-df.mid.shift(60))  # 60-tick price change ("velocity")
    df['mom']=np.where(np.logical_and((df.mid-df.mid.shift(12))!=0,df.v!=0),(df.mid-df.mid.shift(12))/df.v,0)  # 12-tick move scaled by velocity
    df['return']=(df.askPrice/df.bidPrice.shift(1))-1  # one-tick ask/bid return
    #df['ret'] = np.log(df.Close/df.Close.shift(1))
    df['sigma']=df.spread.rolling(60).std()  # rolling volatility of the spread
    #df['sigma']=df.Close.rolling(5).std()
    df['high']=df.askPrice.rolling(5).max()  # rolling 5-tick high of the ask
    df['low']=df.bidPrice.rolling(5).min()  # rolling 5-tick low of the bid
    
    #df['mom']=np.where(np.logical_and(df.vel_c==1,df.Close>df.price),1,np.where(np.logical_and(df.vel_c==-1,df.Close<df.price),-1,0))
    #flagD=np.logical_and(np.logical_and(df.Close.shift(10)<df.Close.shift(15),df.Close.shift(15)< df.Close.shift(20)),df.Close< df.Close.shift(10))
    #flagU=np.logical_and(np.logical_and(df.Close.shift(15)>df.Close.shift(20),df.Close.shift(10)> df.Close.shift(15)),df.Close> df.Close.shift(10))
    #df['UD']= np.where(flagU,-1,np.where(flagD,1,0))
    
    #df['P']=(df.High+df.Low+df.Close)/3
    #df['UT']=(pd.rolling_max(df.High,60)+pd.rolling_max(df.P+df.High-df.Low,60))*0.5
    #df['DT']=(pd.rolling_min(df.Low,60)+pd.rolling_min(df.P+df.High-df.Low,60))*0.5
    #df['BA']=np.where(df.Close<=df.DT,-1,np.where(df.Close>=df.UT,1,0))# below or above
    return df
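
A minimal, hypothetical smoke test of preprocessing on synthetic quotes (the toy frame and its column names are assumptions, chosen to mimic the CSV schema above); the rolling/shift features need at least ~60 rows before they stop being NaN:

# hypothetical synthetic top-of-book quotes, one row per second
rng = pd.date_range('2017-12-07 09:30:00', periods=200, freq='S')
mid0 = 248.0 + np.random.randn(200).cumsum() * 0.01
toy = pd.DataFrame({'askPrice': mid0 + 0.01,
                    'askSize': np.random.randint(1, 50, 200).astype(np.float32),
                    'bidPrice': mid0 - 0.01,
                    'bidSize': np.random.randint(1, 50, 200).astype(np.float32)},
                   index=rng)
toy = preprocessing(toy)
print(toy[['mid', 'vwap', 'spread', 'v', 'mom', 'return', 'sigma']].dropna().head())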
    
In [ ]:
    
'''
def normalise(df,window_length=60):
    dfn=(df-df.rolling(window_length).min())/(df.rolling(window_length).max()-df.rolling(window_length).min())
    return dfn
def de_normalise(data,df,window_length=60):
    dn=(df*(data.rolling(window_length).max()-data.rolling(window_length).min()))+data.rolling(window_length).min()
    return dn
#https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]
##### ARIMA        
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_model import ARIMAResults
        
###ARIMA preprocessing
def arima_processing(df):
    #data=df[['vwap','mid']]
    df=df.dropna()
    df['Lvwap']=np.log(df.vwap)
    df['Lmid']=np.log(df.mid)
    df['LDvwap']=df.Lvwap-df.Lvwap.shift(60)
    df['LDmid']=df.Lmid-df.Lmid.shift(60)
    df=df.dropna()
    return df   
###Model is already saved from "/Dropbox/DataScience/ARIMA_model_saving.ipynb". Here loaded and added to "df_ml"
def ARIMA_(data):
    ### load model
    data=data.dropna()
    predictions_mid=ARIMA_mid(data.LDmid)
    predictions_vwap=ARIMA_vwap(data.LDvwap) 
    vwap_arima=np.exp(predictions_vwap+data.Lvwap.shift(60))
    mid_arima=np.exp(predictions_mid+data.Lmid.shift(60))
    df_ml['arima']=data.mid+vwap_arima-mid_arima
    
def ARIMA_mid(data):
    ### load model
    mid_arima_loaded = ARIMAResults.load('mid_arima.pkl')
    predictions_mid = mid_arima_loaded.predict()
    return predictions_mid
def ARIMA_vwap(data):
    ### load model
    vwap_arima_loaded = ARIMAResults.load('vwap_arima.pkl')
    predictions_vwap = vwap_arima_loaded.predict()
    return predictions_vwap
#### KALMAN moving average
##KF moving average
#https://github.com/pykalman/pykalman
# Import a Kalman filter and other useful libraries
from pykalman import KalmanFilter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import poly1d
def kalman_ma(data):
    #x=data.mid
    x=data.mid
    # Construct a Kalman filter
    kf = KalmanFilter(transition_matrices = [1],
                  observation_matrices = [1],
                  initial_state_mean = 248,
                  initial_state_covariance = 1,
                  observation_covariance=1,
                  transition_covariance=.01)
    # Use the observed values of the price to get a rolling mean
    state_means, _ = kf.filter(x.values)
    state_means = pd.Series(state_means.flatten(), index=x.index)
    df_ml['km']=state_means
### Linear Regression, sklearn, svm:SVR,linear_model
import pickle
#from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
## loading model saved from /Dropbox/DataScience/REG_model_saving.ipynb
filename_rgr = 'rgr.sav'
filename_svr = 'svr.sav'
# load the model from disk
loaded_rgr_model = pickle.load(open(filename_rgr, 'rb'))
loaded_svr_model = pickle.load(open(filename_svr, 'rb'))
def strat_lr(data,df):
    df=df.dropna()
    data=data.dropna()
    X=df[['askPrice','askSize','bidPrice','bidSize','vwap','spread','v','return','sigma']]
    y=df.mid
    predict_regr=loaded_rgr_model.predict(X)
    predict_svr=loaded_svr_model.predict(X)
    df['predict_regr']=predict_regr
    df['predict_svr']=predict_svr
    df_ml['REG']=de_normalise(data.mid,df.predict_regr)
    df_ml['SVR']=de_normalise(data.mid,df.predict_svr)
    
#### loading classification model from /Dropbox/DataScience/ML_20Sep
filename_svm_model_up = 'svm_model_up.sav'
filename_lm_model_up = 'lm_model_up.sav'
filename_svm_model_dn = 'svm_model_dn.sav'
filename_lm_model_dn = 'lm_model_dn.sav'
# load the model from disk
loaded_svm_up_model = pickle.load(open(filename_svm_model_up, 'rb'))
loaded_lm_up_model = pickle.load(open(filename_lm_model_up, 'rb'))
loaded_svm_dn_model = pickle.load(open(filename_svm_model_dn, 'rb'))
loaded_lm_dn_model = pickle.load(open(filename_lm_model_dn, 'rb'))
def classification_up_dn(data):
    X=data[['askPrice','askSize','bidPrice','bidSize','vwap','spread','v','return','sigma']]
    y1=data.U
    y2=data.D
    
    
    predict_svm_up=loaded_svm_up_model.predict(X)
    predict_lm_up=loaded_lm_up_model.predict(X)
    predict_svm_dn=loaded_svm_dn_model.predict(X)
    predict_lm_dn=loaded_lm_dn_model.predict(X)
    
    data['predict_svm_up']=predict_svm_up
    data['predict_lm_up']=predict_lm_up
    data['predict_svm_dn']=predict_svm_dn
    data['predict_lm_dn']=predict_lm_dn
    
    data['predict_svm']=data.predict_svm_up+data.predict_svm_dn
    data['predict_lm']=data.predict_lm_up+data.predict_lm_dn
    
    data['UD']=np.where(np.logical_and(data.predict_svm>0,data.predict_lm>0),1,np.where(np.logical_and(data.predict_svm<0,data.predict_lm<0),-1,0))  
       
    df_ml['UD']=data.UD
### LSTM
#df.loc[:, cols].prod(axis=1)
def lstm_processing(df):
    df=df.dropna()
    df_price=df[['mid','vwap','arima','km','REG','SVR']]
    #normalization
    dfn=normalise(df_price,12)
    dfn['UD']=df.UD
    return dfn
import numpy
import matplotlib.pyplot as plt
import pandas
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import numpy
import matplotlib.pyplot as plt
import pandas
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from keras.models import load_model
model = load_model('21sep.h5')
# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back), 0]
        b = dataset[i:(i+look_back), 1]
        c = dataset[i:(i+look_back), 2]
        d = dataset[i:(i+look_back), 3]
        e=  dataset[i:(i+look_back), 4]
        f = dataset[i:(i+look_back), 5]
        g=  dataset[i:(i+look_back), 6]
        dataX.append(np.c_[b,c,d,e,f,g])
        #dataX.append(b)
        #dataX.append(c)
        #dataX.append(d)
        #dataX.append(e)
        #dataX.concatenate((a,bT,cT,dT,eT),axis=1)
        dataY.append(dataset[i + look_back,0])
    return np.array(dataX), np.array(dataY)
def strat_LSTM(df_ml):
    
    #normalization
    df_lstm=lstm_processing(df_ml)
    df_lstm=df_lstm.dropna()
    dataset=df_lstm.values
    dataset = dataset.astype('float32')
    # reshape into X=t and Y=t+1
    look_back = 3
    X_,Y_ = create_dataset(dataset,look_back)
    
    # reshape input to be [samples, time steps, features]
    X_ = numpy.reshape(X_, (X_.shape[0],X_.shape[1],X_.shape[2]))
    # make predictions
    predict = model.predict(X_)
    df_lstm=df_lstm.tail(len(predict))
    df_lstm['LSTM']=predict
    #LSTM=(df_lstm.LSTM*(df_ml.mid.rolling(60).max()-df_ml.midClose.rolling(60).min()))+df_LSTM.Close.rolling(60).min()
    LSTM=de_normalise(df_ml.mid,df_lstm.LSTM,window_length=12)
    df_lstm['pred']=LSTM
    df_lstm=df_lstm.dropna()
    df_lstm=df_lstm.tail(len(df_ml))
    df_ml['LSTM']=df_lstm.pred
    '''
    
In [ ]:
    
'''
#https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]
'''
    
In [3]:
    
filename = '/home/octo/Dropbox'+ '/SPY7Dec.csv'
    
In [4]:
    
data=get_csv_pd(filename)
    
In [5]:
    
data=preprocessing(data)
    
In [6]:
    
df=data.dropna()
df=df[['mid','vwap','spread','v','mom','return','sigma','high','low',]]
    
In [7]:
    
# split into train and test sets
train_size = int(len(df) * 0.80)
test_size = len(df) - train_size
train= df[0:train_size]
test= df[train_size:len(df)]
print(len(train), len(test))
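
The split above is chronological (no shuffling), which is appropriate for tick data. As an optional check, scikit-learn's TimeSeriesSplit gives a walk-forward version of the same idea; a hedged sketch (the number of splits is arbitrary):

from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits=5)
for fold, (tr_idx, te_idx) in enumerate(tscv.split(df)):
    print(fold, len(tr_idx), len(te_idx))  # growing train window, fixed-size test window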
    
    
In [8]:
    
train_X=train[['mid','vwap','spread','v','return','sigma','high','low',]]
train_y=train['mom']
test_X=test[['mid','vwap','spread','v','return','sigma','high','low',]]
test_y=test['mom']
    
In [9]:
    
train_X.head()
    
    Out[9]:
In [10]:
    
test_y.head()
    
    Out[10]:
In [11]:
    
from sklearn import linear_model
regr = linear_model.LinearRegression()
#regr.fit(X.tail(20),y.tail(20))
#predict=regr.predict(X.tail(5))
regr.fit(train_X,train_y)
predict=regr.predict(test_X)
#X=X.dropna()
#y=y.dropna()
#y[y == inf] = 0
dt=test[['mid']]
dt['predict']=predict
#dt['predict']=dt.mid+dt.mid*dt.predict
# mom was defined as (mid - mid.shift(12)) / v, so the predicted mom is mapped back to a price level:
dt['predict']=dt.predict*test.v+test.mid.shift(12)
pdf=test
pdf['pREG']=dt.predict
    
    
In [12]:
    
pdf.tail()
    
    Out[12]:
In [13]:
    
from sklearn.svm import SVR
# Fit regression model
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.9) #kernel='linear' #kernel='poly'
predict_svr = svr_rbf.fit(train_X,train_y).predict(test_X)
dt1=test[['mid']]
dt1['predict']=predict_svr
dt1['predict']=dt1.predict*test.v+test.mid.shift(12)
pdf['pSVR']=dt1.predict
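
A quick hedged check of how far the reconstructed prices sit from the actual mid on the test set (mean_squared_error is imported here just for this sketch; rows made NaN by the shift are dropped first):

from sklearn.metrics import mean_squared_error

chk = pdf[['mid', 'pREG', 'pSVR']].dropna()
print('REG MSE:', mean_squared_error(chk['mid'], chk['pREG']))
print('SVR MSE:', mean_squared_error(chk['mid'], chk['pSVR']))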
    
    
In [14]:
    
pdf.dropna().head()
    
    Out[14]:
In [15]:
    
pdf[['mid','high','low','pREG','pSVR']].tail(300).plot(figsize=(15,9))
#df[['Volume']].tail(5000).plot(figsize=(15,9))
#data[['AvgVolume']].tail(5000).plot(figsize=(15,9))
plt.show()
    
    
In [16]:
    
# look at the results
plt.scatter(pdf['mid'],test_y, c='k', label='data')
# plt.hold() was removed from matplotlib; axes hold by default
plt.plot(pdf['pREG'],test_y, c='g', label='pREG')
#plt.plot(pdf['pSVR'], y, c='g', label='pSVR')
plt.xlabel('data')
plt.ylabel('target')
plt.title('Regression predictions vs. target')
plt.legend()
plt.show()
    
    
In [17]:
    
# look at the results
plt.scatter(pdf['mid'],pdf['pREG'], c='k', label='pREG')
plt.plot(pdf['mid'],pdf['pSVR'], c='g', label='pSVR')
plt.plot(pdf['mid'], pdf['high'], c='b', label='high')
plt.xlabel('mid')
plt.ylabel('prediction')
plt.title('Predicted vs. actual mid')
plt.legend()
plt.show()
    
    
    https://www.quantopian.com/posts/some-code-from-ernie-chans-new-book-implemented-in-python
    http://auquan.com/cointegration-stationarity/
    https://www.analyticsvidhya.com/blog/2016/02/time-series-forecasting-codes-python/
    http://machinelearningmastery.com/time-series-forecast-case-study-python-monthly-armed-robberies-boston/
    https://www.quantstart.com/articles/Basics-of-Statistical-Mean-Reversion-Testing-Part-II
    https://datascience.ibm.com/exchange/public/entry/view/815137c868b916821dec777bdc23013c
    http://machinelearningmastery.com/time-series-data-stationary-python/
In [18]:
    
X=df[['mid','vwap','spread','v','return','sigma','high','low',]]
y=df['mom']
    
In [19]:
    
len(df)
    
    Out[19]:
In [20]:
    
from sklearn import linear_model
regr = linear_model.LinearRegression()
#regr.fit(X.tail(20),y.tail(20))
#predict=regr.predict(X.tail(5))
regr.fit(X.dropna(),y.dropna())
predict=regr.predict(X)
#X=X.dropna()
#y=y.dropna()
#y[y == inf] = 0
dt=df[['mid']]
dt['predict']=predict
#dt['predict']=dt.mid+dt.mid*dt.predict
dt['predict']=dt.predict*df.v+df.mid.shift(12)
classify_df=df
classify_df['pREG']=dt.predict
from sklearn.svm import SVR
# Fit regression model
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.9) #kernel='linear' #kernel='poly'
predict_svr = svr_rbf.fit(X, y).predict(X)
dt1=df[['mid']]
dt1['predict']=predict_svr
dt1['predict']=dt1.predict*df.v+df.mid.shift(12)
classify_df['pSVR']=dt1.predict
    
    
In [21]:
    
def classification(df):
    mid1=(df.high+df.low)/2
    #flagUD=np.where(np.logical_and(df.mid>df.pREG,df.mid>df.pSVR),1,np.where(np.logical_and(df.mid<df.pREG,df.mid<df.pSVR),-1,0))
    #df['UD']= np.where(np.logical_and(df.mid>mid1,flagUD==1),1,np.where(np.logical_and(df.mid<mid1,flagUD==-1),-1,0))
    flagUD=np.where(np.logical_and(df.mid>df.pREG,df.mid>df.pSVR),1,np.where(np.logical_and(df.mid<df.pREG,df.mid<df.pSVR),-1,0))
    UD= np.where(np.logical_and(df.mid>mid1,flagUD==1),1,np.where(np.logical_and(df.mid<mid1,flagUD==-1),-1,0))
    df['U']= np.where(UD==1,1,0)
    df['D']= np.where(UD==-1,-1,0)
    df['UD']=df.U+df.D
    return df
    
In [22]:
    
data_class=classification(classify_df)
data_class=data_class.dropna()
df=df.dropna()
# both df and data_class have U,D,UD
    
In [23]:
    
data_class.head()
    
    Out[23]:
In [24]:
    
df.head()
    
    Out[24]:
In [25]:
    
# split into train and test sets
train_size = int(len(data_class) * 0.80)
test_size = len(data_class) - train_size
train= data_class[0:train_size]
test= data_class[train_size:len(data_class)]
print(len(train), len(test))
    
    
In [26]:
    
train_X=train[['mid','vwap','spread','v','return','sigma','high','low','mom','pREG','pSVR']]
train_y=train['UD']
test_X=test[['mid','vwap','spread','v','return','sigma','high','low','mom','pREG','pSVR']]
test_y=test['UD']
train_U=train['U']
test_U=test['U']
train_D=train['D']
test_D=test['D']
    
In [27]:
    
print(len(train_U), len(test_U))
    
    
In [28]:
    
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
    
In [29]:
    
model = LogisticRegression()
model.fit(train_X,train_U)
print(model)
    
    
In [30]:
    
# make predictions
expected =test_U
predicted = model.predict(test_X)
    
In [31]:
    
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
    
    
    
In [32]:
    
model = LogisticRegression()
model.fit(train_X,train_D)
print(model)
    
    
In [33]:
    
# make predictions
expected =test_D
predicted = model.predict(test_X)
    
In [34]:
    
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
    
    
    
In [35]:
    
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
    
In [36]:
    
# fit a CART model to the data
model = DecisionTreeClassifier()
model.fit(train_X,train_U)
print(model)
    
    
In [37]:
    
# make predictions
expected =test_U
predicted = model.predict(test_X)
    
In [38]:
    
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
    
    
In [39]:
    
accuracy =model.score(test_X,test_U) 
accuracy
    
    Out[39]:
In [40]:
    
# fit a CART model to the data
model = DecisionTreeClassifier()
model.fit(train_X,train_D)
print(model)
    
    
In [41]:
    
# make predictions
expected =test_D
predicted = model.predict(test_X)
    
In [42]:
    
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
    
    
In [43]:
    
accuracy =model.score(test_X,test_D) 
accuracy
    
    Out[43]:
The random forest algorithm can find nonlinearities in data that a linear regression wouldn’t be able to pick up on.
In [44]:
    
# Import the random forest model.
from sklearn.ensemble import RandomForestRegressor
    
In [45]:
    
# Initialize the model with some parameters.
model = RandomForestRegressor(n_estimators=100, min_samples_leaf=10, random_state=1)
# Fit the model to the data.
model.fit(train_X,train_U)
print(model)
    
    
In [46]:
    
# Make predictions.
expected=test_U
predicted = model.predict(test_X)
    
In [47]:
    
# note: for RandomForestRegressor, .score() returns R^2, not classification accuracy
r2 = model.score(test_X,test_U)
r2
    
    Out[47]:
In [48]:
    
from sklearn.ensemble import RandomForestClassifier
    
In [49]:
    
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
clf.fit(train_X,train_U)
accuracy = clf.score(test_X,test_U)
print(accuracy)
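
To get a rough sense of which engineered features the forest leans on, a hedged sketch using the clf fitted in the cell above:

importances = pd.Series(clf.feature_importances_, index=train_X.columns)
print(importances.sort_values(ascending=False))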
    
    
In [50]:
    
from sklearn import neighbors
    
In [51]:
    
clf = neighbors.KNeighborsClassifier()
clf.fit(train_X,train_U)
    
    Out[51]:
In [52]:
    
accuracy = clf.score(test_X,test_U)
print(accuracy)
    
    
In [53]:
    
from sklearn.ensemble import AdaBoostClassifier
    
In [54]:
    
clf = AdaBoostClassifier()
clf.fit(train_X,train_U)
accuracy = clf.score(test_X,test_U)
print(accuracy)
    
    
In [55]:
    
from sklearn.ensemble import GradientBoostingClassifier
    
In [56]:
    
clf = GradientBoostingClassifier(n_estimators=100)
clf.fit(train_X,train_U)
accuracy = clf.score(test_X,test_U)
print(accuracy)
    
    
In [57]:
    
# sklearn.qda was removed in newer scikit-learn; QuadraticDiscriminantAnalysis is the current name
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
    
    
In [58]:
    
clf = QDA()
clf.fit(train_X,train_U)
accuracy = clf.score(test_X,test_U)
print(accuracy)
    
    
    
SVM (Support Vector Machines) is one of the most popular machine learning algorithms, used mainly for classification problems. Like logistic regression, SVM supports multi-class classification via the one-vs-all (one-vs-rest) method, as sketched below.
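
A hedged sketch of the one-vs-all idea on the three-class UD label, using scikit-learn's OneVsRestClassifier wrapper around the same SVC estimator (nothing here is tuned):

from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

ovr = OneVsRestClassifier(SVC())
ovr.fit(train_X, train_y)   # train_y is the three-class UD label (-1/0/1)
print(ovr.score(test_X, test_y))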
In [59]:
    
from sklearn import metrics
from sklearn.svm import SVC
    
In [60]:
    
# fit a SVM model to the data
model = SVC()
model.fit(train_X,train_U)
print(model)
    
    
In [61]:
    
# Make predictions.
expected=test_U
predicted = model.predict(test_X)
    
In [62]:
    
accuracy =model.score(test_X,test_U) 
accuracy
    
    Out[62]:
In [63]:
    
# fit a SVM model to the data
model = SVC()
model.fit(train_X,train_D)
print(model)
    
    
In [64]:
    
# Make predictions.
expected=test_D
predicted = model.predict(test_X)
    
In [65]:
    
accuracy =model.score(test_X,test_D) 
accuracy
    
    Out[65]:
In [ ]:
    
# optional: persist the fitted classifier (assumes `savemodel`, `fout` and `clf` are defined)
from datetime import datetime
import pickle

if savemodel:
    fname_out = '{}-{}.pickle'.format(fout, datetime.now())
    with open(fname_out, 'wb') as f:
        pickle.dump(clf, f, -1)
    
In [66]:
    
# plotting
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline
#plt.rcParams['figure.figsize'] = 8,6
    
In [67]:
    
test.boxplot(column='v')
    
    Out[67]:
    
In [68]:
    
test.boxplot(by='v')
plt.ylim(245,248)
    
    Out[68]:
    
In [69]:
    
test.boxplot(by='UD')
    
    Out[69]:
    
In [70]:
    
#some descriptive statistics
test.describe()
    
    Out[70]:
In [71]:
    
test['v'].plot(kind='hist', grid=True, title='velocity')
    
    Out[71]:
    
In [72]:
    
test['UD'].plot(kind='hist', grid=True, title='up-down')
    
    Out[72]:
    
In [73]:
    
test['v'].plot(kind='line', grid=True, title='velocity')
    
    Out[73]:
    
In [74]:
    
test['UD'].plot(kind='line', grid=True, title='up-down')
    
    Out[74]:
    
In [75]:
    
# 12-, 60- and 360-tick rolling means of the mid price (roughly 1, 5 and 30 minutes at this sampling rate)
spy_12 = test['mid'].rolling(window=12).mean()
spy_60 = test['mid'].rolling(window=60).mean()
spy_360 = test['mid'].rolling(window=360).mean()
fig = plt.figure()
fig.autofmt_xdate()
ax = fig.add_subplot(1,1,1)
ax.plot(test.index,test['mid'], label='SPY mid')
ax.plot(spy_12.index, spy_12, label='1 min rolling')
ax.plot(spy_60.index, spy_60, label='5 min rolling')
ax.plot(spy_360.index,spy_360, label='30 min rolling')
ax.grid()
ax.legend(loc=2)
ax.set_xlabel('Time')
plt.title('SPY mid & rolling averages')
plt.show()
    
    
In [76]:
    
#frequency
round(test['mom']).value_counts()
    
    Out[76]:
In [77]:
    
round(test['vwap'],1).hist(bins=50)
    
    Out[77]:
    
In [78]:
    
test.boxplot(column='mid')
    
    Out[78]:
    
In [79]:
    
#df for datascience
#signal=df.DataFrame(data=df.mid)
signal=df
#df['time']=df.index.strftime('%H:%M:%S')
time=signal.index.strftime('%H:%M:%S')
    
In [80]:
    
P=(signal.high+signal.low+signal.mid)/3
signal['UT']=(P+signal.high.rolling(60).max()-signal.low.rolling(60).max())
signal['DT']=(P-signal.high.rolling(60).min()+signal.low.rolling(60).min())
signal['BS']=np.where(signal.mid<=signal.DT,"B",np.where(signal.mid>=signal.UT,"S","H"))
signal=signal.dropna()
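
A rough, hedged sanity check (not part of the original signal construction): how often does the mid actually move in the labelled direction over the following 60 ticks? The 60-tick horizon is an assumption.

chk = signal[['mid','BS']].copy()
chk['fwd'] = chk.mid.shift(-60) - chk.mid   # forward 60-tick mid move
chk = chk.dropna()
hit = np.where(chk.BS == 'B', chk.fwd > 0, np.where(chk.BS == 'S', chk.fwd < 0, np.nan))
print('directional hit rate of B/S labels:', pd.Series(hit).dropna().mean())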
    
    
In [81]:
    
signal.head()
    
    Out[81]:
In [82]:
    
df[['UT','DT','mid','high','low','pREG','pSVR']].tail(100).plot(figsize=(16, 10))
plt.show()
    
    
In [83]:
    
signal.boxplot(column='mid',by ='BS')
    
    Out[83]:
    
In [84]:
    
temp1 = round(signal['UD']).value_counts(ascending=True)
# UD is already numeric (-1/0/1), so its per-class mean can be taken directly
temp2 = signal.pivot_table(values='UD', index=['BS'], aggfunc=np.mean)
    
In [85]:
    
print ('Frequency table for UD:')
print (temp1)
print ('\nMean UD per BS class:')
print (temp2.tail())
    
    
In [86]:
    
temp3 = pd.crosstab(round(signal['UD']),signal['BS'])
temp3.plot(kind='bar', stacked=True, color=['red','grey','blue'], grid=False)  # one colour per BS class (B, H, S)
    
    Out[86]:
    
In [87]:
    
# number of missing values in each column as isnull() returns 1, if the value is null.
signal.apply(lambda x: sum(x.isnull()),axis=0)
    
    Out[87]:
In [88]:
    
signal['BS'].value_counts()
    
    Out[88]:
In [89]:
    
signal['UD'].value_counts()
    
    Out[89]:
In [90]:
    
table = signal.pivot_table(values='v', index='BS' ,columns='UD', aggfunc=np.median)
print(table)
    
    
In [91]:
    
#Boolean indexing
signal.loc[(signal['v']<0) & (signal["BS"]=="B") & (signal["DT"]>signal["mid"]), ['mid',"spread","BS","DT"]].head()
    
    Out[91]:
In [101]:
    
train_X.head()
    
    Out[101]:
In [93]:
    
# Create first network with Keras
from keras.models import Sequential
from keras.layers import Dense
import numpy
    
    
In [102]:
    
# create model (Keras 2 argument names: kernel_initializer / epochs)
model = Sequential()
model.add(Dense(12, input_dim=11, kernel_initializer='uniform', activation='relu'))
model.add(Dense(8, kernel_initializer='uniform', activation='relu'))
model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the model
model.fit(train_X, train_U, epochs=11, batch_size=10)
# evaluate the model
scores = model.evaluate(test_X, test_U)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    
    
    
In [ ]: