The best model parameters are given by:

author : SHAMINDRA
data_source_dir : SC_shuffle
test_type : validation
model_type : RF
RF:
    n_estimators : 100
    criterion : 'gini'
    max_features : 'auto'
    max_depth : 20
    n_jobs : 1
SVM:
    kernel : 'rbf'
    degree : 3
    gamma : 'auto'
    tol : 0.001
NNET:
    method1 : 'Tanh'
    neurons1 : 24
    method2 : 'Tanh'
    neurons2 : 39
    decay : 0.0001
    learning_rate : 0.001
    n_iter : 25
    random_state : 1

In [66]:
# Code source: Gaël Varoquaux
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import imp
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, datasets
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
import pandas as pd

We looked at the top features from the best-performing random forest. They are listed below:


In [48]:
# Top 15 features (rank, feature_name, importance) from the best random
# forest, in decreasing order of importance.
var_importance = [
    (1,  'P_1_bid', 0.020001165389254737),
    (2,  'V_1_bid', 0.018358575666246449),
    (3,  'P_1_ask', 0.017058479215839299),
    (4,  'V_1_ask', 0.016953559068869958),
    (5,  'P_2_bid', 0.016908649059514971),
    (6,  'V_2_bid', 0.016219220215427665),
    (7,  'P_2_ask', 0.015039647893425838),
    (8,  'V_2_ask', 0.014497773408233052),
    (9,  'P_3_bid', 0.014321084019596746),
    (10, 'V_3_bid', 0.014158850118003859),
    (11, 'P_3_ask', 0.014101386932514923),
    (12, 'V_3_ask', 0.013911823640617986),
    (13, 'P_4_bid', 0.013838322603744435),
    (14, 'V_4_bid', 0.013668619218980316),
    (15, 'P_4_ask', 0.013413471959983998),
]

In [33]:
# Load the gzip-compressed train / test CSVs.
# NOTE(review): train_ds_ref / test_ds_ref are not defined anywhere in this
# notebook — presumably set in an earlier session (hidden kernel state).
# Confirm before attempting a fresh Restart-&-Run-All.
df_train = pd.read_csv(train_ds_ref, compression='gzip', index_col=None)
df_test = pd.read_csv(test_ds_ref, compression='gzip', index_col=None)

# Drop the first column — not useful as a feature
df_train_clean = df_train.iloc[:, 1:]
df_test_clean = df_test.iloc[:, 1:]

In [34]:
# Level-1 and level-2 order-book price/volume columns used as features.
# Defining the list once avoids the four copy-pasted column lists.
feature_cols = ['P_1_bid', 'V_1_bid', 'P_1_ask', 'V_1_ask',
                'P_2_bid', 'V_2_bid', 'P_2_ask', 'V_2_ask']

X_train_cols = list(df_train_clean[feature_cols].columns.values)

X_train = np.array(df_train_clean[feature_cols])
Y_train = np.array(df_train_clean[['labels']])[:, 0]

X_test = np.array(df_test_clean[feature_cols])
Y_test = np.array(df_test_clean[['labels']])[:, 0]

In [38]:
# Define the labels
labels = np.unique(Y_train)

## # Scale Data
scaler = MinMaxScaler()
X_test = scaler.fit_transform(X_test)
X_train = scaler.fit_transform(X_train)

# Set up the data
logreg = linear_model.LogisticRegression(C=1e5)

# Fit
logreg.fit(X_train, Y_train)

# Predict
Y_hat   = logreg.predict(X_test)
Y_probs = logreg.predict_proba(X_test)

## # Misclassification error rate
miss_err = 1-accuracy_score(Y_test, Y_hat)
## # Log Loss
eps = 10^(-15)
logloss = log_loss(Y_test, Y_probs, eps = eps)

##confusion_matrix
confusion_matrix1 = confusion_matrix(y_true=Y_test, y_pred=Y_hat
                                     , labels=labels)

# classification_report
classification_report1 = classification_report(y_true=Y_test, y_pred=Y_hat)

# Output results in a list format
result = []
result.append("confusion_matrix")
result.append(confusion_matrix1)
result.append("classification_report")
result.append(classification_report1)
result.append("logloss")
result.append(logloss)
result.append("miss_err")
result.append(miss_err)
result.append("Y_hat")
result.append(Y_hat)

In [46]:
print(result[3])
print(Y_hat)
print(Y_probs)


             precision    recall  f1-score   support

         -1       0.43      0.60      0.50     18373
          0       0.40      0.44      0.42     16950
          1       0.38      0.17      0.24     15265

avg / total       0.41      0.42      0.40     50588

[-1  1 -1 ...,  0 -1 -1]
[[ 0.4061748   0.27577677  0.31804843]
 [ 0.19159361  0.28938718  0.51901922]
 [ 0.52523662  0.20730076  0.26746262]
 ..., 
 [ 0.33569901  0.43736893  0.22693206]
 [ 0.46693092  0.23000641  0.30306267]
 [ 0.38504269  0.29336207  0.32159525]]

The predicted output for our most successful RF model is as follows:

classification_report

             precision    recall  f1-score   support

         -1       0.99      0.98      0.98     18373
          0       0.97      0.98      0.97     16950
          1       0.99      0.98      0.98     15265

avg / total       0.98      0.98      0.98     50588

In [49]:
def predict_simple_linear(df_train_clean, df_test_clean):
    X_train_cols  =  list(df_train_clean[['P_1_bid', 'V_1_bid', 'P_1_ask', 'V_1_ask', 'P_2_bid', 'V_2_bid', 'P_2_ask'
                          , 'V_2_ask']].columns.values)

    X_train  =  np.array(df_train_clean[['P_1_bid', 'V_1_bid', 'P_1_ask', 'V_1_ask', 'P_2_bid', 'V_2_bid', 'P_2_ask'
                              , 'V_2_ask']])
    Y_train  =  np.array(df_train_clean[['labels']])[:,0]

    X_test  =  np.array(df_test_clean[['P_1_bid', 'V_1_bid', 'P_1_ask', 'V_1_ask', 'P_2_bid', 'V_2_bid', 'P_2_ask'
                              , 'V_2_ask']])
    Y_test  =  np.array(df_test_clean[['labels']])[:,0]
    
    # Define the labels
    labels = np.unique(Y_train)

    ## # Scale Data
    scaler = MinMaxScaler()
    X_test = scaler.fit_transform(X_test)
    X_train = scaler.fit_transform(X_train)

    # Set up the data
    logreg = linear_model.LogisticRegression(C=1e5)

    # Fit
    logreg.fit(X_train, Y_train)

    # Predict
    Y_hat   = logreg.predict(X_test)
    Y_probs = logreg.predict_proba(X_test)

    ## # Misclassification error rate
    miss_err = 1-accuracy_score(Y_test, Y_hat)
    ## # Log Loss
    eps = 10^(-15)
    logloss = log_loss(Y_test, Y_probs, eps = eps)

    ##confusion_matrix
    confusion_matrix1 = confusion_matrix(y_true=Y_test, y_pred=Y_hat
                                         , labels=labels)

    # classification_report
    classification_report1 = classification_report(y_true=Y_test, y_pred=Y_hat)

    # Output results in a list format
    result = []
    result.append("confusion_matrix")
    result.append(confusion_matrix1)
    result.append("classification_report")
    result.append(classification_report1)
    result.append("logloss")
    result.append(logloss)
    result.append("miss_err")
    result.append(miss_err)
    result.append("Y_hat")
    result.append(Y_hat)
    
    return result

In [62]:
linear_simple_predict = predict_simple_linear(df_train_clean = df_train_clean
                                              , df_test_clean = df_train_clean)


Out[62]:
array([-1,  0,  1, ...,  0, -1, -1])

In [64]:
# Get the predicted outcomes
linear_simple_predict_vals = linear_simple_predict[len(linear_simple_predict) -1]
len(list(linear_simple_predict_vals))


Out[64]:
202349

In [67]:
# Load execute_model.py from the project root as a module.
# NOTE(review): the `imp` module is deprecated since Python 3.4 — prefer
# importlib.util.spec_from_file_location. The ImportError recorded below
# originates inside execute_model.py (`from rf import *`): its own
# directory is not on sys.path when loaded from this notebook's working
# directory — confirm sys.path setup before re-running.
modl = imp.load_source('execute_model', '../../execute_model.py')


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-67-cda231371c95> in <module>()
----> 1 modl = imp.load_source('execute_model', '../../execute_model.py')

/Users/shamindra/anaconda3/envs/py3_stat222_finance/lib/python3.5/imp.py in load_source(name, pathname, file)
    170         module = _exec(spec, sys.modules[name])
    171     else:
--> 172         module = _load(spec)
    173     # To allow reloading to potentially work, use a non-hacked loader which
    174     # won't rely on a now-closed file object.

/Users/shamindra/anaconda3/envs/py3_stat222_finance/lib/python3.5/importlib/_bootstrap.py in _load(spec)

/Users/shamindra/anaconda3/envs/py3_stat222_finance/lib/python3.5/importlib/_bootstrap.py in _load_unlocked(spec)

/Users/shamindra/anaconda3/envs/py3_stat222_finance/lib/python3.5/importlib/_bootstrap_external.py in exec_module(self, module)

/Users/shamindra/anaconda3/envs/py3_stat222_finance/lib/python3.5/importlib/_bootstrap.py in _call_with_frames_removed(f, *args, **kwds)

/Users/shamindra/LEARNING/STUDY/UC_BERKELEY/STATISTICS/COURSES/MA_PROGRAM/CURRENT_COURSES/SPRING_2016/STAT222/PROJECTS/lobpredictrst/lobpredictrst/execute_model.py in <module>()
      5 import pandas as pd
      6 import re
----> 7 from rf import *
      8 from svm import *
      9 modl = imp.load_source('read_model_yaml', 'read_model_yaml.py')

ImportError: No module named 'rf'

In [ ]: