The best model parameters are given by

author : SHAMINDRA
data_source_dir : SC_shuffle
test_type : validation
model_type : RF
RF:
    n_estimators : 100
    criterion : 'gini'
    max_features : 'auto'
    max_depth : 20
    n_jobs : 1
SVM:
    kernel : 'rbf'
    degree : 3
    gamma : 'auto'
    tol : 0.001
NNET:
    method1 : 'Tanh'
    neurons1 : 24
    method2 : 'Tanh'
    neurons2 : 39
    decay : 0.0001
    learning_rate : 0.001
    n_iter : 25
    random_state : 1

In [11]:
# Code source: Gaël Varoquaux
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, datasets
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pandas as pd

In [12]:
data_source_dir   = "SC_shuffle"
train_ds_name     = "train_test_validation.tar.gz"
test_ds_name      = "validation.tar.gz"
train_ds_ref      = "../../data/output/model_clean_data/" + data_source_dir + "/" + train_ds_name
test_ds_ref       = "../../data/output/model_clean_data/" + data_source_dir + "/" + test_ds_name
train_ds_ref


Out[12]:
'../../data/output/model_clean_data/SC_shuffle/train_test_validation.tar.gz'

In [13]:
# Open test and train sets
df_train = pd.read_csv(train_ds_ref
                       , compression='gzip', index_col = None)
df_test  = pd.read_csv(test_ds_ref
                       , compression='gzip', index_col = None)

# Drop the first columns - they are not useful
df_train_clean = df_train.iloc[:,1:]
df_test_clean  = df_test.iloc[:,1:]

# Traning data column names - used for variale importance
X_train_cols  =  list(df_train_clean.drop(['labels', 'index', 'Time'], axis=1).columns.values)

# Define test/training set
X_train  =  np.array(df_train_clean.drop(['labels', 'index', 'Time'], axis = 1))
Y_train  =  np.array(df_train_clean[['labels']])[:,0]
X_test   =  np.array(df_test_clean.drop(['labels', 'index', 'Time'], axis = 1))
Y_test   =  np.array(df_test_clean[['labels']])[:,0]

In [14]:
# Define the labels
labels = np.unique(Y_train)

## # Scale Data
scaler = MinMaxScaler()
X_test = scaler.fit_transform(X_test)
X_train = scaler.fit_transform(X_train)

# Set up the data
logreg = linear_model.LogisticRegression(C=1e5)

# Fit
logreg.fit(X_train, Y_train)

# Predict
Y_hat   = logreg.predict(X_test)

## # Misclassification error rate
miss_err = 1-accuracy_score(Y_test, Y_hat)
## # Log Loss
eps = 10^(-15)
logloss = log_loss(Y_test, Y_probs, eps = eps)

##confusion_matrix
confusion_matrix1 = confusion_matrix(y_true=Y_test, y_pred=Y_hat
                                     , labels=labels)

# classification_report
classification_report1 = classification_report(y_true=Y_test, y_pred=Y_hat)

# Output results in a list format
result = []
result.append("confusion_matrix")
result.append(confusion_matrix1)
result.append("classification_report")
result.append(classification_report1)
result.append("logloss")
result.append(logloss)
result.append("miss_err")
result.append(miss_err)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-14-52b79987e091> in <module>()
      6 
      7 # Fit
----> 8 logreg.fit(X_train, Y_train)
      9 
     10 # Predict

/Users/shamindra/anaconda3/envs/py3_stat222_finance/lib/python3.5/site-packages/sklearn/linear_model/logistic.py in fit(self, X, y, sample_weight)
   1140 
   1141         X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64, 
-> 1142                          order="C")
   1143         check_classification_targets(y)
   1144         self.classes_ = np.unique(y)

/Users/shamindra/anaconda3/envs/py3_stat222_finance/lib/python3.5/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
    508     X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite,
    509                     ensure_2d, allow_nd, ensure_min_samples,
--> 510                     ensure_min_features, warn_on_dtype, estimator)
    511     if multi_output:
    512         y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,

/Users/shamindra/anaconda3/envs/py3_stat222_finance/lib/python3.5/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    396                              % (array.ndim, estimator_name))
    397         if force_all_finite:
--> 398             _assert_all_finite(array)
    399 
    400     shape_repr = _shape_repr(array.shape)

/Users/shamindra/anaconda3/envs/py3_stat222_finance/lib/python3.5/site-packages/sklearn/utils/validation.py in _assert_all_finite(X)
     52             and not np.isfinite(X).all()):
     53         raise ValueError("Input contains NaN, infinity"
---> 54                          " or a value too large for %r." % X.dtype)
     55 
     56 

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [ ]:
print(result)

In [3]:
# The top variables are:
var_importance = [(1, 'P_1_bid', 0.020001165389254737)
                  , (2, 'V_1_bid', 0.018358575666246449)
                  , (3, 'P_1_ask', 0.017058479215839299)
                  , (4, 'V_1_ask', 0.016953559068869958)
                  , (5, 'P_2_bid', 0.016908649059514971)
                  , (6, 'V_2_bid', 0.016219220215427665)
                  , (7, 'P_2_ask', 0.015039647893425838)
                  , (8, 'V_2_ask', 0.014497773408233052)
                  , (9, 'P_3_bid', 0.014321084019596746)
                  , (10, 'V_3_bid', 0.014158850118003859)
                  , (11, 'P_3_ask', 0.014101386932514923)
                  , (12, 'V_3_ask', 0.013911823640617986)
                  , (13, 'P_4_bid', 0.013838322603744435)
                  , (14, 'V_4_bid', 0.013668619218980316)
                  , (15, 'P_4_ask', 0.013413471959983998)]

var_importance


Out[3]:
[(1, 'P_1_bid', 0.020001165389254737),
 (2, 'V_1_bid', 0.01835857566624645),
 (3, 'P_1_ask', 0.0170584792158393),
 (4, 'V_1_ask', 0.01695355906886996),
 (5, 'P_2_bid', 0.01690864905951497),
 (6, 'V_2_bid', 0.016219220215427665),
 (7, 'P_2_ask', 0.015039647893425838),
 (8, 'V_2_ask', 0.014497773408233052),
 (9, 'P_3_bid', 0.014321084019596746),
 (10, 'V_3_bid', 0.014158850118003859),
 (11, 'P_3_ask', 0.014101386932514923),
 (12, 'V_3_ask', 0.013911823640617986),
 (13, 'P_4_bid', 0.013838322603744435),
 (14, 'V_4_bid', 0.013668619218980316),
 (15, 'P_4_ask', 0.013413471959983998)]

In [ ]: