author : SHAMINDRA
data_source_dir : SC_shuffle
test_type : validation
model_type : RF
RF:
n_estimators : 100
criterion : 'gini'
max_features : 'auto'
max_depth : 20
n_jobs : 1
SVM:
kernel : 'rbf'
degree : 3
gamma : 'auto'
tol : 0.001
NNET:
method1 : 'Tanh'
neurons1 : 24
method2 : 'Tanh'
neurons2 : 39
decay : 0.0001
learning_rate : 0.001
n_iter : 25
random_state : 1
In [11]:
# Code source: Gaël Varoquaux
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import linear_model, datasets
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
from sklearn.preprocessing import MinMaxScaler
In [12]:
# Names of the data folder and of the two compressed dataset files.
data_source_dir = "SC_shuffle"
train_ds_name = "train_test_validation.tar.gz"
test_ds_name = "validation.tar.gz"

# Both files sit in the same folder, so build that prefix once and reuse it.
_clean_data_dir = "../../data/output/model_clean_data/" + data_source_dir + "/"
train_ds_ref = _clean_data_dir + train_ds_name
test_ds_ref = _clean_data_dir + test_ds_name

# Display the resolved training path as the cell output.
train_ds_ref
Out[12]:
In [13]:
# Read the gzip-compressed train and test tables with identical options.
_read_options = dict(compression='gzip', index_col=None)
df_train = pd.read_csv(train_ds_ref, **_read_options)
df_test = pd.read_csv(test_ds_ref, **_read_options)

# The leading column is not useful, so keep everything after it.
df_train_clean = df_train.iloc[:, 1:]
df_test_clean = df_test.iloc[:, 1:]

# Columns that are excluded from the model inputs (the target plus two
# bookkeeping columns); whatever remains is treated as a feature.
_non_feature_cols = ['labels', 'index', 'Time']
_train_features = df_train_clean.drop(_non_feature_cols, axis=1)
_test_features = df_test_clean.drop(_non_feature_cols, axis=1)

# Training-data column names - used for variable importance.
X_train_cols = list(_train_features.columns.values)

# Feature matrices and 1-D label vectors for the train and test sets.
X_train = np.array(_train_features)
Y_train = np.array(df_train_clean['labels'])
X_test = np.array(_test_features)
Y_test = np.array(df_test_clean['labels'])
In [14]:
# Fit a logistic regression on min-max scaled features and evaluate it on the
# held-out set. Reads X_train / Y_train / X_test / Y_test from the loading
# cell and leaves the metrics in `result`.

# Class labels seen in training. They fix the row/column order of the
# confusion matrix and match the column order of `predict_proba`.
labels = np.unique(Y_train)

# Scale data: learn the per-feature min/max from the training set only, then
# apply that same mapping to the test set. Fitting the scaler a second time on
# the test set would put the two sets on different scales.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Set up the model. C is the inverse regularisation strength, so C=1e5 means
# the fit is almost unregularised.
logreg = linear_model.LogisticRegression(C=1e5)

# Fit
logreg.fit(X_train, Y_train)

# Predict hard class labels and the per-class probabilities.
Y_hat = logreg.predict(X_test)
Y_probs = logreg.predict_proba(X_test)

# Misclassification error rate
miss_err = 1 - accuracy_score(Y_test, Y_hat)

# Log loss. Note that `10^(-15)` is a bitwise XOR in Python (it evaluates to
# -5), so the tolerance is written as a float literal. Probabilities are
# clipped explicitly instead of via the `eps=` keyword, which recent
# scikit-learn releases no longer accept.
eps = 1e-15
logloss = log_loss(Y_test, np.clip(Y_probs, eps, 1 - eps), labels=labels)

# Confusion matrix, rows/columns ordered as in `labels`.
confusion_matrix1 = confusion_matrix(y_true=Y_test, y_pred=Y_hat
                                     , labels=labels)

# Classification report (per-class precision / recall / f1 as text).
classification_report1 = classification_report(y_true=Y_test, y_pred=Y_hat)

# Output results in a list format: alternating metric name and metric value.
result = []
result.append("confusion_matrix")
result.append(confusion_matrix1)
result.append("classification_report")
result.append(classification_report1)
result.append("logloss")
result.append(logloss)
result.append("miss_err")
result.append(miss_err)
In [ ]:
# `result` alternates metric name and metric value. Printing the list as a
# whole shows the multi-line report and the matrix as escaped reprs on one
# line, so print each pair separately to keep them readable.
for metric_name, metric_value in zip(result[::2], result[1::2]):
    print(metric_name)
    print(metric_value)
    print()
In [3]:
# The top variables, best first, as (variable name, importance value).
# NOTE(review): the values look pasted from an earlier model run - confirm
# which run produced them.
_top_variables = [
    ('P_1_bid', 0.020001165389254737),
    ('V_1_bid', 0.018358575666246449),
    ('P_1_ask', 0.017058479215839299),
    ('V_1_ask', 0.016953559068869958),
    ('P_2_bid', 0.016908649059514971),
    ('V_2_bid', 0.016219220215427665),
    ('P_2_ask', 0.015039647893425838),
    ('V_2_ask', 0.014497773408233052),
    ('P_3_bid', 0.014321084019596746),
    ('V_3_bid', 0.014158850118003859),
    ('P_3_ask', 0.014101386932514923),
    ('V_3_ask', 0.013911823640617986),
    ('P_4_bid', 0.013838322603744435),
    ('V_4_bid', 0.013668619218980316),
    ('P_4_ask', 0.013413471959983998),
]

# Attach the 1-based rank to each entry: (rank, variable name, importance).
var_importance = [
    (rank, name, score)
    for rank, (name, score) in enumerate(_top_variables, start=1)
]
var_importance
Out[3]:
In [ ]: