In [4]:
"""
The intent of this notebook is model selection and
evaluation for the MVP of our brainNN classifier.
"""
import sys

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tornado import gen
from tornado.ioloop import IOLoop

import aimetrics as aim
import aimetrics.metrics as aim_metrics
%matplotlib inline
In [5]:
X_trn_val = pd.read_csv('output/bnn-mvp/X_trn_val.csv', index_col=0)
y_trn_val = pd.read_csv('output/bnn-mvp/y_trn_val.csv', index_col=0)
X_test = pd.read_csv('output/bnn-mvp/X_test.csv', index_col=0)
y_test = pd.read_csv('output/bnn-mvp/y_test.csv', index_col=0)
labels = ['small_drone', 'person']
# store per-fold metrics for each experiment, keyed by parameter set
metrics = {}
In [6]:
from sklearn.model_selection import StratifiedKFold
# stratified 5-fold split, so each fold preserves the class balance
skf = StratifiedKFold(n_splits=5)
@gen.coroutine
def get_default_metrics():
    # materialized list of (train, val) index arrays, handy for debugging
    folds = list(skf.split(X_trn_val, y_trn_val['small_drone']))
    metrics['default'] = yield [
        aim_metrics.remote_classifier_metrics(
            'http://localhost:3002/',
            'bnn',
            X_trn_val.iloc[trn_ind].values,
            y_trn_val.iloc[trn_ind].values,
            X_trn_val.iloc[val_ind].values,
            y_trn_val.iloc[val_ind].values,
            labels
        )
        for trn_ind, val_ind in folds
    ]
IOLoop.instance().add_callback(get_default_metrics)
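In [ ]:
# Sanity check once the coroutine above has finished (a sketch; it assumes
# each fold's result from aimetrics.remote_classifier_metrics is a dict of
# scores keyed by metric name, which is what the report cells below rely on).
print(len(metrics['default']))        # one entry per fold, so 5
print(sorted(metrics['default'][0]))  # e.g. ['acc', 'f1_score', 'roc_auc']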
In [ ]:
row_range = range(0, 6)
col_range = range(2, 7)
rate_range = np.arange(0.1, 0.8, 0.1)
n_folds = 5
print("TESTING %d MODELS" % (len(row_range) * len(col_range) * len(rate_range) * n_folds))
@gen.coroutine
def get_param_grid_metrics():
    yield [get_param_metrics(nrows, ncols, learning_rate=rate)
           for nrows in row_range
           for ncols in col_range
           for rate in rate_range]
    print("PARAMETER SEARCH COMPLETE")
@gen.coroutine
def get_param_metrics(n_hidden_rows, ncols, learning_rate=None):
    """Get the metrics for a particular parameter set. Assumes a grid topology."""
    skf = StratifiedKFold(n_splits=n_folds)
    params = {'hiddenLayers': [ncols] * n_hidden_rows}
    if learning_rate:
        params['learningRate'] = learning_rate
    key = 'x'.join((str(n_hidden_rows), str(ncols), str(learning_rate)))
    metrics[key] = yield [
        aim_metrics.remote_classifier_metrics(
            'http://localhost:3002/',
            'bnn',
            X_trn_val.iloc[trn_ind].values,
            y_trn_val.iloc[trn_ind].values,
            X_trn_val.iloc[val_ind].values,
            y_trn_val.iloc[val_ind].values,
            labels,
            model_params=params,
        )
        for trn_ind, val_ind in skf.split(X_trn_val, y_trn_val['small_drone'])
    ]
    print("%s Complete" % key)
IOLoop.instance().add_callback(get_param_grid_metrics)
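In [ ]:
# Once the grid search above completes, metrics holds one per-fold score
# list per parameter set, keyed '<rows>x<cols>x<rate>' (e.g. '1x4x0.7'),
# plus the 'default' baseline. A quick peek at the keys:
sorted(metrics)[:5]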
In [8]:
def get_score_report(key, score, agg):
    """Return the `agg` summary statistic of `score` across a key's folds."""
    stats = pd.Series([r[score] for r in metrics[key]]).describe()
    return stats[agg]
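In [ ]:
# Example usage: mean validation roc_auc across the default-parameter folds.
get_score_report('default', 'roc_auc', 'mean')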
In [13]:
scores = ['roc_auc', 'acc', 'f1_score']
aggs = ['mean', 'std']
report = pd.DataFrame({
    score + "_" + agg: pd.Series({
        key: get_score_report(key, score, agg) for key in metrics.keys()
    }) for score in scores for agg in aggs
})
report.sort_values("roc_auc_mean", ascending=False)
Out[13]:
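In [ ]:
# Programmatic version of reading the sorted table above: the key with the
# highest mean roc_auc (should match the 1x4x0.7 pick in the conclusion).
report['roc_auc_mean'].idxmax()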
In [14]:
import json
with open("output/bnn-mvp/param_metrics.json", 'w') as f:
    json.dump(metrics, f)
report.to_csv("output/bnn-mvp/param_metrics_summary.csv")
In [11]:
"""
Conclusion:
The 1-layer NN's almost categorically outperformed the others
in roc_auc. Of those, 1x4x0.7 was the best performer by a very
small margin. Therefore, it will be the selected model for our
MVP. However, I want to confirm this with an increased search
space this evening, in order to rule out somewhat larger
dimensions.
Selected Params:
{
hiddenLayers: [4],
learningRate: 0.7
}
"""
pass