Classical algorithm for signal/background separation. Try this and tree-based methods before moving to anything deeper. The performance of SVMs is often contingent on the kernel hyperparameters, so extending to Hyperband-style cross-validation may be worthwhile.
TODO: cross-validation to tune hyperparameters; add outlier detection.
Note: Fit time complexity is greater than quadratic with the number of training samples, making datasets of $\mathcal{O}(10^5)$ hard to train. May be practical to form an ensemble of SVM classifiers with suitably tuned hyperparameters.
In [1]:
import numpy as np
import pandas as pd
import time, os
from sklearn import svm
class config(object):
    """Run-wide configuration shared by the notebook cells.

    Attributes are read globally (e.g. to tag output file names).
    """
    # Decay channel and reconstruction mode; used to build output paths
    # such as graphs/<channel>_<mode>_ROC.pdf.
    channel = 'rho0'
    mode = 'kst'
def load_data(file_name, test_size=0.05, key='df', random_state=42):
    """Load a DataFrame from an HDF5 store and split it into train/test sets.

    The stored frame must contain a 'labels' column, which becomes the
    supervised target; all remaining columns are treated as features.

    Parameters
    ----------
    file_name : str
        Path to the HDF5 file.
    test_size : float
        Fraction of rows held out for the test split (default 0.05).
    key : str
        Group identifier of the DataFrame inside the HDF5 store
        (default 'df', matching the files produced upstream).
    random_state : int
        Seed for the reproducible shuffle/split (default 42).

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test) pandas objects.
    """
    from sklearn.model_selection import train_test_split

    df = pd.read_hdf(file_name, key)
    df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(
        df.drop('labels', axis=1), df['labels'],
        test_size=test_size, random_state=random_state)
    return df_X_train, df_X_test, df_y_train, df_y_test
def plot_ROC_curve(network_output, y_true, meta=''):
    """Plot, save, and display a ROC curve for a binary classifier.

    Parameters
    ----------
    network_output : ndarray
        Per-sample class scores; column 1 is taken as the positive-class
        score (assumes a (n_samples, 2) score array — TODO confirm for
        decision_function outputs).
    y_true : array-like
        Ground-truth binary labels.
    meta : str
        Subtitle text rendered under the figure title.

    Returns
    -------
    float
        Area under the ROC curve (also shown in the plot legend).

    Side effects: writes graphs/<channel>_<mode>_ROC.pdf (creating the
    'graphs' directory if needed) and shows the figure.
    """
    import matplotlib.pyplot as plt
    from sklearn.metrics import roc_curve, auc

    # Positive-class score column.
    y_score = network_output[:, 1]

    # Compute ROC curve, then integrate for the AUC.
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.axes([.1, .1, .8, .7])
    plt.figtext(.5, .9, r'$\mathrm{Receiver \;Operating \;Characteristic}$',
                fontsize=15, ha='center')
    plt.figtext(.5, .85, meta, fontsize=10, ha='center')
    plt.plot(fpr, tpr, color='darkorange',
             lw=2, label='ROC (area = %0.2f)' % roc_auc)
    # Diagonal = performance of a random classifier.
    plt.plot([0, 1], [0, 1], color='navy', lw=1.0, linestyle='--')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel(r'$\mathrm{False \;Positive \;Rate}$')
    plt.ylabel(r'$\mathrm{True \;Positive \;Rate}$')
    plt.legend(loc="lower right")

    # Ensure the output directory exists before saving.
    os.makedirs('graphs', exist_ok=True)
    plt.savefig(os.path.join('graphs', '{}_{}_ROC.pdf'.format(
        config.channel, config.mode)), format='pdf', dpi=1000)
    plt.show()
    plt.gcf().clear()
    return roc_auc
In [2]:
# Load the normalized dataset and materialize the train/test splits.
test_file = '/data/dnn/norm_std_dnn_B02rho0gamma_kst.h5'
df_X_train, df_X_test, df_y_train, df_y_test = load_data(test_file)

# Compact the labels to 8-bit integers.
df_y_train, df_y_test = df_y_train.astype(np.int8), df_y_test.astype(np.int8)

# Recombined frames (features + labels); feature count excludes the label.
df_train = pd.concat([df_X_train, df_y_train], axis=1)
df_test = pd.concat([df_X_test, df_y_test], axis=1)
config.n_features = df_train.shape[1] - 1

# SVM fitting is worse than quadratic in sample count, so cap the subsets.
train_X, train_y = df_X_train.iloc[:32000], df_y_train.iloc[:32000]
test_X, test_y = df_X_test.iloc[:10000], df_y_test.iloc[:10000]
# Disjoint slice reserved for the hyperparameter grid search below.
grid_X, grid_y = df_X_train.iloc[64000:72000], df_y_train.iloc[64000:72000]
In [20]:
from sklearn.multiclass import OneVsRestClassifier

# Wrap the RBF-kernel SVC in a one-vs-rest meta-estimator so the per-class
# fits run in parallel across all available cores (n_jobs=-1). The large
# cache_size (MB) speeds up kernel evaluations on this sample size.
mySVM = OneVsRestClassifier(
    svm.SVC(kernel='rbf', cache_size=8192, decision_function_shape='ovr'),
    n_jobs=-1)
mySVM.fit(train_X.values, train_y.values)
In [ ]:
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit

# Log-spaced grids spanning many orders of magnitude for the two RBF
# kernel hyperparameters.
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range)

# Stratified shuffling preserves the signal/background ratio in each split.
cv = StratifiedShuffleSplit(n_splits=4, test_size=0.25, random_state=28)
grid = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid,
                    cv=cv, n_jobs=-1, scoring='roc_auc')
grid.fit(grid_X.values, grid_y.values)
In [ ]:
# Report the winning hyperparameter combination and its cross-validated AUC.
print(f'Parameters selected by cross-validation: {grid.best_params_}, AUC: {grid.best_score_}')