Support Vector Machine

Classical algorithm for signal/background separation. Try using this and tree-based methods before going into anything too deep. The performance of SVMs is often contingent on hyperparameters of the kernel. Extending to HyperBand cross-validation may be worthwhile.

TODO: cross-validation to select the kernel hyperparameters; outlier detection.

Note: Fit time complexity is greater than quadratic with the number of training samples, making datasets of $\mathcal{O}(10^5)$ hard to train. May be practical to form an ensemble of SVM classifiers with suitably tuned hyperparameters.


In [1]:
import numpy as np
import pandas as pd
import time, os
from sklearn import svm

class config:
    """Run configuration: tags used to label saved output files."""
    # Decay-mode tag (part of the saved ROC figure filename).
    mode = 'kst'
    # Physics channel tag (part of the saved ROC figure filename).
    channel = 'rho0'

def load_data(file_name, test_size = 0.05, key = 'df', random_state = 42):
    """Load a DataFrame from an HDF5 store and split it into train/test sets.

    Parameters
    ----------
    file_name : str
        Path to the HDF5 file holding the feature frame with a 'labels' column.
    test_size : float, optional
        Fraction of samples held out for testing (default 0.05).
    key : str, optional
        Identifier of the DataFrame inside the HDF5 store (default 'df').
    random_state : int, optional
        Seed for the shuffle, so the split is reproducible (default 42).

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test) as pandas objects.
    """
    from sklearn.model_selection import train_test_split
    df = pd.read_hdf(file_name, key)
    features = df.drop('labels', axis = 1)
    labels = df['labels']
    return train_test_split(features, labels,
                            test_size = test_size, random_state = random_state)

def plot_ROC_curve(network_output, y_true, meta = ''):
    """Plot the ROC curve for a binary classifier and save it under graphs/.

    Parameters
    ----------
    network_output : ndarray, shape (n_samples, 2)
        Per-class scores; column 1 is taken as the signal score.
    y_true : array-like
        Ground-truth binary labels, aligned with `network_output`.
    meta : str, optional
        Extra text rendered under the title (e.g. hyperparameter summary).

    Returns
    -------
    float
        Area under the ROC curve.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.metrics import roc_curve, auc

    # Signal-class score is the second column of the classifier output.
    y_score = network_output[:,1]
    # Compute ROC curve, integrate
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.axes([.1,.1,.8,.7])
    plt.figtext(.5,.9, r'$\mathrm{Receiver \;Operating \;Characteristic}$', fontsize=15, ha='center')
    plt.figtext(.5,.85, meta, fontsize=10,ha='center')
    plt.plot(fpr, tpr, color='darkorange',
                     lw=2, label='ROC (area = %0.2f)' % roc_auc)
    # Diagonal reference: the expected curve for a random classifier.
    plt.plot([0, 1], [0, 1], color='navy', lw=1.0, linestyle='--')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel(r'$\mathrm{False \;Positive \;Rate}$')
    plt.ylabel(r'$\mathrm{True \;Positive \;Rate}$')
    plt.legend(loc="lower right")
    # savefig raises if the target directory is missing; create it up front.
    os.makedirs('graphs', exist_ok=True)
    plt.savefig(os.path.join('graphs', '{}_{}_ROC.pdf'.format(config.channel, config.mode)), format='pdf', dpi=1000)
    plt.show()
    plt.gcf().clear()
    return roc_auc

In [2]:
test_file = '/data/dnn/norm_std_dnn_B02rho0gamma_kst.h5'

# Load the standardized dataset and cast labels down to a compact integer type.
df_X_train, df_X_test, df_y_train, df_y_test = load_data(test_file)
df_y_train = df_y_train.astype(np.int8)
df_y_test = df_y_test.astype(np.int8)

# Recombine features with labels so full frames are available downstream.
df_train = pd.concat([df_X_train, df_y_train], axis = 1)
df_test = pd.concat([df_X_test, df_y_test], axis = 1)
config.n_features = df_train.shape[1] - 1  # every column except 'labels'

# Cap the SVM training-set size: SVC fit time grows super-quadratically.
train_X, train_y = df_X_train.iloc[:32000], df_y_train.iloc[:32000]
test_X, test_y = df_X_test.iloc[:10000], df_y_test.iloc[:10000]
# Disjoint slice reserved for the hyperparameter grid search further below.
grid_X, grid_y = df_X_train.iloc[64000:72000], df_y_train.iloc[64000:72000]

In [20]:
from sklearn.multiclass import OneVsRestClassifier

# RBF-kernel SVC wrapped in a one-vs-rest meta-estimator so fitting can be
# parallelized across classes (n_jobs = -1 uses every available core).
base_estimator = svm.SVC(kernel = 'rbf', cache_size = 8192, decision_function_shape = 'ovr')
mySVM = OneVsRestClassifier(base_estimator, n_jobs = -1)

mySVM.fit(train_X.values, train_y.values)

Grid search over the RBF-kernel hyperparameters $C$ and $\gamma$, scored by ROC AUC on a held-out slice of the training data.


In [ ]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV

# Log-spaced search ranges for the two RBF-kernel hyperparameters.
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = {'gamma': gamma_range, 'C': C_range}

# Stratified resampling keeps the class balance identical in every split.
cv = StratifiedShuffleSplit(n_splits = 4, test_size = 0.25, random_state = 28)
grid = GridSearchCV(estimator = svm.SVC(), param_grid = param_grid,
                    cv = cv, n_jobs = -1, scoring = 'roc_auc')
grid.fit(grid_X.values, grid_y.values)

In [ ]:
# Report the winning hyperparameter combination and its cross-validated AUC.
best_params, best_auc = grid.best_params_, grid.best_score_
print('Parameters selected by cross-validation: {}, AUC: {}'.format(best_params, best_auc))