notebook.community



In [1]:

    
%pylab inline

PROFILE = None #'threads-2'









    



Populating the interactive namespace from numpy and matplotlib



In [2]:

    
matplotlib.rc('font', size=16)



In [3]:

    
from collections import OrderedDict
def generate_result(auc_b_c, auc_b_light, auc_c_light, log_loss_value, label=""):
    result = OrderedDict()
    result['name'] = [label]
    result['logloss'] = [log_loss_value]
    result['b vs c'] = [auc_b_c]
    result['b vs light'] = [auc_b_light]
    result['c vs light'] = [auc_c_light]
    return pandas.DataFrame(result)



In [4]:

    
import root_numpy
import pandas
from rep.data import LabeledDataStorage
from hep_ml.decisiontrain import DecisionTrainClassifier, DecisionTrainRegressor
from hep_ml.losses import LogLossFunction, MSELossFunction
from rep.metaml import FoldingClassifier, FoldingRegressor
from rep.report import ClassificationReport
from rep.report.metrics import RocAuc
from sklearn.metrics import roc_auc_score
from rep.estimators import SklearnClassifier

Read data



In [5]:

    
treename = 'tag'
numpy.random.seed(11)

data_b = pandas.DataFrame(root_numpy.root2array('datasets/type=5.root', treename=treename)).dropna()
data_b = data_b.ix[numpy.random.choice(numpy.arange(len(data_b)), replace=False, size=40000), :]

data_c = pandas.DataFrame(root_numpy.root2array('datasets/type=4.root', treename=treename)).dropna()
data_light = pandas.DataFrame(root_numpy.root2array('datasets/type=0.root', treename=treename)).dropna()



In [6]:

    
set(data_light.JetParton)









    Out[6]:





{-3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0}

Add features



In [7]:

    
def add_features(*arrays):
    new_data = []
    for data in arrays:
        data['log_SVFDChi2'] = numpy.log1p(data['SVFDChi2'].values)
        data['log_SVMC'] = numpy.log1p(data['SVMC'].values)
        data['log_SVM'] = numpy.log1p(data['SVM'].values)
        data['log_SVSumIPChi2'] = numpy.log1p(data['SVSumIPChi2'].values)
        data['SV_M_PT'] = numpy.log1p(data['SVM'] / data['SVPT'])
        data['SV_MC_PT'] = numpy.log1p(data['SVMC'] / data['SVPT'])
        data['SVM_diff'] = numpy.log1p(data['SVMC'] ** 2 - data['SVM']**2)
        data['SV_Mdiff_PT'] = numpy.log1p(data['SVM_diff'] / data['SVPT'])
        data['SV_theta'] = numpy.log1p((data['SVMC'] ** 2 - data['SVM']**2) / data['SVPT'])
        data['SVM_rel'] = numpy.log1p(data['SVM'] / data['SVMC'])
        data['SV_R_FD_rel'] = numpy.log1p(data['SVR'] / data['SVFDChi2'])
        data['SV_Q_N_rel'] = 1. * data['SVQ'] / data['SVN']
        
        data = data.drop(['SVFDChi2', 'SVSumIPChi2', 'SVMC', 'SVM'], axis=1)
        new_data.append(data)
    return new_data



In [8]:

    
data_b, data_c, data_light = add_features(data_b, data_c, data_light)



In [9]:

    
len(data_b), len(data_c), len(data_light)









    Out[9]:





(40000, 19941, 42423)



In [10]:

    
jet_features = [column for column in data_b.columns if "Jet" in column]
sv_features = [column for column in data_b.columns if "SV" in column]



In [11]:

    
print "Jet features", ", ".join(jet_features)
print "SV features", ", ".join(sv_features)









    



Jet features JetParton, JetFlavor, JetPx, JetPy, JetPz, JetE, JetQ, JetSigma1, JetSigma2, JetMult, JetPTHard, JetPTD, JetNDis
SV features SVR, SVPT, SVDR, SVN, SVQ, log_SVFDChi2, log_SVMC, log_SVM, log_SVSumIPChi2, SV_M_PT, SV_MC_PT, SVM_diff, SV_Mdiff_PT, SV_theta, SVM_rel, SV_R_FD_rel, SV_Q_N_rel



In [12]:

    
jet_features_base = ['JetQ', 'JetSigma1', 'JetSigma2', 'JetMult', 'JetPTHard', 'JetPTD', 'JetNDis']

Feature pdfs



In [13]:

    
figsize(18, 24)
for i, feature in enumerate(sv_features):
    subplot(len(sv_features) / 3 + 1, 3, i)
    hist(data_b[feature].values, label='b', alpha=0.2, bins=60, normed=True)
    hist(data_c[feature].values, label='c', alpha=0.2, bins=60, normed=True)
    hist(data_light[feature].values, label='light', alpha=0.2, bins=60, normed=True)
    title(feature); legend(loc='best');









    



/moosefs/miniconda/envs/ipython_py2/lib/python2.7/site-packages/matplotlib/axes/_subplots.py:69: MatplotlibDeprecationWarning: The use of 0 (which ends up being the _last_ sub-plot) is deprecated in 1.4 and will raise an error in 1.5
  mplDeprecation)

Prepare datasets:

b vs c
b vs light
c vs light
b, c vs light



In [14]:

    
labels = numpy.array([0] * len(data_b) + [1] * len(data_c) + [2] * len(data_light))
full_data = pandas.concat([data_b, data_c, data_light])
full_data.index = range(len(full_data))



In [15]:

    
data_b_c_lds = LabeledDataStorage(pandas.concat([data_b, data_c]), [1] * len(data_b) + [0] * len(data_c))
data_c_light_lds = LabeledDataStorage(pandas.concat([data_c, data_light]), [1] * len(data_c) + [0] * len(data_light))
data_b_light_lds = LabeledDataStorage(pandas.concat([data_b, data_light]), [1] * len(data_b) + [0] * len(data_light))

data_bc_light_lds = LabeledDataStorage(full_data, [1] * len(data_b) + [1] * len(data_c) + [0] * len(data_light))

Baseline

b vs c
b, c vs light



In [16]:

    
baseline_features = ['log_SVM', 'log_SVMC', 'SVR', 'SVPT', 'SVDR', 'SVN', 'SVQ', 'log_SVFDChi2', 'log_SVSumIPChi2']

b vs c



In [17]:

    
tt_base = DecisionTrainClassifier(learning_rate=0.1, n_estimators=2000, depth=6, pretransform_needed=True, 
                                  max_features=6, loss=LogLossFunction(regularization=30))
tt_folding_base_b_c = FoldingClassifier(SklearnClassifier(tt_base), n_folds=2, 
                                        random_state=11, parallel_profile='threads-2', 
                                        features=baseline_features)
%time tt_folding_base_b_c.fit_lds(data_b_c_lds)
pass









    



CPU times: user 40.5 s, sys: 2.44 s, total: 43 s
Wall time: 10.9 s



In [18]:

    
report_base_b_c = tt_folding_base_b_c.test_on_lds(data_b_c_lds)









    



KFold prediction using folds column



In [19]:

    
report_base_b_c.learning_curve(RocAuc()).plot(new_plot=True)









    



KFold prediction using folds column



In [20]:

    
report_base_b_c.feature_importance()









    Out[20]:



In [21]:

    
auc_base_b_c = report_base_b_c.compute_metric(RocAuc())['clf']
auc_base_b_c









    Out[21]:





0.9497825152951207

b, c vs light



In [22]:

    
tt_base = DecisionTrainClassifier(learning_rate=0.1, n_estimators=15000, depth=6, pretransform_needed=True, 
                                  max_features=6, loss=LogLossFunction(regularization=30))
tt_folding_base_bc_light = FoldingClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=11,
                                             parallel_profile='threads-2', features=baseline_features)
%time tt_folding_base_bc_light.fit_lds(data_bc_light_lds)









    



CPU times: user 6min 31s, sys: 24.1 s, total: 6min 56s
Wall time: 2min 5s






    Out[22]:





FoldingClassifier(base_estimator=SklearnClassifier(clf=DecisionTrainClassifier(bootstrap=True, depth=6, learning_rate=0.1,
            loss=LogLossFunction(regularization=30), max_features=6,
            n_estimators=15000, n_threads=4, pretransform_needed=True,
            train_features=None, use_friedman_mse=True),
         features=None),
         features=['log_SVM', 'log_SVMC', 'SVR', 'SVPT', 'SVDR', 'SVN', 'SVQ', 'log_SVFDChi2', 'log_SVSumIPChi2'],
         n_folds=2, parallel_profile='threads-2', random_state=11)



In [23]:

    
report_base_bc_light = tt_folding_base_bc_light.test_on_lds(data_bc_light_lds)









    



KFold prediction using folds column



In [24]:

    
report_base_bc_light.learning_curve(RocAuc()).plot(new_plot=True)









    



KFold prediction using folds column



In [25]:

    
report_base_bc_light.feature_importance()









    Out[25]:



In [26]:

    
print report_base_bc_light.compute_metric(RocAuc())
probs_base_bc_light = tt_folding_base_bc_light.predict_proba(full_data)[:, 1]
baseline_result = generate_result(auc_base_b_c, 
                                  roc_auc_score(labels < 1, probs_base_bc_light, sample_weight=(labels != 1) * 1),
                                  roc_auc_score(labels < 2, probs_base_bc_light, sample_weight=(labels != 0) * 1),
                                  -1,
                                  label='baseline')









    



OrderedDict([('clf', 0.98226038450259434)])
KFold prediction using folds column

baseline results



In [27]:

    
baseline_result









    Out[27]:






  
    
      
      name
      logloss
      b vs c
      b vs light
      c vs light
    
  
  
    
      0
      baseline
      -1
      0.949783
      0.985596
      0.975569

One versus One

b vs c



In [28]:

    
tt_base = DecisionTrainClassifier(learning_rate=0.1, n_estimators=2000, depth=6, pretransform_needed=True, 
                                  max_features=6, loss=LogLossFunction(regularization=30))
tt_folding_b_c = FoldingClassifier(SklearnClassifier(tt_base), n_folds=2, 
                                   random_state=11, parallel_profile='threads-2', 
                                   features=sv_features)
%time tt_folding_b_c.fit_lds(data_b_c_lds)
pass









    



CPU times: user 19.7 s, sys: 2.53 s, total: 22.3 s
Wall time: 10.7 s



In [29]:

    
report_b_c = tt_folding_b_c.test_on_lds(data_b_c_lds)
report_b_c.learning_curve(RocAuc()).plot(new_plot=True)









    



KFold prediction using folds column
KFold prediction using folds column



In [30]:

    
report_b_c.feature_importance()









    Out[30]:



In [31]:

    
report_b_c.compute_metric(RocAuc())['clf']









    Out[31]:





0.95004684193370448

c vs light



In [32]:

    
tt_base = DecisionTrainClassifier(learning_rate=0.1, n_estimators=7000, depth=6, pretransform_needed=True, 
                                  max_features=6, loss=LogLossFunction(regularization=30))
tt_folding_c_light = FoldingClassifier(SklearnClassifier(tt_base), n_folds=2, 
                                       random_state=11, parallel_profile='threads-2', 
                                       features=sv_features)
%time tt_folding_c_light.fit_lds(data_c_light_lds)
pass









    



CPU times: user 1min 11s, sys: 7.2 s, total: 1min 18s
Wall time: 37.8 s



In [33]:

    
report_c_light = tt_folding_c_light.test_on_lds(data_c_light_lds)
report_c_light.learning_curve(RocAuc()).plot(new_plot=True)









    



KFold prediction using folds column
KFold prediction using folds column



In [34]:

    
report_c_light.feature_importance()









    Out[34]:



In [35]:

    
report_c_light.compute_metric(RocAuc())['clf']









    Out[35]:





0.97680800324041994

b vs light



In [36]:

    
tt_base = DecisionTrainClassifier(learning_rate=0.1, n_estimators=10000, depth=6, pretransform_needed=True, 
                                  max_features=6, loss=LogLossFunction(regularization=30))
tt_folding_b_light = FoldingClassifier(SklearnClassifier(tt_base), n_folds=2, 
                                       random_state=11, parallel_profile='threads-2', 
                                       features=sv_features)
%time tt_folding_b_light.fit_lds(data_b_light_lds)
pass









    



CPU times: user 2min 10s, sys: 17.8 s, total: 2min 28s
Wall time: 1min 11s



In [37]:

    
report_b_light = tt_folding_b_light.test_on_lds(data_b_light_lds)
report_b_light.learning_curve(RocAuc()).plot(new_plot=True)









    



KFold prediction using folds column
KFold prediction using folds column



In [38]:

    
report_b_light.compute_metric(RocAuc())['clf']









    Out[38]:





0.98660432283195421

One vs One result



In [39]:

    
probs_b_c = numpy.concatenate([tt_folding_b_c.predict_proba(pandas.concat([data_b, data_c])),
                               tt_folding_b_c.predict_proba(data_light)])[:, 1]
probs_c_light = numpy.concatenate([tt_folding_c_light.predict_proba(data_b), 
                                   tt_folding_c_light.predict_proba(pandas.concat([data_c, data_light]))])[:, 1]
probs_b_light = tt_folding_b_light.predict_proba(pandas.concat([data_b, data_light]))[:, 1]
probs_b_light = numpy.concatenate([probs_b_light[:len(data_b)], tt_folding_b_light.predict_proba(data_c)[:, 1], 
                                   probs_b_light[len(data_b):]])









    



KFold prediction using folds column
KFold prediction using random classifier (length of data passed not equal to length of train)
KFold prediction using random classifier (length of data passed not equal to length of train)
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using random classifier (length of data passed not equal to length of train)



In [40]:

    
one_vs_one_result = generate_result(report_b_c.compute_metric(RocAuc())['clf'],
                                    report_b_light.compute_metric(RocAuc())['clf'],
                                    report_c_light.compute_metric(RocAuc())['clf'],
                                    -1, label='one vs one, add features')

Baseline with new features



In [41]:

    
tt_base = DecisionTrainClassifier(learning_rate=0.1, n_estimators=15000, depth=6, pretransform_needed=True, 
                                  max_features=6, loss=LogLossFunction(regularization=30))
tt_folding_base_add_bc_light = FoldingClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=11,
                                                 parallel_profile='threads-2', features=sv_features)
%time tt_folding_base_add_bc_light.fit_lds(data_bc_light_lds)









    



CPU times: user 3min 56s, sys: 24.7 s, total: 4min 21s
Wall time: 2min 4s






    Out[41]:





FoldingClassifier(base_estimator=SklearnClassifier(clf=DecisionTrainClassifier(bootstrap=True, depth=6, learning_rate=0.1,
            loss=LogLossFunction(regularization=30), max_features=6,
            n_estimators=15000, n_threads=4, pretransform_needed=True,
            train_features=None, use_friedman_mse=True),
         features=None),
         features=['SVR', 'SVPT', 'SVDR', 'SVN', 'SVQ', 'log_SVFDChi2', 'log_SVMC', 'log_SVM', 'log_SVSumIPChi2', 'SV_M_PT', 'SV_MC_PT', 'SVM_diff', 'SV_Mdiff_PT', 'SV_theta', 'SVM_rel', 'SV_R_FD_rel', 'SV_Q_N_rel'],
         n_folds=2, parallel_profile='threads-2', random_state=11)



In [42]:

    
report_base_add_bc_light = tt_folding_base_add_bc_light.test_on_lds(data_bc_light_lds)
report_base_add_bc_light.learning_curve(RocAuc()).plot(new_plot=True)









    



KFold prediction using folds column
KFold prediction using folds column



In [43]:

    
report_base_add_bc_light.feature_importance()









    Out[43]:



In [44]:

    
print report_base_add_bc_light.compute_metric(RocAuc())
probs_base_add_bc_light = tt_folding_base_add_bc_light.predict_proba(full_data)[:, 1]
baseline_add_result = generate_result(report_b_c.compute_metric(RocAuc())['clf'], 
                                      roc_auc_score(labels < 1, probs_base_add_bc_light, 
                                                    sample_weight=(labels != 1) * 1),
                                      roc_auc_score(labels < 2, probs_base_add_bc_light, 
                                                    sample_weight=(labels != 0) * 1),
                                      -1,
                                      label='baseline, add features')









    



OrderedDict([('clf', 0.98256034277312876)])
KFold prediction using folds column



In [45]:

    
pandas.concat([baseline_result, baseline_add_result, one_vs_one_result])









    Out[45]:






  
    
      
      name
      logloss
      b vs c
      b vs light
      c vs light
    
  
  
    
      0
      baseline
      -1
      0.949783
      0.985596
      0.975569
    
    
      0
      baseline, add features
      -1
      0.950047
      0.985900
      0.975862
    
    
      0
      one vs one, add features
      -1
      0.950047
      0.986604
      0.976808

MN b vs c



In [46]:

    
from rep_ef.estimators import MatrixNetClassifier



In [47]:

    
mn_base = MatrixNetClassifier(connection_url='mn',
                              iterations=5000, regularization=0.02, sync=False)
mn_folding_b_c = FoldingClassifier(mn_base, n_folds=2, random_state=11,
                                   parallel_profile='threads-2', features=sv_features)
%time mn_folding_b_c.fit_lds(data_b_c_lds)









    



CPU times: user 1.02 s, sys: 332 ms, total: 1.35 s
Wall time: 1.97 s






    Out[47]:





FoldingClassifier(base_estimator=MatrixNetClassifier(auto_stop=None, baseline_feature=None,
          command_line_params=None, connection_token=None,
          connection_url='mn', dump_filename=None,
          features_sample_rate_per_iteration=1.0, intervals=64,
          iterations=5000, max_features_per_iteration=6,
          regularization=0.02, sync=False, train_features=None,
          training_fraction=0.5),
         features=['SVR', 'SVPT', 'SVDR', 'SVN', 'SVQ', 'log_SVFDChi2', 'log_SVMC', 'log_SVM', 'log_SVSumIPChi2', 'SV_M_PT', 'SV_MC_PT', 'SVM_diff', 'SV_Mdiff_PT', 'SV_theta', 'SVM_rel', 'SV_R_FD_rel', 'SV_Q_N_rel'],
         n_folds=2, parallel_profile='threads-2', random_state=11)



In [48]:

    
report_mn_b_c = mn_folding_b_c.test_on_lds(data_b_c_lds)
report_mn_b_c.learning_curve(RocAuc()).plot(new_plot=True)









    



KFold prediction using folds column
KFold prediction using folds column



In [49]:

    
report_mn_b_c.compute_metric(RocAuc())['clf']









    Out[49]:





0.95076077428413819



In [ ]:

    
mn_base = MatrixNetClassifier(connection_url='mn', 
                              iterations=5000, regularization=0.03, sync=False)
mn_folding_bc_light = FoldingClassifier(mn_base, n_folds=2, random_state=11,
                                        parallel_profile='threads-2', features=sv_features)
%time mn_folding_bc_light.fit_lds(data_bc_light_lds)



In [56]:

    
report_mn_bc_light = mn_folding_bc_light.test_on_lds(data_bc_light_lds)
report_mn_bc_light.learning_curve(RocAuc()).plot(new_plot=True)









    



KFold prediction using folds column
KFold prediction using folds column



In [57]:

    
report_mn_bc_light.compute_metric(RocAuc())['clf']









    Out[57]:





0.98296983622577772



In [58]:

    
probs_mn_bc_light = mn_folding_bc_light.predict_proba(full_data)[:, 1]
mn_result = generate_result(report_mn_b_c.compute_metric(RocAuc())['clf'], 
                            roc_auc_score(labels < 1, probs_mn_bc_light, 
                                          sample_weight=(labels != 1) * 1),
                            roc_auc_score(labels < 2, probs_mn_bc_light, 
                                            sample_weight=(labels != 0) * 1),
                            -1,
                            label='mn, add features')









    



KFold prediction using folds column



In [59]:

    
pandas.concat([baseline_result, baseline_add_result, one_vs_one_result, mn_result])









    Out[59]:






  
    
      
      name
      logloss
      b vs c
      b vs light
      c vs light
    
  
  
    
      0
      baseline
      -1
      0.949783
      0.985596
      0.975569
    
    
      0
      baseline, add features
      -1
      0.950047
      0.985900
      0.975862
    
    
      0
      one vs one, add features
      -1
      0.950047
      0.986604
      0.976808
    
    
      0
      mn, add features
      -1
      0.950761
      0.986284
      0.976321

Hierarchical training



In [60]:

    
from hep_ml.nnet import MLPClassifier
from rep.estimators import SklearnClassifier, TheanetsClassifier



In [61]:

    
from utils import train_one_vs_one

mlp features (one vs one)



In [62]:

    
mlp_columns = train_one_vs_one([SklearnClassifier(MLPClassifier(layers=(30, 10), scaler='iron', 
                                                                epochs=700, random_state=11))]*3, 
                                data_b, data_c, data_light, 'mlp', folding=True,
                                n_folds=2, features=sv_features)









    



KFold prediction using folds column
KFold prediction using random classifier (length of data passed not equal to length of train)
KFold prediction using random classifier (length of data passed not equal to length of train)
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using random classifier (length of data passed not equal to length of train)



In [63]:

    
full_data = pandas.concat([full_data, mlp_columns], axis=1)



In [65]:

    
mlp_features = list(mlp_columns.columns)



In [69]:

    
from sklearn.ensemble import AdaBoostClassifier
from hep_ml.nnet import MLPMultiClassifier



In [85]:

    
combo_base = AdaBoostClassifier(MLPMultiClassifier(layers=(40, 30, 20, 10), random_state=11, epochs=700), random_state=11,
                                n_estimators=10, learning_rate=0.1)
combo_multi = FoldingClassifier(SklearnClassifier(combo_base), n_folds=2, random_state=13,
                                features=sv_features)
%time combo_multi.fit(full_data, labels)









    



CPU times: user 1h 57min 18s, sys: 2h 34min 42s, total: 4h 32min 1s
Wall time: 45min 6s






    Out[85]:





FoldingClassifier(base_estimator=SklearnClassifier(clf=AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=MLPMultiClassifier(epochs=700, layers=(40, 30, 20, 10), random_state=11,
          scaler='standard', trainer='irprop-', trainer_parameters=None),
          learning_rate=0.1, n_estimators=10, random_state=11),
         features=None),
         features=['SVR', 'SVPT', 'SVDR', 'SVN', 'SVQ', 'log_SVFDChi2', 'log_SVMC', 'log_SVM', 'log_SVSumIPChi2', 'SV_M_PT', 'SV_MC_PT', 'SVM_diff', 'SV_Mdiff_PT', 'SV_theta', 'SVM_rel', 'SV_R_FD_rel', 'SV_Q_N_rel'],
         n_folds=2, parallel_profile=None, random_state=13)



In [86]:

    
combo_multi_probs = combo_multi.predict_proba(full_data)
log_loss(labels, combo_multi_probs)
combo_multi_result = generate_result(1 - roc_auc_score(labels > 0, combo_multi_probs[:, 0] / combo_multi_probs[:, 1], 
                                                 sample_weight=(labels != 2) * 1),
                               1 - roc_auc_score(labels > 1, combo_multi_probs[:, 0] / combo_multi_probs[:, 2],
                                                      sample_weight=(labels != 1) * 1),
                               1 - roc_auc_score(labels > 1, combo_multi_probs[:, 1] / combo_multi_probs[:, 2],
                                                 sample_weight=(labels != 0) * 1),
                               log_loss(labels, combo_multi_probs),
                               label='multiclass combo')

pandas.concat([baseline_result, baseline_add_result, one_vs_one_result, 
               mn_result,  combo_multi_result])









    



KFold prediction using folds column






    Out[86]:






  
    
      
      name
      logloss
      b vs c
      b vs light
      c vs light
    
  
  
    
      0
      baseline
      -1.000000
      0.949783
      0.985596
      0.975569
    
    
      0
      baseline, add features
      -1.000000
      0.950047
      0.985900
      0.975862
    
    
      0
      one vs one, add features
      -1.000000
      0.950047
      0.986604
      0.976808
    
    
      0
      mn, add features
      -1.000000
      0.950761
      0.986284
      0.976321
    
    
      0
      multiclass combo
      0.323668
      0.951336
      0.989367
      0.982157



In [75]:









    



KFold prediction using folds column






    Out[75]:






  
    
      
      name
      logloss
      b vs c
      b vs light
      c vs light
    
  
  
    
      0
      baseline
      -1.000000
      0.949783
      0.985596
      0.975569
    
    
      0
      baseline, add features
      -1.000000
      0.950047
      0.985900
      0.975862
    
    
      0
      one vs one, add features
      -1.000000
      0.950047
      0.986604
      0.976808
    
    
      0
      mn, add features
      -1.000000
      0.950761
      0.986284
      0.976321
    
    
      0
      multiclass combo
      0.329491
      0.950639
      0.988430
      0.980871



In [64]:

    
sv_features









    Out[64]:





['SVR',
 'SVPT',
 'SVDR',
 'SVN',
 'SVQ',
 'log_SVFDChi2',
 'log_SVMC',
 'log_SVM',
 'log_SVSumIPChi2',
 'SV_M_PT',
 'SV_MC_PT',
 'SVM_diff',
 'SV_Mdiff_PT',
 'SV_theta',
 'SVM_rel',
 'SV_R_FD_rel',
 'SV_Q_N_rel']



In [ ]:

    
sv_features_without_categorial = list(set(sv_features) - {'SVN', 'SVQ', "SV_Q_N_rel"})



In [ ]:

    
mlp_columns_without_categorial = train_one_vs_one([SklearnClassifier(MLPClassifier(
                layers=(30, 10), epochs=700, scaler='iron', random_state=11))]*3, 
                data_b, data_c, data_light, 'rbf_cat', folding=True,
                n_folds=3, features=sv_features_without_categorial)



In [62]:

    
full_data = pandas.concat([full_data, mlp_columns_without_categorial], axis=1)



In [63]:

    
mlp_columns_group1 = train_one_vs_one([SklearnClassifier(MLPClassifier(
                                        layers=(20, 10), epochs=400, random_state=11))]*3, 
                                        data_b, data_c, data_light, 'gr1', folding=True,
                                        n_folds=3, features=['log_SVM', 'log_SVMC', 'SVR', 'log_SVFDChi2'])









    



KFold prediction using folds column
KFold prediction using random classifier (length of data passed not equal to length of train)
KFold prediction using random classifier (length of data passed not equal to length of train)
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using random classifier (length of data passed not equal to length of train)



In [64]:

    
mlp_columns_group2 = train_one_vs_one([SklearnClassifier(MLPClassifier(
                                        layers=(20, 10), epochs=400, random_state=11))]*3, 
                                        data_b, data_c, data_light, 'gr2', folding=True,
                                        n_folds=3, features=['SVPT', 'SVDR', 'SVR', 'log_SVFDChi2'])









    



KFold prediction using folds column
KFold prediction using random classifier (length of data passed not equal to length of train)
KFold prediction using random classifier (length of data passed not equal to length of train)
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using random classifier (length of data passed not equal to length of train)



In [65]:

    
full_data = pandas.concat([full_data, mlp_columns_group1], axis=1)
full_data = pandas.concat([full_data, mlp_columns_group2], axis=1)

mn features (one vs one)



In [66]:

    
mn_columns = train_one_vs_one([MatrixNetClassifier(connection='mn', connection_auth='AUTH_HEADERS', 
                               iterations=5000, regularization=0.02, sync=False)]*3, 
                               data_b, data_c, data_light, 'mn', folding=True,
                               n_folds=2, features=sv_features)









    



KFold prediction using folds column
KFold prediction using random classifier (length of data passed not equal to length of train)
KFold prediction using random classifier (length of data passed not equal to length of train)
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using random classifier (length of data passed not equal to length of train)



In [67]:

    
full_data = pandas.concat([full_data, mn_columns], axis=1)



In [68]:

    
mpl_features = list(mlp_columns.columns)
mn_features = list(mn_columns.columns)
rbf_features_cat = list(mlp_columns_without_categorial.columns)
mlp1_features = list(mlp_columns_group1.columns)
mlp2_features = list(mlp_columns_group2.columns)

Random forest selection



In [69]:

    
from sklearn.ensemble import RandomForestClassifier



In [70]:

    
rf_folding = FoldingClassifier(RandomForestClassifier(n_estimators=500, max_depth=8, min_samples_split=100,
                                                      min_samples_leaf=20, random_state=11),
                              n_folds=2, random_state=13,
                              features=sv_features)
rf_folding.fit_lds(data_b_c_lds)









    Out[70]:





FoldingClassifier(base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=20, min_samples_split=100,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=11, verbose=0, warm_start=False),
         features=['SVR', 'SVPT', 'SVDR', 'SVN', 'SVQ', 'log_SVFDChi2', 'log_SVMC', 'log_SVM', 'log_SVSumIPChi2', 'SVM_diff', 'SV_theta', 'SVM_rel', 'SV_R_FD_rel', 'SV_Q_N_rel'],
         n_folds=2, parallel_profile=None, random_state=13)



In [71]:

    
p = rf_folding.predict_proba(pandas.concat([data_b, data_c]))[:, 1]









    



KFold prediction using folds column



In [72]:

    
p_b = p[:len(data_b)]
p_c = p[len(data_b):]



In [73]:

    
figure(figsize=(8, 5))
scatter(p_b, numpy.exp(data_b.log_SVM.values) + 0.01, alpha=0.05)
scatter(p_c, numpy.exp(data_c.log_SVM.values) + 0.01, alpha=0.05, color='r')
xlim(0, 1.01)
ylim(0, 5)









    Out[73]:





(0, 5)






    



/moosefs/miniconda/envs/ipython_py2/lib/python2.7/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):



In [74]:

    
figure(figsize=(8, 5))
hist(p[:len(data_b)], bins=100, alpha=0.4, normed=True)
hist(p[len(data_b):], bins=100, alpha=0.4, normed=True)
pass



In [75]:

    
data_b_new = full_data[labels == 0].ix[p_b > 0.8, :]
data_c_new = full_data[labels == 1].ix[p_c < 0.8, :]



In [76]:

    
data_b_remain = full_data[labels == 0].ix[p_b < 0.8, :]
data_c_remain = full_data[labels == 1].ix[p_c > 0.8, :]



In [77]:

    
labels_new = numpy.array([0] * len(data_b_new) + [1] * len(data_c_new) + [2] * len(data_light))
full_data_new = pandas.concat([data_b_new, data_c_new, full_data[labels == 2]])
full_data_new.index = range(len(full_data_new))



In [78]:

    
data_multi_lds_new = LabeledDataStorage(full_data_new, labels_new)



In [79]:

    
from rep.estimators import XGBoostClassifier
xgb_base = XGBoostClassifier(n_estimators=3000, colsample=0.7, eta=0.01, nthreads=16, 
                             subsample=0.7, max_depth=6)
multi_folding = FoldingClassifier(xgb_base, n_folds=2, random_state=13,
                                  features=sv_features)
%time multi_folding.fit_lds(data_multi_lds_new)









    



CPU times: user 21min 22s, sys: 2min 27s, total: 23min 49s
Wall time: 7min 14s






    Out[79]:





FoldingClassifier(base_estimator=XGBoostClassifier(base_score=0.5, colsample=0.7, eta=0.01, features=None,
         gamma=None, max_depth=6, min_child_weight=1.0, missing=-999.0,
         n_estimators=3000, nthreads=16, num_feature=None, random_state=0,
         scale_pos_weight=1.0, subsample=0.7, verbose=0),
         features=['SVR', 'SVPT', 'SVDR', 'SVN', 'SVQ', 'log_SVFDChi2', 'log_SVMC', 'log_SVM', 'log_SVSumIPChi2', 'SVM_diff', 'SV_theta', 'SVM_rel', 'SV_R_FD_rel', 'SV_Q_N_rel'],
         n_folds=2, parallel_profile=None, random_state=13)



In [80]:

    
p1 = multi_folding.predict_proba(full_data_new)
p2 = multi_folding.predict_proba(data_b_remain)
p3 = multi_folding.predict_proba(data_c_remain)









    



KFold prediction using folds column
KFold prediction using random classifier (length of data passed not equal to length of train)
KFold prediction using random classifier (length of data passed not equal to length of train)



In [81]:

    
p = numpy.concatenate([p1, p2, p3])
labels_temp = numpy.array(list(labels_new) + [0] * len(p2) + [1] * len(p3))



In [73]:

    
from sklearn.metrics import log_loss



In [84]:

    
multi_result_selection = generate_result(1 - roc_auc_score(labels_temp > 0, p[:, 0] / p[:, 1], 
                                                           sample_weight=(labels_temp != 2) * 1),
                                           1 - roc_auc_score(labels_temp > 1, p[:, 0] / p[:, 2],
                                                             sample_weight=(labels_temp != 1) * 1),
                                           1 - roc_auc_score(labels_temp > 1, p[:, 1] / p[:, 2],
                                                             sample_weight=(labels_temp != 0) * 1),
                                       log_loss(labels_temp, p),
                                       label='multiclass, NN features + selection')

pandas.concat([baseline_result, baseline_add_result, one_vs_one_result, mn_result, multi_result_selection])









    Out[84]:






  
    
      
      name
      logloss
      b vs c
      b vs light
      c vs light
    
  
  
    
      0
      baseline
      -1.000000
      0.949783
      0.985596
      0.975569
    
    
      0
      baseline, add features
      -1.000000
      0.950050
      0.986048
      0.976101
    
    
      0
      one vs one, add features
      -1.000000
      0.950050
      0.986691
      0.977061
    
    
      0
      mn, add features
      -1.000000
      0.950821
      0.986283
      0.976571
    
    
      0
      multiclass, NN features + selection
      0.793759
      0.940595
      0.960148
      0.970705

Multiclassification



In [105]:

    
data_multi_lds = LabeledDataStorage(full_data, labels)



In [106]:

    
from rep.estimators import XGBoostClassifier
xgb_base = XGBoostClassifier(n_estimators=3000, colsample=0.7, eta=0.01, nthreads=16, 
                             subsample=0.7, max_depth=6)
multi_folding = FoldingClassifier(xgb_base, n_folds=2, random_state=13,
                                  features=sv_features + mlp1_features + mlp2_features)
%time multi_folding.fit_lds(data_multi_lds)









    



CPU times: user 36min 22s, sys: 2min 45s, total: 39min 7s
Wall time: 9min 55s






    Out[106]:





FoldingClassifier(base_estimator=XGBoostClassifier(base_score=0.5, colsample=0.7, eta=0.01, features=None,
         gamma=None, max_depth=6, min_child_weight=1.0, missing=-999.0,
         n_estimators=3000, nthreads=16, num_feature=None, random_state=0,
         scale_pos_weight=1.0, subsample=0.7, verbose=0),
         features=['SVR', 'SVPT', 'SVDR', 'SVN', 'SVQ', 'log_SVFDChi2', 'log_SVMC', 'log_SVM', 'log_SVSumIPChi2', 'SVM_diff', 'SV_theta', 'SVM_rel', 'SV_R_FD_rel', 'SV_Q_N_rel', 'rbf_b_c', 'rbf_b_light', 'rbf_c_light', 'gr1_b_c', 'gr1_b_light', 'gr1_c_light', 'gr2_b_c', 'gr2_b_light', 'gr2_c_light'],
         n_folds=2, parallel_profile=None, random_state=13)



In [107]:

    
multi_report = multi_folding.test_on_lds(data_multi_lds)









    



KFold prediction using folds column



In [108]:

    
multi_report.feature_importance().plot(new_plot=True)



In [109]:

    
from sklearn.metrics import log_loss
probs_xgb_mult = multi_folding.predict_proba(full_data)
print 'log loss\t', log_loss(labels, probs_xgb_mult)









    



KFold prediction using folds column
log loss	0.302776518194



In [87]:

    
# 0.302275	0.949335	0.988883	0.981100

# my 0.949379 0.988952 0.981269

#0.949040	0.987689	0.979919

# 0.9501 0.9865 0.9777
#0.951279	0.987887	0.98012



In [110]:

    
multi_result = generate_result(1 - roc_auc_score(labels > 0, probs_xgb_mult[:, 0] / probs_xgb_mult[:, 1], 
                                                 sample_weight=(labels != 2) * 1),
                               1 - roc_auc_score(labels > 1, probs_xgb_mult[:, 0] / probs_xgb_mult[:, 2],
                                                      sample_weight=(labels != 1) * 1),
                               1 - roc_auc_score(labels > 1, probs_xgb_mult[:, 1] / probs_xgb_mult[:, 2],
                                                 sample_weight=(labels != 0) * 1),
                               log_loss(labels, probs_xgb_mult),
                               label='multiclass, NN features')

pandas.concat([baseline_result, baseline_add_result, one_vs_one_result, mn_result, multi_result])









    Out[110]:






  
    
      
      name
      logloss
      b vs c
      b vs light
      c vs light
    
  
  
    
      0
      baseline
      -1.000000
      0.949783
      0.985596
      0.975569
    
    
      0
      baseline, add features
      -1.000000
      0.950050
      0.986048
      0.976101
    
    
      0
      one vs one, add features
      -1.000000
      0.950050
      0.986691
      0.977061
    
    
      0
      mn, add features
      -1.000000
      0.950821
      0.986283
      0.976571
    
    
      0
      multiclass, NN features
      0.302777
      0.949580
      0.988853
      0.980874

MLP



In [145]:

    
full_data['mult_xgb_0'] = probs_xgb_mult[:, 0] / probs_xgb_mult[:, 1]
full_data['mult_xgb_1'] = probs_xgb_mult[:, 0] / probs_xgb_mult[:, 2]
full_data['mult_xgb_2'] = probs_xgb_mult[:, 1] / probs_xgb_mult[:, 2]



In [146]:

    
xgb_features = ['mult_xgb_0', 'mult_xgb_1', 'mult_xgb_2']



In [154]:

    
full_data['mn'] = probs_mn_bc_light



In [163]:

    
from hep_ml.nnet import MLPMultiClassifier
mlp_multi_base = MLPMultiClassifier(layers=(30, 10), random_state=11, epochs=700)
mlp_multi = FoldingClassifier(mlp_multi_base, n_folds=2, 
                              features=sv_features + rbf_features + combo_features)
%time mlp_multi.fit(full_data, labels)









    



CPU times: user 4min 57s, sys: 6min 11s, total: 11min 9s
Wall time: 1min 23s






    Out[163]:





FoldingClassifier(base_estimator=MLPMultiClassifier(epochs=700, layers=(30, 10), random_state=11,
          scaler='standard', trainer='irprop-', trainer_parameters=None),
         features=['SVM', 'SVMC', 'SVR', 'SVPT', 'SVDR', 'SVN', 'SVQ', 'log_SVFDChi2', 'log_SVSumIPChi2', 'SVM_diff', 'SV_theta', 'SVM_rel', 'SV_R_FD_rel', 'SV_Q_N_rel', 'rbf_b_c', 'rbf_b_light', 'rbf_c_light', 'combo_0', 'combo_1', 'combo_2'],
         n_folds=2, parallel_profile=None, random_state=None)



In [164]:

    
mlp_multi_probs = mlp_multi.predict_proba(full_data)
log_loss(labels, mlp_multi_probs)









    



KFold prediction using folds column






    Out[164]:





0.30371548769389162



In [217]:

    
mlp_multi_result = generate_result(1 - roc_auc_score(labels > 0, mlp_multi_probs[:, 0] / mlp_multi_probs[:, 1], 
                                                 sample_weight=(labels != 2) * 1),
                               1 - roc_auc_score(labels > 1, mlp_multi_probs[:, 0] / mlp_multi_probs[:, 2],
                                                      sample_weight=(labels != 1) * 1),
                               1 - roc_auc_score(labels > 1, mlp_multi_probs[:, 1] / mlp_multi_probs[:, 2],
                                                 sample_weight=(labels != 0) * 1),
                               log_loss(labels, mlp_multi_probs),
                               label='multiclass NN, NN features')

pandas.concat([baseline_result, baseline_add_result, one_vs_one_result, mn_result, multi_result, mlp_multi_result])









    Out[217]:






  
    
      
      name
      logloss
      b vs c
      b vs light
      c vs light
    
  
  
    
      0
      baseline
      -1.000000
      0.949218
      0.985552
      0.975261
    
    
      0
      baseline, add features
      -1.000000
      0.949592
      0.985957
      0.975653
    
    
      0
      one vs one, add features
      -1.000000
      0.949592
      0.986477
      0.977006
    
    
      0
      mn, add features
      -1.000000
      0.950182
      0.986155
      0.976294
    
    
      0
      multiclass, NN features
      0.300620
      0.949504
      0.989182
      0.981199
    
    
      0
      multiclass NN, NN features
      0.303715
      0.949605
      0.988334
      0.981056

Bagging over NN



In [363]:

    
from hep_ml.nnet import MLPMultiClassifier
mlp_multi_base = MLPMultiClassifier(layers=(30, 10), random_state=11, epochs=700)
mlp_multi = FoldingClassifier(mlp_multi_base, n_folds=2, random_state=13,
                              features=sv_features + rbf_features + combo_features + jet_features)
%time mlp_multi.fit(full_data, labels)









    



CPU times: user 5min 1s, sys: 6min 10s, total: 11min 11s
Wall time: 1min 24s






    Out[363]:





FoldingClassifier(base_estimator=MLPMultiClassifier(epochs=700, layers=(30, 10), random_state=11,
          scaler='standard', trainer='irprop-', trainer_parameters=None),
         features=['SVM', 'SVMC', 'SVR', 'SVPT', 'SVDR', 'SVN', 'SVQ', 'log_SVFDChi2', 'log_SVSumIPChi2', 'SVM_diff', 'SV_theta', 'SVM_rel', 'SV_R_FD_rel', 'SV_Q_N_rel', 'rbf_b_c', 'rbf_b_light', 'rbf_c_light', 'combo_0', 'combo_1', 'combo_2'],
         n_folds=2, parallel_profile=None, random_state=13)



In [364]:

    
mlp_multi_probs = mlp_multi.predict_proba(full_data)
log_loss(labels, mlp_multi_probs)









    



KFold prediction using folds column






    Out[364]:





0.3035165177688911



In [365]:

    
mlp_multi_result2 = generate_result(1 - roc_auc_score(labels > 0, mlp_multi_probs[:, 0] / mlp_multi_probs[:, 1], 
                                                 sample_weight=(labels != 2) * 1),
                               1 - roc_auc_score(labels > 1, mlp_multi_probs[:, 0] / mlp_multi_probs[:, 2],
                                                      sample_weight=(labels != 1) * 1),
                               1 - roc_auc_score(labels > 1, mlp_multi_probs[:, 1] / mlp_multi_probs[:, 2],
                                                 sample_weight=(labels != 0) * 1),
                               log_loss(labels, mlp_multi_probs),
                               label='multiclass2 NN, NN features')

pandas.concat([baseline_result, baseline_add_result, one_vs_one_result, 
               mn_result, multi_result, mlp_multi_result, mlp_multi_result2])









    Out[365]:






  
    
      
      name
      logloss
      b vs c
      b vs light
      c vs light
    
  
  
    
      0
      baseline
      -1.000000
      0.949218
      0.985552
      0.975261
    
    
      0
      baseline, add features
      -1.000000
      0.949592
      0.985957
      0.975653
    
    
      0
      one vs one, add features
      -1.000000
      0.949592
      0.986477
      0.977006
    
    
      0
      mn, add features
      -1.000000
      0.950182
      0.986155
      0.976294
    
    
      0
      multiclass, NN features
      0.300620
      0.949504
      0.989182
      0.981199
    
    
      0
      multiclass NN, NN features
      0.303715
      0.949605
      0.988334
      0.981056
    
    
      0
      multiclass2 NN, NN features
      0.303517
      0.949405
      0.988516
      0.981045



In [352]:

    
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier



In [238]:

    
combo_base = BaggingClassifier(MLPMultiClassifier(layers=(30, 10), random_state=11, epochs=700), max_samples=0.8, 
                               max_features=0.8, bootstrap_features=True, random_state=11,
                               n_estimators=10)
combo_multi = FoldingClassifier(SklearnClassifier(combo_base), n_folds=2, random_state=13,
                                features=sv_features + rbf_features + jet_features_base)
%time combo_multi.fit(full_data, labels)









    



CPU times: user 49min 11s, sys: 1h 44s, total: 1h 49min 56s
Wall time: 13min 38s






    Out[238]:





FoldingClassifier(base_estimator=SklearnClassifier(clf=BaggingClassifier(base_estimator=MLPMultiClassifier(epochs=700, layers=(30, 10), random_state=11,
          scaler='standard', trainer='irprop-', trainer_parameters=None),
         bootstrap=True, bootstrap_features=True, max_features=0.8,
         max_samples=0.8, n_estimators=10, n_jobs=1, oob_score=False,
         random_state=11, verbose=0, warm_start=False),
         features=None),
         features=['SVM', 'SVMC', 'SVR', 'SVPT', 'SVDR', 'SVN', 'SVQ', 'log_SVFDChi2', 'log_SVSumIPChi2', 'SVM_diff', 'SV_theta', 'SVM_rel', 'SV_R_FD_rel', 'SV_Q_N_rel', 'rbf_b_c', 'rbf_b_light', 'rbf_c_light', 'JetQ', 'JetSigma1', 'JetSigma2', 'JetMult', 'JetPTHard', 'JetPTD', 'JetNDis'],
         n_folds=2, parallel_profile=None, random_state=13)



In [307]:

    
combo_base = AdaBoostClassifier(MLPMultiClassifier(layers=(30, 10), random_state=11, epochs=700), random_state=11,
                                n_estimators=10, learning_rate=0.1)
combo_multi = FoldingClassifier(SklearnClassifier(combo_base), n_folds=2, random_state=13,
                                features=sv_features + rbf_features)
%time combo_multi.fit(full_data, labels)









    



CPU times: user 48min 14s, sys: 1h 22s, total: 1h 48min 36s
Wall time: 13min 42s






    Out[307]:





FoldingClassifier(base_estimator=SklearnClassifier(clf=AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=MLPMultiClassifier(epochs=700, layers=(30, 10), random_state=11,
          scaler='standard', trainer='irprop-', trainer_parameters=None),
          learning_rate=0.1, n_estimators=10, random_state=11),
         features=None),
         features=['SVM', 'SVMC', 'SVR', 'SVPT', 'SVDR', 'SVN', 'SVQ', 'log_SVFDChi2', 'log_SVSumIPChi2', 'SVM_diff', 'SV_theta', 'SVM_rel', 'SV_R_FD_rel', 'SV_Q_N_rel', 'rbf_b_c', 'rbf_b_light', 'rbf_c_light'],
         n_folds=2, parallel_profile=None, random_state=13)



In [308]:

    
combo_multi_probs = combo_multi.predict_proba(full_data)
log_loss(labels, combo_multi_probs)









    



KFold prediction using folds column






    Out[308]:





0.32860535944797775



In [309]:

    
combo_multi_probs = combo_multi.predict_proba(full_data)
log_loss(labels, combo_multi_probs)
combo_multi_result = generate_result(1 - roc_auc_score(labels > 0, combo_multi_probs[:, 0] / combo_multi_probs[:, 1], 
                                                 sample_weight=(labels != 2) * 1),
                               1 - roc_auc_score(labels > 1, combo_multi_probs[:, 0] / combo_multi_probs[:, 2],
                                                      sample_weight=(labels != 1) * 1),
                               1 - roc_auc_score(labels > 1, combo_multi_probs[:, 1] / combo_multi_probs[:, 2],
                                                 sample_weight=(labels != 0) * 1),
                               log_loss(labels, combo_multi_probs),
                               label='multiclass combo')

pandas.concat([baseline_result, baseline_add_result, one_vs_one_result, 
               mn_result, multi_result, mlp_multi_result, combo_multi_result])









    Out[309]:






  
    
      
      name
      logloss
      b vs c
      b vs light
      c vs light
    
  
  
    
      0
      baseline
      -1.000000
      0.949218
      0.985552
      0.975261
    
    
      0
      baseline, add features
      -1.000000
      0.949592
      0.985957
      0.975653
    
    
      0
      one vs one, add features
      -1.000000
      0.949592
      0.986477
      0.977006
    
    
      0
      mn, add features
      -1.000000
      0.950182
      0.986155
      0.976294
    
    
      0
      multiclass, NN features
      0.300620
      0.949504
      0.989182
      0.981199
    
    
      0
      multiclass NN, NN features
      0.303715
      0.949605
      0.988334
      0.981056
    
    
      0
      multiclass combo
      0.328605
      0.950393
      0.988648
      0.981527



In [220]:

    
combo_multi_result = generate_result(1 - roc_auc_score(labels > 0, combo_multi_probs[:, 0] / combo_multi_probs[:, 1], 
                                                 sample_weight=(labels != 2) * 1),
                               1 - roc_auc_score(labels > 1, combo_multi_probs[:, 0] / combo_multi_probs[:, 2],
                                                      sample_weight=(labels != 1) * 1),
                               1 - roc_auc_score(labels > 1, combo_multi_probs[:, 1] / combo_multi_probs[:, 2],
                                                 sample_weight=(labels != 0) * 1),
                               log_loss(labels, combo_multi_probs),
                               label='multiclass combo')

pandas.concat([baseline_result, baseline_add_result, one_vs_one_result, 
               mn_result, multi_result, mlp_multi_result, combo_multi_result])









    Out[220]:






  
    
      
      name
      logloss
      b vs c
      b vs light
      c vs light
    
  
  
    
      0
      baseline
      -1.000000
      0.949218
      0.985552
      0.975261
    
    
      0
      baseline, add features
      -1.000000
      0.949592
      0.985957
      0.975653
    
    
      0
      one vs one, add features
      -1.000000
      0.949592
      0.986477
      0.977006
    
    
      0
      mn, add features
      -1.000000
      0.950182
      0.986155
      0.976294
    
    
      0
      multiclass, NN features
      0.300620
      0.949504
      0.989182
      0.981199
    
    
      0
      multiclass NN, NN features
      0.303715
      0.949605
      0.988334
      0.981056
    
    
      0
      multiclass combo
      0.325421
      0.950355
      0.988817
      0.981465



In [161]:

    
full_data['combo_0'] = combo_multi_probs[:, 0] / combo_multi_probs[:, 1]
full_data['combo_1'] = combo_multi_probs[:, 0] / combo_multi_probs[:, 2]
full_data['combo_2'] = combo_multi_probs[:, 1] / combo_multi_probs[:, 2]



In [162]:

    
combo_features = ['combo_0', 'combo_1', 'combo_2']

Hierarchical multiclassification (construct 1d discrimination plane)



In [254]:

    
label_rank = labels - 1
data_ranker_lds = LabeledDataStorage(full_data, label_rank)

XGboost



In [255]:

    
from rep.estimators import XGBoostRegressor



In [256]:

    
xgb_base = XGBoostRegressor(n_estimators=500, colsample=0.7, eta=0.01, nthreads=8, 
                            subsample=0.7, max_depth=6)
xgb_folding_regression = FoldingRegressor(xgb_base, n_folds=2, random_state=11,
                                          parallel_profile='threads-2', 
                                          features=sv_features + rbf_features)
%time xgb_folding_regression.fit_lds(data_ranker_lds)
pass









    



CPU times: user 1min 36s, sys: 5.99 s, total: 1min 42s
Wall time: 20.1 s



In [257]:

    
report_reg = xgb_folding_regression.test_on_lds(data_ranker_lds)









    



KFold prediction using folds column



In [258]:

    
from sklearn.metrics import mean_squared_error
lc = report_reg.learning_curve(mean_squared_error, steps=10)









    



KFold prediction using folds column



In [260]:

    
lc.plot(new_plot=True)



In [261]:

    
figsize(8, 6)
reg_values = xgb_folding_regression.predict(full_data)
for label in numpy.unique(labels):
    hist(reg_values[labels == label], 
         bins=100, alpha=0.1, normed=True,  label=str(label))
legend()









    



KFold prediction using folds column






    Out[261]:





<matplotlib.legend.Legend at 0x7f434008d810>



In [262]:

    
full_data['xgb_reg'] = xgb_folding_regression.predict(full_data)









    



KFold prediction using folds column

Decision Train regression



In [268]:

    
tt_base = DecisionTrainRegressor(learning_rate=0.02, n_estimators=10000, depth=6, pretransform_needed=True, 
                                 max_features=0.7, loss=MSELossFunction(regularization=100))
tt_folding_regression = FoldingRegressor(tt_base, n_folds=2, random_state=11,
                                         parallel_profile='threads-2', 
                                         features=sv_features)
%time tt_folding_regression.fit_lds(data_ranker_lds)
pass









    



CPU times: user 1min 7s, sys: 13.2 s, total: 1min 21s
Wall time: 39.6 s



In [269]:

    
report_dt_reg = tt_folding_regression.test_on_lds(data_ranker_lds)









    



KFold prediction using folds column



In [270]:

    
lc_dt = report_dt_reg.learning_curve(mean_squared_error, steps=1)









    



KFold prediction using folds column



In [271]:

    
lc_dt.plot(new_plot=True)
lc.plot()
# ylim(0.16, 0.18)



In [277]:

    
figsize(8, 6)
reg_values = tt_folding_regression.predict(full_data)
for label in numpy.unique(labels):
    hist(reg_values[labels == label], 
         bins=100, alpha=0.1, normed=True, range=(-0.5, 2.5), label=str(label))
xlim(-1, 2.5)
# ylim(0, 0.05)
legend()









    



KFold prediction using folds column






    Out[277]:





<matplotlib.legend.Legend at 0x7f4340443c50>

Ranker



In [278]:

    
from hep_ml.gradientboosting import UGradientBoostingRegressor
from hep_ml.losses import RankBoostLossFunction
from rep.estimators import SklearnClassifier

decision train ranker



In [280]:

    
full_data['fake_request'] = 0



In [286]:

    
tt_base = DecisionTrainRegressor(learning_rate=0.02, n_estimators=10000, depth=6, pretransform_needed=True, 
                                 max_features=6, 
                                 train_features=sv_features + rbf_features,
                                 loss=RankBoostLossFunction(request_column='fake_request'))
tt_folding_ranking = FoldingRegressor(SklearnClassifier(tt_base), n_folds=2, random_state=11,
                                      parallel_profile='threads-2', features=sv_features + rbf_features + ['fake_request'])
%time tt_folding_ranking.fit_lds(data_multi_lds)
pass









    



CPU times: user 10min 21s, sys: 50.2 s, total: 11min 12s
Wall time: 6min 2s



In [290]:

    
tt_folding_ranking.estimators[0].feature_importances_









    Out[290]:





array([ 0.047 ,  0.047 ,  0.0622,  0.0628,  0.0637,  0.0422,  0.0198,
        0.0511,  0.0577,  0.0395,  0.0501,  0.0461,  0.0531,  0.029 ,
        0.102 ,  0.1261,  0.1006])



In [291]:

    
tt_ranks = tt_folding_ranking.predict(full_data)
for label in numpy.unique(labels):
    hist(tt_ranks[labels == label], bins=100, alpha=0.1, 
         normed=True, range=(-20, 10), label=str(label))
legend()









    



KFold prediction using folds column






    Out[291]:





<matplotlib.legend.Legend at 0x7f4340549b10>



In [292]:

    
full_data['dt_rank'] = tt_folding_ranking.predict(full_data)









    



KFold prediction using folds column

ugb ranker



In [55]:

    
ranker_base = UGradientBoostingRegressor(loss=RankBoostLossFunction(request_column='fake_request'), subsample=0.6, 
                                         n_estimators=400, max_depth=6, 
                                         train_features=sv_features + ['bdt1', 'bdt2'])
ranker = FoldingRegressor(ranker_base, n_folds=2, random_state=11,
                          parallel_profile='threads-2')
%time ranker.fit_lds(data_multi_lds)
pass









    



CPU times: user 4min 32s, sys: 1.64 s, total: 4min 33s
Wall time: 2min 19s



In [67]:

    
ranker.estimators[0].feature_importances_









    Out[67]:





array([ 0.100652  ,  0.10031386,  0.09663487,  0.08486942,  0.08862554,
        0.01144869,  0.01208862,  0.04759602,  0.04616032,  0.04938687,
        0.05137088,  0.11066463,  0.20018827])



In [56]:

    
ranks = ranker.predict(full_data)
tt_ranks = tt_folding_ranking.predict(full_data)









    



KFold prediction using folds column
KFold prediction using folds column

pdfs form for dt ranker



In [57]:

    
for label in numpy.unique(full_data.label.values):
    hist(tt_ranks[full_data.label.values == label], bins=100, alpha=0.1, 
         normed=True, range=(-20, 10), label=str(label))
legend()









    Out[57]:





<matplotlib.legend.Legend at 0x7f9b1527d490>

pdfs from for ugb ranker



In [58]:

    
for label in numpy.unique(full_data.label.values):
    hist(ranks[full_data.label.values == label], bins=100, alpha=0.1, normed=True, range=(-20, 10), label=str(label))
legend()









    Out[58]:





<matplotlib.legend.Legend at 0x7f9b152c5e10>



In [59]:

    
labels = full_data.label.values
ranker_result = generate_result(roc_auc_score(labels[labels != 2] > 0, ranks[labels != 2]),
                                roc_auc_score(labels[labels != 1] > 0, ranks[labels != 1]),
                                roc_auc_score(labels[labels != 0] > 1, ranks[labels != 0]), 
                                label='ranker')



In [60]:

    
tt_ranker_result = generate_result(roc_auc_score(labels[labels != 2] > 0, tt_ranks[labels != 2]),
                                   roc_auc_score(labels[labels != 1] > 0, tt_ranks[labels != 1]),
                                   roc_auc_score(labels[labels != 0] > 1, tt_ranks[labels != 0]), 
                                   label='dt ranker')



In [ ]:



In [ ]:

Results



In [61]:

    
result = pandas.concat([baseline_result, multiclass_result,
                       regression_result, ranker_result, tt_ranker_result])
result.index = result['name']
result = result.drop('name', axis=1)
result

	b vs c	b vs light	c vs light
name
baseline	0.948797	0.981854	0.967735
multiclass	0.948900	0.983470	0.973202
regression	0.896365	0.982162	0.943175
ranker	0.898323	0.982022	0.940905
dt ranker	0.898226	0.981927	0.942289

name	logloss	b vs c	b vs light	c vs light
baseline	-1	0.949783	0.985596	0.975569
baseline, add features	-1	0.950047	0.985900	0.975862
one vs one, add features	-1	0.950047	0.986604	0.976808

name	logloss	b vs c	b vs light	c vs light
baseline	-1.000000	0.949783	0.985596	0.975569
baseline, add features	-1.000000	0.950047	0.985900	0.975862
one vs one, add features	-1.000000	0.950047	0.986604	0.976808
mn, add features	-1.000000	0.950761	0.986284	0.976321
multiclass combo	0.323668	0.951336	0.989367	0.982157

name	logloss	b vs c	b vs light	c vs light
baseline	-1.000000	0.949218	0.985552	0.975261
baseline, add features	-1.000000	0.949592	0.985957	0.975653
one vs one, add features	-1.000000	0.949592	0.986477	0.977006
mn, add features	-1.000000	0.950182	0.986155	0.976294
multiclass, NN features	0.300620	0.949504	0.989182	0.981199
multiclass NN, NN features	0.303715	0.949605	0.988334	0.981056