In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
%run additional.ipynb

In [3]:
pandas.set_option('display.max_colwidth', 140)

HLT1 track classifier

Training channels (read data)

We will use all available modes: every mode except 30000000 is treated as signal, while mode 30000000 serves as background.


In [4]:
bck_train_mode_name = 30000000
sig_train_modes_names = list(set(empty_events.keys()) - {bck_train_mode_name})
sig_train_files = ['mod_{}.csv'.format(name) for name in sig_train_modes_names]
bck_train_files = 'mod_30000000.csv'
folder = "datasets/prepared_hlt_track/"

In [5]:
# concat all signal data
if not os.path.exists(folder + 'signal_track.csv'):
    concat_files(folder, sig_train_files, os.path.join(folder , 'signal_track.csv'))
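The helper concat_files is defined in additional.ipynb. A plausible sketch of what it does, assuming the same tab-separated convention used when the files are read back (the implementation here is illustrative, not the original):

    def concat_files(folder, files, output_path):
        # sketch: read each per-mode CSV and write one combined tab-separated file
        dfs = [pandas.read_csv(os.path.join(folder, f), sep='\t') for f in files]
        pandas.concat(dfs, ignore_index=True).to_csv(output_path, sep='\t', index=False)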

In [6]:
signal_data = pandas.read_csv(os.path.join(folder, 'signal_track.csv'), sep='\t')
bck_data = pandas.read_csv(os.path.join(folder, bck_train_files), sep='\t')

In [7]:
signal_data.columns


Out[7]:
Index([u'unique', u'mode', u'event_number', u'trk_number', u'pass_trk', u'signal', u'pt', u'p', u'ip', u'ipchi2', u'nvelo', u'nt', u'chi2', u'ismu', u'good', u'sig'], dtype='object')

Counting events and SVRs

that passed the L0 and GoodGenB preselections (this data was produced by the skim)
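statistic_length also comes from additional.ipynb. Judging by its output below, it counts SVR candidates (rows) and distinct events; a minimal sketch, assuming events are identified by the 'unique' column:

    def statistic_length(data):
        # sketch: one row per SVR candidate; events are labelled by 'unique'
        return {'SVR': len(data), 'Events': len(numpy.unique(data['unique']))}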


In [8]:
print 'Signal', statistic_length(signal_data)
print 'Bck', statistic_length(bck_data)


Signal {'SVR': 9126936, 'Events': 587497}
Bck {'SVR': 1404261, 'Events': 110520}

In [9]:
total_bck_events = statistic_length(bck_data)['Events'] + empty_events[bck_train_mode_name]
total_signal_events_by_mode = dict()
for mode in sig_train_modes_names:
    total_signal_events_by_mode[mode] = statistic_length(signal_data[signal_data['mode'] == mode])['Events'] + empty_events[mode]

Event distribution by mode


In [10]:
print 'Bck:', total_bck_events
'Signal:', total_signal_events_by_mode


Bck: 183791
Out[10]:
('Signal:',
 {11102003: 23299,
  11104121: 23321,
  11114001: 62834,
  11114101: 38503,
  11124001: 0,
  11296013: 17233,
  11536011: 14050,
  11874004: 27792,
  11874042: 4039,
  12103035: 25963,
  12103121: 25697,
  12165106: 0,
  12265042: 0,
  12873002: 9797,
  12873432: 0,
  12875500: 0,
  13102201: 0,
  13102412: 34782,
  13104012: 0,
  13112001: 36061,
  13144001: 55645,
  13144020: 40397,
  13246001: 38151,
  13264021: 25563,
  13512010: 46435,
  13774002: 12938,
  15164001: 7273,
  15512011: 29846,
  20000000: 0,
  21263002: 9707,
  23103110: 0,
  25103000: 0,
  25103110: 0,
  27163002: 977,
  27163400: 469,
  27265001: 1015,
  27265101: 2699})

Define variables

We will use two sets of variables: the base set defined below, and an extended set with log-transformed features used later for logistic regression.


In [11]:
variables_base = ["pt", "ipchi2"]

Counting events and SVRs

that passed pass_trk (equivalent to Mike's preselections for the n-body selection)


In [12]:
# hlt1 track selection
signal_data = signal_data[signal_data['pass_trk'] == 1]
bck_data = bck_data[bck_data['pass_trk'] == 1]

In [13]:
print 'Signal', statistic_length(signal_data)
print 'Bck', statistic_length(bck_data)


Signal {'SVR': 2092390, 'Events': 576519}
Bck {'SVR': 316559, 'Events': 95858}

In [14]:
total_signal_events_by_mode_presel = dict()
for mode in sig_train_modes_names:
    total_signal_events_by_mode_presel[mode] = statistic_length(signal_data[signal_data['mode'] == mode])['Events']
total_bck_events_presel = statistic_length(bck_data)['Events']

Event distribution by mode


In [15]:
print 'Bck:', total_bck_events_presel
'Signal:', total_signal_events_by_mode_presel


Bck: 95858
Out[15]:
('Signal:',
 {11102003: 21776,
  11104121: 21285,
  11114001: 60684,
  11114101: 35702,
  11124001: 0,
  11296013: 17129,
  11536011: 13834,
  11874004: 27067,
  11874042: 3926,
  12103035: 24897,
  12103121: 20015,
  12165106: 0,
  12265042: 0,
  12873002: 9404,
  12873432: 0,
  12875500: 0,
  13102201: 0,
  13102412: 30400,
  13104012: 0,
  13112001: 34367,
  13144001: 53937,
  13144020: 39012,
  13246001: 37152,
  13264021: 25042,
  13512010: 42211,
  13774002: 12503,
  15164001: 7074,
  15512011: 26855,
  20000000: 0,
  21263002: 8472,
  23103110: 0,
  25103000: 0,
  25103110: 0,
  27163002: 775,
  27163400: 333,
  27265001: 792,
  27265101: 1875})

In [16]:
signal_data.head()


Out[16]:
unique mode event_number trk_number pass_trk signal pt p ip ipchi2 nvelo nt chi2 ismu good sig
0 13144001_0 13144001 0 0 1 1 1390.58 23691.70 0.170297 52.7346 19 24 1.07259 0 1 1
7 13144001_0 13144001 0 7 1 1 1072.72 16412.20 0.134709 22.5363 15 19 1.09721 0 1 1
8 13144001_0 13144001 0 8 1 1 1234.32 15950.30 1.033650 712.2440 12 19 1.26025 1 1 1
11 13144001_0 13144001 0 11 1 1 1848.32 9100.94 1.320430 3941.8900 12 20 1.31286 1 1 1
14 13144001_0 13144001 0 14 1 1 1234.32 15950.30 1.033630 711.8770 12 19 1.25972 1 1 1

Prepare train/test splitting

Randomly divide the events that passed all preselections into two equal parts.
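prepare_data is defined in additional.ipynb. A rough sketch of the splitting it presumably performs, dividing by the event identifier so that all tracks of one event land in the same half (names and details here are assumptions):

    def prepare_data(signal, bck, group_column):
        # sketch: split event identifiers (not individual tracks) 50/50 at random
        def split(data):
            ids = numpy.unique(data[group_column])
            train_ids = numpy.random.choice(ids, size=len(ids) // 2, replace=False)
            mask = numpy.in1d(data[group_column], train_ids)
            return data[mask], data[~mask]
        train_sig, test_sig = split(signal)
        train_bck, test_bck = split(bck)
        return train_sig, train_bck, test_sig, test_bck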


In [17]:
ds_train_signal, ds_train_bck, ds_test_signal, ds_test_bck = prepare_data(signal_data, bck_data, 'unique')

Train: counting events and SVRs


In [18]:
print 'Signal', statistic_length(ds_train_signal)
print 'Bck', statistic_length(ds_train_bck)


Signal {'SVR': 1045984, 'Events': 288259}
Bck {'SVR': 158294, 'Events': 47929}

In [19]:
train = pandas.concat([ds_train_bck, ds_train_signal])

Test: counting events and SVRs


In [20]:
print 'Signal', statistic_length(ds_test_signal)
print 'Bck', statistic_length(ds_test_bck)


Signal {'SVR': 1046406, 'Events': 288260}
Bck {'SVR': 158265, 'Events': 47929}

In [21]:
test = pandas.concat([ds_test_bck, ds_test_signal])

Define the total number of events in the test samples

(those that passed just L0 and GoodGenB), including the empty events. Assume that events which did not pass pass_trk were also divided equally at random between the training and test samples.


In [22]:
total_test_bck_events = (total_bck_events - total_bck_events_presel) // 2 + statistic_length(ds_test_bck)['Events']
total_test_signal_events = dict()
for mode in sig_train_modes_names:
    total_not_passed_signal = total_signal_events_by_mode[mode] - total_signal_events_by_mode_presel[mode]
    total_test_signal_events[mode] = total_not_passed_signal // 2 + \
        statistic_length(ds_test_signal[ds_test_signal['mode'] == mode])['Events']

In [23]:
print 'Bck total test events:', total_test_bck_events
'Signal total test events:', total_test_signal_events


Bck total test events: 91895
Out[23]:
('Signal total test events:',
 {11102003: 11684,
  11104121: 11679,
  11114001: 31565,
  11114101: 19220,
  11124001: 0,
  11296013: 8502,
  11536011: 7092,
  11874004: 13927,
  11874042: 2047,
  12103035: 13058,
  12103121: 12894,
  12165106: 0,
  12265042: 0,
  12873002: 4907,
  12873432: 0,
  12875500: 0,
  13102201: 0,
  13102412: 17470,
  13104012: 0,
  13112001: 17895,
  13144001: 27802,
  13144020: 20171,
  13246001: 18995,
  13264021: 12759,
  13512010: 23049,
  13774002: 6501,
  15164001: 3632,
  15512011: 14963,
  20000000: 0,
  21263002: 4888,
  23103110: 0,
  25103000: 0,
  25103110: 0,
  27163002: 484,
  27163400: 238,
  27265001: 487,
  27265101: 1328})

In [24]:
from hep_ml import nnet
from sklearn.metrics import roc_auc_score

Training


In [25]:
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats.distributions import norm

class SupervisedTransform(BaseEstimator, TransformerMixin):
    def __init__(self, scale=0., like_normal=False):
        """
        This transformation applies nonlinear rescale to each axis,
        only order of events is taken into account.

        If sig and bck events are neighbours in some feature, they will be 'splitted' by 'insertion'
        Scale is how big insertion is
        """
        self.like_normal = like_normal
        self.scale = scale

    def fit(self, X, y):
        X = numpy.array(X)
        self.initial_values = []
        self.transformed_values = []
        for axis in range(X.shape[1]):
            initial_values = X[:, axis] * (1 + 1e-6 * numpy.random.normal(size=len(X)))
            initial_values += 1e-8 * numpy.random.normal(size=len(X))
            indices = numpy.argsort(initial_values)
            initial_values = initial_values[indices]
            self.initial_values.append(initial_values)
            transformed = numpy.arange(len(X), dtype='float')
            # increase the distance between neighbours of different classes
            additions = numpy.abs(numpy.diff(y[indices]))
            additions = numpy.cumsum(additions)
            transformed[1:] += additions * self.scale
            transformed /= transformed[-1] / 2.
            transformed -= 1

            if self.like_normal:
                # converting to normal-like distributions
                transformed -= transformed[0]
                transformed /= transformed[-1] / 0.9
                transformed += 0.05
                transformed = norm().ppf(transformed)

            self.transformed_values.append(transformed)

        return self

    def transform(self, X):
        X = numpy.array(X)
        result = []
        for axis, (init_vals, trans_vals) in enumerate(zip(self.initial_values, self.transformed_values)):
            result.append(numpy.interp(X[:, axis], init_vals, trans_vals))
        return numpy.vstack(result).T
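A tiny usage example of the transformer on toy data (the data here is illustrative; in this notebook the transform is plugged into the networks as the scaler):

    X_toy = numpy.random.normal(size=(1000, 2))
    y_toy = numpy.random.randint(0, 2, size=1000)
    st = SupervisedTransform(scale=1.).fit(X_toy, y_toy)
    X_rescaled = st.transform(X_toy)  # each column monotonically mapped to [-1, 1]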

In [26]:
models = {}

In [27]:
# import cPickle
# with open('models/hlt1.pkl', 'r') as file:
#     models = cPickle.load(file)

MatrixNet training


In [27]:
from rep_ef.estimators import EventFilterClassifier

In [28]:
models['MN'] = EventFilterClassifier(features=variables_base, 
                                     dataset_name='hlt1_{random}', 
                                     iterations=5000, sync=False)
models['MN'].fit(train, train['signal'])


Out[28]:
EventFilterClassifier(auto_stop=None, baseline=None, command_line_params=None,
           connection='default', dataset_name='hlt1_121895',
           dump_filename=None, features=['pt', 'ipchi2'],
           features_sample_rate_per_iteration=1.0, formula_name='EF',
           intervals=64, iterations=5000, max_features_per_iteration=6,
           regularization=0.01, sync=False, training_fraction=0.5)

In [29]:
models['MN'].get_feature_importances().sort('effect')


Out[29]:
effect efficiency information
ipchi2 0.473405 0.49 0.967058
pt 1.000000 1.00 1.000000

In [30]:
print roc_auc_score(test['signal'], models['MN'].predict_proba(test)[:, 1])


0.908735913497

Special NN


In [32]:
from hep_ml import nnet
from rep.estimators import SklearnClassifier
base_nn = nnet.MultiLayerNetwork(layers=[7, 7], scaler=SupervisedTransform(scale=0., ), epochs=300)
models['special NN'] = SklearnClassifier(base_nn, features=variables_base)

In [33]:
%time models['special NN'].fit(train, train['signal'])


CPU times: user 4min 34s, sys: 3min 4s, total: 7min 39s
Wall time: 5min 57s
Out[33]:
SklearnClassifier(clf=MultiLayerNetwork(epochs=300, layers=[7, 7], loss='log_loss',
         random_state=<mtrand.RandomState object at 0x335c1d0>,
         scaler=SupervisedTransform(like_normal=False, scale=0.0),
         trainer='irprop-', trainer_parameters=None),
         features=['pt', 'ipchi2'])

In [34]:
print roc_auc_score(test['signal'], models['special NN'].predict_proba(test)[:, 1])
print roc_auc_score(train['signal'], models['special NN'].predict_proba(train)[:, 1])


0.903015727377
0.902884392309

Theanets


In [35]:
from rep.estimators import TheanetsClassifier
models['NN'] = TheanetsClassifier(layers=[8, 8], scaler=SupervisedTransform(scale=0., ), features=variables_base)

In [36]:
%time models['NN'].fit(train, train['signal'])


CPU times: user 8min 22s, sys: 6min 57s, total: 15min 20s
Wall time: 8min 13s
Out[36]:
TheanetsClassifier(decode_from=1, features=['pt', 'ipchi2'],
          hidden_activation='logistic', hidden_dropouts=0, hidden_noise=0,
          input_dropouts=0, input_layer=-1, input_noise=0, layers=[8, 8],
          output_activation='linear', output_layer=-1, random_state=42,
          scaler=SupervisedTransform(like_normal=False, scale=0.0),
          trainers=[{}])

In [37]:
print roc_auc_score(test['signal'], models['NN'].predict_proba(test)[:, 1])
print roc_auc_score(train['signal'], models['NN'].predict_proba(train)[:, 1])


0.905360860344
0.905113248163

Logistic Regression


In [38]:
from rep.estimators import SklearnClassifier
from sklearn.linear_model import LogisticRegression
models['logistic regression'] = SklearnClassifier(LogisticRegression(C=10), 
                                                  features=['pt', 'ipchi2',
                                                            'llpt: log(log(pt))', 
                                                            'llip: log(log(ipchi2))',
                                                            'lpt: log(pt)', 'lip: log(ipchi2)'])
models['logistic regression'].fit(train, train['signal'])


Out[38]:
SklearnClassifier(clf=LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0),
         features=['pt', 'ipchi2', 'llpt: log(log(pt))', 'llip: log(log(ipchi2))', 'lpt: log(pt)', 'lip: log(ipchi2)'])

Gradient Boosting


In [39]:
from sklearn.ensemble import GradientBoostingClassifier
models['GB'] = SklearnClassifier(GradientBoostingClassifier(n_estimators=500, learning_rate=0.05, 
                                                            max_depth=4, subsample=0.3), features=variables_base)
models['GB'].fit(train, train['signal'])


Out[39]:
SklearnClassifier(clf=GradientBoostingClassifier(init=None, learning_rate=0.05, loss='deviance',
              max_depth=4, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=500,
              random_state=None, subsample=0.3, verbose=0,
              warm_start=False),
         features=['pt', 'ipchi2'])

Calculate thresholds for the classifiers at the target output rates


In [28]:
thresholds = dict()
RATE = [100000., 50000.]
events_pass = dict()
for name, cl in models.items():
    prob = cl.predict_proba(ds_test_bck)
    thr, result = calculate_thresholds(ds_test_bck, prob, total_test_bck_events, rates=RATE)
    thresholds[name] = thr
    print name, result
# the passed-event counts at a fixed rate are (nearly) identical for all
# classifiers, so taking them from the last classifier's `result` suffices
for rate, val in result.items():
    events_pass[rate] = val[1]


special NN {100000.0: (0.9386080762114769, 9189, 0.09999455900756297), 50000.0: (0.97375860454788443, 4594, 0.04999183851134447)}
MN {100000.0: (0.94372227634864903, 9179, 0.09988573915882257), 50000.0: (0.97239078344689123, 4586, 0.04990478263235214)}
logistic regression {100000.0: (0.96249247558889151, 9189, 0.09999455900756297), 50000.0: (0.97618482085207137, 4594, 0.04999183851134447)}
NN {100000.0: (0.95871122150276855, 9189, 0.09999455900756297), 50000.0: (0.98430484180601552, 4594, 0.04999183851134447)}
GB {100000.0: (0.94397741593796591, 9188, 0.09998367702268894), 50000.0: (0.97434938833825169, 4594, 0.04999183851134447)}
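calculate_thresholds is defined in additional.ipynb. The tuples above read as (threshold, passed events, passed fraction), and the numbers are consistent with picking, per target rate, the cut that lets the matching fraction of all background events fire. A sketch under the assumptions of a 1 MHz input rate and a per-event response equal to the maximum track probability:

    def calculate_thresholds_sketch(data, prob, total_events, rates, input_rate=1.e6):
        # per-event response: maximum signal probability over the event's tracks
        event_prob = pandas.Series(prob[:, 1]).groupby(data['unique'].values).max().values
        thresholds, result = {}, {}
        for rate in rates:
            n_pass = int(total_events * rate / input_rate)  # events allowed to fire
            thr = numpy.sort(event_prob)[-n_pass]
            thresholds[rate] = thr
            result[rate] = (thr, n_pass, float(n_pass) / total_events)
        return thresholds, result

With total_test_bck_events = 91895 this gives 9189 events at 100 kHz and 4594 at 50 kHz, matching the printout above.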

Save models


In [41]:
import cPickle
with open('models/hlt1.pkl', 'w') as file:
    cPickle.dump(models, file)

Plot decision boundary


In [30]:
def plot_boundary(name, rate=100000.):
    t = thresholds[name][rate]
    prob = models[name].predict_proba(ds_test_signal)[:, 1]
    plt.figure(figsize=(10, 8))
    plt.scatter(numpy.log(ds_test_signal['ipchi2'].values[prob > t]), 
                numpy.log(ds_test_signal['pt'].values[prob > t]),
                alpha=0.01, label='signal')
    plt.scatter(numpy.log(ds_test_signal['ipchi2'].values[prob <= t]), 
                numpy.log(ds_test_signal['pt'].values[prob <= t]),
                alpha=0.01, label='misclassified signal', color='r')
    plt.legend(loc='best', fontsize=16, scatterpoints=500)
    plt.xlabel('log(ipchi2)', fontsize=16)
    plt.ylabel('log(pt)', fontsize=16)
    plt.title("Estimator: " + name, fontsize=16)
    plt.show()
    
for name in models:
    plot_boundary(name)


Initial correlation


In [31]:
scatter(numpy.log(ds_test_signal['ipchi2'].values), numpy.log(ds_test_signal['pt'].values), alpha=0.01, label='signal')
scatter(numpy.log(ds_test_bck['ipchi2'].values), numpy.log(ds_test_bck['pt'].values), color='r', alpha=0.01, label='background')
legend(loc='best', fontsize=16, scatterpoints=500)
xlabel('log(ipchi2)', fontsize=16)
ylabel('log(pt)', fontsize=16)
title('Tracks description', fontsize=16)
# plt.savefig('hlt1_init.pdf' , format='pdf')


Out[31]:
<matplotlib.text.Text at 0x2f761a10>

In [32]:
def plot_eff(name, rate=100000.):
    prob = models[name].predict_proba(ds_test_bck)[:, 1]
    t = thresholds[name][rate]
    cut_pass = prob > t
    matr, b1, b2 = numpy.histogram2d(numpy.log(ds_test_bck['ipchi2'].values), 
                                     numpy.log(ds_test_bck['pt'].values), bins=(25, 20),
                                     weights=cut_pass)

    matr2, b11, b22 = numpy.histogram2d(numpy.log(ds_test_bck['ipchi2'].values), 
                                        numpy.log(ds_test_bck['pt'].values), bins=(25, 20))
    figsize(10, 8)
    p = plt.pcolor(matr / (matr2 + 10e-5))
    plt.colorbar(p)
    xlabel('log(pt) bins', fontsize=16)
    title('Background track efficiency for ' + name, fontsize=16)
    ylabel('log(ipchi2) bins', fontsize=16)
    show()
    
for name in models:
    plot_eff(name)


Final efficiencies for each mode


In [29]:
train_modes_eff, statistic = result_statistic(models, sig_train_modes_names, ds_test_signal,
                                              thresholds, [100000.], total_test_signal_events)
train_modes_eff_5, statistic_5 = result_statistic(models, sig_train_modes_names, ds_test_signal,
                                                  thresholds, [50000.], total_test_signal_events)
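result_statistic lives in additional.ipynb as well. The per-mode efficiencies below are consistent with the percentage of a mode's total test events whose per-event response passes the threshold; a sketch for a single model and mode (helper name and details are hypothetical):

    def mode_efficiency(model, data, mode, threshold, total_events_by_mode):
        # efficiency = passed test events / all test events of the mode, in percent
        if total_events_by_mode[mode] == 0:
            return 0.
        mode_data = data[data['mode'] == mode]
        prob = model.predict_proba(mode_data)[:, 1]
        passed = pandas.Series(prob).groupby(mode_data['unique'].values).max() > threshold
        return 100. * passed.sum() / total_events_by_mode[mode]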

Rate 100000 (100 kHz)


In [30]:
from rep.plotting import BarComparePlot
BarComparePlot(train_modes_eff, sortby=('MN', 100000.0)).plot(new_plot=True, figsize=(24, 8), 
                                                                  ylabel='efficiency', fontsize=22)
lgd = legend(bbox_to_anchor=(0.5, 1.4), loc='upper center', ncol=2, fontsize=22)
# plt.savefig('track.pdf' , format='pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')



In [38]:
pandas.DataFrame(statistic)


Out[38]:
100000.0 classifier mode name
0 89.403640 special NN 13144001 $B_{s}^{0} \rightarrow \psi(1S)[\mu^{+}\mu^{-}]\phi[K^{+}K^{-}]$
1 35.714286 special NN 27163400 $D^{*+}(2010) \rightarrow D^{0}[K^{-}\pi^{+}\pi^{0}]\pi^{+}$
2 88.818387 special NN 11536011 $B^{0} \rightarrow D^{*-}[\pi^{-}\bar{D}^{0}[K^{+}\pi^{-}]\tau^{+}[\pi^{+}\pi^{+}\pi^{-}\bar{\nu}_{\tau}]\nu_{\tau}$
3 94.260174 special NN 11296013 $B^{0} \rightarrow D^{+}[K^{-}\pi^{+}\pi^{+}]D^{-}[K^{+}\pi^{-}\pi^{-}]$
4 87.977190 special NN 11114001 $B^{0} \rightarrow K^{*}[K^{+}\pi^{-}]\mu^{+}\mu^{-}$
5 91.527549 special NN 13264021 $B_{s}^{0} \rightarrow D_{s}^{-}[K^{+}K^{-}\pi^{-}]\pi^{+}$
6 61.415712 special NN 21263002 $D^{+} \rightarrow K^{-}K^{+}\pi^{+}$
7 88.098635 special NN 12873002 $B^{+} \rightarrow \bar{D}^{0}[K^{+}\pi^{-}]\mu^{+}\nu_{\mu}$
8 88.486444 special NN 13246001 $B_{s}^{0} \rightarrow \psi(1S)[\mu^{+}\mu^{-}]K^{+}K^{-}\pi^{+}\pi^{-}$
9 86.663590 special NN 13774002 $B_{s}^{0} \rightarrow D_{s}^{-}[K^{+}K^{-}\pi^{-}]\mu^{+}\nu_{\mu}$
10 85.373160 special NN 11102003 $B^{0} \rightarrow K^{-}\pi^{+}$
11 90.393965 special NN 13112001 $B_{s}^{0} \rightarrow \mu^{+}\mu^{-}$
12 80.719337 special NN 13512010 $B_{s}^{0} \rightarrow K^{-}\mu^{+}\nu_{\mu}$
13 78.607231 special NN 15512011 $\Lambda_{b}^{0} \rightarrow p^{+}\mu^{-}\bar{\nu}_{\mu}$
14 72.764740 special NN 13102412 $B_{s}^{0} \rightarrow K^{-}K^{+}\pi^{0}$
15 29.066265 special NN 27265101 $D^{*+}(2010) \rightarrow D^{0}[K_{S}[\pi^{+}\pi^{-}]K^{+}K^{-}]\pi^{+}$
16 67.100977 special NN 12103121 $B^{+} \rightarrow K^{+}K_{S}[\pi^{+}\pi^{-}]$
17 88.909821 special NN 13144020 $B_{s}^{0} \rightarrow \psi(2S)[\mu^{+}\mu^{-}]\phi[K^{+}K^{-}]$
18 49.586777 special NN 27163002 $D^{*+}(2010) \rightarrow D^{0}[K^{+}K^{-}]\pi^{+}$
19 89.179515 special NN 15164001 $\Lambda_{b}^{0} \rightarrow \Lambda_{c}^{+}[p^{+}K^{+}\pi^{-}]\pi^{-}$
20 30.595483 special NN 27265001 $D^{*+}(2010) \rightarrow D^{0}[K^{+}K^{-}\pi^{+}\pi^{-}]\pi^{+}$
21 90.837941 special NN 11874004 $B^{0} \rightarrow D^{*-}[\pi^{-}\bar{D}^{0}[K^{+}\pi^{-}]]\mu^{+}\nu_{\mu}$
22 83.251821 special NN 11114101 $B^{0} \rightarrow K_{S}^{0}[\pi^{+}\pi^{-}]\mu^{+}\mu^{-}$
23 80.263721 special NN 11104121 $B^{0} \rightarrow K_{S}^{0}[\pi^{+}\pi^{-}]\pi^{+}\pi^{-}$
24 88.519785 special NN 11874042 $B^{0} \rightarrow D^{-}[K^{+}\pi^{-}\pi^{-}]\mu^{+}\nu_{\mu}$
25 88.941645 special NN 12103035 $B^{+} \rightarrow \pi^{+}K^{-}K^{+}$
26 89.195022 MN 13144001 $B_{s}^{0} \rightarrow \psi(1S)[\mu^{+}\mu^{-}]\phi[K^{+}K^{-}]$
27 34.033613 MN 27163400 $D^{*+}(2010) \rightarrow D^{0}[K^{-}\pi^{+}\pi^{0}]\pi^{+}$
28 88.465877 MN 11536011 $B^{0} \rightarrow D^{*-}[\pi^{-}\bar{D}^{0}[K^{+}\pi^{-}]\tau^{+}[\pi^{+}\pi^{+}\pi^{-}\bar{\nu}_{\tau}]\nu_{\tau}$
29 94.413079 MN 11296013 $B^{0} \rightarrow D^{+}[K^{-}\pi^{+}\pi^{+}]D^{-}[K^{+}\pi^{-}\pi^{-}]$
... ... ... ... ...
100 83.392300 NN 11114101 $B^{0} \rightarrow K_{S}^{0}[\pi^{+}\pi^{-}]\mu^{+}\mu^{-}$
101 80.734652 NN 11104121 $B^{0} \rightarrow K_{S}^{0}[\pi^{+}\pi^{-}]\pi^{+}\pi^{-}$
102 88.764045 NN 11874042 $B^{0} \rightarrow D^{-}[K^{+}\pi^{-}\pi^{-}]\mu^{+}\nu_{\mu}$
103 89.485373 NN 12103035 $B^{+} \rightarrow \pi^{+}K^{-}K^{+}$
104 89.493562 GB 13144001 $B_{s}^{0} \rightarrow \psi(1S)[\mu^{+}\mu^{-}]\phi[K^{+}K^{-}]$
105 34.873950 GB 27163400 $D^{*+}(2010) \rightarrow D^{0}[K^{-}\pi^{+}\pi^{0}]\pi^{+}$
106 88.465877 GB 11536011 $B^{0} \rightarrow D^{*-}[\pi^{-}\bar{D}^{0}[K^{+}\pi^{-}]\tau^{+}[\pi^{+}\pi^{+}\pi^{-}\bar{\nu}_{\tau}]\nu_{\tau}$
107 94.366032 GB 11296013 $B^{0} \rightarrow D^{+}[K^{-}\pi^{+}\pi^{+}]D^{-}[K^{+}\pi^{-}\pi^{-}]$
108 88.205291 GB 11114001 $B^{0} \rightarrow K^{*}[K^{+}\pi^{-}]\mu^{+}\mu^{-}$
109 92.146720 GB 13264021 $B_{s}^{0} \rightarrow D_{s}^{-}[K^{+}K^{-}\pi^{-}]\pi^{+}$
110 60.085925 GB 21263002 $D^{+} \rightarrow K^{-}K^{+}\pi^{+}$
111 88.424699 GB 12873002 $B^{+} \rightarrow \bar{D}^{0}[K^{+}\pi^{-}]\mu^{+}\nu_{\mu}$
112 88.607528 GB 13246001 $B_{s}^{0} \rightarrow \psi(1S)[\mu^{+}\mu^{-}]K^{+}K^{-}\pi^{+}\pi^{-}$
113 86.632826 GB 13774002 $B_{s}^{0} \rightarrow D_{s}^{-}[K^{+}K^{-}\pi^{-}]\mu^{+}\nu_{\mu}$
114 86.896611 GB 11102003 $B^{0} \rightarrow K^{-}\pi^{+}$
115 91.159542 GB 13112001 $B_{s}^{0} \rightarrow \mu^{+}\mu^{-}$
116 81.561022 GB 13512010 $B_{s}^{0} \rightarrow K^{-}\mu^{+}\nu_{\mu}$
117 79.502774 GB 15512011 $\Lambda_{b}^{0} \rightarrow p^{+}\mu^{-}\bar{\nu}_{\mu}$
118 73.508872 GB 13102412 $B_{s}^{0} \rightarrow K^{-}K^{+}\pi^{0}$
119 28.313253 GB 27265101 $D^{*+}(2010) \rightarrow D^{0}[K_{S}[\pi^{+}\pi^{-}]K^{+}K^{-}]\pi^{+}$
120 68.791686 GB 12103121 $B^{+} \rightarrow K^{+}K_{S}[\pi^{+}\pi^{-}]$
121 89.296515 GB 13144020 $B_{s}^{0} \rightarrow \psi(2S)[\mu^{+}\mu^{-}]\phi[K^{+}K^{-}]$
122 48.966942 GB 27163002 $D^{*+}(2010) \rightarrow D^{0}[K^{+}K^{-}]\pi^{+}$
123 90.088106 GB 15164001 $\Lambda_{b}^{0} \rightarrow \Lambda_{c}^{+}[p^{+}K^{+}\pi^{-}]\pi^{-}$
124 29.363450 GB 27265001 $D^{*+}(2010) \rightarrow D^{0}[K^{+}K^{-}\pi^{+}\pi^{-}]\pi^{+}$
125 90.945645 GB 11874004 $B^{0} \rightarrow D^{*-}[\pi^{-}\bar{D}^{0}[K^{+}\pi^{-}]]\mu^{+}\nu_{\mu}$
126 83.990635 GB 11114101 $B^{0} \rightarrow K_{S}^{0}[\pi^{+}\pi^{-}]\mu^{+}\mu^{-}$
127 81.582327 GB 11104121 $B^{0} \rightarrow K_{S}^{0}[\pi^{+}\pi^{-}]\pi^{+}\pi^{-}$
128 88.031265 GB 11874042 $B^{0} \rightarrow D^{-}[K^{+}\pi^{-}\pi^{-}]\mu^{+}\nu_{\mu}$
129 89.669168 GB 12103035 $B^{+} \rightarrow \pi^{+}K^{-}K^{+}$

130 rows × 4 columns

Rate 50000 (50 kHz)


In [39]:
BarComparePlot(train_modes_eff_5, sortby=('MN', 50000.0)).plot(new_plot=True, figsize=(24, 8), 
                                                             ylabel='efficiency', fontsize=22)
lgd = legend(bbox_to_anchor=(0.5, 1.4), loc='upper center', ncol=2, fontsize=22)
# plt.savefig('track_5.pdf' , format='pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')



In [40]:
pandas.DataFrame(statistic_5)


Out[40]:
50000.0 classifier mode name
0 80.915042 special NN 13144001 $B_{s}^{0} \rightarrow \psi(1S)[\mu^{+}\mu^{-}]\phi[K^{+}K^{-}]$
1 21.008403 special NN 27163400 $D^{*+}(2010) \rightarrow D^{0}[K^{-}\pi^{+}\pi^{0}]\pi^{+}$
2 75.874224 special NN 11536011 $B^{0} \rightarrow D^{*-}[\pi^{-}\bar{D}^{0}[K^{+}\pi^{-}]\tau^{+}[\pi^{+}\pi^{+}\pi^{-}\bar{\nu}_{\tau}]\nu_{\tau}$
3 85.085862 special NN 11296013 $B^{0} \rightarrow D^{+}[K^{-}\pi^{+}\pi^{+}]D^{-}[K^{+}\pi^{-}\pi^{-}]$
4 78.301917 special NN 11114001 $B^{0} \rightarrow K^{*}[K^{+}\pi^{-}]\mu^{+}\mu^{-}$
5 84.520730 special NN 13264021 $B_{s}^{0} \rightarrow D_{s}^{-}[K^{+}K^{-}\pi^{-}]\pi^{+}$
6 43.412439 special NN 21263002 $D^{+} \rightarrow K^{-}K^{+}\pi^{+}$
7 78.540860 special NN 12873002 $B^{+} \rightarrow \bar{D}^{0}[K^{+}\pi^{-}]\mu^{+}\nu_{\mu}$
8 79.252435 special NN 13246001 $B_{s}^{0} \rightarrow \psi(1S)[\mu^{+}\mu^{-}]K^{+}K^{-}\pi^{+}\pi^{-}$
9 74.850023 special NN 13774002 $B_{s}^{0} \rightarrow D_{s}^{-}[K^{+}K^{-}\pi^{-}]\mu^{+}\nu_{\mu}$
10 78.183841 special NN 11102003 $B^{0} \rightarrow K^{-}\pi^{+}$
11 84.738754 special NN 13112001 $B_{s}^{0} \rightarrow \mu^{+}\mu^{-}$
12 72.484706 special NN 13512010 $B_{s}^{0} \rightarrow K^{-}\mu^{+}\nu_{\mu}$
13 69.812203 special NN 15512011 $\Lambda_{b}^{0} \rightarrow p^{+}\mu^{-}\bar{\nu}_{\mu}$
14 62.804808 special NN 13102412 $B_{s}^{0} \rightarrow K^{-}K^{+}\pi^{0}$
15 16.490964 special NN 27265101 $D^{*+}(2010) \rightarrow D^{0}[K_{S}[\pi^{+}\pi^{-}]K^{+}K^{-}]\pi^{+}$
16 59.950365 special NN 12103121 $B^{+} \rightarrow K^{+}K_{S}[\pi^{+}\pi^{-}]$
17 80.204254 special NN 13144020 $B_{s}^{0} \rightarrow \psi(2S)[\mu^{+}\mu^{-}]\phi[K^{+}K^{-}]$
18 36.157025 special NN 27163002 $D^{*+}(2010) \rightarrow D^{0}[K^{+}K^{-}]\pi^{+}$
19 81.993392 special NN 15164001 $\Lambda_{b}^{0} \rightarrow \Lambda_{c}^{+}[p^{+}K^{+}\pi^{-}]\pi^{-}$
20 18.685832 special NN 27265001 $D^{*+}(2010) \rightarrow D^{0}[K^{+}K^{-}\pi^{+}\pi^{-}]\pi^{+}$
21 82.214404 special NN 11874004 $B^{0} \rightarrow D^{*-}[\pi^{-}\bar{D}^{0}[K^{+}\pi^{-}]]\mu^{+}\nu_{\mu}$
22 74.713840 special NN 11114101 $B^{0} \rightarrow K_{S}^{0}[\pi^{+}\pi^{-}]\mu^{+}\mu^{-}$
23 71.281788 special NN 11104121 $B^{0} \rightarrow K_{S}^{0}[\pi^{+}\pi^{-}]\pi^{+}\pi^{-}$
24 77.088422 special NN 11874042 $B^{0} \rightarrow D^{-}[K^{+}\pi^{-}\pi^{-}]\mu^{+}\nu_{\mu}$
25 82.041660 special NN 12103035 $B^{+} \rightarrow \pi^{+}K^{-}K^{+}$
26 81.019351 MN 13144001 $B_{s}^{0} \rightarrow \psi(1S)[\mu^{+}\mu^{-}]\phi[K^{+}K^{-}]$
27 21.008403 MN 27163400 $D^{*+}(2010) \rightarrow D^{0}[K^{-}\pi^{+}\pi^{0}]\pi^{+}$
28 75.888325 MN 11536011 $B^{0} \rightarrow D^{*-}[\pi^{-}\bar{D}^{0}[K^{+}\pi^{-}]\tau^{+}[\pi^{+}\pi^{+}\pi^{-}\bar{\nu}_{\tau}]\nu_{\tau}$
29 85.227005 MN 11296013 $B^{0} \rightarrow D^{+}[K^{-}\pi^{+}\pi^{+}]D^{-}[K^{+}\pi^{-}\pi^{-}]$
... ... ... ... ...
100 73.761707 NN 11114101 $B^{0} \rightarrow K_{S}^{0}[\pi^{+}\pi^{-}]\mu^{+}\mu^{-}$
101 71.350287 NN 11104121 $B^{0} \rightarrow K_{S}^{0}[\pi^{+}\pi^{-}]\pi^{+}\pi^{-}$
102 77.576942 NN 11874042 $B^{0} \rightarrow D^{-}[K^{+}\pi^{-}\pi^{-}]\mu^{+}\nu_{\mu}$
103 82.485832 NN 12103035 $B^{+} \rightarrow \pi^{+}K^{-}K^{+}$
104 81.220775 GB 13144001 $B_{s}^{0} \rightarrow \psi(1S)[\mu^{+}\mu^{-}]\phi[K^{+}K^{-}]$
105 21.848739 GB 27163400 $D^{*+}(2010) \rightarrow D^{0}[K^{-}\pi^{+}\pi^{0}]\pi^{+}$
106 75.902425 GB 11536011 $B^{0} \rightarrow D^{*-}[\pi^{-}\bar{D}^{0}[K^{+}\pi^{-}]\tau^{+}[\pi^{+}\pi^{+}\pi^{-}\bar{\nu}_{\tau}]\nu_{\tau}$
107 85.415196 GB 11296013 $B^{0} \rightarrow D^{+}[K^{-}\pi^{+}\pi^{+}]D^{-}[K^{+}\pi^{-}\pi^{-}]$
108 78.796135 GB 11114001 $B^{0} \rightarrow K^{*}[K^{+}\pi^{-}]\mu^{+}\mu^{-}$
109 85.610158 GB 13264021 $B_{s}^{0} \rightarrow D_{s}^{-}[K^{+}K^{-}\pi^{-}]\pi^{+}$
110 42.573650 GB 21263002 $D^{+} \rightarrow K^{-}K^{+}\pi^{+}$
111 78.948441 GB 12873002 $B^{+} \rightarrow \bar{D}^{0}[K^{+}\pi^{-}]\mu^{+}\nu_{\mu}$
112 79.763096 GB 13246001 $B_{s}^{0} \rightarrow \psi(1S)[\mu^{+}\mu^{-}]K^{+}K^{-}\pi^{+}\pi^{-}$
113 75.326873 GB 13774002 $B_{s}^{0} \rightarrow D_{s}^{-}[K^{+}K^{-}\pi^{-}]\mu^{+}\nu_{\mu}$
114 79.476207 GB 11102003 $B^{0} \rightarrow K^{-}\pi^{+}$
115 85.616094 GB 13112001 $B_{s}^{0} \rightarrow \mu^{+}\mu^{-}$
116 73.174541 GB 13512010 $B_{s}^{0} \rightarrow K^{-}\mu^{+}\nu_{\mu}$
117 70.767894 GB 15512011 $\Lambda_{b}^{0} \rightarrow p^{+}\mu^{-}\bar{\nu}_{\mu}$
118 63.961076 GB 13102412 $B_{s}^{0} \rightarrow K^{-}K^{+}\pi^{0}$
119 16.039157 GB 27265101 $D^{*+}(2010) \rightarrow D^{0}[K_{S}[\pi^{+}\pi^{-}]K^{+}K^{-}]\pi^{+}$
120 61.695362 GB 12103121 $B^{+} \rightarrow K^{+}K_{S}[\pi^{+}\pi^{-}]$
121 80.893362 GB 13144020 $B_{s}^{0} \rightarrow \psi(2S)[\mu^{+}\mu^{-}]\phi[K^{+}K^{-}]$
122 35.950413 GB 27163002 $D^{*+}(2010) \rightarrow D^{0}[K^{+}K^{-}]\pi^{+}$
123 82.626652 GB 15164001 $\Lambda_{b}^{0} \rightarrow \Lambda_{c}^{+}[p^{+}K^{+}\pi^{-}]\pi^{-}$
124 16.632444 GB 27265001 $D^{*+}(2010) \rightarrow D^{0}[K^{+}K^{-}\pi^{+}\pi^{-}]\pi^{+}$
125 82.630861 GB 11874004 $B^{0} \rightarrow D^{*-}[\pi^{-}\bar{D}^{0}[K^{+}\pi^{-}]]\mu^{+}\nu_{\mu}$
126 75.629553 GB 11114101 $B^{0} \rightarrow K_{S}^{0}[\pi^{+}\pi^{-}]\mu^{+}\mu^{-}$
127 72.677455 GB 11104121 $B^{0} \rightarrow K_{S}^{0}[\pi^{+}\pi^{-}]\pi^{+}\pi^{-}$
128 77.821202 GB 11874042 $B^{0} \rightarrow D^{-}[K^{+}\pi^{-}\pi^{-}]\mu^{+}\nu_{\mu}$
129 83.228672 GB 12103035 $B^{+} \rightarrow \pi^{+}K^{-}K^{+}$

130 rows × 4 columns

Classification report using SVRs


In [31]:
from rep.data import LabeledDataStorage
from rep.report import ClassificationReport
lds = LabeledDataStorage(test, test['signal'])
report = ClassificationReport(models, lds)

ROC curve


In [32]:
report.roc()


Out[32]:

Classification report using events


In [33]:
plots = dict()
for key, value in models.items():
    plots[key] = plot_roc_events(value, ds_test_signal, ds_test_bck, key)


special NN AUC: 0.891304567496
MN AUC: 0.893936670902
logistic regression AUC: 0.813469580865
NN AUC: 0.891370394584
GB AUC: 0.893988369721
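plot_roc_events comes from additional.ipynb and evidently builds an event-level ROC: one score per event instead of per track. A sketch using the same maximum-over-tracks convention; the exact return format expected by FunctionsPlot below is an assumption:

    def plot_roc_events_sketch(model, sig, bck, label):
        # per-event scores: maximum track probability within each event
        sig_prob = pandas.Series(model.predict_proba(sig)[:, 1]).groupby(sig['unique'].values).max().values
        bck_prob = pandas.Series(model.predict_proba(bck)[:, 1]).groupby(bck['unique'].values).max().values
        print label, 'AUC:', roc_auc_score(
            numpy.concatenate([numpy.ones(len(sig_prob)), numpy.zeros(len(bck_prob))]),
            numpy.concatenate([sig_prob, bck_prob]))
        # background pass fraction (x) vs signal efficiency (y) along the cut scan
        cuts = numpy.sort(bck_prob)
        fpr = 1. - numpy.arange(len(cuts), dtype=float) / len(cuts)
        tpr = 1. - numpy.searchsorted(numpy.sort(sig_prob), cuts, side='right') / float(len(sig_prob))
        return fpr, tpr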

In [34]:
from rep.plotting import FunctionsPlot
FunctionsPlot(plots).plot(new_plot=True, xlim=(0, 0.2), ylim=(0.5, 0.95))
plot([1. * events_pass[100000.] / statistic_length(ds_test_bck)['Events']] * 2, [0., 1], 'g--', label='rate: 100 kHz')
plot([1. * events_pass[50000.] / statistic_length(ds_test_bck)['Events']] * 2, [0., 1], 'b--', label='rate: 50 kHz')
lgd = legend(loc='best', fontsize=16)



In [ ]: