In [1]:
%run additional.ipynb

In [2]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['empty']
`%matplotlib` prevents importing * from pylab and numpy

In [3]:
pandas.set_option('display.max_colwidth', 120)

In [103]:
PROFILE = 'ssh-ipy'

HLT2 nbody classification

did preselections:

  • any sv.n,
  • any sv.minpt
  • sv.nlt16 < 2

Training channels (read data)

We will use just 11114001, 11296013, 11874042, 12103035, 13246001, 13264021


In [4]:
sig_train_modes_names = [11114001, 11296013, 11874042, 12103035, 13246001, 13264021]
bck_train_mode_name = 30000000
sig_train_files = ['mod_{}.csv'.format(name) for name in sig_train_modes_names]
bck_train_files = 'mod_30000000.csv'
folder = "datasets/prepared_hlt_body/"

In [5]:
# concat all signal data
if not os.path.exists(folder + 'signal_hlt2.csv'):
    concat_files(folder, sig_train_files, os.path.join(folder , 'signal_hlt2.csv'))

In [6]:
signal_data = pandas.read_csv(os.path.join(folder , 'signal_hlt2.csv'), sep='\t')
bck_data = pandas.read_csv(os.path.join(folder , bck_train_files), sep='\t')

In [7]:
signal_data.columns


Out[7]:
Index([u'unique', u'mode', u'event_number', u'sv_number', u'pass_2body', u'pass_nbody', u'signal', u'sumpt', u'm', u'mcor', u'ipchi2', u'chi2', u'sumipchi2', u'fdr', u'nlt16', u'minpt', u'eta', u'pt', u'nmu', u'n', u'fdchi2', u'maxtchi2', u'ngood', u'nmu1', u'mupt', u'n1trk', u'sig', u'idx'], dtype='object')

Counting events and svrs,

that passed L0 and GoodGenB preselection (this data was generated by skim)


In [8]:
print 'Signal', statistic_length(signal_data)
print 'Bck', statistic_length(bck_data)


Signal {'SVR': 12367085, 'Events': 166059}
Bck {'SVR': 408147, 'Events': 38035}

In [9]:
total_bck_events = statistic_length(bck_data)['Events'] + empty_events[bck_train_mode_name]
total_signal_events_by_mode = dict()
for mode in sig_train_modes_names:
    total_signal_events_by_mode[mode] = statistic_length(signal_data[signal_data['mode'] == mode])['Events'] + empty_events[mode]

events distribution by mode


In [10]:
print 'Bck:', total_bck_events
'Signal:', total_signal_events_by_mode


Bck: 111306
Out[10]:
('Signal:',
 {11114001: 61102,
  11296013: 17149,
  11874042: 3942,
  12103035: 25250,
  13246001: 37313,
  13264021: 25165})

Define variables


In [11]:
variables = ["n", "mcor", "chi2", "eta", "fdchi2", "minpt", "nlt16", "ipchi2", "n1trk", "sumpt"]

Counting events and svrs,

which passed pass_nbody (equivalent Mike's preselections for nbody selection)


In [12]:
# hlt2 nbody selection
signal_data = signal_data[(signal_data['pass_nbody'] == 1) & (signal_data['mcor'] <= 10e3)]
bck_data = bck_data[(bck_data['pass_nbody'] == 1) & (bck_data['mcor'] <= 10e3)]

In [13]:
print 'Signal', statistic_length(signal_data)
print 'Bck', statistic_length(bck_data)


Signal {'SVR': 960600, 'Events': 150566}
Bck {'SVR': 21868, 'Events': 9664}

In [14]:
total_signal_events_by_mode_presel = dict()
for mode in sig_train_modes_names:
    total_signal_events_by_mode_presel[mode] = statistic_length(signal_data[signal_data['mode'] == mode])['Events']
total_bck_events_presel = statistic_length(bck_data)['Events']

events distribution by mode


In [15]:
print 'Bck:', total_bck_events_presel
'Signal:', total_signal_events_by_mode_presel


Bck: 9664
Out[15]:
('Signal:',
 {11114001: 53105,
  11296013: 16256,
  11874042: 3290,
  12103035: 21498,
  13246001: 33807,
  13264021: 22610})

In [16]:
signal_data.head()


Out[16]:
unique mode event_number sv_number pass_2body pass_nbody signal sumpt m mcor ... nmu n fdchi2 maxtchi2 ngood nmu1 mupt n1trk sig idx
1 11114001_0 11114001 0 1 1 1 1 6912.10 2806.110 4168.77 ... 1 2 9653.910 1.20832 2 1 2972.750 2 1 1
2 11114001_0 11114001 0 2 1 1 1 3731.68 728.895 4429.24 ... 2 2 7424.620 1.58733 2 2 2972.750 1 1 2
3 11114001_0 11114001 0 3 1 1 1 4698.27 1249.280 2144.50 ... 1 2 1382.720 1.58733 2 1 758.924 1 1 3
5 11114001_1 11114001 1 1 1 1 1 4003.36 3530.930 4651.29 ... 1 2 1102.850 1.20808 2 1 2611.970 2 1 1
9 11114001_1 11114001 1 5 1 1 1 2145.67 1520.500 2266.15 ... 1 2 868.049 1.35907 2 1 1137.010 2 1 5

5 rows × 28 columns

Prepare train/test splitting

Divide events which passed alll preselections into two equal parts randomly


In [17]:
ds_train_signal, ds_train_bck, ds_test_signal, ds_test_bck = prepare_data(signal_data, bck_data, 'unique')

train: counting events and svrs


In [18]:
print 'Signal', statistic_length(ds_train_signal)
print 'Bck', statistic_length(ds_train_bck)


Signal {'SVR': 479851, 'Events': 75283}
Bck {'SVR': 10759, 'Events': 4832}

In [19]:
train = pandas.concat([ds_train_bck, ds_train_signal])

test: counting events and svrs


In [20]:
print 'Signal', statistic_length(ds_test_signal)
print 'Bck', statistic_length(ds_test_bck)


Signal {'SVR': 480749, 'Events': 75283}
Bck {'SVR': 11109, 'Events': 4832}

In [21]:
test = pandas.concat([ds_test_bck, ds_test_signal])

Define all total events in test samples

(which passed just l0 and goodgenB) using also empty events. Suppose that events which didn't pass pass_nboby also were equal randomly divided into training and test samples


In [22]:
total_test_bck_events = (total_bck_events - total_bck_events_presel) // 2 + statistic_length(ds_test_bck)['Events']
total_test_signal_events = dict()
for mode in sig_train_modes_names:
    total_not_passed_signal = total_signal_events_by_mode[mode] - total_signal_events_by_mode_presel[mode]
    total_test_signal_events[mode] = total_not_passed_signal // 2 + \
        statistic_length(ds_test_signal[ds_test_signal['mode'] == mode])['Events']

In [23]:
print 'Bck total test events:', total_test_bck_events
'Signal total test events:', total_test_signal_events


Bck total test events: 55653
Out[23]:
('Signal total test events:',
 {11114001: 30533,
  11296013: 8630,
  11874042: 1950,
  12103035: 12682,
  13246001: 18627,
  13264021: 12537})

In [24]:
import cPickle
if os.path.exists('models/prunned.pkl'):
    with open('models/prunned.pkl', 'r') as file_pr:
        estimators = cPickle.load(file_pr)

Matrixnet training


In [27]:
from rep_ef.estimators import MatrixNetSkyGridClassifier

Base model with 5000 trees


In [28]:
ef_base = MatrixNetSkyGridClassifier(train_features=variables, user_name='antares',
                                     connection='skygrid',
                                     iterations=5000, sync=False)
ef_base.fit(train, train['signal'])


Out[28]:
MatrixNetSkyGridClassifier(auto_stop=None, baseline_feature=None,
              command_line_params=None, connection='skygrid',
              dump_filename=None, features_sample_rate_per_iteration=1.0,
              intervals=64, iterations=5000, max_features_per_iteration=6,
              regularization=0.01, sync=False,
              train_features=['n', 'mcor', 'chi2', 'eta', 'fdchi2', 'minpt', 'nlt16', 'ipchi2', 'n1trk', 'sumpt'],
              training_fraction=0.5, user_name='antares')

Base BBDT model


In [29]:
special_b = {
'n': [2.5, 3.5],
'mcor': [2000,3000,4000,5000,7500],  # I want to remove splits too close the the B mass as I was looking in simulation and this could distort the mass peak (possibly)
'chi2': [1,2.5,5,7.5,10,100],  # I also propose we add a cut to the pre-selection of chi2 < 1000.  I don't want to put in splits at too small values here b/c these type of inputs are never modeled quite right in the simulation (they always look a bit more smeared in data).
'sumpt': [3000,4000,5000,6000,7500,9000,12e3,23e3,50e3],  # I am happy with the MN splits here (these are almost "as is" from modify-6)
'eta': [2.5,3,3.75,4.25,4.5], # Close to MN.  
'fdchi2': [33,125,350,780,1800,5000,10000],  # I want to make the biggest split 10e3 because in the simulated events there is pretty much only BKGD above 40e3 but we don't want the BDT to learn to kill these as new particles would live here.   Otherwise I took the MN splits and modified the first one (the first one is 5sigma now).
'minpt': [350,500,750,1500,3000,5000],   # let's make 500 the 2nd split so that this lines up with the HLT1 SVs.
'nlt16': [0.5],
'ipchi2': [8,26,62,150,500,1000],  # I also propose we add a cut of IP chi2 < 5000 as it's all background out there.  
'n1trk': [0.5, 1.5, 2.5, 3.5]
}

ef_base_bbdt = MatrixNetSkyGridClassifier(train_features=variables, user_name='antares',
                                          connection='skygrid',
                                          iterations=5000, sync=False, intervals=special_b)
ef_base_bbdt.fit(train, train['signal'])


Out[29]:
MatrixNetSkyGridClassifier(auto_stop=None, baseline_feature=None,
              command_line_params=None, connection='skygrid',
              dump_filename=None, features_sample_rate_per_iteration=1.0,
              intervals={'sumpt': [3000, 4000, 5000, 6000, 7500, 9000, 12000.0, 23000.0, 50000.0], 'eta': [2.5, 3, 3.75, 4.25, 4.5], 'minpt': [350, 500, 750, 1500, 3000, 5000], 'chi2': [1, 2.5, 5, 7.5, 10, 100], 'n1trk': [0.5, 1.5, 2.5, 3.5], 'nlt16': [0.5], 'mcor': [2000, 3000, 4000, 5000, 7500], 'fdchi2': [33, 125, 350, 780, 1800, 5000, 10000], 'ipchi2': [8, 26, 62, 150, 500, 1000], 'n': [2.5, 3.5]},
              iterations=5000, max_features_per_iteration=6,
              regularization=0.01, sync=False,
              train_features=['n', 'mcor', 'chi2', 'eta', 'fdchi2', 'minpt', 'nlt16', 'ipchi2', 'n1trk', 'sumpt'],
              training_fraction=0.5, user_name='antares')

BBDT-5, 6


In [30]:
ef_base_bbdt5 = MatrixNetSkyGridClassifier(train_features=variables, user_name='antares',
                                          connection='skygrid',
                                          iterations=5000, sync=False, intervals=5)
ef_base_bbdt5.fit(train, train['signal'])

ef_base_bbdt6 = MatrixNetSkyGridClassifier(train_features=variables, user_name='antares',
                                          connection='skygrid',
                                          iterations=5000, sync=False, intervals=6)
ef_base_bbdt6.fit(train, train['signal'])


Out[30]:
MatrixNetSkyGridClassifier(auto_stop=None, baseline_feature=None,
              command_line_params=None, connection='skygrid',
              dump_filename=None, features_sample_rate_per_iteration=1.0,
              intervals=6, iterations=5000, max_features_per_iteration=6,
              regularization=0.01, sync=False,
              train_features=['n', 'mcor', 'chi2', 'eta', 'fdchi2', 'minpt', 'nlt16', 'ipchi2', 'n1trk', 'sumpt'],
              training_fraction=0.5, user_name='antares')

Pruning


In [31]:
from rep.data import LabeledDataStorage
from rep.report import ClassificationReport
report = ClassificationReport({'base': ef_base}, LabeledDataStorage(test, test['signal']))

In [32]:
report.roc()


Out[32]:

Minimize log_loss

он же BinomialDeviance


In [33]:
%run pruning.py

Training sample is cut to be aliquot 8


In [34]:
new_trainlen = (len(train) // 8) * 8
trainX = train[ef_base.features][:new_trainlen].values
trainY = train['signal'][:new_trainlen].values 
trainW = numpy.ones(len(trainY))
trainW[trainY == 0] *= sum(trainY) / sum(1 - trainY)

In [35]:
new_features, new_formula_mx, new_classifier = select_trees(trainX, trainY, sample_weight=trainW,
                                                            initial_classifier=ef_base,
                                                            iterations=100, n_candidates=100, 
                                                            learning_rate=0.1, regularization=50.)


0 1.61639046851 0.864149179375
1 1.55188342817 0.871006756536
2 1.4974419247 0.871931415195
3 1.45063957562 0.871257415717
4 1.40839869622 0.882903045944
5 1.37366046126 0.88787704316
6 1.34072767041 0.887003202722
7 1.31150104365 0.893023018467
8 1.28379095058 0.896263184453
9 1.25809114034 0.898190409332
10 1.23593045848 0.89788161298
11 1.21672628163 0.899391156041
12 1.19916894981 0.901041763496
13 1.18285986055 0.900870790499
14 1.16821214312 0.901828769835
15 1.15477734313 0.902456983346
16 1.14213029917 0.903531415611
17 1.12974589809 0.904730333043
18 1.11902162756 0.905431320555
19 1.10922459706 0.906217749773
20 1.09967334445 0.907074639763
21 1.09128157517 0.908045647467
22 1.08329445479 0.908310608384
23 1.07526925282 0.909011025167
24 1.06802464375 0.909790861995
25 1.06178926981 0.910268591808
26 1.05538189711 0.910926982288
27 1.04974682021 0.911391242005
28 1.04431160909 0.911875445471
29 1.03922534297 0.912452518835
30 1.03436250836 0.912689445171
31 1.02963139549 0.913149040851
32 1.02534119856 0.913312034488
33 1.02116665067 0.913531434514
34 1.01741194297 0.913966424249
35 1.01364996586 0.914297519611
36 1.01003652599 0.91472001994
37 1.00665776596 0.915135662804
38 1.00334874405 0.915348876042
39 1.00006517906 0.915804553904
40 0.997073869799 0.916169904628
41 0.994289821897 0.916483854005
42 0.991591343735 0.916773954987
43 0.988827887912 0.917221982097
44 0.986271418313 0.91752158054
45 0.983893571884 0.917694392983
46 0.981632392586 0.91795698595
47 0.979477890859 0.91804893046
48 0.977394041489 0.918156519221
49 0.975265168264 0.918412660519
50 0.973352807959 0.918617887619
51 0.971251012274 0.91875493202
52 0.969354531674 0.918978874537
53 0.967553121642 0.919198296838
54 0.965804421623 0.919480419235
55 0.96409568683 0.919733387772
56 0.9623992872 0.920035855162
57 0.960784529657 0.920191115823
58 0.959280770215 0.920367844727
59 0.957787245789 0.920583069027
60 0.956201299682 0.920745173206
61 0.954705664307 0.920993930474
62 0.953308382945 0.921239222111
63 0.951929776196 0.921432167157
64 0.950565745052 0.921595320013
65 0.949272821333 0.92172460219
66 0.947943617271 0.921915534431
67 0.94672827271 0.922104992055
68 0.945491761711 0.922277753458
69 0.944280050248 0.922392816125
70 0.943171510591 0.922532947673
71 0.942075376538 0.922717989193
72 0.94103088427 0.922812923227
73 0.939868657246 0.923043876713
74 0.938893696144 0.923132846034
75 0.937832755491 0.923224778438
76 0.936780426641 0.923351691601
77 0.935799722128 0.923507977501
78 0.934856256431 0.923549669289
79 0.933859349167 0.923711498903
80 0.932928206017 0.923847081665
81 0.932001324719 0.924036900534
82 0.931088570411 0.924177670024
83 0.930261124209 0.924254283008
84 0.929307015668 0.924424364707
85 0.928375147698 0.924595685293
86 0.92745132652 0.92472634456
87 0.926564151302 0.924850853359
88 0.925662013781 0.925017828153
89 0.924834935539 0.925096802694
90 0.923977048654 0.925209568596
91 0.92315137584 0.92531444453
92 0.92236916542 0.925436261227
93 0.921630076264 0.925546750023
94 0.920800498935 0.925651854423
95 0.920078219071 0.925734530712
96 0.919332405527 0.925880794329
97 0.918620721909 0.925988038696
98 0.917779386901 0.926118577388
99 0.917027666116 0.92621668273

In [36]:
prunned = cPickle.loads(cPickle.dumps(ef_base))
prunned.formula_mx = new_formula_mx

In [100]:
def mode_scheme_fit(train, base, suf, model_file):
    blending_parts = OrderedDict()
    for n_ch, ch in enumerate(sig_train_modes_names):
        temp = FoldingClassifier(base_estimator=base, random_state=11, features=variables, ipc_profile=PROFILE)
        temp_data = train[(train['mode'] == ch) | (train['mode'] == bck_train_mode_name)]
        temp.fit(temp_data, temp_data['signal'])
        blending_parts['ch' + str(n_ch) + suf] = temp
    import cPickle
    with open(model_file, 'w') as f:
        cPickle.dump(blending_parts, f)

def mode_scheme_predict(data, suf, model_file, mode='train'):
    with open(model_file, 'r') as f:
        blending_parts = cPickle.load(f)
    for n_ch, ch in enumerate(sig_train_modes_names):
        temp_name = 'ch' + str(n_ch) + suf
        if mode == 'train':
            temp_key = ((data['mode'] == ch) | (data['mode'] == bck_train_mode_name))
            data.ix[temp_key, temp_name] = blending_parts[temp_name].predict_proba(
                data[temp_key])[:, 1]
            data.ix[~temp_key, temp_name] = blending_parts[temp_name].predict_proba(
                data[~temp_key])[:, 1]
        else:
            data[temp_name] = blending_parts[temp_name].predict_proba(data)[:, 1]

In [101]:
def get_best_svr_by_channel(data, feature_mask, count=1):
    add_events = []
    for id_est, channel in enumerate(sig_train_modes_names):
        train_part = data[(data['mode'] == channel)]
        for num, group in train_part.groupby('unique'):
            index = numpy.argsort(group[feature_mask.format(id_est)].values)[::-1]
            add_events.append(group.iloc[index[:count], :])
    good_events = pandas.concat([data[(data['mode'] == bck_train_mode_name)]] + add_events)
    print len(good_events)
    return good_events

In [104]:
from sklearn.ensemble import RandomForestClassifier
from rep.metaml import FoldingClassifier
base = RandomForestClassifier(n_estimators=500, min_samples_leaf=50, max_depth=6,
                              max_features=7, n_jobs=8)

mode_scheme_fit(train, base, '', 'forest_trick.pkl')
mode_scheme_predict(train, '', 'forest_trick.pkl')
mode_scheme_predict(test, '', 'forest_trick.pkl', mode='test')


KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column

In [107]:
good_events = get_best_svr_by_channel(train, 'ch{}', 2)
forest_mn = MatrixNetSkyGridClassifier(train_features=variables,
                                     user_name='antares',
                                     connection='skygrid',
                                     iterations=5000, sync=False)
forest_mn.fit(good_events, good_events['signal'])


148864
Out[107]:
MatrixNetSkyGridClassifier(auto_stop=None, baseline_feature=None,
              command_line_params=None, connection='skygrid',
              dump_filename=None, features_sample_rate_per_iteration=1.0,
              intervals=64, iterations=5000, max_features_per_iteration=6,
              regularization=0.01, sync=False,
              train_features=['n', 'mcor', 'chi2', 'eta', 'fdchi2', 'minpt', 'nlt16', 'ipchi2', 'n1trk', 'sumpt'],
              training_fraction=0.5, user_name='antares')

In [112]:
forest_mn_bbdt = MatrixNetSkyGridClassifier(train_features=variables,
                                            user_name='antares',
                                             connection='skygrid',
                                             iterations=5000, sync=False, intervals=special_b)
forest_mn_bbdt.fit(good_events, good_events['signal'])


Out[112]:
MatrixNetSkyGridClassifier(auto_stop=None, baseline_feature=None,
              command_line_params=None, connection='skygrid',
              dump_filename=None, features_sample_rate_per_iteration=1.0,
              intervals={'sumpt': [3000, 4000, 5000, 6000, 7500, 9000, 12000.0, 23000.0, 50000.0], 'eta': [2.5, 3, 3.75, 4.25, 4.5], 'minpt': [350, 500, 750, 1500, 3000, 5000], 'chi2': [1, 2.5, 5, 7.5, 10, 100], 'n1trk': [0.5, 1.5, 2.5, 3.5], 'nlt16': [0.5], 'mcor': [2000, 3000, 4000, 5000, 7500], 'fdchi2': [33, 125, 350, 780, 1800, 5000, 10000], 'ipchi2': [8, 26, 62, 150, 500, 1000], 'n': [2.5, 3.5]},
              iterations=5000, max_features_per_iteration=6,
              regularization=0.01, sync=False,
              train_features=['n', 'mcor', 'chi2', 'eta', 'fdchi2', 'minpt', 'nlt16', 'ipchi2', 'n1trk', 'sumpt'],
              training_fraction=0.5, user_name='antares')

In [116]:
new_trainlen = (len(good_events) // 8) * 8
trainX = good_events[forest_mn.features][:new_trainlen].values
trainY = good_events['signal'][:new_trainlen].values 
trainW = numpy.ones(len(trainY))
trainW[trainY == 0] *= sum(trainY) / sum(1 - trainY)

In [143]:
len(train), len(good_events)


Out[143]:
(490610, 148864)

In [117]:
new_features_f, new_formula_mx_f, new_classifier_f = select_trees(trainX, trainY, sample_weight=trainW,
                                                                initial_classifier=forest_mn,
                                                                iterations=100, n_candidates=100, 
                                                                learning_rate=0.1, regularization=50.)


0 1.59878247354 0.898267058314
1 1.51376874775 0.923562775721
2 1.44079443049 0.924933289748
3 1.3805166538 0.926697044996
4 1.32495882539 0.927486156871
5 1.27496922824 0.929864337647
6 1.23062960925 0.933785251222
7 1.19300866229 0.933685420934
8 1.15761858181 0.934627903387
9 1.12736800412 0.935328811146
10 1.0991202171 0.935250788258
11 1.07445225558 0.936354042668
12 1.05126925344 0.937850097817
13 1.03058754558 0.938862143141
14 1.011693557 0.939935307133
15 0.994375772612 0.939972899881
16 0.978284559854 0.940253851797
17 0.963552829217 0.94151909765
18 0.95009519394 0.942402605293
19 0.937824257337 0.943313473644
20 0.925258705091 0.943963868293
21 0.914464860855 0.944279791601
22 0.904148147213 0.94469312103
23 0.894715951762 0.945032701158
24 0.885681679521 0.945655614969
25 0.877518873326 0.946299568954
26 0.869470989835 0.947021607744
27 0.862325731184 0.947294104017
28 0.85581607304 0.947788591194
29 0.849038920939 0.948225310934
30 0.842894270099 0.948501071958
31 0.837094979986 0.948995302384
32 0.831854245127 0.949197773096
33 0.826866939463 0.949624221087
34 0.822073866616 0.950014177368
35 0.817624153846 0.950377257506
36 0.813239161583 0.950787564131
37 0.809029532184 0.951188790564
38 0.805019091745 0.951439237491
39 0.801188836807 0.951767397723
40 0.797425227379 0.95204708708
41 0.793877353362 0.952296314858
42 0.790359499498 0.952553469632
43 0.787111040846 0.952867135342
44 0.784038253519 0.953166642023
45 0.7810720125 0.953454269145
46 0.778247009403 0.953694128011
47 0.77554796413 0.953835621722
48 0.772927761308 0.953910043357
49 0.770445182331 0.954081491538
50 0.768067426525 0.954254790148
51 0.76568604058 0.954442982373
52 0.763463801792 0.954526193461
53 0.761344647854 0.954709426307
54 0.759307040909 0.954886963171
55 0.757037477502 0.955075411474
56 0.755142163347 0.955209565722
57 0.753288753573 0.955342839342
58 0.751395688236 0.955431219786
59 0.749595943251 0.955640305471
60 0.747779605378 0.955797967132
61 0.746098369404 0.955874234484
62 0.744367946735 0.956023756816
63 0.742807168465 0.956142489476
64 0.741153084045 0.956320105754
65 0.739642832294 0.956437582587
66 0.738204947827 0.956526529701
67 0.736742814892 0.956671447665
68 0.735430133919 0.956724998722
69 0.73409406536 0.956883355598
70 0.732739448365 0.957025961114
71 0.731516026479 0.957146624292
72 0.730234646269 0.95725046031
73 0.728962072682 0.957391345623
74 0.727785085272 0.957488075375
75 0.726656382863 0.957576798379
76 0.725556843544 0.957672831233
77 0.724400637666 0.957772615421
78 0.723314294513 0.957877710969
79 0.722304240869 0.957955053784
80 0.721265254603 0.958010397728
81 0.720290057549 0.958100522603
82 0.719287866499 0.958197303166
83 0.718327446866 0.95828584715
84 0.717248826746 0.95837316761
85 0.716359696296 0.958473355265
86 0.715416786885 0.958574747263
87 0.714477785961 0.958671261316
88 0.713549252037 0.95874033121
89 0.712539082925 0.958824843891
90 0.711706540491 0.958912600795
91 0.710874318757 0.958997675435
92 0.71009676271 0.959052239702
93 0.709309638511 0.959126766662
94 0.708524743109 0.959228390847
95 0.707789412854 0.95931586206
96 0.707022757936 0.959388495519
97 0.706333697516 0.959431312136
98 0.705551481107 0.959490530574
99 0.704832677023 0.959557165533

In [118]:
prunned_f = cPickle.loads(cPickle.dumps(forest_mn))
prunned_f.formula_mx = new_formula_mx_f

In [120]:
estimators = {'base MN': ef_base, 'BBDT MN-6': ef_base_bbdt6, 'BBDT MN-5': ef_base_bbdt5,
              'BBDT MN special': ef_base_bbdt,
              'Prunned MN': prunned, 'base MN + forest': forest_mn,
              'BBDT MN special + forest': forest_mn_bbdt, 'Prunned MN + forest': prunned_f}

In [121]:
import cPickle
with open('models/prunned.pkl', 'w') as file_pr:
    cPickle.dump(estimators, file_pr)

Calculate thresholds on classifiers


In [122]:
thresholds = dict()
test_bck = test[test['signal'] == 0]
RATE = [2500., 4000.]
events_pass = dict()
for name, cl in estimators.items():
    prob = cl.predict_proba(test_bck)
    thr, result = calculate_thresholds(test_bck, prob, total_test_bck_events, rates=RATE)
    for rate, val in result.items():
        events_pass['{}-{}'.format(rate, name)] = val[1]
    thresholds[name] = thr
    print name, result


BBDT MN-6 {4000.0: (0.99402680780929342, 222, 0.00398900328823244), 2500.0: (0.99595672446060457, 139, 0.002497619175965357)}
BBDT MN-5 {4000.0: (0.99425757263704562, 222, 0.00398900328823244), 2500.0: (0.99628784046073426, 139, 0.002497619175965357)}
Prunned MN {4000.0: (0.76591901044032684, 222, 0.00398900328823244), 2500.0: (0.81966655622937423, 139, 0.002497619175965357)}
BBDT MN special + forest {4000.0: (0.9804137431800608, 222, 0.00398900328823244), 2500.0: (0.98829762728883797, 139, 0.002497619175965357)}
Prunned MN + forest {4000.0: (0.7291955536845629, 222, 0.00398900328823244), 2500.0: (0.81979088052929106, 139, 0.002497619175965357)}
BBDT MN special {4000.0: (0.99441421391844975, 222, 0.00398900328823244), 2500.0: (0.99615994806063501, 139, 0.002497619175965357)}
base MN + forest {4000.0: (0.97825892637449507, 222, 0.00398900328823244), 2500.0: (0.98666907201136622, 139, 0.002497619175965357)}
base MN {4000.0: (0.99410103269102013, 222, 0.00398900328823244), 2500.0: (0.99610789680421596, 139, 0.002497619175965357)}

Final efficiencies for each mode


In [123]:
train_modes_eff, statistic = result_statistic(estimators, sig_train_modes_names, 
                                              test[test['signal'] == 1],
                                              thresholds, RATE, total_test_signal_events)

In [153]:
from rep.plotting import BarComparePlot
xticks_labels = ['$B^0 \\to K^*\mu^+\mu^-$', "$B^0 \\to D^+D^-$", "$B^0 \\to D^- \mu^+ \\nu_{\mu}$", 
                 '$B^+ \\to \pi^+ K^-K^+$', '$B^0_s \\to \psi(1S) K^+K^-\pi^+\pi^-$', '$B^0_s \\to D_s^-\pi^+$']
for r in RATE:
    new_dict = [] 
    for key, val in train_modes_eff.iteritems():
        if (key[0] in {'base MN', 'Prunned MN', 'BBDT MN special', 
                       'base MN + forest', 'Prunned MN + forest', 'BBDT MN special + forest'}) and r == key[1]:
            new_dict.append((key, val))
    new_dict = dict(new_dict)        
    BarComparePlot(new_dict).plot(new_plot=True, figsize=(24, 8), ylabel='efficiency', fontsize=22)
    xticks(3 + 11 * numpy.arange(6), xticks_labels, rotation=0)
    lgd = legend(bbox_to_anchor=(0.5, 1.3), loc='upper center', ncol=2, fontsize=22)
# plt.savefig('hlt2-experiments.pdf' , format='pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')



In [125]:
from rep.plotting import BarComparePlot
for r in RATE:
    new_dict = [] 
    for key, val in train_modes_eff.iteritems():
        if r == key[1]:
            new_dict.append((key, val))
    new_dict = dict(new_dict)        
    BarComparePlot(new_dict).plot(new_plot=True, figsize=(24, 8), ylabel='efficiency', fontsize=22)
    lgd = legend(bbox_to_anchor=(0.5, 1.3), loc='upper center', ncol=2, fontsize=22)
# plt.savefig('hlt2-experiments.pdf' , format='pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')


Classification report using events


In [126]:
plots = OrderedDict()
for key, value in estimators.items():
    plots[key] = plot_roc_events(value, test[test['signal'] == 1], test[test['signal'] == 0], key)


BBDT MN-6 AUC: 0.948787065058
BBDT MN-5 AUC: 0.946395911938
Prunned MN AUC: 0.954561135933
BBDT MN special + forest AUC: 0.956099303177
Prunned MN + forest AUC: 0.957158203039
BBDT MN special AUC: 0.952332233169
base MN + forest AUC: 0.959967543942
base MN AUC: 0.955959310671

In [127]:
bbdt_plots = plots.copy()
bbdt_plots.pop('Prunned MN')
bbdt_plots.pop('Prunned MN + forest')


Out[127]:
(array([ 0.        ,  0.        ,  0.        , ...,  0.99958609,
         0.99979305,  1.        ]),
 array([  1.54235301e-05,   3.71280728e-05,   4.34090853e-05, ...,
          1.00000000e+00,   1.00000000e+00,   1.00000000e+00]))

In [128]:
from rep.plotting import FunctionsPlot
FunctionsPlot(bbdt_plots).plot(new_plot=True, xlim=(0.02, 0.06), ylim=(0.65, 0.82))
plot([1. * events_pass['2500.0-base MN'] / statistic_length(ds_test_bck)['Events']] * 2, 
     [0., 1], 'b--', label='rate: 2.5 kHz')
plot([1. * events_pass['4000.0-base MN'] / statistic_length(ds_test_bck)['Events']] * 2, 
     [0., 1], 'g--', label='rate: 4. kHz')
lgd = legend(loc='upper center', fontsize=16, bbox_to_anchor=(0.5, 1.3), ncol=3)
title('ROC for events (training decays)', fontsize=20)
xlabel('FRP, background events efficiency', fontsize=20)
ylabel('TPR, signal events efficiency', fontsize=20)


Out[128]:
<matplotlib.text.Text at 0x71ba1d0>

In [130]:
from rep.plotting import FunctionsPlot
FunctionsPlot(plots).plot(new_plot=True, xlim=(0.02, 0.06), ylim=(0.65, 0.82))
plot([1. * events_pass['2500.0-base MN'] / statistic_length(ds_test_bck)['Events']] * 2, 
     [0., 1], 'b--', label='rate: 2.5 kHz')
plot([1. * events_pass['4000.0-base MN'] / statistic_length(ds_test_bck)['Events']] * 2, 
     [0., 1], 'g--', label='rate: 4. kHz')
lgd = legend(loc='upper center', fontsize=16, bbox_to_anchor=(0.5, 1.4), ncol=3)
title('ROC for events (training decays)', fontsize=20)
xlabel('FRP, background events efficiency', fontsize=20)
ylabel('TPR, signal events efficiency', fontsize=20)


Out[130]:
<matplotlib.text.Text at 0x472e70d0>

all channels efficiencies


In [131]:
from collections import defaultdict
all_channels = []
efficiencies = defaultdict(OrderedDict)
for mode in empty_events.keys():
    if mode in set(sig_train_modes_names) or mode == bck_train_mode_name:
        continue
    df = pandas.read_csv(os.path.join(folder , 'mod_{}.csv'.format(mode)), sep='\t')
    if len(df) <= 0:
        continue
    total_events = statistic_length(df)['Events'] + empty_events[mode]
    df = df[(df['pass_nbody'] == 1) & (df['mcor'] <= 10e3)]
    passed_events = statistic_length(df)['Events']
    all_channels.append(df)
    for name, cl in estimators.items():
        prob = cl.predict_proba(df)
        for rate, thresh in thresholds[name].items():
            eff = final_eff_for_mode(df, prob, total_events, thresh)
            latex_name = '$' + Samples[str(mode)]['root'].replace("#", "\\") + '$'
            efficiencies[(name, rate)][latex_name] = eff

In [132]:
for key, val in efficiencies.items():
    for key_2, val_2 in val.items():
        if val_2 <= 0.1:
            efficiencies[key].pop(key_2)

In [133]:
from rep.plotting import BarComparePlot
for r in RATE:
    new_dict = [] 
    for key, val in efficiencies.iteritems():
        if r == key[1]:
            new_dict.append((key, val))
    new_dict = dict(new_dict)        
    BarComparePlot(new_dict).plot(new_plot=True, figsize=(24, 8), ylabel='efficiency', fontsize=22)
    lgd = legend(bbox_to_anchor=(0.5, 1.4), loc='upper center', ncol=2, fontsize=22)



In [134]:
plots_all = OrderedDict()
for key, value in estimators.items():
    plots_all[key] = plot_roc_events(value, pandas.concat([test[test['signal'] == 1]] + all_channels), 
                                     test[test['signal'] == 0], key)


BBDT MN-6 AUC: 0.893047340693
BBDT MN-5 AUC: 0.891496975588
Prunned MN AUC: 0.899367387491
BBDT MN special + forest AUC: 0.894444678162
Prunned MN + forest AUC: 0.891066586762
BBDT MN special AUC: 0.896260843657
base MN + forest AUC: 0.890947087059
base MN AUC: 0.897475380098

In [144]:
from rep.plotting import FunctionsPlot
FunctionsPlot(plots_all).plot(new_plot=True, xlim=(0.02, 0.06), ylim=(0.5, 0.66))
plot([1. * events_pass['2500.0-base MN'] / statistic_length(ds_test_bck)['Events']] * 2, 
     [0., 1], 'b--', label='rate: 2.5 kHz')
plot([1. * events_pass['4000.0-base MN'] / statistic_length(ds_test_bck)['Events']] * 2, 
     [0., 1], 'g--', label='rate: 4. kHz')
lgd = legend(loc='upper center', fontsize=16, bbox_to_anchor=(0.5, 1.3), ncol=4)
title('ROC for events (all decays together)', fontsize=20)
xlabel('FRP, background events efficiency', fontsize=20)
ylabel('TPR, signal events efficiency', fontsize=20)


Out[144]:
<matplotlib.text.Text at 0x5724bc10>

DIfferent rates


In [81]:
thresholds = OrderedDict()
RATE = [2000., 2500., 3000., 3500., 4000.]
for name, cl in estimators.items():
    prob = cl.predict_proba(ds_test_bck)
    thr, result = calculate_thresholds(ds_test_bck, prob, total_test_bck_events, rates=RATE)
    thresholds[name] = thr
    print name, result


BBDT MN-6 {2000.0: (0.9967790194752344, 111, 0.00199450164411622), 3000.0: (0.99547798927619391, 166, 0.002982768224534167), 4000.0: (0.99402680780929342, 222, 0.00398900328823244), 3500.0: (0.99472365991502065, 194, 0.003485885756383304), 2500.0: (0.99595672446060457, 139, 0.002497619175965357)}
BBDT MN-5 {2000.0: (0.99676111865222461, 111, 0.00199450164411622), 3000.0: (0.99559207475332101, 166, 0.002982768224534167), 4000.0: (0.99425757263704562, 222, 0.00398900328823244), 3500.0: (0.99487682163832258, 194, 0.003485885756383304), 2500.0: (0.99628784046073426, 139, 0.002497619175965357)}
BBDT MN special {2000.0: (0.99687789022772133, 111, 0.00199450164411622), 3000.0: (0.99553511016986351, 166, 0.002982768224534167), 4000.0: (0.99441421391844975, 222, 0.00398900328823244), 3500.0: (0.99498939292199462, 194, 0.003485885756383304), 2500.0: (0.99615994806063501, 139, 0.002497619175965357)}
Prunned MN {2000.0: (0.84930329512954106, 111, 0.00199450164411622), 3000.0: (0.80573534113098733, 166, 0.002982768224534167), 4000.0: (0.76591901044032684, 222, 0.00398900328823244), 3500.0: (0.78564740990945015, 194, 0.003485885756383304), 2500.0: (0.81966655622937423, 139, 0.002497619175965357)}
base MN {2000.0: (0.9967558467810903, 111, 0.00199450164411622), 3000.0: (0.99552231740013475, 166, 0.002982768224534167), 4000.0: (0.99410103269102013, 222, 0.00398900328823244), 3500.0: (0.99502030114377649, 194, 0.003485885756383304), 2500.0: (0.99610789680421596, 139, 0.002497619175965357)}

In [82]:
train_modes_eff, statistic = result_statistic({'base MN': estimators['base MN']}, sig_train_modes_names, 
                                              test[test['signal'] == 1],
                                              thresholds, RATE, total_test_signal_events)

In [93]:
order_rate = OrderedDict()
for j in numpy.argsort([i[1] for i in train_modes_eff.keys()]):
    order_rate[train_modes_eff.keys()[j]] = train_modes_eff.values()[j]

In [94]:
from rep.plotting import BarComparePlot
BarComparePlot(order_rate).plot(new_plot=True, figsize=(18, 6), ylabel='efficiency', fontsize=18)
lgd = legend(bbox_to_anchor=(0.5, 1.2), loc='upper center', ncol=5, fontsize=18)
# plt.savefig('rates.pdf' , format='pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')