notebook.community



In [1]:

    
%run additional.ipynb



In [2]:

    
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib






    



WARNING: pylab import has clobbered these variables: ['empty']
`%matplotlib` prevents importing * from pylab and numpy



In [3]:

    
pandas.set_option('display.max_colwidth', 120)



In [103]:

    
PROFILE = 'ssh-ipy'

HLT2 nbody classification

did preselections:

any sv.n,
any sv.minpt
sv.nlt16 < 2

Training channels (read data)

We will use just 11114001, 11296013, 11874042, 12103035, 13246001, 13264021



In [4]:

    
sig_train_modes_names = [11114001, 11296013, 11874042, 12103035, 13246001, 13264021]
bck_train_mode_name = 30000000
sig_train_files = ['mod_{}.csv'.format(name) for name in sig_train_modes_names]
bck_train_files = 'mod_30000000.csv'
folder = "datasets/prepared_hlt_body/"



In [5]:

    
# concat all signal data
if not os.path.exists(folder + 'signal_hlt2.csv'):
    concat_files(folder, sig_train_files, os.path.join(folder , 'signal_hlt2.csv'))



In [6]:

    
signal_data = pandas.read_csv(os.path.join(folder , 'signal_hlt2.csv'), sep='\t')
bck_data = pandas.read_csv(os.path.join(folder , bck_train_files), sep='\t')



In [7]:

    
signal_data.columns









    Out[7]:





Index([u'unique', u'mode', u'event_number', u'sv_number', u'pass_2body', u'pass_nbody', u'signal', u'sumpt', u'm', u'mcor', u'ipchi2', u'chi2', u'sumipchi2', u'fdr', u'nlt16', u'minpt', u'eta', u'pt', u'nmu', u'n', u'fdchi2', u'maxtchi2', u'ngood', u'nmu1', u'mupt', u'n1trk', u'sig', u'idx'], dtype='object')

Counting events and svrs,

that passed L0 and GoodGenB preselection (this data was generated by skim)



In [8]:

    
print 'Signal', statistic_length(signal_data)
print 'Bck', statistic_length(bck_data)









    



Signal {'SVR': 12367085, 'Events': 166059}
Bck {'SVR': 408147, 'Events': 38035}



In [9]:

    
total_bck_events = statistic_length(bck_data)['Events'] + empty_events[bck_train_mode_name]
total_signal_events_by_mode = dict()
for mode in sig_train_modes_names:
    total_signal_events_by_mode[mode] = statistic_length(signal_data[signal_data['mode'] == mode])['Events'] + empty_events[mode]

events distribution by mode



In [10]:

    
print 'Bck:', total_bck_events
'Signal:', total_signal_events_by_mode









    



Bck: 111306






    Out[10]:





('Signal:',
 {11114001: 61102,
  11296013: 17149,
  11874042: 3942,
  12103035: 25250,
  13246001: 37313,
  13264021: 25165})

Define variables



In [11]:

    
variables = ["n", "mcor", "chi2", "eta", "fdchi2", "minpt", "nlt16", "ipchi2", "n1trk", "sumpt"]

Counting events and svrs,

which passed pass_nbody (equivalent Mike's preselections for nbody selection)



In [12]:

    
# hlt2 nbody selection
signal_data = signal_data[(signal_data['pass_nbody'] == 1) & (signal_data['mcor'] <= 10e3)]
bck_data = bck_data[(bck_data['pass_nbody'] == 1) & (bck_data['mcor'] <= 10e3)]



In [13]:

    
print 'Signal', statistic_length(signal_data)
print 'Bck', statistic_length(bck_data)









    



Signal {'SVR': 960600, 'Events': 150566}
Bck {'SVR': 21868, 'Events': 9664}



In [14]:

    
total_signal_events_by_mode_presel = dict()
for mode in sig_train_modes_names:
    total_signal_events_by_mode_presel[mode] = statistic_length(signal_data[signal_data['mode'] == mode])['Events']
total_bck_events_presel = statistic_length(bck_data)['Events']

events distribution by mode



In [15]:

    
print 'Bck:', total_bck_events_presel
'Signal:', total_signal_events_by_mode_presel









    



Bck: 9664






    Out[15]:





('Signal:',
 {11114001: 53105,
  11296013: 16256,
  11874042: 3290,
  12103035: 21498,
  13246001: 33807,
  13264021: 22610})



In [16]:

    
signal_data.head()









    Out[16]:






  
    
      
      unique
      mode
      event_number
      sv_number
      pass_2body
      pass_nbody
      signal
      sumpt
      m
      mcor
      ...
      nmu
      n
      fdchi2
      maxtchi2
      ngood
      nmu1
      mupt
      n1trk
      sig
      idx
    
  
  
    
      1
       11114001_0
       11114001
       0
       1
       1
       1
       1
       6912.10
       2806.110
       4168.77
      ...
       1
       2
       9653.910
       1.20832
       2
       1
       2972.750
       2
       1
       1
    
    
      2
       11114001_0
       11114001
       0
       2
       1
       1
       1
       3731.68
        728.895
       4429.24
      ...
       2
       2
       7424.620
       1.58733
       2
       2
       2972.750
       1
       1
       2
    
    
      3
       11114001_0
       11114001
       0
       3
       1
       1
       1
       4698.27
       1249.280
       2144.50
      ...
       1
       2
       1382.720
       1.58733
       2
       1
        758.924
       1
       1
       3
    
    
      5
       11114001_1
       11114001
       1
       1
       1
       1
       1
       4003.36
       3530.930
       4651.29
      ...
       1
       2
       1102.850
       1.20808
       2
       1
       2611.970
       2
       1
       1
    
    
      9
       11114001_1
       11114001
       1
       5
       1
       1
       1
       2145.67
       1520.500
       2266.15
      ...
       1
       2
        868.049
       1.35907
       2
       1
       1137.010
       2
       1
       5
    
  

5 rows × 28 columns

Prepare train/test splitting

Divide events which passed alll preselections into two equal parts randomly



In [17]:

    
ds_train_signal, ds_train_bck, ds_test_signal, ds_test_bck = prepare_data(signal_data, bck_data, 'unique')

train: counting events and svrs



In [18]:

    
print 'Signal', statistic_length(ds_train_signal)
print 'Bck', statistic_length(ds_train_bck)









    



Signal {'SVR': 479851, 'Events': 75283}
Bck {'SVR': 10759, 'Events': 4832}



In [19]:

    
train = pandas.concat([ds_train_bck, ds_train_signal])

test: counting events and svrs



In [20]:

    
print 'Signal', statistic_length(ds_test_signal)
print 'Bck', statistic_length(ds_test_bck)









    



Signal {'SVR': 480749, 'Events': 75283}
Bck {'SVR': 11109, 'Events': 4832}



In [21]:

    
test = pandas.concat([ds_test_bck, ds_test_signal])

Define all total events in test samples

(which passed just l0 and goodgenB) using also empty events. Suppose that events which didn't pass pass_nboby also were equal randomly divided into training and test samples



In [22]:

    
total_test_bck_events = (total_bck_events - total_bck_events_presel) // 2 + statistic_length(ds_test_bck)['Events']
total_test_signal_events = dict()
for mode in sig_train_modes_names:
    total_not_passed_signal = total_signal_events_by_mode[mode] - total_signal_events_by_mode_presel[mode]
    total_test_signal_events[mode] = total_not_passed_signal // 2 + \
        statistic_length(ds_test_signal[ds_test_signal['mode'] == mode])['Events']



In [23]:

    
print 'Bck total test events:', total_test_bck_events
'Signal total test events:', total_test_signal_events









    



Bck total test events: 55653






    Out[23]:





('Signal total test events:',
 {11114001: 30533,
  11296013: 8630,
  11874042: 1950,
  12103035: 12682,
  13246001: 18627,
  13264021: 12537})



In [24]:

    
import cPickle
if os.path.exists('models/prunned.pkl'):
    with open('models/prunned.pkl', 'r') as file_pr:
        estimators = cPickle.load(file_pr)

Matrixnet training



In [27]:

    
from rep_ef.estimators import MatrixNetSkyGridClassifier

Base model with 5000 trees



In [28]:

    
ef_base = MatrixNetSkyGridClassifier(train_features=variables, user_name='antares',
                                     connection='skygrid',
                                     iterations=5000, sync=False)
ef_base.fit(train, train['signal'])









    Out[28]:





MatrixNetSkyGridClassifier(auto_stop=None, baseline_feature=None,
              command_line_params=None, connection='skygrid',
              dump_filename=None, features_sample_rate_per_iteration=1.0,
              intervals=64, iterations=5000, max_features_per_iteration=6,
              regularization=0.01, sync=False,
              train_features=['n', 'mcor', 'chi2', 'eta', 'fdchi2', 'minpt', 'nlt16', 'ipchi2', 'n1trk', 'sumpt'],
              training_fraction=0.5, user_name='antares')

Base BBDT model



In [29]:

    
special_b = {
'n': [2.5, 3.5],
'mcor': [2000,3000,4000,5000,7500],  # I want to remove splits too close the the B mass as I was looking in simulation and this could distort the mass peak (possibly)
'chi2': [1,2.5,5,7.5,10,100],  # I also propose we add a cut to the pre-selection of chi2 < 1000.  I don't want to put in splits at too small values here b/c these type of inputs are never modeled quite right in the simulation (they always look a bit more smeared in data).
'sumpt': [3000,4000,5000,6000,7500,9000,12e3,23e3,50e3],  # I am happy with the MN splits here (these are almost "as is" from modify-6)
'eta': [2.5,3,3.75,4.25,4.5], # Close to MN.  
'fdchi2': [33,125,350,780,1800,5000,10000],  # I want to make the biggest split 10e3 because in the simulated events there is pretty much only BKGD above 40e3 but we don't want the BDT to learn to kill these as new particles would live here.   Otherwise I took the MN splits and modified the first one (the first one is 5sigma now).
'minpt': [350,500,750,1500,3000,5000],   # let's make 500 the 2nd split so that this lines up with the HLT1 SVs.
'nlt16': [0.5],
'ipchi2': [8,26,62,150,500,1000],  # I also propose we add a cut of IP chi2 < 5000 as it's all background out there.  
'n1trk': [0.5, 1.5, 2.5, 3.5]
}

ef_base_bbdt = MatrixNetSkyGridClassifier(train_features=variables, user_name='antares',
                                          connection='skygrid',
                                          iterations=5000, sync=False, intervals=special_b)
ef_base_bbdt.fit(train, train['signal'])









    Out[29]:





MatrixNetSkyGridClassifier(auto_stop=None, baseline_feature=None,
              command_line_params=None, connection='skygrid',
              dump_filename=None, features_sample_rate_per_iteration=1.0,
              intervals={'sumpt': [3000, 4000, 5000, 6000, 7500, 9000, 12000.0, 23000.0, 50000.0], 'eta': [2.5, 3, 3.75, 4.25, 4.5], 'minpt': [350, 500, 750, 1500, 3000, 5000], 'chi2': [1, 2.5, 5, 7.5, 10, 100], 'n1trk': [0.5, 1.5, 2.5, 3.5], 'nlt16': [0.5], 'mcor': [2000, 3000, 4000, 5000, 7500], 'fdchi2': [33, 125, 350, 780, 1800, 5000, 10000], 'ipchi2': [8, 26, 62, 150, 500, 1000], 'n': [2.5, 3.5]},
              iterations=5000, max_features_per_iteration=6,
              regularization=0.01, sync=False,
              train_features=['n', 'mcor', 'chi2', 'eta', 'fdchi2', 'minpt', 'nlt16', 'ipchi2', 'n1trk', 'sumpt'],
              training_fraction=0.5, user_name='antares')

BBDT-5, 6



In [30]:

    
ef_base_bbdt5 = MatrixNetSkyGridClassifier(train_features=variables, user_name='antares',
                                          connection='skygrid',
                                          iterations=5000, sync=False, intervals=5)
ef_base_bbdt5.fit(train, train['signal'])

ef_base_bbdt6 = MatrixNetSkyGridClassifier(train_features=variables, user_name='antares',
                                          connection='skygrid',
                                          iterations=5000, sync=False, intervals=6)
ef_base_bbdt6.fit(train, train['signal'])









    Out[30]:





MatrixNetSkyGridClassifier(auto_stop=None, baseline_feature=None,
              command_line_params=None, connection='skygrid',
              dump_filename=None, features_sample_rate_per_iteration=1.0,
              intervals=6, iterations=5000, max_features_per_iteration=6,
              regularization=0.01, sync=False,
              train_features=['n', 'mcor', 'chi2', 'eta', 'fdchi2', 'minpt', 'nlt16', 'ipchi2', 'n1trk', 'sumpt'],
              training_fraction=0.5, user_name='antares')

Pruning



In [31]:

    
from rep.data import LabeledDataStorage
from rep.report import ClassificationReport
report = ClassificationReport({'base': ef_base}, LabeledDataStorage(test, test['signal']))



In [32]:

    
report.roc()









    Out[32]:

Minimize log_loss

он же BinomialDeviance



In [33]:

    
%run pruning.py

Training sample is cut to be aliquot 8



In [34]:

    
new_trainlen = (len(train) // 8) * 8
trainX = train[ef_base.features][:new_trainlen].values
trainY = train['signal'][:new_trainlen].values 
trainW = numpy.ones(len(trainY))
trainW[trainY == 0] *= sum(trainY) / sum(1 - trainY)



In [35]:

    
new_features, new_formula_mx, new_classifier = select_trees(trainX, trainY, sample_weight=trainW,
                                                            initial_classifier=ef_base,
                                                            iterations=100, n_candidates=100, 
                                                            learning_rate=0.1, regularization=50.)









    



0 1.61639046851 0.864149179375
1 1.55188342817 0.871006756536
2 1.4974419247 0.871931415195
3 1.45063957562 0.871257415717
4 1.40839869622 0.882903045944
5 1.37366046126 0.88787704316
6 1.34072767041 0.887003202722
7 1.31150104365 0.893023018467
8 1.28379095058 0.896263184453
9 1.25809114034 0.898190409332
10 1.23593045848 0.89788161298
11 1.21672628163 0.899391156041
12 1.19916894981 0.901041763496
13 1.18285986055 0.900870790499
14 1.16821214312 0.901828769835
15 1.15477734313 0.902456983346
16 1.14213029917 0.903531415611
17 1.12974589809 0.904730333043
18 1.11902162756 0.905431320555
19 1.10922459706 0.906217749773
20 1.09967334445 0.907074639763
21 1.09128157517 0.908045647467
22 1.08329445479 0.908310608384
23 1.07526925282 0.909011025167
24 1.06802464375 0.909790861995
25 1.06178926981 0.910268591808
26 1.05538189711 0.910926982288
27 1.04974682021 0.911391242005
28 1.04431160909 0.911875445471
29 1.03922534297 0.912452518835
30 1.03436250836 0.912689445171
31 1.02963139549 0.913149040851
32 1.02534119856 0.913312034488
33 1.02116665067 0.913531434514
34 1.01741194297 0.913966424249
35 1.01364996586 0.914297519611
36 1.01003652599 0.91472001994
37 1.00665776596 0.915135662804
38 1.00334874405 0.915348876042
39 1.00006517906 0.915804553904
40 0.997073869799 0.916169904628
41 0.994289821897 0.916483854005
42 0.991591343735 0.916773954987
43 0.988827887912 0.917221982097
44 0.986271418313 0.91752158054
45 0.983893571884 0.917694392983
46 0.981632392586 0.91795698595
47 0.979477890859 0.91804893046
48 0.977394041489 0.918156519221
49 0.975265168264 0.918412660519
50 0.973352807959 0.918617887619
51 0.971251012274 0.91875493202
52 0.969354531674 0.918978874537
53 0.967553121642 0.919198296838
54 0.965804421623 0.919480419235
55 0.96409568683 0.919733387772
56 0.9623992872 0.920035855162
57 0.960784529657 0.920191115823
58 0.959280770215 0.920367844727
59 0.957787245789 0.920583069027
60 0.956201299682 0.920745173206
61 0.954705664307 0.920993930474
62 0.953308382945 0.921239222111
63 0.951929776196 0.921432167157
64 0.950565745052 0.921595320013
65 0.949272821333 0.92172460219
66 0.947943617271 0.921915534431
67 0.94672827271 0.922104992055
68 0.945491761711 0.922277753458
69 0.944280050248 0.922392816125
70 0.943171510591 0.922532947673
71 0.942075376538 0.922717989193
72 0.94103088427 0.922812923227
73 0.939868657246 0.923043876713
74 0.938893696144 0.923132846034
75 0.937832755491 0.923224778438
76 0.936780426641 0.923351691601
77 0.935799722128 0.923507977501
78 0.934856256431 0.923549669289
79 0.933859349167 0.923711498903
80 0.932928206017 0.923847081665
81 0.932001324719 0.924036900534
82 0.931088570411 0.924177670024
83 0.930261124209 0.924254283008
84 0.929307015668 0.924424364707
85 0.928375147698 0.924595685293
86 0.92745132652 0.92472634456
87 0.926564151302 0.924850853359
88 0.925662013781 0.925017828153
89 0.924834935539 0.925096802694
90 0.923977048654 0.925209568596
91 0.92315137584 0.92531444453
92 0.92236916542 0.925436261227
93 0.921630076264 0.925546750023
94 0.920800498935 0.925651854423
95 0.920078219071 0.925734530712
96 0.919332405527 0.925880794329
97 0.918620721909 0.925988038696
98 0.917779386901 0.926118577388
99 0.917027666116 0.92621668273



In [36]:

    
prunned = cPickle.loads(cPickle.dumps(ef_base))
prunned.formula_mx = new_formula_mx



In [100]:

    
def mode_scheme_fit(train, base, suf, model_file):
    blending_parts = OrderedDict()
    for n_ch, ch in enumerate(sig_train_modes_names):
        temp = FoldingClassifier(base_estimator=base, random_state=11, features=variables, ipc_profile=PROFILE)
        temp_data = train[(train['mode'] == ch) | (train['mode'] == bck_train_mode_name)]
        temp.fit(temp_data, temp_data['signal'])
        blending_parts['ch' + str(n_ch) + suf] = temp
    import cPickle
    with open(model_file, 'w') as f:
        cPickle.dump(blending_parts, f)

def mode_scheme_predict(data, suf, model_file, mode='train'):
    with open(model_file, 'r') as f:
        blending_parts = cPickle.load(f)
    for n_ch, ch in enumerate(sig_train_modes_names):
        temp_name = 'ch' + str(n_ch) + suf
        if mode == 'train':
            temp_key = ((data['mode'] == ch) | (data['mode'] == bck_train_mode_name))
            data.ix[temp_key, temp_name] = blending_parts[temp_name].predict_proba(
                data[temp_key])[:, 1]
            data.ix[~temp_key, temp_name] = blending_parts[temp_name].predict_proba(
                data[~temp_key])[:, 1]
        else:
            data[temp_name] = blending_parts[temp_name].predict_proba(data)[:, 1]



In [101]:

    
def get_best_svr_by_channel(data, feature_mask, count=1):
    add_events = []
    for id_est, channel in enumerate(sig_train_modes_names):
        train_part = data[(data['mode'] == channel)]
        for num, group in train_part.groupby('unique'):
            index = numpy.argsort(group[feature_mask.format(id_est)].values)[::-1]
            add_events.append(group.iloc[index[:count], :])
    good_events = pandas.concat([data[(data['mode'] == bck_train_mode_name)]] + add_events)
    print len(good_events)
    return good_events



In [104]:

    
from sklearn.ensemble import RandomForestClassifier
from rep.metaml import FoldingClassifier
base = RandomForestClassifier(n_estimators=500, min_samples_leaf=50, max_depth=6,
                              max_features=7, n_jobs=8)

mode_scheme_fit(train, base, '', 'forest_trick.pkl')
mode_scheme_predict(train, '', 'forest_trick.pkl')
mode_scheme_predict(test, '', 'forest_trick.pkl', mode='test')









    



KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column



In [107]:

    
good_events = get_best_svr_by_channel(train, 'ch{}', 2)
forest_mn = MatrixNetSkyGridClassifier(train_features=variables,
                                     user_name='antares',
                                     connection='skygrid',
                                     iterations=5000, sync=False)
forest_mn.fit(good_events, good_events['signal'])









    



148864






    Out[107]:





MatrixNetSkyGridClassifier(auto_stop=None, baseline_feature=None,
              command_line_params=None, connection='skygrid',
              dump_filename=None, features_sample_rate_per_iteration=1.0,
              intervals=64, iterations=5000, max_features_per_iteration=6,
              regularization=0.01, sync=False,
              train_features=['n', 'mcor', 'chi2', 'eta', 'fdchi2', 'minpt', 'nlt16', 'ipchi2', 'n1trk', 'sumpt'],
              training_fraction=0.5, user_name='antares')



In [112]:

    
forest_mn_bbdt = MatrixNetSkyGridClassifier(train_features=variables,
                                            user_name='antares',
                                             connection='skygrid',
                                             iterations=5000, sync=False, intervals=special_b)
forest_mn_bbdt.fit(good_events, good_events['signal'])









    Out[112]:





MatrixNetSkyGridClassifier(auto_stop=None, baseline_feature=None,
              command_line_params=None, connection='skygrid',
              dump_filename=None, features_sample_rate_per_iteration=1.0,
              intervals={'sumpt': [3000, 4000, 5000, 6000, 7500, 9000, 12000.0, 23000.0, 50000.0], 'eta': [2.5, 3, 3.75, 4.25, 4.5], 'minpt': [350, 500, 750, 1500, 3000, 5000], 'chi2': [1, 2.5, 5, 7.5, 10, 100], 'n1trk': [0.5, 1.5, 2.5, 3.5], 'nlt16': [0.5], 'mcor': [2000, 3000, 4000, 5000, 7500], 'fdchi2': [33, 125, 350, 780, 1800, 5000, 10000], 'ipchi2': [8, 26, 62, 150, 500, 1000], 'n': [2.5, 3.5]},
              iterations=5000, max_features_per_iteration=6,
              regularization=0.01, sync=False,
              train_features=['n', 'mcor', 'chi2', 'eta', 'fdchi2', 'minpt', 'nlt16', 'ipchi2', 'n1trk', 'sumpt'],
              training_fraction=0.5, user_name='antares')



In [116]:

    
new_trainlen = (len(good_events) // 8) * 8
trainX = good_events[forest_mn.features][:new_trainlen].values
trainY = good_events['signal'][:new_trainlen].values 
trainW = numpy.ones(len(trainY))
trainW[trainY == 0] *= sum(trainY) / sum(1 - trainY)



In [143]:

    
len(train), len(good_events)









    Out[143]:





(490610, 148864)



In [117]:

    
new_features_f, new_formula_mx_f, new_classifier_f = select_trees(trainX, trainY, sample_weight=trainW,
                                                                initial_classifier=forest_mn,
                                                                iterations=100, n_candidates=100, 
                                                                learning_rate=0.1, regularization=50.)









    



0 1.59878247354 0.898267058314
1 1.51376874775 0.923562775721
2 1.44079443049 0.924933289748
3 1.3805166538 0.926697044996
4 1.32495882539 0.927486156871
5 1.27496922824 0.929864337647
6 1.23062960925 0.933785251222
7 1.19300866229 0.933685420934
8 1.15761858181 0.934627903387
9 1.12736800412 0.935328811146
10 1.0991202171 0.935250788258
11 1.07445225558 0.936354042668
12 1.05126925344 0.937850097817
13 1.03058754558 0.938862143141
14 1.011693557 0.939935307133
15 0.994375772612 0.939972899881
16 0.978284559854 0.940253851797
17 0.963552829217 0.94151909765
18 0.95009519394 0.942402605293
19 0.937824257337 0.943313473644
20 0.925258705091 0.943963868293
21 0.914464860855 0.944279791601
22 0.904148147213 0.94469312103
23 0.894715951762 0.945032701158
24 0.885681679521 0.945655614969
25 0.877518873326 0.946299568954
26 0.869470989835 0.947021607744
27 0.862325731184 0.947294104017
28 0.85581607304 0.947788591194
29 0.849038920939 0.948225310934
30 0.842894270099 0.948501071958
31 0.837094979986 0.948995302384
32 0.831854245127 0.949197773096
33 0.826866939463 0.949624221087
34 0.822073866616 0.950014177368
35 0.817624153846 0.950377257506
36 0.813239161583 0.950787564131
37 0.809029532184 0.951188790564
38 0.805019091745 0.951439237491
39 0.801188836807 0.951767397723
40 0.797425227379 0.95204708708
41 0.793877353362 0.952296314858
42 0.790359499498 0.952553469632
43 0.787111040846 0.952867135342
44 0.784038253519 0.953166642023
45 0.7810720125 0.953454269145
46 0.778247009403 0.953694128011
47 0.77554796413 0.953835621722
48 0.772927761308 0.953910043357
49 0.770445182331 0.954081491538
50 0.768067426525 0.954254790148
51 0.76568604058 0.954442982373
52 0.763463801792 0.954526193461
53 0.761344647854 0.954709426307
54 0.759307040909 0.954886963171
55 0.757037477502 0.955075411474
56 0.755142163347 0.955209565722
57 0.753288753573 0.955342839342
58 0.751395688236 0.955431219786
59 0.749595943251 0.955640305471
60 0.747779605378 0.955797967132
61 0.746098369404 0.955874234484
62 0.744367946735 0.956023756816
63 0.742807168465 0.956142489476
64 0.741153084045 0.956320105754
65 0.739642832294 0.956437582587
66 0.738204947827 0.956526529701
67 0.736742814892 0.956671447665
68 0.735430133919 0.956724998722
69 0.73409406536 0.956883355598
70 0.732739448365 0.957025961114
71 0.731516026479 0.957146624292
72 0.730234646269 0.95725046031
73 0.728962072682 0.957391345623
74 0.727785085272 0.957488075375
75 0.726656382863 0.957576798379
76 0.725556843544 0.957672831233
77 0.724400637666 0.957772615421
78 0.723314294513 0.957877710969
79 0.722304240869 0.957955053784
80 0.721265254603 0.958010397728
81 0.720290057549 0.958100522603
82 0.719287866499 0.958197303166
83 0.718327446866 0.95828584715
84 0.717248826746 0.95837316761
85 0.716359696296 0.958473355265
86 0.715416786885 0.958574747263
87 0.714477785961 0.958671261316
88 0.713549252037 0.95874033121
89 0.712539082925 0.958824843891
90 0.711706540491 0.958912600795
91 0.710874318757 0.958997675435
92 0.71009676271 0.959052239702
93 0.709309638511 0.959126766662
94 0.708524743109 0.959228390847
95 0.707789412854 0.95931586206
96 0.707022757936 0.959388495519
97 0.706333697516 0.959431312136
98 0.705551481107 0.959490530574
99 0.704832677023 0.959557165533



In [118]:

    
prunned_f = cPickle.loads(cPickle.dumps(forest_mn))
prunned_f.formula_mx = new_formula_mx_f



In [120]:

    
estimators = {'base MN': ef_base, 'BBDT MN-6': ef_base_bbdt6, 'BBDT MN-5': ef_base_bbdt5,
              'BBDT MN special': ef_base_bbdt,
              'Prunned MN': prunned, 'base MN + forest': forest_mn,
              'BBDT MN special + forest': forest_mn_bbdt, 'Prunned MN + forest': prunned_f}



In [121]:

    
import cPickle
with open('models/prunned.pkl', 'w') as file_pr:
    cPickle.dump(estimators, file_pr)

Calculate thresholds on classifiers



In [122]:

    
thresholds = dict()
test_bck = test[test['signal'] == 0]
RATE = [2500., 4000.]
events_pass = dict()
for name, cl in estimators.items():
    prob = cl.predict_proba(test_bck)
    thr, result = calculate_thresholds(test_bck, prob, total_test_bck_events, rates=RATE)
    for rate, val in result.items():
        events_pass['{}-{}'.format(rate, name)] = val[1]
    thresholds[name] = thr
    print name, result









    



BBDT MN-6 {4000.0: (0.99402680780929342, 222, 0.00398900328823244), 2500.0: (0.99595672446060457, 139, 0.002497619175965357)}
BBDT MN-5 {4000.0: (0.99425757263704562, 222, 0.00398900328823244), 2500.0: (0.99628784046073426, 139, 0.002497619175965357)}
Prunned MN {4000.0: (0.76591901044032684, 222, 0.00398900328823244), 2500.0: (0.81966655622937423, 139, 0.002497619175965357)}
BBDT MN special + forest {4000.0: (0.9804137431800608, 222, 0.00398900328823244), 2500.0: (0.98829762728883797, 139, 0.002497619175965357)}
Prunned MN + forest {4000.0: (0.7291955536845629, 222, 0.00398900328823244), 2500.0: (0.81979088052929106, 139, 0.002497619175965357)}
BBDT MN special {4000.0: (0.99441421391844975, 222, 0.00398900328823244), 2500.0: (0.99615994806063501, 139, 0.002497619175965357)}
base MN + forest {4000.0: (0.97825892637449507, 222, 0.00398900328823244), 2500.0: (0.98666907201136622, 139, 0.002497619175965357)}
base MN {4000.0: (0.99410103269102013, 222, 0.00398900328823244), 2500.0: (0.99610789680421596, 139, 0.002497619175965357)}

Final efficiencies for each mode



In [123]:

    
train_modes_eff, statistic = result_statistic(estimators, sig_train_modes_names, 
                                              test[test['signal'] == 1],
                                              thresholds, RATE, total_test_signal_events)



In [153]:

    
from rep.plotting import BarComparePlot
xticks_labels = ['$B^0 \\to K^*\mu^+\mu^-$', "$B^0 \\to D^+D^-$", "$B^0 \\to D^- \mu^+ \\nu_{\mu}$", 
                 '$B^+ \\to \pi^+ K^-K^+$', '$B^0_s \\to \psi(1S) K^+K^-\pi^+\pi^-$', '$B^0_s \\to D_s^-\pi^+$']
for r in RATE:
    new_dict = [] 
    for key, val in train_modes_eff.iteritems():
        if (key[0] in {'base MN', 'Prunned MN', 'BBDT MN special', 
                       'base MN + forest', 'Prunned MN + forest', 'BBDT MN special + forest'}) and r == key[1]:
            new_dict.append((key, val))
    new_dict = dict(new_dict)        
    BarComparePlot(new_dict).plot(new_plot=True, figsize=(24, 8), ylabel='efficiency', fontsize=22)
    xticks(3 + 11 * numpy.arange(6), xticks_labels, rotation=0)
    lgd = legend(bbox_to_anchor=(0.5, 1.3), loc='upper center', ncol=2, fontsize=22)
# plt.savefig('hlt2-experiments.pdf' , format='pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')



In [125]:

    
from rep.plotting import BarComparePlot
for r in RATE:
    new_dict = [] 
    for key, val in train_modes_eff.iteritems():
        if r == key[1]:
            new_dict.append((key, val))
    new_dict = dict(new_dict)        
    BarComparePlot(new_dict).plot(new_plot=True, figsize=(24, 8), ylabel='efficiency', fontsize=22)
    lgd = legend(bbox_to_anchor=(0.5, 1.3), loc='upper center', ncol=2, fontsize=22)
# plt.savefig('hlt2-experiments.pdf' , format='pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')

Classification report using events



In [126]:

    
plots = OrderedDict()
for key, value in estimators.items():
    plots[key] = plot_roc_events(value, test[test['signal'] == 1], test[test['signal'] == 0], key)









    



BBDT MN-6 AUC: 0.948787065058
BBDT MN-5 AUC: 0.946395911938
Prunned MN AUC: 0.954561135933
BBDT MN special + forest AUC: 0.956099303177
Prunned MN + forest AUC: 0.957158203039
BBDT MN special AUC: 0.952332233169
base MN + forest AUC: 0.959967543942
base MN AUC: 0.955959310671



In [127]:

    
bbdt_plots = plots.copy()
bbdt_plots.pop('Prunned MN')
bbdt_plots.pop('Prunned MN + forest')









    Out[127]:





(array([ 0.        ,  0.        ,  0.        , ...,  0.99958609,
         0.99979305,  1.        ]),
 array([  1.54235301e-05,   3.71280728e-05,   4.34090853e-05, ...,
          1.00000000e+00,   1.00000000e+00,   1.00000000e+00]))



In [128]:

    
from rep.plotting import FunctionsPlot
FunctionsPlot(bbdt_plots).plot(new_plot=True, xlim=(0.02, 0.06), ylim=(0.65, 0.82))
plot([1. * events_pass['2500.0-base MN'] / statistic_length(ds_test_bck)['Events']] * 2, 
     [0., 1], 'b--', label='rate: 2.5 kHz')
plot([1. * events_pass['4000.0-base MN'] / statistic_length(ds_test_bck)['Events']] * 2, 
     [0., 1], 'g--', label='rate: 4. kHz')
lgd = legend(loc='upper center', fontsize=16, bbox_to_anchor=(0.5, 1.3), ncol=3)
title('ROC for events (training decays)', fontsize=20)
xlabel('FRP, background events efficiency', fontsize=20)
ylabel('TPR, signal events efficiency', fontsize=20)









    Out[128]:





<matplotlib.text.Text at 0x71ba1d0>



In [130]:

    
from rep.plotting import FunctionsPlot
FunctionsPlot(plots).plot(new_plot=True, xlim=(0.02, 0.06), ylim=(0.65, 0.82))
plot([1. * events_pass['2500.0-base MN'] / statistic_length(ds_test_bck)['Events']] * 2, 
     [0., 1], 'b--', label='rate: 2.5 kHz')
plot([1. * events_pass['4000.0-base MN'] / statistic_length(ds_test_bck)['Events']] * 2, 
     [0., 1], 'g--', label='rate: 4. kHz')
lgd = legend(loc='upper center', fontsize=16, bbox_to_anchor=(0.5, 1.4), ncol=3)
title('ROC for events (training decays)', fontsize=20)
xlabel('FRP, background events efficiency', fontsize=20)
ylabel('TPR, signal events efficiency', fontsize=20)









    Out[130]:





<matplotlib.text.Text at 0x472e70d0>

all channels efficiencies



In [131]:

    
from collections import defaultdict
all_channels = []
efficiencies = defaultdict(OrderedDict)
for mode in empty_events.keys():
    if mode in set(sig_train_modes_names) or mode == bck_train_mode_name:
        continue
    df = pandas.read_csv(os.path.join(folder , 'mod_{}.csv'.format(mode)), sep='\t')
    if len(df) <= 0:
        continue
    total_events = statistic_length(df)['Events'] + empty_events[mode]
    df = df[(df['pass_nbody'] == 1) & (df['mcor'] <= 10e3)]
    passed_events = statistic_length(df)['Events']
    all_channels.append(df)
    for name, cl in estimators.items():
        prob = cl.predict_proba(df)
        for rate, thresh in thresholds[name].items():
            eff = final_eff_for_mode(df, prob, total_events, thresh)
            latex_name = '$' + Samples[str(mode)]['root'].replace("#", "\\") + '$'
            efficiencies[(name, rate)][latex_name] = eff



In [132]:

    
for key, val in efficiencies.items():
    for key_2, val_2 in val.items():
        if val_2 <= 0.1:
            efficiencies[key].pop(key_2)



In [133]:

    
from rep.plotting import BarComparePlot
for r in RATE:
    new_dict = [] 
    for key, val in efficiencies.iteritems():
        if r == key[1]:
            new_dict.append((key, val))
    new_dict = dict(new_dict)        
    BarComparePlot(new_dict).plot(new_plot=True, figsize=(24, 8), ylabel='efficiency', fontsize=22)
    lgd = legend(bbox_to_anchor=(0.5, 1.4), loc='upper center', ncol=2, fontsize=22)



In [134]:

    
plots_all = OrderedDict()
for key, value in estimators.items():
    plots_all[key] = plot_roc_events(value, pandas.concat([test[test['signal'] == 1]] + all_channels), 
                                     test[test['signal'] == 0], key)









    



BBDT MN-6 AUC: 0.893047340693
BBDT MN-5 AUC: 0.891496975588
Prunned MN AUC: 0.899367387491
BBDT MN special + forest AUC: 0.894444678162
Prunned MN + forest AUC: 0.891066586762
BBDT MN special AUC: 0.896260843657
base MN + forest AUC: 0.890947087059
base MN AUC: 0.897475380098



In [144]:

    
from rep.plotting import FunctionsPlot
FunctionsPlot(plots_all).plot(new_plot=True, xlim=(0.02, 0.06), ylim=(0.5, 0.66))
plot([1. * events_pass['2500.0-base MN'] / statistic_length(ds_test_bck)['Events']] * 2, 
     [0., 1], 'b--', label='rate: 2.5 kHz')
plot([1. * events_pass['4000.0-base MN'] / statistic_length(ds_test_bck)['Events']] * 2, 
     [0., 1], 'g--', label='rate: 4. kHz')
lgd = legend(loc='upper center', fontsize=16, bbox_to_anchor=(0.5, 1.3), ncol=4)
title('ROC for events (all decays together)', fontsize=20)
xlabel('FRP, background events efficiency', fontsize=20)
ylabel('TPR, signal events efficiency', fontsize=20)









    Out[144]:





<matplotlib.text.Text at 0x5724bc10>

DIfferent rates



In [81]:

    
thresholds = OrderedDict()
RATE = [2000., 2500., 3000., 3500., 4000.]
for name, cl in estimators.items():
    prob = cl.predict_proba(ds_test_bck)
    thr, result = calculate_thresholds(ds_test_bck, prob, total_test_bck_events, rates=RATE)
    thresholds[name] = thr
    print name, result









    



BBDT MN-6 {2000.0: (0.9967790194752344, 111, 0.00199450164411622), 3000.0: (0.99547798927619391, 166, 0.002982768224534167), 4000.0: (0.99402680780929342, 222, 0.00398900328823244), 3500.0: (0.99472365991502065, 194, 0.003485885756383304), 2500.0: (0.99595672446060457, 139, 0.002497619175965357)}
BBDT MN-5 {2000.0: (0.99676111865222461, 111, 0.00199450164411622), 3000.0: (0.99559207475332101, 166, 0.002982768224534167), 4000.0: (0.99425757263704562, 222, 0.00398900328823244), 3500.0: (0.99487682163832258, 194, 0.003485885756383304), 2500.0: (0.99628784046073426, 139, 0.002497619175965357)}
BBDT MN special {2000.0: (0.99687789022772133, 111, 0.00199450164411622), 3000.0: (0.99553511016986351, 166, 0.002982768224534167), 4000.0: (0.99441421391844975, 222, 0.00398900328823244), 3500.0: (0.99498939292199462, 194, 0.003485885756383304), 2500.0: (0.99615994806063501, 139, 0.002497619175965357)}
Prunned MN {2000.0: (0.84930329512954106, 111, 0.00199450164411622), 3000.0: (0.80573534113098733, 166, 0.002982768224534167), 4000.0: (0.76591901044032684, 222, 0.00398900328823244), 3500.0: (0.78564740990945015, 194, 0.003485885756383304), 2500.0: (0.81966655622937423, 139, 0.002497619175965357)}
base MN {2000.0: (0.9967558467810903, 111, 0.00199450164411622), 3000.0: (0.99552231740013475, 166, 0.002982768224534167), 4000.0: (0.99410103269102013, 222, 0.00398900328823244), 3500.0: (0.99502030114377649, 194, 0.003485885756383304), 2500.0: (0.99610789680421596, 139, 0.002497619175965357)}



In [82]:

    
train_modes_eff, statistic = result_statistic({'base MN': estimators['base MN']}, sig_train_modes_names, 
                                              test[test['signal'] == 1],
                                              thresholds, RATE, total_test_signal_events)



In [93]:

    
order_rate = OrderedDict()
for j in numpy.argsort([i[1] for i in train_modes_eff.keys()]):
    order_rate[train_modes_eff.keys()[j]] = train_modes_eff.values()[j]



In [94]:

    
from rep.plotting import BarComparePlot
BarComparePlot(order_rate).plot(new_plot=True, figsize=(18, 6), ylabel='efficiency', fontsize=18)
lgd = legend(bbox_to_anchor=(0.5, 1.2), loc='upper center', ncol=5, fontsize=18)
# plt.savefig('rates.pdf' , format='pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')

	unique	mode	event_number	sv_number	pass_2body	pass_nbody	signal	sumpt	m	mcor	...	nmu	n	fdchi2	maxtchi2	ngood	nmu1	mupt	n1trk	sig	idx
1	11114001_0	11114001	0	1	1	1	1	6912.10	2806.110	4168.77	...	1	2	9653.910	1.20832	2	1	2972.750	2	1	1
2	11114001_0	11114001	0	2	1	1	1	3731.68	728.895	4429.24	...	2	2	7424.620	1.58733	2	2	2972.750	1	1	2
3	11114001_0	11114001	0	3	1	1	1	4698.27	1249.280	2144.50	...	1	2	1382.720	1.58733	2	1	758.924	1	1	3
5	11114001_1	11114001	1	1	1	1	1	4003.36	3530.930	4651.29	...	1	2	1102.850	1.20808	2	1	2611.970	2	1	1
9	11114001_1	11114001	1	5	1	1	1	2145.67	1520.500	2266.15	...	1	2	868.049	1.35907	2	1	1137.010	2	1	5