In [1]:

    
%run additional.ipynb



In [2]:

    
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib






    



WARNING: pylab import has clobbered these variables: ['empty']
`%matplotlib` prevents importing * from pylab and numpy



In [3]:

    
pandas.set_option('display.max_colwidth', 120)

HLT2 nbody classification

did preselections:

any sv.n,
any sv.minpt
sv.nlt16 < 2

Training channels (read data)

We will use just 11114001, 11296013, 11874042, 12103035, 13246001, 13264021



In [4]:

    
sig_train_modes_names = [11114001, 11296013, 11874042, 12103035, 13246001, 13264021]
bck_train_mode_name = 30000000
sig_train_files = ['mod_{}.csv'.format(name) for name in sig_train_modes_names]
bck_train_files = 'mod_30000000.csv'
folder = "datasets/prepared_hlt_body/"



In [5]:

    
# concat all signal data
if not os.path.exists(folder + 'signal_hlt2.csv'):
    concat_files(folder, sig_train_files, os.path.join(folder , 'signal_hlt2.csv'))



In [6]:

    
signal_data = pandas.read_csv(os.path.join(folder , 'signal_hlt2.csv'), sep='\t')
bck_data = pandas.read_csv(os.path.join(folder , bck_train_files), sep='\t')



In [7]:

    
signal_data.columns









    Out[7]:





Index([u'unique', u'mode', u'event_number', u'sv_number', u'pass_2body', u'pass_nbody', u'signal', u'sumpt', u'm', u'mcor', u'ipchi2', u'chi2', u'sumipchi2', u'fdr', u'nlt16', u'minpt', u'eta', u'pt', u'nmu', u'n', u'fdchi2', u'maxtchi2', u'ngood', u'nmu1', u'mupt', u'n1trk', u'sig', u'idx'], dtype='object')

Counting events and svrs,

that passed L0 and GoodGenB preselection (this data was generated by skim)



In [8]:

    
print 'Signal', statistic_length(signal_data)
print 'Bck', statistic_length(bck_data)









    



Signal {'SVR': 12367085, 'Events': 166059}
Bck {'SVR': 408147, 'Events': 38035}



In [9]:

    
total_bck_events = statistic_length(bck_data)['Events'] + empty_events[bck_train_mode_name]
total_signal_events_by_mode = dict()
for mode in sig_train_modes_names:
    total_signal_events_by_mode[mode] = statistic_length(signal_data[signal_data['mode'] == mode])['Events'] + empty_events[mode]

events distribution by mode



In [10]:

    
print 'Bck:', total_bck_events
'Signal:', total_signal_events_by_mode









    



Bck: 111306






    Out[10]:





('Signal:',
 {11114001: 61102,
  11296013: 17149,
  11874042: 3942,
  12103035: 25250,
  13246001: 37313,
  13264021: 25165})

Define variables

We will use two sets of variables



In [11]:

    
variables = ["n", "mcor", "chi2", "eta", "fdchi2", "minpt", "nlt16", "ipchi2", "n1trk"]
variables_base = variables + ["sumpt", "pt", "m"]
variables_additional = ["fdr", "sumpt", "sumipchi2"]

Counting events and svrs,

which passed pass_nbody (equivalent Mike's preselections for nbody selection)



In [12]:

    
# hlt2 nbody selection
signal_data = signal_data[(signal_data['pass_nbody'] == 1) & (signal_data['mcor'] <= 10e3)]
bck_data = bck_data[(bck_data['pass_nbody'] == 1) & (bck_data['mcor'] <= 10e3)]



In [13]:

    
print 'Signal', statistic_length(signal_data)
print 'Bck', statistic_length(bck_data)









    



Signal {'SVR': 960600, 'Events': 150566}
Bck {'SVR': 21868, 'Events': 9664}



In [14]:

    
total_signal_events_by_mode_presel = dict()
for mode in sig_train_modes_names:
    total_signal_events_by_mode_presel[mode] = statistic_length(signal_data[signal_data['mode'] == mode])['Events']
total_bck_events_presel = statistic_length(bck_data)['Events']

events distribution by mode



In [15]:

    
print 'Bck:', total_bck_events_presel
'Signal:', total_signal_events_by_mode_presel









    



Bck: 9664






    Out[15]:





('Signal:',
 {11114001: 53105,
  11296013: 16256,
  11874042: 3290,
  12103035: 21498,
  13246001: 33807,
  13264021: 22610})



In [16]:

    
signal_data.head()









    Out[16]:






  
    
      
      unique
      mode
      event_number
      sv_number
      pass_2body
      pass_nbody
      signal
      sumpt
      m
      mcor
      ...
      nmu
      n
      fdchi2
      maxtchi2
      ngood
      nmu1
      mupt
      n1trk
      sig
      idx
    
  
  
    
      1
       11114001_0
       11114001
       0
       1
       1
       1
       1
       6912.10
       2806.110
       4168.77
      ...
       1
       2
       9653.910
       1.20832
       2
       1
       2972.750
       2
       1
       1
    
    
      2
       11114001_0
       11114001
       0
       2
       1
       1
       1
       3731.68
        728.895
       4429.24
      ...
       2
       2
       7424.620
       1.58733
       2
       2
       2972.750
       1
       1
       2
    
    
      3
       11114001_0
       11114001
       0
       3
       1
       1
       1
       4698.27
       1249.280
       2144.50
      ...
       1
       2
       1382.720
       1.58733
       2
       1
        758.924
       1
       1
       3
    
    
      5
       11114001_1
       11114001
       1
       1
       1
       1
       1
       4003.36
       3530.930
       4651.29
      ...
       1
       2
       1102.850
       1.20808
       2
       1
       2611.970
       2
       1
       1
    
    
      9
       11114001_1
       11114001
       1
       5
       1
       1
       1
       2145.67
       1520.500
       2266.15
      ...
       1
       2
        868.049
       1.35907
       2
       1
       1137.010
       2
       1
       5
    
  

5 rows × 28 columns

Prepare train/test splitting

Divide events which passed alll preselections into two equal parts randomly



In [17]:

    
ds_train_signal, ds_train_bck, ds_test_signal, ds_test_bck = prepare_data(signal_data, bck_data, 'unique')

train: counting events and svrs



In [18]:

    
print 'Signal', statistic_length(ds_train_signal)
print 'Bck', statistic_length(ds_train_bck)









    



Signal {'SVR': 479851, 'Events': 75283}
Bck {'SVR': 10759, 'Events': 4832}



In [19]:

    
train = pandas.concat([ds_train_bck, ds_train_signal])

test: counting events and svrs



In [20]:

    
print 'Signal', statistic_length(ds_test_signal)
print 'Bck', statistic_length(ds_test_bck)









    



Signal {'SVR': 480749, 'Events': 75283}
Bck {'SVR': 11109, 'Events': 4832}



In [21]:

    
test = pandas.concat([ds_test_bck, ds_test_signal])

Define all total events in test samples

(which passed just l0 and goodgenB) using also empty events. Suppose that events which didn't pass pass_nboby also were equal randomly divided into training and test samples



In [22]:

    
total_test_bck_events = (total_bck_events - total_bck_events_presel) // 2 + statistic_length(ds_test_bck)['Events']
total_test_signal_events = dict()
for mode in sig_train_modes_names:
    total_not_passed_signal = total_signal_events_by_mode[mode] - total_signal_events_by_mode_presel[mode]
    total_test_signal_events[mode] = total_not_passed_signal // 2 + \
        statistic_length(ds_test_signal[ds_test_signal['mode'] == mode])['Events']



In [23]:

    
print 'Bck total test events:', total_test_bck_events
'Signal total test events:', total_test_signal_events









    



Bck total test events: 55653






    Out[23]:





('Signal total test events:',
 {11114001: 30533,
  11296013: 8630,
  11874042: 1950,
  12103035: 12682,
  13246001: 18627,
  13264021: 12537})

Matrixnet training

Reload models



In [24]:

    
import cPickle
if os.path.exists('models/hlt2_bodyn.pkl'):
    with open('models/hlt2_bodyn.pkl', 'r') as file_mn:
        estimators = cPickle.load(file_mn)
if os.path.exists('models/hlt2_bodyn_borders.pkl'):
    with open('models/hlt2_bodyn_borders.pkl', 'r') as file_mn:
        estimators_border = cPickle.load(file_mn)



In [25]:

    
from rep_ef.estimators import MatrixNetSkyGridClassifier
from rep.metaml import ClassifiersFactory



In [26]:

    
estimators = ClassifiersFactory()
params = {'user_name': 'antares', 'connection':'skygrid',
          'iterations': 5000, 'sync': False}
estimators['base'] = MatrixNetSkyGridClassifier(train_features=variables_base, **params)
estimators['base reg'] = MatrixNetSkyGridClassifier(train_features=variables_base, 
                                                user_name='antares', connection='skygrid',
                                                iterations=2500, sync=False, regularization=0.02)
estimators['base + additional features'] = MatrixNetSkyGridClassifier(train_features=variables_base +
                                                                         variables_additional, 
                                                                      **params)
estimators['base without (pt, m)'] = MatrixNetSkyGridClassifier(train_features=variables + ['sumpt'], **params)
estimators['base without (m)'] = MatrixNetSkyGridClassifier(train_features=variables + ['sumpt', 'pt'], **params)
estimators['base without (sumpt)'] = MatrixNetSkyGridClassifier(train_features=variables + ['m', 'pt'], **params)
estimators['base without (sumpt, m)'] = MatrixNetSkyGridClassifier(train_features=variables + ['pt'], **params)



In [27]:

    
estimators.fit(train, train['signal'])









    



model base         was trained in 5.88 seconds
model base reg     was trained in 5.72 seconds
model base + additional features was trained in 6.79 seconds
model base without (pt, m) was trained in 5.17 seconds
model base without (m) was trained in 5.47 seconds
model base without (sumpt) was trained in 5.49 seconds
model base without (sumpt, m) was trained in 5.16 seconds
Totally spent 39.68 seconds on training






    Out[27]:





ClassifiersFactory([('base', MatrixNetSkyGridClassifier(auto_stop=None, baseline_feature=None,
              command_line_params=None, connection='skygrid',
              dump_filename=None, features_sample_rate_per_iteration=1.0,
              intervals=64, iterations=5000, max_features_per_iteration=6,
              regularization=0.01, sync=False,
              train_features=['n', 'mcor', 'chi2', 'eta', 'fdchi2', 'minpt', 'nlt16', 'ipchi2', 'n1trk', 'sumpt', 'pt', 'm'],
              training_fraction=0.5, user_name='antares')), ('base reg', MatrixNetSkyGridClassifier(auto_stop=None, baseline_feature=None,
              command_line_params=None, connection='skygrid',
              dump_filename=None, features_sample_rate_per_iteration=1.0,
              intervals=64, iterations=2500, max_features_per_iteration=6,
              regularization=0.02, sync=False,
              train_features=['n', 'mcor', 'chi2', 'eta', 'fdchi2', 'minpt', 'nlt16', 'ipchi2', 'n1trk', 'sumpt', 'pt', 'm'],
              training_fraction=0.5, user_name='antares')), ('base + additional features', MatrixNetSkyGridClassifier(auto_stop=None, baseline_feature=None,
              command_line_params=None, connection='skygrid',
              dump_filename=None, features_sample_rate_per_iteration=1.0,
              intervals=64, iterations=5000, max_features_per_iteration=6,
              regularization=0.01, sync=False,
              train_features=['n', 'mcor', 'chi2', 'eta', 'fdchi2', 'minpt', 'nlt16', 'ipchi2', 'n1trk', 'sumpt', 'pt', 'm', 'fdr', 'sumpt', 'sumipchi2'],
              training_fraction=0.5, user_name='antares')), ('base without (pt, m)', MatrixNetSkyGridClassifier(auto_stop=None, baseline_feature=None,
              command_line_params=None, connection='skygrid',
              dump_filename=None, features_sample_rate_per_iteration=1.0,
              intervals=64, iterations=5000, max_features_per_iteration=6,
              regularization=0.01, sync=False,
              train_features=['n', 'mcor', 'chi2', 'eta', 'fdchi2', 'minpt', 'nlt16', 'ipchi2', 'n1trk', 'sumpt'],
              training_fraction=0.5, user_name='antares')), ('base without (m)', MatrixNetSkyGridClassifier(auto_stop=None, baseline_feature=None,
              command_line_params=None, connection='skygrid',
              dump_filename=None, features_sample_rate_per_iteration=1.0,
              intervals=64, iterations=5000, max_features_per_iteration=6,
              regularization=0.01, sync=False,
              train_features=['n', 'mcor', 'chi2', 'eta', 'fdchi2', 'minpt', 'nlt16', 'ipchi2', 'n1trk', 'sumpt', 'pt'],
              training_fraction=0.5, user_name='antares')), ('base without (sumpt)', MatrixNetSkyGridClassifier(auto_stop=None, baseline_feature=None,
              command_line_params=None, connection='skygrid',
              dump_filename=None, features_sample_rate_per_iteration=1.0,
              intervals=64, iterations=5000, max_features_per_iteration=6,
              regularization=0.01, sync=False,
              train_features=['n', 'mcor', 'chi2', 'eta', 'fdchi2', 'minpt', 'nlt16', 'ipchi2', 'n1trk', 'm', 'pt'],
              training_fraction=0.5, user_name='antares')), ('base without (sumpt, m)', MatrixNetSkyGridClassifier(auto_stop=None, baseline_feature=None,
              command_line_params=None, connection='skygrid',
              dump_filename=None, features_sample_rate_per_iteration=1.0,
              intervals=64, iterations=5000, max_features_per_iteration=6,
              regularization=0.01, sync=False,
              train_features=['n', 'mcor', 'chi2', 'eta', 'fdchi2', 'minpt', 'nlt16', 'ipchi2', 'n1trk', 'pt'],
              training_fraction=0.5, user_name='antares'))])



In [28]:

    
with open('models/hlt2_bodyn.pkl', 'w') as file_mn:
    cPickle.dump(estimators, file_mn)



In [30]:

    
for key in estimators.keys():
    print key
    print estimators[key].get_feature_importances().sort('effect')









    



base
          effect  efficiency  information
pt      0.154765    0.050105     0.936843
n       0.291004    0.453911     0.194448
n1trk   0.341499    0.424466     0.244019
ipchi2  0.354181    0.118516     0.906412
chi2    0.381149    0.117093     0.987278
minpt   0.389755    0.127029     0.930606
nlt16   0.402248    1.000000     0.122003
m       0.475211    0.145232     0.992429
eta     0.547440    0.166040     1.000000
sumpt   0.704083    0.226970     0.940874
fdchi2  0.877475    0.293967     0.905340
mcor    1.000000    0.308240     0.983982
base reg
          effect  efficiency  information
pt      0.100465    0.029906     0.936843
n       0.262449    0.376397     0.194448
ipchi2  0.262964    0.080905     0.906412
minpt   0.288414    0.086428     0.930606
chi2    0.325234    0.091868     0.987278
n1trk   0.343444    0.392500     0.244019
m       0.376614    0.105829     0.992429
eta     0.413997    0.115453     1.000000
nlt16   0.437486    1.000000     0.122003
sumpt   0.709078    0.210169     0.940874
fdchi2  0.844570    0.260154     0.905340
mcor    1.000000    0.283413     0.983982
base + additional features
             effect  efficiency  information
pt         0.141168    0.040792     0.936843
ipchi2     0.309080    0.092310     0.906412
n          0.318202    0.443001     0.194448
n1trk      0.393383    0.436413     0.244019
minpt      0.402583    0.117110     0.930606
sumipchi2  0.423588    0.126608     0.905708
chi2       0.443984    0.121740     0.987278
nlt16      0.450677    1.000000     0.122003
m          0.492205    0.134262     0.992429
fdchi2     0.570584    0.170613     0.905340
eta        0.621814    0.168332     1.000000
fdr        0.742813    0.222251     0.904777
sumpt      0.846241    0.243483     0.940874
mcor       1.000000    0.275118     0.983982
base without (pt, m)
          effect  efficiency  information
n       0.253953    0.428031     0.194448
n1trk   0.343478    0.461320     0.244019
ipchi2  0.347039    0.125481     0.906412
nlt16   0.372258    1.000000     0.122003
chi2    0.374466    0.124308     0.987278
minpt   0.386831    0.136233     0.930606
eta     0.559773    0.183459     1.000000
sumpt   0.794575    0.276777     0.940874
fdchi2  0.829651    0.300338     0.905340
mcor    1.000000    0.333073     0.983982
base without (m)
          effect  efficiency  information
pt      0.192352    0.067804     0.936843
n       0.252561    0.428931     0.194448
n1trk   0.333030    0.450699     0.244019
ipchi2  0.336214    0.122494     0.906412
chi2    0.368264    0.123182     0.987278
nlt16   0.369441    1.000000     0.122003
minpt   0.380963    0.135190     0.930606
eta     0.548780    0.181228     1.000000
sumpt   0.653494    0.229370     0.940874
fdchi2  0.817259    0.298108     0.905340
mcor    1.000000    0.335613     0.983982
base without (sumpt)
          effect  efficiency  information
n       0.291434    0.459231     0.194448
ipchi2  0.348510    0.117811     0.906412
n1trk   0.363606    0.456567     0.244019
chi2    0.384245    0.119252     0.987278
nlt16   0.398176    1.000000     0.122003
minpt   0.406356    0.133794     0.930606
m       0.516056    0.159328     0.992429
eta     0.555477    0.170201     1.000000
pt      0.704924    0.230553     0.936843
fdchi2  0.877626    0.297026     0.905340
mcor    1.000000    0.311393     0.983982
base without (sumpt, m)
          effect  efficiency  information
n       0.245004    0.426773     0.194448
n1trk   0.350370    0.486330     0.244019
nlt16   0.360199    1.000000     0.122003
chi2    0.362014    0.124198     0.987278
ipchi2  0.365005    0.136396     0.906412
minpt   0.384747    0.140035     0.930606
eta     0.549873    0.186248     1.000000
pt      0.658114    0.237937     0.936843
fdchi2  0.839088    0.313923     0.905340
mcor    1.000000    0.344224     0.983982



In [31]:

    
borders = {'n': [2.5, 3.5],
           'mcor': [2000, 3250.70996094, 4283.83984375, 5124.98535156, 6000, 7749.91992188],
           'chi2': [0.3, 1., 2.5, 5., 7.5, 10., 100.],
           'sumpt': [3500, 5000, 7500, 10e3, 20e3, 50e3],
           'eta': [2.5, 2.75, 3, 3.5, 4., 4.2],
           'fdchi2': [10, 200, 500, 1000, 10000, 100e3],
           'minpt': [600, 1000, 1500, 5000],
           'nlt16': [0.5],
           'ipchi2': [16, 62, 250, 1000],
           'n1trk': [0.5, 1.5, 2.5, 3.5]
           }

borders_chi2 = {'n': [2.5, 3.5],
           'mcor': [2000, 3250.70996094, 4283.83984375, 5124.98535156, 6000, 7749.91992188],
           'chi2': [0.3, 1., 2.5, 5., 7.5],
           'sumpt': [3500, 5000, 7500, 10e3, 20e3, 50e3],
           'eta': [2.5, 2.75, 3, 3.5, 4., 4.2],
           'fdchi2': [10, 200, 500, 1000, 10000, 100e3],
           'minpt': [600, 1000, 1500, 5000],
           'nlt16': [0.5],
           'ipchi2': [16, 62, 250, 1000],
           'n1trk': [0.5, 1.5, 2.5, 3.5]
           }

borders_chi2_eta = {'n': [2.5, 3.5],
           'mcor': [2000, 3250.70996094, 4283.83984375, 5124.98535156, 6000, 7749.91992188],
           'chi2': [0.3, 1., 2.5, 5., 7.5],
           'sumpt': [3500, 5000, 7500, 10e3, 20e3, 50e3],
           'eta': [2.75, 3, 3.5, 4., 4.2],
           'fdchi2': [10, 200, 500, 1000, 10000, 100e3],
           'minpt': [600, 1000, 1500, 5000],
           'nlt16': [0.5],
           'ipchi2': [16, 62, 250, 1000],
           'n1trk': [0.5, 1.5, 2.5, 3.5]
           }

borders_many = {'n': [2.5, 3.5],
           'mcor': [2000, 2500, 2674.76513672, 3613.61010742, 4000., 
                    4283.83984375, 4851.39501953, 5124.98535156, 6000, 7000., 8498.92480469],
           'chi2': [0.15, 0.5, 1., 1.66, 2.5, 3.3, 3.7, 5., 6.5, 8.3],
           'sumpt': [3000, 4000, 5000, 6000, 7500, 8500, 12e3, 23e3, 50e3],
           'eta': [2.5, 3, 3.25, 3.5, 3.75, 4., 4.5],
           'fdchi2': [10, 125, 350, 780, 1800, 5000, 100e3],
           'minpt': [450, 700, 950, 1300, 1800, 3500, 6600],
           'nlt16': [0.5],
           'ipchi2': [8, 25, 62, 150, 250, 460, 1000],
           'n1trk': [0.5, 1.5, 2.5, 3.5]
           }

borders_6_fix = {'n': [2.5, 3.5],
           'mcor': [2500., 2675,  3615, 4000., 4300, 4850, 6000, 7000, 8500],
           'chi2': [0.15, 0.5, 1., 1.66, 1.9, 3.3, 3.7, 5., 6.6, 8.3, 100],
           'sumpt': [3000, 4000, 5000, 6000, 7500, 8500, 12e3, 23e3, 50e3],
           'eta': [2.5, 3, 3.25, 3.5, 3.78, 4., 4.2, 4.5],
           'fdchi2': [10, 125, 350, 790, 1800, 5100, 100e3],
           'minpt': [460, 700, 950, 1260, 1830, 3420, 6650],
           'nlt16': [0.5],
           'ipchi2': [8, 26, 62, 150, 250, 460, 1000],
           'n1trk': [0.5, 1.5, 2.5, 3.5]
           }

borders_6_fix_high = {'n': [2.5, 3.5],
           'mcor': [2500.93017578, 2674.76513672, 3613.61010742, 4000.4699707, 4283.83984375, 4851.39501953,
                    5000., 6000., 7000.02490234, 8498.92480469],
           'chi2': [0.156691491604, 0.515842497349, 1.0365049839, 1.66650009155, 1.87674498558, 
                    3.33287501335, 3.70866012573, 4.99924993515, 6.66611480713, 8.3325548172, 100.],
           'sumpt': [3057.95996094, 3972.98486328, 4978.28515625, 6273.98535156, 8358.38476562, 
                     12693.5, 23393.3007812, 44177.3007812, 56187.5],
           'eta': [2.50017499924, 2.9353249073, 3.99941015244, 2.99988007545, 3.25001502037,
                   3.49966001511, 3.51357507706, 3.77819991112, 4.09203529358, 4.2, 4.49913978577],
           'fdchi2': [9.79110527039, 125.479499817, 341.268005371, 787.989013672, 1810.72497559, 5104.29980469, 10e3],
           'minpt': [464.580505371, 695.025512695, 945.834960938, 1264.60998535, 1831.51501465, 3422.45507812, 6646.77001953],
           'nlt16': [0.5],
           'ipchi2': [8.61159992218, 25.8182506561, 62.0043487549, 151.548995972, 457.808502197, 10e3],
           'n1trk': [0.5, 1.5, 2.5, 3.5]
           }



In [32]:

    
special_b = {
'n': [2.5, 3.5],
'mcor': [2000,3000,4000,5000,7500],  # I want to remove splits too close the the B mass as I was looking in simulation and this could distort the mass peak (possibly)
'chi2': [1,2.5,5,7.5,10,100],  # I also propose we add a cut to the pre-selection of chi2 < 1000.  I don't want to put in splits at too small values here b/c these type of inputs are never modeled quite right in the simulation (they always look a bit more smeared in data).
'sumpt': [3000,4000,5000,6000,7500,9000,12e3,23e3,50e3],  # I am happy with the MN splits here (these are almost "as is" from modify-6)
'eta': [2.5,3,3.75,4.25,4.5], # Close to MN.  
'fdchi2': [33,125,350,780,1800,5000,10000],  # I want to make the biggest split 10e3 because in the simulated events there is pretty much only BKGD above 40e3 but we don't want the BDT to learn to kill these as new particles would live here.   Otherwise I took the MN splits and modified the first one (the first one is 5sigma now).
'minpt': [350,500,750,1500,3000,5000],   # let's make 500 the 2nd split so that this lines up with the HLT1 SVs.
'nlt16': [0.5],
'ipchi2': [8,26,62,150,500,1000],  # I also propose we add a cut of IP chi2 < 5000 as it's all background out there.  
'n1trk': [0.5, 1.5, 2.5, 3.5]
}



In [33]:

    
estimators_border = ClassifiersFactory()
params = {'user_name': 'antares', 'connection':'skygrid',
          'iterations': 5000, 'sync': True, 
          'train_features': variables + ['sumpt']}
estimators_border['BBDT'] = MatrixNetSkyGridClassifier(intervals=borders, **params)
estimators_border['BBDT chi2'] = MatrixNetSkyGridClassifier(intervals=borders_chi2, **params)
estimators_border['BBDT chi2, eta'] = MatrixNetSkyGridClassifier(intervals=borders_chi2_eta, **params)
estimators_border['BBDT 6-bins'] = MatrixNetSkyGridClassifier(intervals=6, **params)
estimators_border['BBDT modify 6-bins'] = MatrixNetSkyGridClassifier(intervals=borders_many,  **params)
estimators_border['BBDT modify 6-bins strong'] = MatrixNetSkyGridClassifier(intervals=borders_6_fix,  **params)
estimators_border['BBDT modify 6-bins small'] = MatrixNetSkyGridClassifier(intervals=borders_6_fix_high,  **params)
estimators_border['BBDT by hand'] = MatrixNetSkyGridClassifier(intervals=special_b,  **params)



In [ ]:

    
estimators_border.fit(train, train['signal'])



In [ ]:

    
with open('models/hlt2_bodyn_borders.pkl', 'w') as file_mn:
    cPickle.dump(estimators_border, file_mn)

Calculate thresholds on classifiers



In [74]:

    
thresholds = dict()
RATE = [2500., 4000.]
events_pass = dict()
for name, cl in estimators.items():
    prob = cl.predict_proba(ds_test_bck)
    thr, result = calculate_thresholds(ds_test_bck, prob, total_test_bck_events, rates=RATE)
    for rate, val in result.items():
        events_pass['{}-{}'.format(rate, name)] = val[1]
    thresholds[name] = thr
    print name, result









    



base {4000.0: (0.99378738284236912, 222, 0.00398900328823244), 2500.0: (0.9961547461235819, 139, 0.002497619175965357)}
base reg {4000.0: (0.99319452414371645, 222, 0.00398900328823244), 2500.0: (0.99554436161910365, 139, 0.002497619175965357)}
base + additional features {4000.0: (0.99368087833143448, 222, 0.00398900328823244), 2500.0: (0.99606688681376176, 139, 0.002497619175965357)}
base without (pt, m) {4000.0: (0.99410103269102013, 222, 0.00398900328823244), 2500.0: (0.99610789680421596, 139, 0.002497619175965357)}
base without (m) {4000.0: (0.99413627210709454, 222, 0.00398900328823244), 2500.0: (0.99614607985176895, 139, 0.002497619175965357)}
base without (sumpt) {4000.0: (0.99400961806243882, 222, 0.00398900328823244), 2500.0: (0.99615591271296366, 139, 0.002497619175965357)}
base without (sumpt, m) {4000.0: (0.99434633385661786, 222, 0.00398900328823244), 2500.0: (0.99626445119293383, 139, 0.002497619175965357)}



In [75]:

    
for name, cl in estimators_border.items():
    prob = cl.predict_proba(ds_test_bck)
    thr, result = calculate_thresholds(ds_test_bck, prob, total_test_bck_events, rates=RATE)
    for rate, val in result.items():
        events_pass['{}-{}'.format(rate, name)] = val[1]
    thresholds[name] = thr
    print name, result









    



BBDT {4000.0: (0.9948623553274516, 222, 0.00398900328823244), 2500.0: (0.99644124532301714, 139, 0.002497619175965357)}
BBDT chi2 {4000.0: (0.99471695127146487, 222, 0.00398900328823244), 2500.0: (0.99642261356810102, 139, 0.002497619175965357)}
BBDT chi2, eta {4000.0: (0.99482326983877067, 222, 0.00398900328823244), 2500.0: (0.99653381237132344, 139, 0.002497619175965357)}
BBDT 6-bins {4000.0: (0.99402680780929342, 222, 0.00398900328823244), 2500.0: (0.99595672446060457, 139, 0.002497619175965357)}
BBDT modify 6-bins {4000.0: (0.99438273129820987, 222, 0.00398900328823244), 2500.0: (0.99597285602826535, 139, 0.002497619175965357)}
BBDT modify 6-bins strong {4000.0: (0.99420128981760203, 222, 0.00398900328823244), 2500.0: (0.99608596752421563, 139, 0.002497619175965357)}
BBDT modify 6-bins small {4000.0: (0.99412496117804794, 222, 0.00398900328823244), 2500.0: (0.99615391829393951, 139, 0.002497619175965357)}
BBDT by hand {4000.0: (0.99441421391844975, 222, 0.00398900328823244), 2500.0: (0.99615994806063501, 139, 0.002497619175965357)}

Final efficiencies for each mode



In [76]:

    
train_modes_eff, statistic = result_statistic(estimators, sig_train_modes_names, 
                                              test[test['signal'] == 1],
                                              thresholds, RATE, total_test_signal_events)



In [77]:

    
from rep.plotting import BarComparePlot
for r in RATE:
    new_dict = [] 
    for key, val in train_modes_eff.iteritems():
        if r == key[1]:
            new_dict.append((key, val))
    new_dict = dict(new_dict)        
    BarComparePlot(new_dict).plot(new_plot=True, figsize=(24, 8), ylabel='efficiency', fontsize=22)
    lgd = legend(bbox_to_anchor=(0.5, 1.4), loc='upper center', ncol=2, fontsize=22)
# plt.savefig('hlt2.pdf' , format='pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')



In [78]:

    
plots = OrderedDict()
for key, value in estimators.items():
    plots[key] = plot_roc_events(value, test[test['signal'] == 1], test[test['signal'] == 0], key)









    



base AUC: 0.958564700873
base reg AUC: 0.957389178789
base + additional features AUC: 0.959527972865
base without (pt, m) AUC: 0.955959310671
base without (m) AUC: 0.955864634415
base without (sumpt) AUC: 0.957555024603
base without (sumpt, m) AUC: 0.955220974533



In [79]:

    
def plot_rocs(plots):
    from rep.plotting import FunctionsPlot
    FunctionsPlot(plots).plot(new_plot=True, xlim=(0.02, 0.06), ylim=(0.65, 0.85))
    plot([1. * events_pass['2500.0-base'] / statistic_length(ds_test_bck)['Events']] * 2, 
         [0., 1], 'b--', label='rate: 2.5 kHz')
    plot([1. * events_pass['4000.0-base'] / statistic_length(ds_test_bck)['Events']] * 2, 
         [0., 1], 'g--', label='rate: 4. kHz')
    lgd = legend(loc='upper center', fontsize=16, bbox_to_anchor=(0.5, 1.4), ncol=3)
    xlabel('FRP, background events efficiency', fontsize=20)
    ylabel('TPR, signal events efficiency', fontsize=20)



In [80]:

    
plot_rocs(plots)

efficiencies for each mode per iterations

Generate eff metrics



In [ ]:

    
mode_metrics = dict()
for mode in sig_train_modes_names:
    mode_metrics[mode] = dict()
    for rate in [2500.]:
        mode_metrics[mode][rate] = generate_topo_metric(ds_test_bck, ds_test_signal[ds_test_signal['mode'] == mode], 
                                                        total_test_bck_events, total_test_signal_events[mode], rate)



In [ ]:

    
def generate_select_mode(mode):
    def select_mode(pd):
        result = numpy.zeros(len(pd), dtype=bool)
        result[numpy.array(pd['mode']) == mode] = True
        result[numpy.array(pd['signal']) == 0] = True
        return result
    return select_mode



In [ ]:

    
from rep.report import ClassificationReport
from rep.data import LabeledDataStorage
report = ClassificationReport(estimators, LabeledDataStorage(test, test['signal']))



In [ ]:

    
staged_eff_plots = []
for mode in sig_train_modes_names:
    select_mode = generate_select_mode(mode)
    staged_eff_plots.append(report.learning_curve(mode_metrics[mode][2500.], mask=select_mode, steps=10, 
                                                  metric_label='mode {}, rate {}, eff'.format(mode, 2500.) ))



In [81]:

    
for elem in staged_eff_plots:
    elem.plot()
    show()

Best model; all channels efficiencies



In [82]:

    
estimators_bbdt = estimators_border.copy()
estimators_bbdt['base'] = estimators['base without (pt, m)']



In [83]:

    
thresholds = dict()
RATE = [2500., 4000.]
events_pass = dict()
for name, cl in estimators_bbdt.items():
    prob = cl.predict_proba(ds_test_bck)
    thr, result = calculate_thresholds(ds_test_bck, prob, total_test_bck_events, rates=RATE)
    for rate, val in result.items():
        events_pass['{}-{}'.format(rate, name)] = val[1]
    thresholds[name] = thr
    print name, result









    



BBDT {4000.0: (0.9948623553274516, 222, 0.00398900328823244), 2500.0: (0.99644124532301714, 139, 0.002497619175965357)}
BBDT chi2 {4000.0: (0.99471695127146487, 222, 0.00398900328823244), 2500.0: (0.99642261356810102, 139, 0.002497619175965357)}
BBDT chi2, eta {4000.0: (0.99482326983877067, 222, 0.00398900328823244), 2500.0: (0.99653381237132344, 139, 0.002497619175965357)}
BBDT 6-bins {4000.0: (0.99402680780929342, 222, 0.00398900328823244), 2500.0: (0.99595672446060457, 139, 0.002497619175965357)}
BBDT modify 6-bins {4000.0: (0.99438273129820987, 222, 0.00398900328823244), 2500.0: (0.99597285602826535, 139, 0.002497619175965357)}
BBDT modify 6-bins strong {4000.0: (0.99420128981760203, 222, 0.00398900328823244), 2500.0: (0.99608596752421563, 139, 0.002497619175965357)}
BBDT modify 6-bins small {4000.0: (0.99412496117804794, 222, 0.00398900328823244), 2500.0: (0.99615391829393951, 139, 0.002497619175965357)}
BBDT by hand {4000.0: (0.99441421391844975, 222, 0.00398900328823244), 2500.0: (0.99615994806063501, 139, 0.002497619175965357)}
base {4000.0: (0.99410103269102013, 222, 0.00398900328823244), 2500.0: (0.99610789680421596, 139, 0.002497619175965357)}

trainign channels



In [84]:

    
train_modes_eff, statistic = result_statistic(estimators_bbdt, sig_train_modes_names, 
                                              test[test['signal'] == 1],
                                              thresholds, RATE, total_test_signal_events)



In [85]:

    
from rep.plotting import BarComparePlot
for r in RATE:
    new_dict = [] 
    for key, val in train_modes_eff.iteritems():
        if r == key[1]:
            new_dict.append((key, val))
    new_dict = dict(new_dict)        
    BarComparePlot(new_dict).plot(new_plot=True, figsize=(24, 8), ylabel='efficiency', fontsize=22)
    lgd = legend(bbox_to_anchor=(0.5, 1.4), loc='upper center', ncol=2, fontsize=22)



In [86]:

    
plots = OrderedDict()
for key, value in estimators_border.items():
    plots[key] = plot_roc_events(value, test[test['signal'] == 1], test[test['signal'] == 0], key)
plot_rocs(plots)
title('ROC for events (training decays)', fontsize=20)









    



BBDT AUC: 0.949483590883
BBDT chi2 AUC: 0.949510455309
BBDT chi2, eta AUC: 0.949325554135
BBDT 6-bins AUC: 0.948787065058
BBDT modify 6-bins AUC: 0.952867875138
BBDT modify 6-bins strong AUC: 0.951526439973
BBDT modify 6-bins small AUC: 0.951137447779
BBDT by hand AUC: 0.952332233169






    Out[86]:





<matplotlib.text.Text at 0x95041ad0>

all channels



In [87]:

    
from collections import defaultdict
all_channels = []
efficiencies = defaultdict(OrderedDict)
for mode in empty_events.keys():
    if mode in set(sig_train_modes_names) or mode == bck_train_mode_name:
        continue
    df = pandas.read_csv(os.path.join(folder , 'mod_{}.csv'.format(mode)), sep='\t')
    if len(df) <= 0:
        continue
    total_events = statistic_length(df)['Events'] + empty_events[mode]
    df = df[(df['pass_nbody'] == 1) & (df['mcor'] <= 10e3)]
    passed_events = statistic_length(df)['Events']
    all_channels.append(df)
    for name, cl in estimators_bbdt.items():
        prob = cl.predict_proba(df)
        for rate, thresh in thresholds[name].items():
            eff = final_eff_for_mode(df, prob, total_events, thresh)
            latex_name = '$' + Samples[str(mode)]['root'].replace("#", "\\") + '$'
            efficiencies[(name, rate)][latex_name] = eff



In [88]:

    
for key, val in efficiencies.items():
    for key_2, val_2 in val.items():
        if val_2 <= 0.1:
            efficiencies[key].pop(key_2)
            
from rep.plotting import BarComparePlot
for r in RATE:
    new_dict = [] 
    for key, val in efficiencies.iteritems():
        if r == key[1]:
            new_dict.append((key, val))
    new_dict = dict(new_dict)        
    BarComparePlot(new_dict).plot(new_plot=True, figsize=(24, 8), ylabel='efficiency', fontsize=22)
    lgd = legend(bbox_to_anchor=(0.5, 1.4), loc='upper center', ncol=2, fontsize=22)



In [89]:

    
plots_all = OrderedDict()
for key, value in estimators.items():
    plots_all[key] = plot_roc_events(value, pandas.concat([test[test['signal'] == 1]] + all_channels), 
                                     test[test['signal'] == 0], key)









    



base AUC: 0.89499463248
base reg AUC: 0.895685850847
base + additional features AUC: 0.899267979933
base without (pt, m) AUC: 0.897475380098
base without (m) AUC: 0.897495107093
base without (sumpt) AUC: 0.894163870172
base without (sumpt, m) AUC: 0.89662533543



In [90]:

    
plot_rocs(plots_all)
title('ROC for events (all decays)', fontsize=20)
ylim(0.5, 0.65)









    Out[90]:





(0.5, 0.65)



In [91]:

    
plots_all = OrderedDict()
for key, value in estimators_bbdt.items():
    plots_all[key] = plot_roc_events(value, pandas.concat([test[test['signal'] == 1]] + all_channels), 
                                     test[test['signal'] == 0], key)









    



BBDT AUC: 0.892968227623
BBDT chi2 AUC: 0.892993049853
BBDT chi2, eta AUC: 0.892547680488
BBDT 6-bins AUC: 0.893047340693
BBDT modify 6-bins AUC: 0.896150179901
BBDT modify 6-bins strong AUC: 0.896187352552
BBDT modify 6-bins small AUC: 0.895816788159
BBDT by hand AUC: 0.896260843657
base AUC: 0.897475380098



In [92]:

    
plot_rocs(plots_all)
ylim(0.5, 0.65)
title('ROC for events (all decays)', fontsize=20)









    Out[92]:





<matplotlib.text.Text at 0x1adf1990>

Experiments with x



In [ ]:

    
params = {'user_name': 'antares', 'connection':'skygrid',
          'iterations': 5000, 'sync': False, 
          'train_features': variables + ['sumpt']}
estimators_x = ClassifiersFactory()
for x in range(4, 8):
    estimators_x['BBDT-{}'.format(x)] = MatrixNetSkyGridClassifier(intervals=x, **params)

estimators_x.fit(train, train['signal'])



In [ ]:

    
estimators_x['base'] = estimators['base without (pt, m)']



In [93]:

    
thresholds_x = dict()
for name, cl in estimators_x.items():
    prob = cl.predict_proba(ds_test_bck)
    thr, result = calculate_thresholds(ds_test_bck, prob, total_test_bck_events, rates=[2500.])
    thresholds_x[name] = thr
    print name, result









    



BBDT-4 {2500.0: (0.99626207324863569, 139, 0.002497619175965357)}
BBDT-5 {2500.0: (0.99628784046073426, 139, 0.002497619175965357)}
BBDT-6 {2500.0: (0.99595672446060457, 139, 0.002497619175965357)}
BBDT-7 {2500.0: (0.99589233387417819, 139, 0.002497619175965357)}
base {2500.0: (0.99610789680421596, 139, 0.002497619175965357)}



In [94]:

    
train_modes_eff_x, _ = result_statistic(estimators_x, sig_train_modes_names, 
                                        test[test['signal'] == 1],
                                        thresholds_x, [2500.], total_test_signal_events)



In [95]:

    
from rep.plotting import BarComparePlot
BarComparePlot(train_modes_eff_x).plot(new_plot=True, figsize=(18, 6), ylabel='efficiency')
legend(bbox_to_anchor=(0.5, 1.2), loc='upper center', ncol=3)









    Out[95]:





<matplotlib.legend.Legend at 0x13edf850>

reg + iter experiments



In [ ]:

    
params = {'user_name': 'antares', 'connection':'skygrid',
          'sync': False, 
          'train_features': variables + ['sumpt']}
estimators_w_i = ClassifiersFactory()
for w, i in [(0.01, 5000), (0.05, 1000), (0.1, 500)]:
    estimators_w_i['BBDT-{}-{}'.format(w, i)] = MatrixNetSkyGridClassifier(iterations=i, regularization=w, **params)

estimators_w_i.fit(train, train['signal'])



In [96]:

    
thresholds_w_i = dict()
for name, cl in estimators_w_i.items():
    prob = cl.predict_proba(ds_test_bck)
    thr, result = calculate_thresholds(ds_test_bck, prob, total_test_bck_events, rates=[2500.])
    thresholds_w_i[name] = thr
    print name, result









    



BBDT-0.01-5000 {2500.0: (0.99610789680421596, 139, 0.002497619175965357)}
BBDT-0.05-1000 {2500.0: (0.99628077194020204, 139, 0.002497619175965357)}
BBDT-0.1-500 {2500.0: (0.99613470573056284, 139, 0.002497619175965357)}



In [97]:

    
train_modes_eff_w_i, _ = result_statistic(estimators_w_i, sig_train_modes_names, 
                                        test[test['signal'] == 1],
                                        thresholds_w_i, [2500.], total_test_signal_events)
from rep.plotting import BarComparePlot
BarComparePlot(train_modes_eff_w_i).plot(new_plot=True, figsize=(18, 6), ylabel='efficiency')
legend(bbox_to_anchor=(0.5, 1.2), loc='upper center', ncol=3)









    Out[97]:





<matplotlib.legend.Legend at 0x9508db50>



In [98]:

    
plots_all = OrderedDict()
for key, value in estimators_w_i.items():
    plots_all[key] = plot_roc_events(value, pandas.concat([test[test['signal'] == 1]] + all_channels), 
                                     test[test['signal'] == 0], key)









    



BBDT-0.01-5000 AUC: 0.897475380098
BBDT-0.05-1000 AUC: 0.897212857614
BBDT-0.1-500 AUC: 0.8974984257



In [99]:

    
plot_rocs(plots_all)
title('ROC for events (all decays)', fontsize=20)
ylim(0.5, 0.65)









    Out[99]:





(0.5, 0.65)

BEST borders save



In [ ]:

    
with open("bbdt_run2/hlt2_modify.mx", "w") as f:
    f.write(estimators_border['BBDT modify 6-bins'].formula_mx)

with open("bbdt_run2/hlt2_fix6.mx", "w") as f:
    f.write(estimators_border['BBDT modify 6-bins small'].formula_mx)

with open("bbdt_run2/hlt2_border_base.mx", "w") as f:
    f.write(estimators_border['BBDT'].formula_mx)

with open("bbdt_run2/hlt2_fix6-strong.mx", "w") as f:
    f.write(estimators_border['BBDT modify 6-bins strong'].formula_mx)
    
with open("bbdt_run2/hlt2_by_hand.mx", "w") as f:
    f.write(estimators_border['BBDT by hand'].formula_mx)



In [ ]:

    
result_thresholds = {'hlt2_modify.mx': thresholds['BBDT modify 6-bins'][2500.],
                    'hlt2_fix6.mx': thresholds['BBDT modify 6-bins small'][2500.],
                    'hlt2_border_base.mx': thresholds['BBDT'][2500.],
                    'hlt2_fix6-strong.mx': thresholds['BBDT modify 6-bins strong'][2500.],
                    'hlt2_by_hand.mx': thresholds['BBDT by hand'][2500.]}



In [ ]:

    
with open('models/bbdt_thresholds.pkl', 'w') as f:
    cPickle.dump(result_thresholds, f)

	unique	mode	event_number	sv_number	pass_2body	pass_nbody	signal	sumpt	m	mcor	...	nmu	n	fdchi2	maxtchi2	ngood	nmu1	mupt	n1trk	sig	idx
1	11114001_0	11114001	0	1	1	1	1	6912.10	2806.110	4168.77	...	1	2	9653.910	1.20832	2	1	2972.750	2	1	1
2	11114001_0	11114001	0	2	1	1	1	3731.68	728.895	4429.24	...	2	2	7424.620	1.58733	2	2	2972.750	1	1	2
3	11114001_0	11114001	0	3	1	1	1	4698.27	1249.280	2144.50	...	1	2	1382.720	1.58733	2	1	758.924	1	1	3
5	11114001_1	11114001	1	1	1	1	1	4003.36	3530.930	4651.29	...	1	2	1102.850	1.20808	2	1	2611.970	2	1	1
9	11114001_1	11114001	1	5	1	1	1	2145.67	1520.500	2266.15	...	1	2	868.049	1.35907	2	1	1137.010	2	1	5