In [1]:
    
%pylab inline
    
    
In [2]:
    
%run additional.ipynb
    
In [3]:
    
pandas.set_option('display.max_colwidth', 120)
    
In [5]:
    
PROFILE = 'ssh-py2'  # IPython parallel profile used by FoldingClassifier below
    
Preselections have already been applied when preparing these datasets:
In [6]:
    
bck_train_mode_name = 30000000
sig_train_modes_names = list(set(empty_events.keys()) - {bck_train_mode_name})
sig_train_files = ['mod_{}.csv'.format(name) for name in sig_train_modes_names]
bck_train_files = 'mod_30000000.csv'
folder = "datasets/prepared_hlt_body/"
    
In [7]:
    
# concatenate all signal modes into a single file (done once, then reused)
if not os.path.exists(folder + 'signal_hlt1.csv'):
    concat_files(folder, sig_train_files, os.path.join(folder, 'signal_hlt1.csv'))
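
concat_files comes from additional.ipynb. A minimal sketch of what it is assumed to do here (append the per-mode tab-separated files into a single one), for orientation only; the real implementation may differ:

def concat_files_sketch(folder, files, output_path):
    # assumption: read each tab-separated per-mode file and append them,
    # writing a single file with one header line
    frames = [pandas.read_csv(os.path.join(folder, name), sep='\t') for name in files]
    pandas.concat(frames, ignore_index=True).to_csv(output_path, sep='\t', index=False)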
    
In [8]:
    
signal_data = pandas.read_csv(os.path.join(folder, 'signal_hlt1.csv'), sep='\t')
bck_data = pandas.read_csv(os.path.join(folder, bck_train_files), sep='\t')
    
In [9]:
    
signal_data.columns
    
    Out[9]:
In [10]:
    
print 'Signal', statistic_length(signal_data)
print 'Bck', statistic_length(bck_data)
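
statistic_length is defined in additional.ipynb. Judging from how it is used, it returns a dict with at least an 'Events' key; a rough sketch under that assumption (the event-identifier column name is hypothetical):

def statistic_length_sketch(df, event_column='unique'):
    # assumption: count candidates (rows) and distinct events;
    # only the 'Events' entry is relied on later in this notebook
    return {'Candidates': len(df), 'Events': df[event_column].nunique()}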
    
    
In [11]:
    
# total numbers of events per mode, including 'empty' events that left no candidates in the files
total_bck_events = statistic_length(bck_data)['Events'] + empty_events[bck_train_mode_name]
total_signal_events_by_mode = dict()
for mode in sig_train_modes_names:
    total_signal_events_by_mode[mode] = statistic_length(signal_data[signal_data['mode'] == mode])['Events'] + empty_events[mode]
    
In [12]:
    
print 'Bck:', total_bck_events
'Signal:', total_signal_events_by_mode
    
    
    Out[12]:
In [13]:
    
variables_base = ["mcor", "chi2", "pt", "fdchi2", "minpt", "nlt16"]
variables_mcor = ["chi2", "pt", "fdchi2", "minpt", "nlt16"]
variables_additional = ["m", "fdr", "sumpt", "sumipchi2", "eta"]
variables_new = ['chi2', 'minpt', 'sumpt', 'fdchi2', 'nlt16']
variables_new_minpt = ['chi2', 'sumpt', 'fdchi2', 'nlt16']
    
In [14]:
    
# HLT1 two-body preselection: keep only candidates that pass it
signal_data = signal_data[signal_data['pass_2body'] == 1]
bck_data = bck_data[bck_data['pass_2body'] == 1]
    
In [15]:
    
print 'Signal', statistic_length(signal_data)
print 'Bck', statistic_length(bck_data)
    
    
In [16]:
    
total_signal_events_by_mode_presel = dict()
for mode in sig_train_modes_names:
    total_signal_events_by_mode_presel[mode] = statistic_length(signal_data[signal_data['mode'] == mode])['Events']
total_bck_events_presel = statistic_length(bck_data)['Events']
    
In [17]:
    
print 'Bck:', total_bck_events_presel
'Signal:', total_signal_events_by_mode_presel
    
    
    Out[17]:
In [18]:
    
signal_data.head()
    
    Out[18]:
In [19]:
    
ds_train_signal, ds_train_bck, ds_test_signal, ds_test_bck = prepare_data(signal_data, bck_data, 'unique')
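
prepare_data also lives in additional.ipynb; it returns train/test halves for signal and background. A rough sketch, assuming a 50/50 random split (the 'unique' argument presumably requests splitting by unique event rather than by candidate):

def prepare_data_sketch(signal_df, bck_df, mode='unique'):
    # assumption: an even random split into train and test parts
    def split_half(df):
        order = numpy.random.RandomState(42).permutation(len(df))
        half = len(df) // 2
        return df.iloc[order[:half]], df.iloc[order[half:]]
    sig_train, sig_test = split_half(signal_df)
    bck_train, bck_test = split_half(bck_df)
    return sig_train, bck_train, sig_test, bck_test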
    
In [20]:
    
print 'Signal', statistic_length(ds_train_signal)
print 'Bck', statistic_length(ds_train_bck)
    
    
In [21]:
    
train = pandas.concat([ds_train_bck, ds_train_signal])
    
In [22]:
    
print 'Signal', statistic_length(ds_test_signal)
print 'Bck', statistic_length(ds_test_bck)
    
    
In [23]:
    
test = pandas.concat([ds_test_bck, ds_test_signal])
    
In [24]:
    
# events that failed the preselection were split evenly between train and test,
# so half of them are added back to the test totals used for efficiency normalisation
total_test_bck_events = (total_bck_events - total_bck_events_presel) // 2 + statistic_length(ds_test_bck)['Events']
total_test_signal_events = dict()
for mode in sig_train_modes_names:
    total_not_passed_signal = total_signal_events_by_mode[mode] - total_signal_events_by_mode_presel[mode]
    total_test_signal_events[mode] = total_not_passed_signal // 2 + \
        statistic_length(ds_test_signal[ds_test_signal['mode'] == mode])['Events']
    
In [25]:
    
print 'Bck total test events:', total_test_bck_events
'Signal total test events:', total_test_signal_events
    
    
    Out[25]:
In [26]:
    
# cut on the corrected mass: keep candidates with mcor <= 10e3
train_cut = train[train['mcor'] <= 10e3]
    
In [27]:
    
for var in variables_base + variables_additional:
    hist(train[train.signal == 1][var].values, color='b', bins=60, histtype='step', normed=True)
    hist(train[train.signal == 0][var].values, color='r', bins=60, histtype='step', normed=True)
    title(var)
    show()
    
    (figure output: signal (blue) vs background (red) distributions, one figure per variable)
In [28]:
    
import cPickle
# load previously trained models if present (binary mode for pickle files)
if os.path.exists('models/hlt1_body2.pkl'):
    with open('models/hlt1_body2.pkl', 'rb') as file_mn:
        estimators = cPickle.load(file_mn)
else:
    estimators = {}
    
In [29]:
    
from rep_ef.estimators import MatrixNetClassifier
# common MatrixNet settings: service connection and number of boosting iterations
params = {'connection_auth': 'AUTH_HEADERS', 'connection': 'skygrid', 'iterations': 5000, 'sync': False}
    
In [30]:
    
estimators['MN: mcor cut, mcor var'] = MatrixNetClassifier(train_features=variables_base, **params)
estimators['MN: mcor cut, mcor var'].fit(train_cut, train_cut['signal'])
    
    Out[30]:
In [31]:
    
estimators['MN: mcor cut, mcor var'].get_feature_importances().sort('effect')
    
    
    Out[31]:
In [34]:
    
estimators['MN'] = MatrixNetClassifier(train_features=variables_mcor, **params)
estimators['MN'].fit(train, train['signal'])
    
    Out[34]:
In [35]:
    
estimators['MN'].get_feature_importances().sort('effect')
    
    
    Out[35]:
In [36]:
    
estimators['additional features'] = MatrixNetClassifier(train_features=variables_base + variables_additional, **params)
estimators['additional features'].fit(train_cut, train_cut['signal'])
    
    Out[36]:
In [37]:
    
estimators['additional features'].get_feature_importances().sort('effect')
    
    
    Out[37]:
In [38]:
    
estimators['MN: pt->sumpt'] = MatrixNetClassifier(train_features=variables_new, **params)
estimators['MN: pt->sumpt'].fit(train, train['signal'])
    
    Out[38]:
In [39]:
    
estimators['MN: pt->sumpt'].get_feature_importances().sort('effect')
    
    
    Out[39]:
In [40]:
    
estimators['MN: pt->sumpt, remove minpt'] = MatrixNetClassifier(train_features=variables_new_minpt, **params)
estimators['MN: pt->sumpt, remove minpt'].fit(train, train['signal'])
    
    Out[40]:
In [41]:
    
estimators['MN: pt->sumpt, remove minpt'].get_feature_importances().sort('effect')
    
    
    Out[41]:
In [42]:
    
borders = {
    'chi2':   [1, 2.5, 5, 7.5, 10, 100],
    'sumpt':  [3000, 4000, 5000, 6000, 7500, 9000, 12e3, 23e3, 50e3],
    'fdchi2': [33, 125, 350, 780, 1800, 5000, 10000],
    'minpt':  [350, 500, 750, 1500, 3000, 5000],
    'nlt16':  [0.5]
}
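
The intervals passed to MatrixNetClassifier define the bonsai-BDT (BBDT) binning: each feature may only be cut at these fixed borders, so the trained formula can later be evaluated as a compact lookup table in the trigger. A quick illustration (not the MatrixNet internals) of how such borders discretize a feature:

# illustration only: bin one feature with the BBDT borders above
sumpt_bins = numpy.digitize(train['sumpt'].values, bins=borders['sumpt'])
print 'sumpt compressed into', len(borders['sumpt']) + 1, 'bins:', numpy.unique(sumpt_bins)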
    
In [43]:
    
borders_minpt = {
    'chi2':   [1, 2.5, 5, 7.5, 10, 100],
    'sumpt':  [3000, 4000, 5000, 6000, 7500, 9000, 12e3, 23e3, 50e3],
    'fdchi2': [33, 125, 350, 780, 1800, 5000, 10000],
    'nlt16':  [0.5]
}
    
In [44]:
    
estimators['MN BBDT: pt->sumpt'] = MatrixNetClassifier(train_features=variables_new, intervals=borders, **params)
estimators['MN BBDT: pt->sumpt'].fit(train, train['signal'])
    
    Out[44]:
In [45]:
    
estimators['MN BBDT: pt->sumpt'].get_feature_importances().sort('effect')
    
    
    Out[45]:
In [46]:
    
estimators['MN BBDT: pt->sumpt, remove minpt'] = MatrixNetClassifier(train_features=variables_new_minpt, 
                                                                     intervals=borders_minpt, **params)
estimators['MN BBDT: pt->sumpt, remove minpt'].fit(train, train['signal'])
    
    Out[46]:
In [47]:
    
estimators['MN BBDT: pt->sumpt, remove minpt'].get_feature_importances().sort('effect')
    
    
    Out[47]:
In [48]:
    
from rep.metaml import FoldingClassifier
from rep.estimators import SklearnClassifier
from sklearn.ensemble import RandomForestClassifier
    
In [50]:
    
# folding random forest used below to rank candidates within an event (out-of-fold predictions avoid bias)
forest_base_partial = SklearnClassifier(RandomForestClassifier(n_estimators=300, min_samples_leaf=500, max_depth=7,
                                                               max_features=4))
forest_folding_top = FoldingClassifier(base_estimator=forest_base_partial, random_state=11, 
                                       features=variables_new_minpt, parallel_profile=PROFILE)
forest_folding_top.fit(train, train['signal'])
    
    
    Out[50]:
In [51]:
    
for rank in range(1, 3):
    good_events = get_best_svr(train, forest_folding_top, rank)
    ef_good = MatrixNetClassifier(train_features=variables_base, **params)
    ef_good.fit(good_events, good_events['signal'])
    estimators['top-{} forest preselection, use mcor'.format(rank)] = ef_good
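
get_best_svr is defined in additional.ipynb. From its use here, it apparently keeps, for each event, only the top-`rank` candidates (secondary vertices) according to the folding classifier, so that the second-stage MatrixNet is trained on pre-selected candidates. A rough sketch under that assumption (the event-identifier column name is hypothetical):

def get_best_svr_sketch(df, folding_classifier, rank, event_column='unique'):
    # assumption: rank candidates inside each event by the folding-classifier score
    # and keep the `rank` best ones
    df = df.copy()
    df['rank_score'] = folding_classifier.predict_proba(df)[:, 1]
    position = df.groupby(event_column)['rank_score'].rank(ascending=False, method='first')
    return df[position <= rank]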
    
    
In [52]:
    
for rank in range(1, 3):
    good_events = get_best_svr(train, forest_folding_top, rank)
    ef_good = MatrixNetClassifier(train_features=variables_new_minpt, **params)
    ef_good.fit(good_events, good_events['signal'])
    estimators['top-{} forest preselection'.format(rank)] = ef_good
    
    
In [53]:
    
forest_base_partial = SklearnClassifier(RandomForestClassifier(n_estimators=300, min_samples_leaf=500, max_depth=7,
                                                               max_features=5))
forest_folding_top_minpt = FoldingClassifier(base_estimator=forest_base_partial, random_state=11, 
                                             features=variables_new, parallel_profile=PROFILE)
forest_folding_top_minpt.fit(train, train['signal'])
    
    Out[53]:
In [54]:
    
for rank in range(1, 3):
    good_events = get_best_svr(train, forest_folding_top_minpt, rank)
    ef_good = MatrixNetClassifier(train_features=variables_new, **params)
    ef_good.fit(good_events, good_events['signal'])
    estimators['top-{} forest preselection, minpt'.format(rank)] = ef_good
    
    
In [58]:
    
estimators['top-1 forest preselection, minpt'].get_feature_importances()
    
    Out[58]:
In [72]:
    
for rank in range(1, 3):
    good_events = get_best_svr(train, forest_folding_top_minpt, rank)
    ef_good = MatrixNetClassifier(train_features=variables_base, **params)
    ef_good.fit(good_events, good_events['signal'])
    estimators['top-{} forest preselection, minpt, base'.format(rank)] = ef_good
    
    
In [73]:
    
estimators['top-2 forest preselection, minpt, base'].get_feature_importances()
    
    Out[73]:
In [63]:
    
forest_base_partial = SklearnClassifier(RandomForestClassifier(n_estimators=300, min_samples_leaf=500, max_depth=7,
                                                               max_features=5))
forest_folding_top_mcor = FoldingClassifier(base_estimator=forest_base_partial, random_state=11, 
                                             features=variables_base, parallel_profile=PROFILE)
forest_folding_top_mcor.fit(train, train['signal'])
    
    Out[63]:
In [64]:
    
for rank in range(1, 3):
    good_events = get_best_svr(train, forest_folding_top_mcor, rank)
    ef_good = MatrixNetClassifier(train_features=variables_new, **params)
    ef_good.fit(good_events, good_events['signal'])
    estimators['top-{} forest preselection, minpt; mcor for forest'.format(rank)] = ef_good
    
    
In [65]:
    
for rank in range(1, 3):
    good_events = get_best_svr(train, forest_folding_top_mcor, rank)
    ef_good = MatrixNetClassifier(train_features=variables_base, **params)
    ef_good.fit(good_events, good_events['signal'])
    estimators['top-{} forest preselection, mcor'.format(rank)] = ef_good
    
    
In [66]:
    
import cPickle
with open('models/hlt1_body2.pkl', 'wb') as file_mn:
    cPickle.dump(estimators, file_mn)
    
In [67]:
    
estimators.keys()
    
    Out[67]:
In [74]:
    
thresholds = dict()
RATE = [50000.]  # target output rate, in Hz (50 kHz)
events_pass = dict()
for name, cl in estimators.items():
    prob = cl.predict_proba(ds_test_bck)
    if 'mcor cut' not in name and 'additional' not in name:
        thr, result = calculate_thresholds(ds_test_bck, prob, total_test_bck_events, rates=RATE)
        for rate, val in result.items():
            events_pass['{}-{}'.format(rate, name)] = val[1]
    else:
        thr, result = calculate_thresholds(ds_test_bck[ds_test_bck['mcor'] <= 10e3], 
                                           prob[numpy.array(ds_test_bck['mcor']) <= 10e3], 
                                           total_test_bck_events, rates=RATE)
        for rate, val in result.items():
            events_pass['{}-{}'.format(rate, name)] = val[1]
    thresholds[name] = thr
    print name, result
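
calculate_thresholds is another helper from additional.ipynb: it turns the requested output rates into cuts on the classifier response, using the total number of test background events for normalisation. A rough sketch of the idea, with a hypothetical input rate; the real normalisation (and the events-vs-candidates bookkeeping) is defined in additional.ipynb:

def calculate_thresholds_sketch(bck_df, probs, total_bck_events, rates, input_rate=1.1e6):
    # assumption: the allowed number of passing background events is
    # total_bck_events * rate / input_rate (input_rate is a hypothetical placeholder);
    # pick the classifier cut that lets through that many test entries
    scores = numpy.sort(probs[:, 1])[::-1]
    thresholds, result = {}, {}
    for rate in rates:
        n_pass = int(total_bck_events * rate / input_rate)
        thr = scores[max(0, min(n_pass, len(scores)) - 1)]
        thresholds[rate] = thr
        result[rate] = (thr, n_pass)   # result[rate][1] = events passing, as used above
    return thresholds, result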
    
    
In [75]:
    
est_cut = dict([('MN: mcor cut, mcor var', estimators['MN: mcor cut, mcor var']),
                ('additional features', estimators['additional features'])])
train_modes_eff, statistic = result_statistic(est_cut, sig_train_modes_names, 
                                              ds_test_signal[ds_test_signal['mcor'] <= 10e3],
                                              thresholds, RATE, total_test_signal_events)
    
In [94]:
    
est = []
for key in ['MN', 'MN: pt->sumpt', 'MN: pt->sumpt, remove minpt']:
    est.append((key, estimators[key]))
est = dict(est)
train_modes_eff_rest, statistic_rest = result_statistic(est, sig_train_modes_names, 
                                                        ds_test_signal,
                                                        thresholds, RATE, total_test_signal_events)
    
In [95]:
    
from rep.plotting import BarComparePlot
BarComparePlot(OrderedDict(train_modes_eff.items() + train_modes_eff_rest.items()), 
               sortby=('MN', 50000.0)).plot(new_plot=True, figsize=(24, 8), ylabel='efficiency', fontsize=22)
lgd = legend(bbox_to_anchor=(0.5, 1.4), loc='upper center', ncol=2, fontsize=22)
# plt.savefig('hlt1.pdf' , format='pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')
    
    
In [96]:
    
compare_hierarhical = OrderedDict()
hierarhical = ['top-1 forest preselection',
               'top-2 forest preselection',
               'top-1 forest preselection, use mcor',
               'top-2 forest preselection, use mcor',
               'top-2 forest preselection, minpt',
               'top-1 forest preselection, minpt',
               'top-2 forest preselection, minpt, base',
               'top-1 forest preselection, minpt, base',
               
               'top-1 forest preselection, minpt; mcor for forest',
               'top-2 forest preselection, minpt; mcor for forest',
               'top-1 forest preselection, mcor',
               'top-2 forest preselection, mcor',
               
               'MN: pt->sumpt',
               'MN BBDT: pt->sumpt', 
               'MN: pt->sumpt, remove minpt',
               'MN BBDT: pt->sumpt, remove minpt'
                ]
for key in hierarhical:
    compare_hierarhical[key] = estimators[key]
train_modes_eff_forest, statistic_forest = result_statistic(compare_hierarhical, sig_train_modes_names, 
                                                            ds_test_signal,
                                                            thresholds, RATE, total_test_signal_events)
    
In [97]:
    
BarComparePlot(OrderedDict([((key, 50000.), train_modes_eff_forest[(key, 50000.)]) for key in hierarhical]),
               sortby=('MN BBDT: pt->sumpt', 50000.0)).plot(new_plot=True, figsize=(24, 8), 
                                                            ylabel='efficiency', fontsize=22)
lgd = legend(bbox_to_anchor=(0.5, 1.7), loc='upper center', ncol=2, fontsize=22)
    
    
In [98]:
    
pandas.DataFrame(statistic)
    
    Out[98]:
In [99]:
    
pandas.DataFrame(statistic_rest)
    
    Out[99]:
In [100]:
    
from rep.data import LabeledDataStorage
from rep.report import ClassificationReport
lds = LabeledDataStorage(test, test['signal'])
report = ClassificationReport(OrderedDict(est.items() + compare_hierarhical.items()), lds)
    
In [101]:
    
report.roc().plot(new_plot=True)
    
    
In [102]:
    
mode_metrics = dict()
for mode in sig_train_modes_names:
    mode_metrics[mode] = dict()
    for rate in [50000.]:
        mode_metrics[mode][rate] = generate_topo_metric(ds_test_bck, ds_test_signal[ds_test_signal['mode'] == mode], 
                                                        total_test_bck_events, total_test_signal_events[mode], rate)
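
generate_topo_metric (from additional.ipynb) builds the quantity reported by report.learning_curve below: the per-mode signal efficiency at the threshold corresponding to the 50 kHz rate. A rough sketch, assuming the REP metric convention of a callable taking (y_true, proba, sample_weight), and the same hypothetical rate normalisation as above:

def generate_topo_metric_sketch(bck_df, sig_df, total_bck_events, total_sig_events, rate, input_rate=1.1e6):
    # assumption: fix the cut on the background part of the sample for the requested rate,
    # then report which fraction of this mode's signal passes it
    def metric(y_true, proba, sample_weight=None):
        bck_scores = numpy.sort(proba[y_true == 0, 1])[::-1]
        n_pass = max(1, int(total_bck_events * rate / input_rate))
        threshold = bck_scores[min(n_pass, len(bck_scores)) - 1]
        return numpy.mean(proba[y_true == 1, 1] > threshold)
    return metric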
    
In [103]:
    
def generate_select_mode(mode):
    def select_mode(pd):
        result = numpy.zeros(len(pd), dtype=bool)
        result[numpy.array(pd['mode']) == mode] = True
        result[numpy.array(pd['signal']) == 0] = True
        return result
    return select_mode
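
The returned function is used as the mask argument of report.learning_curve in the next cell; applied directly it simply keeps the background plus one chosen signal mode, e.g.:

# usage example: background candidates plus one signal mode (the first one, for illustration)
some_mode = sig_train_modes_names[0]
mask = generate_select_mode(some_mode)
subset = test[mask(test)]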
    
In [104]:
    
staged_eff_plots = []
for mode in sig_train_modes_names:
    # skip modes that have no test candidates after the preselection
    if len(ds_test_signal[ds_test_signal['mode'] == mode]) == 0:
        continue
    select_mode = generate_select_mode(mode)
    staged_eff_plots.append(report.learning_curve(mode_metrics[mode][50000.], mask=select_mode, steps=10,
                                                  metric_label='mode {}, rate {}, eff'.format(mode, 50000.)))
    
In [105]:
    
for elem in staged_eff_plots:
    elem.plot(new_plot=True)
    
    (figure output: per-mode staged efficiency curves at the 50 kHz threshold, one figure per signal mode)
In [106]:
    
plots = OrderedDict()
for key, value in est.items() + compare_hierarhical.items():
    plots[key] = plot_roc_events(value, ds_test_signal, ds_test_bck, key)
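
plot_roc_events comes from additional.ipynb and, judging by how its output is fed to FunctionsPlot, returns an (x, y) curve: signal efficiency versus the fraction of background retained. A rough sketch under that assumption:

def plot_roc_events_sketch(estimator, sig_df, bck_df, label):
    # assumption: an event-level ROC-like curve,
    # x = fraction of background passing the cut, y = signal efficiency
    sig_scores = estimator.predict_proba(sig_df)[:, 1]
    bck_scores = estimator.predict_proba(bck_df)[:, 1]
    cuts = numpy.unique(numpy.concatenate([sig_scores, bck_scores]))
    x = numpy.array([numpy.mean(bck_scores > cut) for cut in cuts])
    y = numpy.array([numpy.mean(sig_scores > cut) for cut in cuts])
    return x, y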
    
    
In [107]:
    
from rep.plotting import FunctionsPlot
FunctionsPlot(plots).plot(new_plot=True)
# dashed line: fraction of test background events corresponding to the 50 kHz output rate
plot([1. * events_pass['50000.0-MN'] / statistic_length(ds_test_bck)['Events']] * 2, [0., 1], 'b--', label='rate: 50 kHz')
lgd = legend(loc='upper center', fontsize=16, bbox_to_anchor=(0.5, 1.7), ncol=2)
    
    
In [108]:
    
FunctionsPlot(plots).plot(new_plot=True, xlim=(0.42, 0.44), ylim=(0.9445, 0.9645))
plot([1. * events_pass['50000.0-MN'] / statistic_length(ds_test_bck)['Events']] * 2, [0., 1], 'b--', label='rate: 50 kHz')
lgd = legend(loc='upper center', fontsize=16, bbox_to_anchor=(0.5, 1.7), ncol=2)
    
    
In [109]:
    
with open("bbdt_run2/hlt1_borders_sumpt.mx", "w") as f:
    f.write(estimators['MN BBDT: pt->sumpt'].formula_mx)
    
In [110]:
    
with open("bbdt_run2/hlt1_borders_minpt.mx", "w") as f:
    f.write(estimators['MN BBDT: pt->sumpt, remove minpt'].formula_mx)
    
In [ ]: