In [1]:
%pylab inline
In [2]:
%run additional.ipynb
In [3]:
pandas.set_option('display.max_colwidth', 120)
In [5]:
PROFILE = 'ssh-py2'
did preselections:
In [6]:
bck_train_mode_name = 30000000
sig_train_modes_names = list(set(empty_events.keys()) - {bck_train_mode_name})
sig_train_files = ['mod_{}.csv'.format(name) for name in sig_train_modes_names]
bck_train_files = 'mod_30000000.csv'
folder = "datasets/prepared_hlt_body/"
In [7]:
# concat all signal data
if not os.path.exists(folder + 'signal_hlt1.csv'):
concat_files(folder, sig_train_files, os.path.join(folder , 'signal_hlt1.csv'))
In [8]:
signal_data = pandas.read_csv(os.path.join(folder, 'signal_hlt1.csv'), sep='\t')
bck_data = pandas.read_csv(os.path.join(folder, bck_train_files), sep='\t')
In [9]:
signal_data.columns
Out[9]:
In [10]:
print 'Signal', statistic_length(signal_data)
print 'Bck', statistic_length(bck_data)
In [11]:
total_bck_events = statistic_length(bck_data)['Events'] + empty_events[bck_train_mode_name]
total_signal_events_by_mode = dict()
for mode in sig_train_modes_names:
total_signal_events_by_mode[mode] = statistic_length(signal_data[signal_data['mode'] == mode])['Events'] + empty_events[mode]
In [12]:
print 'Bck:', total_bck_events
'Signal:', total_signal_events_by_mode
Out[12]:
In [13]:
variables_base = ["mcor", "chi2", "pt", "fdchi2", "minpt", "nlt16"]
variables_mcor = ["chi2", "pt", "fdchi2", "minpt", "nlt16"]
variables_additional = ["m", "fdr", "sumpt", "sumipchi2", "eta"]
variables_new = ['chi2', 'minpt', 'sumpt', 'fdchi2', 'nlt16']
variables_new_minpt = ['chi2', 'sumpt', 'fdchi2', 'nlt16']
In [14]:
# hlt1 2body selection
signal_data = signal_data[signal_data['pass_2body'] == 1]
bck_data = bck_data[bck_data['pass_2body'] == 1]
In [15]:
print 'Signal', statistic_length(signal_data)
print 'Bck', statistic_length(bck_data)
In [16]:
total_signal_events_by_mode_presel = dict()
for mode in sig_train_modes_names:
total_signal_events_by_mode_presel[mode] = statistic_length(signal_data[signal_data['mode'] == mode])['Events']
total_bck_events_presel = statistic_length(bck_data)['Events']
In [17]:
print 'Bck:', total_bck_events_presel
'Signal:', total_signal_events_by_mode_presel
Out[17]:
In [18]:
signal_data.head()
Out[18]:
In [19]:
ds_train_signal, ds_train_bck, ds_test_signal, ds_test_bck = prepare_data(signal_data, bck_data, 'unique')
In [20]:
print 'Signal', statistic_length(ds_train_signal)
print 'Bck', statistic_length(ds_train_bck)
In [21]:
train = pandas.concat([ds_train_bck, ds_train_signal])
In [22]:
print 'Signal', statistic_length(ds_test_signal)
print 'Bck', statistic_length(ds_test_bck)
In [23]:
test = pandas.concat([ds_test_bck, ds_test_signal])
In [24]:
total_test_bck_events = (total_bck_events - total_bck_events_presel) // 2 + statistic_length(ds_test_bck)['Events']
total_test_signal_events = dict()
for mode in sig_train_modes_names:
total_not_passed_signal = total_signal_events_by_mode[mode] - total_signal_events_by_mode_presel[mode]
total_test_signal_events[mode] = total_not_passed_signal // 2 + \
statistic_length(ds_test_signal[ds_test_signal['mode'] == mode])['Events']
In [25]:
print 'Bck total test events:', total_test_bck_events
'Signal total test events:', total_test_signal_events
Out[25]:
In [26]:
# cut on mcor
train_cut = train[train['mcor'] <= 10e3]
In [27]:
for var in variables_base + variables_additional:
hist(train[train.signal == 1][var].values, color='b', bins=60, histtype='step', normed=True)
hist(train[train.signal == 0][var].values, color='r', bins=60, histtype='step', normed=True)
title(var)
show()
In [28]:
import cPickle
if os.path.exists('models/hlt1_body2.pkl'):
with open('models/hlt1_body2.pkl', 'r') as file_mn:
estimators = cPickle.load(file_mn)
else:
estimators = {}
In [29]:
from rep_ef.estimators import MatrixNetClassifier
params = {'connection_auth': 'AUTH_HEADERS', 'connection': 'skygrid', 'iterations': 5000, 'sync': False}
In [30]:
estimators['MN: mcor cut, mcor var'] = MatrixNetClassifier(train_features=variables_base, **params)
estimators['MN: mcor cut, mcor var'].fit(train_cut, train_cut['signal'])
Out[30]:
In [31]:
estimators['MN: mcor cut, mcor var'].get_feature_importances().sort('effect')
Out[31]:
In [34]:
estimators['MN'] = MatrixNetClassifier(train_features=variables_mcor, **params)
estimators['MN'].fit(train, train['signal'])
Out[34]:
In [35]:
estimators['MN'].get_feature_importances().sort('effect')
Out[35]:
In [36]:
estimators['additional features'] = MatrixNetClassifier(train_features=variables_base + variables_additional, **params)
estimators['additional features'].fit(train_cut, train_cut['signal'])
Out[36]:
In [37]:
estimators['additional features'].get_feature_importances().sort('effect')
Out[37]:
In [38]:
estimators['MN: pt->sumpt'] = MatrixNetClassifier(train_features=variables_new, **params)
estimators['MN: pt->sumpt'].fit(train, train['signal'])
Out[38]:
In [39]:
estimators['MN: pt->sumpt'].get_feature_importances().sort('effect')
Out[39]:
In [40]:
estimators['MN: pt->sumpt, remove minpt'] = MatrixNetClassifier(train_features=variables_new_minpt, **params)
estimators['MN: pt->sumpt, remove minpt'].fit(train, train['signal'])
Out[40]:
In [41]:
estimators['MN: pt->sumpt, remove minpt'].get_feature_importances().sort('effect')
Out[41]:
In [42]:
borders = {
'chi2': [1,2.5,5,7.5,10,100],
'sumpt': [3000,4000,5000,6000,7500,9000,12e3,23e3,50e3],
'fdchi2': [33,125,350,780,1800,5000,10000],
'minpt': [350,500,750,1500,3000,5000],
'nlt16': [0.5]
}
In [43]:
borders_minpt = {
'chi2': [1,2.5,5,7.5,10,100],
'sumpt': [3000,4000,5000,6000,7500,9000,12e3,23e3,50e3],
'fdchi2': [33,125,350,780,1800,5000,10000],
'nlt16': [0.5]
}
In [44]:
estimators['MN BBDT: pt->sumpt'] = MatrixNetClassifier(train_features=variables_new, intervals=borders, **params)
estimators['MN BBDT: pt->sumpt'].fit(train, train['signal'])
Out[44]:
In [45]:
estimators['MN BBDT: pt->sumpt'].get_feature_importances().sort('effect')
Out[45]:
In [46]:
estimators['MN BBDT: pt->sumpt, remove minpt'] = MatrixNetClassifier(train_features=variables_new_minpt,
intervals=borders_minpt, **params)
estimators['MN BBDT: pt->sumpt, remove minpt'].fit(train, train['signal'])
Out[46]:
In [47]:
estimators['MN BBDT: pt->sumpt, remove minpt'].get_feature_importances().sort('effect')
Out[47]:
In [48]:
from rep.metaml import FoldingClassifier
from rep.estimators import SklearnClassifier
from sklearn.ensemble import RandomForestClassifier
In [50]:
forest_base_partial = SklearnClassifier(RandomForestClassifier(n_estimators=300, min_samples_leaf=500, max_depth=7,
max_features=4))
forest_folding_top = FoldingClassifier(base_estimator=forest_base_partial, random_state=11,
features=variables_new_minpt, parallel_profile=PROFILE)
forest_folding_top.fit(train, train['signal'])
Out[50]:
In [51]:
for rank in range(1, 3):
good_events = get_best_svr(train, forest_folding_top, rank)
ef_good = MatrixNetClassifier(train_features=variables_base, **params)
ef_good.fit(good_events, good_events['signal'])
estimators['top-{} forest preselection, use mcor'.format(rank)] = ef_good
In [52]:
for rank in range(1, 3):
good_events = get_best_svr(train, forest_folding_top, rank)
ef_good = MatrixNetClassifier(train_features=variables_new_minpt, **params)
ef_good.fit(good_events, good_events['signal'])
estimators['top-{} forest preselection'.format(rank)] = ef_good
In [53]:
forest_base_partial = SklearnClassifier(RandomForestClassifier(n_estimators=300, min_samples_leaf=500, max_depth=7,
max_features=5))
forest_folding_top_minpt = FoldingClassifier(base_estimator=forest_base_partial, random_state=11,
features=variables_new, parallel_profile=PROFILE)
forest_folding_top_minpt.fit(train, train['signal'])
Out[53]:
In [54]:
for rank in range(1, 3):
good_events = get_best_svr(train, forest_folding_top_minpt, rank)
ef_good = MatrixNetClassifier(train_features=variables_new, **params)
ef_good.fit(good_events, good_events['signal'])
estimators['top-{} forest preselection, minpt'.format(rank)] = ef_good
In [58]:
estimators['top-1 forest preselection, minpt'].get_feature_importances()
Out[58]:
In [72]:
for rank in range(1, 3):
good_events = get_best_svr(train, forest_folding_top_minpt, rank)
ef_good = MatrixNetClassifier(train_features=variables_base, **params)
ef_good.fit(good_events, good_events['signal'])
estimators['top-{} forest preselection, minpt, base'.format(rank)] = ef_good
In [73]:
estimators['top-2 forest preselection, minpt, base'].get_feature_importances()
Out[73]:
In [63]:
forest_base_partial = SklearnClassifier(RandomForestClassifier(n_estimators=300, min_samples_leaf=500, max_depth=7,
max_features=5))
forest_folding_top_mcor = FoldingClassifier(base_estimator=forest_base_partial, random_state=11,
features=variables_base, parallel_profile=PROFILE)
forest_folding_top_mcor.fit(train, train['signal'])
Out[63]:
In [64]:
for rank in range(1, 3):
good_events = get_best_svr(train, forest_folding_top_mcor, rank)
ef_good = MatrixNetClassifier(train_features=variables_new, **params)
ef_good.fit(good_events, good_events['signal'])
estimators['top-{} forest preselection, minpt; mcor for forest'.format(rank)] = ef_good
In [65]:
for rank in range(1, 3):
good_events = get_best_svr(train, forest_folding_top_mcor, rank)
ef_good = MatrixNetClassifier(train_features=variables_base, **params)
ef_good.fit(good_events, good_events['signal'])
estimators['top-{} forest preselection, mcor'.format(rank)] = ef_good
In [66]:
import cPickle
with open('models/hlt1_body2.pkl', 'w') as file_mn:
cPickle.dump(estimators, file_mn)
In [67]:
estimators.keys()
Out[67]:
In [70]:
estimators.keys()
Out[70]:
In [74]:
thresholds = dict()
RATE = [50000.]
events_pass = dict()
for name, cl in estimators.items():
prob = cl.predict_proba(ds_test_bck)
if 'mcor cut' not in name and 'additional' not in name:
thr, result = calculate_thresholds(ds_test_bck, prob, total_test_bck_events, rates=RATE)
for rate, val in result.items():
events_pass['{}-{}'.format(rate, name)] = val[1]
else:
thr, result = calculate_thresholds(ds_test_bck[ds_test_bck['mcor'] <= 10e3],
prob[numpy.array(ds_test_bck['mcor']) <= 10e3],
total_test_bck_events, rates=RATE)
for rate, val in result.items():
events_pass['{}-{}'.format(rate, name)] = val[1]
thresholds[name] = thr
print name, result
In [75]:
est_cut = dict([('MN: mcor cut, mcor var', estimators['MN: mcor cut, mcor var']),
('additional features', estimators['additional features'])])
train_modes_eff, statistic = result_statistic(est_cut, sig_train_modes_names,
ds_test_signal[ds_test_signal['mcor'] <= 10e3],
thresholds, RATE, total_test_signal_events)
In [94]:
est = []
for key in ['MN', 'MN: pt->sumpt', 'MN: pt->sumpt, remove minpt']:
est.append((key, estimators[key]))
est = dict(est)
train_modes_eff_rest, statistic_rest = result_statistic(est, sig_train_modes_names,
ds_test_signal,
thresholds, RATE, total_test_signal_events)
In [95]:
from rep.plotting import BarComparePlot
BarComparePlot(OrderedDict(train_modes_eff.items() + train_modes_eff_rest.items()),
sortby=('MN', 50000.0)).plot(new_plot=True, figsize=(24, 8), ylabel='efficiency', fontsize=22)
lgd = legend(bbox_to_anchor=(0.5, 1.4), loc='upper center', ncol=2, fontsize=22)
# plt.savefig('hlt1.pdf' , format='pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')
In [96]:
compare_hierarhical = OrderedDict()
hierarhical = ['top-1 forest preselection',
'top-2 forest preselection',
'top-1 forest preselection, use mcor',
'top-2 forest preselection, use mcor',
'top-2 forest preselection, minpt',
'top-1 forest preselection, minpt',
'top-2 forest preselection, minpt, base',
'top-1 forest preselection, minpt, base',
'top-1 forest preselection, minpt; mcor for forest',
'top-2 forest preselection, minpt; mcor for forest',
'top-1 forest preselection, mcor',
'top-2 forest preselection, mcor',
'MN: pt->sumpt',
'MN BBDT: pt->sumpt',
'MN: pt->sumpt, remove minpt',
'MN BBDT: pt->sumpt, remove minpt'
]
for key in hierarhical:
compare_hierarhical[key] = estimators[key]
train_modes_eff_forest, statistic_forest = result_statistic(compare_hierarhical, sig_train_modes_names,
ds_test_signal,
thresholds, RATE, total_test_signal_events)
In [97]:
BarComparePlot(OrderedDict([((key, 50000.), train_modes_eff_forest[(key, 50000.)]) for key in hierarhical]),
sortby=('MN BBDT: pt->sumpt', 50000.0)).plot(new_plot=True, figsize=(24, 8),
ylabel='efficiency', fontsize=22)
lgd = legend(bbox_to_anchor=(0.5, 1.7), loc='upper center', ncol=2, fontsize=22)
In [98]:
pandas.DataFrame(statistic)
Out[98]:
In [99]:
pandas.DataFrame(statistic_rest)
Out[99]:
In [100]:
from rep.data import LabeledDataStorage
from rep.report import ClassificationReport
lds = LabeledDataStorage(test, test['signal'])
report = ClassificationReport(OrderedDict(est.items() + compare_hierarhical.items()), lds)
In [101]:
report.roc().plot(new_plot=True)
In [102]:
mode_metrics = dict()
for mode in sig_train_modes_names:
mode_metrics[mode] = dict()
for rate in [50000.]:
mode_metrics[mode][rate] = generate_topo_metric(ds_test_bck, ds_test_signal[ds_test_signal['mode'] == mode],
total_test_bck_events, total_test_signal_events[mode], rate)
In [103]:
def generate_select_mode(mode):
def select_mode(pd):
result = numpy.zeros(len(pd), dtype=bool)
result[numpy.array(pd['mode']) == mode] = True
result[numpy.array(pd['signal']) == 0] = True
return result
return select_mode
In [104]:
staged_eff_plots = []
for mode in sig_train_modes_names:
eff_metrics = OrderedDict()
select_mode = generate_select_mode(mode)
if len(ds_test_signal[ds_test_signal['mode'] == mode]) <= 0:
continue
staged_eff_plots.append(report.learning_curve(mode_metrics[mode][50000.], mask=select_mode, steps=10,
metric_label='mode {}, rate {}, eff'.format(mode, 50000.)))
In [105]:
for n, elem in enumerate(staged_eff_plots):
elem.plot(new_plot=True)
In [106]:
plots = OrderedDict()
for key, value in est.items() + compare_hierarhical.items():
plots[key] = plot_roc_events(value, ds_test_signal, ds_test_bck, key)
In [107]:
from rep.plotting import FunctionsPlot
FunctionsPlot(plots).plot(new_plot=True)
plot([1. * events_pass['50000.0-MN'] / statistic_length(ds_test_bck)['Events']] * 2, [0., 1], 'b--', label='rate: 50 kHz')
lgd = legend(loc='upper center', fontsize=16, bbox_to_anchor=(0.5, 1.7), ncol=2)
In [108]:
FunctionsPlot(plots).plot(new_plot=True, xlim=(0.42, 0.44), ylim=(0.9445, 0.9645))
plot([1. * events_pass['50000.0-MN'] / statistic_length(ds_test_bck)['Events']] * 2, [0., 1], 'b--', label='rate: 50 kHz')
lgd = legend(loc='upper center', fontsize=16, bbox_to_anchor=(0.5, 1.7), ncol=2)
In [109]:
with open("bbdt_run2/hlt1_borders_sumpt.mx", "w") as f:
f.write(estimators['MN BBDT: pt->sumpt'].formula_mx)
In [110]:
with open("bbdt_run2/hlt1_borders_minpt.mx", "w") as f:
f.write(estimators['MN BBDT: pt->sumpt, remove minpt'].formula_mx)
In [ ]: