In [1]:
%run additional.ipynb
In [2]:
%pylab inline
In [3]:
pandas.set_option('display.max_colwidth', 120)
In [4]:
PROFILE = 'ssh-py2'
did preselections:
In [5]:
sig_train_modes_names = [11114001, 11296013, 11874042, 12103035, 13246001, 13264021]
bck_train_mode_name = 30000000
sig_train_files = ['mod_{}.csv'.format(name) for name in sig_train_modes_names]
bck_train_files = 'mod_30000000.csv'
folder = "datasets/prepared_hlt_body/"
In [6]:
# concat all signal data
if not os.path.exists(folder + 'signal_hlt2.csv'):
concat_files(folder, sig_train_files, os.path.join(folder , 'signal_hlt2.csv'))
In [7]:
signal_data = pandas.read_csv(os.path.join(folder , 'signal_hlt2.csv'), sep='\t')
bck_data = pandas.read_csv(os.path.join(folder , bck_train_files), sep='\t')
In [8]:
signal_data.columns
Out[8]:
In [9]:
print 'Signal', statistic_length(signal_data)
print 'Bck', statistic_length(bck_data)
In [10]:
total_bck_events = statistic_length(bck_data)['Events'] + empty_events[bck_train_mode_name]
total_signal_events_by_mode = dict()
for mode in sig_train_modes_names:
total_signal_events_by_mode[mode] = statistic_length(signal_data[signal_data['mode'] == mode])['Events'] + empty_events[mode]
In [11]:
print 'Bck:', total_bck_events
'Signal:', total_signal_events_by_mode
Out[11]:
In [12]:
variables = ["n", "mcor", "chi2", "eta", "fdchi2", "minpt", "nlt16", "ipchi2", "n1trk", "sumpt"]
In [13]:
# hlt2 nbody selection
signal_data = signal_data[(signal_data['pass_nbody'] == 1) & (signal_data['mcor'] <= 10e3)]
bck_data = bck_data[(bck_data['pass_nbody'] == 1) & (bck_data['mcor'] <= 10e3)]
In [14]:
print 'Signal', statistic_length(signal_data)
print 'Bck', statistic_length(bck_data)
In [15]:
total_signal_events_by_mode_presel = dict()
for mode in sig_train_modes_names:
total_signal_events_by_mode_presel[mode] = statistic_length(signal_data[signal_data['mode'] == mode])['Events']
total_bck_events_presel = statistic_length(bck_data)['Events']
In [16]:
print 'Bck:', total_bck_events_presel
'Signal:', total_signal_events_by_mode_presel
Out[16]:
In [17]:
signal_data.head()
Out[17]:
In [18]:
ds_train_signal, ds_train_bck, ds_test_signal, ds_test_bck = prepare_data(signal_data, bck_data, 'unique')
In [19]:
print 'Signal', statistic_length(ds_train_signal)
print 'Bck', statistic_length(ds_train_bck)
In [20]:
train = pandas.concat([ds_train_bck, ds_train_signal])
In [21]:
print 'Signal', statistic_length(ds_test_signal)
print 'Bck', statistic_length(ds_test_bck)
In [22]:
test = pandas.concat([ds_test_bck, ds_test_signal])
In [23]:
total_test_bck_events = (total_bck_events - total_bck_events_presel) // 2 + statistic_length(ds_test_bck)['Events']
total_test_signal_events = dict()
for mode in sig_train_modes_names:
total_not_passed_signal = total_signal_events_by_mode[mode] - total_signal_events_by_mode_presel[mode]
total_test_signal_events[mode] = total_not_passed_signal // 2 + \
statistic_length(ds_test_signal[ds_test_signal['mode'] == mode])['Events']
In [24]:
print 'Bck total test events:', total_test_bck_events
'Signal total test events:', total_test_signal_events
Out[24]:
In [25]:
import cPickle
if os.path.exists('models/prunned.pkl'):
with open('models/prunned.pkl', 'r') as file_pr:
estimators = cPickle.load(file_pr)
In [26]:
from rep_ef.estimators import MatrixNetClassifier
In [27]:
params = {'connection_auth': 'AUTH_HEADERS', 'connection': 'skygrid', 'regularization': 0.02,
'sync': False, "iterations": 5000}
In [28]:
ef_base = MatrixNetClassifier(train_features=variables, **params)
ef_base.fit(train, train['signal'])
Out[28]:
In [29]:
from hep_ml.decisiontrain import DecisionTrainClassifier
from hep_ml.losses import LogLossFunction
In [30]:
from rep.estimators import SklearnClassifier
tt_base = SklearnClassifier(DecisionTrainClassifier(learning_rate=0.02, n_estimators=10000,
depth=6, pretransform_needed=True,
max_features=6, loss=LogLossFunction(regularization=100)),
features=variables)
tt_base.fit(train, train['signal'])
pass
In [31]:
special_b = {
'n': [2.5, 3.5],
'mcor': [2000,3000,4000,5000,7500], # I want to remove splits too close the the B mass as I was looking in simulation and this could distort the mass peak (possibly)
'chi2': [1,2.5,5,7.5,10,100], # I also propose we add a cut to the pre-selection of chi2 < 1000. I don't want to put in splits at too small values here b/c these type of inputs are never modeled quite right in the simulation (they always look a bit more smeared in data).
'sumpt': [3000,4000,5000,6000,7500,9000,12e3,23e3,50e3], # I am happy with the MN splits here (these are almost "as is" from modify-6)
'eta': [2.5,3,3.75,4.25,4.5], # Close to MN.
'fdchi2': [33,125,350,780,1800,5000,10000], # I want to make the biggest split 10e3 because in the simulated events there is pretty much only BKGD above 40e3 but we don't want the BDT to learn to kill these as new particles would live here. Otherwise I took the MN splits and modified the first one (the first one is 5sigma now).
'minpt': [350,500,750,1500,3000,5000], # let's make 500 the 2nd split so that this lines up with the HLT1 SVs.
'nlt16': [0.5],
'ipchi2': [8,26,62,150,500,1000], # I also propose we add a cut of IP chi2 < 5000 as it's all background out there.
'n1trk': [0.5, 1.5, 2.5, 3.5]
}
ef_base_bbdt = MatrixNetClassifier(train_features=variables, intervals=special_b, **params)
ef_base_bbdt.fit(train, train['signal'])
Out[31]:
In [32]:
ef_base_bbdt5 = MatrixNetClassifier(train_features=variables, intervals=5, **params)
ef_base_bbdt5.fit(train, train['signal'])
ef_base_bbdt6 = MatrixNetClassifier(train_features=variables, intervals=6, **params)
ef_base_bbdt6.fit(train, train['signal'])
Out[32]:
In [33]:
from rep.data import LabeledDataStorage
from rep.report import ClassificationReport
report = ClassificationReport({'base': ef_base}, LabeledDataStorage(test, test['signal']))
In [34]:
report.roc()
Out[34]:
In [35]:
%run pruning.py
In [ ]:
new_trainlen = (len(train) // 8) * 8
trainX = train[ef_base.features][:new_trainlen].values
trainY = train['signal'][:new_trainlen].values
trainW = numpy.ones(len(trainY))
trainW[trainY == 0] *= sum(trainY) / sum(1 - trainY)
In [ ]:
new_features, new_formula_mx, new_classifier = select_trees(trainX, trainY, sample_weight=trainW,
initial_classifier=ef_base,
iterations=100, n_candidates=100,
learning_rate=0.1, regularization=50.)
In [ ]:
prunned = cPickle.loads(cPickle.dumps(ef_base))
prunned.formula_mx = new_formula_mx
In [26]:
def mode_scheme_fit(train, base, suf, model_file):
blending_parts = OrderedDict()
for n_ch, ch in enumerate(sig_train_modes_names):
temp = FoldingClassifier(base_estimator=base, random_state=11, features=variables, parallel_profile=PROFILE)
temp_data = train[(train['mode'] == ch) | (train['mode'] == bck_train_mode_name)]
temp.fit(temp_data, temp_data['signal'])
blending_parts['ch' + str(n_ch) + suf] = temp
import cPickle
with open(model_file, 'w') as f:
cPickle.dump(blending_parts, f)
def mode_scheme_predict(data, suf, model_file, mode='train'):
with open(model_file, 'r') as f:
blending_parts = cPickle.load(f)
for n_ch, ch in enumerate(sig_train_modes_names):
temp_name = 'ch' + str(n_ch) + suf
if mode == 'train':
temp_key = ((data['mode'] == ch) | (data['mode'] == bck_train_mode_name))
data.ix[temp_key, temp_name] = blending_parts[temp_name].predict_proba(
data[temp_key])[:, 1]
data.ix[~temp_key, temp_name] = blending_parts[temp_name].predict_proba(
data[~temp_key])[:, 1]
else:
data[temp_name] = blending_parts[temp_name].predict_proba(data)[:, 1]
In [27]:
def get_best_svr_by_channel(data, feature_mask, count=1):
add_events = []
for id_est, channel in enumerate(sig_train_modes_names):
train_part = data[(data['mode'] == channel)]
for num, group in train_part.groupby('unique'):
index = numpy.argsort(group[feature_mask.format(id_est)].values)[::-1]
add_events.append(group.iloc[index[:count], :])
good_events = pandas.concat([data[(data['mode'] == bck_train_mode_name)]] + add_events)
print len(good_events)
return good_events
In [ ]:
from sklearn.ensemble import RandomForestClassifier
from rep.metaml import FoldingClassifier
base = RandomForestClassifier(n_estimators=500, min_samples_leaf=50, max_depth=6,
max_features=7, n_jobs=8)
mode_scheme_fit(train, base, '', 'forest_trick.pkl')
mode_scheme_predict(train, '', 'forest_trick.pkl')
mode_scheme_predict(test, '', 'forest_trick.pkl', mode='test')
In [ ]:
good_events = get_best_svr_by_channel(train, 'ch{}', 2)
forest_mn = MatrixNetClassifier(train_features=variables, **params)
forest_mn.fit(good_events, good_events['signal'])
In [ ]:
forest_mn_bbdt = MatrixNetClassifier(train_features=variables, intervals=special_b, **params)
forest_mn_bbdt.fit(good_events, good_events['signal'])
In [47]:
estimators = {'base MN': ef_base, 'BBDT MN-6': ef_base_bbdt6, 'BBDT MN-5': ef_base_bbdt5,
'BBDT MN special': ef_base_bbdt,
'Prunned MN': prunned, 'base MN + forest selection': forest_mn,
'BBDT MN special + forest selection': forest_mn_bbdt}
In [48]:
import cPickle
with open('models/prunned.pkl', 'w') as file_pr:
cPickle.dump(estimators, file_pr)
In [28]:
mode_scheme_predict(train, '', 'forest_trick.pkl')
mode_scheme_predict(test, '', 'forest_trick.pkl', mode='test')
good_events = get_best_svr_by_channel(train, 'ch{}', 2)
In [32]:
forest_mn = estimators['base MN + forest selection']
In [30]:
new_trainlen = (len(good_events) // 8) * 8
trainX = good_events[forest_mn.features][:new_trainlen].values
trainY = good_events['signal'][:new_trainlen].values
trainW = numpy.ones(len(trainY))
trainW[trainY == 0] *= sum(trainY) / sum(1 - trainY)
In [31]:
len(train), len(good_events)
Out[31]:
In [39]:
from pruning import select_trees
In [46]:
new_features_f, new_formula_mx_f, new_classifier_f = select_trees(trainX, trainY, sample_weight=trainW,
initial_classifier=forest_mn,
iterations=100, n_candidates=100,
learning_rate=0.1, regularization=50.)
In [74]:
prunned_f = cPickle.loads(cPickle.dumps(forest_mn))
prunned_f.formula_mx = new_formula_mx_f
estimators['Prunned MN + forest selection'] = prunned_f
In [76]:
import cPickle
with open('models/prunned.pkl', 'w') as file_pr:
cPickle.dump(estimators, file_pr)
In [77]:
thresholds = dict()
test_bck = test[test['signal'] == 0]
RATE = [2500., 4000.]
events_pass = dict()
for name, cl in estimators.items():
prob = cl.predict_proba(test_bck)
thr, result = calculate_thresholds(test_bck, prob, total_test_bck_events, rates=RATE)
for rate, val in result.items():
events_pass['{}-{}'.format(rate, name)] = val[1]
thresholds[name] = thr
print name, result
In [58]:
train_modes_eff, statistic = result_statistic(estimators, sig_train_modes_names,
test[test['signal'] == 1],
thresholds, RATE, total_test_signal_events)
In [59]:
from rep.plotting import BarComparePlot
xticks_labels = ['$B^0 \\to K^*\mu^+\mu^-$', "$B^0 \\to D^+D^-$", "$B^0 \\to D^- \mu^+ \\nu_{\mu}$",
'$B^+ \\to \pi^+ K^-K^+$', '$B^0_s \\to \psi(1S) K^+K^-\pi^+\pi^-$', '$B^0_s \\to D_s^-\pi^+$']
for r in RATE:
new_dict = []
for key, val in train_modes_eff.iteritems():
if (key[0] in {'base MN', 'Prunned MN', 'BBDT MN special',
'base MN + forest', 'Prunned MN + forest', 'BBDT MN special + forest'}) and r == key[1]:
new_dict.append((key, val))
new_dict = dict(new_dict)
BarComparePlot(new_dict).plot(new_plot=True, figsize=(24, 8), ylabel='efficiency', fontsize=22)
xticks(3 + 11 * numpy.arange(6), xticks_labels, rotation=0)
lgd = legend(bbox_to_anchor=(0.5, 1.3), loc='upper center', ncol=2, fontsize=22)
# plt.savefig('hlt2-experiments.pdf' , format='pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')
In [60]:
from rep.plotting import BarComparePlot
for r in RATE:
new_dict = []
for key, val in train_modes_eff.iteritems():
if r == key[1]:
new_dict.append((key, val))
new_dict = dict(new_dict)
BarComparePlot(new_dict).plot(new_plot=True, figsize=(24, 8), ylabel='efficiency', fontsize=22)
lgd = legend(bbox_to_anchor=(0.5, 1.3), loc='upper center', ncol=2, fontsize=22)
# plt.savefig('hlt2-experiments.pdf' , format='pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')
In [78]:
plots = OrderedDict()
for key, value in estimators.items():
plots[key] = plot_roc_events(value, test[test['signal'] == 1], test[test['signal'] == 0], key)
In [80]:
bbdt_plots = plots.copy()
bbdt_plots.pop('Prunned MN')
bbdt_plots.pop('Prunned MN + forest selection')
Out[80]:
In [63]:
from rep.plotting import FunctionsPlot
FunctionsPlot(bbdt_plots).plot(new_plot=True, xlim=(0.02, 0.06), ylim=(0.65, 0.82))
plot([1. * events_pass['2500.0-base MN'] / statistic_length(ds_test_bck)['Events']] * 2,
[0., 1], 'b--', label='rate: 2.5 kHz')
plot([1. * events_pass['4000.0-base MN'] / statistic_length(ds_test_bck)['Events']] * 2,
[0., 1], 'g--', label='rate: 4. kHz')
lgd = legend(loc='upper center', fontsize=16, bbox_to_anchor=(0.5, 1.3), ncol=3)
title('ROC for events (training decays)', fontsize=20)
xlabel('FRP, background events efficiency', fontsize=20)
ylabel('TPR, signal events efficiency', fontsize=20)
Out[63]:
In [81]:
from rep.plotting import FunctionsPlot
FunctionsPlot(plots).plot(new_plot=True, xlim=(0.02, 0.06), ylim=(0.65, 0.82))
plot([1. * events_pass['2500.0-base MN'] / statistic_length(ds_test_bck)['Events']] * 2,
[0., 1], 'b--', label='rate: 2.5 kHz')
plot([1. * events_pass['4000.0-base MN'] / statistic_length(ds_test_bck)['Events']] * 2,
[0., 1], 'g--', label='rate: 4. kHz')
lgd = legend(loc='upper center', fontsize=16, bbox_to_anchor=(0.5, 1.4), ncol=3)
title('ROC for events (training decays)', fontsize=20)
xlabel('FRP, background events efficiency', fontsize=20)
ylabel('TPR, signal events efficiency', fontsize=20)
plt.savefig('img/rocs_fast_training.pdf' , format='pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')
In [65]:
from collections import defaultdict
all_channels = []
efficiencies = defaultdict(OrderedDict)
for mode in empty_events.keys():
if mode in set(sig_train_modes_names) or mode == bck_train_mode_name:
continue
df = pandas.read_csv(os.path.join(folder , 'mod_{}.csv'.format(mode)), sep='\t')
if len(df) <= 0:
continue
total_events = statistic_length(df)['Events'] + empty_events[mode]
df = df[(df['pass_nbody'] == 1) & (df['mcor'] <= 10e3)]
passed_events = statistic_length(df)['Events']
all_channels.append(df)
for name, cl in estimators.items():
prob = cl.predict_proba(df)
for rate, thresh in thresholds[name].items():
eff = final_eff_for_mode(df, prob, total_events, thresh)
latex_name = '$' + Samples[str(mode)]['root'].replace("#", "\\") + '$'
efficiencies[(name, rate)][latex_name] = eff
In [66]:
for key, val in efficiencies.items():
for key_2, val_2 in val.items():
if val_2 <= 0.1:
efficiencies[key].pop(key_2)
In [67]:
from rep.plotting import BarComparePlot
for r in RATE:
new_dict = []
for key, val in efficiencies.iteritems():
if r == key[1]:
new_dict.append((key, val))
new_dict = dict(new_dict)
BarComparePlot(new_dict).plot(new_plot=True, figsize=(24, 8), ylabel='efficiency', fontsize=22)
lgd = legend(bbox_to_anchor=(0.5, 1.4), loc='upper center', ncol=2, fontsize=22)
In [68]:
plots_all = OrderedDict()
for key, value in estimators.items():
plots_all[key] = plot_roc_events(value, pandas.concat([test[test['signal'] == 1]] + all_channels),
test[test['signal'] == 0], key)
In [85]:
from rep.plotting import FunctionsPlot
FunctionsPlot(plots_all).plot(new_plot=True, xlim=(0.02, 0.06), ylim=(0.5, 0.66))
plot([1. * events_pass['2500.0-base MN'] / statistic_length(ds_test_bck)['Events']] * 2,
[0., 1], 'b--', label='rate: 2.5 kHz')
plot([1. * events_pass['4000.0-base MN'] / statistic_length(ds_test_bck)['Events']] * 2,
[0., 1], 'g--', label='rate: 4. kHz')
lgd = legend(loc='upper center', fontsize=16, bbox_to_anchor=(0.5, 1.4), ncol=3)
title('ROC for events (all decays together)', fontsize=20)
xlabel('FRP, background events efficiency', fontsize=20)
ylabel('TPR, signal events efficiency', fontsize=20)
plt.savefig('img/rocs_fast_all.pdf' , format='pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')
In [70]:
thresholds = OrderedDict()
RATE = [2000., 2500., 3000., 3500., 4000.]
for name, cl in estimators.items():
prob = cl.predict_proba(ds_test_bck)
thr, result = calculate_thresholds(ds_test_bck, prob, total_test_bck_events, rates=RATE)
thresholds[name] = thr
print name, result
In [71]:
train_modes_eff, statistic = result_statistic({'base MN': estimators['base MN']}, sig_train_modes_names,
test[test['signal'] == 1],
thresholds, RATE, total_test_signal_events)
In [72]:
order_rate = OrderedDict()
for j in numpy.argsort([i[1] for i in train_modes_eff.keys()]):
order_rate[train_modes_eff.keys()[j]] = train_modes_eff.values()[j]
In [73]:
from rep.plotting import BarComparePlot
BarComparePlot(order_rate).plot(new_plot=True, figsize=(18, 6), ylabel='efficiency', fontsize=18)
lgd = legend(bbox_to_anchor=(0.5, 1.2), loc='upper center', ncol=5, fontsize=18)
# plt.savefig('rates.pdf' , format='pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')
In [ ]: