In [1]:
%pylab inline
In [2]:
PROFILE = None  # parallel profile for rep (e.g. 'threads-2'); None trains folds sequentially
In [3]:
matplotlib.rc('font', size=14)
In [4]:
def generate_result(auc_b_c, auc_b_light, auc_c_light, label=""):
    result = {}
    result['b vs c'] = [auc_b_c]
    result['b vs light'] = [auc_b_light]
    result['c vs light'] = [auc_c_light]
    result['name'] = [label]
    return pandas.DataFrame(result)
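A hypothetical call (made-up AUC values) illustrating the one-row frame this helper returns:

generate_result(0.963, 0.996, 0.989, label='example')
#    b vs c  b vs light  c vs light     name
# 0   0.963       0.996       0.989  example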
In [5]:
import root_numpy
import pandas
from rep.data import LabeledDataStorage
from hep_ml.decisiontrain import DecisionTrainClassifier, DecisionTrainRegressor
from hep_ml.losses import LogLossFunction, MSELossFunction
from rep.metaml import FoldingClassifier, FoldingRegressor
from rep.report import ClassificationReport
from rep.report.metrics import RocAuc
from sklearn.metrics import roc_auc_score
In [6]:
treename = 'tag'
# jet flavour files: type=5 -> b jets, type=4 -> c jets, type=0 -> light jets
data_b = pandas.DataFrame(root_numpy.root2array('datasets/type=5.root', treename=treename)).dropna()
data_b = data_b[::40]  # thin the b sample, keeping every 40th event
data_c = pandas.DataFrame(root_numpy.root2array('datasets/type=4.root', treename=treename)).dropna()
data_light = pandas.DataFrame(root_numpy.root2array('datasets/type=0.root', treename=treename)).dropna()
In [7]:
data = {'b': data_b, 'c': data_c, 'light': data_light}
In [8]:
jet_features = [column for column in data_b.columns if "Jet" in column]
sv_features = [column for column in data_b.columns if "SV" in column]
print "Jet features", ", ".join(jet_features)
print "SV features", ", ".join(sv_features)
In [10]:
for d in data.values():
    d['log_SVFDChi2'] = numpy.log(d['SVFDChi2'].values)
    d['log_SVSumIPChi2'] = numpy.log(d['SVSumIPChi2'].values)
    d['SVM_diff'] = numpy.log(d['SVMC'] ** 2 - d['SVM'] ** 2)
    d['SVM_rel'] = numpy.tanh(d['SVM'] / d['SVMC'])
    d['SVM_rel2'] = (d['SVM'] / d['SVMC']) ** 2
    d['SVR_rel'] = d['SVDR'] / (d['SVR'] + 1e-5)
    d['R_FD_rel'] = numpy.tanh(d['SVR'] / d['SVFDChi2'])
    d['jetP'] = numpy.sqrt(d['JetPx'] ** 2 + d['JetPy'] ** 2 + d['JetPz'] ** 2)
    d['jetPt'] = numpy.sqrt(d['JetPx'] ** 2 + d['JetPy'] ** 2)
    d['jetM'] = numpy.sqrt(d['JetE'] ** 2 - d['jetP'] ** 2)
    d['SV_jet_M_rel'] = d['SVM'] / d['jetM']
    d['SV_jet_MC_rel'] = d['SVMC'] / d['jetM']
#     full_data['P_Sin'] = 0.5 * d['SVMC'].values - (d['SVM'].values)**2 / (2. * d['SVMC'].values)
#     full_data['Psv'] = d['SVPT'].values * d['P_Sin'].values
#     full_data['Psv2'] = d['P_Sin'].values / d['SVPT'].values
#     full_data['Mt'] = d['SVMC'].values - d['P_Sin'].values
#     full_data['QtoN'] = 1. * d['SVQ'].values / d['SVN'].values
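The engineered jet kinematics above are the standard relativistic relations (restating the code, with $E$, $p_x$, $p_y$, $p_z$ the jet four-momentum components):

$$p_{\text{jet}} = \sqrt{p_x^2 + p_y^2 + p_z^2}, \qquad p_T = \sqrt{p_x^2 + p_y^2}, \qquad m_{\text{jet}} = \sqrt{E^2 - p_{\text{jet}}^2}.$$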
In [14]:
data_b = data_b.drop(['JetParton', 'JetFlavor', 'JetPx', 'JetPy'], axis=1)
data_c = data_c.drop(['JetParton', 'JetFlavor', 'JetPx', 'JetPy'], axis=1)
data_light = data_light.drop(['JetParton', 'JetFlavor', 'JetPx', 'JetPy'], axis=1)
In [23]:
jet_features = [column for column in data_b.columns if "Jet" in column]
In [11]:
additional_features = ['log_SVFDChi2', 'log_SVSumIPChi2',
                       'SVM_diff', 'SVM_rel', 'SVM_rel2', 'SVR_rel', 'R_FD_rel',
                       'jetP', 'jetPt', 'jetM', 'SV_jet_M_rel', 'SV_jet_MC_rel']
In [62]:
figsize(18, 60)
n_subplot_rows = (len(data_b.columns) + 2) // 3  # ceil(n_features / 3) rows of 3 plots
for i, feature in enumerate(data_b.columns, 1):  # subplot indices are 1-based
    subplot(n_subplot_rows, 3, i)
    hist(data_b[feature].values, label='b', alpha=0.2, bins=60, normed=True)
    hist(data_c[feature].values, label='c', alpha=0.2, bins=60, normed=True)
#     hist(data_light[feature].values, label='light', alpha=0.2, bins=60, normed=True)
    xlabel(feature); legend(loc='best');
    title(roc_auc_score([0] * len(data_b) + [1] * len(data_c),
                        numpy.hstack([data_b[feature].values, data_c[feature].values])))
In [16]:
len(data_b), len(data_c), len(data_light)
Out[16]:
In [17]:
jet_features = jet_features[2:]
In [18]:
data_b_c_lds = LabeledDataStorage(pandas.concat([data_b, data_c]), [1] * len(data_b) + [0] * len(data_c))
data_c_light_lds = LabeledDataStorage(pandas.concat([data_c, data_light]), [1] * len(data_c) + [0] * len(data_light))
data_b_light_lds = LabeledDataStorage(pandas.concat([data_b, data_light]), [1] * len(data_b) + [0] * len(data_light))
In [19]:
def one_vs_one_training(base_estimators, data_b_c_lds, data_c_light_lds, data_b_light_lds, full_data,
                        prefix='bdt', folding=True, features=None):
    if folding:
        tt_folding_b_c = FoldingClassifier(base_estimators[0], n_folds=2, random_state=11,
                                           parallel_profile=PROFILE, features=features)
        tt_folding_c_light = FoldingClassifier(base_estimators[1], n_folds=2, random_state=11,
                                               parallel_profile=PROFILE, features=features)
        tt_folding_b_light = FoldingClassifier(base_estimators[2], n_folds=2, random_state=11,
                                               parallel_profile=PROFILE, features=features)
    else:
        tt_folding_b_c = base_estimators[0]
        tt_folding_b_c.features = features
        tt_folding_c_light = base_estimators[1]
        tt_folding_c_light.features = features
        tt_folding_b_light = base_estimators[2]
        tt_folding_b_light.features = features
    %time tt_folding_b_c.fit_lds(data_b_c_lds)
    %time tt_folding_c_light.fit_lds(data_c_light_lds)
    %time tt_folding_b_light.fit_lds(data_b_light_lds)
    # stacked features follow the row order of full_data: [data_b, data_c, data_light];
    # events used in training get out-of-fold predictions, the third class is scored as unseen data
    bdt_b_c = numpy.concatenate([tt_folding_b_c.predict_proba(pandas.concat([data_b, data_c])),
                                 tt_folding_b_c.predict_proba(data_light)])[:, 1]
    bdt_c_light = numpy.concatenate([tt_folding_c_light.predict_proba(data_b),
                                     tt_folding_c_light.predict_proba(pandas.concat([data_c, data_light]))])[:, 1]
    p_b_light = tt_folding_b_light.predict_proba(pandas.concat([data_b, data_light]))[:, 1]
    bdt_b_light = numpy.concatenate([p_b_light[:len(data_b)], tt_folding_b_light.predict_proba(data_c)[:, 1],
                                     p_b_light[len(data_b):]])
    full_data[prefix + '_b_c'] = bdt_b_c
    full_data[prefix + '_b_light'] = bdt_b_light
    full_data[prefix + '_c_light'] = bdt_c_light
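The stacking above relies on rep's FoldingClassifier: when asked to predict the same sample it was trained on, it returns out-of-fold predictions (each event is scored by the fold estimator that did not see it), so the stacked features stay unbiased. A minimal sketch on made-up toy data, assuming only the standard fit/predict_proba interface:

import numpy, pandas
from sklearn.linear_model import LogisticRegression
from rep.metaml import FoldingClassifier

# hypothetical toy sample: 1000 events, 3 features, labels driven by f0
toy = pandas.DataFrame(numpy.random.normal(size=(1000, 3)), columns=['f0', 'f1', 'f2'])
toy_labels = (toy['f0'] + numpy.random.normal(size=1000) > 0).astype(int)

folding = FoldingClassifier(LogisticRegression(), n_folds=2, random_state=11)
folding.fit(toy, toy_labels)
oof = folding.predict_proba(toy)[:, 1]  # out-of-fold probabilities for the training sample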
In [20]:
full_data = pandas.concat([data_b, data_c, data_light])
full_data['label'] = [0] * len(data_b) + [1] * len(data_c) + [2] * len(data_light)
In [21]:
from hep_ml.nnet import MLPClassifier
from rep.estimators import SklearnClassifier
In [25]:
one_vs_one_training([SklearnClassifier(MLPClassifier(layers=(30, 10), epochs=700, random_state=11))]*3,
data_b_c_lds, data_c_light_lds, data_b_light_lds, full_data, 'mlp', folding=True,
features=sv_features + additional_features + jet_features)
In [27]:
from sklearn.linear_model import LogisticRegression
one_vs_one_training([LogisticRegression()]*3,
data_b_c_lds, data_c_light_lds, data_b_light_lds, full_data,
'logistic', folding=True, features=sv_features + additional_features + jet_features)
In [26]:
# from sklearn.svm import SVC
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler
# svm_feat = SklearnClassifier(make_pipeline(StandardScaler(), SVC(probability=True)), features=sv_features)
# %time svm_feat.fit(data_b_c_lds.data, data_b_c_lds.target)
In [28]:
# from sklearn.neighbors import KNeighborsClassifier
# one_vs_one_training([KNeighborsClassifier(metric='canberra')]*3,
# data_b_c_lds, data_c_light_lds, data_b_light_lds, full_data,
# 'knn', folding=True, features=sv_features)
In [29]:
# from rep.estimators import TheanetsClassifier
# theanets_base = TheanetsClassifier(layers=(20, 10), trainers=[{'algo': 'adadelta', 'learning_rate': 0.1}, ])
# nn = FoldingClassifier(theanets_base, features=sv_features, random_state=11, parallel_profile='ssh-py2')
# nn.fit(full_data, full_data.label)
# multi_probs = nn.predict_proba(full_data)
In [30]:
# full_data['th_0'] = multi_probs[:, 0] / multi_probs[:, 1]
# full_data['th_1'] = multi_probs[:, 0] / multi_probs[:, 2]
# full_data['th_2'] = multi_probs[:, 1] / multi_probs[:, 2]
In [31]:
mlp_features = ['mlp_b_c', 'mlp_b_light', 'mlp_c_light']
# knn_features = ['knn_b_c', 'knn_b_light', 'knn_c_light']
# th_features = ['th_0', 'th_1', 'th_2']
logistic_features = ['logistic_b_c', 'logistic_b_light', 'logistic_c_light']
In [32]:
data_multi_lds = LabeledDataStorage(full_data, 'label')
In [55]:
variables_final = set(sv_features + additional_features + jet_features + mlp_features)
# variables_final = variables_final - {'SVN', 'SVQ', 'log_SVFDChi2', 'log_SVSumIPChi2', 'SVM_rel2', 'JetE', 'JetNDis'}
variables_final = list(variables_final)  # fix the feature order before passing to the classifier
In [67]:
from rep.estimators import XGBoostClassifier
xgb_base = XGBoostClassifier(n_estimators=3000, colsample=0.7, eta=0.005, nthreads=8,
subsample=0.7, max_depth=6)
multi_folding_rbf = FoldingClassifier(xgb_base, n_folds=2, random_state=11,
features=variables_final)
%time multi_folding_rbf.fit_lds(data_multi_lds)
Out[67]:
In [68]:
multi_probs = multi_folding_rbf.predict_proba(full_data)
'log loss', -numpy.log(multi_probs[numpy.arange(len(multi_probs)), full_data['label'].values]).sum() / len(full_data)
Out[68]:
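The number printed above is the average multiclass negative log-likelihood over all events:

$$\text{log loss} = -\frac{1}{N} \sum_{i=1}^{N} \log p_{i,\,y_i},$$

where $p_{i,\,y_i}$ is the probability the model assigns to the true class $y_i$ of event $i$.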
In [69]:
multi_folding_rbf.get_feature_importances()
Out[69]:
In [70]:
labels = full_data['label'].values.astype(int)
multiclass_result = generate_result(
    1 - roc_auc_score(labels > 0, multi_probs[:, 0] / multi_probs[:, 1], sample_weight=(labels != 2) * 1),
    1 - roc_auc_score(labels > 1, multi_probs[:, 0] / multi_probs[:, 2], sample_weight=(labels != 1) * 1),
    1 - roc_auc_score(labels > 1, multi_probs[:, 1] / multi_probs[:, 2], sample_weight=(labels != 0) * 1),
    label='multiclass')
result = pandas.concat([multiclass_result])
result.index = result['name']
result = result.drop('name', axis=1)
result
Out[70]:
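Each pairwise AUC above is recovered from the multiclass probabilities by ranking with a likelihood ratio and restricting to two classes via sample_weight; e.g. for b vs c (light jets excluded by the weight):

$$\mathrm{AUC}_{b\ \text{vs}\ c} = 1 - \mathrm{AUC}\big(y = [\text{label} = c],\ s = p_b / p_c\big),$$

where the $1 - {}$ flips the orientation, since the score $p_b / p_c$ ranks b jets (the negative class here) highest.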
Previous models (AUCs for b vs c, b vs light, c vs light; the value in parentheses is the multiclass log loss):

Jet features:

- 0.963468, 0.996211, 0.988844 (0.22622155319396442; with MLP and additional features)
- 0.96358, 0.996209, 0.988789 (0.22569323022675056; n_estimators=3000, colsample=0.7, eta=0.005, nthreads=8, subsample=0.7, max_depth=6)
In [ ]: