In [1]:
%pylab inline
PROFILE = None #'threads-2'
In [2]:
matplotlib.rc('font', size=16)
In [3]:
from collections import OrderedDict
def generate_result(auc_b_c, auc_b_light, auc_c_light, log_loss_value=-1, label=""):
    """Assemble a one-row summary table of classifier quality metrics.

    Parameters
    ----------
    auc_b_c : float
        ROC AUC separating b-jets from c-jets.
    auc_b_light : float
        ROC AUC separating b-jets from light jets.
    auc_c_light : float
        ROC AUC separating c-jets from light jets.
    log_loss_value : float, optional
        Multiclass log-loss. Defaults to -1, the "not computed" sentinel the
        notebook passes explicitly elsewhere; the default also fixes the
        ranker cells that omit this argument (previously a TypeError).
    label : str, optional
        Human-readable name for this model/configuration row.

    Returns
    -------
    pandas.DataFrame
        One row with columns in fixed order:
        name, logloss, b vs c, b vs light, c vs light.
    """
    # OrderedDict keeps the column order stable in the resulting DataFrame.
    result = OrderedDict()
    result['name'] = [label]
    result['logloss'] = [log_loss_value]
    result['b vs c'] = [auc_b_c]
    result['b vs light'] = [auc_b_light]
    result['c vs light'] = [auc_c_light]
    return pandas.DataFrame(result)
In [4]:
import root_numpy
import pandas
from rep.data import LabeledDataStorage
from hep_ml.decisiontrain import DecisionTrainClassifier, DecisionTrainRegressor
from hep_ml.losses import LogLossFunction, MSELossFunction
from rep.metaml import FoldingClassifier, FoldingRegressor
from rep.report import ClassificationReport
from rep.report.metrics import RocAuc
from sklearn.metrics import roc_auc_score
from rep.estimators import SklearnClassifier
In [5]:
treename = 'tag'
numpy.random.seed(11)
data_b = pandas.DataFrame(root_numpy.root2array('datasets/type=5.root', treename=treename)).dropna()
data_b = data_b.ix[numpy.random.choice(numpy.arange(len(data_b)), replace=False, size=40000), :]
data_c = pandas.DataFrame(root_numpy.root2array('datasets/type=4.root', treename=treename)).dropna()
data_light = pandas.DataFrame(root_numpy.root2array('datasets/type=0.root', treename=treename)).dropna()
In [6]:
set(data_light.JetParton)
Out[6]:
In [7]:
def add_features(*arrays):
    """Derive log-scaled and ratio features for each jet DataFrame.

    Each input DataFrame must contain the columns:
    SVFDChi2, SVMC, SVM, SVSumIPChi2, SVPT, SVR, SVQ, SVN.

    The heavy-tailed raw columns (SVFDChi2, SVSumIPChi2, SVMC, SVM) are
    replaced by log1p-transformed versions plus several mass/momentum
    ratios. Works on a copy, so the callers' DataFrames are not mutated
    (the original version added columns to the inputs in place; callers
    reassign the return values, so the net result is unchanged).

    Returns a list of new DataFrames, one per input, in the same order.
    """
    new_data = []
    for source in arrays:
        data = source.copy()  # do not mutate the caller's frame
        data['log_SVFDChi2'] = numpy.log1p(data['SVFDChi2'].values)
        data['log_SVMC'] = numpy.log1p(data['SVMC'].values)
        data['log_SVM'] = numpy.log1p(data['SVM'].values)
        data['log_SVSumIPChi2'] = numpy.log1p(data['SVSumIPChi2'].values)
        data['SV_M_PT'] = numpy.log1p(data['SVM'] / data['SVPT'])
        data['SV_MC_PT'] = numpy.log1p(data['SVMC'] / data['SVPT'])
        # SVM_diff must be computed before SV_Mdiff_PT, which reads it.
        data['SVM_diff'] = numpy.log1p(data['SVMC'] ** 2 - data['SVM']**2)
        data['SV_Mdiff_PT'] = numpy.log1p(data['SVM_diff'] / data['SVPT'])
        data['SV_theta'] = numpy.log1p((data['SVMC'] ** 2 - data['SVM']**2) / data['SVPT'])
        data['SVM_rel'] = numpy.log1p(data['SVM'] / data['SVMC'])
        data['SV_R_FD_rel'] = numpy.log1p(data['SVR'] / data['SVFDChi2'])
        # 1. * forces float division (the notebook runs under Python 2).
        data['SV_Q_N_rel'] = 1. * data['SVQ'] / data['SVN']
        # Drop the raw columns now superseded by their log1p versions.
        data = data.drop(['SVFDChi2', 'SVSumIPChi2', 'SVMC', 'SVM'], axis=1)
        new_data.append(data)
    return new_data
In [8]:
data_b, data_c, data_light = add_features(data_b, data_c, data_light)
In [9]:
len(data_b), len(data_c), len(data_light)
Out[9]:
In [10]:
jet_features = [column for column in data_b.columns if "Jet" in column]
sv_features = [column for column in data_b.columns if "SV" in column]
In [11]:
print "Jet features", ", ".join(jet_features)
print "SV features", ", ".join(sv_features)
In [12]:
jet_features_base = ['JetQ', 'JetSigma1', 'JetSigma2', 'JetMult', 'JetPTHard', 'JetPTD', 'JetNDis']
In [13]:
figsize(18, 24)
# Overlay b / c / light distributions for every secondary-vertex feature.
for i, feature in enumerate(sv_features):
    # subplot panel numbers are 1-based; the original passed `i` starting
    # at 0, which raises for the first panel -- use i + 1.
    subplot(len(sv_features) / 3 + 1, 3, i + 1)
    hist(data_b[feature].values, label='b', alpha=0.2, bins=60, normed=True)
    hist(data_c[feature].values, label='c', alpha=0.2, bins=60, normed=True)
    hist(data_light[feature].values, label='light', alpha=0.2, bins=60, normed=True)
    title(feature); legend(loc='best');
In [14]:
labels = numpy.array([0] * len(data_b) + [1] * len(data_c) + [2] * len(data_light))
full_data = pandas.concat([data_b, data_c, data_light])
full_data.index = range(len(full_data))
In [15]:
data_b_c_lds = LabeledDataStorage(pandas.concat([data_b, data_c]), [1] * len(data_b) + [0] * len(data_c))
data_c_light_lds = LabeledDataStorage(pandas.concat([data_c, data_light]), [1] * len(data_c) + [0] * len(data_light))
data_b_light_lds = LabeledDataStorage(pandas.concat([data_b, data_light]), [1] * len(data_b) + [0] * len(data_light))
data_bc_light_lds = LabeledDataStorage(full_data, [1] * len(data_b) + [1] * len(data_c) + [0] * len(data_light))
In [16]:
baseline_features = ['log_SVM', 'log_SVMC', 'SVR', 'SVPT', 'SVDR', 'SVN', 'SVQ', 'log_SVFDChi2', 'log_SVSumIPChi2']
In [17]:
tt_base = DecisionTrainClassifier(learning_rate=0.1, n_estimators=2000, depth=6, pretransform_needed=True,
max_features=6, loss=LogLossFunction(regularization=30))
tt_folding_base_b_c = FoldingClassifier(SklearnClassifier(tt_base), n_folds=2,
random_state=11, parallel_profile='threads-2',
features=baseline_features)
%time tt_folding_base_b_c.fit_lds(data_b_c_lds)
pass
In [18]:
report_base_b_c = tt_folding_base_b_c.test_on_lds(data_b_c_lds)
In [19]:
report_base_b_c.learning_curve(RocAuc()).plot(new_plot=True)
In [20]:
report_base_b_c.feature_importance()
Out[20]:
In [21]:
auc_base_b_c = report_base_b_c.compute_metric(RocAuc())['clf']
auc_base_b_c
Out[21]:
In [22]:
tt_base = DecisionTrainClassifier(learning_rate=0.1, n_estimators=15000, depth=6, pretransform_needed=True,
max_features=6, loss=LogLossFunction(regularization=30))
tt_folding_base_bc_light = FoldingClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=11,
parallel_profile='threads-2', features=baseline_features)
%time tt_folding_base_bc_light.fit_lds(data_bc_light_lds)
Out[22]:
In [23]:
report_base_bc_light = tt_folding_base_bc_light.test_on_lds(data_bc_light_lds)
In [24]:
report_base_bc_light.learning_curve(RocAuc()).plot(new_plot=True)
In [25]:
report_base_bc_light.feature_importance()
Out[25]:
In [26]:
print report_base_bc_light.compute_metric(RocAuc())
probs_base_bc_light = tt_folding_base_bc_light.predict_proba(full_data)[:, 1]
baseline_result = generate_result(auc_base_b_c,
roc_auc_score(labels < 1, probs_base_bc_light, sample_weight=(labels != 1) * 1),
roc_auc_score(labels < 2, probs_base_bc_light, sample_weight=(labels != 0) * 1),
-1,
label='baseline')
In [27]:
baseline_result
Out[27]:
In [28]:
tt_base = DecisionTrainClassifier(learning_rate=0.1, n_estimators=2000, depth=6, pretransform_needed=True,
max_features=6, loss=LogLossFunction(regularization=30))
tt_folding_b_c = FoldingClassifier(SklearnClassifier(tt_base), n_folds=2,
random_state=11, parallel_profile='threads-2',
features=sv_features)
%time tt_folding_b_c.fit_lds(data_b_c_lds)
pass
In [29]:
report_b_c = tt_folding_b_c.test_on_lds(data_b_c_lds)
report_b_c.learning_curve(RocAuc()).plot(new_plot=True)
In [30]:
report_b_c.feature_importance()
Out[30]:
In [31]:
report_b_c.compute_metric(RocAuc())['clf']
Out[31]:
In [32]:
tt_base = DecisionTrainClassifier(learning_rate=0.1, n_estimators=7000, depth=6, pretransform_needed=True,
max_features=6, loss=LogLossFunction(regularization=30))
tt_folding_c_light = FoldingClassifier(SklearnClassifier(tt_base), n_folds=2,
random_state=11, parallel_profile='threads-2',
features=sv_features)
%time tt_folding_c_light.fit_lds(data_c_light_lds)
pass
In [33]:
report_c_light = tt_folding_c_light.test_on_lds(data_c_light_lds)
report_c_light.learning_curve(RocAuc()).plot(new_plot=True)
In [34]:
report_c_light.feature_importance()
Out[34]:
In [35]:
report_c_light.compute_metric(RocAuc())['clf']
Out[35]:
In [36]:
tt_base = DecisionTrainClassifier(learning_rate=0.1, n_estimators=10000, depth=6, pretransform_needed=True,
max_features=6, loss=LogLossFunction(regularization=30))
tt_folding_b_light = FoldingClassifier(SklearnClassifier(tt_base), n_folds=2,
random_state=11, parallel_profile='threads-2',
features=sv_features)
%time tt_folding_b_light.fit_lds(data_b_light_lds)
pass
In [37]:
report_b_light = tt_folding_b_light.test_on_lds(data_b_light_lds)
report_b_light.learning_curve(RocAuc()).plot(new_plot=True)
In [38]:
report_b_light.compute_metric(RocAuc())['clf']
Out[38]:
In [39]:
# Build per-event signal probabilities aligned with `full_data` row order
# (b block, then c block, then light block). Each folding classifier only
# returns out-of-fold predictions for the data it was trained on, so the
# third class is predicted separately and concatenated back in order.
# NOTE(review): these three arrays are not used by the visible cells below
# (one_vs_one_result is built from the reports) -- possibly leftover.
probs_b_c = numpy.concatenate([tt_folding_b_c.predict_proba(pandas.concat([data_b, data_c])),
tt_folding_b_c.predict_proba(data_light)])[:, 1]
probs_c_light = numpy.concatenate([tt_folding_c_light.predict_proba(data_b),
tt_folding_c_light.predict_proba(pandas.concat([data_c, data_light]))])[:, 1]
# For b-vs-light the c events were not in the training set; their scores
# are spliced in between the b and light slices to restore b/c/light order.
probs_b_light = tt_folding_b_light.predict_proba(pandas.concat([data_b, data_light]))[:, 1]
probs_b_light = numpy.concatenate([probs_b_light[:len(data_b)], tt_folding_b_light.predict_proba(data_c)[:, 1],
probs_b_light[len(data_b):]])
In [40]:
one_vs_one_result = generate_result(report_b_c.compute_metric(RocAuc())['clf'],
report_b_light.compute_metric(RocAuc())['clf'],
report_c_light.compute_metric(RocAuc())['clf'],
-1, label='one vs one, add features')
In [41]:
tt_base = DecisionTrainClassifier(learning_rate=0.1, n_estimators=15000, depth=6, pretransform_needed=True,
max_features=6, loss=LogLossFunction(regularization=30))
tt_folding_base_add_bc_light = FoldingClassifier(SklearnClassifier(tt_base), n_folds=2, random_state=11,
parallel_profile='threads-2', features=sv_features)
%time tt_folding_base_add_bc_light.fit_lds(data_bc_light_lds)
Out[41]:
In [42]:
report_base_add_bc_light = tt_folding_base_add_bc_light.test_on_lds(data_bc_light_lds)
report_base_add_bc_light.learning_curve(RocAuc()).plot(new_plot=True)
In [43]:
report_base_add_bc_light.feature_importance()
Out[43]:
In [44]:
print report_base_add_bc_light.compute_metric(RocAuc())
probs_base_add_bc_light = tt_folding_base_add_bc_light.predict_proba(full_data)[:, 1]
baseline_add_result = generate_result(report_b_c.compute_metric(RocAuc())['clf'],
roc_auc_score(labels < 1, probs_base_add_bc_light,
sample_weight=(labels != 1) * 1),
roc_auc_score(labels < 2, probs_base_add_bc_light,
sample_weight=(labels != 0) * 1),
-1,
label='baseline, add features')
In [45]:
pandas.concat([baseline_result, baseline_add_result, one_vs_one_result])
Out[45]:
In [46]:
from rep_ef.estimators import MatrixNetClassifier
In [47]:
mn_base = MatrixNetClassifier(connection_url='mn',
iterations=5000, regularization=0.02, sync=False)
mn_folding_b_c = FoldingClassifier(mn_base, n_folds=2, random_state=11,
parallel_profile='threads-2', features=sv_features)
%time mn_folding_b_c.fit_lds(data_b_c_lds)
Out[47]:
In [48]:
report_mn_b_c = mn_folding_b_c.test_on_lds(data_b_c_lds)
report_mn_b_c.learning_curve(RocAuc()).plot(new_plot=True)
In [49]:
report_mn_b_c.compute_metric(RocAuc())['clf']
Out[49]:
In [ ]:
mn_base = MatrixNetClassifier(connection_url='mn',
iterations=5000, regularization=0.03, sync=False)
mn_folding_bc_light = FoldingClassifier(mn_base, n_folds=2, random_state=11,
parallel_profile='threads-2', features=sv_features)
%time mn_folding_bc_light.fit_lds(data_bc_light_lds)
In [56]:
report_mn_bc_light = mn_folding_bc_light.test_on_lds(data_bc_light_lds)
report_mn_bc_light.learning_curve(RocAuc()).plot(new_plot=True)
In [57]:
report_mn_bc_light.compute_metric(RocAuc())['clf']
Out[57]:
In [58]:
probs_mn_bc_light = mn_folding_bc_light.predict_proba(full_data)[:, 1]
mn_result = generate_result(report_mn_b_c.compute_metric(RocAuc())['clf'],
roc_auc_score(labels < 1, probs_mn_bc_light,
sample_weight=(labels != 1) * 1),
roc_auc_score(labels < 2, probs_mn_bc_light,
sample_weight=(labels != 0) * 1),
-1,
label='mn, add features')
In [59]:
pandas.concat([baseline_result, baseline_add_result, one_vs_one_result, mn_result])
Out[59]:
In [60]:
from hep_ml.nnet import MLPClassifier
from rep.estimators import SklearnClassifier, TheanetsClassifier
In [61]:
from utils import train_one_vs_one
In [62]:
mlp_columns = train_one_vs_one([SklearnClassifier(MLPClassifier(layers=(30, 10), scaler='iron',
epochs=700, random_state=11))]*3,
data_b, data_c, data_light, 'mlp', folding=True,
n_folds=2, features=sv_features)
In [63]:
full_data = pandas.concat([full_data, mlp_columns], axis=1)
In [65]:
mlp_features = list(mlp_columns.columns)
In [69]:
from sklearn.ensemble import AdaBoostClassifier
from hep_ml.nnet import MLPMultiClassifier
In [85]:
combo_base = AdaBoostClassifier(MLPMultiClassifier(layers=(40, 30, 20, 10), random_state=11, epochs=700), random_state=11,
n_estimators=10, learning_rate=0.1)
combo_multi = FoldingClassifier(SklearnClassifier(combo_base), n_folds=2, random_state=13,
features=sv_features)
%time combo_multi.fit(full_data, labels)
Out[85]:
In [86]:
# `log_loss` is only imported in a much later cell (In [73] at the bottom of
# the file); import it here so this cell survives Restart & Run All.
from sklearn.metrics import log_loss

combo_multi_probs = combo_multi.predict_proba(full_data)
# NOTE(review): this bare expression's value is discarded (it is not the
# last expression of the cell); the log-loss is recomputed inside
# generate_result below.
log_loss(labels, combo_multi_probs)
# AUCs are built from probability ratios; `1 - roc_auc_score(labels > k, ...)`
# flips orientation so that the "heavier flavour" class scores higher.
combo_multi_result = generate_result(1 - roc_auc_score(labels > 0, combo_multi_probs[:, 0] / combo_multi_probs[:, 1],
                                                       sample_weight=(labels != 2) * 1),
                                     1 - roc_auc_score(labels > 1, combo_multi_probs[:, 0] / combo_multi_probs[:, 2],
                                                       sample_weight=(labels != 1) * 1),
                                     1 - roc_auc_score(labels > 1, combo_multi_probs[:, 1] / combo_multi_probs[:, 2],
                                                       sample_weight=(labels != 0) * 1),
                                     log_loss(labels, combo_multi_probs),
                                     label='multiclass combo')
pandas.concat([baseline_result, baseline_add_result, one_vs_one_result,
               mn_result, combo_multi_result])
Out[86]:
In [75]:
Out[75]:
In [64]:
sv_features
Out[64]:
In [ ]:
sv_features_without_categorial = list(set(sv_features) - {'SVN', 'SVQ', "SV_Q_N_rel"})
In [ ]:
mlp_columns_without_categorial = train_one_vs_one([SklearnClassifier(MLPClassifier(
layers=(30, 10), epochs=700, scaler='iron', random_state=11))]*3,
data_b, data_c, data_light, 'rbf_cat', folding=True,
n_folds=3, features=sv_features_without_categorial)
In [62]:
full_data = pandas.concat([full_data, mlp_columns_without_categorial], axis=1)
In [63]:
mlp_columns_group1 = train_one_vs_one([SklearnClassifier(MLPClassifier(
layers=(20, 10), epochs=400, random_state=11))]*3,
data_b, data_c, data_light, 'gr1', folding=True,
n_folds=3, features=['log_SVM', 'log_SVMC', 'SVR', 'log_SVFDChi2'])
In [64]:
mlp_columns_group2 = train_one_vs_one([SklearnClassifier(MLPClassifier(
layers=(20, 10), epochs=400, random_state=11))]*3,
data_b, data_c, data_light, 'gr2', folding=True,
n_folds=3, features=['SVPT', 'SVDR', 'SVR', 'log_SVFDChi2'])
In [65]:
full_data = pandas.concat([full_data, mlp_columns_group1], axis=1)
full_data = pandas.concat([full_data, mlp_columns_group2], axis=1)
In [66]:
mn_columns = train_one_vs_one([MatrixNetClassifier(connection='mn', connection_auth='AUTH_HEADERS',
iterations=5000, regularization=0.02, sync=False)]*3,
data_b, data_c, data_light, 'mn', folding=True,
n_folds=2, features=sv_features)
In [67]:
full_data = pandas.concat([full_data, mn_columns], axis=1)
In [68]:
mpl_features = list(mlp_columns.columns)
mn_features = list(mn_columns.columns)
rbf_features_cat = list(mlp_columns_without_categorial.columns)
mlp1_features = list(mlp_columns_group1.columns)
mlp2_features = list(mlp_columns_group2.columns)
In [69]:
from sklearn.ensemble import RandomForestClassifier
In [70]:
rf_folding = FoldingClassifier(RandomForestClassifier(n_estimators=500, max_depth=8, min_samples_split=100,
min_samples_leaf=20, random_state=11),
n_folds=2, random_state=13,
features=sv_features)
rf_folding.fit_lds(data_b_c_lds)
Out[70]:
In [71]:
p = rf_folding.predict_proba(pandas.concat([data_b, data_c]))[:, 1]
In [72]:
p_b = p[:len(data_b)]
p_c = p[len(data_b):]
In [73]:
figure(figsize=(8, 5))
scatter(p_b, numpy.exp(data_b.log_SVM.values) + 0.01, alpha=0.05)
scatter(p_c, numpy.exp(data_c.log_SVM.values) + 0.01, alpha=0.05, color='r')
xlim(0, 1.01)
ylim(0, 5)
Out[73]:
In [74]:
figure(figsize=(8, 5))
hist(p[:len(data_b)], bins=100, alpha=0.4, normed=True)
hist(p[len(data_b):], bins=100, alpha=0.4, normed=True)
pass
In [75]:
# Split b and c events by the random-forest b-vs-c probability: confident
# events go to the "new" training subsets, the rest to "remain".
# NOTE(review): both splits use strict inequalities, so events with a
# probability of exactly 0.8 fall into neither subset -- confirm intended.
# NOTE(review): .ix is deprecated in modern pandas (use .loc/.iloc).
data_b_new = full_data[labels == 0].ix[p_b > 0.8, :]
data_c_new = full_data[labels == 1].ix[p_c < 0.8, :]
In [76]:
data_b_remain = full_data[labels == 0].ix[p_b < 0.8, :]
data_c_remain = full_data[labels == 1].ix[p_c > 0.8, :]
In [77]:
labels_new = numpy.array([0] * len(data_b_new) + [1] * len(data_c_new) + [2] * len(data_light))
full_data_new = pandas.concat([data_b_new, data_c_new, full_data[labels == 2]])
full_data_new.index = range(len(full_data_new))
In [78]:
data_multi_lds_new = LabeledDataStorage(full_data_new, labels_new)
In [79]:
from rep.estimators import XGBoostClassifier
xgb_base = XGBoostClassifier(n_estimators=3000, colsample=0.7, eta=0.01, nthreads=16,
subsample=0.7, max_depth=6)
multi_folding = FoldingClassifier(xgb_base, n_folds=2, random_state=13,
features=sv_features)
%time multi_folding.fit_lds(data_multi_lds_new)
Out[79]:
In [80]:
p1 = multi_folding.predict_proba(full_data_new)
p2 = multi_folding.predict_proba(data_b_remain)
p3 = multi_folding.predict_proba(data_c_remain)
In [81]:
p = numpy.concatenate([p1, p2, p3])
labels_temp = numpy.array(list(labels_new) + [0] * len(p2) + [1] * len(p3))
In [73]:
from sklearn.metrics import log_loss
In [84]:
multi_result_selection = generate_result(1 - roc_auc_score(labels_temp > 0, p[:, 0] / p[:, 1],
sample_weight=(labels_temp != 2) * 1),
1 - roc_auc_score(labels_temp > 1, p[:, 0] / p[:, 2],
sample_weight=(labels_temp != 1) * 1),
1 - roc_auc_score(labels_temp > 1, p[:, 1] / p[:, 2],
sample_weight=(labels_temp != 0) * 1),
log_loss(labels_temp, p),
label='multiclass, NN features + selection')
pandas.concat([baseline_result, baseline_add_result, one_vs_one_result, mn_result, multi_result_selection])
Out[84]:
In [105]:
data_multi_lds = LabeledDataStorage(full_data, labels)
In [106]:
from rep.estimators import XGBoostClassifier
xgb_base = XGBoostClassifier(n_estimators=3000, colsample=0.7, eta=0.01, nthreads=16,
subsample=0.7, max_depth=6)
multi_folding = FoldingClassifier(xgb_base, n_folds=2, random_state=13,
features=sv_features + mlp1_features + mlp2_features)
%time multi_folding.fit_lds(data_multi_lds)
Out[106]:
In [107]:
multi_report = multi_folding.test_on_lds(data_multi_lds)
In [108]:
multi_report.feature_importance().plot(new_plot=True)
In [109]:
from sklearn.metrics import log_loss
probs_xgb_mult = multi_folding.predict_proba(full_data)
print 'log loss\t', log_loss(labels, probs_xgb_mult)
In [87]:
# 0.302275 0.949335 0.988883 0.981100
# my 0.949379 0.988952 0.981269
#0.949040 0.987689 0.979919
# 0.9501 0.9865 0.9777
#0.951279 0.987887 0.98012
In [110]:
multi_result = generate_result(1 - roc_auc_score(labels > 0, probs_xgb_mult[:, 0] / probs_xgb_mult[:, 1],
sample_weight=(labels != 2) * 1),
1 - roc_auc_score(labels > 1, probs_xgb_mult[:, 0] / probs_xgb_mult[:, 2],
sample_weight=(labels != 1) * 1),
1 - roc_auc_score(labels > 1, probs_xgb_mult[:, 1] / probs_xgb_mult[:, 2],
sample_weight=(labels != 0) * 1),
log_loss(labels, probs_xgb_mult),
label='multiclass, NN features')
pandas.concat([baseline_result, baseline_add_result, one_vs_one_result, mn_result, multi_result])
Out[110]:
In [145]:
full_data['mult_xgb_0'] = probs_xgb_mult[:, 0] / probs_xgb_mult[:, 1]
full_data['mult_xgb_1'] = probs_xgb_mult[:, 0] / probs_xgb_mult[:, 2]
full_data['mult_xgb_2'] = probs_xgb_mult[:, 1] / probs_xgb_mult[:, 2]
In [146]:
xgb_features = ['mult_xgb_0', 'mult_xgb_1', 'mult_xgb_2']
In [154]:
full_data['mn'] = probs_mn_bc_light
In [163]:
from hep_ml.nnet import MLPMultiClassifier
# NOTE(review): `rbf_features` and `combo_features` are defined only in
# later cells of this notebook (out-of-order execution counts); this cell
# raises NameError on a fresh kernel until those cells have been run.
mlp_multi_base = MLPMultiClassifier(layers=(30, 10), random_state=11, epochs=700)
mlp_multi = FoldingClassifier(mlp_multi_base, n_folds=2,
features=sv_features + rbf_features + combo_features)
%time mlp_multi.fit(full_data, labels)
Out[163]:
In [164]:
mlp_multi_probs = mlp_multi.predict_proba(full_data)
log_loss(labels, mlp_multi_probs)
Out[164]:
In [217]:
mlp_multi_result = generate_result(1 - roc_auc_score(labels > 0, mlp_multi_probs[:, 0] / mlp_multi_probs[:, 1],
sample_weight=(labels != 2) * 1),
1 - roc_auc_score(labels > 1, mlp_multi_probs[:, 0] / mlp_multi_probs[:, 2],
sample_weight=(labels != 1) * 1),
1 - roc_auc_score(labels > 1, mlp_multi_probs[:, 1] / mlp_multi_probs[:, 2],
sample_weight=(labels != 0) * 1),
log_loss(labels, mlp_multi_probs),
label='multiclass NN, NN features')
pandas.concat([baseline_result, baseline_add_result, one_vs_one_result, mn_result, multi_result, mlp_multi_result])
Out[217]:
In [363]:
from hep_ml.nnet import MLPMultiClassifier
mlp_multi_base = MLPMultiClassifier(layers=(30, 10), random_state=11, epochs=700)
mlp_multi = FoldingClassifier(mlp_multi_base, n_folds=2, random_state=13,
features=sv_features + rbf_features + combo_features + jet_features)
%time mlp_multi.fit(full_data, labels)
Out[363]:
In [364]:
mlp_multi_probs = mlp_multi.predict_proba(full_data)
log_loss(labels, mlp_multi_probs)
Out[364]:
In [365]:
mlp_multi_result2 = generate_result(1 - roc_auc_score(labels > 0, mlp_multi_probs[:, 0] / mlp_multi_probs[:, 1],
sample_weight=(labels != 2) * 1),
1 - roc_auc_score(labels > 1, mlp_multi_probs[:, 0] / mlp_multi_probs[:, 2],
sample_weight=(labels != 1) * 1),
1 - roc_auc_score(labels > 1, mlp_multi_probs[:, 1] / mlp_multi_probs[:, 2],
sample_weight=(labels != 0) * 1),
log_loss(labels, mlp_multi_probs),
label='multiclass2 NN, NN features')
pandas.concat([baseline_result, baseline_add_result, one_vs_one_result,
mn_result, multi_result, mlp_multi_result, mlp_multi_result2])
Out[365]:
In [352]:
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
In [238]:
combo_base = BaggingClassifier(MLPMultiClassifier(layers=(30, 10), random_state=11, epochs=700), max_samples=0.8,
max_features=0.8, bootstrap_features=True, random_state=11,
n_estimators=10)
combo_multi = FoldingClassifier(SklearnClassifier(combo_base), n_folds=2, random_state=13,
features=sv_features + rbf_features + jet_features_base)
%time combo_multi.fit(full_data, labels)
Out[238]:
In [307]:
combo_base = AdaBoostClassifier(MLPMultiClassifier(layers=(30, 10), random_state=11, epochs=700), random_state=11,
n_estimators=10, learning_rate=0.1)
combo_multi = FoldingClassifier(SklearnClassifier(combo_base), n_folds=2, random_state=13,
features=sv_features + rbf_features)
%time combo_multi.fit(full_data, labels)
Out[307]:
In [308]:
combo_multi_probs = combo_multi.predict_proba(full_data)
log_loss(labels, combo_multi_probs)
Out[308]:
In [309]:
combo_multi_probs = combo_multi.predict_proba(full_data)
log_loss(labels, combo_multi_probs)
combo_multi_result = generate_result(1 - roc_auc_score(labels > 0, combo_multi_probs[:, 0] / combo_multi_probs[:, 1],
sample_weight=(labels != 2) * 1),
1 - roc_auc_score(labels > 1, combo_multi_probs[:, 0] / combo_multi_probs[:, 2],
sample_weight=(labels != 1) * 1),
1 - roc_auc_score(labels > 1, combo_multi_probs[:, 1] / combo_multi_probs[:, 2],
sample_weight=(labels != 0) * 1),
log_loss(labels, combo_multi_probs),
label='multiclass combo')
pandas.concat([baseline_result, baseline_add_result, one_vs_one_result,
mn_result, multi_result, mlp_multi_result, combo_multi_result])
Out[309]:
In [220]:
# NOTE(review): this cell rebuilds `combo_multi_result` with exactly the
# same expressions as the previous cell (In [309]) -- a duplicate left over
# from out-of-order editing; consider deleting one copy.
combo_multi_result = generate_result(1 - roc_auc_score(labels > 0, combo_multi_probs[:, 0] / combo_multi_probs[:, 1],
sample_weight=(labels != 2) * 1),
1 - roc_auc_score(labels > 1, combo_multi_probs[:, 0] / combo_multi_probs[:, 2],
sample_weight=(labels != 1) * 1),
1 - roc_auc_score(labels > 1, combo_multi_probs[:, 1] / combo_multi_probs[:, 2],
sample_weight=(labels != 0) * 1),
log_loss(labels, combo_multi_probs),
label='multiclass combo')
pandas.concat([baseline_result, baseline_add_result, one_vs_one_result,
mn_result, multi_result, mlp_multi_result, combo_multi_result])
Out[220]:
In [161]:
full_data['combo_0'] = combo_multi_probs[:, 0] / combo_multi_probs[:, 1]
full_data['combo_1'] = combo_multi_probs[:, 0] / combo_multi_probs[:, 2]
full_data['combo_2'] = combo_multi_probs[:, 1] / combo_multi_probs[:, 2]
In [162]:
combo_features = ['combo_0', 'combo_1', 'combo_2']
In [254]:
label_rank = labels - 1
data_ranker_lds = LabeledDataStorage(full_data, label_rank)
In [255]:
from rep.estimators import XGBoostRegressor
In [256]:
xgb_base = XGBoostRegressor(n_estimators=500, colsample=0.7, eta=0.01, nthreads=8,
subsample=0.7, max_depth=6)
xgb_folding_regression = FoldingRegressor(xgb_base, n_folds=2, random_state=11,
parallel_profile='threads-2',
features=sv_features + rbf_features)
%time xgb_folding_regression.fit_lds(data_ranker_lds)
pass
In [257]:
report_reg = xgb_folding_regression.test_on_lds(data_ranker_lds)
In [258]:
from sklearn.metrics import mean_squared_error
lc = report_reg.learning_curve(mean_squared_error, steps=10)
In [260]:
lc.plot(new_plot=True)
In [261]:
figsize(8, 6)
reg_values = xgb_folding_regression.predict(full_data)
for label in numpy.unique(labels):
hist(reg_values[labels == label],
bins=100, alpha=0.1, normed=True, label=str(label))
legend()
Out[261]:
In [262]:
full_data['xgb_reg'] = xgb_folding_regression.predict(full_data)
In [268]:
tt_base = DecisionTrainRegressor(learning_rate=0.02, n_estimators=10000, depth=6, pretransform_needed=True,
max_features=0.7, loss=MSELossFunction(regularization=100))
tt_folding_regression = FoldingRegressor(tt_base, n_folds=2, random_state=11,
parallel_profile='threads-2',
features=sv_features)
%time tt_folding_regression.fit_lds(data_ranker_lds)
pass
In [269]:
report_dt_reg = tt_folding_regression.test_on_lds(data_ranker_lds)
In [270]:
lc_dt = report_dt_reg.learning_curve(mean_squared_error, steps=1)
In [271]:
lc_dt.plot(new_plot=True)
lc.plot()
# ylim(0.16, 0.18)
In [277]:
figsize(8, 6)
reg_values = tt_folding_regression.predict(full_data)
for label in numpy.unique(labels):
hist(reg_values[labels == label],
bins=100, alpha=0.1, normed=True, range=(-0.5, 2.5), label=str(label))
xlim(-1, 2.5)
# ylim(0, 0.05)
legend()
Out[277]:
In [278]:
from hep_ml.gradientboosting import UGradientBoostingRegressor
from hep_ml.losses import RankBoostLossFunction
from rep.estimators import SklearnClassifier
In [280]:
full_data['fake_request'] = 0
In [286]:
tt_base = DecisionTrainRegressor(learning_rate=0.02, n_estimators=10000, depth=6, pretransform_needed=True,
max_features=6,
train_features=sv_features + rbf_features,
loss=RankBoostLossFunction(request_column='fake_request'))
tt_folding_ranking = FoldingRegressor(SklearnClassifier(tt_base), n_folds=2, random_state=11,
parallel_profile='threads-2', features=sv_features + rbf_features + ['fake_request'])
%time tt_folding_ranking.fit_lds(data_multi_lds)
pass
In [290]:
tt_folding_ranking.estimators[0].feature_importances_
Out[290]:
In [291]:
tt_ranks = tt_folding_ranking.predict(full_data)
for label in numpy.unique(labels):
hist(tt_ranks[labels == label], bins=100, alpha=0.1,
normed=True, range=(-20, 10), label=str(label))
legend()
Out[291]:
In [292]:
full_data['dt_rank'] = tt_folding_ranking.predict(full_data)
In [55]:
ranker_base = UGradientBoostingRegressor(loss=RankBoostLossFunction(request_column='fake_request'), subsample=0.6,
n_estimators=400, max_depth=6,
train_features=sv_features + ['bdt1', 'bdt2'])
ranker = FoldingRegressor(ranker_base, n_folds=2, random_state=11,
parallel_profile='threads-2')
%time ranker.fit_lds(data_multi_lds)
pass
In [67]:
ranker.estimators[0].feature_importances_
Out[67]:
In [56]:
ranks = ranker.predict(full_data)
tt_ranks = tt_folding_ranking.predict(full_data)
In [57]:
for label in numpy.unique(full_data.label.values):
hist(tt_ranks[full_data.label.values == label], bins=100, alpha=0.1,
normed=True, range=(-20, 10), label=str(label))
legend()
Out[57]:
In [58]:
for label in numpy.unique(full_data.label.values):
hist(ranks[full_data.label.values == label], bins=100, alpha=0.1, normed=True, range=(-20, 10), label=str(label))
legend()
Out[58]:
In [59]:
labels = full_data.label.values
# generate_result requires a log-loss value as its 4th positional argument;
# the original call omitted it, raising a TypeError. Pass the -1 sentinel
# ("not computed"), consistent with the other result rows (ranking scores
# are not probabilities, so a log-loss is not defined here).
ranker_result = generate_result(roc_auc_score(labels[labels != 2] > 0, ranks[labels != 2]),
                                roc_auc_score(labels[labels != 1] > 0, ranks[labels != 1]),
                                roc_auc_score(labels[labels != 0] > 1, ranks[labels != 0]),
                                -1,
                                label='ranker')
In [60]:
# Same fix as the ranker cell above: generate_result's 4th positional
# argument (log_loss_value) was missing, which raises a TypeError.
# Pass the -1 "not computed" sentinel used throughout the notebook.
tt_ranker_result = generate_result(roc_auc_score(labels[labels != 2] > 0, tt_ranks[labels != 2]),
                                   roc_auc_score(labels[labels != 1] > 0, tt_ranks[labels != 1]),
                                   roc_auc_score(labels[labels != 0] > 1, tt_ranks[labels != 0]),
                                   -1,
                                   label='dt ranker')
In [ ]:
In [ ]:
In [61]:
# Final comparison table: one row per model, indexed by its label.
# NOTE(review): `multiclass_result` and `regression_result` are not defined
# anywhere in this notebook as saved -- this cell relies on hidden kernel
# state from deleted or edited cells and fails on Restart & Run All.
result = pandas.concat([baseline_result, multiclass_result,
regression_result, ranker_result, tt_ranker_result])
result.index = result['name']
result = result.drop('name', axis=1)
result
Out[61]: