${t\bar{t}H\left(b\bar{b}\right)}$ scikit-learn BDT for classification of ${t\bar{t}H}$ and ${t\bar{t}b\bar{b}}$ events

For each signal region, information from the output of the reconstruction BDT is combined with kinematic variables as input to classification BDTs, with ${t\bar{t}H \left(H\to b\bar{b}\right)}$ as signal and ${t\bar{t}}$ as background. One BDT is trained for events with exactly 5 jets and one for events with at least 6 jets. In this notebook, a single scikit-learn decision tree (`DecisionTreeClassifier`) is fitted to these input variables and then visualised.


In [1]:
import datetime
import sqlite3
import warnings

import graphviz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.datasets
import sklearn.metrics
import sklearn.tree
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline
plt.rcParams["figure.figsize"] = (17, 10)
sns.set(context = "paper", font = "monospace")
warnings.filterwarnings("ignore")
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)

read


In [2]:
# Load the event table from a CSV file; the path is relative, so the file is
# looked up in the current working directory.
df = pd.read_csv("ttHbb_data.csv")

# Preview the first rows to check the columns that were read.
df.head()


Out[2]:
dRbb_avg_Sort4 dRbb_MaxPt_Sort4 dEtajj_MaxdEta Mbb_MindR_Sort4 Mjj_MindR nHiggsbb30_Sort4 HT_jets dRlepbb_MindR_Sort4 Aplanarity_jets H1_all TTHReco_best_TTHReco TTHReco_best_Higgs_mass TTHReco_best_Higgsbleptop_mass TTHReco_best_bbHiggs_dR TTHReco_withH_best_Higgsttbar_dR TTHReco_best_Higgsleptop_dR TTHReco_best_b1Higgsbhadtop_dR LHD_Discriminant target
0 0.265696 0.325133 0.311509 0.017415 0.023733 0.333333 0.029321 0.335150 0.025005 0.836903 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1
1 0.412821 0.174657 0.276681 0.051575 0.062728 0.333333 0.151450 0.509171 0.062811 0.548197 0.595407 0.079543 0.162228 0.348091 0.400588 0.441174 0.262685 0.728361 1
2 0.346333 0.386509 0.081872 0.031549 0.025527 0.000000 0.112252 0.224414 0.152014 0.738440 0.663737 0.082487 0.102262 0.640163 0.398795 0.477156 0.555918 0.712721 1
3 0.427364 0.106369 0.369438 0.070832 0.084710 0.500000 0.058080 0.375719 0.111577 0.508818 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1
4 0.384440 0.218341 0.438701 0.054052 0.069121 0.500000 0.167710 0.218126 0.671929 0.169410 0.874208 0.029553 0.058182 0.377938 0.374169 0.397906 0.505010 0.815084 1

features and targets


In [3]:
# Pick the input variables by name (everything except the label column)
# rather than by position, so the split stays correct even if "target" is
# not the last column of the CSV.
features = [column for column in df.columns if column != "target"]

X = df[features]  # feature matrix: one column per input variable
y = df["target"]  # label column used to fit the classifier

In [4]:
# Configure a single decision tree: growth is limited by a maximum depth and
# a minimum node size for splitting, and the random state is fixed.
classifier = sklearn.tree.DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=20,
    random_state=99,
)
# Fit on the full table; as the last expression, the fitted estimator is
# also displayed as the cell output.
classifier.fit(X, y)


Out[4]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=20, min_weight_fraction_leaf=0.0,
            presort=False, random_state=99, splitter='best')

In [5]:
# Export the fitted tree to Graphviz source text, labelling split nodes with
# the input column names, then wrap it so the notebook renders the diagram.
tree_dot = sklearn.tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=list(X.columns),
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(tree_dot)
graph


Out[5]:
Tree 0 HT_jets ≤ 0.0938 gini = 0.4995 samples = 22795 value = [11743, 11052] 1 LHD_Discriminant ≤ 0.765 gini = 0.4239 samples = 8883 value = [6174, 2709] 0->1 True 30 dRbb_avg_Sort4 ≤ 0.5026 gini = 0.4801 samples = 13912 value = [5569, 8343] 0->30 False 2 dRbb_avg_Sort4 ≤ 0.4929 gini = 0.3984 samples = 7892 value = [5725, 2167] 1->2 17 dEtajj_MaxdEta ≤ 0.5966 gini = 0.4956 samples = 991 value = [449, 542] 1->17 3 HT_jets ≤ 0.0559 gini = 0.4405 samples = 5191 value = [3491, 1700] 2->3 10 dEtajj_MaxdEta ≤ 0.4184 gini = 0.286 samples = 2701 value = [2234, 467] 2->10 4 H1_all ≤ 0.6972 gini = 0.3434 samples = 2216 value = [1728, 488] 3->4 7 H1_all ≤ 0.3079 gini = 0.4828 samples = 2975 value = [1763, 1212] 3->7 5 gini = 0.3109 samples = 1797 value = [1451, 346] 4->5 6 gini = 0.4481 samples = 419 value = [277, 142] 4->6 8 gini = 0.4562 samples = 1540 value = [998, 542] 7->8 9 gini = 0.4978 samples = 1435 value = [765, 670] 7->9 11 dEtajj_MaxdEta ≤ 0.287 gini = 0.4604 samples = 128 value = [82, 46] 10->11 14 HT_jets ≤ 0.0612 gini = 0.2737 samples = 2573 value = [2152, 421] 10->14 12 gini = 0.0 samples = 3 value = [0, 3] 11->12 13 gini = 0.4513 samples = 125 value = [82, 43] 11->13 15 gini = 0.2183 samples = 1291 value = [1130, 161] 14->15 16 gini = 0.3234 samples = 1282 value = [1022, 260] 14->16 18 LHD_Discriminant ≤ 0.9071 gini = 0.484 samples = 838 value = [344, 494] 17->18 25 dRlepbb_MindR_Sort4 ≤ 0.6003 gini = 0.4306 samples = 153 value = [105, 48] 17->25 19 LHD_Discriminant ≤ 0.772 gini = 0.4955 samples = 694 value = [314, 380] 18->19 22 TTHReco_best_Higgsbleptop_mass ≤ 0.1115 gini = 0.3299 samples = 144 value = [30, 114] 18->22 20 gini = 0.0 samples = 17 value = [0, 17] 19->20 21 gini = 0.4974 samples = 677 value = [314, 363] 19->21 23 gini = 0.2778 samples = 132 value = [22, 110] 22->23 24 gini = 0.4444 samples = 12 value = [8, 4] 22->24 26 Mbb_MindR_Sort4 ≤ 0.0127 gini = 0.4123 samples = 148 value = [105, 43] 25->26 29 gini = 0.0 samples = 5 value = [0, 5] 25->29 
27 gini = 0.0 samples = 5 value = [0, 5] 26->27 28 gini = 0.3902 samples = 143 value = [105, 38] 26->28 31 LHD_Discriminant ≤ 0.7618 gini = 0.4461 samples = 10136 value = [3404, 6732] 30->31 46 HT_jets ≤ 0.191 gini = 0.4892 samples = 3776 value = [2165, 1611] 30->46 32 HT_jets ≤ 0.1472 gini = 0.4722 samples = 7398 value = [2827, 4571] 31->32 39 dEtajj_MaxdEta ≤ 0.5161 gini = 0.3327 samples = 2738 value = [577, 2161] 31->39 33 Aplanarity_jets ≤ 0.2967 gini = 0.4988 samples = 3278 value = [1560, 1718] 32->33 36 HT_jets ≤ 0.284 gini = 0.4259 samples = 4120 value = [1267, 2853] 32->36 34 gini = 0.4998 samples = 2727 value = [1390, 1337] 33->34 35 gini = 0.4267 samples = 551 value = [170, 381] 33->35 37 gini = 0.4459 samples = 3231 value = [1084, 2147] 36->37 38 gini = 0.327 samples = 889 value = [183, 706] 36->38 40 HT_jets ≤ 0.1588 gini = 0.2888 samples = 1777 value = [311, 1466] 39->40 43 HT_jets ≤ 0.2091 gini = 0.4004 samples = 961 value = [266, 695] 39->43 41 gini = 0.3491 samples = 972 value = [219, 753] 40->41 42 gini = 0.2024 samples = 805 value = [92, 713] 40->42 44 gini = 0.4398 samples = 637 value = [208, 429] 43->44 45 gini = 0.2939 samples = 324 value = [58, 266] 43->45 47 LHD_Discriminant ≤ 0.7961 gini = 0.4612 samples = 2506 value = [1602, 904] 46->47 54 LHD_Discriminant ≤ 0.728 gini = 0.4936 samples = 1270 value = [563, 707] 46->54 48 dEtajj_MaxdEta ≤ 0.5976 gini = 0.4428 samples = 2270 value = [1519, 751] 47->48 51 Mbb_MindR_Sort4 ≤ 0.1091 gini = 0.456 samples = 236 value = [83, 153] 47->51 49 gini = 0.4889 samples = 778 value = [447, 331] 48->49 50 gini = 0.4045 samples = 1492 value = [1072, 420] 48->50 52 gini = 0.494 samples = 137 value = [61, 76] 51->52 53 gini = 0.3457 samples = 99 value = [22, 77] 51->53 55 dRbb_avg_Sort4 ≤ 0.6057 gini = 0.4994 samples = 1070 value = [516, 554] 54->55 58 Mbb_MindR_Sort4 ≤ 0.028 gini = 0.3596 samples = 200 value = [47, 153] 54->58 56 gini = 0.4895 samples = 739 value = [316, 423] 55->56 57 gini = 0.4783 samples = 
331 value = [200, 131] 55->57 59 gini = 0.0 samples = 3 value = [3, 0] 58->59 60 gini = 0.3469 samples = 197 value = [44, 153] 58->60

accuracy (evaluated on the same sample the tree was fitted on)


In [6]:
# Predict a label for every row of X. X is the same table the classifier was
# fitted on, so these are predictions on the training sample.
y_predictions = classifier.predict(X)
y_predictions


Out[6]:
array([0, 1, 0, ..., 0, 0, 0])

In [7]:
# Compare the predicted labels with the true labels in y.
# NOTE: y_predictions was computed on the rows used for fitting, so this is a
# training-sample score, not an estimate of performance on unseen events.
sklearn.metrics.accuracy_score(y, y_predictions)


Out[7]:
0.68335161219565699

In [8]:
# Table of the fitted tree's feature importances, indexed by input variable
# name and ordered from most to least important.
_df = pd.DataFrame(
    {"importance": classifier.feature_importances_},
    index=X.columns.values,
).sort_values(by="importance", ascending=False)
_df


Out[8]:
importance
HT_jets 0.580307
dRbb_avg_Sort4 0.190065
LHD_Discriminant 0.154543
dEtajj_MaxdEta 0.030467
Aplanarity_jets 0.017219
H1_all 0.015864
Mbb_MindR_Sort4 0.006724
TTHReco_best_Higgsbleptop_mass 0.002552
dRlepbb_MindR_Sort4 0.002259
Mjj_MindR 0.000000
nHiggsbb30_Sort4 0.000000
dRbb_MaxPt_Sort4 0.000000
TTHReco_best_TTHReco 0.000000
TTHReco_best_Higgs_mass 0.000000
TTHReco_best_bbHiggs_dR 0.000000
TTHReco_withH_best_Higgsttbar_dR 0.000000
TTHReco_best_Higgsleptop_dR 0.000000
TTHReco_best_b1Higgsbhadtop_dR 0.000000

In [9]:
# Set the figure size for the plot below (same value as in the first cell).
plt.rcParams["figure.figsize"] = (17, 10)

# Horizontal bar chart of the importances. Sorting ascending puts the most
# important variable at the top of the chart. The legend flag must be the
# boolean False: the string "False" is non-empty and therefore truthy, so it
# did not suppress the legend.
ax = _df.sort_values(by = "importance", ascending = True).plot(kind = "barh", legend = False)
ax.set_xlabel("importance");



In [ ]: