project bonhomie ${t\bar{t}H}$ and ${t\bar{t}b\bar{b}}$ classification BDT


In [1]:
import graphviz
from matplotlib import gridspec
import matplotlib.pylab as plt
from matplotlib.ticker import NullFormatter, NullLocator, MultipleLocator
import numpy as np
import pandas as pd
# Let pandas render up to 500 rows and 500 columns of a frame before it
# starts truncating the displayed output.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
from scipy import stats
import seaborn as sns
# Order matters: sns.set() applies its own `palette` argument (seaborn's
# default palette when none is given), so a palette chosen before it is
# discarded. Apply the theme first, then select the 'husl' palette.
sns.set(style = 'ticks')
sns.set_palette('husl')
import sklearn.datasets
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import sklearn.tree
import sqlite3
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by the libraries imported above.
warnings.filterwarnings("ignore")

read


In [2]:
# Load ttHbb_data.csv from the working directory and preview its first
# five rows.
df = pd.read_csv(filepath_or_buffer = "ttHbb_data.csv")
df.head(n = 5)


Out[2]:
TTHReco_best_Higgsleptop_dR TTHReco_withH_best_Higgsttbar_dR TTHReco_best_TTHReco Aplanarity_jets TTHReco_best_Higgs_mass dRbb_MaxPt_Sort4 Mbb_MindR_Sort4 H1_all dEtajj_MaxdEta LHD_Discriminant TTHReco_best_b1Higgsbhadtop_dR TTHReco_best_Higgsbleptop_mass dRbb_avg_Sort4 nHiggsbb30_Sort4 dRlepbb_MindR_Sort4 Mjj_MindR HT_jets TTHReco_best_bbHiggs_dR classification
0 -1.000000 -1.000000 -1.000000 0.010764 -1.000 1.873108 29691.271 0.801864 1.611009 -1.000000 -1.000000 -1.00 1.596763 2 1.926379 29691.271 200475.97 -1.000000 1
1 3.033175 2.914220 0.000834 0.027035 156656.200 1.190597 57697.246 0.525295 1.440214 0.429600 0.664401 360101.97 2.093996 2 2.911004 57697.246 520111.80 1.190597 1
2 3.362126 2.896700 0.115693 0.065426 162454.690 2.151490 41279.270 0.707541 0.484865 0.398901 2.522366 226994.64 1.869288 0 1.299827 30980.066 417521.62 3.028662 1
3 -1.000000 -1.000000 -1.000000 0.048023 -1.000 0.880868 73485.016 0.487572 1.895097 -1.000000 -1.000000 -1.00 2.143145 3 2.155922 73485.016 275745.00 -1.000000 1
4 2.637625 2.656071 0.469479 0.289182 58203.133 1.388736 59727.780 0.162432 2.234762 0.599817 2.199806 129147.22 1.998076 3 1.264251 62289.004 562667.20 1.378433 1

In [3]:
# Every column except the label is a model input. Selecting by name (rather
# than "all but the last column") keeps the label out of X even if the
# column order of the CSV changes.
features = [column for column in df.columns if column != "classification"]
print(features)

# X holds the input variables, y the class label of each row.
X = df[features]
y = df["classification"]


['TTHReco_best_Higgsleptop_dR', 'TTHReco_withH_best_Higgsttbar_dR', 'TTHReco_best_TTHReco', 'Aplanarity_jets', 'TTHReco_best_Higgs_mass', 'dRbb_MaxPt_Sort4', 'Mbb_MindR_Sort4', 'H1_all', 'dEtajj_MaxdEta', 'LHD_Discriminant', 'TTHReco_best_b1Higgsbhadtop_dR', 'TTHReco_best_Higgsbleptop_mass', 'dRbb_avg_Sort4', 'nHiggsbb30_Sort4', 'dRlepbb_MindR_Sort4', 'Mjj_MindR', 'HT_jets', 'TTHReco_best_bbHiggs_dR']

In [4]:
# A single decision tree, limited to depth 5, that only splits nodes holding
# at least 20 samples; the seed is fixed so the fit is repeatable.
classifier = sklearn.tree.DecisionTreeClassifier(max_depth = 5, min_samples_split = 20, random_state = 99)

# fit() trains in place on the full data set and returns the estimator,
# which becomes the cell's displayed value.
classifier.fit(X, y)


Out[4]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=20,
            min_weight_fraction_leaf=0.0, presort=False, random_state=99,
            splitter='best')

In [5]:
# Export the fitted tree as DOT text (out_file = None makes export_graphviz
# return the text), labelling the split variables with the column names of
# the frame the tree was trained on.
dot_source = sklearn.tree.export_graphviz(
    classifier,
    out_file           = None,
    feature_names      = list(X.columns),
    filled             = True,
    rounded            = True,
    special_characters = True
)

# Wrap the DOT text so the notebook renders the tree inline.
graph = graphviz.Source(dot_source)
graph


Out[5]:
Tree 0 HT_jets ≤ 369345.031 gini = 0.5 samples = 22795 value = [11743, 11052] 1 LHD_Discriminant ≤ 0.501 gini = 0.424 samples = 8883 value = [6174, 2709] 0->1 True 30 dRbb_avg_Sort4 ≤ 2.397 gini = 0.48 samples = 13912 value = [5569, 8343] 0->30 False 2 dRbb_avg_Sort4 ≤ 2.365 gini = 0.398 samples = 7892 value = [5725, 2167] 1->2 17 dEtajj_MaxdEta ≤ 3.009 gini = 0.496 samples = 991 value = [449, 542] 1->17 3 HT_jets ≤ 269917.984 gini = 0.44 samples = 5191 value = [3491, 1700] 2->3 10 dEtajj_MaxdEta ≤ 2.135 gini = 0.286 samples = 2701 value = [2234, 467] 2->10 4 H1_all ≤ 0.668 gini = 0.343 samples = 2216 value = [1728, 488] 3->4 7 H1_all ≤ 0.295 gini = 0.483 samples = 2975 value = [1763, 1212] 3->7 5 gini = 0.311 samples = 1797 value = [1451, 346] 4->5 6 gini = 0.448 samples = 419 value = [277, 142] 4->6 8 gini = 0.456 samples = 1540 value = [998, 542] 7->8 9 gini = 0.498 samples = 1435 value = [765, 670] 7->9 11 dEtajj_MaxdEta ≤ 1.491 gini = 0.46 samples = 128 value = [82, 46] 10->11 14 HT_jets ≤ 283937.906 gini = 0.274 samples = 2573 value = [2152, 421] 10->14 12 gini = 0.0 samples = 3 value = [0, 3] 11->12 13 gini = 0.451 samples = 125 value = [82, 43] 11->13 15 gini = 0.218 samples = 1291 value = [1130, 161] 14->15 16 gini = 0.323 samples = 1282 value = [1022, 260] 14->16 18 LHD_Discriminant ≤ 0.78 gini = 0.484 samples = 838 value = [344, 494] 17->18 25 dRlepbb_MindR_Sort4 ≤ 3.427 gini = 0.431 samples = 153 value = [105, 48] 17->25 19 LHD_Discriminant ≤ 0.515 gini = 0.495 samples = 694 value = [314, 380] 18->19 22 TTHReco_best_Higgsbleptop_mass ≤ 247482.609 gini = 0.33 samples = 144 value = [30, 114] 18->22 20 gini = 0.0 samples = 17 value = [0, 17] 19->20 21 gini = 0.497 samples = 677 value = [314, 363] 19->21 23 gini = 0.278 samples = 132 value = [22, 110] 22->23 24 gini = 0.444 samples = 12 value = [8, 4] 22->24 26 Mbb_MindR_Sort4 ≤ 25857.608 gini = 0.412 samples = 148 value = [105, 43] 25->26 29 gini = 0.0 samples = 5 value = [0, 5] 25->29 27 gini = 0.0 
samples = 5 value = [0, 5] 26->27 28 gini = 0.39 samples = 143 value = [105, 38] 26->28 31 LHD_Discriminant ≤ 0.495 gini = 0.446 samples = 10136 value = [3404, 6732] 30->31 46 HT_jets ≤ 623625.531 gini = 0.489 samples = 3776 value = [2165, 1611] 30->46 32 HT_jets ≤ 508918.922 gini = 0.472 samples = 7398 value = [2827, 4571] 31->32 39 dEtajj_MaxdEta ≤ 2.614 gini = 0.333 samples = 2738 value = [577, 2161] 31->39 33 Aplanarity_jets ≤ 0.128 gini = 0.499 samples = 3278 value = [1560, 1718] 32->33 36 HT_jets ≤ 867087.406 gini = 0.426 samples = 4120 value = [1267, 2853] 32->36 34 gini = 0.5 samples = 2727 value = [1390, 1337] 33->34 35 gini = 0.427 samples = 551 value = [170, 381] 33->35 37 gini = 0.446 samples = 3231 value = [1084, 2147] 36->37 38 gini = 0.327 samples = 889 value = [183, 706] 36->38 40 HT_jets ≤ 539465.219 gini = 0.289 samples = 1777 value = [311, 1466] 39->40 43 HT_jets ≤ 670898.5 gini = 0.4 samples = 961 value = [266, 695] 39->43 41 gini = 0.349 samples = 972 value = [219, 753] 40->41 42 gini = 0.202 samples = 805 value = [92, 713] 40->42 44 gini = 0.44 samples = 637 value = [208, 429] 43->44 45 gini = 0.294 samples = 324 value = [58, 266] 43->45 47 LHD_Discriminant ≤ 0.562 gini = 0.461 samples = 2506 value = [1602, 904] 46->47 54 LHD_Discriminant ≤ 0.429 gini = 0.494 samples = 1270 value = [563, 707] 46->54 48 dEtajj_MaxdEta ≤ 3.014 gini = 0.443 samples = 2270 value = [1519, 751] 47->48 51 Mbb_MindR_Sort4 ≤ 104844.047 gini = 0.456 samples = 236 value = [83, 153] 47->51 49 gini = 0.489 samples = 778 value = [447, 331] 48->49 50 gini = 0.405 samples = 1492 value = [1072, 420] 48->50 52 gini = 0.494 samples = 137 value = [61, 76] 51->52 53 gini = 0.346 samples = 99 value = [22, 77] 51->53 55 dRbb_avg_Sort4 ≤ 2.746 gini = 0.499 samples = 1070 value = [516, 554] 54->55 58 Mbb_MindR_Sort4 ≤ 38388.623 gini = 0.36 samples = 200 value = [47, 153] 54->58 56 gini = 0.49 samples = 739 value = [316, 423] 55->56 57 gini = 0.478 samples = 331 value = [200, 131] 
55->57 59 gini = 0.0 samples = 3 value = [3, 0] 58->59 60 gini = 0.347 samples = 197 value = [44, 153] 58->60

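accuracy (the value shown as the cell output below is measured on the training data, so it is an optimistic estimate)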


In [6]:
# Training-set (resubstitution) accuracy: `classifier` was fitted on this
# same X and y, so this number is an optimistic estimate of how the tree
# performs on unseen data.
y_predictions = classifier.predict(X)
accuracy_training = sklearn.metrics.accuracy_score(y, y_predictions)

# Held-out estimate: an identically configured tree is refitted on a
# stratified 75% of the rows and scored on the remaining 25%. The original
# `classifier` is left untouched for the cells that follow.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size    = 0.25,
    stratify     = y,
    random_state = 99
)
classifier_holdout = sklearn.tree.DecisionTreeClassifier(**classifier.get_params())
classifier_holdout.fit(X_train, y_train)
accuracy_holdout = sklearn.metrics.accuracy_score(
    y_test,
    classifier_holdout.predict(X_test)
)
print("accuracy on held-out 25% of the rows:", accuracy_holdout)

# Displayed value of the cell: the training-set accuracy, as before.
accuracy_training


Out[6]:
0.683351612195657

feature importances


In [7]:
# One row per input variable, indexed by the variable name, holding the
# fitted tree's importance for that variable; most important first.
_df = pd.DataFrame(
    {"importance": classifier.feature_importances_},
    index = X.columns.values
).sort_values(by = "importance", ascending = False)
_df


Out[7]:
importance
HT_jets 0.580307
dRbb_avg_Sort4 0.190065
LHD_Discriminant 0.154543
dEtajj_MaxdEta 0.030467
Aplanarity_jets 0.017219
H1_all 0.015864
Mbb_MindR_Sort4 0.006724
TTHReco_best_Higgsbleptop_mass 0.002552
dRlepbb_MindR_Sort4 0.002259
Mjj_MindR 0.000000
nHiggsbb30_Sort4 0.000000
TTHReco_best_Higgsleptop_dR 0.000000
TTHReco_best_b1Higgsbhadtop_dR 0.000000
TTHReco_withH_best_Higgsttbar_dR 0.000000
dRbb_MaxPt_Sort4 0.000000
TTHReco_best_Higgs_mass 0.000000
TTHReco_best_TTHReco 0.000000
TTHReco_best_bbHiggs_dR 0.000000

In [8]:
# Horizontal bar chart of the importance table, one bar per variable.
plt.rcParams["figure.figsize"] = [8, 8]

# x and y are passed by keyword: seaborn 0.12 and later accept only `data`
# positionally, so the positional form raises a TypeError there.
ax = sns.barplot(x = _df["importance"], y = _df.index)

# Label the figure so it can be read on its own.
ax.set_xlabel('importance')
ax.set_ylabel('variable')
ax.set_title('decision tree feature importances')
plt.show()



In [ ]: