For each signal region, information from the output of the reconstruction BDT is combined with kinematic variables as input to the classification BDTs, with ${t\bar{t}H \left(H\to b\bar{b}\right)}$ as signal and ${t\bar{t}}$ as background. One BDT is trained for events with exactly 5 jets and another for events with at least 6 jets.
In [1]:
# Notebook setup: imports plus global plotting, warning, and display settings.
import datetime
import graphviz
import matplotlib.pyplot as plt
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
import numpy as np
# Default figure size (width, height) for matplotlib plots.
plt.rcParams["figure.figsize"] = (17, 10)
import pandas as pd
import seaborn as sns
# Apply the seaborn "paper" context with a monospace font to all plots.
sns.set(context = "paper", font = "monospace")
import sklearn.datasets
from sklearn.preprocessing import MinMaxScaler
import sklearn.tree
import sqlite3
import warnings
# Suppress every warning raised from this point on.
warnings.filterwarnings("ignore")
# Let pandas display up to 500 rows and 500 columns before truncating.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
In [2]:
# Read the event table from ttHbb_data.csv and preview its first five rows.
df = pd.read_csv(filepath_or_buffer = "ttHbb_data.csv")
df.head(n = 5)
Out[2]:
In [3]:
# Split the table into input features (X) and the label column (y).
# Features are chosen by name rather than by dropping the last column, so
# "target" can never end up among the inputs, wherever it sits in the CSV.
features = [column for column in df.columns if column != "target"]
X = df[features]
y = df["target"]
In [4]:
# Decision tree limited to depth 5; a node needs at least 20 samples to be
# split further. The fixed random_state makes the fit reproducible.
classifier = sklearn.tree.DecisionTreeClassifier(
    max_depth = 5,
    min_samples_split = 20,
    random_state = 99
)
# Train on the full feature table; the fitted estimator is the cell output.
classifier.fit(X, y)
Out[4]:
In [5]:
# Export the fitted tree as Graphviz DOT text (out_file = None returns the
# source as a string instead of writing a file), then wrap it for rendering.
dot_source = sklearn.tree.export_graphviz(
    classifier,
    out_file = None,
    feature_names = df[features].columns.tolist(),
    filled = True,
    rounded = True,
    special_characters = True
)
graph = graphviz.Source(dot_source)
# Display the rendered tree as the cell output.
graph
Out[5]:
In [6]:
# Predict a class label for every row of the feature table the tree was fit on.
y_predictions = classifier.predict(X = X)
# Show the predicted labels as the cell output.
y_predictions
Out[6]:
In [7]:
# Import the metrics submodule explicitly: the notebook only imports
# sklearn.tree and sklearn.datasets, so relying on sklearn.metrics being
# loaded as a side effect of those imports is fragile.
import sklearn.metrics
# Fraction of rows whose predicted label matches the true label.
# NOTE: y_predictions was computed on X, the same data the classifier was
# fit on, so this is training accuracy, not a held-out estimate.
sklearn.metrics.accuracy_score(y, y_predictions)
Out[7]:
In [8]:
# Table of feature importances from the fitted tree, indexed by feature name.
_df = pd.DataFrame(
    {"importance": classifier.feature_importances_},
    index = X.columns.values
)
# Most important features first.
_df = _df.sort_values(by = "importance", ascending = False)
_df
Out[8]:
In [9]:
# Horizontal bar chart of feature importances. Sorting ascending puts the
# most important feature at the top of a barh plot.
plt.rcParams["figure.figsize"] = (17, 10)
# legend must be the boolean False: the string "False" is truthy, so the
# legend was still being drawn. The trailing ";" suppresses the Axes repr.
_df.sort_values(by = "importance", ascending = True).plot(kind = "barh", legend = False);
In [ ]: