In [1]:
import datetime
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
plt.rcParams["figure.figsize"] = (13, 6)
import pandas as pd
import seaborn as sns
sns.set(context = "paper", font = "monospace")
from sklearn.preprocessing import MinMaxScaler
import sqlite3
import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
import root_pandas
variable | type | n-tuple name | description | region >= 6j | region 5j |
---|---|---|---|---|---|
${\Delta R^{\text{avg}}_{bb}}$ | general kinematic | dRbb_avg_Sort4 | average ${\Delta R}$ for all ${b}$-tagged jet pairs | yes | yes |
${\Delta R^{\text{max} p_{T}}_{bb}}$ | general kinematic | dRbb_MaxPt_Sort4 | ${\Delta R}$ between the two ${b}$-tagged jets with the largest vector sum ${p_{T}}$ | yes | - |
${\Delta \eta^{\textrm{max}\Delta\eta}_{jj}}$ | general kinematic | dEtajj_MaxdEta | maximum ${\Delta\eta}$ between any two jets | yes | yes |
${m^{\text{min} \Delta R}_{bb}}$ | general kinematic | Mbb_MindR_Sort4 | mass of the combination of the two ${b}$-tagged jets with the smallest ${\Delta R}$ | yes | - |
${m^{\text{min} \Delta R}_{jj}}$ | general kinematic | Mjj_MindR | mass of the combination of any two jets with the smallest ${\Delta R}$ | - | yes |
${N^{\text{Higgs}}_{30}}$ | general kinematic | nHiggsbb30_Sort4 | number of ${b}$-jet pairs with invariant mass within 30 GeV of the Higgs boson mass | yes | yes |
${H^{\text{had}}_{T}}$ | general kinematic | HT_jets ? | scalar sum of jet ${p_{T}}$ | - | yes |
${\Delta R^{\text{min}\Delta R}_{\text{lep}-bb}}$ | general kinematic | dRlepbb_MindR_Sort4 | ${\Delta R}$ between the lepton and the combination of the two ${b}$-tagged jets with the smallest ${\Delta R}$ | - | yes |
aplanarity | general kinematic | Aplanarity_jets | ${1.5\lambda_{2}}$, where ${\lambda_{2}}$ is the second eigenvalue of the momentum tensor built with all jets | yes | yes |
${H1}$ | general kinematic | H1_all | second Fox-Wolfram moment computed using all jets and the lepton | yes | yes |
BDT | reconstruction BDT output | TTHReco_best_TTHReco | BDT output | yes* | yes* |
${m_{H}}$ | reconstruction BDT output | TTHReco_best_Higgs_mass | Higgs boson mass | yes | yes |
${m_{H,b_{\text{lep top}}}}$ | reconstruction BDT output | TTHReco_best_Higgsbleptop_mass | Higgs boson mass and ${b}$-jet from leptonic ${t}$ | yes | - |
${\Delta R_{\text{Higgs }bb}}$ | reconstruction BDT output | TTHReco_best_bbHiggs_dR | ${\Delta R}$ between ${b}$-jets from Higgs boson | yes | yes |
${\Delta R_{H,t\bar{t}}}$ | reconstruction BDT output | TTHReco_withH_best_Higgsttbar_dR | ${\Delta R}$ between Higgs boson and ${t\bar{t}}$ system | yes* | yes* |
${\Delta R_{H,\text{lep top}}}$ | reconstruction BDT output | TTHReco_best_Higgsleptop_dR | ${\Delta R}$ between Higgs boson and leptonic ${t}$ | yes | - |
${\Delta R_{H,b_{\text{had top}}}}$ | reconstruction BDT output | TTHReco_best_b1Higgsbhadtop_dR | ${\Delta R}$ between Higgs boson and ${b}$-jet from hadronic ${t}$ | - | yes* |
D | likelihood calculation | LHD_Discriminant | likelihood discriminant | yes | yes |
${\text{MEM}_{D1}}$ | matrix method | ? | matrix method | yes | - |
${w^{H}_{b}}$ | ${b}$-tagging | ? | sum of binned ${b}$-tagging weights of jets from best Higgs candidate | yes | yes |
${B_{j^{3}}}$ | ${b}$-tagging | ? | third jet binned ${b}$-tagging weight (sorted by weight) | yes | yes |
${B_{j^{4}}}$ | ${b}$-tagging | ? | fourth jet binned ${b}$-tagging weight (sorted by weight) | yes | yes |
${B_{j^{5}}}$ | ${b}$-tagging | ? | fifth jet binned ${b}$-tagging weight (sorted by weight) | yes | yes |
In [2]:
# Branches to read from the n-tuples, grouped by the role they play in this
# notebook. The order of the resulting list is the column order of the frames.
variables = (
    # Multiplicities used by the event-selection queries.
    ["nElectrons", "nMuons", "nJets", "nBTags_70"]
    # General kinematic variables.
    + [
        "dRbb_avg_Sort4",
        "dRbb_MaxPt_Sort4",
        "dEtajj_MaxdEta",
        "Mbb_MindR_Sort4",
        "Mjj_MindR",
        "nHiggsbb30_Sort4",
        "HT_jets",
        "dRlepbb_MindR_Sort4",
        "Aplanarity_jets",
        "H1_all",
    ]
    # Reconstruction BDT outputs.
    + [
        "TTHReco_best_TTHReco",
        "TTHReco_best_Higgs_mass",
        "TTHReco_best_Higgsbleptop_mass",
        "TTHReco_best_bbHiggs_dR",
        "TTHReco_withH_best_Higgsttbar_dR",
        "TTHReco_best_Higgsleptop_dR",
        "TTHReco_best_b1Higgsbhadtop_dR",
    ]
    # Likelihood discriminant.
    + ["LHD_Discriminant"]
)
In [3]:
# Input ROOT files, one list per sample so more files can be appended later.
filenames_ttH = [
    "ttH_group.phys-higgs.11468583._000005.out.root",
]
filenames_ttbb = [
    "ttbb_group.phys-higgs.11468624._000005.out.root",
]
In [4]:
# Read the selected branches of the "nominal_Loose" tree from the ttH files
# and label every row with target = 1.
ttH = root_pandas.read_root(
    filenames_ttH,
    "nominal_Loose",
    columns=variables,
).assign(target=1)
ttH.head()
Out[4]:
In [5]:
# Read the selected branches of the "nominal_Loose" tree from the ttbb files
# and label every row with target = 0.
ttbb = root_pandas.read_root(
    filenames_ttbb,
    "nominal_Loose",
    columns=variables,
).assign(target=0)
ttbb.head()
Out[5]:
In [6]:
# Stack the two labelled samples into one frame.
# ignore_index = True gives the result a fresh 0..n-1 row index. Without it
# each input keeps its own row labels, so the combined frame can end up with
# duplicate labels, which makes label-based selection and alignment ambiguous.
df = pd.concat([ttH, ttbb], ignore_index = True)
df.head()
Out[6]:
In [7]:
# Event selections, written as DataFrame.query expressions.
selection_ejets        = "(nElectrons == 1) & (nJets >= 4)"
selection_mujets       = "(nMuons == 1) & (nJets >= 4)"
# Region selections. The names read as "5 jets exclusive" (5JE) and
# "6 jets inclusive" (6JI) with at least four b-tags (4BI), matching the
# "5j" and ">= 6j" regions of the variable table, so the jet cuts are
# nJets == 5 and nJets >= 6 respectively.
selection_ejets_5JE4BI = "(nElectrons == 1) & (nJets == 5) & (nBTags_70 >= 4)"
selection_ejets_6JI4BI = "(nElectrons == 1) & (nJets >= 6) & (nBTags_70 >= 4)"

# Keep the e+jets events and then remove the columns that were only needed
# for the selection. The drop is chained (not in place) so it never operates
# on a slice of the previous frame.
df = df.query(selection_ejets).drop(["nElectrons", "nMuons", "nJets", "nBTags_70"], axis = 1)
df.head()
Out[7]:
In [8]:
# Per-column summary statistics of the current frame, one row per column.
summary_columns = ["name", "maximum", "minimum", "mean", "std", "median"]
rows = [
    {
        "name":    column,
        "maximum": df[column].max(),
        "minimum": df[column].min(),
        "mean":    df[column].mean(),
        "median":  df[column].median(),
        "std":     df[column].std(),
    }
    for column in df.columns.values
]
# Select the columns explicitly to fix their display order.
_df = pd.DataFrame(rows)[summary_columns]
_df
Out[8]:
In [9]:
# Columns in which the value -9 is mapped to -1.
# NOTE(review): -9 looks like a "not available" placeholder written by the
# reconstruction / likelihood code -- confirm against the n-tuple producer.
sentinel_columns = [
    "TTHReco_best_TTHReco",
    "TTHReco_best_Higgs_mass",
    "TTHReco_best_Higgsbleptop_mass",
    "TTHReco_best_bbHiggs_dR",
    "TTHReco_withH_best_Higgsttbar_dR",
    "TTHReco_best_Higgsleptop_dR",
    "TTHReco_best_b1Higgsbhadtop_dR",
    "LHD_Discriminant",
]
# Assign the result back to the frame instead of calling
# df[column].replace(..., inplace = True): the chained in-place form modifies
# a temporary Series and leaves df unchanged when pandas Copy-on-Write is
# active, and the warning about it is silenced by the notebook's
# warnings.filterwarnings("ignore").
for column in sentinel_columns:
    df[column] = df[column].replace(-9, -1)
In [10]:
# Enlarge the default figure size (this also applies to the figures drawn by
# later cells), then draw one histogram per column.
plt.rcParams.update({"figure.figsize": (17, 14)})
df.hist();
In [11]:
# Correlation matrix of the input variables, target == 1 rows only.
corr_target1 = df.query("target == 1").drop("target", axis = 1).corr()
sns.heatmap(corr_target1);
In [12]:
# Correlation matrix of the input variables, target == 0 rows only.
corr_target0 = df.query("target == 0").drop("target", axis = 1).corr()
sns.heatmap(corr_target0);
In [13]:
# Element-wise ratio of the target == 1 correlation matrix to the
# target == 0 correlation matrix.
corr_target1 = df.query("target == 1").drop("target", axis = 1).corr()
corr_target0 = df.query("target == 0").drop("target", axis = 1).corr()
_df = corr_target1 / corr_target0
sns.heatmap(_df);
In [14]:
# Hierarchically clustered correlation matrix of all columns (target included).
correlations = df.corr()
plot = sns.clustermap(correlations)
# Keep the row labels horizontal so they stay readable.
ytick_labels = plot.ax_heatmap.get_yticklabels()
plt.setp(ytick_labels, rotation = 0);
In [15]:
# Correlation of every column with the target, largest first. The first row
# is skipped: it is the target's correlation with itself.
target_corr = df.corr()["target"]
target_corr.sort_values(ascending = False).to_frame().iloc[1:]
Out[15]:
In [16]:
# Absolute correlation of every column with the target, largest first. The
# first row is skipped: it is the target's correlation with itself.
target_corr_abs = df.corr()["target"].abs()
target_corr_abs.sort_values(ascending = False).to_frame().iloc[1:]
Out[16]:
In [17]:
# Horizontal bar chart of the absolute correlation of each variable with the
# target (the first row, the target itself, is skipped).
_df = df.corr()["target"].abs().sort_values(ascending = False).to_frame()[1:]
# legend must be the boolean False: the string "False" is truthy, so it
# leaves the legend switched on.
_df.plot(kind = "barh", legend = False);
In [18]:
# Names of the ten columns with the largest absolute correlation to the
# target; position 0 (the target itself) is skipped.
ranked_abs_corr = df.corr()["target"].abs().sort_values(ascending = False)
names = ranked_abs_corr.iloc[1:11].index.values
# Clustered correlation matrix restricted to those columns.
plot = sns.clustermap(df[names].corr())
ytick_labels = plot.ax_heatmap.get_yticklabels()
plt.setp(ytick_labels, rotation = 0);
In [19]:
# Min-max rescale every input column; the "target" label is left untouched.
variables_rescale = [name for name in df.columns if name != "target"]
scaler = MinMaxScaler()
scaler.fit(df[variables_rescale])
df[variables_rescale] = scaler.transform(df[variables_rescale])
df.head()
Out[19]:
In [20]:
# Write the processed frame to CSV without the row index.
df.to_csv(path_or_buf = "ttHbb_data.csv", index = False)
In [ ]: