This notebook takes ROOT files of ${t\bar{t}H}$ and ${t\bar{t}b\bar{b}}$ samples, applies a selection, imputes some values and then exports the resulting data to CSV.
In [1]:
# Notebook-wide imports and display/plot configuration.
import datetime
import keras
from keras import activations
from keras.datasets import mnist
from keras.layers import Dense, Flatten
from keras.layers import Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Dropout
from keras.models import Sequential
from keras.utils import plot_model
from matplotlib import gridspec
import matplotlib.pylab as plt
from matplotlib.ticker import NullFormatter, NullLocator, MultipleLocator
import pandas as pd
# Let DataFrame displays show up to 500 rows and 500 columns.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
import numpy as np
# NOTE(review): pandas is already imported a few lines above; this second import is redundant.
import pandas as pd
from scipy import stats
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
# Seaborn defaults for every plot in this notebook: "ticks" style and the "husl" palette.
sns.set(style = 'ticks')
sns.set_palette('husl')
import sqlite3
import talos as ta
from vis.visualization import visualize_activation
from vis.visualization import visualize_saliency
from vis.utils import utils
import warnings
# Silences every Python warning for the rest of the session (this also hides
# deprecation warnings raised by the libraries imported above).
warnings.filterwarnings("ignore")
import root_pandas
In [2]:
%matplotlib inline
plt.rcParams["figure.figsize"] = [17, 14]
In [3]:
# Branches read from each ROOT file. The first four (lepton/jet/b-tag counts)
# are the ones referenced by the event-selection strings later on.
variables = [
    "nElectrons",
    "nMuons",
    "nJets",
    "nBTags_70",
    "dRbb_avg_Sort4",
    "dRbb_MaxPt_Sort4",
    "dEtajj_MaxdEta",
    "Mbb_MindR_Sort4",
    "Mjj_MindR",
    "nHiggsbb30_Sort4",
    "HT_jets",
    "dRlepbb_MindR_Sort4",
    "Aplanarity_jets",
    "H1_all",
    "TTHReco_best_TTHReco",
    "TTHReco_best_Higgs_mass",
    "TTHReco_best_Higgsbleptop_mass",
    "TTHReco_best_bbHiggs_dR",
    "TTHReco_withH_best_Higgsttbar_dR",
    "TTHReco_best_Higgsleptop_dR",
    "TTHReco_best_b1Higgsbhadtop_dR",
    "LHD_Discriminant",
]

# Name of the tree read from every input file.
TREE_NAME = "nominal_Loose"

filenames_ttH = ["ttH_group.phys-higgs.11468583._000005.out.root"]
filenames_ttbb = ["ttbb_group.phys-higgs.11468624._000005.out.root"]

# Load each sample and tag it with its class label: ttH -> 1, ttbb -> 0.
ttH = root_pandas.read_root(filenames_ttH, TREE_NAME, columns=variables)
ttbb = root_pandas.read_root(filenames_ttbb, TREE_NAME, columns=variables)
ttH["classification"] = 1
ttbb["classification"] = 0

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# the two samples keep their own indices, so row labels are duplicated and any
# label-based lookup or alignment on df becomes ambiguous.
df = pd.concat([ttH, ttbb], ignore_index=True)
df.head()
Out[3]:
In [4]:
# Event selections, written as pandas query strings.
selection_ejets = "(nElectrons == 1) & (nJets >= 4)"
selection_mujets = "(nMuons == 1) & (nJets >= 4)"
# The jet requirements below were changed to agree with the region names
# (previously `nJets == 4` for "5JE" and `nJets == 6` for "6JI").
# NOTE(review): the suffixes are read as "5JE" = exactly 5 jets, "6JI" = 6 or
# more jets, "4BI" = 4 or more b-tags - confirm against the analysis definition.
selection_ejets_5JE4BI = "(nElectrons == 1) & (nJets == 5) & (nBTags_70 >= 4)"
selection_ejets_6JI4BI = "(nElectrons == 1) & (nJets >= 6) & (nBTags_70 >= 4)"

# Keep the e+jets events, then drop the counting variables that were only
# needed for the selection. Chaining returns a new frame instead of mutating
# the result of query() in place.
df = df.query(selection_ejets).drop(
    columns=["nElectrons", "nMuons", "nJets", "nBTags_70"]
)
df.head()
Out[4]:
In [5]:
# Replace the value -9 with -1 in the reconstruction and likelihood columns.
# NOTE(review): -9 is presumably the placeholder written when the
# reconstruction is unavailable - confirm with the ntuple producer.
columns_to_impute = [
    "TTHReco_best_TTHReco",
    "TTHReco_best_Higgs_mass",
    "TTHReco_best_Higgsbleptop_mass",
    "TTHReco_best_bbHiggs_dR",
    "TTHReco_withH_best_Higgsttbar_dR",
    "TTHReco_best_Higgsleptop_dR",
    "TTHReco_best_b1Higgsbhadtop_dR",
    "LHD_Discriminant",
]

# A single DataFrame.replace limited to the listed columns, assigned back to df.
# The previous per-column `df[col].replace(..., inplace=True)` modified a
# temporary Series, which leaves df untouched under pandas Copy-on-Write.
df = df.replace({column: -9 for column in columns_to_impute}, -1)
In [6]:
# Summary statistics of every column of df, as a check after selection and imputation.
df.describe()
Out[6]:
In [7]:
# One histogram per column of df; the trailing semicolon hides the returned axes repr.
df.hist();
In [8]:
# Clustered correlation matrix of the input variables, restricted to the
# rows labelled classification == 1.
_df = df.loc[df["classification"] == 1].drop(columns="classification")
plot = sns.clustermap(_df.corr())
# Keep the row labels horizontal so the variable names stay readable.
for tick_label in plot.ax_heatmap.get_yticklabels():
    tick_label.set_rotation(0)
In [9]:
# Clustered correlation matrix of the input variables, restricted to the
# rows labelled classification == 0.
_df = df.loc[df["classification"] == 0].drop(columns="classification")
plot = sns.clustermap(_df.corr())
# Keep the row labels horizontal so the variable names stay readable.
for tick_label in plot.ax_heatmap.get_yticklabels():
    tick_label.set_rotation(0)
In [10]:
# Rank the input variables by the absolute value of their correlation with
# the class label, strongest first.
# The label's correlation with itself is removed by name. The earlier
# positional `[1:]` relied on that entry sorting first, which is not
# guaranteed when another column ties at |corr| == 1.
_df = (
    df.corr()["classification"]
    .drop("classification")
    .abs()
    .sort_values(ascending=False)
    .to_frame()
)
_df
Out[10]:
In [11]:
# Horizontal bar chart of the ranking in _df: one bar per variable, with bar
# length equal to the absolute correlation with the class label.
plt.rcParams["figure.figsize"] = [8, 8]
# x and y are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally, so the earlier positional call raises a TypeError there.
sns.barplot(x=_df["classification"], y=_df.index);
plt.xlabel('absolute correlation with class')
plt.show();
In [12]:
# Optional min-max rescaling of every column except the class label.
# The constant-False guard disables it, so nothing below is executed.
# NOTE(review): the trailing df.head() is taken to be inside the guard because
# this cell has no recorded output - confirm against the original notebook.
if False:
    scaler = MinMaxScaler()
    variables_rescale = [name for name in df.columns if name != "classification"]
    rescaled_values = scaler.fit_transform(df[variables_rescale])
    df[variables_rescale] = rescaled_values
    df.head()
In [13]:
# Write df to ttHbb_data.csv in the working directory; index=False leaves the row index out of the file.
df.to_csv("ttHbb_data.csv", index=False)