We convert the event data stored in the ROOT signal and background files to numpy arrays using the root_numpy package. We can control the features we want to train on by extracting the branch names directly from the ROOT TTree and pruning the unwanted branches.
In [1]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from root_numpy import root2array, rec2array, array2root
import ROOT
import sys, glob, os, time, random
from sklearn import datasets
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
mpl.rcParams.update({'font.size': 12})
In [35]:
def pruned(branch_names):
    """Return branch_names with unwanted TTree branches removed.

    A branch is dropped when its name contains any of the blacklisted
    substrings below. The order of the surviving branches is preserved
    and duplicate names are removed (first occurrence wins).
    """
    from collections import OrderedDict
    # Substrings identifying branches that must not be used for training
    # (truth/labels, vertex quantities, candidate bookkeeping, ...).
    unwanted = ['hso01', 'hso03', 'Signal', 'VtxProd', 'Delta', 'X', 'Y', 'Z', 'Rho', 'dr', 'dz',
                'nCands', 'iCand', 'Continuum', 'qrMC', 'P4cms']
    # Single pass instead of the previous remove-while-iterating nested loops.
    kept = [branch for branch in branch_names
            if not any(bad in branch for bad in unwanted)]
    # OrderedDict keys give order-preserving de-duplication.
    return list(OrderedDict.fromkeys(kept))
In [65]:
def numpy_convert(ROOTfile, treename='b0', B_meson_only = False):
    """Read the selected branches of the TTrees in ROOTfile into numpy arrays.

    By default every tree in the file is converted and two parallel lists
    (arrays, branch-name lists) are returned. With B_meson_only=True only
    `treename` is read and a single (array, branch_names) pair is returned.
    """
    infile = ROOT.TFile(ROOTfile)
    if B_meson_only:
        tree_names = [treename]
    else:
        tree_names = sorted({key.GetName() for key in infile.GetListOfKeys()})
    arrays, names = [], []
    for tname in tree_names:
        ttree = infile.Get(tname)
        all_branches = [b.GetName() for b in ttree.GetListOfBranches()]
        # all_branches[0] is assumed to be isSignal(B); it is pruned out by
        # pruned() (contains 'Signal') and re-appended here as the label column.
        selected = pruned(all_branches) + [all_branches[0]]
        data = rec2array(root2array(ROOTfile, tname, selected))
        if B_meson_only:
            return data, selected
        arrays.append(data)
        names.append(selected)
    return arrays, names
In [111]:
# Specify input ROOT files here and the decay mode
sig_ROOT = "ewp/rho0/B02rho0gammasignal.root"   # signal MC
Kst_ROOT = "ewp/rho0/B02rho0gammacustom.root"   # custom/K* background sample
cfd_ROOT = "ewp/rho0/B02rho0gammaX_s.root"      # crossfeed from misreconstructed b -> s gamma
cont_ROOT = "ewp/rho0/B02rho0gammaqq.root"      # light-quark continuum background
mode = "rho0"
training_mode = 'Continuum' # or 'crossfeed', 'generic'
In [57]:
# Select background, signal data
# The continuum file is used as background here, matching training_mode = 'Continuum'.
sig, sig_branches = numpy_convert(sig_ROOT, treename = 'b0')
bkg, bkg_branches = numpy_convert(cont_ROOT, treename = 'b0')
Next we extract the features and the truth labels from the data. The complete dataset is stored as a Pandas dataframe for easy access. The dataset is partitioned into training and testing sets twice to ensure no overfitting occurs when we exploit the validation set for hyperparameter tuning.
In [58]:
def create_dataframe(sig, bkg, branch_names):
    """Wrap the signal and background numpy arrays in pandas DataFrames.

    Double underscores in the branch names are collapsed to single
    underscores, and any row containing a NaN is dropped.
    """
    columns = [name.replace("__", "_") for name in branch_names]
    frame_sig = pd.DataFrame(sig, columns=columns).dropna()
    frame_bkg = pd.DataFrame(bkg, columns=columns).dropna()
    return frame_sig, frame_bkg
In [ ]:
# Testing Higgs dataset only
# Overrides the rho0 configuration above with a standalone Higgs sample.
sig_ROOT = 'higgs/higgs_signal.root'
bkg_ROOT = 'higgs/higgs_background.root'
mode = 'higgs'
training_mode = 'test'
loadFile = ROOT.TFile(sig_ROOT)
trees = list(set([key.GetName() for key in loadFile.GetListOfKeys()]))
# NOTE(review): sig/bkg/df_sig/df_bkg are overwritten on every iteration,
# so only the LAST tree in the (unsorted) set survives this loop — confirm
# the Higgs files contain a single tree.
for tree in trees:
    loadTree = loadFile.Get(tree)
    branches = [b.GetName() for b in loadTree.GetListOfBranches()]
    sig = rec2array(root2array(sig_ROOT, tree, branches))
    bkg = rec2array(root2array(bkg_ROOT, tree, branches))
    df_sig, df_bkg = create_dataframe(sig, bkg, branches)
In [89]:
# Build one signal/background DataFrame pair per tree returned by numpy_convert.
# Presumably index 0 = B0 candidate, 1 = photon, 2 = resonance — verify against
# the tree ordering in the ROOT file.
df_sig_B0, df_bkg_B0 = create_dataframe(sig[0], bkg[0], sig_branches[0])
df_sig_gamma, df_bkg_gamma = create_dataframe(sig[1], bkg[1], sig_branches[1])
df_sig_res, df_bkg_res = create_dataframe(sig[2], bkg[2], sig_branches[2])
In [69]:
df_sig_B0.head()
Out[69]:
In [90]:
# Train network pairwise - in this case it is using the B0 features for continuum and signal
# Uncomment one of the alternative pairs below to train on photon or resonance features instead.
df_sig = df_sig_B0
df_bkg = df_bkg_B0
# df_sig = df_sig_gamma
# df_bkg = df_bkg_gamma
# df_sig = df_sig_res
# df_bkg = df_bkg_res
In [71]:
df_bkg.head()
Out[71]:
In [ ]:
# Store dfs in HDF5 format if required
# NOTE(review): df_cont, df_cfd and df_cus are not defined anywhere in this
# notebook as shown — this cell will raise NameError unless they are created
# elsewhere (e.g. by re-running the loading cells with the other background files).
store = pd.HDFStore('/home/justan/current/jupyter/persistance/rho0/dfs/rho0.h5')
store['df_sig'] = df_sig
store['df_cont'] = df_cont
store['df_cfd'] = df_cfd
store['df_cus'] = df_cus
store.close()
In [72]:
def get_correlated_features(corrmat, corr_threshold = 0.95):
    """Return the features to drop because they are strongly correlated with another.

    Scans the correlation matrix `corrmat` (square DataFrame) for off-diagonal
    entries with |coefficient| > corr_threshold, skipping any feature whose
    name contains 'deltae' or 'mbc' (physics observables that must be kept).
    Each symmetric pair is counted once, in the orientation first encountered
    while iterating column-by-column; of that pair the second feature is
    marked for removal, unless it mentions 'bo0' (a resonance daughter), in
    which case the first one is dropped instead.

    Fix: the previous version removed mirrored pairs from the list while
    iterating over it, a fragile pattern; the deltae/mbc removal pass was
    dead code (such pairs were already excluded at collection time).
    """
    offending_pairs = []
    seen = set()
    for column in corrmat.columns:
        if ('deltae' in column) or ('mbc' in column):
            continue
        for row in corrmat[column].index:
            if ('deltae' in row) or ('mbc' in row):
                continue
            if row == column:
                continue
            if abs(corrmat.loc[row, column]) <= corr_threshold:
                continue
            pair = frozenset((column, row))
            # A symmetric matrix yields every pair twice; keep only the
            # first orientation encountered.
            if pair not in seen:
                seen.add(pair)
                offending_pairs.append([column, row])
    correlated_features = []
    for first, second in offending_pairs:
        if 'bo0' in second:  # Keep properties of the resonance
            correlated_features.append(first)
        else:
            correlated_features.append(second)
    return list(set(correlated_features))
In [73]:
def plot_correlations(data, mode):
    """Draw the feature correlation matrix of `data` as a seaborn heatmap.

    Saves the figure to graphs/<mode>correlations.pdf and also displays it.
    `mode` is used both in the plot title and in the output filename.
    """
    # Plot correlation matrix elements between features as a heatmap
    corrmat = data.corr()
    f, ax = plt.subplots(figsize=(14,11))
    sns.heatmap(corrmat, square=True, cmap='RdBu')
    plt.xticks(rotation=90)  # vertical feature labels on the x-axis
    plt.yticks(rotation=0)
    f.tight_layout()
    plt.title(r"$\mathrm{Correlation \; Coefficients \; " + mode + "}$")
    f.savefig('graphs/' + mode + 'correlations.pdf', format='pdf', dpi=1000)
    plt.show()
    plt.gcf().clear()  # reset the current figure so later plots start clean
In [74]:
def df_norm_std(df):
    """Standardize every feature column to zero mean and unit variance.

    All columns except the last are scaled with sklearn's StandardScaler;
    the last column is treated as the truth labels and is passed through
    unchanged under the name 'labels'.
    """
    from sklearn import preprocessing
    feature_cols = df.columns[:-1]
    scaled = preprocessing.StandardScaler().fit_transform(df[feature_cols])
    result = pd.DataFrame(scaled, columns = feature_cols)
    result['labels'] = df[df.columns[-1]].values
    return result
In [75]:
def df_preprocessing(df):
    """Center each feature and scale it to the range [-1, 1].

    Every column except the last is mean-centred and then divided by its
    maximum absolute value (MaxAbsScaler). The last column is assumed to
    hold the truth labels and is copied through unchanged as 'labels'.

    Fix: the previous version overrode the labels with the hard-coded
    column 'B0__isSignal', which cannot exist after create_dataframe
    collapses '__' to '_' (KeyError) and breaks non-B0 datasets such as
    the Higgs test sample. The unused local `scale` import is gone too.
    """
    feature_columns = df.columns[:-1]
    # Remove the mean of each feature.
    df_centered = df[feature_columns] - df[feature_columns].mean()
    # Scale each centred feature by its maximum absolute value -> [-1, 1].
    scaler = preprocessing.MaxAbsScaler()
    df_pre = pd.DataFrame(scaler.fit_transform(df_centered[feature_columns]), columns = feature_columns)
    df_pre['labels'] = df[df.columns[-1]].values
    return df_pre
Define the background type you want the classifier to train on. Default options: light quark continuum, crossfeed from misreconstructed $b \rightarrow s \gamma$, $B \bar{B}$ generic. You can add your own. Certain decay channels will have a significant background from topologically identical and kinematically similar modes. The classifier should be trained to identify this as background.
In [91]:
# Remove correlated features from DataFrame
correlated_features = get_correlated_features(df_sig.corr(), corr_threshold=0.97) + ['B0_mbc', 'B0_deltae']
mandatory_features = ['ThrustB']  # features never removed, even if correlated
# Keep untouched copies for later comparison/plots.
df_sig_raw = df_sig.copy()
df_bkg_raw = df_bkg.copy()
# NOTE(review): this nested loop is only correct while mandatory_features has
# exactly one entry — with several, `del` would run once per non-matching
# entry and raise KeyError on the second attempt.
for feature in correlated_features:
    for variable in mandatory_features:
        if variable not in feature:
            del df_sig[feature]
            del df_bkg[feature]
In [39]:
plot_correlations(df_sig.drop(df_sig.columns[-1],1), 'SIG')  # drop the label column before plotting
In [40]:
plot_correlations(df_bkg.drop(df_sig.columns[-1],1), 'BKG')  # NOTE(review): uses df_sig's last column name to drop from df_bkg — assumes identical column layout
In [95]:
# Stack signal on top of background into one training dataset; the last
# column (isSignal) keeps the truth labels.
frames = [df_sig,df_bkg]
df_composite = pd.concat(frames, ignore_index = True) #, keys = ['signal', 'background'])
In [96]:
# Standardize features (zero mean, unit variance) and inspect the summary stats.
df_composite_norm = df_norm_std(df_composite)
df_composite_norm.describe()
Out[96]:
Save to file. Load into classifier in separate notebook.
In [101]:
filename = 'norm_std_B02rho0gamma_continuum.h5'
store = pd.HDFStore(os.path.join('/home/justan/current/jupyter/persistance', mode, 'dataframes', filename))
store['df'] = df_composite_norm
store.close()
In [103]:
def truncate_tails(hist, nsigma = 5):
    """Remove entries of `hist` further than nsigma std devs from its mean.

    Both cut values are computed once from the ORIGINAL distribution, so
    the truncation is symmetric as intended. (The previous version
    recomputed mean/std after applying the lower cut, which could shift
    the upper cut whenever the lower cut removed entries.)
    """
    mu, sigma = np.mean(hist), np.std(hist)
    lower, upper = mu - nsigma * sigma, mu + nsigma * sigma
    # Strict inequalities, matching the original comparisons.
    return hist[(hist > lower) & (hist < upper)]
In [104]:
def compare_histograms_overlay(data_sig, data_bkg, nbins = 50, columns=None):
    """Overlay signal vs background distributions for each variable in `columns`.

    Tails beyond 5 standard deviations are clipped via truncate_tails before
    plotting. One figure per variable is shown and then cleared.
    NOTE(review): the `columns=None` default is not usable — iterating None
    raises TypeError, so callers must always pass a column list.
    NOTE(review): sns.distplot is deprecated in recent seaborn releases.
    """
    # Plot continuum suppression variable distributions for signal, background
    sea_green = '#54ff9f'
    steel_blue = '#4e6bbd'
    crimson_tide = '#cc0000'
    yellow = '#ffff00'
    orange = '#ffa500'
    for variable in columns:
        d_sig = truncate_tails(data_sig[variable].values,5)
        d_bkg = truncate_tails(data_bkg[variable].values,5)
        # Signal keeps the default KDE overlay; background is histogram-only, normalized.
        sns.distplot(d_sig,color = sea_green, hist=True,label = r'$\mathrm{Signal}$',bins=nbins)
        sns.distplot(d_bkg,color = steel_blue, hist=True,label = r'$\mathrm{Background}$',kde=False,norm_hist=True,bins=nbins)
        # sns.kdeplot(data_array_bkg, color = steel_blue, label = 'Background',shade =True)
        # sns.kdeplot(d_cfd, color = crimson_tide, label = 'Crossfeed',shade =True)
        # sns.kdeplot(d_gen, color = yellow, label = 'Generic',shade =True)
        plt.title(variable)
        plt.autoscale(enable=True, axis='x', tight=False)
        plt.xlabel(variable)
        plt.ylabel(r'$\mathrm{Normalized \; events/bin}$')
        plt.legend(loc = "best")
        #plt.title(r"$\mathrm{"+variable+"{\; - \; (B \rightarrow K^+ \pi^0) \gamma$")
        #plt.xlim(-1.0,0.98)
        #plt.ylim(0,3.3)
        #plt.xlabel(r'$|(p_B)_{CMS}| \; [GeV/c]$')
        #plt.savefig('graphs/' + mode + variable + '.pdf', bbox_inches='tight',format='pdf', dpi=1000)
        plt.show()
        plt.gcf().clear()  # reset figure state between variables
In [113]:
df_sig.columns
Out[113]:
In [115]:
# Overlay signal vs background for a hand-picked set of discriminating variables.
compare_histograms_overlay(df_sig, df_bkg, nbins=70, columns = ['B0_q2Bh', 'B0_ThrustB', 'B0_Pcms', 'B0B0_rho0_hel', 'B0_helicityAngle_bo0_cm0_bc'])