In this notebook we experiment with particle identification.

It is based on simulated (generated) LHCb data; only track-level information is used here.

The aim is to make kaon identification uniform in TrackP.


In [1]:
# plotting helpers used below (hist, figure, subplot, legend, title) come from the pylab namespace
%pylab inline
import numpy
import pandas
import root_numpy
from os import listdir
from collections import defaultdict

from hep_ml.commonutils import train_test_split
from hep_ml.config import ipc_profile

In [2]:
folder = '/mnt/w76/notebook/datasets/PID_Jones/'
treename = 'ANNPID/DecayTree'

In [3]:
origins = {0: 'El', 1: 'Ka', 2: 'Mu', 3: 'Pr', 4: 'Inc' }
origins_inv = {v: k for k, v in origins.items()}
origins_inv


Out[3]:
{'El': 0, 'Inc': 4, 'Ka': 1, 'Mu': 2, 'Pr': 3}

Selecting the training variables


In [4]:
train_variables = [
    'TrackP',
    'TrackPt',
    'TrackChi2PerDof',
    'TrackNumDof',
    'TrackLikelihood',
    'TrackGhostProbability',
    'TrackFitMatchChi2',
    'TrackFitVeloChi2',
    'TrackFitVeloNDoF',
    'TrackFitTChi2',
    'TrackFitTNDoF',
    'RichUsedAero',
    'RichUsedR1Gas',
    'RichUsedR2Gas',
    'RichAboveElThres',
    'RichAboveMuThres',
    'RichAbovePiThres',
    'RichAboveKaThres',
    'RichAbovePrThres',
    'RichDLLe',
    'RichDLLmu',
    'RichDLLk',
    'RichDLLp',
    'RichDLLbt',
    'MuonBkgLL',
    'MuonMuLL',
    'MuonIsMuon',
    'MuonNShared',
    'InAccMuon',
    'MuonIsLooseMuon',
    'EcalPIDe',
    'EcalPIDmu',
    'HcalPIDe',
    'HcalPIDmu',
    'PrsPIDe',
    'InAccBrem',
    'BremPIDe',
    'VeloCharge',
    'TrackType']

Events in each file are assigned a label according to the particle type encoded in the file name:


In [5]:
file_labels = {}
for name in listdir(folder):
    # the first letters of the directory name encode the particle type ('El', 'Ka', 'Mu', 'Pr', 'Inc')
    particle = name.split('-')[0][:3]
    file_labels["{}/ANNPID.1.root".format(name)] = origins_inv[particle]
file_labels


Out[5]:
{'El-BdJPsiKstar/ANNPID.1.root': 0,
 'El-BdJPsiX/ANNPID.1.root': 0,
 'El-BdKstar/ANNPID.1.root': 0,
 'El-BuJPsiK/ANNPID.1.root': 0,
 'El-BuJPsiX/ANNPID.1.root': 0,
 'El-BuKee/ANNPID.1.root': 0,
 'El-Ks4e/ANNPID.1.root': 0,
 'El-Kspipiee/ANNPID.1.root': 0,
 'El-LbJPsiX/ANNPID.1.root': 0,
 'El-Zgee40GeV/ANNPID.1.root': 0,
 'Incb-X/ANNPID.1.root': 4,
 'Incb/ANNPID.1.root': 4,
 'Incc-X/ANNPID.1.root': 4,
 'Ka-BdD-K-KKPi/ANNPID.1.root': 1,
 'Ka-BdD0KstarKpi/ANNPID.1.root': 1,
 'Ka-BdKK/ANNPID.1.root': 1,
 'Ka-BdKKPi0/ANNPID.1.root': 1,
 'Ka-BsD0PhiKpi/ANNPID.1.root': 1,
 'Ka-BuKKK/ANNPID.1.root': 1,
 'Mu-Bd4mu/ANNPID.1.root': 2,
 'Mu-BdJPsiKstar/ANNPID.1.root': 2,
 'Mu-BdJPsiX/ANNPID.1.root': 2,
 'Mu-BdKsmumu/ANNPID.1.root': 2,
 'Mu-BdKstarmumu/ANNPID.1.root': 2,
 'Mu-Bs4mu/ANNPID.1.root': 2,
 'Mu-BsJPsiPhi/ANNPID.1.root': 2,
 'Mu-BsJPsiX/ANNPID.1.root': 2,
 'Mu-BuJPsiK/ANNPID.1.root': 2,
 'Mu-BuJPsiPi/ANNPID.1.root': 2,
 'Mu-BuKmumu/ANNPID.1.root': 2,
 'Mu-BuKstarmumu/ANNPID.1.root': 2,
 'Mu-Ks4mu/ANNPID.1.root': 2,
 'Mu-Ksmumu/ANNPID.1.root': 2,
 'Mu-SigmaPmumu/ANNPID.1.root': 2,
 'Mu-Tau3mu/ANNPID.1.root': 2,
 'Mu-TauMuPhi/ANNPID.1.root': 2,
 'Pr-LbLcKa/ANNPID.1.root': 3,
 'Pr-LbPKa/ANNPID.1.root': 3,
 'Pr-LbPKaMuMu/ANNPID.1.root': 3,
 'Pr-LbPKaPhi/ANNPID.1.root': 3,
 'Pr-LbPPi/ANNPID.1.root': 3}

Listing all branches available in the first file:


In [6]:
print root_numpy.list_branches(folder + 'El-BdJPsiX/ANNPID.1.root', treename=treename)


['BremPIDe', 'CaloBremChi2', 'CaloBremMatch', 'CaloChargedEcal', 'CaloChargedPrs', 'CaloChargedSpd', 'CaloClusChi2', 'CaloEcalChi2', 'CaloEcalE', 'CaloElectronMatch', 'CaloHcalE', 'CaloNeutralEcal', 'CaloNeutralPrs', 'CaloNeutralSpd', 'CaloPrsE', 'CaloSpdE', 'CaloTrMatch', 'CaloTrajectoryL', 'CombDLLe', 'CombDLLk', 'CombDLLmu', 'CombDLLp', 'CombDLLpi', 'EcalPIDe', 'EcalPIDmu', 'HcalPIDe', 'HcalPIDmu', 'InAccBrem', 'InAccEcal', 'InAccHcal', 'InAccMuon', 'InAccPrs', 'InAccSpd', 'MuonBkgLL', 'MuonIsLooseMuon', 'MuonIsMuon', 'MuonMuLL', 'MuonNShared', 'NumCaloHypos', 'NumDownstreamTracks', 'NumLongTracks', 'NumMuonTracks', 'NumPVs', 'NumProtoParticles', 'NumRich1Hits', 'NumRich2Hits', 'NumSPDHits', 'NumTTracks', 'NumUpstreamTracks', 'NumVeloTracks', 'PrsPIDe', 'RichAboveElThres', 'RichAboveKaThres', 'RichAboveMuThres', 'RichAbovePiThres', 'RichAbovePrThres', 'RichDLLbt', 'RichDLLe', 'RichDLLk', 'RichDLLmu', 'RichDLLp', 'RichDLLpi', 'RichUsedAero', 'RichUsedR1Gas', 'RichUsedR2Gas', 'TrackChi2PerDof', 'TrackCloneDist', 'TrackDOCA', 'TrackFitMatchChi2', 'TrackFitTChi2', 'TrackFitTNDoF', 'TrackFitVeloChi2', 'TrackFitVeloNDoF', 'TrackGhostProbability', 'TrackHistory', 'TrackLikelihood', 'TrackMatchChi2', 'TrackNumDof', 'TrackP', 'TrackPt', 'TrackType', 'VeloCharge', 'RecoPIDcode', 'HasMC', 'MCParticleType', 'MCParticleP', 'MCParticlePt', 'MCVirtualMass', 'MCFromB', 'MCFromD', 'MCVertexType', 'MCVertexX', 'MCVertexY', 'MCVertexZ', 'piplus_OWNPV_X', 'piplus_OWNPV_Y', 'piplus_OWNPV_Z', 'piplus_OWNPV_XERR', 'piplus_OWNPV_YERR', 'piplus_OWNPV_ZERR', 'piplus_OWNPV_CHI2', 'piplus_OWNPV_NDOF', 'piplus_OWNPV_COV_', 'piplus_IP_OWNPV', 'piplus_IPCHI2_OWNPV', 'nCandidate', 'totCandidates', 'EventInSequence']

Reading the data


In [7]:
selection = "(TrackType == 3) && (TrackP > 0) && (TrackP < 100000) && (TrackLikelihood > -100.0) && (TrackPt > 0)"
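For reference, the same cuts can be written as a pandas mask once a DataFrame is in memory (a sketch; df here is a hypothetical DataFrame holding these branches, whereas the notebook lets root_numpy apply the selection at read time):

mask = ((df.TrackType == 3) & (df.TrackP > 0) & (df.TrackP < 100000)
        & (df.TrackLikelihood > -100.0) & (df.TrackPt > 0))
df_selected = df[mask]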

In [8]:
data_parts = []
label_parts = []

max_events_per_file = 200000
for file_index, (filename, label) in enumerate(file_labels.items()):
    if label not in [1, 2]:  # keep only kaon (Ka = 1) and muon (Mu = 2) files
        continue
    
    data_part = pandas.DataFrame(root_numpy.root2array(folder + filename, treename=treename, 
                                                       branches=train_variables, stop=max_events_per_file, selection=selection))
    print filename, len(data_part)
    data_parts.append(data_part)
    label_parts.append(numpy.zeros(len(data_part), dtype=int) + label)


Mu-SigmaPmumu/ANNPID.1.root 121816
Ka-BdKKPi0/ANNPID.1.root 121331
Mu-Ksmumu/ANNPID.1.root 119254
Ka-BdD-K-KKPi/ANNPID.1.root 124729
Ka-BdD0KstarKpi/ANNPID.1.root 124765
Mu-BuJPsiPi/ANNPID.1.root 124487
Ka-BuKKK/ANNPID.1.root 123491
Mu-BuJPsiK/ANNPID.1.root 124463
Mu-BuKmumu/ANNPID.1.root 123546
Mu-Tau3mu/ANNPID.1.root 124181
Mu-BdKstarmumu/ANNPID.1.root 124437
Mu-BdJPsiKstar/ANNPID.1.root 123729
Mu-TauMuPhi/ANNPID.1.root 123355
Mu-BdJPsiX/ANNPID.1.root 124563
Mu-Bs4mu/ANNPID.1.root 124164
Mu-Bd4mu/ANNPID.1.root 125538
Mu-BdKsmumu/ANNPID.1.root 122467
Mu-Ks4mu/ANNPID.1.root 120295
Ka-BdKK/ANNPID.1.root 122765
Ka-BsD0PhiKpi/ANNPID.1.root 123251
Mu-BsJPsiX/ANNPID.1.root 123959
Mu-BsJPsiPhi/ANNPID.1.root 124180
Mu-BuKstarmumu/ANNPID.1.root 86136

In [14]:
data = pandas.concat(data_parts, ignore_index=True)
labels = numpy.concatenate(label_parts)
answers = labels == origins_inv['Ka']  # kaon is signal, everything else is background

Plotting TrackP for the different particle types.

I don't see much difference between the distributions.


In [16]:
track_p = defaultdict(list)
for file_index, (filename, label) in enumerate(file_labels.items()):
    track_p_part = root_numpy.root2array(folder + filename, treename=treename, branches=['TrackP'], 
                                         selection=selection, stop=10000)['TrackP']
    track_p[label].append(track_p_part)

In [17]:
for key, value in track_p.items():
    hist(numpy.concatenate(value), label=origins[key], bins=40, histtype='step', normed=True)
legend()


Out[17]:
<matplotlib.legend.Legend at 0x179afdd0>

Plotting the training variables,

skipping the -999 placeholder values when choosing the plot ranges.


In [18]:
columns = sorted(data.columns)
figure(figsize=[18, 70])
for i, column in enumerate(columns, 1):
    subplot((len(columns) + 2) // 3, 3, i)
    col_data = data[column]
    limits = numpy.percentile(col_data[col_data != -999], [0.1, 99.9])
    hist(data.ix[answers == 0, column].values, bins=30, normed=True, range=limits, alpha=0.3, label='bck', color='r')
    hist(data.ix[answers == 1, column].values, bins=30, normed=True, range=limits, alpha=0.3, label='sig', color='b')
    legend(loc='best')
    title(column)


Splitting into train and test


In [19]:
# only 20% of the events are used for training, the rest is kept for testing
trainX, testX, trainY, testY, train_labels, test_labels = train_test_split(data, answers, labels, train_size=0.2, random_state=42)

In [20]:
len(trainX), len(testX)


Out[20]:
(560180, 2240722)
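A quick sanity check (not part of the original analysis): the signal fraction should be similar in both parts of the split.

print 'signal fraction: train %.3f, test %.3f' % (numpy.mean(trainY), numpy.mean(testY))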

Simple classification


In [21]:
from hep_ml import ClassifiersDict
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from hep_ml import ugradientboosting as ugb

In [22]:
classifiers = ClassifiersDict()
classifiers['RF'] = RandomForestClassifier(n_estimators=150, max_depth=10, 
                                           min_samples_split=100, n_jobs=4, max_features=8)

classifiers['GB'] = GradientBoostingClassifier(subsample=0.1, min_samples_split=300, 
                                               max_depth=10, max_features=10, n_estimators=150)

# BinFlatnessLossFunction penalizes non-uniformity of the predictions in bins of TrackP;
# ada_coefficient sets the relative weight of the usual (Ada) boosting loss against the flatness penalty
loss = ugb.BinFlatnessLossFunction(uniform_variables=['TrackP'], n_bins=15, ada_coefficient=0.1)
classifiers['uGB'] = ugb.uGradientBoostingClassifier(loss=loss, subsample=0.1, min_samples_split=300, 
                                                     max_depth=10, max_features=10, n_estimators=150)

In [23]:
classifiers.fit(trainX, trainY, ipc_profile=ipc_profile)
pass


Classifier           RF is learnt in 321.24 seconds
Classifier           GB is learnt in 328.50 seconds
Classifier          uGB is learnt in 137.00 seconds
Totally spent 331.32 seconds on parallel training

ROC curves


In [24]:
predictions = classifiers.test_on(testX, testY)
predictions.roc()


Out[24]:
<hep_ml.reports.Predictions at 0x1644af50>
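As an independent cross-check (my addition, assuming ClassifiersDict iterates like an ordinary dict of fitted classifiers exposing the usual predict_proba interface), the areas under the ROC curves can also be computed directly with scikit-learn:

from sklearn.metrics import roc_auc_score
for name, clf in classifiers.items():
    print name, roc_auc_score(testY, clf.predict_proba(testX)[:, 1])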

Learning curves


In [25]:
predictions.learning_curves(step=10)


Out[25]:
<hep_ml.reports.Predictions at 0x1644af50>

Efficiency curves


In [26]:
predictions.efficiency(['TrackP'])


Out[26]:
<hep_ml.reports.Predictions at 0x1644af50>

Staged SDE (a measure of non-uniformity; lower is better)
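SDE (standard deviation of efficiency) measures, roughly, how much the per-bin signal efficiency fluctuates around the global signal efficiency in bins of the uniform variable, here TrackP. A minimal sketch of the idea at a single fixed threshold (my own illustration; hep_ml averages over several global efficiencies and evaluates the metric stage by stage):

def sde_sketch(proba_signal, track_p, threshold, n_bins=15):
    # equal-statistics bins in TrackP
    edges = numpy.percentile(track_p, numpy.linspace(0, 100, n_bins + 1))
    bin_ids = numpy.searchsorted(edges[1:-1], track_p)
    passed = proba_signal > threshold
    global_eff = numpy.mean(passed)
    sde2 = 0.
    for b in range(n_bins):
        in_bin = (bin_ids == b)
        if not in_bin.any():
            continue
        weight = float(in_bin.sum()) / len(track_p)   # bin weight = share of events
        sde2 += weight * (numpy.mean(passed[in_bin]) - global_eff) ** 2
    return numpy.sqrt(sde2)

A perfectly uniform classifier would give the same efficiency in every bin and hence SDE = 0.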


In [27]:
predictions.sde_curves(['TrackP'])



In [ ]: