In [1]:
import numpy
import pandas
import root_numpy
from os import listdir
from collections import defaultdict
from hep_ml.commonutils import train_test_split
from hep_ml.config import ipc_profile
In [2]:
# NOTE(review): hardcoded absolute path to the PID dataset — adjust per machine
# (consider a configurable DATA_DIR).
folder = '/mnt/w76/notebook/datasets/PID_Jones/'
# Path of the TTree inside each ROOT file.
treename = 'ANNPID/DecayTree'
In [3]:
# Integer class label -> particle-type prefix used in the dataset folder names
# (El = electron, Ka = kaon, Mu = muon, Pr = proton, Inc = inclusive).
origins = dict(enumerate(['El', 'Ka', 'Mu', 'Pr', 'Inc']))
# Inverse mapping: prefix -> integer label.
origins_inv = dict(zip(origins.values(), origins.keys()))
origins_inv
Out[3]:
In [4]:
# Branch names used as classifier input features.
train_variables = [
# --- track kinematics and fit quality ---
'TrackP',
'TrackPt',
'TrackChi2PerDof',
'TrackNumDof',
'TrackLikelihood',
'TrackGhostProbability',
'TrackFitMatchChi2',
'TrackFitVeloChi2',
'TrackFitVeloNDoF',
'TrackFitTChi2',
'TrackFitTNDoF',
# --- RICH detector flags and delta log-likelihoods ---
'RichUsedAero',
'RichUsedR1Gas',
'RichUsedR2Gas',
'RichAboveElThres',
'RichAboveMuThres',
'RichAbovePiThres',
'RichAboveKaThres',
'RichAbovePrThres',
'RichDLLe',
'RichDLLmu',
'RichDLLk',
'RichDLLp',
'RichDLLbt',
# --- muon system ---
'MuonBkgLL',
'MuonMuLL',
'MuonIsMuon',
'MuonNShared',
'InAccMuon',
'MuonIsLooseMuon',
# --- calorimeters (ECAL/HCAL/Preshower/Bremsstrahlung) ---
'EcalPIDe',
'EcalPIDmu',
'HcalPIDe',
'HcalPIDmu',
'PrsPIDe',
'InAccBrem',
'BremPIDe',
# --- other ---
'VeloCharge',
'TrackType']
In [5]:
# Map each input ROOT file to its integer particle label, derived from the
# directory-name prefix (e.g. 'El-BdJPsiX' -> 'El' -> 0).
file_labels = {}
for name in listdir(folder):
    particle = name.split('-')[0][:3]
    # Skip directory entries whose prefix is not a known particle type,
    # instead of raising KeyError on stray files in the dataset folder.
    if particle not in origins_inv:
        continue
    file_labels["{}/ANNPID.1.root".format(name)] = origins_inv[particle]
file_labels
Out[5]:
In [6]:
# Sanity check: list all branches available in one example tree
# (Python 2 print statement).
print root_numpy.list_branches(folder + 'El-BdJPsiX/ANNPID.1.root', treename=treename)
In [7]:
# Preselection cut in ROOT/TTree syntax: long tracks only (TrackType == 3),
# momentum in (0, 100000) — presumably MeV, i.e. below 100 GeV (TODO confirm
# units) — a sane fit likelihood, and positive transverse momentum.
selection = "(TrackType == 3) && (TrackP > 0) && (TrackP < 100000) && (TrackLikelihood > -100.0) && (TrackPt > 0)"
In [8]:
data_parts = []
label_parts = []
max_events_per_file = 200000
for file_index, (filename, label) in enumerate(file_labels.items()):
if label not in [1, 2]: # Kaon vs Muon
continue
data_part = pandas.DataFrame(root_numpy.root2array(folder + filename, treename=treename,
branches=train_variables, stop=max_events_per_file, selection=selection))
print filename, len(data_part)
data_parts.append(data_part)
label_parts.append(numpy.zeros(len(data_part), dtype=int) + label)
In [14]:
# Stack the per-file chunks into one DataFrame with an aligned label array.
data = pandas.concat(data_parts, ignore_index=True)
labels = numpy.concatenate(label_parts)
answers = labels == origins_inv['Ka'] # Kaon is signal, everything else - bck
In [16]:
# Sample TrackP (first 10k rows passing the preselection, per file) for every
# particle origin, to compare momentum spectra between the classes.
# The unused enumerate/file_index of the original loop is dropped.
track_p = defaultdict(list)
for filename, label in file_labels.items():
    track_p_part = root_numpy.root2array(folder + filename, treename=treename, branches=['TrackP'],
                                         selection=selection, stop=10000)['TrackP']
    track_p[label].append(track_p_part)
In [17]:
# Overlay the normalised TrackP spectrum of every particle origin.
# (hist/legend are pylab-style globals assumed in scope; `normed` is the
# old matplotlib name for density normalisation.)
for origin_label, momentum_chunks in track_p.items():
    hist(numpy.concatenate(momentum_chunks), label=origins[origin_label], bins=40, histtype='step', normed=True)
legend()
Out[17]:
In [18]:
# One histogram panel per feature, signal vs background overlaid.
# (Relies on pylab-style globals: figure/subplot/hist/legend/title.)
columns = sorted(data.columns)
figure(figsize=[18, 70])
for i, column in enumerate(columns, 1):
    subplot((len(columns) + 2) // 3, 3, i)
    col_data = data[column]
    # Plot range from the [0.1, 99.9] percentiles, excluding -999 values
    # (presumably a missing-value sentinel — TODO confirm).
    limits = numpy.percentile(col_data[col_data != -999], [0.1, 99.9])
    # .loc replaces the deprecated/removed .ix indexer; semantics are
    # identical for a boolean row mask plus a column label.
    hist(data.loc[answers == 0, column].values, bins=30, normed=True, range=limits, alpha=0.3, label='bck', color='r')
    hist(data.loc[answers == 1, column].values, bins=30, normed=True, range=limits, alpha=0.3, label='sig', color='b')
    legend(loc='best')
    title(column)
In [19]:
# NOTE(review): train_size=0.2 keeps only 20% of the events for training
# (80% held out) — presumably to speed up fitting; confirm this is intentional.
# random_state fixes the split for reproducibility.
trainX, testX, trainY, testY, train_labels, test_labels = train_test_split(data, answers, labels, train_size=0.2, random_state=42)
In [20]:
# Sizes of the train / test partitions.
len(trainX), len(testX)
Out[20]:
In [21]:
from hep_ml import ClassifiersDict
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from hep_ml import ugradientboosting as ugb
In [22]:
# Three models trained on the same split:
#   RF  — random forest baseline
#   GB  — stock gradient boosting
#   uGB — uniform gradient boosting with a flatness loss in TrackP
#         (presumably to keep the selection efficiency flat versus momentum —
#         see the efficiency/sde plots below).
classifiers = ClassifiersDict()
classifiers['RF'] = RandomForestClassifier(n_estimators=150, max_depth=10,
min_samples_split=100, n_jobs=4, max_features=8)
classifiers['GB'] = GradientBoostingClassifier(subsample=0.1, min_samples_split=300,
max_depth=10, max_features=10, n_estimators=150)
# Flatness penalty computed in 15 bins of TrackP; ada_coefficient trades off
# flatness against classification quality.
loss = ugb.BinFlatnessLossFunction(uniform_variables=['TrackP'], n_bins=15, ada_coefficient=0.1)
classifiers['uGB'] = ugb.uGradientBoostingClassifier(loss=loss, subsample=0.1, min_samples_split=300,
max_depth=10, max_features=10, n_estimators=150)
In [23]:
# Fit all classifiers; ipc_profile (from hep_ml.config) presumably dispatches
# the fits to an IPython-parallel cluster — verify the profile is running.
# The trailing `pass` suppresses the cell's output value.
classifiers.fit(trainX, trainY, ipc_profile=ipc_profile)
pass
In [24]:
# Evaluate every classifier on the held-out set and draw ROC curves.
predictions = classifiers.test_on(testX, testY)
predictions.roc()
Out[24]:
In [25]:
# Quality versus the number of boosting stages, sampled every 10th stage.
predictions.learning_curves(step=10)
Out[25]:
In [26]:
# Signal efficiency in bins of TrackP — visual check of flatness vs momentum.
predictions.efficiency(['TrackP'])
Out[26]:
In [27]:
# SDE curves in TrackP — hep_ml's quantitative non-flatness measure
# (presumably "squared deviation of efficiency"; see hep_ml docs).
predictions.sde_curves(['TrackP'])
In [ ]: