In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
import pandas
import root_numpy

In [3]:
train_features = ['Bplus_ENDVERTEX_Y', 'Bplus_P', 'Bplus_PT']  # inputs the classifiers may use
uniform_features = ['KS0_TAU']  # the classifier response should be flat along these
all_features = train_features + uniform_features

In [4]:
def read_data(filename):
    # read the first 200000 entries of the listed branches from the 'DecayTree' tree
    df = root_numpy.root2array(filename, treename='DecayTree', branches=all_features, stop=200000)
    return pandas.DataFrame(df)

In [5]:
bck = read_data('/moosefs/notebook/datasets/inflation/highSb_data-LL_strict_BDT_96Up.root')
sig = read_data('/moosefs/notebook/datasets/inflation/12113095-LL-Official.root')
print len(sig), len(bck)


16683 200000

Train/test split


In [6]:
data = pandas.concat([sig, bck])
answers = numpy.concatenate([numpy.ones(len(sig)), numpy.zeros(len(bck))])

In [7]:
from hep_ml.commonutils import train_test_split
trainX, testX, trainY, testY = train_test_split(data, answers, train_size=0.51)

Training classifiers

uGB+FL (uniform gradient boosting with flatness loss), trained to be uniform in background
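
Roughly, uGB+FL optimizes an AdaBoost-like exponential loss plus a flatness term computed over k-nearest-neighbour groups of the uniform variable. A sketch of the objective, following the uGB+FL paper (the exact normalization in hep_ml may differ):

\[
L \;=\; L_{\mathrm{FL}} + \alpha\, L_{\mathrm{ada}},
\qquad
L_{\mathrm{FL}} \;=\; \sum_b w_b \int \bigl| F_b(s) - F(s) \bigr|^{2}\, ds
\]

Here F_b(s) is the cumulative distribution of predictions inside knn-group b of KS0_TAU (among background events), F(s) is the global one, w_b is the fraction of events in group b, and alpha corresponds to the ada_coefficient used below.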


In [8]:
from sklearn.ensemble import GradientBoostingClassifier
from hep_ml.ugradientboosting import uGradientBoostingClassifier, KnnFlatnessLossFunction

In [9]:
from rep.metaml import ClassifiersFactory
from rep.estimators import SklearnClassifier

classifiers = ClassifiersFactory()

gb = GradientBoostingClassifier(n_estimators=400, max_depth=4, min_samples_leaf=100, learning_rate=0.05)
classifiers['GB'] = SklearnClassifier(gb, features=train_features)

# uniform_label=0 -> flatness is enforced among background events;
# ada_coefficient weights the exponential (AdaBoost-like) part of the loss
loss = KnnFlatnessLossFunction(uniform_features, ada_coefficient=0.1, uniform_label=0)
ugb = uGradientBoostingClassifier(loss=loss, train_variables=train_features, n_estimators=400, 
                                  max_depth=4, min_samples_leaf=100, learning_rate=0.2)
classifiers['uGB'] = SklearnClassifier(ugb)

classifiers.fit(trainX, trainY)
pass  # keeps the notebook from echoing the factory object


model GB           was trained in 34.54 seconds
model uGB          was trained in 155.60 seconds
Totally spent 190.14 seconds on training
/mnt/mfs/notebook/axelr/lhcb_ml/lhcb_trigger_ml/hep_ml/losses.py:306: UserWarning: 5193 events out of all bins 
  warnings.warn("%i events out of all bins " % numpy.sum(out_of_bins), UserWarning)

Looking at results


In [10]:
predictions = classifiers.test_on(testX, testY)


/mnt/mfs/notebook/REP/rep_core/rep/data/storage.py:15: UserWarning: The data processing pipeline is unstable and it's probably will be changed in the future.
  "it's probably will be changed in the future.", UserWarning)

In [11]:
from rep.report import metrics
predictions.learning_curve(metrics={'roc': metrics.RocAuc()}, steps=1)


Out[11]: [figure: ROC AUC learning curves on the test set for GB and uGB]
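
As a cross-check (not part of the original run), the final-stage AUC can be recomputed directly with sklearn; this assumes rep's wrappers expose the usual sklearn-style predict_proba:

from sklearn.metrics import roc_auc_score

for name in ['GB', 'uGB']:
    proba = classifiers[name].predict_proba(testX)[:, 1]  # column 1 = signal probability
    print name, roc_auc_score(testY, proba)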

In [12]:
predictions.roc()


Out[12]: [figure: ROC curves on the test set for GB and uGB]

Uniformity in background (lower is better)
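
KnnBasedSDE reports the standard deviation of the classifier's efficiency across k-nearest-neighbour groups in KS0_TAU at fixed global efficiencies. A minimal hand-rolled analogue with hard quantile bins and a single 50% global efficiency (binned_sde is a hypothetical helper, and rep's sklearn-style predict_proba is assumed):

def binned_sde(proba, x, n_bins=20):
    cut = numpy.percentile(proba, 50)  # global 50%-efficiency threshold
    edges = numpy.percentile(x, numpy.linspace(0, 100, n_bins + 1))
    effs = []
    for lo, hi in zip(edges[:-1], edges[1:]):
        mask = (x >= lo) & (x < hi)
        if mask.sum():
            effs.append(numpy.mean(proba[mask] > cut))  # local efficiency in this bin
    return numpy.std(effs)

bck_mask = testY == 0  # flip to testY == 1 for the signal version below
proba = classifiers['uGB'].predict_proba(testX[bck_mask])[:, 1]
print binned_sde(proba, testX[bck_mask]['KS0_TAU'].values)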


In [13]:
from hep_ml.metrics import KnnBasedSDE
predictions.learning_curve(metrics={'sde': KnnBasedSDE(uniform_features=uniform_features, uniform_label=0)})


Out[13]: [figure: SDE learning curves in background for GB and uGB]

Uniformity in signal (lower is better)


In [14]:
predictions.learning_curve(metrics={'sde': KnnBasedSDE(uniform_features=uniform_features, uniform_label=1)})


Out[14]: [figure: SDE learning curves in signal for GB and uGB]

In [15]:
# ignored_sideband trims the outer tails of KS0_TAU before computing efficiencies
predictions.efficiencies(features=uniform_features, ignored_sideband=0.02)


Out[15]: [figure: classifier efficiency vs KS0_TAU at several global-efficiency thresholds]
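
ignored_sideband=0.02 excludes the extreme tails of the uniform variable from the efficiency plots. A minimal sketch of equivalent masking by hand (assumption: the fraction is dropped from each side of the distribution):

lo, hi = numpy.percentile(testX['KS0_TAU'], [2, 98])
kept = (testX['KS0_TAU'] > lo) & (testX['KS0_TAU'] < hi)
# efficiencies would then be evaluated on testX[kept] only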