In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
import pandas
import root_numpy

In [3]:
train_features = ['Bplus_ENDVERTEX_Y', 'Bplus_P', 'Bplus_PT']  # inputs the classifiers may use
uniform_features = ['KS0_TAU']  # the classifier response should be flat along these
all_features = train_features + uniform_features

In [4]:
def read_data(filename):
    # read the first 200000 entries of the listed branches from the 'DecayTree' tree
    df = root_numpy.root2array(filename, treename='DecayTree', branches=all_features, stop=200000)
    return pandas.DataFrame(df)

In [5]:
bck = read_data('/moosefs/notebook/datasets/inflation/highSb_data-LL_strict_BDT_96Up.root')
sig = read_data('/moosefs/notebook/datasets/inflation/12113095-LL-Official.root')
print len(sig), len(bck)


16683 200000

Train/test split


In [6]:
data = pandas.concat([sig, bck])
answers = numpy.concatenate([numpy.ones(len(sig)), numpy.zeros(len(bck))])

In [7]:
from hep_ml.commonutils import train_test_split
trainX, testX, trainY, testY = train_test_split(data, answers, train_size=0.51)

Training classifiers

uGB+FL (uniform gradient boosting with flatness loss), trained to be uniform in background
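
Roughly, uGB+FL optimizes an AdaBoost-like exponential loss plus a flatness term computed over k-nearest-neighbour groups of the uniform variable. A sketch of the objective, following the uGB+FL paper (the exact normalization in hep_ml may differ):

\[
L \;=\; L_{\mathrm{FL}} + \alpha\, L_{\mathrm{ada}},
\qquad
L_{\mathrm{FL}} \;=\; \sum_b w_b \int \bigl| F_b(s) - F(s) \bigr|^{2}\, ds
\]

Here F_b(s) is the cumulative distribution of predictions inside knn-group b of KS0_TAU (among background events), F(s) is the global one, w_b is the fraction of events in group b, and alpha corresponds to the ada_coefficient used below.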


In [8]:
from sklearn.ensemble import GradientBoostingClassifier
from hep_ml.ugradientboosting import uGradientBoostingClassifier, KnnFlatnessLossFunction

In [9]:
from rep.metaml import ClassifiersFactory
from rep.estimators import SklearnClassifier

classifiers = ClassifiersFactory()

gb = GradientBoostingClassifier(n_estimators=400, max_depth=4, min_samples_leaf=100, learning_rate=0.05)
classifiers['GB'] = SklearnClassifier(gb, features=train_features)

# uniform_label=0 -> flatness is enforced among background events;
# ada_coefficient weights the exponential (AdaBoost-like) part of the loss
loss = KnnFlatnessLossFunction(uniform_features, ada_coefficient=0.1, uniform_label=0)
ugb = uGradientBoostingClassifier(loss=loss, train_variables=train_features, n_estimators=400, 
                                  max_depth=4, min_samples_leaf=100, learning_rate=0.2)
classifiers['uGB'] = SklearnClassifier(ugb)

classifiers.fit(trainX, trainY)
pass  # keeps the notebook from echoing the factory object


model GB           was trained in 34.54 seconds
model uGB          was trained in 155.60 seconds
Totally spent 190.14 seconds on training
/mnt/mfs/notebook/axelr/lhcb_ml/lhcb_trigger_ml/hep_ml/losses.py:306: UserWarning: 5193 events out of all bins 
  warnings.warn("%i events out of all bins " % numpy.sum(out_of_bins), UserWarning)

Looking at results


In [10]:
predictions = classifiers.test_on(testX, testY)


/mnt/mfs/notebook/REP/rep_core/rep/data/storage.py:15: UserWarning: The data processing pipeline is unstable and it's probably will be changed in the future.
  "it's probably will be changed in the future.", UserWarning)

In [11]:
from rep.report import metrics
predictions.learning_curve(metrics={'roc': metrics.RocAuc()}, steps=1)


Out[11]: [figure: ROC AUC learning curves on the test set for GB and uGB]
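
As a cross-check (not part of the original run), the final-stage AUC can be recomputed directly with sklearn; this assumes rep's wrappers expose the usual sklearn-style predict_proba:

from sklearn.metrics import roc_auc_score

for name in ['GB', 'uGB']:
    proba = classifiers[name].predict_proba(testX)[:, 1]  # column 1 = signal probability
    print name, roc_auc_score(testY, proba)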

In [12]:
predictions.roc()


Out[12]: [figure: ROC curves on the test set for GB and uGB]

Uniformity in background (lower is better)
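
KnnBasedSDE reports the standard deviation of the classifier's efficiency across k-nearest-neighbour groups in KS0_TAU at fixed global efficiencies. A minimal hand-rolled analogue with hard quantile bins and a single 50% global efficiency (binned_sde is a hypothetical helper, and rep's sklearn-style predict_proba is assumed):

def binned_sde(proba, x, n_bins=20):
    cut = numpy.percentile(proba, 50)  # global 50%-efficiency threshold
    edges = numpy.percentile(x, numpy.linspace(0, 100, n_bins + 1))
    effs = []
    for lo, hi in zip(edges[:-1], edges[1:]):
        mask = (x >= lo) & (x < hi)
        if mask.sum():
            effs.append(numpy.mean(proba[mask] > cut))  # local efficiency in this bin
    return numpy.std(effs)

bck_mask = testY == 0  # flip to testY == 1 for the signal version below
proba = classifiers['uGB'].predict_proba(testX[bck_mask])[:, 1]
print binned_sde(proba, testX[bck_mask]['KS0_TAU'].values)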


In [13]:
from hep_ml.metrics import KnnBasedSDE
predictions.learning_curve(metrics={'sde': KnnBasedSDE(uniform_features=uniform_features, uniform_label=0)})


Out[13]: [figure: SDE learning curves in background for GB and uGB]

Uniformity in signal (lower is better)


In [14]:
predictions.learning_curve(metrics={'sde': KnnBasedSDE(uniform_features=uniform_features, uniform_label=1)})


Out[14]: [figure: SDE learning curves in signal for GB and uGB]

In [15]:
# ignored_sideband trims the outer tails of KS0_TAU before computing efficiencies
predictions.efficiencies(features=uniform_features, ignored_sideband=0.02)


Out[15]: [figure: classifier efficiency vs KS0_TAU at several global-efficiency thresholds]
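
ignored_sideband=0.02 excludes the extreme tails of the uniform variable from the efficiency plots. A minimal sketch of equivalent masking by hand (assumption: the fraction is dropped from each side of the distribution):

lo, hi = numpy.percentile(testX['KS0_TAU'], [2, 98])
kept = (testX['KS0_TAU'] > lo) & (testX['KS0_TAU'] < hi)
# efficiencies would then be evaluated on testX[kept] only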