In [1]:
%pylab inline
In [2]:
import numpy, pandas
from rep.utils import train_test_split
from sklearn.metrics import roc_auc_score
# Fixed seed so that the train/test split (and everything derived from it)
# is identical after "Restart Kernel -> Run All".
RANDOM_STATE = 42

# Signal and background toy samples are read from tab-separated files.
sig_data = pandas.read_csv('toy_datasets/toyMC_sig_mass.csv', sep='\t')
bck_data = pandas.read_csv('toy_datasets/toyMC_bck_mass.csv', sep='\t')

# Signal rows are labelled 1, background rows 0, in the same order as the concat below.
labels = numpy.array([1] * len(sig_data) + [0] * len(bck_data))
data = pandas.concat([sig_data, bck_data])

# train_size=0.7 sends 70% of the rows to the training sample.
train_data, test_data, train_labels, test_labels = train_test_split(
    data, labels, train_size=0.7, random_state=RANDOM_STATE)
In [3]:
# Names of the columns used as input features by the classifier below.
variables = [
    "FlightDistance",
    "FlightDistanceError",
    "IP",
    "VertexChi2",
    "pt",
    "p0_pt",
    "p1_pt",
    "p2_pt",
    "LifeTime",
    "dira",
]
# NOTE(review): train_data / test_data were already split off in the previous cell,
# so this only narrows `data` itself; the classifier below selects its columns
# through features=variables.
data = data[variables]
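`FoldingClassifier` implements the same interface as all other classifiers, but with some differences (for example, `predict_proba` accepts an extra `vote_function` argument, used below).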
In [4]:
from rep.estimators import SklearnClassifier
from sklearn.ensemble import GradientBoostingClassifier
In [5]:
from rep.metaml import FoldingClassifier
In [6]:
# Number of folds handed to the folding meta-classifier.
n_folds = 4

# Wrap a default-configured gradient boosting model; features=variables names
# the input columns the folder should use.
folder = FoldingClassifier(
    GradientBoostingClassifier(),
    n_folds=n_folds,
    features=variables,
)
folder.fit(train_data, train_labels)
Out[6]:
In [7]:
# Default prediction (no vote_function) on the same sample the folder was fitted on.
# NOTE(review): presumably each event is scored by the fold estimator that did not
# see it during training -- confirm against the FoldingClassifier documentation.
folder.predict_proba(train_data)
Out[7]:
In [8]:
# definition of mean function, which combines all predictions
def mean_vote(x):
    """Combine stacked predictions by averaging over the first axis.

    :param x: array-like whose first axis enumerates the predictions to combine
        (presumably one entry per fold estimator -- confirm against the
        FoldingClassifier documentation).
    :return: element-wise mean of ``x`` taken along axis 0.
    """
    return numpy.mean(x, axis=0)
In [9]:
# Predict on the held-out test sample, passing mean_vote as the function that
# combines the individual predictions.
folder.predict_proba(test_data, vote_function=mean_vote)
Out[9]:
Again, use the `ClassificationReport` class to compare different results. For a folding classifier, this report uses only the default prediction.
In [10]:
from rep.data.storage import LabeledDataStorage
from rep.report import ClassificationReport
# add folds_column to dataset to use mask
# Work on an explicit copy: train_data comes out of the split and may be a slice of
# `data`, in which case adding a column in place can raise pandas'
# SettingWithCopyWarning. The copy also makes this cell safe to re-run.
train_data = train_data.copy()
# NOTE(review): _get_folds_column is a private FoldingClassifier helper; it is called
# here with the number of training rows to obtain one fold value per row.
train_data["FOLDS"] = folder._get_folds_column(len(train_data))
# Report built on the training sample; the FOLDS column is what the
# "FOLDS == k" masks in the following cells refer to.
lds = LabeledDataStorage(train_data, train_labels)
report = ClassificationReport({'folding': folder}, lds)
In [11]:
# Plot the prediction distribution for label 1 (named 'sig fold k'),
# restricting the report to the rows of one fold at a time via the FOLDS mask.
for fold_num in range(n_folds):
    report.prediction_pdf(mask="FOLDS == %d" % fold_num, labels_dict={1: 'sig fold %d' % fold_num}).plot()
In [12]:
# Plot the prediction distribution for label 0 (named 'bck fold k'),
# restricting the report to the rows of one fold at a time via the FOLDS mask.
for fold_num in range(n_folds):
    report.prediction_pdf(mask="FOLDS == %d" % fold_num, labels_dict={0: 'bck fold %d' % fold_num}).plot()
In [13]:
# Plot one ROC curve per fold, selecting each fold's rows via the FOLDS mask.
for fold_num in range(n_folds):
    report.roc(mask="FOLDS == %d" % fold_num).plot()
In [14]:
# Build the same kind of report on the test sample.
# NOTE: this rebinds `lds` and `report`, which previously referred to the training
# sample; the per-fold cells above rely on the FOLDS column that was only added to
# train_data, so they must be run before this cell.
lds = LabeledDataStorage(test_data, test_labels)
report = ClassificationReport({'folding': folder}, lds)
In [15]:
# Prediction distributions on the test sample, drawn in a new 9x4 figure.
report.prediction_pdf().plot(new_plot=True, figsize = (9, 4))
In [16]:
# ROC curve on the test sample, with the x axis limited to the range [0.5, 1].
report.roc().plot(xlim=(0.5, 1))
In [ ]: