In [1]:
# NOTE(review): %pylab star-imports numpy/matplotlib names into the notebook namespace.
# The cells visible below only use the explicitly imported `numpy`, so `%matplotlib inline`
# may be sufficient — confirm nothing else relies on the pylab names before changing it.
%pylab inline
In [2]:
# Columns requested from the input CSV files (passed as `usecols` when reading).
loaded_variables = [
    "FlightDistance", "FlightDistanceError",
    "IP", "VertexChi2",
    "pt", "p0_pt", "p1_pt", "p2_pt",
    "LifeTime", "dira", "mass",
]

# Features handed to the classifiers.
# NOTE(review): the first entry looks like a derived feature "FL" defined by an
# expression over two loaded columns — confirm against the REP feature syntax.
train_variables = [
    "FL: FlightDistance/FlightDistanceError",
    "IP", "VertexChi2",
    "pt", "p0_pt", "p1_pt", "p2_pt",
    "LifeTime", "dira",
]

# Training features plus 'mass', used by the plotting cells.
plot_variables = train_variables + ["mass"]
In [3]:
import numpy, pandas
from rep.utils import train_test_split
In [4]:
# Read the signal and background toy samples (tab-separated), keeping only the needed columns.
sig_data = pandas.read_csv('toy_datasets/toyMC_sig_mass.csv', sep='\t', usecols=loaded_variables)
bck_data = pandas.read_csv('toy_datasets/toyMC_bck_mass.csv', sep='\t', usecols=loaded_variables)

# Signal rows get label 1 and background rows label 0, in the same order as the concatenation below.
labels = numpy.array([1] * len(sig_data) + [0] * len(bck_data))
# ignore_index=True gives the combined frame a fresh unique index; without it the
# row labels of the two files would repeat.
data = pandas.concat([sig_data, bck_data], ignore_index=True)

# Get train and test data. A fixed random_state keeps the split (and therefore every
# report below) identical across Restart & Run All.
train_data, test_data, train_labels, test_labels = train_test_split(
    data, labels, train_size=0.5, random_state=42)
The factory class is an OrderedDict with an additional interface. Its main methods are:
factory.add_classifier(name, classifier)
factory.fit(X, y, sample_weight=None, ipc_profile=None, features=None)
- trains all classifiers in the factory
- if features is not None, all classifiers will be trained on these features
- you can pass the name of an IPython cluster via ipc_profile for parallel training
factory.test_on_lds(lds)
- tests all models on lds (rep.data.storage.LabeledDataStorage)
- returns a report (rep.report.classification.ClassificationReport)
In [5]:
from rep.metaml import ClassifiersFactory
from rep.estimators import TMVAClassifier, SklearnClassifier, XGBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
In [6]:
# Start from an empty factory and register three classifiers under short names.
factory = ClassifiersFactory()

# Classifiers can be registered either through add_classifier(...) or by item assignment.
factory.add_classifier(
    'tmva',
    TMVAClassifier(NTrees=50, features=train_variables[:5], Shrinkage=0.05),
)
factory.add_classifier(
    'ada',
    AdaBoostClassifier(n_estimators=10),
)
factory['xgb'] = XGBoostClassifier(features=train_variables[2:6])
In [7]:
from copy import deepcopy
# Independent copy of the still-unfitted factory; it is trained separately further down.
factory_copy = deepcopy(factory)
In [8]:
# Train the classifiers in the factory on the training sample; %time reports the duration.
%time factory.fit(train_data, train_labels)
# NOTE(review): the trailing `pass` presumably keeps the cell from echoing a return value — confirm.
pass
In [9]:
# Predicted probabilities from the fitted factory on the training sample (shown as the cell output).
factory.predict_proba(train_data)
Out[9]:
In [10]:
# Train the copied factory, this time passing features=train_variables so that
# all classifiers are trained on the same feature list.
%time factory_copy.fit(train_data, train_labels, features=train_variables)
# NOTE(review): the trailing `pass` presumably keeps the cell from echoing a return value — confirm.
pass
The ClassificationReport class provides the possibility to get a classification description in order to compare different models.
Below you can find the available functions, which can help you analyze the results on an arbitrary dataset.
There are different plotting backends supported:
In [11]:
# Evaluate the fitted factory on the held-out test sample; `report` feeds every cell below.
report = factory.test_on(test_data, test_labels)
In [12]:
# Feature importances taken from the report, drawn with the default plotting backend.
features_importances = report.feature_importance()
features_importances.plot()
In [13]:
# The same feature importances drawn through the plotly backend, under the name 'importances'.
features_importances.plot_plotly('importances', figsize=(15, 6))
In [14]:
from sklearn.metrics import roc_auc_score, log_loss
def log_likelihood(y_true, y_pred, sample_weight=None):
    """Log-loss metric computed on column 1 of the predicted probabilities.

    :param y_true: true labels
    :param y_pred: predicted probabilities; only ``y_pred[:, 1]`` is used
    :param sample_weight: optional per-event weights, forwarded to ``log_loss``
    :return: the value returned by ``sklearn.metrics.log_loss``
    """
    # Forward the weights so that weighted reports are scored with them,
    # the same way roc_auc forwards them to roc_auc_score.
    return log_loss(y_true, y_pred[:, 1], sample_weight=sample_weight)
def roc_auc(y_true, y_pred, sample_weight=None):
    """ROC AUC metric computed on column 1 of the predicted probabilities.

    :param y_true: true labels
    :param y_pred: predicted probabilities; only ``y_pred[:, 1]`` is used
    :param sample_weight: optional per-event weights, forwarded to ``roc_auc_score``
    :return: the value returned by ``sklearn.metrics.roc_auc_score``
    """
    class_one_scores = y_pred[:, 1]
    return roc_auc_score(y_true, class_one_scores, sample_weight=sample_weight)
# Learning curve of the log_likelihood metric, plotted on a new figure.
learning_curve = report.learning_curve(log_likelihood, metric_label='log likelihood', steps=1)
learning_curve.plot(new_plot=True)
In [15]:
# Learning curve of the roc_auc metric; this rebinds `learning_curve` to the ROC AUC curve.
learning_curve = report.learning_curve(roc_auc, metric_label='roc auc', steps=1)
learning_curve.plot(new_plot=True)
In [16]:
# Plotly rendering of the most recently computed learning curve (the 'roc auc' one above).
learning_curve.plot_plotly(plotly_filename='learning curves', figsize=(18, 8))
In [17]:
# Feature pairs for the scatter plots: the first plot variable against the second and the third.
correlation_pairs = [
    (plot_variables[0], plot_variables[1]),
    (plot_variables[0], plot_variables[2]),
]
# A very low alpha keeps dense regions readable.
report.scatter(correlation_pairs, alpha=0.01).plot()
In [18]:
# plot the correlation matrix of the loaded variables
# NOTE(review): this call is the non-by-class variant, so it presumably covers all test
# events together; the per-class split is produced by the *_by_class call in the next cell.
report.features_correlation_matrix(features=loaded_variables).plot(new_plot=True, show_legend=False, figsize=(7, 5))
In [19]:
# plot correlations between the plot variables, split by class (per the method name)
report.features_correlation_matrix_by_class(features=plot_variables).plot(new_plot=True, show_legend=False, figsize=(15, 5))
In [20]:
# Correlations between the first four plot variables, restricted to class 0 ('background').
corr = report.features_correlation_matrix_by_class(
    features=plot_variables[:4],
    labels_dict={0: 'background'},
    grid_columns=1,
)
# Interactive (plotly) rendering of that matrix.
corr.plot_plotly(
    plotly_filename='correlations',
    show_legend=False,
    fontsize=8,
    figsize=(8, 6),
)
In [21]:
# Feature distributions; with no arguments, only the features common to all classifiers are used.
report.features_pdf().plot()
In [22]:
# Feature distributions for all features in data (every column of `data`), drawn with plotly.
report.features_pdf(data.columns).plot_plotly('distributions')
In [23]:
# Distributions of the classifiers' predictions, drawn on a fresh figure.
report.prediction_pdf().plot(new_plot=True, figsize=(9, 4))
In [24]:
# Prediction distributions restricted to class 0 ('background'), drawn with plotly.
report.prediction_pdf(labels_dict={0: 'background'}, size=5).plot_plotly('models pdf')
In [25]:
# ROC curves from the report, with the x axis limited to (0.5, 1).
report.roc().plot(xlim=(0.5, 1))
In [26]:
# plot the same ROC curves using the interactive plotly backend
report.roc().plot_plotly(plotly_filename='ROC')
In [27]:
# Efficiencies computed for the 'mass' feature: once with default settings, and once
# with errors=True, 15 bins and ignored_sideband=0.01.
efficiencies = report.efficiencies(['mass'])
efficiencies_with_errors = report.efficiencies(['mass'], errors=True, bins=15, ignored_sideband=0.01)
In [ ]:
In [28]:
# Draw both efficiency results with the default plotting backend, legends hidden.
efficiencies.plot(figsize=(18, 25), fontsize=12, show_legend=False)
efficiencies_with_errors.plot(figsize=(18, 25), fontsize=12, show_legend=False)
In [29]:
# Draw the same two efficiency results through the plotly backend, under separate names.
efficiencies.plot_plotly("efficiencies", show_legend=False, figsize=(18, 20))
efficiencies_with_errors.plot_plotly("efficiencies error", show_legend=False, figsize=(18, 20))
In [31]:
# define metric functions
def AMS(s, b, br=0.01):
    """AMS metric: ``sqrt(2 * ((s + b + br) * ln(1 + s / (b + br)) - s))``.

    :param s: signal amount (scalar or numpy array)
    :param b: background amount (scalar or numpy array)
    :param br: regularisation term added to the background; the default 0.01
        is the value that was previously hard-coded
    :return: metric value, same shape as the inputs
    """
    radicand = 2 * ((s + b + br) * numpy.log(1.0 + s / (b + br)) - s)
    return numpy.sqrt(radicand)
def significance(s, b, br=0.01):
    """Significance metric: ``s / sqrt(b + br)``.

    :param s: signal amount (scalar or numpy array)
    :param b: background amount (scalar or numpy array)
    :param br: regularisation term added to the background so the denominator
        stays positive when b is 0; the default 0.01 is the value that was
        previously hard-coded
    :return: metric value, same shape as the inputs
    """
    return s / numpy.sqrt(b + br)
# Evaluate the AMS metric through report.metrics_vs_cut and plot the result on a new figure.
metrics = report.metrics_vs_cut(AMS, metric_label='AMS')
metrics.plot(new_plot=True, figsize=(15, 6))
In [32]:
# Same scan for the significance metric; this rebinds `metrics` to the significance result.
metrics = report.metrics_vs_cut(significance, metric_label='significance')
metrics.plot(new_plot=True, figsize=(15, 6))
In [33]:
# Plotly rendering of the most recently computed `metrics` (the significance scan above).
metrics.plot_plotly('metrics')
Exercise 1. Create a weight column for the test and train datasets. Then fit
the factory using these weight columns. Get the model information using the weights.
Exercise 2. Train other classifiers; play with the parameters and feature sets.
Exercise 3. Try to use your cluster (change the paths and configurations).