In [1]:
%pylab inline
In [2]:
!cd toy_datasets; wget -O MiniBooNE_PID.txt -nc --no-check-certificate https://archive.ics.uci.edu/ml/machine-learning-databases/00199/MiniBooNE_PID.txt
In [4]:
import numpy, pandas
from rep.utils import train_test_split
from sklearn.metrics import roc_auc_score
# the first line of the file lists the number of signal and background events;
# the remaining lines hold the feature values, signal rows first
data = pandas.read_csv('toy_datasets/MiniBooNE_PID.txt', sep=r'\s+', skiprows=[0], header=None)
labels = pandas.read_csv('toy_datasets/MiniBooNE_PID.txt', sep=' ', nrows=1, header=None)
labels = [1] * labels[1].values[0] + [0] * labels[2].values[0]
data.columns = ['feature_{}'.format(key) for key in data.columns]
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, train_size=0.5)
In [5]:
train_variables = ["feature_new01: feature_0/feature_1", "feature_2", "feature_26", "feature_12", "feature_24",
"feature_25", "feature_16",]
plot_variables = train_variables + ['feature_3']
The ClassifiersFactory class is an OrderedDict with an additional interface; its main methods are:
- factory.add_classifier(name, classifier) - adds a classifier to the factory under the given name
- factory.fit(X, y, sample_weight=None, ipc_profile=None, features=None) - trains all classifiers in the factory; if features is not None, all classifiers are trained on these features; the name of an IPython cluster can be passed via ipc_profile for parallel training
- factory.test_on_lds(lds) - tests all models on an lds (rep.data.storage.LabeledDataStorage) and returns a report (rep.report.classification.ClassificationReport)
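Once the factory has been built and fitted (as in the cells below), the LabeledDataStorage route looks roughly like this. This is only a minimal sketch, assuming the constructor signature LabeledDataStorage(data, target, sample_weight=None); the rest of this notebook uses the equivalent factory.test_on shortcut instead.
from rep.data.storage import LabeledDataStorage

lds = LabeledDataStorage(test_data, test_labels)
lds_report = factory.test_on_lds(lds)  # a rep.report.classification.ClassificationReport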
In [6]:
from rep.metaml import ClassifiersFactory
from rep.estimators import TMVAClassifier, SklearnClassifier, XGBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
In [7]:
factory = ClassifiersFactory()
# There are different ways to add classifiers to Factory:
factory.add_classifier('tmva', TMVAClassifier(NTrees=50, features=train_variables, Shrinkage=0.05))
factory.add_classifier('ada', AdaBoostClassifier(n_estimators=10))
factory['xgb'] = XGBoostClassifier(features=train_variables)
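In addition to passing a plain scikit-learn estimator (as with 'ada' above), the SklearnClassifier wrapper imported earlier can be used explicitly, which lets the estimator carry its own feature subset. A hedged sketch, assuming the wrapper signature SklearnClassifier(clf, features=None); the name 'ada_sel' and the extra entry are only illustrative.
# explicit wrapping of a scikit-learn estimator, restricted to train_variables
factory.add_classifier('ada_sel',
                       SklearnClassifier(AdaBoostClassifier(n_estimators=10),
                                         features=train_variables))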
In [8]:
from copy import deepcopy
factory_copy = deepcopy(factory)
In [9]:
%time factory.fit(train_data, train_labels, features=train_variables)
pass  # 'pass' suppresses printing the returned factory
In [10]:
factory.predict_proba(train_data)
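The roc_auc_score imported above turns these predictions into a quick quality check. This is a minimal sketch, assuming that factory.predict_proba returns an OrderedDict mapping each classifier name to an array of class probabilities of shape (n_samples, 2).
# compare classifiers on the held-out data
for name, proba in factory.predict_proba(test_data).items():
    print('%s ROC AUC: %.4f' % (name, roc_auc_score(test_labels, proba[:, 1])))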
In [11]:
%time factory_copy.fit(train_data, train_labels)
pass  # the copy is trained on all features, for comparison with the factory above
The ClassificationReport class provides a classification summary for comparing different models. Below you can find the available functions which help you analyze results on an arbitrary dataset. Several plotting backends are supported: matplotlib (the default) and plotly (used through the plot_plotly calls shown in the comments below).
In [12]:
report = factory.test_on(test_data, test_labels)
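Each plot object returned by the report can be rendered with different backends; a small sketch using the report just created (the plot_plotly call mirrors the commented examples further below, and 'roc curves' is only an illustrative filename):
roc_plot = report.roc()
roc_plot.plot(new_plot=True)           # matplotlib, the default backend
# roc_plot.plot_plotly('roc curves')   # the same curve via the interactive plotly backend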
In [13]:
features_importances = report.feature_importance()
features_importances.plot()
# not only in matplotlib, but in other libraries too. For instance, with plotly
# features_importances.plot_plotly('importances', figsize=(15, 6))
In [14]:
from rep.report.metrics import RocAuc
In [15]:
learning_curve = report.learning_curve(RocAuc(), metric_label='ROC AUC', steps=1)
learning_curve.plot(new_plot=True, figsize=(8, 4))
# with plotly
# learning_curve.plot_plotly(plotly_filename='learning curves', figsize=(18, 8))
In [16]:
correlation_pairs = []
correlation_pairs.append((plot_variables[0], plot_variables[1]))
correlation_pairs.append((plot_variables[0], plot_variables[2]))
report.scatter(correlation_pairs, alpha=0.01).plot()
In [17]:
# plot correlations between variables for signal-like and bck-like events
report.features_correlation_matrix(features=plot_variables).plot(new_plot=True, show_legend=False, figsize=(7, 5))
In [18]:
report.features_correlation_matrix_by_class(features=plot_variables).plot(new_plot=True,
show_legend=False, figsize=(15, 5))
# compute the correlation matrix only for zero class data
# report.features_correlation_matrix_by_class(features=plot_variables[:4], labels_dict={0: 'background'}, grid_columns=1).plot()
In [19]:
# use only the features common to all classifiers
report.features_pdf().plot()
In [20]:
report.prediction_pdf().plot(new_plot=True, figsize=(9, 4))
# Plot the same only for zero class data
# report.prediction_pdf(labels_dict={0: 'background'}, size=5).plot()
In [21]:
figure(figsize=(14, 5))
subplot(1, 2, 1)
report.roc().plot(xlim=(0.5, 1))
subplot(1, 2, 2)
report.roc(physics_notion=True).plot(ylim=(0.5, 1))
In [22]:
efficiencies = report.efficiencies(['feature_3'], ignored_sideband=0.01)
efficiencies_with_errors = report.efficiencies(['feature_3'], errors=True, bins=15, ignored_sideband=0.01)
In [23]:
efficiencies.plot(figsize=(18, 15), fontsize=12, show_legend=False)
# plot efficiencies with error bars
# efficiencies_with_errors.plot(figsize=(18, 25), fontsize=12, show_legend=False)
In [24]:
from rep.report.metrics import significance
metrics = report.metrics_vs_cut(significance, metric_label='significance')
metrics.plot(new_plot=True, figsize=(10, 4))
# You can define your own metric and use it
# def AMS(s, b):
#     b_reg = 0.01
#     radicand = 2 * ((s + b + b_reg) * numpy.log(1.0 + s / (b + b_reg)) - s)
#     return numpy.sqrt(radicand)
# metrics = report.metrics_vs_cut(AMS, metric_label='ams')
Exercise 1. Create a weight column for the test and train datasets, and use the weights when fitting and testing the factory (a starting sketch is given after this list).
Exercise 2. Train other classifiers, and play with their parameters and feature sets.
Exercise 3. Set up your IPython cluster for distributed training of estimators.
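A hedged starting point for Exercise 1: sample_weight appears in the fit signature shown earlier, while passing sample_weight to test_on in the same way is an assumption here, and the uniform weights are only placeholders rather than a meaningful weighting.
# placeholder per-event weights; replace with physically motivated weights
train_weights = numpy.ones(len(train_data))
test_weights = numpy.ones(len(test_data))
factory.fit(train_data, train_labels, sample_weight=train_weights, features=train_variables)
report = factory.test_on(test_data, test_labels, sample_weight=test_weights)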