In [1]:
%pylab inline
In [2]:
!cd toy_datasets; wget -O MiniBooNE_PID.txt -nc https://archive.ics.uci.edu/ml/machine-learning-databases/00199/MiniBooNE_PID.txt
In [4]:
import numpy, pandas
from rep.utils import train_test_split
from sklearn.metrics import roc_auc_score
data = pandas.read_csv('toy_datasets/MiniBooNE_PID.txt', sep=r'\s*', skiprows=[0], header=None, engine='python')
# the first line of the file holds the number of signal and background events
labels = pandas.read_csv('toy_datasets/MiniBooNE_PID.txt', sep=' ', nrows=1, header=None)
labels = [1] * labels[1].values[0] + [0] * labels[2].values[0]
data.columns = ['feature_{}'.format(key) for key in data.columns]
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, train_size=0.5)
In [5]:
train_variables = ["feature_new01: feature_0/feature_1", "feature_2", "feature_26", "feature_12", "feature_24",
"feature_25", "feature_16",]
plot_variables = train_variables + ['feature_3']
This class is an OrderedDict with an additional interface. Its main methods are:
- factory.add_classifier(name, classifier): adds a classifier to the factory under the given name
- factory.fit(X, y, sample_weight=None, ipc_profile=None, features=None): trains all classifiers in the factory. If features is not None, all classifiers are trained on these features; the name of an IPython cluster can be passed via ipc_profile for parallel training.
- factory.test_on_lds(lds): tests all models on lds (a rep.data.storage.LabeledDataStorage) and returns a report (a rep.report.classification.ClassificationReport), as sketched below
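A minimal sketch of the last method, assuming the LabeledDataStorage constructor takes the data and its labels; the call should be equivalent to the factory.test_on(test_data, test_labels) used later in this notebook:
from rep.data.storage import LabeledDataStorage

# wrap the dataset together with its labels (constructor arguments assumed)
lds = LabeledDataStorage(test_data, test_labels)
report = factory.test_on_lds(lds)  # returns a ClassificationReport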
In [6]:
from rep.metaml import ClassifiersFactory
from rep.estimators import TMVAClassifier, SklearnClassifier, XGBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
In [7]:
factory = ClassifiersFactory()
# There are different ways to add classifiers to the factory:
factory.add_classifier('tmva', TMVAClassifier(NTrees=50, features=train_variables, Shrinkage=0.05))
# plain sklearn classifiers are accepted as well (wrapped into SklearnClassifier automatically)
factory.add_classifier('ada', AdaBoostClassifier(n_estimators=10))
factory['xgb'] = XGBoostClassifier(features=train_variables)
In [8]:
from copy import deepcopy
factory_copy = deepcopy(factory)
In [9]:
%time factory.fit(train_data, train_labels, features=train_variables)
pass
In [10]:
factory.predict_proba(train_data)
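A sketch of inspecting the returned predictions, assuming predict_proba returns an OrderedDict keyed by classifier name (consistent with the factory itself being an OrderedDict):
probs = factory.predict_proba(train_data)
for name, proba in probs.items():
    # each value is an array of per-class probabilities, one row per event
    print(name, proba.shape)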
In [11]:
%time factory_copy.fit(train_data, train_labels)
pass
The ClassificationReport class provides the possibility to get a classification description and to compare different models. Below you can find the available functions that help you analyze results on an arbitrary dataset. Different plotting backends are supported: matplotlib (plot()) and plotly (plot_plotly()) are both used below.
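The pattern used throughout the rest of this notebook: each report method returns a plotting object that can be rendered with either backend without recomputing. A minimal sketch:
report = factory.test_on(test_data, test_labels)
roc = report.roc()                      # compute the curve once
roc.plot()                              # render with matplotlib
roc.plot_plotly(plotly_filename='ROC')  # render the same object with plotly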
In [12]:
report = factory.test_on(test_data, test_labels)
In [13]:
features_importances = report.feature_importance()
features_importances.plot()
In [14]:
features_importances.plot_plotly('importances', figsize=(15, 6))
In [15]:
from rep.report.metrics import RocAuc
In [16]:
learning_curve = report.learning_curve(RocAuc(), metric_label='ROC AUC', steps=1)
learning_curve.plot(new_plot=True)
In [17]:
# plotting the same curve (without recomputing) using different plotting library
learning_curve.plot_plotly(plotly_filename='learning curves', figsize=(18, 8))
In [18]:
correlation_pairs = []
correlation_pairs.append((plot_variables[0], plot_variables[1]))
correlation_pairs.append((plot_variables[0], plot_variables[2]))
report.scatter(correlation_pairs, alpha=0.01).plot()
In [19]:
# plot correlations between variables for signal-like and background-like events
report.features_correlation_matrix(features=plot_variables).plot(new_plot=True, show_legend=False, figsize=(7, 5))
In [20]:
report.features_correlation_matrix_by_class(features=plot_variables).plot(new_plot=True, show_legend=False, figsize=(15, 5))
In [21]:
# plot correlations between variables for background-like events only
corr = report.features_correlation_matrix_by_class(features=plot_variables[:4], labels_dict={0: 'background'}, grid_columns=1)
corr.plot_plotly(plotly_filename='correlations', show_legend=False, fontsize=8, figsize=(8, 6))
In [22]:
# use only the features common to all classifiers
report.features_pdf().plot()
In [23]:
# use all features in data
report.features_pdf(data.columns).plot_plotly('distributions')
In [24]:
report.prediction_pdf().plot(new_plot=True, figsize=(9, 4))
In [25]:
report.prediction_pdf(labels_dict={0: 'background'}, size=5).plot_plotly('models pdf')
In [26]:
report.roc().plot(xlim=(0.5, 1))
In [27]:
# plot the same curve using the interactive plotly backend
report.roc().plot_plotly(plotly_filename='ROC')
In [28]:
efficiencies = report.efficiencies(['feature_3'], ignored_sideband=0.01)
efficiencies_with_errors = report.efficiencies(['feature_3'], errors=True, bins=15, ignored_sideband=0.01)
In [29]:
efficiencies.plot(figsize=(18, 25), fontsize=12, show_legend=False)
efficiencies_with_errors.plot(figsize=(18, 25), fontsize=12, show_legend=False)
In [30]:
efficiencies.plot_plotly("efficiencies", show_legend=False, figsize=(18, 20))
efficiencies_with_errors.plot_plotly("efficiencies error", show_legend=False, figsize=(18, 20))
In [31]:
# define metric functions of interest
def AMS(s, b):
    # approximate median significance, with a small regularization term b_reg
    b_reg = 0.01
    radicand = 2 * ((s + b + b_reg) * numpy.log(1.0 + s / (b + b_reg)) - s)
    return numpy.sqrt(radicand)

def significance(s, b):
    # simple significance estimate s / sqrt(b), regularized to avoid division by zero
    b_reg = 0.01
    radicand = s / numpy.sqrt(b + b_reg)
    return radicand
metrics = report.metrics_vs_cut(AMS, metric_label='AMS')
metrics.plot(new_plot=True, figsize=(15, 6))
In [32]:
metrics = report.metrics_vs_cut(significance, metric_label='significance')
metrics.plot(new_plot=True, figsize=(15, 6))
In [33]:
metrics.plot_plotly('metrics')
Exercise 1. Create a weight column for the train and test datasets, then fit the factory using these weights (the sample_weight argument). Get model information using the weights; see the sketch after the exercises.
Exercise 2. Train other classifiers; play with their parameters and feature sets.
Exercise 3. Try using your cluster (change paths and configurations).
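A minimal sketch for Exercise 1, using uniform placeholder weights (the weight column name is hypothetical) and the sample_weight argument from the factory.fit signature listed earlier:
# hypothetical uniform weights; replace with physically meaningful per-event weights
train_data['weight'] = numpy.ones(len(train_data))
test_data['weight'] = numpy.ones(len(test_data))

weighted_factory = deepcopy(factory)
weighted_factory.fit(train_data, train_labels,
                     sample_weight=train_data['weight'], features=train_variables)
weighted_report = weighted_factory.test_on(test_data, test_labels)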