In [1]:
%pylab inline
In [2]:
!cd toy_datasets; wget -O magic04.data -nc --no-check-certificate https://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.data
In [3]:
import numpy, pandas
from rep.utils import train_test_split
from sklearn.metrics import roc_auc_score
columns = ['fLength', 'fWidth', 'fSize', 'fConc', 'fConc1', 'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist', 'g']
data = pandas.read_csv('toy_datasets/magic04.data', names=columns)
labels = numpy.array(data['g'] == 'g', dtype=int)
data = data.drop('g', axis=1)
In [4]:
import numpy
import pandas
from rep import utils
from sklearn.ensemble import GradientBoostingClassifier
from rep.report.metrics import RocAuc
from rep.metaml import GridOptimalSearchCV, FoldingScorer, RandomParameterOptimizer
from rep.estimators import SklearnClassifier, TMVAClassifier, XGBoostRegressor
In [5]:
# define grid parameters
grid_param = {}
grid_param['learning_rate'] = [0.2, 0.1, 0.05, 0.02, 0.01]
grid_param['max_depth'] = [2, 3, 4, 5]
# use random hyperparameter optimization algorithm
generator = RandomParameterOptimizer(grid_param)
# define folding scorer
scorer = FoldingScorer(RocAuc(), folds=3, fold_checks=3)
In [6]:
%%time
estimator = SklearnClassifier(GradientBoostingClassifier(n_estimators=30))
grid_finder = GridOptimalSearchCV(estimator, generator, scorer, parallel_profile='threads-4')
grid_finder.fit(data, labels)
In [7]:
grid_finder.params_generator.print_results()
In many applications we need to optimize some binary classification metric (F1, BER, misclassification error). In that case, after training each classifier we have to find the optimal threshold on the predicted probabilities (the default cut is usually far from optimal).
In this example:
the ams metric is wrapped in OptimalMetric, so the best threshold is picked for every trained classifier
a TMVA gradient-boosted BDT is tuned
RegressionParameterOptimizer with a Gaussian Process regressor proposes new parameter points based on the results already collected
different feature subsets are passed as one of the grid parameters
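The idea behind such a threshold scan can be sketched in a few lines of plain numpy (a toy illustration with a hypothetical best_threshold helper, not REP's implementation; accuracy is used only as a stand-in for the binary metric):
import numpy
from sklearn.metrics import accuracy_score

def best_threshold(y_true, proba, metric):
    # try every distinct predicted probability as a cut and keep the best one
    best_cut, best_value = 0.5, -numpy.inf
    for cut in numpy.unique(proba):
        value = metric(y_true, proba >= cut)
        if value > best_value:
            best_cut, best_value = cut, value
    return best_cut, best_value

# toy usage: hand-made labels and probabilities
y = numpy.array([0, 0, 1, 1, 1])
p = numpy.array([0.1, 0.4, 0.35, 0.8, 0.9])
print(best_threshold(y, p, accuracy_score))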
In [8]:
from rep.metaml import RegressionParameterOptimizer
from sklearn.gaussian_process import GaussianProcess
from rep.report.metrics import OptimalMetric, ams
In [9]:
%%time
# OptimalMetric is a wrapper which is able to check all possible thresholds
# the expected numbers of signal and background events are set to arbitrary values
optimal_ams = OptimalMetric(ams, expected_s=100, expected_b=1000)
# define grid parameters
grid_param = {'Shrinkage': [0.4, 0.2, 0.1, 0.05, 0.02, 0.01],
              'NTrees': [5, 10, 15, 20, 25],
              # you can pass different sets of features to be compared
              'features': [columns[:2], columns[:3], columns[:4]],
              }
# using GaussianProcesses
generator = RegressionParameterOptimizer(grid_param, n_evaluations=10, regressor=GaussianProcess(), n_attempts=10)
# define folding scorer
scorer = FoldingScorer(optimal_ams, folds=2, fold_checks=2)
grid_finder = GridOptimalSearchCV(TMVAClassifier(method='kBDT', BoostType='Grad',), generator, scorer, parallel_profile='threads-3')
grid_finder.fit(data, labels)
In [10]:
grid_finder.generator.print_results()
In [11]:
plot(grid_finder.generator.grid_scores_.values())
Out[11]: [plot of the collected grid scores]
REP supports the sklearn way of combining estimators and getting/setting their parameters.
So you can tune complex models using the same approach.
Let's optimize a BaggingRegressor built on top of XGBoostRegressor, tuning parameters of both the bagging meta-estimator and its base estimator at once.
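Parameters of the nested estimator are addressed with sklearn's double-underscore convention (base_estimator__&lt;param&gt;), which is exactly what the grid below relies on. A minimal sketch, using the same BaggingRegressor over XGBoostRegressor combination that is tuned next:
from sklearn.ensemble import BaggingRegressor
from rep.estimators import XGBoostRegressor

bagging = BaggingRegressor(XGBoostRegressor(), n_estimators=10)
# nested parameters appear in get_params() with the 'base_estimator__' prefix ...
print(sorted(bagging.get_params().keys()))
# ... so both levels can be configured through one flat dictionary, as grid search does
bagging.set_params(max_samples=0.5, base_estimator__n_estimators=20, base_estimator__eta=0.2)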
In [12]:
from sklearn.ensemble import BaggingRegressor
from rep.estimators import XGBoostRegressor
In [13]:
from rep.utils import train_test_split
# splitting into train and test
train_data, test_data, train_labels, test_labels = train_test_split(data, labels)
In [14]:
from sklearn.metrics import mean_absolute_error
from sklearn.base import clone
class MyMAEScorer(object):
    def __init__(self, test_data, test_labels):
        self.test_data = test_data
        self.test_labels = test_labels

    def __call__(self, base_estimator, params, X, y, sample_weight=None):
        cl = clone(base_estimator)
        cl.set_params(**params)
        cl.fit(X, y)
        # return with a minus sign, because the grid search maximizes the score
        return -mean_absolute_error(self.test_labels, cl.predict(self.test_data))
In [15]:
%%time
# define grid parameters
grid_param = {
    # parameters of sklearn Bagging
    'n_estimators': [1, 3, 5, 7],
    'max_samples': [0.2, 0.4, 0.6, 0.8],
    # parameters of base (XGBoost)
    'base_estimator__n_estimators': [10, 20, 40],
    'base_estimator__eta': [0.1, 0.2, 0.4, 0.6, 0.8]
}
# using Gaussian Processes
generator = RegressionParameterOptimizer(grid_param, n_evaluations=10, regressor=GaussianProcess(), n_attempts=10)
estimator = BaggingRegressor(XGBoostRegressor(), n_estimators=10)
scorer = MyMAEScorer(test_data, test_labels)
grid_finder = GridOptimalSearchCV(estimator, generator, scorer, parallel_profile=None)
grid_finder.fit(train_data, train_labels)
In [16]:
grid_finder.generator.print_results()
Grid search in REP extends sklearn's grid search with optimization techniques that avoid an exhaustive scan of the estimator parameters.
REP comes with predefined scorers, metric functions and optimization techniques. Each component is replaceable, so you can optimize complex models and pipelines (folders, bagging, boosting and so on).
ParameterOptimizer is responsible for generating the next set of parameters to be checked.
Scorer is responsible for training the estimator and evaluating the metric.
GridOptimalSearchCV ties everything together and dispatches the tasks to a cluster or to separate threads.
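A simplified sketch of how these three roles fit together (toy code with a hypothetical toy_optimal_search function, not REP's actual implementation):
import random

def toy_optimal_search(grid, scorer, n_evaluations=10):
    results = {}
    for _ in range(n_evaluations):
        # ParameterOptimizer role: propose a new set of parameters
        # (random sampling here; REP can also regress on previously collected scores)
        params = {name: random.choice(values) for name, values in grid.items()}
        # Scorer role: train with the proposed parameters and return a quality value
        results[tuple(sorted(params.items()))] = scorer(params)
    # coordinator role: keep track of all evaluations and report the best point found
    best = max(results, key=results.get)
    return best, results

# toy usage: the "score" is an arbitrary function of the parameters
best, scores = toy_optimal_search({'learning_rate': [0.1, 0.05, 0.02], 'max_depth': [2, 3, 4]},
                                  scorer=lambda p: -p['learning_rate'] * p['max_depth'])
print(best)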