In [1]:
%pylab inline
In [2]:
!cd toy_datasets; wget -O magic04.data -nc https://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.data
In [3]:
import numpy, pandas
from rep.utils import train_test_split
from sklearn.metrics import roc_auc_score
columns = ['fLength', 'fWidth', 'fSize', 'fConc', 'fConc1', 'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist', 'g']
data = pandas.read_csv('toy_datasets/magic04.data', names=columns)
labels = numpy.array(data['g'] == 'g', dtype=int)
data = data.drop('g', axis=1)
In [4]:
import numpy
In [5]:
import numexpr
In [7]:
import pandas
In [6]:
from rep import utils
In [ ]:
from sklearn.ensemble import GradientBoostingClassifier
In [ ]:
from rep.report.metrics import RocAuc
In [ ]:
from rep.metaml import GridOptimalSearchCV, FoldingScorer, RandomParameterOptimizer
from rep.estimators import SklearnClassifier, XGBoostClassifier
In [ ]:
# define grid parameters
grid_param = {}
grid_param['learning_rate'] = [0.2, 0.1, 0.05, 0.02, 0.01]
grid_param['max_depth'] = [2, 3, 4, 5]
# use random hyperparameter optimization algorithm
generator = RandomParameterOptimizer(grid_param)
# define folding scorer
scorer = FoldingScorer(RocAuc(), folds=3, fold_checks=3)
In [ ]:
estimator = SklearnClassifier(GradientBoostingClassifier(n_estimators=30))
grid_finder = GridOptimalSearchCV(estimator, generator, scorer, parallel_profile='threads-4')
grid_finder.fit(data[::2], labels[::2])
In [ ]:
grid_finder.params_generator.print_results()
In many applications we need to optimize a binary classification metric (F1, BER, misclassification error). In that case, after every training of the classifier we have to find the optimal threshold on the predicted probabilities (the default threshold of 0.5 is usually a poor choice).
In this example we optimize the AMS metric with the threshold chosen automatically, drive the search with a regression-based parameter optimizer (Gaussian processes), and also compare different feature subsets.
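As a rough illustration of what OptimalMetric automates (a minimal sketch, not the REP API): given predicted probabilities, scan a grid of thresholds and keep the one that maximizes the chosen binary metric, here sklearn's f1_score.
In [ ]:
from sklearn.metrics import f1_score

def best_threshold(labels, probabilities, metric=f1_score, n_cuts=100):
    # scan candidate cuts on the predicted probability and keep the best one
    cuts = numpy.linspace(0, 1, n_cuts)
    scores = [metric(labels, (probabilities > cut).astype(int)) for cut in cuts]
    best = numpy.argmax(scores)
    return cuts[best], scores[best]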
In [ ]:
from rep.metaml import RegressionParameterOptimizer
from sklearn.gaussian_process import GaussianProcess
from rep.report.metrics import OptimalMetric, ams
In [ ]:
# OptimalMetric is a wrapper that checks all possible thresholds and reports the best value of the metric
# the expected numbers of signal and background events are taken as arbitrary numbers here
optimal_ams = OptimalMetric(ams, expected_s=100, expected_b=1000)
# define grid parameters
grid_param = {'eta': [0.4, 0.2, 0.1, 0.05, 0.02, 0.01],
              'max_depth': [1, 2, 3, 4, 5, 6],
              'subsample': [0.1, 0.2, 0.4, 0.6, 0.8],
              # one more feature: you can pass different sets of features to be compared
              'features': [columns[:2], columns[:3], columns[:4]],
              }
# using GaussianProcesses
generator = RegressionParameterOptimizer(grid_param, n_evaluations=20, regressor=GaussianProcess(), n_attempts=10)
# define folding scorer
scorer = FoldingScorer(optimal_ams, folds=2, fold_checks=2)
grid_finder = GridOptimalSearchCV(XGBoostClassifier(), generator, scorer, parallel_profile='threads-3')
grid_finder.fit(data, labels)
In [ ]:
grid_finder.generator.print_results()
In [ ]:
plot(grid_finder.generator.grid_scores_.values())
REP supports the sklearn way of combining estimators and getting/setting their parameters, so you can tune complex models using the same approach.
Let's optimize a BaggingRegressor built on top of a TMVA regressor.
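The sklearn convention that makes this possible is the double-underscore prefix: parameters of the wrapped estimator are addressed as base_estimator__&lt;name&gt;, which is exactly what the grid below relies on. A minimal sketch with plain sklearn estimators (DecisionTreeRegressor is used here only for illustration):
In [ ]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

bagging = BaggingRegressor(DecisionTreeRegressor(), n_estimators=10)
# parameters of the wrapped estimator show up with the base_estimator__ prefix
print(sorted(bagging.get_params().keys()))
# and can be set exactly like the outer ones
bagging.set_params(n_estimators=5, base_estimator__max_depth=3)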
In [ ]:
from sklearn.ensemble import BaggingRegressor
from rep.estimators import TMVARegressor
In [ ]:
from rep.utils import train_test_split
# splitting into train and test
train_data, test_data, train_labels, test_labels = train_test_split(data, labels)
In [ ]:
from sklearn.metrics import mean_absolute_error
from sklearn.base import clone
class MyMAEScorer(object):
    def __init__(self, test_data, test_labels):
        self.test_data = test_data
        self.test_labels = test_labels

    def __call__(self, base_estimator, params, X, y, sample_weight=None):
        cl = clone(base_estimator)
        cl.set_params(**params)
        cl.fit(X, y)
        # returning with minus, because we maximize the metric
        return -mean_absolute_error(self.test_labels, cl.predict(self.test_data))
In [ ]:
import root_numpy
In [ ]:
# define grid parameters
grid_param = {
    # parameters of the sklearn bagging regressor
    'n_estimators': [1, 3, 5, 7],
    'max_samples': [0.2, 0.4, 0.6, 0.8],
    # parameters of the base estimator (TMVA)
    'base_estimator__NTrees': [10, 20, 40],
    'base_estimator__Shrinkage': [0.1, 0.2, 0.4, 0.6, 0.8]
}
# using Gaussian Processes
generator = RegressionParameterOptimizer(grid_param, n_evaluations=9, regressor=GaussianProcess(), n_attempts=10)
estimator = BaggingRegressor(TMVARegressor(BoostType='Grad', NTrees=10), n_estimators=10)
scorer = MyMAEScorer(test_data, test_labels)
grid_finder = GridOptimalSearchCV(estimator, generator, scorer, parallel_profile='threads-3')
grid_finder.fit(train_data, train_labels)
In [ ]:
grid_finder.generator.print_results()
Grid search in REP extends the sklearn grid search and uses optimization techniques to avoid an exhaustive scan of the estimator parameters.
REP ships with predefined scorers, metric functions and optimization techniques. Each component is replaceable, so you can optimize complex models and pipelines (folding, bagging, boosting and so on).
The ParameterOptimizer is responsible for generating new sets of parameters to be checked.
The Scorer is responsible for training the estimator and evaluating the metric.
GridOptimalSearchCV makes all of this work together and dispatches tasks to an IPython cluster or to separate threads.
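For reference, a hedged sketch of the two execution modes mentioned above; the thread count and the profile name 'my_cluster' are placeholders, and the cluster variant assumes that parallel_profile accepts the name of an already running IPython cluster profile.
In [ ]:
# run the search in 4 local threads
grid_finder = GridOptimalSearchCV(estimator, generator, scorer, parallel_profile='threads-4')
# or send tasks to an IPython cluster (assumption: pass its profile name)
grid_finder = GridOptimalSearchCV(estimator, generator, scorer, parallel_profile='my_cluster')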