LR/RF Hyperparameter Tuning

For LR, we want to tune regularisation $C$, which we can do efficiently using LogisticRegressionCV. This only supports $L_2$, so if that turns out poorly, we'll just use L1 with a high regularisation.

For RF, we want to tune

  • like three things

In [40]:
import h5py, numpy, sklearn.linear_model, sklearn.model_selection, crowdastro.crowd.util, sklearn.metrics
import matplotlib.pyplot as plt, sklearn.ensemble, time
%matplotlib inline

In [41]:
with h5py.File('/Users/alger/data/Crowdastro/crowdastro-swire.h5', 'r') as f:
    swire_coords = f['/swire/cdfs/numeric'][:, :2]
data = h5py.File('/Users/alger/data/Crowdastro/all_training_data_01_05_17.h5', 'r')
list(data)


Out[41]:
['features', 'names', 'norris_labels', 'rgz_labels', 'sets']

Tuning LR


In [24]:
def balanced_score(lr, X_test, y_test):
    return crowdastro.crowd.util.balanced_accuracy(y_test, lr.predict(X_test))

# Making some splits for scikit-learn's model selection API.
# It only supports validation/training, not validation/training/testing, so we need to strip out testing.
# We need to make sure we don't peek at different label sets later on, so we should use a different C for
# each set (unless they all turn out the same). If they're all pretty close, we'll just use the C from RGZ.
for subset in data['sets']:
    print(subset)
    scores = []
    cs = []
    for train_set in data['sets'][subset]['train']:
        # Split the train set in three at (52.8, -28.1).
        middle = (52.8, -28.1)
        q1 = (swire_coords[train_set][:, 0] > middle[0]) & (swire_coords[train_set][:, 1] > middle[1])
        q2 = (swire_coords[train_set][:, 0] < middle[0]) & (swire_coords[train_set][:, 1] > middle[1])
        q3 = (swire_coords[train_set][:, 0] < middle[0]) & (swire_coords[train_set][:, 1] < middle[1])
        q4 = (swire_coords[train_set][:, 0] > middle[0]) & (swire_coords[train_set][:, 1] < middle[1])
        
        if q1.sum() < 200:
            qs = [q2, q3, q4]
        elif q2.sum() < 200:
            qs = [q1, q3, q4]
        elif q3.sum() < 200:
            qs = [q1, q2, q4]
        elif q4.sum() < 200:
            qs = [q1, q2, q3]
        else:
            raise ValueError('Invalid training data (too many samples per quadrant)')

        split = numpy.zeros((train_set.sum(),))
        split[qs[0]] = 0
        split[qs[1]] = 1
        split[qs[2]] = 2
        split = sklearn.model_selection.PredefinedSplit(split)
        lrcv = sklearn.linear_model.LogisticRegressionCV(Cs=numpy.logspace(-5, 5, 10),
                                                         cv=split, penalty='l2', solver='lbfgs',
                                                         class_weight='balanced',
                                                         scoring=lambda lr, x, y: sklearn.metrics.f1_score(
                                                             y, lr.predict(x)),
                                                         verbose=1)
        lrcv.fit(data['features'][train_set, :], data['rgz_labels'][train_set])
        print('\t', lrcv.C_)
        cs = lrcv.Cs_
        scores.append(lrcv.scores_[True].T)
    scores = numpy.concatenate(scores, axis=1)
    plt.plot(cs, scores)
    plt.xscale('log')
    plt.show()


RGZ
/usr/local/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:717: UserWarning: lbfgs failed to converge. Increase the number of iterations.
  warnings.warn("lbfgs failed to converge. Increase the number "
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.0min finished
	 [ 100000.]
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   53.6s finished
	 [ 100000.]
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   57.9s finished
	 [ 100000.]
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   56.4s finished
	 [ 7742.63682681]
RGZ & Norris
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   17.8s finished
	 [ 0.00012915]
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   17.7s finished
	 [ 0.00012915]
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   19.1s finished
	 [ 0.00012915]
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   17.8s finished
	 [ 0.00012915]
RGZ & Norris & compact
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   13.2s finished
	 [ 3.59381366]
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   13.6s finished
	 [ 0.27825594]
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   11.5s finished
	 [ 3.59381366]
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   10.9s finished
	 [ 3.59381366]
RGZ & Norris & resolved
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    7.1s finished
	 [ 0.0016681]
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    7.6s finished
	 [ 0.0016681]
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    7.0s finished
	 [ 0.00012915]
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    7.0s finished
	 [ 0.00012915]
RGZ & compact
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   51.1s finished
	 [ 3.59381366]
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   47.5s finished
	 [ 3.59381366]
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   44.1s finished
	 [ 3.59381366]
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   44.2s finished
	 [ 3.59381366]
RGZ & resolved
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   15.1s finished
	 [ 0.00012915]
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   12.2s finished
	 [ 0.00012915]
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   10.5s finished
	 [ 0.00012915]
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   10.3s finished
	 [ 0.00012915]

In [ ]:
# L1
for subset in data['sets']:
    print(subset)
    scores = []
    cs = []
    for train_set in data['sets'][subset]['train']:
        # Split the train set in three at (52.8, -28.1).
        middle = (52.8, -28.1)
        q1 = (swire_coords[train_set][:, 0] > middle[0]) & (swire_coords[train_set][:, 1] > middle[1])
        q2 = (swire_coords[train_set][:, 0] < middle[0]) & (swire_coords[train_set][:, 1] > middle[1])
        q3 = (swire_coords[train_set][:, 0] < middle[0]) & (swire_coords[train_set][:, 1] < middle[1])
        q4 = (swire_coords[train_set][:, 0] > middle[0]) & (swire_coords[train_set][:, 1] < middle[1])
        
        if q1.sum() < 200:
            qs = [q2, q3, q4]
        elif q2.sum() < 200:
            qs = [q1, q3, q4]
        elif q3.sum() < 200:
            qs = [q1, q2, q4]
        elif q4.sum() < 200:
            qs = [q1, q2, q3]
        else:
            raise ValueError('Invalid training data (too many samples per quadrant)')

        split = numpy.zeros((train_set.sum(),))
        split[qs[0]] = 0
        split[qs[1]] = 1
        split[qs[2]] = 2
        split = sklearn.model_selection.PredefinedSplit(split)
        lrcv = sklearn.linear_model.LogisticRegressionCV(Cs=numpy.logspace(-5, 5, 5),
                                                         cv=split, penalty='l1', solver='liblinear',
                                                         class_weight='balanced',
                                                         scoring=lambda lr, x, y: sklearn.metrics.f1_score(
                                                             y, lr.predict(x)),
                                                         verbose=1)
        lrcv.fit(data['features'][train_set, :], data['rgz_labels'][train_set])
        print('\t', lrcv.C_)
        cs = lrcv.Cs_
        scores.append(lrcv.scores_[True].T)
    scores = numpy.concatenate(scores, axis=1)
    plt.plot(cs, scores)
    plt.xscale('log')
    plt.show()


RGZ
[LibLinear][LibLinear][LibLinear]

Tuning RF


In [51]:
param_grid = {"min_weight_fraction_leaf": numpy.linspace(0, 0.07, 10),
              "max_depth": numpy.linspace(1, 30, 10)}

for subset in data['sets']:
    print(subset)
    for train_set in data['sets'][subset]['train']:
        now = time.time()
        # Split the train set in three at (52.8, -28.1).
        middle = (52.8, -28.1)
        q1 = (swire_coords[train_set][:, 0] > middle[0]) & (swire_coords[train_set][:, 1] > middle[1])
        q2 = (swire_coords[train_set][:, 0] < middle[0]) & (swire_coords[train_set][:, 1] > middle[1])
        q3 = (swire_coords[train_set][:, 0] < middle[0]) & (swire_coords[train_set][:, 1] < middle[1])
        q4 = (swire_coords[train_set][:, 0] > middle[0]) & (swire_coords[train_set][:, 1] < middle[1])
        
        if q1.sum() < 200:
            qs = [q2, q3, q4]
        elif q2.sum() < 200:
            qs = [q1, q3, q4]
        elif q3.sum() < 200:
            qs = [q1, q2, q4]
        elif q4.sum() < 200:
            qs = [q1, q2, q3]
        else:
            raise ValueError('Invalid training data (too many samples per quadrant)')

        split = numpy.zeros((train_set.sum(),))
        split[qs[0]] = 0
        split[qs[1]] = 1
        split[qs[2]] = 2
        split = sklearn.model_selection.PredefinedSplit(split)
        rf = sklearn.ensemble.RandomForestClassifier(class_weight='balanced', criterion='entropy')
        
        gs = sklearn.model_selection.GridSearchCV(rf, param_grid,
            scoring=lambda lr, x, y: crowdastro.crowd.util.balanced_accuracy(y, lr.predict(x)),
            cv=split)

        gs.fit(data['features'][train_set, :], data['rgz_labels'][train_set])
#         print('\t', gs.grid_scores_)
        print('', gs.best_params_)
        print('', gs.best_score_)
        print('\tTook', time.time() - now)


RGZ
 {'max_depth': 7.4444444444444446, 'min_weight_fraction_leaf': 0.0}
 0.929845929639
	Took 964.6908819675446
 {'max_depth': 7.4444444444444446, 'min_weight_fraction_leaf': 0.0}
 0.9230359061
	Took 3088.115518093109
 {'max_depth': 7.4444444444444446, 'min_weight_fraction_leaf': 0.0077777777777777784}
 0.926406428576
	Took 959.8773319721222
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-51-9db1b234d9a5> in <module>()
     35             cv=split)
     36 
---> 37         gs.fit(data['features'][train_set, :], data['rgz_labels'][train_set])
     38 #         print('\t', gs.grid_scores_)
     39         print('', gs.best_params_)

/usr/local/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups)
    943             train/test set.
    944         """
--> 945         return self._fit(X, y, groups, ParameterGrid(self.param_grid))
    946 
    947 

/usr/local/lib/python3.6/site-packages/sklearn/model_selection/_search.py in _fit(self, X, y, groups, parameter_iterable)
    562                                   return_times=True, return_parameters=True,
    563                                   error_score=self.error_score)
--> 564           for parameters in parameter_iterable
    565           for train, test in cv_iter)
    566 

/usr/local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    756             # was dispatched. In particular this covers the edge
    757             # case of Parallel used with an exhausted iterator.
--> 758             while self.dispatch_one_batch(iterator):
    759                 self._iterating = True
    760             else:

/usr/local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
    606                 return False
    607             else:
--> 608                 self._dispatch(tasks)
    609                 return True
    610 

/usr/local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
    569         dispatch_timestamp = time.time()
    570         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571         job = self._backend.apply_async(batch, callback=cb)
    572         self._jobs.append(job)
    573 

/usr/local/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
    107     def apply_async(self, func, callback=None):
    108         """Schedule a func to be run"""
--> 109         result = ImmediateResult(func)
    110         if callback:
    111             callback(result)

/usr/local/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
    324         # Don't delay the application, to avoid keeping the input
    325         # arguments in memory
--> 326         self.results = batch()
    327 
    328     def get(self):

/usr/local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

/usr/local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

/usr/local/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
    236             estimator.fit(X_train, **fit_params)
    237         else:
--> 238             estimator.fit(X_train, y_train, **fit_params)
    239 
    240     except Exception as e:

/usr/local/lib/python3.6/site-packages/sklearn/ensemble/forest.py in fit(self, X, y, sample_weight)
    324                     t, self, X, y, sample_weight, i, len(trees),
    325                     verbose=self.verbose, class_weight=self.class_weight)
--> 326                 for i, t in enumerate(trees))
    327 
    328             # Collect newly grown trees

/usr/local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    756             # was dispatched. In particular this covers the edge
    757             # case of Parallel used with an exhausted iterator.
--> 758             while self.dispatch_one_batch(iterator):
    759                 self._iterating = True
    760             else:

/usr/local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
    606                 return False
    607             else:
--> 608                 self._dispatch(tasks)
    609                 return True
    610 

/usr/local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
    569         dispatch_timestamp = time.time()
    570         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571         job = self._backend.apply_async(batch, callback=cb)
    572         self._jobs.append(job)
    573 

/usr/local/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
    107     def apply_async(self, func, callback=None):
    108         """Schedule a func to be run"""
--> 109         result = ImmediateResult(func)
    110         if callback:
    111             callback(result)

/usr/local/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
    324         # Don't delay the application, to avoid keeping the input
    325         # arguments in memory
--> 326         self.results = batch()
    327 
    328     def get(self):

/usr/local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

/usr/local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

/usr/local/lib/python3.6/site-packages/sklearn/ensemble/forest.py in _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, verbose, class_weight)
    118             curr_sample_weight *= compute_sample_weight('balanced', y, indices)
    119 
--> 120         tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
    121     else:
    122         tree.fit(X, y, sample_weight=sample_weight, check_input=False)

/usr/local/lib/python3.6/site-packages/sklearn/tree/tree.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
    737             sample_weight=sample_weight,
    738             check_input=check_input,
--> 739             X_idx_sorted=X_idx_sorted)
    740         return self
    741 

/usr/local/lib/python3.6/site-packages/sklearn/tree/tree.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
    348                                            self.min_impurity_split)
    349 
--> 350         builder.build(self.tree_, X, y, sample_weight, X_idx_sorted)
    351 
    352         if self.n_outputs_ == 1:

KeyboardInterrupt: 

In [ ]: