In [1]:
# http://scikit-learn.org/stable/modules/feature_selection.html
# http://scikit-learn.org/dev/auto_examples/randomized_search.html#example-randomized-search-py

In [3]:
print(__doc__)

import numpy as np

from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
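# NOTE: sklearn.grid_search is the pre-0.18 module; in scikit-learn >= 0.18 these
# classes live in sklearn.model_selection and grid_scores_ becomes cv_results_
# (a sketch against the newer API follows the output below)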
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

# get some data: the handwritten digits dataset
digits = load_digits()
X, y = digits.data, digits.target

# build a classifier
clf = RandomForestClassifier(n_estimators=50)


# Utility function to report best scores
def report(grid_scores, n_top=5):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")


# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)

# use a full grid over all parameters
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(X, y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.grid_scores_)))
report(grid_search.grid_scores_)


Automatically created module for IPython interactive environment
RandomizedSearchCV took 12.76 seconds for 20 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.974 (std: 0.002)
Parameters: {'bootstrap': False, 'min_samples_leaf': 3, 'min_samples_split': 3, 'criterion': 'gini', 'max_features': 3, 'max_depth': None}

Model with rank: 2
Mean validation score: 0.973 (std: 0.002)
Parameters: {'bootstrap': False, 'min_samples_leaf': 1, 'min_samples_split': 5, 'criterion': 'gini', 'max_features': 2, 'max_depth': None}

Model with rank: 3
Mean validation score: 0.970 (std: 0.005)
Parameters: {'bootstrap': True, 'min_samples_leaf': 2, 'min_samples_split': 1, 'criterion': 'entropy', 'max_features': 8, 'max_depth': None}

Model with rank: 4
Mean validation score: 0.967 (std: 0.004)
Parameters: {'bootstrap': False, 'min_samples_leaf': 5, 'min_samples_split': 8, 'criterion': 'entropy', 'max_features': 3, 'max_depth': None}

Model with rank: 5
Mean validation score: 0.962 (std: 0.005)
Parameters: {'bootstrap': False, 'min_samples_leaf': 8, 'min_samples_split': 5, 'criterion': 'gini', 'max_features': 5, 'max_depth': None}

GridSearchCV took 105.05 seconds for 216 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.977 (std: 0.006)
Parameters: {'bootstrap': False, 'min_samples_leaf': 1, 'min_samples_split': 1, 'criterion': 'entropy', 'max_features': 3, 'max_depth': None}

Model with rank: 2
Mean validation score: 0.976 (std: 0.003)
Parameters: {'bootstrap': True, 'min_samples_leaf': 1, 'min_samples_split': 1, 'criterion': 'entropy', 'max_features': 10, 'max_depth': None}

Model with rank: 3
Mean validation score: 0.976 (std: 0.005)
Parameters: {'bootstrap': False, 'min_samples_leaf': 1, 'min_samples_split': 1, 'criterion': 'gini', 'max_features': 10, 'max_depth': None}

Model with rank: 4
Mean validation score: 0.976 (std: 0.005)
Parameters: {'bootstrap': False, 'min_samples_leaf': 1, 'min_samples_split': 3, 'criterion': 'entropy', 'max_features': 10, 'max_depth': None}

Model with rank: 5
Mean validation score: 0.976 (std: 0.008)
Parameters: {'bootstrap': False, 'min_samples_leaf': 3, 'min_samples_split': 3, 'criterion': 'entropy', 'max_features': 10, 'max_depth': None}


In [ ]:
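
The cells above use the pre-0.18 sklearn.grid_search module and its grid_scores_
attribute, both removed in later scikit-learn releases. Below is a minimal sketch of
the same comparison against the newer API, assuming scikit-learn >= 0.18: the search
classes move to sklearn.model_selection, results are read from cv_results_, and
min_samples_split must be >= 2. Not run here, so no timings are shown.

In [ ]:
import numpy as np

from time import time
from scipy.stats import randint as sp_randint

from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# same data and estimator as above
digits = load_digits()
X, y = digits.data, digits.target
clf = RandomForestClassifier(n_estimators=50)


def report(results, n_top=5):
    # cv_results_ is a dict of parallel arrays; rank_test_score orders the candidates
    for i in range(1, n_top + 1):
        for candidate in np.flatnonzero(results["rank_test_score"] == i):
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results["mean_test_score"][candidate],
                  results["std_test_score"][candidate]))
            print("Parameters: {0}".format(results["params"][candidate]))
            print("")


# distributions to sample from (min_samples_split now starts at 2)
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)
start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidate"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

# full grid over the same parameters
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(X, y)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_["params"])))
report(grid_search.cv_results_)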