Sklearn


In [1]:
from sklearn import cross_validation, datasets, grid_search, linear_model, metrics

import numpy as np
import pandas as pd


/home/andrey/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
/home/andrey/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)
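
The warnings above note that the cross_validation and grid_search modules were merged into model_selection in scikit-learn 0.18 and removed in 0.20. A minimal sketch of the equivalent imports for newer versions (not used in the cells below, which keep the legacy modules):

# Equivalent imports for scikit-learn >= 0.18 (legacy modules removed in 0.20)
from sklearn import datasets, linear_model, metrics
from sklearn.model_selection import (train_test_split, StratifiedShuffleSplit,
                                     GridSearchCV, RandomizedSearchCV)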

Loading and splitting the dataset


In [2]:
iris = datasets.load_iris()

In [3]:
train_data, test_data, train_labels, test_labels = cross_validation.train_test_split(iris.data, iris.target, 
                                                                                     test_size = 0.3, random_state = 0)

Defining the model


In [4]:
classifier = linear_model.SGDClassifier(random_state = 0)

Generating the parameter grid


In [5]:
classifier.get_params().keys()


Out[5]:
['warm_start',
 'loss',
 'n_jobs',
 'eta0',
 'verbose',
 'shuffle',
 'fit_intercept',
 'epsilon',
 'average',
 'n_iter',
 'penalty',
 'power_t',
 'random_state',
 'l1_ratio',
 'alpha',
 'learning_rate',
 'class_weight']

In [6]:
parameters_grid = {
    'loss' : ['hinge', 'log', 'squared_hinge', 'squared_loss'],
    'penalty' : ['l1', 'l2'],
    'n_iter' : range(5,10),
    'alpha' : np.linspace(0.0001, 0.001, num = 5),
}
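
This grid contains 4 * 2 * 5 * 5 = 200 parameter combinations, so with the 10 cross-validation splits defined below the exhaustive search will train the model 2000 times (plus one final refit).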

In [7]:
cv = cross_validation.StratifiedShuffleSplit(train_labels, n_iter = 10, test_size = 0.2, random_state = 0)
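
In the legacy API the splitter is bound to the labels at construction time. For reference, a sketch of the same splitter with the model_selection API (assuming scikit-learn >= 0.18), where the splitter is built without labels and passed directly as cv:

# model_selection version: no labels at construction, n_iter is renamed to n_splits
from sklearn.model_selection import StratifiedShuffleSplit
cv_new = StratifiedShuffleSplit(n_splits = 10, test_size = 0.2, random_state = 0)
# cv_new can be passed as cv= to GridSearchCV / RandomizedSearchCV below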

Parameter tuning and quality evaluation


In [20]:
grid_cv = grid_search.GridSearchCV(classifier, parameters_grid, scoring = 'accuracy', cv = cv)
grid_cv.get_params().keys()


Out[20]:
['estimator__epsilon',
 'n_jobs',
 'verbose',
 'estimator__l1_ratio',
 'estimator__alpha',
 'estimator__penalty',
 'param_grid',
 'estimator__shuffle',
 'scoring',
 'cv',
 'estimator',
 'estimator__verbose',
 'estimator__n_jobs',
 'estimator__loss',
 'fit_params',
 'estimator__warm_start',
 'refit',
 'iid',
 'estimator__n_iter',
 'estimator__learning_rate',
 'pre_dispatch',
 'estimator__power_t',
 'estimator__average',
 'estimator__fit_intercept',
 'estimator__class_weight',
 'estimator__random_state',
 'estimator__eta0',
 'error_score']

In [9]:
%%time
grid_cv.fit(train_data, train_labels)


CPU times: user 29.4 s, sys: 884 ms, total: 30.3 s
Wall time: 31.2 s
Out[9]:
GridSearchCV(cv=StratifiedShuffleSplit(labels=[1 2 ..., 2 0], n_iter=10, test_size=0.2, random_state=0),
       error_score='raise',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=0, shuffle=True, verbose=0,
       warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'loss': ['hinge', 'log', 'squared_hinge', 'squared_loss'], 'alpha': array([ 0.0001 ,  0.00032,  0.00055,  0.00078,  0.001  ]), 'n_iter': [5, 6, 7, 8, 9]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [10]:
grid_cv.best_estimator_


Out[10]:
SGDClassifier(alpha=0.00032499999999999999, average=False, class_weight=None,
       epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=9, n_jobs=1,
       penalty='l1', power_t=0.5, random_state=0, shuffle=True, verbose=0,
       warm_start=False)

In [11]:
print grid_cv.best_score_
print grid_cv.best_params_


0.895238095238
{'penalty': 'l1', 'alpha': 0.00032499999999999999, 'n_iter': 9, 'loss': 'hinge'}

In [12]:
grid_cv.grid_scores_[:10]


Out[12]:
[mean: 0.75714, std: 0.13544, params: {'penalty': 'l1', 'alpha': 0.0001, 'n_iter': 5, 'loss': 'hinge'},
 mean: 0.66667, std: 0.15936, params: {'penalty': 'l2', 'alpha': 0.0001, 'n_iter': 5, 'loss': 'hinge'},
 mean: 0.61429, std: 0.14357, params: {'penalty': 'l1', 'alpha': 0.0001, 'n_iter': 6, 'loss': 'hinge'},
 mean: 0.68571, std: 0.15386, params: {'penalty': 'l2', 'alpha': 0.0001, 'n_iter': 6, 'loss': 'hinge'},
 mean: 0.79524, std: 0.16503, params: {'penalty': 'l1', 'alpha': 0.0001, 'n_iter': 7, 'loss': 'hinge'},
 mean: 0.70952, std: 0.18982, params: {'penalty': 'l2', 'alpha': 0.0001, 'n_iter': 7, 'loss': 'hinge'},
 mean: 0.69524, std: 0.19772, params: {'penalty': 'l1', 'alpha': 0.0001, 'n_iter': 8, 'loss': 'hinge'},
 mean: 0.63810, std: 0.17587, params: {'penalty': 'l2', 'alpha': 0.0001, 'n_iter': 8, 'loss': 'hinge'},
 mean: 0.84286, std: 0.12608, params: {'penalty': 'l1', 'alpha': 0.0001, 'n_iter': 9, 'loss': 'hinge'},
 mean: 0.78095, std: 0.10690, params: {'penalty': 'l2', 'alpha': 0.0001, 'n_iter': 9, 'loss': 'hinge'}]
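
In newer scikit-learn the grid_scores_ attribute is replaced by cv_results_, which is convenient to inspect as a DataFrame. A sketch, assuming the model_selection GridSearchCV is used:

# cv_results_ is a dict of arrays; pandas makes it easy to sort and filter
results = pd.DataFrame(grid_cv.cv_results_)
results.sort_values('mean_test_score', ascending = False).head(10)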

In [13]:
randomized_grid_cv = grid_search.RandomizedSearchCV(classifier, parameters_grid, scoring = 'accuracy', cv = cv, n_iter = 20, 
                                                   random_state = 0)

In [14]:
%%time
randomized_grid_cv.fit(train_data, train_labels)


CPU times: user 2.94 s, sys: 100 ms, total: 3.04 s
Wall time: 3.08 s
Out[14]:
RandomizedSearchCV(cv=StratifiedShuffleSplit(labels=[1 2 ..., 2 0], n_iter=10, test_size=0.2, random_state=0),
          error_score='raise',
          estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=0, shuffle=True, verbose=0,
       warm_start=False),
          fit_params={}, iid=True, n_iter=20, n_jobs=1,
          param_distributions={'penalty': ['l1', 'l2'], 'loss': ['hinge', 'log', 'squared_hinge', 'squared_loss'], 'alpha': array([ 0.0001 ,  0.00032,  0.00055,  0.00078,  0.001  ]), 'n_iter': [5, 6, 7, 8, 9]},
          pre_dispatch='2*n_jobs', random_state=0, refit=True,
          scoring='accuracy', verbose=0)

In [15]:
print randomized_grid_cv.best_score_
print randomized_grid_cv.best_params_


0.814285714286
{'penalty': 'l1', 'n_iter': 9, 'alpha': 0.00055000000000000003, 'loss': 'log'}
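
Randomized search tried only 20 of the 200 grid points, which explains the roughly tenfold speedup (about 3 s versus 31 s) at the cost of a slightly lower best cross-validation score (0.814 vs 0.895).

The best scores above are cross-validation estimates on the training data; the held-out test split and the imported metrics module have not been used yet. A minimal sketch of the final check, reusing the variable names from the cells above:

# refit=True means grid_cv.best_estimator_ is already retrained on all of train_data
test_predictions = grid_cv.best_estimator_.predict(test_data)
print metrics.accuracy_score(test_labels, test_predictions)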