In [1]:
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.grid_search import GridSearchCV
In [2]:
# read in the iris data
iris = load_iris()
# create X (features) and y (response)
X = iris.data
y = iris.target
In [3]:
# define the parameter values that should be searched
k_range = range(1, 31)
print k_range
In [4]:
# create a parameter grid: map the parameter names to the values that should be searched
# 下面是构建parameter grid,其结构是key为参数名称,value是待搜索的数值列表的一个字典结构
param_grid = dict(n_neighbors=k_range)
print param_grid
In [5]:
knn = KNeighborsClassifier(n_neighbors=5)
# instantiate the grid
# 这里GridSearchCV的参数形式和cross_val_score的形式差不多,其中param_grid是parameter grid所对应的参数
# GridSearchCV中的n_jobs设置为-1时,可以实现并行计算(如果你的电脑支持的情况下)
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
我们可以知道,这里的grid search针对每个参数进行了10次交叉验证,并且一共对30个参数进行相同过程的交叉验证
In [6]:
grid.fit(X, y)
Out[6]:
In [7]:
# view the complete results (list of named tuples)
grid.grid_scores_
Out[7]:
In [8]:
# examine the first tuple
print grid.grid_scores_[0].parameters
print grid.grid_scores_[0].cv_validation_scores
print grid.grid_scores_[0].mean_validation_score
In [9]:
# create a list of the mean scores only
grid_mean_scores = [result.mean_validation_score for result in grid.grid_scores_]
print grid_mean_scores
In [10]:
# plot the results
plt.plot(k_range, grid_mean_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
Out[10]:
In [11]:
# examine the best model
print grid.best_score_
print grid.best_params_
print grid.best_estimator_
In [12]:
# define the parameter values that should be searched
k_range = range(1, 31)
weight_options = ['uniform', 'distance']
In [13]:
# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(n_neighbors=k_range, weights=weight_options)
print param_grid
In [14]:
# instantiate and fit the grid
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
grid.fit(X, y)
Out[14]:
In [15]:
# view the complete results
grid.grid_scores_
Out[15]:
In [16]:
# examine the best model
print grid.best_score_
print grid.best_params_
In [17]:
# train your model using all data and the best known parameters
knn = KNeighborsClassifier(n_neighbors=13, weights='uniform')
knn.fit(X, y)
# make a prediction on out-of-sample data
knn.predict([3, 5, 4, 2])
Out[17]:
这里使用之前得到的最佳参数对模型进行重新训练,在训练时,就可以将所有的数据都作为训练数据全部投入到模型中去,这样就不会浪费个别数据了。
In [18]:
# shortcut: GridSearchCV automatically refits the best model using all of the data
grid.predict([3, 5, 4, 2])
Out[18]:
In [19]:
from sklearn.grid_search import RandomizedSearchCV
In [20]:
# specify "parameter distributions" rather than a "parameter grid"
param_dist = dict(n_neighbors=k_range, weights=weight_options)
In [21]:
# n_iter controls the number of searches
rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10, random_state=5)
rand.fit(X, y)
rand.grid_scores_
Out[21]:
In [22]:
# examine the best model
print rand.best_score_
print rand.best_params_
In [23]:
# run RandomizedSearchCV 20 times (with n_iter=10) and record the best score
best_scores = []
for _ in range(20):
rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10)
rand.fit(X, y)
best_scores.append(round(rand.best_score_, 3))
print best_scores
当你的调节参数是连续的,比如回归问题的正则化参数,有必要指定一个连续分布而不是可能值的列表,这样RandomizeSearchCV就可以执行更好的grid search。