In [ ]:
from sklearn.datasets import load_digits

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; its helpers now live in sklearn.model_selection. Fall back to the old
# location only on installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Load the digits dataset and hold out a test split that is only used for the
# final evaluation of the models built below.
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data,
                                                    digits.target)
For cross-validated pipelines that include scaling, we need to estimate the mean and standard deviation separately for each fold. To do that, we build a pipeline.
In [ ]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
In [ ]:
# Manual approach, shown for comparison with the pipeline: learn the scaling
# statistics on the training data, apply them to that same data, and train
# the classifier on the scaled features.
standard_scaler = StandardScaler().fit(X_train)
X_train_scaled = standard_scaler.transform(X_train)
svm = SVC().fit(X=X_train_scaled, y=y_train)
In [ ]:
# Explicit construction, where the step names are chosen by hand:
# pipeline = Pipeline([("scaler", StandardScaler()),
#                      ("svm", SVC())])
# Short version: make_pipeline derives the step names from the lowercased
# class names ("standardscaler", "svc"). The grid-search keys used later
# ("svc__C", "svc__gamma") rely on these automatic names, so they would have
# to change if the explicit form above (step name "svm") were used instead.
pipeline = make_pipeline(StandardScaler(), SVC())
In [ ]:
# Fit the scaler and the SVC in one call, directly on the unscaled training data.
pipeline.fit(X_train, y_train)
In [ ]:
# Evaluate the fitted pipeline on the held-out test split; the raw test data
# is passed in and the pipeline applies its own scaling step first.
pipeline.score(X_test, y_test)
In [ ]:
# Predict labels for the test samples using the fitted scaler + SVC chain.
pipeline.predict(X_test)
In [ ]:
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer the
# sklearn.model_selection location and fall back only on very old installs.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# Cross-validate the whole pipeline on the training data, so the scaler is
# re-fit inside every fold rather than once on all of the training data.
cross_val_score(pipeline, X_train, y_train)
In [ ]:
import numpy as np

# sklearn.grid_search was removed in scikit-learn 0.20; GridSearchCV now lives
# in sklearn.model_selection. Fall back only on very old installs.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# Parameters of a pipeline step are addressed as "<step name>__<parameter>";
# "svc" is the name make_pipeline gave to the SVC step. Each parameter is
# searched over the six powers of ten from 1e-3 to 1e2.
param_grid = {
    'svc__C': 10. ** np.arange(-3, 3),
    'svc__gamma': 10. ** np.arange(-3, 3),
}
grid_pipeline = GridSearchCV(pipeline, param_grid=param_grid)
In [ ]:
# Run the grid search: the whole pipeline (scaling + SVC) is cross-validated
# on the training data for every C/gamma combination in param_grid.
grid_pipeline.fit(X_train, y_train)
In [ ]:
# Evaluate the fitted grid search on the held-out test split.
grid_pipeline.score(X_test, y_test)
In [ ]:
# %load solutions/pipeline_knn.py