Title: Cross-Validation
Slug: cross-validaton
Summary: How to cross-validate models for machine learning in Python.
Date: 2017-09-15 12:00
Category: Machine Learning
Tags: Model Evaluation
Authors: Chris Albon
In [1]:
# Load libraries
import numpy as np
from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
In [2]:
# Load the digits dataset that ships with scikit-learn
digits = datasets.load_digits()
# Pull out the feature matrix (X) and the target vector (y) in one step
X, y = digits.data, digits.target
In [3]:
# Create the two pipeline stages: a feature standardizer and a
# logistic regression classifier
standardizer, logit = StandardScaler(), LogisticRegression()
# Chain them so the data is standardized first and then passed to the
# classifier. Keeping the scaler inside the pipeline means that, during
# cross-validation, it is re-fit on each training fold only.
pipeline = make_pipeline(standardizer, logit)
In [4]:
# Set up 10-fold cross-validation; rows are shuffled before splitting,
# with a fixed seed so the folds are the same on every run
kf = KFold(
    n_splits=10,
    shuffle=True,
    random_state=1,
)
In [5]:
# Run k-fold cross-validation: one accuracy score is produced per fold
cv_results = cross_val_score(
    estimator=pipeline,  # Standardizer + logistic regression
    X=X,                 # Feature matrix
    y=y,                 # Target vector
    cv=kf,               # Cross-validation splitter
    scoring="accuracy",  # Evaluation metric
    n_jobs=-1,           # Use all available CPU cores
)
In [6]:
# Average the per-fold scores into a single performance estimate
np.mean(cv_results)
Out[6]: