In [3]:
import os
import sys
# Modify the path
sys.path.append("..")
import pandas as pd
import yellowbrick as yb
import matplotlib.pyplot as plt
In [4]:
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import StratifiedKFold
from yellowbrick.model_selection import CVScores
room = pd.read_csv('data/occupancy/occupancy.csv')
features = ["temperature", "relative humidity", "light", "C02", "humidity"]
# Extract the numpy arrays from the data frame
X = room[features].values
y = room.occupancy.values
# Create a cross-validation strategy
cv = StratifiedKFold(12)
# Create the cv score visualizer
oz = CVScores(RidgeClassifier(), cv=cv)
oz.fit(X, y)
oz.poof()
In [5]:
# yellowbrick.model_selection.cross_validation
# Implements cross-validation score plotting for model selection.
#
# Author: Prema Damodaran Roman
# Created: Wed June 6 2018 13:32:00 -0500
# Author: Rebecca Bilbro <bilbro@gmail.com>
# Updated: Fri Aug 10 13:15:43 2018 -0500
#
# ID: cross_validation.py [7f47800] pdamo24@gmail.com $
"""
Implements cross-validation score plotting for model selection.
"""
##########################################################################
## Imports
##########################################################################
import numpy as np
import matplotlib.ticker as ticker
from yellowbrick.base import ModelVisualizer
from sklearn.model_selection import cross_val_score
##########################################################################
## CVScores Visualizer
##########################################################################
class CVScores(ModelVisualizer):
"""
CVScores displays cross-validated scores as a bar chart, with the
average of the scores plotted as a horizontal line.
Parameters
----------
model : a scikit-learn estimator
An object that implements ``fit`` and ``predict``, can be a
classifier, regressor, or clusterer so long as there is also a valid
associated scoring metric.
Note that the object is cloned for each validation.
ax : matplotlib.Axes object, optional
The axes object to plot the figure on.
cv : int, cross-validation generator or an iterable, optional
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to use the default 3-fold cross-validation,
- integer, to specify the number of folds.
- An object to be used as a cross-validation generator.
- An iterable yielding train/test splits.
See the scikit-learn `cross-validation guide <https://goo.gl/FS3VU6>`_
for more information on the possible strategies that can be used here.
scoring : string, callable or None, optional, default: None
A string or scorer callable object / function with signature
``scorer(estimator, X, y)``.
See scikit-learn `cross-validation guide <https://goo.gl/FS3VU6>`_
for more information on the possible metrics that can be used.
kwargs : dict
Keyword arguments that are passed to the base class and may influence
the visualization as defined in other Visualizers.
Examples
--------
>>> from sklearn import datasets, svm
>>> iris = datasets.load_iris()
>>> clf = svm.SVC(kernel='linear', C=1)
>>> X = iris.data
>>> y = iris.target
>>> visualizer = CVScores(model=clf, cv=5, scoring='f1_macro')
>>> visualizer.fit(X,y)
>>> visualizer.poof()
Notes
-----
This visualizer is a wrapper for
`sklearn.model_selection.cross_val_score <https://goo.gl/4v7dfL>`_.
Refer to the scikit-learn
`cross-validation guide <https://goo.gl/FS3VU6>`_
for more details.
"""
def __init__(self, model, ax=None, cv=None, scoring=None, **kwargs):
super(CVScores, self).__init__(model, ax=ax, **kwargs)
self.cv = cv
self.scoring = scoring
def fit(self, X, y, **kwargs):
"""
Fits the learning curve with the wrapped model to the specified data.
Draws training and test score curves and saves the scores to the
estimator.
Parameters
----------
X : array-like, shape (n_samples, n_features)
Training vector, where n_samples is the number of samples and
n_features is the number of features.
y : array-like, shape (n_samples) or (n_samples, n_features), optional
Target relative to X for classification or regression;
None for unsupervised learning.
Returns
-------
self : instance
"""
self.cv_scores_ = cross_val_score(
self.estimator, X, y, cv=self.cv, scoring=self.scoring
)
self.cv_scores_mean_ = self.cv_scores_.mean()
self.draw()
return self
def draw(self, **kwargs):
"""
Creates the bar chart of the cross-validated scores generated from the
fit method and places a dashed horizontal line that represents the
average value of the scores.
"""
color = kwargs.pop("color", "b")
width = kwargs.pop("width", 0.3)
linewidth = kwargs.pop("linewidth", 1)
xvals = np.arange(1, len(self.cv_scores_) + 1, 1)
self.ax.bar(xvals, self.cv_scores_, width=width)
self.ax.axhline(
self.cv_scores_mean_, color=color,
label="Mean score = {:0.3f}".format(self.cv_scores_mean_),
linestyle='--', linewidth=linewidth
)
return self.ax
def finalize(self, **kwargs):
"""
Add the title, legend, and other visual final touches to the plot.
"""
# Set the title of the figure
self.set_title('Cross Validation Scores for {}'.format(self.name))
# Add the legend
loc = kwargs.pop("loc", "best")
edgecolor = kwargs.pop("edgecolor", "k")
self.ax.legend(frameon=True, loc=loc, edgecolor=edgecolor)
# set spacing between the x ticks
self.ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
# Set the axis labels
self.ax.set_xlabel('Training Instances')
self.ax.set_ylabel('Score')
##########################################################################
## Quick Method
##########################################################################
def cv_scores(model, X, y, ax=None, cv=None, scoring=None, **kwargs):
"""
Displays cross validation scores as a bar chart and the
average of the scores as a horizontal line
This helper function is a quick wrapper to utilize the
CVScores visualizer for one-off analysis.
Parameters
----------
model : a scikit-learn estimator
An object that implements ``fit`` and ``predict``, can be a
classifier, regressor, or clusterer so long as there is also a valid
associated scoring metric.
Note that the object is cloned for each validation.
X : array-like, shape (n_samples, n_features)
Training vector, where n_samples is the number of samples and
n_features is the number of features.
y : array-like, shape (n_samples) or (n_samples, n_features), optional
Target relative to X for classification or regression;
None for unsupervised learning.
ax : matplotlib.Axes object, optional
The axes object to plot the figure on.
cv : int, cross-validation generator or an iterable, optional
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to use the default 3-fold cross-validation,
- integer, to specify the number of folds.
- An object to be used as a cross-validation generator.
- An iterable yielding train/test splits.
see the scikit-learn
`cross-validation guide <https://goo.gl/FS3VU6>`_
for more information on the possible strategies that can be used here.
scoring : string, callable or None, optional, default: None
A string or scorer callable object / function with signature
``scorer(estimator, X, y)``.
See scikit-learn `cross-validation guide <https://goo.gl/FS3VU6>`_
for more information on the possible metrics that can be used.
kwargs : dict
Keyword arguments that are passed to the base class and may influence
the visualization as defined in other Visualizers.
Returns
-------
ax : matplotlib.Axes
The axes object that the validation curves were drawn on.
"""
# Initialize the visualizer
visualizer = cv_scores(model, X, y, ax=ax, cv=cv, scoring=scoring)
# Fit and poof the visualizer
visualizer.fit(X, y)
visualizer.poof(**kwargs)
return visualizer.ax
In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold
# Create a cross-validation strategy
cv = StratifiedKFold(12)
# Create the cv score visualizer
oz = CVScores(
MultinomialNB(), cv=cv, scoring='f1_weighted'
)
oz.fit(X, y)
oz.poof()
In [9]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
energy = pd.read_csv('data/energy/energy.csv')
targets = ["heating load", "cooling load"]
features = [col for col in energy.columns if col not in targets]
X = energy[features]
y = energy[targets[1]]
cv = KFold(12)
oz = CVScores(
Ridge(), cv=cv, scoring='r2'
)
oz.fit(X, y)
oz.poof()
In [ ]: