In [1]:
# Render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline
# Load the autoreload extension; mode 2 re-imports modules before each cell runs,
# so edits to the library source are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2
In [2]:
import os
import sys
sys.path.append("..")
sys.path.append("../..")
import numpy as np
import pandas as pd
In [3]:
## [from examples/examples.py]
from download import download_all
## Directory (under the current working directory) that holds the example data sets
FIXTURES = os.path.join(os.getcwd(), "data")

## Map each data set name to its CSV file, laid out as <FIXTURES>/<name>/<name>.csv
datasets = {
    dataset_name: os.path.join(FIXTURES, dataset_name, dataset_name + ".csv")
    for dataset_name in ("credit", "concrete", "occupancy", "mushroom")
}
def load_data(name, download=True):
    """
    Read one of the example data sets into a pandas DataFrame.

    Parameters
    ----------
    name : str
        Key into the module-level ``datasets`` mapping. A name that is not
        in the mapping raises ``KeyError`` from the lookup itself.
    download : bool, default True
        What to do when the CSV file is not on disk: if True, call
        ``download_all()`` before reading; if False, raise ``ValueError``.

    Returns
    -------
    The result of ``pd.read_csv`` on the data set's path.

    Raises
    ------
    ValueError
        If the file is missing and ``download`` is False.
    """
    csv_path = datasets[name]

    if not os.path.exists(csv_path):
        if not download:
            message = (
                "'{}' dataset has not been downloaded, "
                "use the download.py module to fetch datasets"
            )
            raise ValueError(message.format(name))
        # File is missing and downloading is allowed: fetch before reading
        download_all()

    return pd.read_csv(csv_path)
In [4]:
# Load the occupancy classification data set (load_data downloads it if it is missing)
data = load_data("occupancy")
# Report the number of rows, then preview the first few
print(data.shape[0])
data.head()
Out[4]:
In [5]:
# Specify the features of interest and the classes of the target
features = ["temperature", "relative humidity", "light", "C02", "humidity"]
classes = ['unoccupied', 'occupied']

# Searching the whole dataset takes a while (15 mins on my mac)...
# For demo purposes, we keep only the first N_SAMPLES rows. The limit is named
# once so the feature matrix and the target are always cut to the same length.
N_SAMPLES = 2000
X = data[features].head(N_SAMPLES)
y = data["occupancy"].head(N_SAMPLES)
Here we demo the `param_projection` utility function, which projects the grid search results onto a chosen pair of parameters.
In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from yellowbrick.gridsearch.base import param_projection
In [7]:
# Set up a vanilla grid search over an SVC (it is fit in the next cell).
# The parameter grid is the example from sklearn's gridsearch docs.
svc = SVC()
grid = [
    dict(kernel=['rbf'], gamma=[1e-3, 1e-4], C=[1, 10, 100, 1000]),
    dict(kernel=['linear'], C=[1, 10, 100, 1000]),
]
gs = GridSearchCV(estimator=svc, param_grid=grid, n_jobs=4)
In [8]:
%%time
# Run the grid search on the reduced data set; %%time reports how long the fit takes
gs.fit(X, y)
Out[8]:
As of scikit-learn 0.18, `cv_results_` has replaced `grid_scores_` as the grid search results format.
In [9]:
# cv_results_ is a dict of column-name -> array; viewing it as a DataFrame gives one
# row per parameter combination instead of a hard-to-read dump of raw arrays.
pd.DataFrame(gs.cv_results_)
Out[9]:
Demo the use of `param_projection`. It identifies the unique values of the two chosen parameters and returns the best score for each pair of values (here, taking the max over the gamma values).
In [10]:
# Project the grid search results onto the two parameters named below
param_1 = 'C'
param_2 = 'kernel'
# Names follow the param_1 / param_2 pattern used above
param_1_vals, param_2_vals, best_scores = param_projection(gs.cv_results_, param_1, param_2)
# Display the three returned values
param_1_vals, param_2_vals, best_scores
Out[10]:
In [11]:
from yellowbrick.gridsearch import GridSearchColorPlot
In [12]:
# Plot the grid search results across the 'C' and 'kernel' parameters
gs_viz = GridSearchColorPlot(gs, "C", "kernel")
fitted_viz = gs_viz.fit(X, y)
fitted_viz.show()
In [13]:
# Same pair of parameters as before, passed in the opposite order
gs_viz = GridSearchColorPlot(gs, "kernel", "C")
fitted_viz = gs_viz.fit(X, y)
fitted_viz.show()
In [14]:
# Plot the grid search results across the 'C' and 'gamma' parameters
gs_viz = GridSearchColorPlot(gs, "C", "gamma")
fitted_viz = gs_viz.fit(X, y)
fitted_viz.show()
If there are missing values in the grid, these are filled with a hatch (see https://stackoverflow.com/a/35905483/7637679)
In [15]:
# Plot across 'kernel' and 'gamma'; the grid defines gamma only for the rbf kernel,
# so some combinations have no result
gs_viz = GridSearchColorPlot(gs, "kernel", "gamma")
fitted_viz = gs_viz.fit(X, y)
fitted_viz.show()
Choose a different metric...
In [16]:
# Plot the 'mean_fit_time' metric across the 'C' and 'kernel' parameters
gs_viz = GridSearchColorPlot(gs, "C", "kernel", metric="mean_fit_time")
fitted_viz = gs_viz.fit(X, y)
fitted_viz.show()
In [17]:
from yellowbrick.gridsearch import gridsearch_color_plot
In [18]:
%%time
# Quick-method form: pass the GridSearchCV object, which was already fit above
gridsearch_color_plot(gs, 'C', 'kernel')
Out[18]:
In [19]:
%%time
# Same quick method with a different cut across the parameters: 'C' and 'gamma'
gridsearch_color_plot(gs, 'C', 'gamma')
Out[19]:
In [20]:
%%time
# When X (and y) are passed, the visualizer's `fit` re-fits the grid search
# before plotting, so this call takes longer than the pre-fit ones
gridsearch_color_plot(gs, 'C', 'kernel', X=X, y=y)
Out[20]:
In [21]:
%%time
# The quick method also accepts a different metric, here 'mean_fit_time'
gridsearch_color_plot(gs, 'C', 'kernel', metric='mean_fit_time')
Out[21]:
Bad param values
In [22]:
# 'foo' is not a parameter of the grid, so this is expected to fail.
# The error is caught and printed so the demonstration does not stop
# "Restart & Run All" at this cell.
try:
    gs_viz = GridSearchColorPlot(gs, 'foo', 'kernel')
    gs_viz.fit(X, y).show()
except Exception as err:  # demo cell: show whichever error is raised
    print("{}: {}".format(type(err).__name__, err))
In [23]:
# 'foo' is not a parameter of the grid, so this is expected to fail.
# The error is caught and printed so the demonstration does not stop
# "Restart & Run All" at this cell.
try:
    gs_viz = GridSearchColorPlot(gs, 'C', 'foo')
    gs_viz.fit(X, y).show()
except Exception as err:  # demo cell: show whichever error is raised
    print("{}: {}".format(type(err).__name__, err))
Bad metric option
In [24]:
# 'foo' is not a valid metric option, so this is expected to fail.
# The error is caught and printed so the demonstration does not stop
# "Restart & Run All" at this cell.
try:
    gs_viz = GridSearchColorPlot(gs, 'C', 'kernel', metric='foo')
    gs_viz.fit(X, y).show()
except Exception as err:  # demo cell: show whichever error is raised
    print("{}: {}".format(type(err).__name__, err))
The metric option exists in `cv_results_` but is not numeric, so it is not a valid metric.
In [25]:
# 'param_kernel' is a key of the results but is used here as an invalid
# (non-numeric) metric, so this is expected to fail. The error is caught and
# printed so the demonstration does not stop "Restart & Run All" at this cell.
try:
    gs_viz = GridSearchColorPlot(gs, 'C', 'kernel', metric='param_kernel')
    gs_viz.fit(X, y).show()
except Exception as err:  # demo cell: show whichever error is raised
    print("{}: {}".format(type(err).__name__, err))
In [ ]: