In [1]:

    
%matplotlib inline
%load_ext autoreload
%autoreload 2



In [2]:

    
import os
import sys

sys.path.append("..")
sys.path.append("../..")

import numpy as np 
import pandas as pd

Occupancy data



In [3]:

    
## [from examples/examples.py]
from download import download_all 

## The path to the test data sets
FIXTURES  = os.path.join(os.getcwd(), "data")

## Dataset loading mechanisms
datasets = {
    "credit": os.path.join(FIXTURES, "credit", "credit.csv"),
    "concrete": os.path.join(FIXTURES, "concrete", "concrete.csv"),
    "occupancy": os.path.join(FIXTURES, "occupancy", "occupancy.csv"),
    "mushroom": os.path.join(FIXTURES, "mushroom", "mushroom.csv"),
}

def load_data(name, download=True):
    """
    Loads and wrangles the passed in dataset by name.
    If download is specified, this method will download any missing files. 
    """
    # Get the path from the datasets 
    path = datasets[name]
    
    # Check if the data exists, otherwise download or raise 
    if not os.path.exists(path):
        if download:
            download_all() 
        else:
            raise ValueError((
                "'{}' dataset has not been downloaded, "
                "use the download.py module to fetch datasets"
            ).format(name))
    
    # Return the data frame
    return pd.read_csv(path)



In [4]:

    
# Load the classification data set
data = load_data('occupancy') 
print(len(data))
data.head()









    



20560






    Out[4]:







  
    
      
      datetime
      temperature
      relative humidity
      light
      C02
      humidity
      occupancy
    
  
  
    
      0
      2015-02-04 17:51:00
      23.18
      27.2720
      426.0
      721.25
      0.004793
      1
    
    
      1
      2015-02-04 17:51:59
      23.15
      27.2675
      429.5
      714.00
      0.004783
      1
    
    
      2
      2015-02-04 17:53:00
      23.15
      27.2450
      426.0
      713.50
      0.004779
      1
    
    
      3
      2015-02-04 17:54:00
      23.15
      27.2000
      426.0
      708.25
      0.004772
      1
    
    
      4
      2015-02-04 17:55:00
      23.10
      27.2000
      426.0
      704.50
      0.004757
      1



In [5]:

    
# Specify the features of interest and the classes of the target 
features = ["temperature", "relative humidity", "light", "C02", "humidity"]
classes = ['unoccupied', 'occupied']

# Searching the whole dataset takes a while (15 mins on my mac)... 
# For demo purposes, we reduce the size
X = data[features].head(2000)
y = data.occupancy.head(2000)

Parameter projection

Because the visualizer only displays results across two parameters, we need some way of reducing the dimension to 2.
Our approach: for each value of the parameters of interest, display the maximum score across all the other parameters.

Here we demo the param_projection utility function that does this



In [6]:

    
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from yellowbrick.gridsearch.base import param_projection









    



/Users/pschafer/.pyenv/versions/3.6.2/envs/yellowbrick/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)



In [7]:

    
# Fit a vanilla grid search... these are the example parameters from sklearn's gridsearch docs.
svc = SVC()
grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
        {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
gs = GridSearchCV(svc, grid, n_jobs=4)



In [8]:

    
%%time
gs.fit(X, y)









    



CPU times: user 147 ms, sys: 28.2 ms, total: 175 ms
Wall time: 31 s






    Out[8]:





GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid=[{'kernel': ['rbf'], 'gamma': [0.001, 0.0001], 'C': [1, 10, 100, 1000]}, {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

As of Scikit-learn 0.18, cv_results has replaced grid_scores as the grid search results format



In [9]:

    
gs.cv_results_









    



/Users/pschafer/.pyenv/versions/3.6.2/envs/yellowbrick/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('mean_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True
  warnings.warn(*warn_args, **warn_kwargs)
/Users/pschafer/.pyenv/versions/3.6.2/envs/yellowbrick/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split0_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True
  warnings.warn(*warn_args, **warn_kwargs)
/Users/pschafer/.pyenv/versions/3.6.2/envs/yellowbrick/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split1_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True
  warnings.warn(*warn_args, **warn_kwargs)
/Users/pschafer/.pyenv/versions/3.6.2/envs/yellowbrick/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split2_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True
  warnings.warn(*warn_args, **warn_kwargs)
/Users/pschafer/.pyenv/versions/3.6.2/envs/yellowbrick/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('std_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True
  warnings.warn(*warn_args, **warn_kwargs)






    Out[9]:





{'mean_fit_time': array([0.02446747, 0.01360734, 0.02218564, 0.00889039, 0.02143168,
        0.0146842 , 0.03453048, 0.02234364, 1.94032502, 3.24313124,
        4.1275959 , 3.99856925]),
 'mean_score_time': array([0.00456238, 0.00284052, 0.00362539, 0.00334334, 0.00262896,
        0.00233173, 0.00262411, 0.00177709, 0.0011073 , 0.00116841,
        0.00101606, 0.00154575]),
 'mean_test_score': array([0.895 , 0.891 , 0.9055, 0.8995, 0.8995, 0.914 , 0.9045, 0.9185,
        0.915 , 0.9135, 0.9105, 0.9115]),
 'mean_train_score': array([0.9847535 , 0.97950518, 0.990002  , 0.98125506, 0.99450106,
        0.98475368, 0.99625056, 0.99275119, 0.96525668, 0.96550693,
        0.96525687, 0.96500699]),
 'param_C': masked_array(data=[1, 1, 10, 10, 100, 100, 1000, 1000, 1, 10, 100, 1000],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_gamma': masked_array(data=[0.001, 0.0001, 0.001, 0.0001, 0.001, 0.0001, 0.001,
                    0.0001, --, --, --, --],
              mask=[False, False, False, False, False, False, False, False,
                     True,  True,  True,  True],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'rbf', 'rbf', 'rbf', 'rbf', 'rbf', 'rbf', 'rbf',
                    'linear', 'linear', 'linear', 'linear'],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'gamma': 0.001, 'kernel': 'rbf'},
  {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'},
  {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'},
  {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'},
  {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'},
  {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'},
  {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'},
  {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 100, 'kernel': 'linear'},
  {'C': 1000, 'kernel': 'linear'}],
 'rank_test_score': array([11, 12,  7,  9,  9,  3,  8,  1,  2,  4,  6,  5], dtype=int32),
 'split0_test_score': array([0.85907046, 0.82158921, 0.89805097, 0.84557721, 0.87256372,
        0.88905547, 0.89655172, 0.90254873, 0.91004498, 0.93103448,
        0.90854573, 0.91754123]),
 'split0_train_score': array([0.98349587, 0.97974494, 0.987997  , 0.98274569, 0.99324831,
        0.98424606, 0.99474869, 0.99024756, 0.95723931, 0.95873968,
        0.9579895 , 0.9579895 ]),
 'split1_test_score': array([0.82608696, 0.85307346, 0.82608696, 0.85307346, 0.82608696,
        0.85307346, 0.82608696, 0.85307346, 0.89055472, 0.89055472,
        0.89055472, 0.89055472]),
 'split1_train_score': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'split2_test_score': array([1.        , 0.9984985 , 0.99249249, 1.        , 1.        ,
        1.        , 0.99099099, 1.        , 0.94444444, 0.91891892,
        0.93243243, 0.92642643]),
 'split2_train_score': array([0.97076462, 0.95877061, 0.982009  , 0.96101949, 0.99025487,
        0.97001499, 0.994003  , 0.988006  , 0.93853073, 0.93778111,
        0.93778111, 0.93703148]),
 'std_fit_time': array([0.00783314, 0.00750969, 0.00391411, 0.00338101, 0.0082266 ,
        0.00776113, 0.02039378, 0.01587166, 1.54496874, 2.5505148 ,
        2.94695329, 2.82673379]),
 'std_score_time': array([1.04638701e-03, 1.01035138e-03, 1.00287903e-03, 2.10844017e-03,
        5.38548407e-04, 6.35982442e-04, 6.02011534e-04, 5.75618900e-04,
        2.99961800e-04, 8.37446367e-05, 3.58759960e-04, 6.80733055e-04]),
 'std_test_score': array([0.07540321, 0.07703631, 0.06813033, 0.07107689, 0.07350339,
        0.06251678, 0.06754775, 0.06102718, 0.02227512, 0.01696758,
        0.01715017, 0.01525357]),
 'std_train_score': array([0.01196838, 0.01683268, 0.00748038, 0.01594859, 0.00407586,
        0.01224659, 0.00266867, 0.00520673, 0.02572711, 0.02584756,
        0.02591536, 0.02618132])}

Demo the use of param_projection... It identifies the unique values of the the two parameter values and gets the best score for each (here taking the max over gamma values)



In [10]:

    
param_1 = 'C'
param_2 = 'kernel'
param_1_vals, param2_vals, best_scores = param_projection(gs.cv_results_, param_1, param_2)
param_1_vals, param2_vals, best_scores









    Out[10]:





([1, 10, 100, 1000],
 ['linear', 'rbf'],
 array([[0.915 , 0.9135, 0.9105, 0.9115],
        [0.895 , 0.9055, 0.914 , 0.9185]]))

GridSearchColorPlot

This visualizer wraps the GridSearchCV object and plots the values obtained from param_projection.



In [11]:

    
from yellowbrick.gridsearch import GridSearchColorPlot



In [12]:

    
gs_viz = GridSearchColorPlot(gs, 'C', 'kernel')
gs_viz.fit(X, y).show()



In [13]:

    
gs_viz = GridSearchColorPlot(gs, 'kernel', 'C')
gs_viz.fit(X, y).show()



In [14]:

    
gs_viz = GridSearchColorPlot(gs, 'C', 'gamma')
gs_viz.fit(X, y).show()

If there are missing values in the grid, these are filled with a hatch (see https://stackoverflow.com/a/35905483/7637679)



In [15]:

    
gs_viz = GridSearchColorPlot(gs, 'kernel', 'gamma')
gs_viz.fit(X, y).show()

Choose a different metric...



In [16]:

    
gs_viz = GridSearchColorPlot(gs, 'C', 'kernel', metric='mean_fit_time')
gs_viz.fit(X, y).show()

Quick Method

Because grid search can take a long time and we may want to interactively cut the results a few different ways, by default the quick method assumes that the GridSearchCV object is already fit if no X data is passed in.



In [17]:

    
from yellowbrick.gridsearch import gridsearch_color_plot



In [18]:

    
%%time
# passing the GridSearchCV object pre-fit
gridsearch_color_plot(gs, 'C', 'kernel')









    



CPU times: user 84.1 ms, sys: 2.95 ms, total: 87.1 ms
Wall time: 87.4 ms






    Out[18]:





<matplotlib.axes._subplots.AxesSubplot at 0x1105f2ef0>



In [19]:

    
%%time
# trying a different cut across parameters
gridsearch_color_plot(gs, 'C', 'gamma')









    



CPU times: user 73.3 ms, sys: 3.95 ms, total: 77.2 ms
Wall time: 79.8 ms






    Out[19]:





<matplotlib.axes._subplots.AxesSubplot at 0x1107c5470>



In [20]:

    
%%time
# When we provide X, the `fit` method will call fit (takes longer)
gridsearch_color_plot(gs, 'C', 'kernel', X=X, y=y)









    



CPU times: user 206 ms, sys: 35.1 ms, total: 241 ms
Wall time: 31 s






    Out[20]:





<matplotlib.axes._subplots.AxesSubplot at 0x1107c5710>



In [21]:

    
%%time
# can also choose a different metric
gridsearch_color_plot(gs, 'C', 'kernel', metric='mean_fit_time')









    



CPU times: user 67.5 ms, sys: 3.4 ms, total: 70.9 ms
Wall time: 77.7 ms






    Out[21]:





<matplotlib.axes._subplots.AxesSubplot at 0x110a26208>

Parameter errors

Bad param values



In [22]:

    
gs_viz = GridSearchColorPlot(gs, 'foo', 'kernel')
gs_viz.fit(X, y).show()









    



---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/projects/yellowbrick/yellowbrick/gridsearch/base.py in param_projection(cv_results, x_param, y_param, metric)
     57     try:
---> 58         x_vals = cv_results['param_' + x_param]
     59     except KeyError:

~/.pyenv/versions/3.6.2/envs/yellowbrick/lib/python3.6/site-packages/sklearn/utils/deprecation.py in __getitem__(self, key)
    122             warnings.warn(*warn_args, **warn_kwargs)
--> 123         return super(DeprecationDict, self).__getitem__(key)
    124 

KeyError: 'param_foo'

During handling of the above exception, another exception occurred:

YellowbrickKeyError                       Traceback (most recent call last)
<ipython-input-22-5a2e9521b555> in <module>()
      1 gs_viz = GridSearchColorPlot(gs, 'foo', 'kernel')
----> 2 gs_viz.fit(X, y).show()

~/projects/yellowbrick/yellowbrick/gridsearch/base.py in fit(self, X, y, **kwargs)
    187         """
    188         self.estimator.fit(X, y)
--> 189         self.draw()
    190         return self

~/projects/yellowbrick/yellowbrick/gridsearch/pcolor.py in draw(self)
    138         # Project the grid search results to 2 dimensions
    139         x_vals, y_vals, best_scores = self.param_projection(
--> 140             self.x_param, self.y_param, metric=self.metric
    141         )
    142 

~/projects/yellowbrick/yellowbrick/gridsearch/base.py in param_projection(self, x_param, y_param, metric)
    162             Array of scores to be displayed for each parameter value pair.
    163         """
--> 164         return param_projection(self.estimator.cv_results_, x_param, y_param, metric)
    165 
    166     def fit(self, X, y=None, **kwargs):

~/projects/yellowbrick/yellowbrick/gridsearch/base.py in param_projection(cv_results, x_param, y_param, metric)
     59     except KeyError:
     60         raise YellowbrickKeyError("Parameter '{}' does not exist in the grid "
---> 61                                   "search results".format(x_param))
     62     try:
     63         y_vals = cv_results['param_' + y_param]

YellowbrickKeyError: "Parameter 'foo' does not exist in the grid search results"



In [23]:

    
gs_viz = GridSearchColorPlot(gs, 'C', 'foo')
gs_viz.fit(X, y).show()









    



---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/projects/yellowbrick/yellowbrick/gridsearch/base.py in param_projection(cv_results, x_param, y_param, metric)
     62     try:
---> 63         y_vals = cv_results['param_' + y_param]
     64     except KeyError:

~/.pyenv/versions/3.6.2/envs/yellowbrick/lib/python3.6/site-packages/sklearn/utils/deprecation.py in __getitem__(self, key)
    122             warnings.warn(*warn_args, **warn_kwargs)
--> 123         return super(DeprecationDict, self).__getitem__(key)
    124 

KeyError: 'param_foo'

During handling of the above exception, another exception occurred:

YellowbrickKeyError                       Traceback (most recent call last)
<ipython-input-23-1cfe8afcafe9> in <module>()
      1 gs_viz = GridSearchColorPlot(gs, 'C', 'foo')
----> 2 gs_viz.fit(X, y).show()

~/projects/yellowbrick/yellowbrick/gridsearch/base.py in fit(self, X, y, **kwargs)
    187         """
    188         self.estimator.fit(X, y)
--> 189         self.draw()
    190         return self

~/projects/yellowbrick/yellowbrick/gridsearch/pcolor.py in draw(self)
    138         # Project the grid search results to 2 dimensions
    139         x_vals, y_vals, best_scores = self.param_projection(
--> 140             self.x_param, self.y_param, metric=self.metric
    141         )
    142 

~/projects/yellowbrick/yellowbrick/gridsearch/base.py in param_projection(self, x_param, y_param, metric)
    162             Array of scores to be displayed for each parameter value pair.
    163         """
--> 164         return param_projection(self.estimator.cv_results_, x_param, y_param, metric)
    165 
    166     def fit(self, X, y=None, **kwargs):

~/projects/yellowbrick/yellowbrick/gridsearch/base.py in param_projection(cv_results, x_param, y_param, metric)
     64     except KeyError:
     65         raise YellowbrickKeyError("Parameter '{}' does not exist in the grid "
---> 66                                   "search results".format(y_param))
     67     try:
     68         scores = cv_results[metric]

YellowbrickKeyError: "Parameter 'foo' does not exist in the grid search results"

Bad metric option



In [24]:

    
gs_viz = GridSearchColorPlot(gs, 'C', 'kernel', metric='foo')
gs_viz.fit(X, y).show()









    



---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/projects/yellowbrick/yellowbrick/gridsearch/base.py in param_projection(cv_results, x_param, y_param, metric)
     67     try:
---> 68         scores = cv_results[metric]
     69     except KeyError:

~/.pyenv/versions/3.6.2/envs/yellowbrick/lib/python3.6/site-packages/sklearn/utils/deprecation.py in __getitem__(self, key)
    122             warnings.warn(*warn_args, **warn_kwargs)
--> 123         return super(DeprecationDict, self).__getitem__(key)
    124 

KeyError: 'foo'

During handling of the above exception, another exception occurred:

YellowbrickKeyError                       Traceback (most recent call last)
<ipython-input-24-d60beabdd754> in <module>()
      1 gs_viz = GridSearchColorPlot(gs, 'C', 'kernel', metric='foo')
----> 2 gs_viz.fit(X, y).show()

~/projects/yellowbrick/yellowbrick/gridsearch/base.py in fit(self, X, y, **kwargs)
    187         """
    188         self.estimator.fit(X, y)
--> 189         self.draw()
    190         return self

~/projects/yellowbrick/yellowbrick/gridsearch/pcolor.py in draw(self)
    138         # Project the grid search results to 2 dimensions
    139         x_vals, y_vals, best_scores = self.param_projection(
--> 140             self.x_param, self.y_param, metric=self.metric
    141         )
    142 

~/projects/yellowbrick/yellowbrick/gridsearch/base.py in param_projection(self, x_param, y_param, metric)
    162             Array of scores to be displayed for each parameter value pair.
    163         """
--> 164         return param_projection(self.estimator.cv_results_, x_param, y_param, metric)
    165 
    166     def fit(self, X, y=None, **kwargs):

~/projects/yellowbrick/yellowbrick/gridsearch/base.py in param_projection(cv_results, x_param, y_param, metric)
     69     except KeyError:
     70         raise YellowbrickKeyError("Metric '{}' does not exist in the grid "
---> 71                                   "search results".format(metric))
     72 
     73     # Get unique, unmasked values of the two display parameters

YellowbrickKeyError: "Metric 'foo' does not exist in the grid search results"

Metric option exists in cv_results but is not numeric -> not valid



In [25]:

    
gs_viz = GridSearchColorPlot(gs, 'C', 'kernel', metric='param_kernel')
gs_viz.fit(X, y).show()









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~/projects/yellowbrick/yellowbrick/gridsearch/base.py in param_projection(cv_results, x_param, y_param, metric)
    104                 try:
--> 105                     best_scores[y, x] = max(all_scores[y][x])
    106                 except ValueError:

ValueError: could not convert string to float: 'linear'

During handling of the above exception, another exception occurred:

YellowbrickValueError                     Traceback (most recent call last)
<ipython-input-25-8c5aa43d422c> in <module>()
      1 gs_viz = GridSearchColorPlot(gs, 'C', 'kernel', metric='param_kernel')
----> 2 gs_viz.fit(X, y).show()

~/projects/yellowbrick/yellowbrick/gridsearch/base.py in fit(self, X, y, **kwargs)
    187         """
    188         self.estimator.fit(X, y)
--> 189         self.draw()
    190         return self

~/projects/yellowbrick/yellowbrick/gridsearch/pcolor.py in draw(self)
    138         # Project the grid search results to 2 dimensions
    139         x_vals, y_vals, best_scores = self.param_projection(
--> 140             self.x_param, self.y_param, metric=self.metric
    141         )
    142 

~/projects/yellowbrick/yellowbrick/gridsearch/base.py in param_projection(self, x_param, y_param, metric)
    162             Array of scores to be displayed for each parameter value pair.
    163         """
--> 164         return param_projection(self.estimator.cv_results_, x_param, y_param, metric)
    165 
    166     def fit(self, X, y=None, **kwargs):

~/projects/yellowbrick/yellowbrick/gridsearch/base.py in param_projection(cv_results, x_param, y_param, metric)
    107                     raise YellowbrickValueError(
    108                         "Cannot display grid search results for metric '{}': "
--> 109                         "result values may not all be numeric".format(metric)
    110                     )
    111 

YellowbrickValueError: Cannot display grid search results for metric 'param_kernel': result values may not all be numeric



In [ ]:

	datetime	temperature	relative humidity	light	C02	humidity	occupancy
0	2015-02-04 17:51:00	23.18	27.2720	426.0	721.25	0.004793	1
1	2015-02-04 17:51:59	23.15	27.2675	429.5	714.00	0.004783	1
2	2015-02-04 17:53:00	23.15	27.2450	426.0	713.50	0.004779	1
3	2015-02-04 17:54:00	23.15	27.2000	426.0	708.25	0.004772	1
4	2015-02-04 17:55:00	23.10	27.2000	426.0	704.50	0.004757	1