Classifiers and Class Labels

This notebook explores how the classification score visualizers behave under different combinations of models, data, class labeling schemes, and other parameters. In particular, we explore the following:

Target Types:

  • binary
  • multiclass (3 or more classes; the synthetic multiclass dataset below uses 6)

Target Encoding:

  • integers
  • labels

Labeling:

  • list of classes
  • LabelEncoder
  • dict encoding
  • list of more classes than values in y

In [1]:
# Ensure we're importing the development version of Yellowbrick.
# The repository root must go at the FRONT of sys.path: entries are searched in
# order, so append() would leave any installed copy in site-packages ahead of
# the checkout. The guard keeps re-running this cell from stacking duplicates.
import sys
if sys.path[:1] != ["../.."]:
    sys.path.insert(0, "../..")

In [2]:
# Use inline so that we can run the notebook multiple times
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt

# Import all of the Yellowbrick classifiers
from yellowbrick.classifier import *
from yellowbrick.exceptions import YellowbrickError
from yellowbrick.datasets import load_game, load_occupancy

# Import scikit-learn utilities
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.datasets import make_classification

from collections import namedtuple
from functools import partial

# Lightweight record types used throughout the notebook:
#   Split   - a (train, test) pair
#   Dataset - feature split, target split, and the two class-labelling inputs
#             (classes, encoder) that are handed to the visualizers
Dataset = namedtuple("Dataset", ["X", "y", "classes", "encoder"])
Split = namedtuple("Split", ["train", "test"])


# Settings shared by both synthetic generators; only the number of target
# classes differs between the binary and the multiclass problem.
_SYNTHETIC_KWARGS = dict(
    n_samples=500,
    n_features=20,
    n_informative=8,
    n_redundant=2,
    n_clusters_per_class=3,
)

# Zero-argument factories returning an (X, y) pair from make_classification.
make_binary = partial(make_classification, n_classes=2, **_SYNTHETIC_KWARGS)
make_multiclass = partial(make_classification, n_classes=6, **_SYNTHETIC_KWARGS)

In [3]:
# Select the parameters to run against all models 
# Then restart kernel and run all
# NOTE: these values are bound as default arguments of make_dataset() and
# visualize() when those functions are defined further down in this cell, so
# the cell has to be re-executed for an edit here to take effect.
MODEL = LogisticRegression(solver='lbfgs', multi_class='auto')  # estimator handed to every visualizer
IS_FITTED = False  # forwarded as is_fitted=; when truthy, visualize() fits MODEL before wrapping it
DATASET = "multiclass"  # one of 'game', 'occupancy', 'binary', 'multiclass'
TARGET = "integers"  # 'integers' or 'labels' (anything else raises ValueError in make_dataset)
ENCODER = "labelencoder"  # 'list', 'labelencoder', 'dict' or None
USE_PANDAS = False  # only consulted for the 'game' and 'occupancy' datasets


def make_dataset(name=DATASET, target=TARGET, encoder=ENCODER, use_pandas=USE_PANDAS, random_state=None):
    """
    Build the train/test data and class-labelling inputs used by the notebook.

    Parameters
    ----------
    name : str
        Dataset to load: 'game' or 'occupancy' (Yellowbrick loaders) or
        'binary' or 'multiclass' (synthetic, via make_classification).

    target : str
        'integers' label-encodes a non-integer target; 'labels' maps an
        integer target to its label names. Targets already in the requested
        form are left untouched.

    encoder : str or None
        How class names are supplied to the visualizer:
        'list' -> ``classes`` is a list of label names,
        'labelencoder' -> ``encoder`` is a LabelEncoder fitted on the names,
        'dict' -> ``encoder`` is a ``{value: name}`` dict,
        None -> neither is supplied.

    use_pandas : bool
        For 'game' and 'occupancy' only: request pandas objects instead of
        numpy arrays from the loaded dataset.

    random_state : int, RandomState or None, default: None
        Forwarded to the synthetic generator and to the train/test split so a
        run can be reproduced. None keeps the unseeded behaviour.

    Returns
    -------
    Dataset
        ``Dataset(Split(X_train, X_test), Split(y_train, y_test), classes, encoder)``

    Raises
    ------
    ValueError
        If ``name``, ``target`` or ``encoder`` is not a recognised choice.
    """
    loaders = {
        'game': load_game,
        'occupancy': load_occupancy,
        'binary': make_binary,
        'multiclass': make_multiclass,
    }

    # Fail with a clear message rather than "'NoneType' object is not callable"
    if name not in loaders:
        raise ValueError(f"unknown dataset '{name}', use one of {sorted(loaders)}")
    loader = loaders[name]

    if name in {'game', 'occupancy'}:
        loaded = loader(return_dataset=True)
        # (label name, encoded value) pairs ordered by encoded value
        labels = sorted(loaded.meta['labels'].items(), key=lambda i: i[1])

        if use_pandas:
            X, y = loaded.to_pandas()
        else:
            X, y = loaded.to_numpy()
    else:
        X, y = loader(random_state=random_state)
        # Materialize the pairs: a bare zip() is exhausted after one pass, but
        # labels may be read both for the target and for the encoder below.
        labels = list(zip('abcdefghijk', np.unique(y)))

    if name == 'game':
        X = OneHotEncoder().fit_transform(X)

    # game target is string encoded, occupancy is integer encoded
    if target == "integers":
        if y.dtype.kind != 'i':
            y = LabelEncoder().fit_transform(y)
    elif target == "labels":
        if y.dtype.kind == 'i':
            # labels is a sequence of (name, value) pairs, not a mapping
            names_by_value = {value: label for label, value in labels}
            y = np.array([names_by_value[yi] for yi in y])
    else:
        raise ValueError(f"unknown target type '{target}', use integers or labels")

    c, le = None, None
    X_train, X_test, y_train, y_test = tts(
        X, y, test_size=0.2, shuffle=True, stratify=y, random_state=random_state
    )

    if encoder == 'list':
        c = [label for label, _ in labels]
    elif encoder == 'labelencoder':
        le = LabelEncoder().fit([label for label, _ in labels])
    elif encoder == 'dict':
        le = {value: label for label, value in labels}
    elif encoder is None:
        c, le = None, None
    else:
        raise ValueError(f"unknown encoder type '{encoder}', see make_dataset for choices")

    return Dataset(Split(X_train, X_test), Split(y_train, y_test), c, le)
    

def visualize(visualizer, model=MODEL, is_fitted=IS_FITTED, score=True):
    """
    Instantiate a classification visualizer on the notebook's dataset, fit it,
    optionally score it, and finalize the figure.

    Reads the module-level ``dataset`` for the train/test data and for the
    ``classes`` / ``encoder`` arguments. Errors raised while building or
    running the visualizer are printed rather than propagated, so that every
    cell of the notebook runs to completion.

    Parameters
    ----------
    visualizer : callable
        Visualizer class (or factory) called as
        ``visualizer(model, ax=, classes=, encoder=, is_fitted=)``.

    model : estimator
        Model wrapped by the visualizer. When ``is_fitted`` is truthy, ``fit``
        is called on this object with the training split first.

    is_fitted : bool or str
        Forwarded to the visualizer as ``is_fitted``.

    score : bool, default: True
        If False, skip the ``oz.score`` call on the test split.

    Returns
    -------
    oz : visualizer instance or None
        The visualizer, which may be only partially fitted/scored if an error
        was printed, or None when the visualizer could not be constructed.
    """
    if is_fitted:
        # This includes both auto and True; fit the model manually if you want the exception raised
        model = model.fit(dataset.X.train, dataset.y.train)
    _, ax = plt.subplots(figsize=(9,6))

    # Bind oz up front: if the constructor itself raises, the return below
    # would otherwise fail with UnboundLocalError after the message is printed.
    oz = None
    try:
        oz = visualizer(model, ax=ax, classes=dataset.classes, encoder=dataset.encoder, is_fitted=is_fitted)
        oz.fit(dataset.X.train, dataset.y.train)

        if score:
            oz.score(dataset.X.test, dataset.y.test)

        oz.finalize()
    except YellowbrickError as e:
        print(e)
    except Exception as e:
        # Deliberately broad: the notebook records how each visualizer fails
        print("A NON YB ERROR OCCURRED:")
        print(e)
    return oz
    
    
                         
# Build the data once with the configuration above; visualize() reads this
# module-level name rather than taking the data as an argument.
dataset = make_dataset()

In [4]:
# Class prediction error for the configured model and dataset; the visualizer
# is kept in `oz` so the next cell can inspect it.
oz = visualize(ClassPredictionError)



In [5]:
# Score held by the visualizer from the previous cell
# (presumably set by the oz.score() call inside visualize() - confirm)
oz.score_


Out[5]:
0.43

In [6]:
# Classification report; the returned visualizer is the cell's last
# expression, so its repr is echoed as the cell output.
visualize(ClassificationReport)


Out[6]:
ClassificationReport(ax=<matplotlib.axes._subplots.AxesSubplot object at 0x12b9f77b8>,
                     classes=None,
                     cmap=<matplotlib.colors.ListedColormap object at 0x10ff4ae80>,
                     encoder=LabelEncoder(), force_model=False, is_fitted=False,
                     model=None, support=None)

In [7]:
# Confusion matrix with the same model, dataset and encoder settings
visualize(ConfusionMatrix)


Out[7]:
ConfusionMatrix(ax=<matplotlib.axes._subplots.AxesSubplot object at 0x12db15470>,
                classes=None,
                cmap=<matplotlib.colors.ListedColormap object at 0x12dc3fa58>,
                encoder=LabelEncoder(), fontsize=None, force_model=False,
                is_fitted=False, model=None, percent=False, sample_weight=None)

In [8]:
# Precision-recall curve (the echoed repr shows PRCurve is PrecisionRecallCurve)
visualize(PRCurve)


Out[8]:
PrecisionRecallCurve(ap_score=True,
                     ax=<matplotlib.axes._subplots.AxesSubplot object at 0x12dc96da0>,
                     classes=None, encoder=LabelEncoder(), fill_area=True,
                     fill_opacity=0.2, force_model=False, is_fitted=False,
                     iso_f1_curves=False, iso_f1_values={0.2, 0.4, 0.6, 0.8},
                     line_opacity=0.8, micro=True, model=None, per_class=False)

In [9]:
# ROC/AUC curves with the same model, dataset and encoder settings
visualize(ROCAUC)


Out[9]:
ROCAUC(ax=<matplotlib.axes._subplots.AxesSubplot object at 0x12dd24e10>,
       classes=None, encoder=LabelEncoder(), force_model=False, is_fitted=False,
       macro=True, micro=True, model=None, per_class=True)

In [10]:
# score=False makes visualize() skip the oz.score() call for this visualizer.
# In the recorded run (DATASET = "multiclass") the error
# "multiclass format is not supported" was printed by visualize().
visualize(DiscriminationThreshold, score=False)


multiclass format is not supported
Out[10]:
DiscriminationThreshold(argmax='fscore',
                        ax=<matplotlib.axes._subplots.AxesSubplot object at 0x12de8c828>,
                        cv=0.1, exclude=None, fbeta=1.0, force_model=None,
                        is_fitted=False, model=None, n_trials=50,
                        quantiles=array([0.1, 0.5, 0.9]), random_state=None)