We've identified a bug in ROCAUC:
======================================================================
ERROR: Test ROCAUC with a binary classifier
----------------------------------------------------------------------
Traceback (most recent call last):
File "/Users/benjamin/Repos/ddl/yellowbrick/tests/test_classifier/test_rocauc.py", line 110, in test_binary_rocauc
s = visualizer.score(X_test, y_test)
File "/Users/benjamin/Repos/ddl/yellowbrick/yellowbrick/classifier/rocauc.py", line 171, in score
self.fpr[i], self.tpr[i], _ = roc_curve(y, y_pred[:,i], pos_label=c)
IndexError: too many indices for array
Let's see if we can figure out where it's getting triggered.
In [1]:
%matplotlib inline
In [2]:
import os
import sys
# Modify the path
sys.path.append("..")
import pandas as pd
import yellowbrick as yb
import matplotlib.pyplot as plt
from yellowbrick.classifier import ROCAUC
from sklearn.model_selection import train_test_split
In [3]:
# Load the occupancy dataset from the local data directory.
occupancy = pd.read_csv('data/occupancy/occupancy.csv')

# Columns used as model inputs, and the label names for the two target values.
# NOTE: `classes` is not referenced by any of the cells shown in this notebook.
features = ["temperature", "relative humidity", "light", "C02", "humidity"]
classes = ["unoccupied", "occupied"]

# Feature matrix and target vector.
X, y = occupancy[features], occupancy['occupancy']

# Hold out 20% of the rows for scoring the visualizers.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
In [5]:
# First batch of estimators to run through ROCAUC on the binary problem.
# The original list contained DecisionTreeClassifier() twice while the
# imported KNeighborsClassifier was never exercised; the duplicate entry is
# replaced so that every imported estimator is covered exactly once.
classifiers = [
    AdaBoostClassifier(),
    MLPClassifier(),
    DecisionTreeClassifier(),
    QuadraticDiscriminantAnalysis(),
    KNeighborsClassifier(),
    RandomForestClassifier(),
]
In [6]:
# Fit, score and draw a ROCAUC visualizer for every estimator in the list.
for classifier in classifiers:
    oz = ROCAUC(classifier)
    oz.fit(X_train, y_train)
    oz.score(X_test, y_test)
    # Bind the return value of show() to a name instead of leaving it as a
    # bare expression.
    g = oz.show()
Looks good; everything works!
What about classification with estimators that have multidimensional coefficients? Thanks to ZJ Poh for identifying these in this PR.
In [7]:
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
Some of these generate the `IndexError: too many indices for array` error, but not all!
These are the ones that seem to work: `BernoulliNB()`, `MultinomialNB()`, `LogisticRegression()`, and `LogisticRegressionCV()`.
In [8]:
# Estimators from the second batch of imports that score without raising
# the IndexError.
classifiers = [
    BernoulliNB(),
    MultinomialNB(),
    LogisticRegression(),
    LogisticRegressionCV(),
]

# Fit, score and render a ROCAUC plot for each of them.
for classifier in classifiers:
    oz = ROCAUC(classifier)
    oz.fit(X_train, y_train)
    oz.score(X_test, y_test)
    g = oz.show()
In [13]:
# Reproduction case: LinearSVC is listed among the failing classifiers in this
# notebook, so score() is expected to raise
# "IndexError: too many indices for array" on this binary problem.
oz = ROCAUC(LinearSVC())
oz.fit(X_train, y_train)
oz.score(X_test, y_test)
oz.show()
In [14]:
# Reproduction case: SVC is listed among the failing classifiers in this
# notebook, so score() is expected to raise
# "IndexError: too many indices for array" on this binary problem.
oz = ROCAUC(SVC())
oz.fit(X_train, y_train)
oz.score(X_test, y_test)
oz.show()
In [15]:
# Reproduction case: SGDClassifier is listed among the failing classifiers in
# this notebook, so score() is expected to raise
# "IndexError: too many indices for array" on this binary problem.
oz = ROCAUC(SGDClassifier())
oz.fit(X_train, y_train)
oz.score(X_test, y_test)
oz.show()
In [10]:
# Reproduction case: PassiveAggressiveClassifier is listed among the failing
# classifiers in this notebook, so score() is expected to raise
# "IndexError: too many indices for array" on this binary problem.
oz = ROCAUC(PassiveAggressiveClassifier())
oz.fit(X_train, y_train)
oz.score(X_test, y_test)
oz.show()
In [11]:
# Reproduction case: RidgeClassifier is listed among the failing classifiers
# in this notebook, so score() is expected to raise
# "IndexError: too many indices for array" on this binary problem.
oz = ROCAUC(RidgeClassifier())
oz.fit(X_train, y_train)
oz.score(X_test, y_test)
oz.show()
In [12]:
# Reproduction case: RidgeClassifierCV is listed among the failing classifiers
# in this notebook, so score() is expected to raise
# "IndexError: too many indices for array" on this binary problem.
oz = ROCAUC(RidgeClassifierCV())
oz.fit(X_train, y_train)
oz.score(X_test, y_test)
oz.show()
So what's going on here?
y_pred
It looks like all of the classifiers that trigger the `IndexError` during binary classification with `ROCAUC` are ones that have only a `decision_function` and for which `y_pred.shape` is `(n_samples,)`.
ROCAUC
In [16]:
# Names of the prediction methods to probe on each estimator, in the order
# in which they are tried.
attrs = ("predict_proba", "decision_function")

# Estimators that raised the IndexError in the single-estimator cells of this
# notebook.
failing_classifiers = [
    LinearSVC(),
    SVC(),
    SGDClassifier(),
    PassiveAggressiveClassifier(),
    RidgeClassifier(),
    RidgeClassifierCV(),
]
In [17]:
def profile(classifiers):
    """Fit each classifier and print the shape and values of its raw scores.

    For every estimator the prediction methods named in the module-level
    ``attrs`` tuple are tried in order and the output of the *first* one that
    resolves is reported.

    The function reads the module-level ``X_train``, ``y_train``, ``X_test``
    and ``attrs`` names and fits the estimators in place.

    Parameters
    ----------
    classifiers : iterable
        Unfitted estimator instances exposing ``fit`` and at least one of the
        methods named in ``attrs``.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    for classifier in classifiers:
        classifier.fit(X_train, y_train)
        name = classifier.__class__.__name__

        # Use the first method that resolves. Without the ``break`` a later
        # method (e.g. decision_function) would silently overwrite the output
        # of an earlier one (e.g. predict_proba) and the wrong array would be
        # profiled.
        y_pred = None
        for attr in attrs:
            method = getattr(classifier, attr, None)
            if method is None:
                continue
            try:
                y_pred = method(X_test)
            except AttributeError:
                # The attribute exists but is unusable in this configuration;
                # fall through to the next candidate.
                continue
            break

        # Never report a stale or undefined y_pred when nothing resolved.
        if y_pred is None:
            print("No prediction method found for {}.".format(name))
            continue

        print("y_pred shape for {} is {}.".format(name, y_pred.shape))
        print(y_pred)
# Print the prediction output shape for each estimator that raised the IndexError.
profile(failing_classifiers)
In [18]:
# Estimators that ROCAUC scored without error in the earlier cells, profiled
# for comparison with the failing ones.
# NOTE(review): the "_decision" suffix presumably marks estimators that also
# expose decision_function -- confirm against the scikit-learn docs.
working_classifiers_decision = [
    AdaBoostClassifier(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression(),
    LogisticRegressionCV()
]
profile(working_classifiers_decision)
In [19]:
# Estimators that ROCAUC scored without error in the earlier cells, profiled
# for comparison with the failing ones.
# NOTE(review): the "_proba" suffix presumably marks estimators that expose
# only predict_proba -- confirm against the scikit-learn docs.
working_classifiers_proba = [
    MLPClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    BernoulliNB(),
    MultinomialNB()
]
profile(working_classifiers_proba)
In [ ]: