In [1]:
# Global imports and settings
# Matplotlib
%matplotlib inline
from matplotlib import pyplot as plt
plt.rcParams["figure.figsize"] = (8, 8)
plt.rcParams["figure.max_open_warning"] = -1
# Print options
import numpy as np
np.set_printoptions(precision=3)
# Slideshow
from notebook.services.config import ConfigManager
cm = ConfigManager()
cm.update('livereveal', {'width': 1440, 'height': 768, 'scroll': True, 'theme': 'simple'})
# Silence warnings
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)
warnings.simplefilter(action="ignore", category=RuntimeWarning)
# Utils
from robustness import plot_surface
from robustness import plot_outlier_detector
In [2]:
%%javascript
Reveal.addEventListener("slidechanged", function(event){ window.location.hash = "header"; });
In theory, models are trained and evaluated on clean, well-behaved data.
In practice, real data is noisy and often polluted with outliers.
Today's goal: build models that are robust to outliers!
Novelty detection: the training data is not polluted by outliers, and we are interested in detecting whether a new observation is an outlier (a novelty).
Outlier detection: the training data contains outliers, and we need to fit the central mode of the training data, ignoring the deviant observations.
In [ ]:
# Unsupervised learning
estimator.fit(X_train) # no "y_train"
# Detecting novelty or outliers
y_pred = estimator.predict(X_test) # inliers == 1, outliers == -1
y_score = estimator.decision_function(X_test) # the lower the score, the more abnormal
In [3]:
# Generate data
from sklearn.datasets import make_blobs
inliers, _ = make_blobs(n_samples=200, centers=2, random_state=1)
outliers = np.random.rand(50, 2)
outliers = np.min(inliers, axis=0) + (np.max(inliers, axis=0) - np.min(inliers, axis=0)) * outliers
X = np.vstack((inliers, outliers))
ground_truth = np.ones(len(X), dtype=int)  # 1 == inlier, 0 == outlier
ground_truth[-len(outliers):] = 0
In [4]:
from sklearn.svm import OneClassSVM
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
# Unsupervised learning
estimator = OneClassSVM(nu=0.4, kernel="rbf", gamma=0.1)
# estimator = EllipticEnvelope(contamination=.1)
# estimator = IsolationForest(max_samples=100)
estimator.fit(X)
plot_outlier_detector(estimator, X, ground_truth)
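As a rough sanity check (an addition, not part of the original demo), the two commented-out alternatives can also be fitted on the same data and compared against the known ground truth. The contamination value 0.2 below is an assumption taken from the 200 inlier / 50 outlier split generated above.
In [ ]:
# Sketch: compare the three detectors against the known ground truth
# (only possible here because the data was generated with known labels)
from sklearn.svm import OneClassSVM
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
detectors = {
    "OneClassSVM": OneClassSVM(nu=0.4, kernel="rbf", gamma=0.1),
    "EllipticEnvelope": EllipticEnvelope(contamination=0.2),
    "IsolationForest": IsolationForest(contamination=0.2, random_state=0),
}
for name, detector in detectors.items():
    detector.fit(X)
    pred = (detector.predict(X) == 1).astype(int)  # map +1/-1 to 1 (inlier) / 0 (outlier)
    print("%s: fraction of samples labelled correctly = %.3f" % (name, np.mean(pred == ground_truth)))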
Theorem. For the squared error loss, the bias-variance decomposition of the expected generalization error at $X=\mathbf{x}$ is
$$ \mathbb{E}_{\cal L} \{ Err(\varphi_{\cal L}(\mathbf{x})) \} = \text{noise}(\mathbf{x}) + \text{bias}^2(\mathbf{x}) + \text{var}(\mathbf{x}) $$
Theorem. For the squared error loss, the bias-variance decomposition of the expected generalization error at $X=\mathbf{x}$ of an ensemble of $M$ randomized models $\varphi_{{\cal L},\theta_m}$ is
$$ \mathbb{E}_{\cal L} \{ Err(\psi_{{\cal L},\theta_1,\dots,\theta_M}(\mathbf{x})) \} = \text{noise}(\mathbf{x}) + \text{bias}^2(\mathbf{x}) + \text{var}(\mathbf{x}) $$
where
\begin{align*}
\text{noise}(\mathbf{x}) &= Err(\varphi_B(\mathbf{x})), \\
\text{bias}^2(\mathbf{x}) &= (\varphi_B(\mathbf{x}) - \mathbb{E}_{{\cal L},\theta} \{ \varphi_{{\cal L},\theta}(\mathbf{x}) \} )^2, \\
\text{var}(\mathbf{x}) &= \rho(\mathbf{x}) \sigma^2_{{\cal L},\theta}(\mathbf{x}) + \frac{1 - \rho(\mathbf{x})}{M} \sigma^2_{{\cal L},\theta}(\mathbf{x}).
\end{align*}
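As $M$ grows, the second part of the variance term vanishes and only the correlated part $\rho(\mathbf{x}) \sigma^2_{{\cal L},\theta}(\mathbf{x})$ remains. The cell below is a small numerical sketch of this effect (not from the original slides; the toy regression problem and $M = 100$ are arbitrary choices): refit a single tree and a bagged ensemble on bootstrap resamples and compare the variance of their predictions at one fixed point.
In [ ]:
# Sketch: empirical prediction variance of a single tree vs. an ensemble of M=100 trees
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
X_toy, y_toy = make_regression(n_samples=300, n_features=5, noise=10.0, random_state=0)
x_query = X_toy[:1]  # fixed point x at which the prediction variance is measured
rng = np.random.RandomState(0)
preds_tree, preds_ensemble = [], []
for _ in range(25):  # 25 bootstrap resamples play the role of different learning sets L
    idx = rng.randint(0, len(X_toy), len(X_toy))
    Xb, yb = X_toy[idx], y_toy[idx]
    preds_tree.append(DecisionTreeRegressor().fit(Xb, yb).predict(x_query)[0])
    preds_ensemble.append(BaggingRegressor(DecisionTreeRegressor(), n_estimators=100,
                                           random_state=0).fit(Xb, yb).predict(x_query)[0])
print("Prediction variance, single tree:       %.1f" % np.var(preds_tree))
print("Prediction variance, ensemble of trees: %.1f" % np.var(preds_ensemble))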
In [5]:
# Load data
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data[:, [0, 1]]
y = iris.target
In [7]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier().fit(X, y)
plot_surface(clf, X, y)
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100).fit(X, y)
plot_surface(clf, X, y)
In [8]:
# Generate data
from sklearn.datasets import make_regression
n_outliers = 3
X, y, coef = make_regression(n_samples=100, n_features=1, n_informative=1, noise=10,
                             coef=True, random_state=0)
np.random.seed(1)
X[-n_outliers:] = 1 + 0.25 * np.random.normal(size=(n_outliers, 1))
y[-n_outliers:] = -100 + 10 * np.random.normal(size=n_outliers)
plt.scatter(X[:-n_outliers], y[:-n_outliers], color="b")
plt.scatter(X[-n_outliers:], y[-n_outliers:], color="r")
plt.xlim(-3, 3)
plt.ylim(-150, 120)
plt.show()
In [9]:
# Fit with least squares vs. least absolute deviations
from sklearn.ensemble import GradientBoostingRegressor
clf_ls = GradientBoostingRegressor(loss="squared_error")    # least squares ("ls" in older scikit-learn)
clf_lad = GradientBoostingRegressor(loss="absolute_error")  # least absolute deviations ("lad" in older scikit-learn)
clf_ls.fit(X, y)
clf_lad.fit(X, y)
# Plot
X_test = np.linspace(-5, 5).reshape(-1, 1)
plt.scatter(X[:-n_outliers], y[:-n_outliers], color="b")
plt.scatter(X[-n_outliers:], y[-n_outliers:], color="r")
plt.plot(X_test, clf_ls.predict(X_test), "g", label="Least squares")
plt.plot(X_test, clf_lad.predict(X_test), "y", label="Least absolute deviations")
plt.xlim(-3, 3)
plt.ylim(-150, 120)
plt.legend()
plt.show()
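To put a number on what the plot shows (this check is an addition, not part of the original notebook), the two fits can be compared on the inlier points only, e.g. with the mean absolute error:
In [ ]:
# Sketch: mean absolute error of each fit, computed on the inliers only
from sklearn.metrics import mean_absolute_error
X_in, y_in = X[:-n_outliers], y[:-n_outliers]
print("MAE on inliers, least squares:             %.1f" % mean_absolute_error(y_in, clf_ls.predict(X_in)))
print("MAE on inliers, least absolute deviations: %.1f" % mean_absolute_error(y_in, clf_lad.predict(X_in)))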
In [10]:
# Generate data
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
X, y = make_blobs(n_samples=100, centers=[(0, 0), (-1, 0)], random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
X_train[0, 0] = -1000 # a fairly large outlier
In [11]:
# Scale data
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
standard_scaler = StandardScaler()
Xtr_s = standard_scaler.fit_transform(X_train)
Xte_s = standard_scaler.transform(X_test)
robust_scaler = RobustScaler()
Xtr_r = robust_scaler.fit_transform(X_train)
Xte_r = robust_scaler.transform(X_test)
In [12]:
# Plot data
fig, ax = plt.subplots(1, 3, figsize=(12, 4))
ax[0].scatter(X_train[:, 0], X_train[:, 1], color=np.where(y_train == 0, 'r', 'b'))
ax[1].scatter(Xtr_s[:, 0], Xtr_s[:, 1], color=np.where(y_train == 0, 'r', 'b'))
ax[2].scatter(Xtr_r[:, 0], Xtr_r[:, 1], color=np.where(y_train == 0, 'r', 'b'))
ax[0].set_title("Unscaled data")
ax[1].set_title("After standard scaling (zoomed in)")
ax[2].set_title("After robust scaling (zoomed in)")
# for the scaled data, we zoom in to the data center (outlier can't be seen!)
for a in ax[1:]:
    a.set_xlim(-3, 3)
    a.set_ylim(-3, 3)
plt.show()
In [13]:
# Classify using kNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(Xtr_s, y_train)
acc_s = knn.score(Xte_s, y_test)
print("Test set accuracy using standard scaler: %.3f" % acc_s)
knn.fit(Xtr_r, y_train)
acc_r = knn.score(Xte_r, y_test)
print("Test set accuracy using robust scaler: %.3f" % acc_r)
In [14]:
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
# Generate 3 blobs with 2 classes where the second blob contains
# half positive samples and half negative samples. Probability in this
# blob is therefore 0.5.
X, y = make_blobs(n_samples=10000, n_features=2, cluster_std=1.0,
                  centers=[(-5, -5), (0, 0), (5, 5)], shuffle=False)
y[:len(X) // 2] = 0
y[len(X) // 2:] = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
In [15]:
# Plot
for this_y, color in zip([0, 1], ["r", "b"]):
    this_X = X_train[y_train == this_y]
    plt.scatter(this_X[:, 0], this_X[:, 1], c=color, alpha=0.2, label="Class %s" % this_y)
plt.legend(loc="best")
plt.title("Data")
plt.show()
In [16]:
from sklearn.naive_bayes import GaussianNB
from sklearn.calibration import CalibratedClassifierCV
# Without calibration
clf = GaussianNB()
clf.fit(X_train, y_train)  # plain Gaussian naive Bayes (probabilities are typically poorly calibrated)
prob_pos_clf = clf.predict_proba(X_test)[:, 1]
# With isotonic calibration
clf_isotonic = CalibratedClassifierCV(clf, cv=2, method='isotonic')
clf_isotonic.fit(X_train, y_train)
prob_pos_isotonic = clf_isotonic.predict_proba(X_test)[:, 1]
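Before plotting, the improvement can be quantified with the Brier score (lower is better); this check is an addition to the original example:
In [ ]:
# Sketch: Brier score of the uncalibrated vs. calibrated probabilities
from sklearn.metrics import brier_score_loss
print("Brier score, no calibration:       %.3f" % brier_score_loss(y_test, prob_pos_clf))
print("Brier score, isotonic calibration: %.3f" % brier_score_loss(y_test, prob_pos_isotonic))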
In [17]:
# Plot
order = np.lexsort((prob_pos_clf, ))
plt.plot(prob_pos_clf[order], 'r', label='No calibration')
plt.plot(prob_pos_isotonic[order], 'b', label='Isotonic calibration')
plt.plot(np.linspace(0, y_test.size, 51)[1::2], y_test[order].reshape(25, -1).mean(1), 'k--', label=r'Empirical')
plt.xlabel("Instances sorted according to predicted probability "
"(uncalibrated GNB)")
plt.ylabel("P(y=1)")
plt.legend(loc="upper left")
plt.title("Gaussian naive Bayes probabilities")
plt.ylim([-0.05, 1.05])
plt.show()
For robust and calibrated estimators: detect outliers before trusting the data (OneClassSVM, EllipticEnvelope, IsolationForest), average over randomized models to reduce variance (RandomForestClassifier), prefer robust losses (least absolute deviations over least squares), scale features robustly (RobustScaler instead of StandardScaler), and calibrate predicted probabilities (CalibratedClassifierCV), as sketched below.
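As a closing sketch (an assumption about how the pieces fit together, not code from the talk), the ingredients above can be combined in a single pipeline, here reusing the blobs data from the calibration example; GaussianNB as the base estimator is just for illustration.
In [ ]:
# Sketch: robust scaling + a calibrated classifier in one pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.calibration import CalibratedClassifierCV
model = make_pipeline(
    RobustScaler(),                                                  # outlier-resistant scaling
    CalibratedClassifierCV(GaussianNB(), cv=2, method="isotonic"),   # calibrated probabilities
)
model.fit(X_train, y_train)
print("Test accuracy: %.3f" % model.score(X_test, y_test))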
In [ ]:
Questions?