See more at Cook's Distance. A very good description of influence and leverage can be found in Linear Regression in Python, Outliers / Leverage Detect (this post should make it into the Yellowbrick documentation in some form).
A good description of outlier detection that also includes Cook's Distance is How to Make Your Machine Learning Models Robust to Outliers. The original motivating post is How do you check the quality of your regression model in Python?; the source code for that post is at Visual analytics and diagnostics of model fit for linear regression.
The statsmodels implementation of Cook's Distance lives in statsmodels/stats/outliers_influence.py, as the cooks_distance property of OLSInfluence (imported below).
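For reference, the quantity implemented in the cells below is the standard Cook's distance, which combines each instance's internally studentized residual with its leverage:

$$D_i = \frac{r_i^2}{k} \cdot \frac{h_{ii}}{1 - h_{ii}}, \qquad r_i = \frac{e_i}{\sqrt{\hat{\sigma}^2 \, (1 - h_{ii})}}$$

where $e_i$ is the ordinary residual of instance $i$, $h_{ii}$ is the $i$-th diagonal of the projection ("hat") matrix of $X$, $\hat{\sigma}^2$ is the residual mean squared error, and $k$ is the number of model parameters (in the hand-rolled version below, $k$ is taken to be X.shape[1]).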
In [1]:
%matplotlib notebook
import scipy as sp
import scipy.stats  # sp.stats is used below; importing scipy alone does not load the stats submodule
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Note: statsmodels requires scipy 1.2
import statsmodels.formula.api as sm
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from statsmodels.stats.outliers_influence import OLSInfluence as influence
from yellowbrick.base import Visualizer
In [2]:
# Make Test Dataset
X, y = make_regression(
    n_samples=100, n_features=14, n_informative=6, bias=1.2, noise=49.8, tail_strength=0.6, random_state=637
)
# Convert to a DataFrame for statsmodels
data = pd.DataFrame(X)
data.columns = [f"X{i}" for i in range(X.shape[1])]
data["y"] = y
data.head()
Out[2]: (first five rows of the generated DataFrame)
In [3]:
# Compute an OLS model
cols = data.columns
model = sm.ols(formula=f"{cols[-1]} ~ {' + '.join(cols[:-1])}", data=data)
model = model.fit()
print(model.summary())
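The fitted results object already exposes the residual degrees of freedom and mean squared error that the visualizer below recomputes by hand; a quick look using the standard statsmodels results attributes:

# Pieces of the fit that the manual Cook's distance computation mirrors
print(model.df_resid)      # residual degrees of freedom (n - k)
print(model.mse_resid)     # residual mean squared error
print(model.resid.head())  # ordinary residuals as a pandas Series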
In [4]:
# Compute the influence to get Cook's distance
inf = influence(model)
# cooks_distance returns two arrays: the distance for each instance (C) and
# the corresponding p-values based on the F-distribution (P)
C, P = inf.cooks_distance
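A quick way to see how many instances the conventional 4/n rule of thumb flags (the same threshold drawn as a horizontal line in the plots below):

# Count instances whose distance exceeds the 4/n heuristic threshold
n = len(C)
print(f"{int((C > 4 / n).sum())} of {n} instances exceed 4/n")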
In [5]:
def plot_cooks_distance(c):
    _, ax = plt.subplots(figsize=(9, 6))
    ax.stem(c, markerfmt=",")
    ax.set_xlabel("instance")
    ax.set_ylabel("distance")
    ax.set_title("Cook's Distance Outlier Detection")
    return ax

plot_cooks_distance(C)
Out[5]: (stem plot of Cook's distance for each instance)
In [6]:
class CooksDistance(Visualizer):
    def fit(self, X, y):
        # Leverage is the diagonal of the projection ("hat") matrix of X
        # TODO: whiten X before computing leverage
        self.leverage_ = (X * np.linalg.pinv(X).T).sum(1)

        # Compute the MSE from the residuals of an ordinary least squares fit
        rank = np.linalg.matrix_rank(X)
        df = X.shape[0] - rank
        resid = y - LinearRegression().fit(X, y).predict(X)
        mse = np.dot(resid, resid) / df

        # Internally studentized residuals, then Cook's distance and
        # p-values from the F-distribution
        resid_studentized_internal = resid / np.sqrt(mse) / np.sqrt(1 - self.leverage_)
        self.distance_ = resid_studentized_internal ** 2 / X.shape[1]
        self.distance_ *= self.leverage_ / (1 - self.leverage_)
        self.p_values_ = sp.stats.f.sf(self.distance_, X.shape[1], df)

        self.draw()
        return self

    def draw(self):
        self.ax.stem(self.distance_, markerfmt=",", label="influence")
        # Use a raw string so the "\f" in \frac is not treated as an escape
        self.ax.axhline(4 / len(self.distance_), c="r", ls="--", lw=1, label=r"$\frac{4}{n}$")

    def finalize(self):
        self.ax.legend()
        self.ax.set_xlabel("instance")
        self.ax.set_ylabel("influence")
        self.ax.set_title("Cook's Distance Outlier Detection")

viz = CooksDistance().fit(X, y)
viz.finalize()
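As a sanity check, the visualizer's distances can be compared with the statsmodels values from the earlier cell. Note that the formula-based OLS fit includes an intercept while fit above works on X alone, so the two sets of values should be close but are not guaranteed to match exactly:

# Compare hand-rolled distances with statsmodels' values; intercept handling
# differs between the two, so expect near-agreement rather than exact equality
print(np.abs(C - viz.distance_).max())
print(np.allclose(C, viz.distance_))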