Detect outliers using linear regression model and statsmodels
. Based on Stackoverflow question.
In [ ]:
import numpy as np
import statsmodels.api as sm # For some reason this import is necessary...
import statsmodels.formula.api as smapi
import statsmodels.graphics as smgraph
import matplotlib.pyplot as plt
%matplotlib inline
In [ ]:
x = np.arange(30, dtype=float)
# Make some y data with random noise
y = 10 * x + 5.0*np.random.randn(30)
# Add outlier #
y[10] = 180.
y[20] = 130
In [ ]:
plt.plot(x, y, 'o')
In [ ]:
# Make fit #
regression = smapi.ols("data ~ x", data=dict(data=y, x=x)).fit()
regression.summary()
Here we're using our regression results to do a test for outliers. In this case, I guess the default is a Bonferroni outlier test. We're only printing off test results where the third column is less than 0.05.
In [ ]:
test = regression.outlier_test()
test
In [ ]:
print('Bad data points (bonf(p) < 0.05):')
test[test['bonf(p)'] < 0.05]
In [ ]:
outliers = test[test['bonf(p)'] < 0.05].index.values
outliers
In [ ]:
figure = smgraph.regressionplots.plot_fit(regression, 1)
line = smgraph.regressionplots.abline_plot(model_results=regression, ax=figure.axes[0])
plt.plot(outliers, y[outliers], 'xm', label='outliers', ms=14)
plt.legend(loc=0);
In [ ]:
import statsmodels.formula.api as smapi
def get_outliers(features, target):
regression = smapi.ols("target ~ features", data=locals()).fit()
test = regression.outlier_test()
outliers = test[test['bonf(p)'] < 0.05]
return list(outliers.index.values)
In [ ]:
def test_outliers():
x = np.arange(30, dtype=float)
# Make some y data with random noise
y = 10 * x + 5.0*np.random.randn(30)
# Add outlier
y[10] = 180.
y[20] = 130
outliers = [10, 20]
prediction = get_outliers(features=x, target=y)
assert outliers == prediction
test_outliers()
In [ ]: