In [41]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn import linear_model, datasets
%matplotlib inline

# Default size (width, height in inches) for every figure drawn in this notebook.
plt.rcParams.update({"figure.figsize": (20, 10)})

In [42]:
# Size of the synthetic problem, and how many leading samples are later
# overwritten with outliers.
n_samples = 10000
n_outliers = 50

# One informative feature with Gaussian noise; coef=True also returns the
# true slope so the fitted models can be compared against it.
X, y, coef = datasets.make_regression(
    n_samples=n_samples,
    n_features=1,
    n_informative=1,
    noise=10,
    coef=True,
    random_state=1087,
)

In [43]:
# Overwrite the first n_outliers samples with a cluster centred near (3, -3).
# Seeding NumPy's global RNG first makes the two draws below repeatable.
np.random.seed(1087)
X[:n_outliers] = 0.5 * np.random.normal(size=(n_outliers, 1)) + 3
y[:n_outliers] = 10 * np.random.normal(size=n_outliers) - 3

In [44]:
# Baseline: ordinary least-squares fit on all samples, including the injected outliers.
model = linear_model.LinearRegression()
# Left as the cell's final expression so the fitted estimator's repr is displayed.
model.fit(X, y)


Out[44]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [45]:
# Robustly fit a linear model with RANSAC.
# RANSAC fits on randomly drawn subsets, so random_state is pinned here: without
# it the result depends on whatever state NumPy's global RNG is in when this cell
# runs (i.e. on which cells were executed before it, and how many times).
model_ransac = linear_model.RANSACRegressor(linear_model.LinearRegression(), random_state=1087)
model_ransac.fit(X, y)

# Boolean masks over the samples: those RANSAC kept as inliers, and the rest.
inlier_mask = model_ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)

In [46]:
# Evaluate both fitted models on a shared grid of x values.
# NOTE(review): np.arange excludes its stop value, so the grid is the integers
# -5..4 and the plotted lines end at x = 4 — confirm whether 5 was meant to be included.
line_X = np.arange(-5, 5)
line_X_col = line_X.reshape(-1, 1)  # column vector, matching the single-feature X
line_y = model.predict(line_X_col)
line_y_ransac = model_ransac.predict(line_X_col)

In [47]:
# Print the true slope next to the slopes recovered by the plain and RANSAC fits.
header = "Estimated coefficients (true, normal, RANSAC):"
print(header)
print(coef, model.coef_, model_ransac.estimator_.coef_)


Estimated coefficients (true, normal, RANSAC):
14.418520986263694 [ 13.73926515] [ 12.92756338]

In [48]:
# Scatter the samples coloured by RANSAC's inlier/outlier split and overlay
# the two fitted lines for comparison.
lw = 2
fig, ax = plt.subplots()
ax.scatter(X[inlier_mask], y[inlier_mask], color='yellowgreen', marker='.', label='Inliers')
ax.scatter(X[outlier_mask], y[outlier_mask], color='gold', marker='.', label='Outliers')

ax.plot(line_X, line_y, color='navy', linestyle='-', linewidth=lw, label='Linear regressor')
ax.plot(line_X, line_y_ransac, color='cornflowerblue', linestyle='-', linewidth=lw, label='RANSAC regressor')

# Label the axes and title the figure so it can be read on its own.
ax.set_xlabel('Input')
ax.set_ylabel('Response')
ax.set_title('Ordinary least squares vs. RANSAC on data with outliers')
ax.legend(loc='lower right')
plt.show()



In [ ]: