In [1]:
from sklearn import datasets, metrics
import matplotlib.pyplot as plt
import numpy as np
from pandas import DataFrame
from scipy.optimize import minimize
import seaborn
%matplotlib inline
In [2]:
N_SAMPLES = 500  # number of (x, y) pairs in the clean dataset
border = 5.      # x is drawn uniformly from [-border, border]

np.random.seed(0)  # fix the RNG so Restart-&-Run-All reproduces the figures
x = np.random.uniform(-border, border, size=N_SAMPLES)
eps = np.random.normal(loc=0, scale=0.2, size=N_SAMPLES)  # observation noise
# Ground truth: y = 0.5*x + 1, corrupted by Gaussian noise.
y = 0.5 * x + 1. + eps
# Stack into a 2 x N array: row 0 holds x, row 1 holds y.
dataset = np.vstack((x, y))
In [3]:
plt.figure(figsize=(14, 10))
plt.grid(True)
plt.scatter(dataset[0], dataset[1], label='data', alpha=0.9)
# Vectorized numpy expression instead of map(): in Python 3 map() returns a
# lazy iterator, which plt.plot cannot consume. Sort x so the reference
# line is drawn left-to-right. Raw string avoids the invalid '\c' escape.
xs = np.sort(x)
plt.plot(xs, 0.5 * xs + 1, '-', label=r'$0.5 \cdot x + 1$')
plt.legend(loc='best')
plt.xlabel('x')
plt.ylabel('y(x)')
plt.title('Dataset')
plt.show()
In [4]:
def Q_MSE(w, X, y):
    """Mean squared error of the linear model w[0]*X + w[1].

    Parameters
    ----------
    w : sequence of two floats, (slope, intercept).
    X : 1-d array of inputs.
    y : 1-d array of target values.

    Returns
    -------
    float, the MSE loss to be minimized.
    """
    y_pred = w[0] * X + w[1]
    # Pure-numpy MSE; identical value to sklearn's mean_squared_error
    # but avoids pulling sklearn into a one-line formula.
    return np.mean((y_pred - y) ** 2)
# Fit the line on the clean data by minimizing MSE; the default
# quasi-Newton method works here because MSE is smooth.
# MSE.x holds the fitted [slope, intercept].
MSE = minimize(Q_MSE, [0., 0.], args=(dataset[0], dataset[1]))
MSE
Out[4]:
In [5]:
plt.figure(figsize=(14, 10))
plt.grid(True)
plt.scatter(dataset[0], dataset[1], label='data', alpha=0.5)
# Vectorized numpy expressions instead of map(): in Python 3 map() returns
# a lazy iterator, which plt.plot cannot consume. Sorting x makes the
# lines draw left-to-right; raw string avoids the invalid '\c' escape.
xs = np.sort(x)
plt.plot(xs, 0.5 * xs + 1, '--', label=r'$0.5 \cdot x + 1$')
plt.plot(xs, MSE.x[0] * xs + MSE.x[1], label='MSE minimization')
plt.legend(loc='best')
plt.xlabel('x')
plt.ylabel('y(x)')
plt.title('Regression minimizing MSE')
plt.show()
In [6]:
N_OUTLIERS = 75  # number of contaminating points

# Outliers: x spans the same range as the clean data, but y clusters
# around -1, well below the true line y = 0.5*x + 1.
outliers_x = np.random.uniform(-border, border, size=N_OUTLIERS)
outliers_y = -1. + np.random.normal(loc=0., scale=0.2, size=N_OUTLIERS)
# Same 2 x N layout as `dataset`: row 0 = x, row 1 = y.
outliers = np.vstack((outliers_x, outliers_y))
In [7]:
# Append the outliers column-wise: data_with_out has shape 2 x (500 + 75).
data_with_out = np.concatenate((dataset, outliers), axis=1)
In [8]:
def Q_MAE(w, X, y):
y_pred = w[0] * X + w[1]
return metrics.mean_absolute_error(y_pred=y_pred, y_true=y)
MSE = minimize(Q_MSE, [0., 0.], args=(data_with_out[0], data_with_out[1]))
MSE
Out[8]:
In [9]:
# MAE is not differentiable where a residual is zero, so the default
# gradient-based method is unreliable; use the derivative-free
# Nelder-Mead simplex method instead.
MAE = minimize(Q_MAE, [0., 0.], args=(data_with_out[0], data_with_out[1]), method='Nelder-Mead')
MAE
Out[9]:
In [10]:
plt.figure(figsize=(14, 10))
plt.grid(True)
plt.scatter(data_with_out[0], data_with_out[1], label='data', alpha=0.8)
# Vectorized numpy expressions instead of map(): in Python 3 map() returns
# a lazy iterator, which plt.plot cannot consume. Sorting x makes the
# lines draw left-to-right; raw string avoids the invalid '\c' escape.
xs = np.sort(x)
plt.plot(xs, 0.5 * xs + 1, '--', label=r'$0.5 \cdot x + 1$')
plt.plot(xs, MSE.x[0] * xs + MSE.x[1], label='MSE minimization')
plt.plot(xs, MAE.x[0] * xs + MAE.x[1], label='MAE minimization')
plt.legend(loc='best')
plt.xlabel('x')
plt.ylabel('y(x)')
plt.title('Regression with outliers')
plt.show()
Из полученных значений и графика выше видно, что минимизация MAE гораздо более устойчива к выбросам.
In [ ]: