In [1]:
from sklearn import datasets, metrics
import matplotlib.pyplot as plt
import numpy as np
from pandas import DataFrame
from scipy.optimize import minimize
import seaborn
%matplotlib inline

Генерируем датасет


In [2]:
# Synthetic dataset: y = 0.5 * x + 1 plus Gaussian noise.
np.random.seed(42)  # fixed seed so the whole notebook is reproducible on re-run
border = 5.
N_SAMPLES = 500
x = np.random.uniform(-border, border, size=N_SAMPLES)
eps = np.random.normal(loc=0, scale=0.2, size=N_SAMPLES)
y = 0.5 * x + 1. + eps
# Row 0 = x values, row 1 = y values (same layout as the original concatenate().T).
dataset = np.vstack((x, y))

In [3]:
plt.figure(figsize=(14, 10))
plt.grid(True)
plt.scatter(dataset[0], dataset[1], label='data', alpha=0.9)
# Python 3 fix: plt.plot cannot consume a map() iterator (Python 2 leftover).
# A straight line only needs its two endpoints, so evaluate it there.
min_x, max_x = dataset[0].min(), dataset[0].max()
xs = np.array([min_x, max_x])
plt.plot(xs, 0.5 * xs + 1, '-', label=r'$0.5 \cdot x + 1$')  # raw string: '\c' is an invalid escape
plt.legend(loc='best')
plt.xlabel('x')
plt.ylabel('y(x)')
plt.title('Dataset')
plt.show()


Восстановим зависимость $y(x) = k \cdot x + b$, минимизируя MSE


In [4]:
def Q_MSE(w, X, y):
    """Mean-squared-error loss of the linear model y_pred = w[0] * x + w[1].

    Parameters
    ----------
    w : sequence of 2 floats -- (slope, intercept).
    X : array-like -- feature values.
    y : array-like -- true targets (same length as X).

    Returns
    -------
    float -- mean of squared residuals; identical value to
    sklearn.metrics.mean_squared_error, without the sklearn dependency.
    """
    y_pred = w[0] * X + w[1]
    return np.mean((y_pred - y) ** 2)

# Fit y = w0 * x + w1 on the clean data by minimizing MSE (default BFGS solver).
MSE = minimize(Q_MSE, x0=[0., 0.], args=(dataset[0], dataset[1]))
MSE


Out[4]:
      fun: 0.04060520858495015
 hess_inv: array([[ 0.05871139, -0.0008459 ],
       [-0.0008459 ,  0.50245308]])
      jac: array([ -3.55904922e-06,   8.36327672e-07])
  message: 'Optimization terminated successfully.'
     nfev: 32
      nit: 7
     njev: 8
   status: 0
  success: True
        x: array([ 0.495034  ,  1.00170659])

In [5]:
plt.figure(figsize=(14, 10))
plt.grid(True)
plt.scatter(dataset[0], dataset[1], label='data', alpha=0.5)
# Python 3 fix: plt.plot rejects map() iterators; evaluate each straight line
# at its two endpoints instead (min_x/max_x were previously computed but unused).
min_x, max_x = dataset[0].min(), dataset[0].max()
xs = np.array([min_x, max_x])
plt.plot(xs, 0.5 * xs + 1, '--', label=r'$0.5 \cdot x + 1$')  # raw string for LaTeX '\c'
plt.plot(xs, MSE.x[0] * xs + MSE.x[1], label='MSE minimization')
plt.legend(loc='best')
plt.xlabel('x')
plt.ylabel('y(x)')
plt.title('Regression minimizing MSE')
plt.show()


Добавим выбросы


In [6]:
# Outliers scattered around the horizontal line y = -1 over the same x range.
N_OUTLIERS = 75
outliers_x = np.random.uniform(-border, border, size=N_OUTLIERS).reshape((N_OUTLIERS, 1))
outliers_y = -1. + np.random.normal(loc=0., scale=0.2, size=N_OUTLIERS).reshape((N_OUTLIERS, 1))
# Same 2-row layout as `dataset`: row 0 = x, row 1 = y.
outliers = np.concatenate((outliers_x, outliers_y), axis=1).T

In [7]:
data_with_out = np.concatenate((dataset, outliers), axis=1)

In [8]:
def Q_MAE(w, X, y):
    """Mean-absolute-error loss of the linear model y_pred = w[0] * x + w[1].

    Parameters
    ----------
    w : sequence of 2 floats -- (slope, intercept).
    X : array-like -- feature values.
    y : array-like -- true targets (same length as X).

    Returns
    -------
    float -- mean of absolute residuals; identical value to
    sklearn.metrics.mean_absolute_error, without the sklearn dependency.
    """
    y_pred = w[0] * X + w[1]
    return np.mean(np.abs(y_pred - y))

# Re-fit the MSE line on the contaminated data; the squared loss lets the
# outliers drag the solution away from the true (0.5, 1.0).
MSE = minimize(Q_MSE, x0=[0., 0.], args=(data_with_out[0], data_with_out[1]))
MSE


Out[8]:
      fun: 0.8023181846489391
 hess_inv: array([[ 0.0583013 , -0.01638738],
       [-0.01638738,  1.03179926]])
      jac: array([  5.21540642e-08,   2.98023224e-08])
  message: 'Optimization terminated successfully.'
     nfev: 20
      nit: 2
     njev: 5
   status: 0
  success: True
        x: array([ 0.42024413,  0.72392422])

In [9]:
# use no gradient because abs() is not a good function
MAE = minimize(Q_MAE, [0., 0.], args=(data_with_out[0], data_with_out[1]), method='Nelder-Mead') 
MAE


Out[9]:
 final_simplex: (array([[ 0.4896237 ,  0.9620487 ],
       [ 0.48965493,  0.96195909],
       [ 0.48963891,  0.9620676 ]]), array([ 0.4288773 ,  0.42887732,  0.4288774 ]))
           fun: 0.4288772958739463
       message: 'Optimization terminated successfully.'
          nfev: 123
           nit: 64
        status: 0
       success: True
             x: array([ 0.4896237,  0.9620487])

In [10]:
plt.figure(figsize=(14, 10))
plt.grid(True)
plt.scatter(data_with_out[0], data_with_out[1], label='data', alpha=0.8)
# Python 3 fix: plt.plot rejects map() iterators; each straight line is drawn
# from its two endpoints over the full (outlier-inclusive) x range.
min_x, max_x = data_with_out[0].min(), data_with_out[0].max()
xs = np.array([min_x, max_x])
plt.plot(xs, 0.5 * xs + 1, '--', label=r'$0.5 \cdot x + 1$')  # raw string for LaTeX '\c'
plt.plot(xs, MSE.x[0] * xs + MSE.x[1], label='MSE minimization')
plt.plot(xs, MAE.x[0] * xs + MAE.x[1], label='MAE minimization')
plt.legend(loc='best')
plt.xlabel('x')
plt.ylabel('y(x)')
plt.title('Regression with outliers')
plt.show()


Из полученных значений и графика выше видно, что минимизация MAE гораздо более устойчива к выбросам.


In [ ]: