Sklearn

sklearn.linear_model


In [1]:
from matplotlib.colors import ListedColormap
from sklearn import cross_validation, datasets, linear_model, metrics

import numpy as np


/home/andrey/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

Линейная регрессия

Генерация данных


In [3]:
data, target, coef = datasets.make_regression(n_features = 2, n_informative = 1, n_targets = 1, 
                                              noise = 5., coef = True, random_state = 2)

In [4]:
pylab.scatter(map(lambda x:x[0], data), target, color = 'r')
pylab.scatter(map(lambda x:x[1], data), target, color = 'b')


Out[4]:
<matplotlib.collections.PathCollection at 0x7f76703f4f50>

In [5]:
train_data, test_data, train_labels, test_labels = cross_validation.train_test_split(data, target,  
                                                                                     test_size = 0.3)

LinearRegression


In [6]:
linear_regressor = linear_model.LinearRegression()
linear_regressor.fit(train_data, train_labels)
predictions = linear_regressor.predict(test_data)

In [7]:
print test_labels


[ -32.71074998  -37.31870104   11.06961035  -22.32195021  101.14760598
  -63.4056294   -61.47026695  -27.02798161   22.2276832    -0.74051877
  -71.3715844    20.87713077  -12.98848753   22.13032804   21.20540389
  -16.79027112   24.82763821    0.34799656  -16.30914909   11.74073026
   12.47089016   23.87701013  -84.32102748  -11.18242389   13.02656201
   11.96165156   25.7124082   -18.86438755   25.24428409    4.45578287]

In [8]:
print predictions


[-43.52503555 -34.82186426  26.08632498 -22.80982487  97.10627799
 -59.65009067 -56.83314735 -24.77704648  18.84238182   1.20520045
 -71.47096194  13.56533234 -16.29714434  20.29563373  16.49008805
 -14.62085994  30.47256022  -1.71838417 -12.1393578   14.08889893
  19.89542664  31.44682999 -81.73502831 -12.74501443  18.81238397
  13.81047258  28.38850352 -18.31064939  14.87945705   9.44432289]

In [9]:
metrics.mean_absolute_error(test_labels, predictions)


Out[9]:
4.262032002319331

In [10]:
linear_scoring = cross_validation.cross_val_score(linear_regressor, data, target, scoring = 'mean_absolute_error', 
                                                  cv = 10)
print 'mean: {}, std: {}'.format(linear_scoring.mean(), linear_scoring.std())


mean: -4.07007149878, std: 1.07371044929
/home/andrey/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_absolute_error was renamed to neg_mean_absolute_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/andrey/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_absolute_error was renamed to neg_mean_absolute_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/andrey/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_absolute_error was renamed to neg_mean_absolute_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/andrey/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_absolute_error was renamed to neg_mean_absolute_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/andrey/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_absolute_error was renamed to neg_mean_absolute_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/andrey/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_absolute_error was renamed to neg_mean_absolute_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/andrey/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_absolute_error was renamed to neg_mean_absolute_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/andrey/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_absolute_error was renamed to neg_mean_absolute_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/andrey/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_absolute_error was renamed to neg_mean_absolute_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/andrey/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_absolute_error was renamed to neg_mean_absolute_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)

In [11]:
scorer = metrics.make_scorer(metrics.mean_absolute_error, greater_is_better = True)

In [12]:
linear_scoring = cross_validation.cross_val_score(linear_regressor, data, target, scoring=scorer, 
                                                  cv = 10)
print 'mean: {}, std: {}'.format(linear_scoring.mean(), linear_scoring.std())


mean: 4.07007149878, std: 1.07371044929

In [13]:
coef


Out[13]:
array([ 38.07925837,   0.        ])

In [14]:
linear_regressor.coef_


Out[14]:
array([ 38.24184441,   0.12341144])

In [15]:
# в лекции не указано, что в уравнении обученной модели также участвует свободный член
linear_regressor.intercept_


Out[15]:
-0.24537777239041436

In [16]:
print "y = {:.2f}*x1 + {:.2f}*x2".format(coef[0], coef[1])


y = 38.08*x1 + 0.00*x2

In [17]:
print "y = {:.2f}*x1 + {:.2f}*x2 + {:.2f}".format(linear_regressor.coef_[0], 
                                                  linear_regressor.coef_[1], 
                                                  linear_regressor.intercept_)


y = 38.24*x1 + 0.12*x2 + -0.25

Lasso


In [18]:
lasso_regressor = linear_model.Lasso(random_state = 3)
lasso_regressor.fit(train_data, train_labels)
lasso_predictions = lasso_regressor.predict(test_data)

In [19]:
lasso_scoring = cross_validation.cross_val_score(lasso_regressor, data, target, scoring = scorer, cv = 10)
print 'mean: {}, std: {}'.format(lasso_scoring.mean(), lasso_scoring.std())


mean: 4.15447824667, std: 1.0170354385

In [20]:
print lasso_regressor.coef_


[ 37.54937998   0.        ]

In [21]:
print "y = {:.2f}*x1 + {:.2f}*x2".format(coef[0], coef[1])


y = 38.08*x1 + 0.00*x2

In [22]:
print "y = {:.2f}*x1 + {:.2f}*x2".format(lasso_regressor.coef_[0], lasso_regressor.coef_[1])


y = 37.55*x1 + 0.00*x2