Multiple Linear Regression with scikit-learn


In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from sklearn import linear_model, datasets, metrics, model_selection, feature_selection, preprocessing

from scipy import stats

In [2]:
boston = datasets.load_boston()

X = pd.DataFrame(boston.data, columns=boston.feature_names)
y = boston.target
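
Note: load_boston has been removed from recent scikit-learn releases (1.2+). If it is unavailable, a fallback sketch that rebuilds the same DataFrame from the original source (assuming the CMU mirror is still reachable) looks like this:

# Fallback sketch, not part of the original notebook: fetch the raw Boston data directly.
data_url = 'http://lib.stat.cmu.edu/datasets/boston'
raw_df = pd.read_csv(data_url, sep=r'\s+', skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]
feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
X = pd.DataFrame(data, columns=feature_names)
y = target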

In [3]:
print('shape:', X.shape)


shape: (506, 13)

In [4]:
X.describe()


Out[4]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT
count 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000
mean 3.593761 11.363636 11.136779 0.069170 0.554695 6.284634 68.574901 3.795043 9.549407 408.237154 18.455534 356.674032 12.653063
std 8.596783 23.322453 6.860353 0.253994 0.115878 0.702617 28.148861 2.105710 8.707259 168.537116 2.164946 91.294864 7.141062
min 0.006320 0.000000 0.460000 0.000000 0.385000 3.561000 2.900000 1.129600 1.000000 187.000000 12.600000 0.320000 1.730000
25% 0.082045 0.000000 5.190000 0.000000 0.449000 5.885500 45.025000 2.100175 4.000000 279.000000 17.400000 375.377500 6.950000
50% 0.256510 0.000000 9.690000 0.000000 0.538000 6.208500 77.500000 3.207450 5.000000 330.000000 19.050000 391.440000 11.360000
75% 3.647423 12.500000 18.100000 0.000000 0.624000 6.623500 94.075000 5.188425 24.000000 666.000000 20.200000 396.225000 16.955000
max 88.976200 100.000000 27.740000 1.000000 0.871000 8.780000 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 37.970000

In [5]:
sns.distplot(y)


Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x7feca2f10358>

In [6]:
sns.pairplot(X);
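
The pairplot gets heavy with 13 features; a correlation heatmap is a cheaper way to scan for linear relationships between them. A minimal sketch with the libraries already imported:

# Sketch: correlation matrix of the features, drawn as a heatmap.
corr = X.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)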



In [7]:
fig = plt.figure()
ax1 = fig.add_subplot(211)
ax2 = fig.add_subplot(212)

sns.distplot(y, ax=ax2)

sns.boxplot(data=y, orient='h', ax=ax1)


Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fec99d75160>

In [8]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, train_size=0.7)

print('train samples:', len(X_train))
print('test samples:', len(X_test))


train samples: 354
test samples: 152
/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2010: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.
  FutureWarning)
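
The FutureWarning above is only about the changing default for test_size when train_size alone is given; passing both explicitly (and a random_state, if a reproducible split is wanted) avoids it. A minimal sketch:

# Sketch: explicit sizes silence the warning; random_state=0 is an arbitrary choice.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=0)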

In [9]:
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

In [10]:
lr = linear_model.LinearRegression()
lr.fit(X_train, y_train)


Out[10]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
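
The features sit on very different scales (see the describe() output above). Ordinary least squares predictions are unaffected by rescaling, but if the coefficient magnitudes are to be compared, or a regularized model tried later, standardizing first helps. A sketch using the preprocessing module already imported:

# Sketch: fit the scaler on the training data only, then train on the scaled features.
scaler = preprocessing.StandardScaler().fit(X_train)
lr_scaled = linear_model.LinearRegression()
lr_scaled.fit(scaler.transform(X_train), y_train)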

In [11]:
print('Number of coefficients:', len(lr.coef_))
print('Coefficients: \n', lr.coef_)


Number of coefficients: 13
Coefficients: 
 [ -6.47979069e-02   4.73143877e-02  -1.17905444e-02   3.52243833e+00
  -1.49650712e+01   4.12429529e+00  -1.39913480e-02  -1.33703261e+00
   2.78469843e-01  -1.43300794e-02  -9.30845180e-01   9.21741038e-03
  -3.80819964e-01]

In [12]:
predicted = lr.predict(X_test)

In [13]:
fig, ax = plt.subplots()
ax.scatter(y_test, predicted)
ax.plot([y.min(), y.max()], [y.min(), y.max()], ls='--', color='red')
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')


Out[13]:
<matplotlib.text.Text at 0x7fec98f55780>

In [14]:
residual = (y_test - predicted)

In [15]:
fig, ax = plt.subplots()
ax.scatter(y_test, residual)
plt.axhline(0, color='red', ls='--')
ax.set_xlabel('y')
ax.set_ylabel('residual')


Out[15]:
<matplotlib.text.Text at 0x7feca03eceb8>

In [16]:
sns.distplot(residual);
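
The scipy.stats module imported at the top can check the residuals a bit more formally than the histogram, e.g. with a normal probability plot and a Shapiro-Wilk test. A sketch:

# Sketch: Q-Q plot of the residuals against a normal distribution, plus a normality test.
fig, ax = plt.subplots()
stats.probplot(residual, dist='norm', plot=ax)
print(stats.shapiro(residual))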


The training and test scores


In [17]:
metrics.r2_score(y_train, lr.predict(X_train))


Out[17]:
0.76589385504484431

In [18]:
metrics.mean_squared_error(y_train, lr.predict(X_train))


Out[18]:
17.414000649589138

In [19]:
metrics.r2_score(y_test, predicted)


Out[19]:
0.67562014366415801

In [20]:
metrics.mean_squared_error(y_test, predicted)


Out[20]:
34.891451434402789
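
A single 70/30 split gives a fairly noisy estimate of generalization; cross-validation averages the score over several splits. A sketch using model_selection, which is already imported (cv=5 is an arbitrary choice):

# Sketch: 5-fold cross-validated R^2 on the full data set.
cv_scores = model_selection.cross_val_score(linear_model.LinearRegression(), X, y, cv=5, scoring='r2')
print(cv_scores.mean(), cv_scores.std())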

In [21]:
print(lr.intercept_)
print(lr.coef_)


32.1626709009
[ -6.47979069e-02   4.73143877e-02  -1.17905444e-02   3.52243833e+00
  -1.49650712e+01   4.12429529e+00  -1.39913480e-02  -1.33703261e+00
   2.78469843e-01  -1.43300794e-02  -9.30845180e-01   9.21741038e-03
  -3.80819964e-01]
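
The raw coefficient array is hard to read on its own; pairing it with the column names (a small sketch) shows which features the fitted model weights most heavily:

# Sketch: label the coefficients with their feature names and sort by magnitude.
coefs = pd.Series(lr.coef_, index=X.columns)
print(coefs.reindex(coefs.abs().sort_values(ascending=False).index))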