Multiple Linear Regression with scikit-learn



In [1]:

    
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from sklearn import linear_model, datasets, metrics, model_selection, feature_selection, preprocessing

from scipy import stats



In [2]:

    
boston = datasets.load_boston()

X = pd.DataFrame(boston.data, columns=boston.feature_names)
y = boston.target



In [3]:

    
print('shape:', X.shape)









    



shape: (506, 13)



In [4]:

    
X.describe()



In [5]:

    
sns.distplot(y)









    Out[5]:





<matplotlib.axes._subplots.AxesSubplot at 0x7feca2f10358>



In [6]:

    
sns.pairplot(X);



In [7]:

    
fig = plt.figure()
ax1 = fig.add_subplot(211)
ax2 = fig.add_subplot(212)

sns.distplot(y, ax=ax2)

sns.boxplot(data=y, orient='h', ax=ax1)









    Out[7]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fec99d75160>



In [8]:

    
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, train_size=0.7)

print('train samples:', len(X_train))
print('test samples', len(X_test))









    



train samples: 354
test samples 152






    



/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2010: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.
  FutureWarning)



In [9]:

    
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")



In [10]:

    
lr = linear_model.LinearRegression()
lr.fit(X_train, y_train)









    Out[10]:





LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)



In [11]:

    
print('No coef:', len(lr.coef_))
print('Coefficients: \n', lr.coef_)









    



No coef: 13
Coefficients: 
 [ -6.47979069e-02   4.73143877e-02  -1.17905444e-02   3.52243833e+00
  -1.49650712e+01   4.12429529e+00  -1.39913480e-02  -1.33703261e+00
   2.78469843e-01  -1.43300794e-02  -9.30845180e-01   9.21741038e-03
  -3.80819964e-01]



In [12]:

    
predicted = lr.predict(X_test)



In [13]:

    
fig, ax = plt.subplots()
ax.scatter(y_test, predicted)
ax.plot([y.min(), y.max()], [y.min(), y.max()], ls='--', color='red')
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')









    Out[13]:





<matplotlib.text.Text at 0x7fec98f55780>



In [14]:

    
residual = (y_test - predicted)



In [15]:

    
fig, ax = plt.subplots()
ax.scatter(y_test, residual)
plt.axhline(0, color='red', ls='--')
ax.set_xlabel('y')
ax.set_ylabel('residual')









    Out[15]:





<matplotlib.text.Text at 0x7feca03eceb8>



In [16]:

    
sns.distplot(residual);

The trainig scores



In [17]:

    
metrics.r2_score(y_train, lr.predict(X_train))









    Out[17]:





0.76589385504484431



In [18]:

    
metrics.mean_squared_error(y_train, lr.predict(X_train))









    Out[18]:





17.414000649589138



In [19]:

    
metrics.r2_score(y_test, predicted)









    Out[19]:





0.67562014366415801



In [20]:

    
metrics.mean_squared_error(y_test, predicted)









    Out[20]:





34.891451434402789



In [21]:

    
print(lr.intercept_)
print(lr.coef_)









    



32.1626709009
[ -6.47979069e-02   4.73143877e-02  -1.17905444e-02   3.52243833e+00
  -1.49650712e+01   4.12429529e+00  -1.39913480e-02  -1.33703261e+00
   2.78469843e-01  -1.43300794e-02  -9.30845180e-01   9.21741038e-03
  -3.80819964e-01]

	CRIM	ZN	INDUS	CHAS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT
count	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000
mean	3.593761	11.363636	11.136779	0.069170	0.554695	6.284634	68.574901	3.795043	9.549407	408.237154	18.455534	356.674032	12.653063
std	8.596783	23.322453	6.860353	0.253994	0.115878	0.702617	28.148861	2.105710	8.707259	168.537116	2.164946	91.294864	7.141062
min	0.006320	0.000000	0.460000	0.000000	0.385000	3.561000	2.900000	1.129600	1.000000	187.000000	12.600000	0.320000	1.730000
25%	0.082045	0.000000	5.190000	0.000000	0.449000	5.885500	45.025000	2.100175	4.000000	279.000000	17.400000	375.377500	6.950000
50%	0.256510	0.000000	9.690000	0.000000	0.538000	6.208500	77.500000	3.207450	5.000000	330.000000	19.050000	391.440000	11.360000
75%	3.647423	12.500000	18.100000	0.000000	0.624000	6.623500	94.075000	5.188425	24.000000	666.000000	20.200000	396.225000	16.955000
max	88.976200	100.000000	27.740000	1.000000	0.871000	8.780000	100.000000	12.126500	24.000000	711.000000	22.000000	396.900000	37.970000