Load data from previous notebook



In [1]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.datasets import load_boston
%matplotlib inline


def standardize(x):
    return (x - np.mean(x)) / np.std(x)


boston = load_boston()
dataset = pd.DataFrame(boston.data, columns=boston.feature_names)
dataset['target'] = boston.target
x_range = [dataset['RM'].min(), dataset['RM'].max()]
y_range = [dataset['target'].min(), dataset['target'].max()]

Linear Regression

Linear regression defines a straight line through a given set of points minimizing the sum of the squared errors.

Its expression is:

X = matrix of predictors
β = matrix of coefficients
β0 = constant value (bias)

Using statsmodels



In [2]:

    
import statsmodels.api as sm
import statsmodels.formula.api as smf

y = dataset['target']
X = dataset['RM']
X = sm.add_constant(X)

X.head()









    



/home/ubuntu/anaconda3/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools






    Out[2]:







  
    
      
      const
      RM
    
  
  
    
      0
      1.0
      6.575
    
    
      1
      1.0
      6.421
    
    
      2
      1.0
      7.185
    
    
      3
      1.0
      6.998
    
    
      4
      1.0
      7.147

Method 1



In [17]:

    
linear_regression = sm.OLS(y, X)
fitted_model = linear_regression.fit()

Method 2



In [4]:

    
linear_regression = smf.ols(formula='target ~ RM', data=dataset)
fitted_model = linear_regression.fit()



In [5]:

    
fitted_model.summary()









    Out[5]:





OLS Regression Results

  Dep. Variable:          target         R-squared:             0.484


  Model:                    OLS          Adj. R-squared:        0.483


  Method:              Least Squares     F-statistic:           471.8


  Date:              Tue, 17 Oct 2017    Prob (F-statistic):  2.49e-74


  Time:                  02:41:30        Log-Likelihood:      -1673.1


  No. Observations:          506         AIC:                   3350.


  Df Residuals:              504         BIC:                   3359.


  Df Model:                    1                                     


  Covariance Type:       nonrobust                                   




               coef      std err       t       P>|t|   [0.025     0.975]  


  Intercept    -34.6706      2.650    -13.084   0.000    -39.877    -29.465


  RM             9.1021      0.419     21.722   0.000      8.279      9.925




  Omnibus:        102.585    Durbin-Watson:         0.684 


  Prob(Omnibus):   0.000     Jarque-Bera (JB):    612.449 


  Skew:            0.726     Prob(JB):           1.02e-133


  Kurtosis:        8.190     Cond. No.               58.4



In [6]:

    
fitted_model.params









    Out[6]:





Intercept   -34.670621
RM            9.102109
dtype: float64



In [7]:

    
betas = np.array(fitted_model.params)
fitted_values = fitted_model.predict(X)
fitted_values.head()









    Out[7]:





0    25.175746
1    23.774021
2    30.728032
3    29.025938
4    30.382152
dtype: float64

Coefficient of Determination



In [8]:

    
mean_sum_squared_errors = np.sum((dataset['target'] - dataset['target'].mean())**2)
regr_sum_squared_errors = np.sum((dataset['target'] - fitted_values)**2)
(mean_sum_squared_errors - regr_sum_squared_errors) / mean_sum_squared_errors









    Out[8]:





0.48352545599133434



In [9]:

    
from scipy.stats.stats import pearsonr

(pearsonr(dataset['RM'], dataset['target'])[0])**2









    Out[9]:





0.4835254559913339

Evaluating the Fitted Values



In [10]:

    
residuals = dataset['target'] - fitted_values
normalized_residuals = standardize(residuals)
normalized_residuals.head()









    Out[10]:





0   -0.178060
1   -0.329244
2    0.601533
3    0.662428
4    0.881082
dtype: float64



In [13]:

    
residual_scatter_plot = plt.plot(dataset['RM'], normalized_residuals, 'bp')
mean_residual = plt.plot([int(x_range[0]), round(x_range[1], 0)], [0, 0], '-', color='red', linewidth=2)
upper_bound = plt.plot([int(x_range[0]), round(x_range[1], 0)], [3, 3], '--', color='red', linewidth=1)
lower_bound = plt.plot([int(x_range[0]), round(x_range[1], 0)], [-3, -3], '--', color='red', linewidth=1)
plt.grid()

Predicting with a Regression Model



In [19]:

    
# For this cell to work, ensure that fitted_model is set from sm.OLS instead of smf.ols
RM = 5
Xp = np.array([1, RM])
print('Out model predicts if RM = {0} the answer value is {1}'.format(RM, fitted_model.predict(Xp)))









    



Out model predicts if RM = 5 the answer value is [ 10.83992413]



In [20]:

    
x_range = [dataset['RM'].min(), dataset['RM'].max()]
y_range = [dataset['target'].min(), dataset['target'].max()]
scatter_plot = dataset.plot(kind='scatter', x='RM', y='target', xlim=x_range, ylim=y_range)
mean_y = scatter_plot.plot(x_range, [dataset['target'].mean(), dataset['target'].mean()], '--', color='red', linewidth=1)
mean_x = scatter_plot.plot([dataset['RM'].mean(), dataset['RM'].mean()], y_range, '--', color='red', linewidth=1)
regression_line = scatter_plot.plot(dataset['RM'], fitted_values, '-', color='orange', linewidth=1)



In [44]:

    
predictions_by_dot_product = np.dot(X, betas)
print('Using the prediction method: {0}'.format(fitted_values[:10]))
print('Using betas and a dot product: {0}'.format(predictions_by_dot_product))









    



Using the prediction method: 0    25.175746
1    23.774021
2    30.728032
3    29.025938
4    30.382152
5    23.855940
6    20.051258
7    21.507596
8    16.583355
9    19.978442
dtype: float64
Using betas and a dot product: [ 25.17574577  23.77402099  30.72803225  29.02593787  30.38215211
  23.85593997  20.05125842  21.50759586  16.5833549   19.97844155
  23.3735282   20.02395209  18.93169901  19.47782555  20.81583557
  18.43108302  19.35039603  19.85101202  14.99048582  17.45715736
  16.02812625  19.6234593   21.23453259  18.23993873  19.25027283
  16.29208741  18.23993873  20.36983223  24.44757706  26.07685456
  17.32972783  20.59738496  19.48692766  17.22050253  20.81583557
  19.33219181  18.49479778  18.57671676  19.63256141  25.35778795
  29.26259271  26.95065703  21.48028953  21.86257811  20.57007863
  17.04756245  17.99418179  20.21509638  14.47166561  16.31939374
  19.60525508  20.98877564  24.5932108   19.92382889  18.9225969
  31.31056723  23.42814085  27.36935404  21.26183891  19.27757916
  17.58458688  19.63256141  24.09259481  26.87784015  29.99076143
  22.58164472  18.0032839   18.83157581  16.24657686  18.89529058
  23.73761256  19.58705086  20.53367019  22.17204981  22.42690886
  22.54523628  22.48152152  21.21632837  22.05372239  18.79516738
  26.55926634  25.57623857  22.69087002  21.46208531  23.4827535
  25.67636177  20.07856475  21.0433883   29.10785685  29.7632087
  23.73761256  23.62838725  23.96516528  21.86257811  22.20845825
  25.63085122  21.42567687  38.77429659  36.50787146  32.83061943
  26.55926634  27.05078022  23.62838725  21.18902204  21.46208531
  18.58581887  18.44928724  21.09800095  24.25643277  22.02641607
  21.71694436  26.45004103  19.15014963  20.77942714  22.25396879
  19.28668126  21.54400429  20.1331774   18.77696316  17.49356579
  18.75875894  19.97844155  19.58705086  18.63132942  18.84067792
  19.81460358  16.41951693  17.14768565  23.86504208  16.63796755
  24.11079902  22.90932064  23.32801765  18.32185771  17.73022063
  22.99123962  19.41411079  24.07439059  18.64043153  21.31645157
  21.52580007  11.0128642   14.50807405  15.09971113   9.95701956
  21.12530728  16.55604857  10.16636806  12.5329164   16.27388319
  21.05249041  14.51717616  10.94914944  17.2933194   21.11620517
  21.32555368  13.31569777  28.52532188  20.5427723   24.58410869
  22.21756036  33.49507338  36.34403349  41.55954194  18.6131252
  20.86134612  37.50000134  18.82247371  22.84560588  23.60108092
  18.80426949  18.84978003  16.04633047  23.72851045  18.65863574
  24.91178461  20.12407529  22.80919744  27.76984683  28.86209991
  36.00725546  21.2527368   30.45496898  25.06652047  16.33759795
  21.33465578  36.60799466  27.05988233  25.0028057   30.72803225
  28.59813875  26.66849165  30.66431749  27.2237203   25.43970694
  37.00848745  31.65644737  30.01806775  31.53811995  28.81658937
  30.2729268   21.41657477  34.59642857  36.80824105  38.45572278
  18.94990323  22.90932064  17.96687546  20.52456809  13.97104962
  19.57794875  14.51717616  18.18532608  23.35532398  14.58999303
  21.59861695  18.9225969   25.78558708  19.49602977  23.33711976
  28.59813875  21.43477898  27.94278691  25.56713646  40.56741206
  44.74528008  38.51033543  30.52778586  35.28818885  24.96639727
  19.76909304  32.79421099  41.2136618   40.39447199  26.55016423
  20.72481448  25.68546388  32.30269711  24.32014753  25.45791115
  28.10662487  20.80673346  23.20058813  23.51916194  16.23747476
  16.34670006  20.92506088  21.99910974  23.8832463   26.47734736
  24.37476018  23.92875684  28.65275141  40.5036973   20.92506088
  18.8133716   33.17649957  44.5541358   32.07514438  27.60600887
  30.89187022  33.77723876  41.76889045  32.02053173  30.91917654
  15.93710516  29.17157162  40.84957744  33.32213331  19.21386439
  18.63132942  22.12653927  24.83896774  35.3336994   26.84143172
  27.71523418  31.47440519  27.46037513  24.32924964  27.3329456
  36.50787146  28.7528746   34.91500238  37.44538868  29.84512768
  24.06528848  22.03551818  21.84437389  22.80919744  25.08472469
  27.77894894  30.39125422  25.67636177  21.09800095  20.02395209
  26.113263    24.93909094  18.03059022  23.08226071  29.41732856
  27.86997003  25.31227741  24.44757706  28.88030413  31.19223981
  25.54893224  32.86702786  27.66972364  25.72187231  19.68717406
  10.59416719  21.05249041  20.15138162  22.3631941   25.1029289
  17.25691096  19.15925174  17.95777335  23.41903874  20.97057143
  23.81953154  23.36442609  20.31521958  17.28421729  23.71940834
  23.86504208  22.78189111  20.69750816  18.74055473  22.9730354
  21.2527368   17.26601307  20.22419849  22.81829955  22.76368689
  20.27881114  18.74965683  18.98631167  20.47905754  19.80550148
  19.65076562  31.23775036  24.85717196  26.27710096  27.89727636
  20.06946264  19.01361799  24.63872134  25.72187231  28.48891344
  24.40206651  25.21215421  18.88618847  26.56836845  16.87462238
  19.35949814  21.87168021  23.53736616  21.09800095  20.96146932
  23.56467249  22.22666246  14.13488758  18.14891764  45.24589608
  -2.25801069  10.5031461    0.49082622  10.56686086  26.15877354
  29.18977584  21.90808865  18.80426949   9.98432589   2.99390619
  31.8931022   25.84930184  27.16910764  23.40083452  21.97180341
  28.7528746   24.90268251  15.71865454  15.5730208    5.08739125
  13.36120832   7.6723902   10.83992413   9.74767105  14.38974663
  17.32972783  20.40624067  11.16760005  21.69874014  18.9134948
  24.22912644  23.62838725  17.63919954  14.9631795   18.59492098
  19.82370569  23.06405649  23.61928514  14.01656016  15.673144
  17.05666456   2.99390619  16.37400639  16.45592537  27.69702996
  17.73022063  25.92211871   7.45393959  12.25075102   6.46180971
  23.89234841  27.05988233  13.60696526  19.55064242  27.44217091
  23.6829999   19.99664576  16.73809075  20.87955034  15.9826157
  18.99541378  18.45838935  21.78065912  21.69874014  23.40083452
  23.10956704  27.52408989  23.81042943  23.91055263  21.83527178
  25.66725966  24.13810535  21.32555368  19.35039603  16.54694646
  18.28544928  23.63748936  21.93539498  24.35655597  18.6131252
  24.11990113  23.04585227  22.22666246  21.62592327  23.73761256
  26.75951274  25.90391449  22.64535948  32.62127092  26.56836845
  24.72064033  19.7235825   19.35949814  22.68176791  20.67930394
  26.32261151  23.36442609  22.82740166  24.61141502  21.84437389
  17.74842485  19.50513188  19.96933944  19.26847705  17.32972783
  21.46208531  22.02641607  23.91965474  28.86209991  14.72652466
  21.41657477  24.34745386  13.60696526  21.62592327  22.02641607
  22.14474348  26.76861485  29.59937074  17.77573117  18.76786105
  22.78189111  20.97967353  19.07733276  14.97228161  14.60819725
  11.68642026  19.78729726  19.78729726  17.27511518  19.26847705
  16.93833715  14.38974663  18.06699866  20.11497318  16.01902414
  20.18779005  25.33958374  21.03428619  28.82569148  27.16910764
  20.21509638]

Regressing with Scikit-learn



In [46]:

    
from sklearn import linear_model

linear_regression = linear_model.LinearRegression(normalize=False, fit_intercept=True)



In [47]:

    
observations = len(dataset)
dataset['RM'].values.reshape((observations, 1)) # X should always be a matrix, never a vector
y = dataset['target'].values # y can be a vector



In [48]:

    
linear_regression.fit(X, y)









    Out[48]:





LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)



In [49]:

    
print(linear_regression.coef_)
print(linear_regression.intercept_)









    



[ 0.          9.10210898]
-34.6706207764



In [50]:

    
print(linear_regression.predict(X)[:10])









    



[ 25.17574577  23.77402099  30.72803225  29.02593787  30.38215211
  23.85593997  20.05125842  21.50759586  16.5833549   19.97844155]

Manually calculate results using matrix-vector multiplication:



In [51]:

    
Xp = np.column_stack((X, np.ones(observations)))
v_coef = list(linear_regression.coef_) + [linear_regression.intercept_]



In [52]:

    
np.dot(Xp, v_coef)[:10]









    Out[52]:





array([ 25.17574577,  23.77402099,  30.72803225,  29.02593787,
        30.38215211,  23.85593997,  20.05125842,  21.50759586,
        16.5833549 ,  19.97844155])

Scikit calculation is much faster:



In [63]:

    
from sklearn.datasets import make_regression

HX, Hy = make_regression(n_samples=10000000, n_features=1, n_targets=1, random_state=101)

%time sk_linear_regression = linear_model.LinearRegression(normalize=False, fit_intercept=True); \
      sk_linear_regression.fit(HX, Hy)









    



Wall time: 494 ms



In [62]:

    
%time sm_linear_regression = sm.OLS(Hy, sm.add_constant(HX)); \
      sm_linear_regression.fit()









    



Wall time: 2.02 s



In [ ]:

Dep. Variable:	target	R-squared:	0.484
Model:	OLS	Adj. R-squared:	0.483
Method:	Least Squares	F-statistic:	471.8
Date:	Tue, 17 Oct 2017	Prob (F-statistic):	2.49e-74
Time:	02:41:30	Log-Likelihood:	-1673.1
No. Observations:	506	AIC:	3350.
Df Residuals:	504	BIC:	3359.
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	-34.6706	2.650	-13.084	0.000	-39.877	-29.465
RM	9.1021	0.419	21.722	0.000	8.279	9.925

Omnibus:	102.585	Durbin-Watson:	0.684
Prob(Omnibus):	0.000	Jarque-Bera (JB):	612.449
Skew:	0.726	Prob(JB):	1.02e-133
Kurtosis:	8.190	Cond. No.	58.4