Multiple Linear Regression with statsmodels

In [15]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd

from sklearn import datasets, model_selection

In [16]:
boston = datasets.load_boston()

X = pd.DataFrame(, columns=boston.feature_names)
y =

In [17]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, train_size=0.7)

/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/ FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.

In [18]:
model = sm.OLS(y_train, sm.add_constant(X_train))
res =


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.767
Model:                            OLS   Adj. R-squared:                  0.758
Method:                 Least Squares   F-statistic:                     86.20
Date:                Sat, 30 Sep 2017   Prob (F-statistic):           4.05e-99
Time:                        18:19:02   Log-Likelihood:                -998.34
No. Observations:                 354   AIC:                             2025.
Df Residuals:                     340   BIC:                             2079.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
const         29.7434      5.252      5.663      0.000      19.412      40.075
CRIM          -0.1377      0.037     -3.726      0.000      -0.210      -0.065
ZN             0.0400      0.014      2.834      0.005       0.012       0.068
INDUS         -0.0342      0.065     -0.524      0.601      -0.163       0.094
CHAS           1.5077      0.911      1.655      0.099      -0.284       3.299
NOX          -10.4760      4.076     -2.570      0.011     -18.493      -2.459
RM             3.8722      0.443      8.748      0.000       3.002       4.743
AGE           -0.0079      0.014     -0.567      0.571      -0.035       0.019
DIS           -1.1816      0.203     -5.827      0.000      -1.580      -0.783
RAD            0.2712      0.071      3.843      0.000       0.132       0.410
TAX           -0.0141      0.004     -3.529      0.000      -0.022      -0.006
PTRATIO       -0.7972      0.136     -5.879      0.000      -1.064      -0.530
B              0.0085      0.003      3.077      0.002       0.003       0.014
LSTAT         -0.4656      0.056     -8.297      0.000      -0.576      -0.355
Omnibus:                      138.142   Durbin-Watson:                   2.161
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              795.610
Skew:                           1.532   Prob(JB):                    1.72e-173
Kurtosis:                       9.675   Cond. No.                     1.51e+04

[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.51e+04. This might indicate that there are
strong multicollinearity or other numerical problems.

In [19]:

const      3.172061e-08
CRIM       2.277476e-04
ZN         4.867809e-03
INDUS      6.008156e-01
CHAS       9.875179e-02
NOX        1.058637e-02
RM         1.024719e-16
AGE        5.710887e-01
DIS        1.308529e-08
RAD        1.452952e-04
TAX        4.738253e-04
PTRATIO    9.851339e-09
B          2.262158e-03
LSTAT      2.526541e-15
dtype: float64

In [20]:
res.pvalues < 0.05

const       True
CRIM        True
ZN          True
INDUS      False
CHAS       False
NOX         True
RM          True
AGE        False
DIS         True
RAD         True
TAX         True
PTRATIO     True
B           True
LSTAT       True
dtype: bool

In [21]:
dat = X_train.copy()
dat['PRICE'] = y_train

28 0.77299 0.0 8.14 0.0 0.538 6.495 94.4 4.4547 4.0 307.0 21.0 387.94 12.80 18.4
59 0.10328 25.0 5.13 0.0 0.453 5.927 47.2 6.9320 8.0 284.0 19.7 396.90 9.22 19.6
265 0.76162 20.0 3.97 0.0 0.647 5.560 62.8 1.9865 5.0 264.0 13.0 392.40 10.45 22.8
208 0.13587 0.0 10.59 1.0 0.489 6.064 59.1 4.2392 4.0 277.0 18.6 381.32 14.66 24.4
483 2.81838 0.0 18.10 0.0 0.532 5.762 40.3 4.0983 24.0 666.0 20.2 392.92 10.42 21.8

In [22]:
results = smf.ols('PRICE ~ CRIM + ZN + INDUS + CHAS + NOX + RM + DIS + RAD + PTRATIO + B', data=dat).fit()

                            OLS Regression Results                            
Dep. Variable:                  PRICE   R-squared:                       0.699
Model:                            OLS   Adj. R-squared:                  0.690
Method:                 Least Squares   F-statistic:                     79.49
Date:                Sat, 30 Sep 2017   Prob (F-statistic):           4.42e-83
Time:                        18:19:02   Log-Likelihood:                -1044.1
No. Observations:                 354   AIC:                             2110.
Df Residuals:                     343   BIC:                             2153.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
Intercept     13.4544      5.609      2.399      0.017       2.421      24.488
CRIM          -0.1872      0.041     -4.524      0.000      -0.269      -0.106
ZN             0.0301      0.015      1.960      0.051      -0.000       0.060
INDUS         -0.2077      0.068     -3.074      0.002      -0.341      -0.075
CHAS           2.1516      1.022      2.104      0.036       0.141       4.163
NOX          -18.2292      4.379     -4.162      0.000     -26.843      -9.615
RM             5.8921      0.428     13.761      0.000       5.050       6.734
DIS           -1.0514      0.221     -4.753      0.000      -1.486      -0.616
RAD            0.0806      0.049      1.655      0.099      -0.015       0.176
PTRATIO       -0.9318      0.152     -6.135      0.000      -1.231      -0.633
B              0.0130      0.003      4.202      0.000       0.007       0.019
Omnibus:                      143.438   Durbin-Watson:                   2.113
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              993.358
Skew:                           1.533   Prob(JB):                    1.97e-216
Kurtosis:                      10.613   Cond. No.                     9.64e+03

[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 9.64e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

In [23]:
results = smf.ols('PRICE ~ CRIM + ZN', data=dat).fit()

                            OLS Regression Results                            
Dep. Variable:                  PRICE   R-squared:                       0.280
Model:                            OLS   Adj. R-squared:                  0.276
Method:                 Least Squares   F-statistic:                     68.41
Date:                Sat, 30 Sep 2017   Prob (F-statistic):           8.18e-26
Time:                        18:19:02   Log-Likelihood:                -1198.1
No. Observations:                 354   AIC:                             2402.
Df Residuals:                     351   BIC:                             2414.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
Intercept     21.7449      0.474     45.844      0.000      20.812      22.678
CRIM          -0.3583      0.049     -7.367      0.000      -0.454      -0.263
ZN             0.1166      0.016      7.333      0.000       0.085       0.148
Omnibus:                      114.831   Durbin-Watson:                   1.917
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              310.219
Skew:                           1.540   Prob(JB):                     4.33e-68
Kurtosis:                       6.398   Cond. No.                         34.2

[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.