In [15]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd
from sklearn import datasets, model_selection
In [16]:
boston = datasets.load_boston()
X = pd.DataFrame(boston.data, columns=boston.feature_names)
y = boston.target
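Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2. On newer versions, a minimal sketch of the replacement loading code suggested by scikit-learn's deprecation notice (data_url points at the original CMU source):

import numpy as np
data_url = 'http://lib.stat.cmu.edu/datasets/boston'
# Each record spans two physical lines in the raw file, hence the stride tricks.
raw_df = pd.read_csv(data_url, sep=r'\s+', skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]
feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS',
                 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
X = pd.DataFrame(data, columns=feature_names)
y = target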
In [17]:
# Specify both sizes explicitly; older scikit-learn emits a FutureWarning when
# only train_size is given, because test_size will complement it from 0.21 on.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, train_size=0.7, test_size=0.3)
In [18]:
# statsmodels' OLS does not add an intercept by itself, so prepend a constant
# column to the design matrix.
model = sm.OLS(y_train, sm.add_constant(X_train))
res = model.fit()
print(res.summary())
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.767
Model: OLS Adj. R-squared: 0.758
Method: Least Squares F-statistic: 86.20
Date: Sat, 30 Sep 2017 Prob (F-statistic): 4.05e-99
Time: 18:19:02 Log-Likelihood: -998.34
No. Observations: 354 AIC: 2025.
Df Residuals: 340 BIC: 2079.
Df Model: 13
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 29.7434 5.252 5.663 0.000 19.412 40.075
CRIM -0.1377 0.037 -3.726 0.000 -0.210 -0.065
ZN 0.0400 0.014 2.834 0.005 0.012 0.068
INDUS -0.0342 0.065 -0.524 0.601 -0.163 0.094
CHAS 1.5077 0.911 1.655 0.099 -0.284 3.299
NOX -10.4760 4.076 -2.570 0.011 -18.493 -2.459
RM 3.8722 0.443 8.748 0.000 3.002 4.743
AGE -0.0079 0.014 -0.567 0.571 -0.035 0.019
DIS -1.1816 0.203 -5.827 0.000 -1.580 -0.783
RAD 0.2712 0.071 3.843 0.000 0.132 0.410
TAX -0.0141 0.004 -3.529 0.000 -0.022 -0.006
PTRATIO -0.7972 0.136 -5.879 0.000 -1.064 -0.530
B 0.0085 0.003 3.077 0.002 0.003 0.014
LSTAT -0.4656 0.056 -8.297 0.000 -0.576 -0.355
==============================================================================
Omnibus: 138.142 Durbin-Watson: 2.161
Prob(Omnibus): 0.000 Jarque-Bera (JB): 795.610
Skew: 1.532 Prob(JB): 1.72e-173
Kurtosis: 9.675 Cond. No. 1.51e+04
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.51e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
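The large condition number flags possible multicollinearity. A minimal sketch (not part of the original notebook) of checking this with variance inflation factors:

from statsmodels.stats.outliers_influence import variance_inflation_factor
X_const = sm.add_constant(X_train)
vif = pd.Series([variance_inflation_factor(X_const.values, i)
                 for i in range(X_const.shape[1])], index=X_const.columns)
print(vif.sort_values(ascending=False))  # rule of thumb: VIF > 10 is suspect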
In [19]:
res.pvalues
Out[19]:
const 3.172061e-08
CRIM 2.277476e-04
ZN 4.867809e-03
INDUS 6.008156e-01
CHAS 9.875179e-02
NOX 1.058637e-02
RM 1.024719e-16
AGE 5.710887e-01
DIS 1.308529e-08
RAD 1.452952e-04
TAX 4.738253e-04
PTRATIO 9.851339e-09
B 2.262158e-03
LSTAT 2.526541e-15
dtype: float64
In [20]:
res.pvalues < 0.05
Out[20]:
const True
CRIM True
ZN True
INDUS False
CHAS False
NOX True
RM True
AGE False
DIS True
RAD True
TAX True
PTRATIO True
B True
LSTAT True
dtype: bool
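A minimal sketch (an assumption, not in the original) of turning this boolean mask into a feature list for a reduced model. Note that the hand-picked formula in the next cell keeps INDUS and CHAS and drops TAX and LSTAT, so it differs slightly from this automatic selection:

significant = res.pvalues[res.pvalues < 0.05].index.drop('const').tolist()
print(significant)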
In [21]:
# The formula API works on a single DataFrame, so attach the target as a
# PRICE column alongside the predictors.
dat = X_train.copy()
dat['PRICE'] = y_train
dat.head()
Out[21]:
        CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS   RAD    TAX  PTRATIO       B  LSTAT  PRICE
28   0.77299   0.0   8.14   0.0  0.538  6.495  94.4  4.4547   4.0  307.0     21.0  387.94  12.80   18.4
59   0.10328  25.0   5.13   0.0  0.453  5.927  47.2  6.9320   8.0  284.0     19.7  396.90   9.22   19.6
265  0.76162  20.0   3.97   0.0  0.647  5.560  62.8  1.9865   5.0  264.0     13.0  392.40  10.45   22.8
208  0.13587   0.0  10.59   1.0  0.489  6.064  59.1  4.2392   4.0  277.0     18.6  381.32  14.66   24.4
483  2.81838   0.0  18.10   0.0  0.532  5.762  40.3  4.0983  24.0  666.0     20.2  392.92  10.42   21.8
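Rather than typing the formula by hand, it can be assembled from a list of predictor names; a small sketch (not in the original notebook):

predictors = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'DIS', 'RAD', 'PTRATIO', 'B']
formula = 'PRICE ~ ' + ' + '.join(predictors)
print(formula)  # PRICE ~ CRIM + ZN + INDUS + CHAS + NOX + RM + DIS + RAD + PTRATIO + B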
In [22]:
results = smf.ols('PRICE ~ CRIM + ZN + INDUS + CHAS + NOX + RM + DIS + RAD + PTRATIO + B', data=dat).fit()
print(results.summary())
OLS Regression Results
==============================================================================
Dep. Variable: PRICE R-squared: 0.699
Model: OLS Adj. R-squared: 0.690
Method: Least Squares F-statistic: 79.49
Date: Sat, 30 Sep 2017 Prob (F-statistic): 4.42e-83
Time: 18:19:02 Log-Likelihood: -1044.1
No. Observations: 354 AIC: 2110.
Df Residuals: 343 BIC: 2153.
Df Model: 10
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 13.4544 5.609 2.399 0.017 2.421 24.488
CRIM -0.1872 0.041 -4.524 0.000 -0.269 -0.106
ZN 0.0301 0.015 1.960 0.051 -0.000 0.060
INDUS -0.2077 0.068 -3.074 0.002 -0.341 -0.075
CHAS 2.1516 1.022 2.104 0.036 0.141 4.163
NOX -18.2292 4.379 -4.162 0.000 -26.843 -9.615
RM 5.8921 0.428 13.761 0.000 5.050 6.734
DIS -1.0514 0.221 -4.753 0.000 -1.486 -0.616
RAD 0.0806 0.049 1.655 0.099 -0.015 0.176
PTRATIO -0.9318 0.152 -6.135 0.000 -1.231 -0.633
B 0.0130 0.003 4.202 0.000 0.007 0.019
==============================================================================
Omnibus: 143.438 Durbin-Watson: 2.113
Prob(Omnibus): 0.000 Jarque-Bera (JB): 993.358
Skew: 1.533 Prob(JB): 1.97e-216
Kurtosis: 10.613 Cond. No. 9.64e+03
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 9.64e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
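So far the models have only been scored in-sample. A minimal sketch (an assumption, using the X_test/y_test split from above) of evaluating the reduced model on held-out data; a formula-based results object can predict directly from a DataFrame carrying the formula's column names:

import numpy as np
pred = results.predict(X_test)
rmse = np.sqrt(np.mean((np.asarray(pred) - y_test) ** 2))
print('held-out RMSE: %.2f' % rmse)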
In [23]:
results = smf.ols('PRICE ~ CRIM + ZN', data=dat).fit()
print(results.summary())
OLS Regression Results
==============================================================================
Dep. Variable: PRICE R-squared: 0.280
Model: OLS Adj. R-squared: 0.276
Method: Least Squares F-statistic: 68.41
Date: Sat, 30 Sep 2017 Prob (F-statistic): 8.18e-26
Time: 18:19:02 Log-Likelihood: -1198.1
No. Observations: 354 AIC: 2402.
Df Residuals: 351 BIC: 2414.
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 21.7449 0.474 45.844 0.000 20.812 22.678
CRIM -0.3583 0.049 -7.367 0.000 -0.454 -0.263
ZN 0.1166 0.016 7.333 0.000 0.085 0.148
==============================================================================
Omnibus: 114.831 Durbin-Watson: 1.917
Prob(Omnibus): 0.000 Jarque-Bera (JB): 310.219
Skew: 1.540 Prob(JB): 4.33e-68
Kurtosis: 6.398 Cond. No. 34.2
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
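The summaries above report AIC values of roughly 2025, 2110, and 2402 for the three specifications, so the full model fits best by that criterion despite its insignificant terms. A minimal sketch (not in the original) that refits all three and prints the comparison in one place:

formulas = {
    'full': 'PRICE ~ ' + ' + '.join(X_train.columns),
    'reduced': 'PRICE ~ CRIM + ZN + INDUS + CHAS + NOX + RM + DIS + RAD + PTRATIO + B',
    'minimal': 'PRICE ~ CRIM + ZN',
}
for name, f in formulas.items():
    fit = smf.ols(f, data=dat).fit()
    print('%-8s AIC: %.1f' % (name, fit.aic))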