In [2]:
# Imports used throughout this notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.datasets import load_boston

In [3]:
boston = load_boston()

In [4]:
X = boston.data    # 506 x 13 feature matrix
Y = boston.target  # median home value (MEDV) in $1000s

In [5]:
names = boston.feature_names

In [6]:
names


Out[6]:
array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], 
      dtype='|S7')

In [7]:
from sklearn.preprocessing import StandardScaler
# Scale each column to unit variance only; with_mean=False skips centering,
# so the data stays non-negative and the zeros in the CHAS dummy stay zero.
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)
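
A quick sanity check (a sketch): with_mean=False divides by the column standard deviation only, so every column should end up with unit variance, and the CHAS dummy (index 3 in names) keeps exactly two values.

In [ ]:
# Sketch: sanity-check the with_mean=False scaling.
print(X_scaled.std(axis=0))       # ~1.0 for every column
print(np.unique(X_scaled[:, 3]))  # CHAS keeps only 0 and one positive value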

In [8]:
dfX0 = pd.DataFrame(X_scaled, columns=names)

In [9]:
dfX0.head(1)


Out[9]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT
0 0.000736 0.772552 0.337051 0.0 4.647422 9.367131 2.318549 1.94426 0.11496 1.758028 7.074146 4.351754 0.698065

In [10]:
dfX = sm.add_constant(dfX0)  # prepend an intercept column named 'const'

In [11]:
dfY = pd.DataFrame(Y, columns=["MEDV"])

In [12]:
df = pd.concat([dfX, dfY], axis=1)

In [13]:
df.tail()


Out[13]:
const CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
501 1 0.007292 0.0 1.740698 0.0 4.949763 9.392775 2.457236 1.178250 0.11496 1.621424 9.709612 4.297919 1.355480 22.4
502 1 0.005271 0.0 1.740698 0.0 4.949763 8.718911 2.727496 1.087407 0.11496 1.621424 9.709612 4.351754 1.272778 20.6
503 1 0.007075 0.0 1.740698 0.0 4.949763 9.938419 3.236012 1.030363 0.11496 1.621424 9.709612 4.351754 0.790580 23.9
504 1 0.012760 0.0 1.740698 0.0 4.949763 9.679131 3.175559 1.135609 0.11496 1.621424 9.709612 4.313927 0.908326 22.0
505 1 0.005520 0.0 1.740698 0.0 4.949763 8.590692 2.873294 1.190800 0.11496 1.621424 9.709612 4.351754 1.104569 11.9

In [14]:
# Set the default inline figure format (svg, png, and jpg are available).
%config InlineBackend.figure_format = "png"

In [22]:
sns.pairplot(df)


Out[22]:
<seaborn.axisgrid.PairGrid at 0x7f13c2698b90>

In [15]:
sns.jointplot("RM","MEDV",data=df)
plt.show()



In [16]:
# CHAS is a categorical variable: whether the tract borders the Charles River.
sns.jointplot(x="CHAS", y="MEDV", data=df)
plt.show()



In [17]:
regression = sm.OLS(dfY, dfX)

In [18]:
result = regression.fit()

In [19]:
print(result.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   MEDV   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.734
Method:                 Least Squares   F-statistic:                     108.1
Date:                Sun, 12 Jun 2016   Prob (F-statistic):          6.95e-135
Time:                        14:39:34   Log-Likelihood:                -1498.8
No. Observations:                 506   AIC:                             3026.
Df Residuals:                     492   BIC:                             3085.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         36.4911      5.104      7.149      0.000      26.462      46.520
CRIM          -0.9204      0.281     -3.276      0.001      -1.472      -0.368
ZN             1.0810      0.320      3.380      0.001       0.453       1.709
INDUS          0.1430      0.421      0.339      0.735      -0.685       0.971
CHAS           0.6822      0.219      3.120      0.002       0.253       1.112
NOX           -2.0601      0.442     -4.658      0.000      -2.929      -1.191
RM             2.6706      0.293      9.102      0.000       2.094       3.247
AGE            0.0211      0.371      0.057      0.955      -0.709       0.751
DIS           -3.1044      0.420     -7.398      0.000      -3.929      -2.280
RAD            2.6588      0.577      4.608      0.000       1.525       3.792
TAX           -2.0759      0.633     -3.278      0.001      -3.320      -0.832
PTRATIO       -2.0622      0.283     -7.287      0.000      -2.618      -1.506
B              0.8566      0.245      3.500      0.001       0.376       1.338
LSTAT         -3.7487      0.362    -10.366      0.000      -4.459      -3.038
==============================================================================
Omnibus:                      178.029   Durbin-Watson:                   1.078
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              782.015
Skew:                           1.521   Prob(JB):                    1.54e-170
Kurtosis:                       8.276   Cond. No.                         357.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Instructor's version


In [80]:
model = sm.OLS(df.iloc[:, -1], df.iloc[:, :-1])  # endog = MEDV (last column), exog = the rest

In [81]:
result = model.fit()
print(result.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   MEDV   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.734
Method:                 Least Squares   F-statistic:                     108.1
Date:                Wed, 08 Jun 2016   Prob (F-statistic):          6.95e-135
Time:                        04:46:26   Log-Likelihood:                -1498.8
No. Observations:                 506   AIC:                             3026.
Df Residuals:                     492   BIC:                             3085.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         36.4911      5.104      7.149      0.000      26.462      46.520
CRIM          -0.9204      0.281     -3.276      0.001      -1.472      -0.368
ZN             1.0810      0.320      3.380      0.001       0.453       1.709
INDUS          0.1430      0.421      0.339      0.735      -0.685       0.971
CHAS           0.6822      0.219      3.120      0.002       0.253       1.112
NOX           -2.0601      0.442     -4.658      0.000      -2.929      -1.191
RM             2.6706      0.293      9.102      0.000       2.094       3.247
AGE            0.0211      0.371      0.057      0.955      -0.709       0.751
DIS           -3.1044      0.420     -7.398      0.000      -3.929      -2.280
RAD            2.6588      0.577      4.608      0.000       1.525       3.792
TAX           -2.0759      0.633     -3.278      0.001      -3.320      -0.832
PTRATIO       -2.0622      0.283     -7.287      0.000      -2.618      -1.506
B              0.8566      0.245      3.500      0.001       0.376       1.338
LSTAT         -3.7487      0.362    -10.366      0.000      -4.459      -3.038
==============================================================================
Omnibus:                      178.029   Durbin-Watson:                   1.078
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              782.015
Skew:                           1.521   Prob(JB):                    1.54e-170
Kurtosis:                       8.276   Cond. No.                         357.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Instructor's version: regression with scikit-learn


In [23]:
from sklearn.linear_model import LinearRegression
# Note: df still contains the 'const' column, so sklearn gives it a zero
# coefficient and learns the intercept separately (fit_intercept=True by default).
model_boston = LinearRegression().fit(df.iloc[:, :-1], df.iloc[:, -1])

In [24]:
model_boston.intercept_, model_boston.coef_


Out[24]:
(36.491103280361443,
 array([ 0.        , -0.92041113,  1.08098058,  0.14296712,  0.68220346,
        -2.06009246,  2.67064141,  0.02112063, -3.10444805,  2.65878654,
        -2.07589814, -2.06215593,  0.85664044, -3.74867982]))
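
As a quick sketch, the two fits can be checked against each other numerically (`result` here is the statsmodels fit from In [81] above):

In [ ]:
# Sketch: the statsmodels and sklearn fits should agree numerically.
print(np.allclose(result.params.values[1:], model_boston.coef_[1:]))  # slopes match
print(np.isclose(result.params["const"], model_boston.intercept_))   # intercepts match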

In [29]:
# plot_fit expects the name or index of an exogenous variable, not a Series
# of values, so passing df.MEDV raises the TypeError below.
sm.graphics.plot_fit(result, df.MEDV)
plt.show()



TypeErrorTraceback (most recent call last)
<ipython-input-29-c80c6890c268> in <module>()
----> 1 sm.graphics.plot_fit(result, df.MEDV)
      2 plt.show()

/home/dockeruser/anaconda2/lib/python2.7/site-packages/statsmodels/graphics/regressionplots.pyc in plot_fit(results, exog_idx, y_true, ax, **kwargs)
    131     fig, ax = utils.create_mpl_ax(ax)
    132 
--> 133     exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    134     results = maybe_unwrap_results(results)
    135 

/home/dockeruser/anaconda2/lib/python2.7/site-packages/statsmodels/graphics/utils.pyc in maybe_name_or_idx(idx, model)
    112     else: # assume we've got a string variable
    113         exog_name = idx
--> 114         exog_idx = model.exog_names.index(idx)
    115 
    116     return exog_name, exog_idx

/home/dockeruser/anaconda2/lib/python2.7/site-packages/pandas/core/ops.pyc in wrapper(self, other, axis)
    759                 other = np.asarray(other)
    760 
--> 761             res = na_op(values, other)
    762             if isscalar(res):
    763                 raise TypeError('Could not compare %s type with Series' %

/home/dockeruser/anaconda2/lib/python2.7/site-packages/pandas/core/ops.pyc in na_op(x, y)
    714                 result = getattr(x, name)(y)
    715                 if result is NotImplemented:
--> 716                     raise TypeError("invalid type comparison")
    717             except AttributeError:
    718                 result = op(x, y)

TypeError: invalid type comparison
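
A working call passes a column name instead; a sketch, plotting the fit against RM (any exog name works):

In [ ]:
# Sketch: pass the regressor's name (or index), not a Series of values.
sm.graphics.plot_fit(result, "RM")
plt.show()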

In [85]:
sns.distplot(result.resid)  # the residuals are right-skewed (skew 1.52 in the summary)


Out[85]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f13a57ec710>

In [86]:
sns.distplot(df.MEDV)  # the target itself is right-skewed, with mass at the cap


Out[86]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f13a5717ed0>
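
Both plots hint at the top-coded MEDV values; a sketch to count them before the next cell drops them:

In [ ]:
# Sketch: MEDV is top-coded; count the rows sitting at the cap.
print(df.MEDV.max(), (df.MEDV == df.MEDV.max()).sum())  # expect 50.0 and 16 rows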

In [30]:
# Drop the rows where MEDV sits at its top-coded maximum.
df2 = df.drop(df[df.MEDV >= df.MEDV.max()].index)
df2.head(1)


Out[30]:
const CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
0 1 0.000736 0.772552 0.337051 0.0 4.647422 9.367131 2.318549 1.94426 0.11496 1.758028 7.074146 4.351754 0.698065 24.0

In [31]:
sm_model2 = sm.OLS(df2.iloc[:, -1], df2.iloc[:, :-1])
result2 = sm_model2.fit()
print(result2.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   MEDV   R-squared:                       0.778
Model:                            OLS   Adj. R-squared:                  0.771
Method:                 Least Squares   F-statistic:                     128.0
Date:                Sun, 12 Jun 2016   Prob (F-statistic):          4.79e-146
Time:                        14:43:00   Log-Likelihood:                -1337.1
No. Observations:                 490   AIC:                             2702.
Df Residuals:                     476   BIC:                             2761.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         32.2611      4.125      7.821      0.000      24.155      40.367
CRIM          -0.9067      0.223     -4.062      0.000      -1.345      -0.468
ZN             0.8219      0.263      3.129      0.002       0.306       1.338
INDUS         -0.2983      0.341     -0.874      0.383      -0.969       0.373
CHAS           0.1156      0.188      0.614      0.540      -0.254       0.485
NOX           -1.4386      0.354     -4.064      0.000      -2.134      -0.743
RM             2.6351      0.251     10.501      0.000       2.142       3.128
AGE           -0.6640      0.300     -2.216      0.027      -1.253      -0.075
DIS           -2.5472      0.338     -7.535      0.000      -3.211      -1.883
RAD            2.1811      0.462      4.724      0.000       1.274       3.088
TAX           -2.3185      0.505     -4.591      0.000      -3.311      -1.326
PTRATIO       -1.8144      0.228     -7.960      0.000      -2.262      -1.366
B              0.7238      0.194      3.727      0.000       0.342       1.105
LSTAT         -2.5037      0.303     -8.257      0.000      -3.100      -1.908
==============================================================================
Omnibus:                       84.129   Durbin-Watson:                   1.231
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              161.897
Skew:                           0.966   Prob(JB):                     6.99e-36
Kurtosis:                       5.048   Cond. No.                         358.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [32]:
from sklearn.linear_model import LinearRegression
sk_model2 = LinearRegression().fit(df2.iloc[:, :-1], df2.iloc[:, -1])
sk_model2.coef_, sk_model2.intercept_


Out[32]:
(array([ 0.        , -0.90670198,  0.8218828 , -0.29825317,  0.11555586,
        -1.43856592,  2.63509594, -0.66398505, -2.54724295,  2.18110049,
        -2.31851126, -1.81435162,  0.72377893, -2.5036931 ]),
 32.261106875317672)


In [33]:
import statsmodels.api as sm
# One-way ANOVA: does bordering the Charles River alone explain MEDV?
model_anova = sm.OLS.from_formula("MEDV ~ C(CHAS)", data=df2)
result_anova = model_anova.fit()
table_anova = sm.stats.anova_lm(result_anova)
table_anova


Out[33]:
df sum_sq mean_sq F PR(>F)
C(CHAS) 1.0 169.271183 169.271183 2.745998 0.098141
Residual 488.0 30081.716653 61.642862 NaN NaN
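
With a single two-level factor, this F-test reduces to a pooled two-sample t-test; a sketch of the equivalent check (scipy assumed available):

In [ ]:
# Sketch: a two-group one-way ANOVA is equivalent to a pooled two-sample t-test.
from scipy import stats
t, p = stats.ttest_ind(df2.MEDV[df2.CHAS > 0], df2.MEDV[df2.CHAS == 0])
print(t ** 2, p)  # t**2 reproduces F (~2.746) and p reproduces PR(>F) (~0.098)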

Cross Validation


In [100]:
model2 = LinearRegression()

In [102]:
fit_model2 = model2.fit(df2.iloc[:, :-1], df2.iloc[:, -1])

In [104]:
fit_model2.coef_, fit_model2.intercept_


Out[104]:
(array([ 0.        , -0.90670198,  0.8218828 , -0.29825317,  0.11555586,
        -1.43856592,  2.63509594, -0.66398505, -2.54724295,  2.18110049,
        -2.31851126, -1.81435162,  0.72377893, -2.5036931 ]),
 32.261106875317672)

In [108]:
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation in older scikit-learn

scores2 = cross_val_score(model2, df2.iloc[:, :-1], df2.iloc[:, -1], cv=5)  # 5-fold CV; default scoring is R^2

In [110]:
scores2, scores2.mean(), scores2.std()


Out[110]:
(array([ 0.670774  ,  0.77345121,  0.5308856 ,  0.00644914,  0.11065571]),
 0.41844313129206878,
 0.30555426893282145)

The scores are not great; let's try to improve them.


In [112]:
# This model's mean CV score is only about 0.418. How can we improve it?

In [113]:
# MEDV vs LSTAT looks like an inverse, roughly quadratic relationship,
# so we could add a squared term (see the sketch below),
# or simply take the log instead.

# CRIM and DIS look heteroskedastic (the spread grows along the axis),
# which a log transform also tames.
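
A sketch of the quadratic alternative mentioned above (the notebook proceeds with the log transform instead; LSTAT2 is a name introduced here for illustration):

In [ ]:
# Sketch: add a squared LSTAT term and refit.
df2_quad = df2.copy()
df2_quad["LSTAT2"] = df2_quad.LSTAT ** 2
quad_result = sm.OLS(df2_quad.MEDV, df2_quad.drop("MEDV", axis=1)).fit()
print(quad_result.rsquared)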

Log transform


In [114]:
# Replace CRIM, DIS, and LSTAT with their logs; keep everything else.
df3 = df2.drop(["CRIM", "DIS", "LSTAT", "MEDV"], axis=1)
df3["LOGCRIM"] = np.log(df2.CRIM)
df3["LOGDIS"] = np.log(df2.DIS)
df3["LOGLSTAT"] = np.log(df2.LSTAT)
df3["MEDV"] = df2.MEDV  # re-append the target as the last column

In [115]:
df3.tail()


Out[115]:
const ZN INDUS CHAS NOX RM AGE RAD TAX PTRATIO B LOGCRIM LOGDIS LOGLSTAT MEDV
501 1 0.0 1.740698 0.0 4.949763 9.392775 2.457236 0.11496 1.621424 9.709612 4.297919 -4.920910 0.164030 0.304156 22.4
502 1 0.0 1.740698 0.0 4.949763 8.718911 2.727496 0.11496 1.621424 9.709612 4.351754 -5.245510 0.083796 0.241202 20.6
503 1 0.0 1.740698 0.0 4.949763 9.938419 3.236012 0.11496 1.621424 9.709612 4.351754 -4.951222 0.029911 -0.234988 23.9
504 1 0.0 1.740698 0.0 4.949763 9.679131 3.175559 0.11496 1.621424 9.709612 4.313927 -4.361408 0.127169 -0.096152 22.0
505 1 0.0 1.740698 0.0 4.949763 8.590692 2.873294 0.11496 1.621424 9.709612 4.351754 -5.199321 0.174625 0.099456 11.9

In [116]:
df2.tail()


Out[116]:
const CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
501 1 0.007292 0.0 1.740698 0.0 4.949763 9.392775 2.457236 1.178250 0.11496 1.621424 9.709612 4.297919 1.355480 22.4
502 1 0.005271 0.0 1.740698 0.0 4.949763 8.718911 2.727496 1.087407 0.11496 1.621424 9.709612 4.351754 1.272778 20.6
503 1 0.007075 0.0 1.740698 0.0 4.949763 9.938419 3.236012 1.030363 0.11496 1.621424 9.709612 4.351754 0.790580 23.9
504 1 0.012760 0.0 1.740698 0.0 4.949763 9.679131 3.175559 1.135609 0.11496 1.621424 9.709612 4.313927 0.908326 22.0
505 1 0.005520 0.0 1.740698 0.0 4.949763 8.590692 2.873294 1.190800 0.11496 1.621424 9.709612 4.351754 1.104569 11.9

In [117]:
sns.jointplot("CRIM","MEDV", data=df2)


Out[117]:
<seaborn.axisgrid.JointGrid at 0x7f13a57a6650>

In [118]:
sns.jointplot("LOGCRIM","MEDV", data=df3)


Out[118]:
<seaborn.axisgrid.JointGrid at 0x7f13a51c1f50>

In [119]:
sns.jointplot("DIS","MEDV", data=df2)


Out[119]:
<seaborn.axisgrid.JointGrid at 0x7f13a4eb4c50>

In [120]:
sns.jointplot("LOGDIS","MEDV", data=df3)


Out[120]:
<seaborn.axisgrid.JointGrid at 0x7f13a4abe490>

In [121]:
sns.jointplot("LSTAT","MEDV", data=df2)


Out[121]:
<seaborn.axisgrid.JointGrid at 0x7f13a4919110>

In [122]:
sns.jointplot("LOGLSTAT","MEDV", data=df3)


Out[122]:
<seaborn.axisgrid.JointGrid at 0x7f13a741db90>

In [123]:
model3 = sm.OLS(df3.iloc[:, -1], df3.iloc[:, :-1])

In [124]:
result = model3.fit()

In [125]:
print(result.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   MEDV   R-squared:                       0.797
Model:                            OLS   Adj. R-squared:                  0.791
Method:                 Least Squares   F-statistic:                     143.8
Date:                Wed, 08 Jun 2016   Prob (F-statistic):          1.91e-155
Time:                        08:13:57   Log-Likelihood:                -1314.7
No. Observations:                 490   AIC:                             2657.
Df Residuals:                     476   BIC:                             2716.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         30.5248      3.930      7.767      0.000      22.802      38.247
ZN            -0.0268      0.238     -0.113      0.910      -0.494       0.440
INDUS         -0.1063      0.329     -0.323      0.747      -0.753       0.541
CHAS           0.2271      0.180      1.263      0.207      -0.126       0.580
NOX           -1.5242      0.360     -4.228      0.000      -2.233      -0.816
RM             2.0886      0.252      8.279      0.000       1.593       2.584
AGE           -0.0504      0.303     -0.166      0.868      -0.646       0.546
RAD            1.7683      0.504      3.509      0.000       0.778       2.758
TAX           -2.1879      0.483     -4.526      0.000      -3.138      -1.238
PTRATIO       -1.7374      0.219     -7.951      0.000      -2.167      -1.308
B              0.7231      0.186      3.889      0.000       0.358       1.088
LOGCRIM       -0.1224      0.209     -0.586      0.558      -0.533       0.288
LOGDIS        -3.9716      0.673     -5.905      0.000      -5.293      -2.650
LOGLSTAT      -6.9240      0.553    -12.527      0.000      -8.010      -5.838
==============================================================================
Omnibus:                       31.683   Durbin-Watson:                   1.199
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               49.222
Skew:                           0.471   Prob(JB):                     2.05e-11
Kurtosis:                       4.234   Cond. No.                         359.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [126]:
scores3 = cross_val_score(LinearRegression(), df3.iloc[:, :-1], df3.iloc[:, -1], cv=5)
scores3, scores3.mean(), scores3.std()


Out[126]:
(array([ 0.68959667,  0.78222319,  0.58690158,  0.13525116,  0.24691975]),
 0.48817847061802622,
 0.2528008298676635)

In [127]:
# The mean score went up and the spread across folds went down!

In [128]:
# The dataset still shows multicollinearity among the features (see the sketch below).
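
A sketch to quantify it with variance inflation factors, using statsmodels' variance_inflation_factor (the intercept column is skipped, since a VIF for it is not meaningful):

In [ ]:
# Sketch: variance inflation factors for the df3 regressors.
from statsmodels.stats.outliers_influence import variance_inflation_factor
exog = df3.iloc[:, :-1]
for i, name in enumerate(exog.columns):
    if name == "const":
        continue  # skip the intercept column
    print(name, variance_inflation_factor(exog.values, i))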

In [130]:
# Inspect the correlation matrix (note: the zero-variance 'const' column
# yields a NaN row/column in np.corrcoef).
sns.heatmap(np.corrcoef(df3.T))


Out[130]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f13ac1af810>

In [131]:
# Prune features on the t-test results alone (strictly, the dropped group
# should also be checked with an F-test).
df4 = df3.drop(["ZN", "INDUS", "AGE", "LOGCRIM"], axis=1)
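
The drop list can be read off the fitted result's p-values (a sketch; note CHAS also exceeds 0.05 but is kept here):

In [ ]:
# Sketch: list regressors whose t-test p-value exceeds 0.05.
print(result.pvalues[result.pvalues > 0.05])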

In [132]:
model4 = sm.OLS(df4.iloc[:, -1], df4.iloc[:, :-1])
result4 = model4.fit()
print(result4.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   MEDV   R-squared:                       0.797
Model:                            OLS   Adj. R-squared:                  0.793
Method:                 Least Squares   F-statistic:                     209.1
Date:                Wed, 08 Jun 2016   Prob (F-statistic):          6.89e-160
Time:                        08:23:35   Log-Likelihood:                -1315.0
No. Observations:                 490   AIC:                             2650.
Df Residuals:                     480   BIC:                             2692.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         31.2476      3.737      8.361      0.000      23.904      38.591
CHAS           0.2230      0.178      1.250      0.212      -0.127       0.573
NOX           -1.6180      0.331     -4.886      0.000      -2.269      -0.967
RM             2.0911      0.239      8.753      0.000       1.622       2.560
RAD            1.6510      0.404      4.090      0.000       0.858       2.444
TAX           -2.2346      0.429     -5.210      0.000      -3.077      -1.392
PTRATIO       -1.7474      0.202     -8.659      0.000      -2.144      -1.351
B              0.7376      0.183      4.033      0.000       0.378       1.097
LOGDIS        -3.8144      0.568     -6.713      0.000      -4.931      -2.698
LOGLSTAT      -7.0208      0.487    -14.414      0.000      -7.978      -6.064
==============================================================================
Omnibus:                       32.013   Durbin-Watson:                   1.201
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               49.851
Skew:                           0.474   Prob(JB):                     1.50e-11
Kurtosis:                       4.242   Cond. No.                         329.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [133]:
score4 = cross_val_score(LinearRegression(), df4.iloc[:, :-1], df4.iloc[:, -1], cv=5)
score4, score4.mean(), score4.std()


Out[133]:
(array([ 0.72319109,  0.78563139,  0.61497277,  0.26420584,  0.30413905]),
 0.53842802863758932,
 0.21503261199735343)

In [140]:
sns.heatmap(np.corrcoef(df4.T), xticklabels=df4.columns, yticklabels=df4.columns, annot=True)


Out[140]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f13a77bb1d0>

In [137]:
sns.heatmap?

In [141]:
# The annotated heatmap shows RAD and TAX are strongly correlated; drop RAD.
df5 = df4.drop(["RAD"], axis=1)

In [142]:
model5 = sm.OLS(df5.iloc[:, -1], df5.iloc[:, :-1])
result5 = model5.fit()
print(result5.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   MEDV   R-squared:                       0.790
Model:                            OLS   Adj. R-squared:                  0.786
Method:                 Least Squares   F-statistic:                     225.8
Date:                Wed, 08 Jun 2016   Prob (F-statistic):          1.60e-157
Time:                        08:36:35   Log-Likelihood:                -1323.4
No. Observations:                 490   AIC:                             2665.
Df Residuals:                     481   BIC:                             2702.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         26.8265      3.636      7.379      0.000      19.683      33.970
CHAS           0.2764      0.181      1.529      0.127      -0.079       0.632
NOX           -1.5201      0.336     -4.528      0.000      -2.180      -0.861
RM             2.2448      0.240      9.363      0.000       1.774       2.716
TAX           -0.8255      0.260     -3.180      0.002      -1.335      -0.315
PTRATIO       -1.6023      0.202     -7.937      0.000      -1.999      -1.206
B              0.6501      0.185      3.522      0.000       0.287       1.013
LOGDIS        -3.6961      0.577     -6.409      0.000      -4.829      -2.563
LOGLSTAT      -7.0028      0.495    -14.148      0.000      -7.975      -6.030
==============================================================================
Omnibus:                       42.813   Durbin-Watson:                   1.191
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               70.769
Skew:                           0.582   Prob(JB):                     4.29e-16
Kurtosis:                       4.454   Cond. No.                         314.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [143]:
score5 = cross_val_score(LinearRegression(), df5.iloc[:, :-1], df5.iloc[:, -1], cv=5)
score5, score5.mean(), score5.std()


Out[143]:
(array([ 0.72444307,  0.75982829,  0.57680467,  0.25469516,  0.32560728]),
 0.52827569419364395,
 0.20512147639406775)

In [144]:
df6 = df5.drop(["TAX"], axis=1)  # next, drop TAX as well and refit
model6 = sm.OLS(df6.iloc[:, -1], df6.iloc[:, :-1])
result6 = model6.fit()
print(result6.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   MEDV   R-squared:                       0.785
Model:                            OLS   Adj. R-squared:                  0.782
Method:                 Least Squares   F-statistic:                     251.8
Date:                Wed, 08 Jun 2016   Prob (F-statistic):          1.43e-156
Time:                        08:38:41   Log-Likelihood:                -1328.5
No. Observations:                 490   AIC:                             2673.
Df Residuals:                     482   BIC:                             2706.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         28.6735      3.623      7.915      0.000      21.555      35.792
CHAS           0.3401      0.181      1.876      0.061      -0.016       0.696
NOX           -1.8905      0.318     -5.949      0.000      -2.515      -1.266
RM             2.1868      0.241      9.063      0.000       1.713       2.661
PTRATIO       -1.8452      0.189     -9.783      0.000      -2.216      -1.475
B              0.7873      0.181      4.345      0.000       0.431       1.143
LOGDIS        -3.6005      0.581     -6.194      0.000      -4.743      -2.458
LOGLSTAT      -7.1666      0.497    -14.422      0.000      -8.143      -6.190
==============================================================================
Omnibus:                       33.260   Durbin-Watson:                   1.203
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               55.759
Skew:                           0.464   Prob(JB):                     7.80e-13
Kurtosis:                       4.367   Cond. No.                         305.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [145]:
score6 = cross_val_score(LinearRegression(), df6.iloc[:, :-1], df6.iloc[:, -1], cv=5)
score6, score6.mean(), score6.std()


Out[145]:
(array([ 0.7160077 ,  0.7598543 ,  0.56232156,  0.23754958,  0.39348215]),
 0.53384305959027156,
 0.19624834931434523)

In [146]:
sns.heatmap(np.corrcoef(df6.T), xticklabels=df6.columns, yticklabels=df6.columns, annot=True)


Out[146]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f13b2974510>
