In [1]:
# boston data loading
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
# NOTE: load_boston is deprecated and was removed in scikit-learn 1.2;
# on modern versions fetch the data another way (e.g. fetch_openml).
from sklearn.datasets import load_boston

boston_data = load_boston()

X = boston_data.data
Y = boston_data.target

In [2]:
# scaling

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(with_mean=False)

In [3]:
X = scaler.fit_transform(X)
feature = boston_data.feature_names

In [4]:
dfX = pd.DataFrame(X, columns=feature)
dfY = pd.DataFrame(Y, columns=["MEDV"])

In [5]:
df_boston = pd.concat([dfX,dfY],axis=1)

In [6]:
df_boston = sm.add_constant(df_boston)

In [7]:
df_boston.head(1)


Out[7]:
const CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
0 1 0.000736 0.772552 0.337051 0.0 4.647422 9.367131 2.318549 1.94426 0.11496 1.758028 7.074146 4.351754 0.698065 24.0

In [8]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

In [9]:
result = model.fit(df_boston.ix[:,:-1],df_boston.ix[:,-1])

In [10]:
result.coef_


Out[10]:
array([ 0.        , -0.92041113,  1.08098058,  0.14296712,  0.68220346,
       -2.06009246,  2.67064141,  0.02112063, -3.10444805,  2.65878654,
       -2.07589814, -2.06215593,  0.85664044, -3.74867982])

In [11]:
result.intercept_


Out[11]:
36.491103280361443

In [ ]:


In [12]:
model1 = sm.OLS(df_boston.ix[:,-1],df_boston.ix[:,:-1])
result1 = model1.fit()
print(result1.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   MEDV   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.734
Method:                 Least Squares   F-statistic:                     108.1
Date:                Sun, 12 Jun 2016   Prob (F-statistic):          6.95e-135
Time:                        23:41:32   Log-Likelihood:                -1498.8
No. Observations:                 506   AIC:                             3026.
Df Residuals:                     492   BIC:                             3085.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         36.4911      5.104      7.149      0.000      26.462      46.520
CRIM          -0.9204      0.281     -3.276      0.001      -1.472      -0.368
ZN             1.0810      0.320      3.380      0.001       0.453       1.709
INDUS          0.1430      0.421      0.339      0.735      -0.685       0.971
CHAS           0.6822      0.219      3.120      0.002       0.253       1.112
NOX           -2.0601      0.442     -4.658      0.000      -2.929      -1.191
RM             2.6706      0.293      9.102      0.000       2.094       3.247
AGE            0.0211      0.371      0.057      0.955      -0.709       0.751
DIS           -3.1044      0.420     -7.398      0.000      -3.929      -2.280
RAD            2.6588      0.577      4.608      0.000       1.525       3.792
TAX           -2.0759      0.633     -3.278      0.001      -3.320      -0.832
PTRATIO       -2.0622      0.283     -7.287      0.000      -2.618      -1.506
B              0.8566      0.245      3.500      0.001       0.376       1.338
LSTAT         -3.7487      0.362    -10.366      0.000      -4.459      -3.038
==============================================================================
Omnibus:                      178.029   Durbin-Watson:                   1.078
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              782.015
Skew:                           1.521   Prob(JB):                    1.54e-170
Kurtosis:                       8.276   Cond. No.                         357.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [13]:
sns.distplot(df_boston.MEDV)


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fc072228b90>

In [14]:
df_boston2 = df_boston.drop(df_boston[df_boston.MEDV >= df_boston.MEDV.max()].index)
df_boston2.head(1)


Out[14]:
const CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
0 1 0.000736 0.772552 0.337051 0.0 4.647422 9.367131 2.318549 1.94426 0.11496 1.758028 7.074146 4.351754 0.698065 24.0

In [15]:
model2 = sm.OLS(df_boston2.ix[:,-1],df_boston2.ix[:,:-1])
result2 = model2.fit()
print(result2.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   MEDV   R-squared:                       0.778
Model:                            OLS   Adj. R-squared:                  0.771
Method:                 Least Squares   F-statistic:                     128.0
Date:                Sun, 12 Jun 2016   Prob (F-statistic):          4.79e-146
Time:                        23:41:37   Log-Likelihood:                -1337.1
No. Observations:                 490   AIC:                             2702.
Df Residuals:                     476   BIC:                             2761.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         32.2611      4.125      7.821      0.000      24.155      40.367
CRIM          -0.9067      0.223     -4.062      0.000      -1.345      -0.468
ZN             0.8219      0.263      3.129      0.002       0.306       1.338
INDUS         -0.2983      0.341     -0.874      0.383      -0.969       0.373
CHAS           0.1156      0.188      0.614      0.540      -0.254       0.485
NOX           -1.4386      0.354     -4.064      0.000      -2.134      -0.743
RM             2.6351      0.251     10.501      0.000       2.142       3.128
AGE           -0.6640      0.300     -2.216      0.027      -1.253      -0.075
DIS           -2.5472      0.338     -7.535      0.000      -3.211      -1.883
RAD            2.1811      0.462      4.724      0.000       1.274       3.088
TAX           -2.3185      0.505     -4.591      0.000      -3.311      -1.326
PTRATIO       -1.8144      0.228     -7.960      0.000      -2.262      -1.366
B              0.7238      0.194      3.727      0.000       0.342       1.105
LSTAT         -2.5037      0.303     -8.257      0.000      -3.100      -1.908
==============================================================================
Omnibus:                       84.129   Durbin-Watson:                   1.231
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              161.897
Skew:                           0.966   Prob(JB):                     6.99e-36
Kurtosis:                       5.048   Cond. No.                         358.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [16]:
import statsmodels.api 
model_anova = sm.OLS.from_formula("MEDV ~ INDUS + C(CHAS)", data = df_boston2)
result_anova = model_anova.fit()
table_anova = sm.stats.anova_lm(result_anova)
table_anova


Out[16]:
df sum_sq mean_sq F PR(>F)
C(CHAS) 1.0 169.271183 169.271183 4.320590 3.817684e-02
INDUS 1.0 11002.130375 11002.130375 280.825664 4.233424e-50
Residual 487.0 19079.586278 39.177795 NaN NaN

In [ ]:


In [17]:
sk_model2 = LinearRegression()

In [18]:
sk_result = sk_model2.fit(df_boston2.ix[:,:-1],df_boston2.ix[:,-1])
sk_result.coef_


Out[18]:
array([ 0.        , -0.90670198,  0.8218828 , -0.29825317,  0.11555586,
       -1.43856592,  2.63509594, -0.66398505, -2.54724295,  2.18110049,
       -2.31851126, -1.81435162,  0.72377893, -2.5036931 ])

In [19]:
sk_result.intercept_


Out[19]:
32.261106875317672

In [20]:
from sklearn.cross_validation import cross_val_score

score = cross_val_score(sk_model2, df_boston2.ix[:,:-1],df_boston2.ix[:,-1], cv=5)

In [21]:
score


Out[21]:
array([ 0.670774  ,  0.77345121,  0.5308856 ,  0.00644914,  0.11065571])

In [22]:
score.mean()


Out[22]:
0.41844313129206878

In [23]:
score.std()


Out[23]:
0.30555426893282145

In [24]:
df_boston2.head(1)


Out[24]:
const CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
0 1 0.000736 0.772552 0.337051 0.0 4.647422 9.367131 2.318549 1.94426 0.11496 1.758028 7.074146 4.351754 0.698065 24.0

In [25]:
df_boston3 = df_boston2.drop(["CRIM","DIS","LSTAT","MEDV"],axis=1)
df_boston3["CRIM"] = np.log(dfX.CRIM)
df_boston3["DIS"] = np.log(dfX.DIS)
df_boston3["LSTAT"] = np.log(dfX.LSTAT)
df_boston3["MEDV"] = dfY.MEDV

In [26]:
model3 = sm.OLS(df_boston3.ix[:,-1],df_boston3.ix[:,:-1])
result = model3.fit()
print(result.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   MEDV   R-squared:                       0.797
Model:                            OLS   Adj. R-squared:                  0.791
Method:                 Least Squares   F-statistic:                     143.8
Date:                Sun, 12 Jun 2016   Prob (F-statistic):          1.91e-155
Time:                        23:41:49   Log-Likelihood:                -1314.7
No. Observations:                 490   AIC:                             2657.
Df Residuals:                     476   BIC:                             2716.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         30.5248      3.930      7.767      0.000      22.802      38.247
ZN            -0.0268      0.238     -0.113      0.910      -0.494       0.440
INDUS         -0.1063      0.329     -0.323      0.747      -0.753       0.541
CHAS           0.2271      0.180      1.263      0.207      -0.126       0.580
NOX           -1.5242      0.360     -4.228      0.000      -2.233      -0.816
RM             2.0886      0.252      8.279      0.000       1.593       2.584
AGE           -0.0504      0.303     -0.166      0.868      -0.646       0.546
RAD            1.7683      0.504      3.509      0.000       0.778       2.758
TAX           -2.1879      0.483     -4.526      0.000      -3.138      -1.238
PTRATIO       -1.7374      0.219     -7.951      0.000      -2.167      -1.308
B              0.7231      0.186      3.889      0.000       0.358       1.088
CRIM          -0.1224      0.209     -0.586      0.558      -0.533       0.288
DIS           -3.9716      0.673     -5.905      0.000      -5.293      -2.650
LSTAT         -6.9240      0.553    -12.527      0.000      -8.010      -5.838
==============================================================================
Omnibus:                       31.683   Durbin-Watson:                   1.199
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               49.222
Skew:                           0.471   Prob(JB):                     2.05e-11
Kurtosis:                       4.234   Cond. No.                         359.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [27]:
sk_model3 = LinearRegression()
scores3 = cross_val_score(sk_model3, df_boston3.ix[:,:-1],df_boston3.ix[:,-1], cv=5)

In [28]:
scores3


Out[28]:
array([ 0.68959667,  0.78222319,  0.58690158,  0.13525116,  0.24691975])

In [29]:
scores3.mean()


Out[29]:
0.48817847061802622

In [30]:
scores3.std()


Out[30]:
0.2528008298676635

In [31]:
sns.heatmap(np.corrcoef(df_boston3.T), xticklabels=df_boston3.columns, yticklabels=df_boston3.columns, annot=True)


Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fc06ef4d390>

In [32]:
df_boston3 = df_boston3.drop(["ZN","INDUS","AGE","CRIM"],axis=1)

In [34]:
model = sm.OLS(df_boston3.ix[:,-1],df_boston3.ix[:,:-1])
result = model.fit()
print(result.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   MEDV   R-squared:                       0.797
Model:                            OLS   Adj. R-squared:                  0.793
Method:                 Least Squares   F-statistic:                     209.1
Date:                Sun, 12 Jun 2016   Prob (F-statistic):          6.89e-160
Time:                        23:45:04   Log-Likelihood:                -1315.0
No. Observations:                 490   AIC:                             2650.
Df Residuals:                     480   BIC:                             2692.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         31.2476      3.737      8.361      0.000      23.904      38.591
CHAS           0.2230      0.178      1.250      0.212      -0.127       0.573
NOX           -1.6180      0.331     -4.886      0.000      -2.269      -0.967
RM             2.0911      0.239      8.753      0.000       1.622       2.560
RAD            1.6510      0.404      4.090      0.000       0.858       2.444
TAX           -2.2346      0.429     -5.210      0.000      -3.077      -1.392
PTRATIO       -1.7474      0.202     -8.659      0.000      -2.144      -1.351
B              0.7376      0.183      4.033      0.000       0.378       1.097
DIS           -3.8144      0.568     -6.713      0.000      -4.931      -2.698
LSTAT         -7.0208      0.487    -14.414      0.000      -7.978      -6.064
==============================================================================
Omnibus:                       32.013   Durbin-Watson:                   1.201
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               49.851
Skew:                           0.474   Prob(JB):                     1.50e-11
Kurtosis:                       4.242   Cond. No.                         329.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [35]:
sk_model = LinearRegression()

result = cross_val_score(sk_model, df_boston3.ix[:,:-1],df_boston3.ix[:,-1], cv=5)

In [36]:
result


Out[36]:
array([ 0.72319109,  0.78563139,  0.61497277,  0.26420584,  0.30413905])

In [37]:
result.mean()


Out[37]:
0.53842802863758932

In [38]:
result.std()


Out[38]:
0.21503261199735343

In [43]:
sns.heatmap(np.corrcoef(df_boston3.T), xticklabels=df_boston3.columns, yticklabels=df_boston3.columns, annot=True)


Out[43]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fc06c1e0310>

In [ ]: