notebook.community

Edit and run



In [1]:

    
from sklearn.datasets import load_boston

boston = load_boston()



In [2]:

    
X = boston.data
y = boston.target
names = boston.feature_names



In [4]:

    
# from sklearn.preprocessing import scale
# X_scaled = scale(X)

# 되도록이면 클래스로 하기
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)



In [5]:

    
dfX0 = pd.DataFrame(X_scaled, columns=names)
dfX = sm.add_constant(dfX0)
dfy = pd.DataFrame(y, columns=["MEDV"])
df = pd.concat([dfX, dfy], axis=1)
df.tail()



In [6]:

    
%config InlineBackend.figure_formats = {'png', 'retina'}



In [35]:

    
sns.pairplot(df)
plt.show()



In [7]:

    
sns.jointplot("RM", "MEDV", data=df)









    Out[7]:





<seaborn.axisgrid.JointGrid at 0x7f1bf868c210>



In [8]:

    
sns.jointplot("CHAS", "MEDV", data=df)









    Out[8]:





<seaborn.axisgrid.JointGrid at 0x7f1bf3730490>



In [9]:

    
model = sm.OLS(df.ix[:, -1], df.ix[:, :-1])
result = model.fit()
print(result.summary())









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   MEDV   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.734
Method:                 Least Squares   F-statistic:                     108.1
Date:                Sun, 12 Jun 2016   Prob (F-statistic):          6.95e-135
Time:                        10:48:34   Log-Likelihood:                -1498.8
No. Observations:                 506   AIC:                             3026.
Df Residuals:                     492   BIC:                             3085.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const         36.4911      5.104      7.149      0.000        26.462    46.520
CRIM          -0.9204      0.281     -3.276      0.001        -1.472    -0.368
ZN             1.0810      0.320      3.380      0.001         0.453     1.709
INDUS          0.1430      0.421      0.339      0.735        -0.685     0.971
CHAS           0.6822      0.219      3.120      0.002         0.253     1.112
NOX           -2.0601      0.442     -4.658      0.000        -2.929    -1.191
RM             2.6706      0.293      9.102      0.000         2.094     3.247
AGE            0.0211      0.371      0.057      0.955        -0.709     0.751
DIS           -3.1044      0.420     -7.398      0.000        -3.929    -2.280
RAD            2.6588      0.577      4.608      0.000         1.525     3.792
TAX           -2.0759      0.633     -3.278      0.001        -3.320    -0.832
PTRATIO       -2.0622      0.283     -7.287      0.000        -2.618    -1.506
B              0.8566      0.245      3.500      0.001         0.376     1.338
LSTAT         -3.7487      0.362    -10.366      0.000        -4.459    -3.038
==============================================================================
Omnibus:                      178.029   Durbin-Watson:                   1.078
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              782.015
Skew:                           1.521   Prob(JB):                    1.54e-170
Kurtosis:                       8.276   Cond. No.                         357.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.



In [10]:

    
# scikit-learn
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(df.ix[:, :-1], df.ix[:, -1])
model.intercept_, model.coef_









    Out[10]:





(36.491103280361422,
 array([ 0.        , -0.92041113,  1.08098058,  0.14296712,  0.68220346,
        -2.06009246,  2.67064141,  0.02112063, -3.10444805,  2.65878654,
        -2.07589814, -2.06215593,  0.85664044, -3.74867982]))



In [11]:

    
# nomal분포 y 혹은 입실론
sns.distplot(result.resid)









    Out[11]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f1be99d1c90>



In [12]:

    
sns.distplot(df.MEDV)









    Out[12]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f1be9273c90>



In [13]:

    
# sensored 데이터 아웃라이어 잘못된 데이터 때문에 모델이 정규분포를 따르지 않는다.



In [14]:

    
# 아웃라이어를 제거
df2 = df.drop(df[df.MEDV >= df.MEDV.max()].index)



In [15]:

    
model2 = sm.OLS(df2.ix[:,-1], df2.ix[:,:-1])
result2 = model2.fit()
print(result2.summary())
# Jarque-Bera 값이 161로 줄어듦









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   MEDV   R-squared:                       0.778
Model:                            OLS   Adj. R-squared:                  0.771
Method:                 Least Squares   F-statistic:                     128.0
Date:                Sun, 12 Jun 2016   Prob (F-statistic):          4.79e-146
Time:                        10:48:53   Log-Likelihood:                -1337.1
No. Observations:                 490   AIC:                             2702.
Df Residuals:                     476   BIC:                             2761.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const         32.2611      4.125      7.821      0.000        24.155    40.367
CRIM          -0.9067      0.223     -4.062      0.000        -1.345    -0.468
ZN             0.8219      0.263      3.129      0.002         0.306     1.338
INDUS         -0.2983      0.341     -0.874      0.383        -0.969     0.373
CHAS           0.1156      0.188      0.614      0.540        -0.254     0.485
NOX           -1.4386      0.354     -4.064      0.000        -2.134    -0.743
RM             2.6351      0.251     10.501      0.000         2.142     3.128
AGE           -0.6640      0.300     -2.216      0.027        -1.253    -0.075
DIS           -2.5472      0.338     -7.535      0.000        -3.211    -1.883
RAD            2.1811      0.462      4.724      0.000         1.274     3.088
TAX           -2.3185      0.505     -4.591      0.000        -3.311    -1.326
PTRATIO       -1.8144      0.228     -7.960      0.000        -2.262    -1.366
B              0.7238      0.194      3.727      0.000         0.342     1.105
LSTAT         -2.5037      0.303     -8.257      0.000        -3.100    -1.908
==============================================================================
Omnibus:                       84.129   Durbin-Watson:                   1.231
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              161.897
Skew:                           0.966   Prob(JB):                     6.99e-36
Kurtosis:                       5.048   Cond. No.                         358.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.



In [16]:

    
model2 = LinearRegression()
result2 = model2.fit(df2.ix[:,:-1],df2.ix[:,-1])
model2.intercept_, model2.coef_









    Out[16]:





(32.261106875317736,
 array([ 0.        , -0.90670198,  0.8218828 , -0.29825317,  0.11555586,
        -1.43856592,  2.63509594, -0.66398505, -2.54724295,  2.18110049,
        -2.31851126, -1.81435162,  0.72377893, -2.5036931 ]))



In [17]:

    
# ANOVA
model_anova = sm.OLS.from_formula("MEDV ~ C(CHAS)", data=df2)
result_anova = model_anova.fit()
table_anova = sm.stats.anova_lm(result_anova)
table_anova
# CHAS 때문에 움직인 분산의크기 mean_sq 가  residual보다 크다.  
# PR>F(P-value)가 9.8% 유의 수준이 5%이면 이거로는 영향이 없다. (P-value가 작을수록 영향력이 크다.)









    Out[17]:






  
    
      
      df
      sum_sq
      mean_sq
      F
      PR(>F)
    
  
  
    
      C(CHAS)
      1.0
      169.271183
      169.271183
      2.745998
      0.098141
    
    
      Residual
      488.0
      30081.716653
      61.642862
      NaN
      NaN



In [18]:

    
model1_anova = sm.OLS.from_formula("MEDV ~ CRIM + ZN +INDUS + NOX + RM + AGE + DIS + RAD + TAX + PTRATIO + B + LSTAT", data=df2)
model2_anova = sm.OLS.from_formula("MEDV ~ CRIM + ZN +INDUS + NOX + RM + AGE + DIS + RAD + TAX + PTRATIO + B + LSTAT + C(CHAS)", data=df2)
        
result1_anova = model1_anova.fit()
result2_anova = model2_anova.fit()
table2_anova = sm.stats.anova_lm(result1_anova, result2_anova)
table2_anova









    Out[18]:






  
    
      
      df_resid
      ssr
      df_diff
      ss_diff
      F
      Pr(>F)
    
  
  
    
      0
      477.0
      6734.172357
      0.0
      NaN
      NaN
      NaN
    
    
      1
      476.0
      6728.844589
      1.0
      5.327768
      0.376888
      0.539567



In [19]:

    
model2 = LinearRegression()
model2.fit(df2.ix[:,:-1],df2.ix[:,-1]) # feature, target
model2.intercept_, model2.coef_









    Out[19]:





(32.261106875317736,
 array([ 0.        , -0.90670198,  0.8218828 , -0.29825317,  0.11555586,
        -1.43856592,  2.63509594, -0.66398505, -2.54724295,  2.18110049,
        -2.31851126, -1.81435162,  0.72377893, -2.5036931 ]))



In [20]:

    
from sklearn.cross_validation import cross_val_score

# cv 숫자만 넣으면 KFold로 만들어서 실행 시킨다.

scores2 = cross_val_score(model2, df2.ix[:,:-1], df2.ix[:,-1], cv=5)
scores2, scores2.mean(), scores2.std()
# scores2.mean() : 평균, scores2.std() : 분산 
# -> 0.3 만큼 수치가 왔다 갔다 함으로 분석결과가 안 좋음 (분산이 적어야 R-squres 값이 안정적) -> 모델이 좋음
# 실제 R square 는 0.418









    Out[20]:





(array([ 0.670774  ,  0.77345121,  0.5308856 ,  0.00644914,  0.11065571]),
 0.41844313129206856,
 0.30555426893282184)



In [21]:

    
# 회기분석에서 R square 값 오차를 줄이자
# target에 대해서 feature가 로그취하기 -기울기를 갖는 데이터



In [22]:

    
df3 = df2.drop(["CRIM","DIS","LSTAT","MEDV"], axis=1)
df3["LOGCRIM"] = np.log(df2.CRIM)
df3["LOGDIS"] = np.log(df2.DIS)
df3["LOGLSTAT"] = np.log(df2.LSTAT)
df3["MEDV"] = df2.MEDV
df3.tail()



In [23]:

    
sns.jointplot("CRIM","MEDV", data=df2)
sns.jointplot("LOGCRIM","MEDV", data=df3)









    Out[23]:





<seaborn.axisgrid.JointGrid at 0x7f1be8fb0f10>



In [24]:

    
sns.jointplot("DIS","MEDV", data=df2)
sns.jointplot("LOGDIS","MEDV", data=df3)









    Out[24]:





<seaborn.axisgrid.JointGrid at 0x7f1be89bde10>



In [109]:

    
sns.jointplot("LSTAT","MEDV", data=df2)
sns.jointplot("LOGLSTAT","MEDV", data=df3)









    Out[109]:





<seaborn.axisgrid.JointGrid at 0x7fc27c969290>



In [25]:

    
model3 = sm.OLS(df3.ix[:,-1], df3.ix[:,:-1])
result3 = model3.fit()
print(result3.summary())
#









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   MEDV   R-squared:                       0.797
Model:                            OLS   Adj. R-squared:                  0.791
Method:                 Least Squares   F-statistic:                     143.8
Date:                Sun, 12 Jun 2016   Prob (F-statistic):          1.91e-155
Time:                        10:49:08   Log-Likelihood:                -1314.7
No. Observations:                 490   AIC:                             2657.
Df Residuals:                     476   BIC:                             2716.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const         30.5248      3.930      7.767      0.000        22.802    38.247
ZN            -0.0268      0.238     -0.113      0.910        -0.494     0.440
INDUS         -0.1063      0.329     -0.323      0.747        -0.753     0.541
CHAS           0.2271      0.180      1.263      0.207        -0.126     0.580
NOX           -1.5242      0.360     -4.228      0.000        -2.233    -0.816
RM             2.0886      0.252      8.279      0.000         1.593     2.584
AGE           -0.0504      0.303     -0.166      0.868        -0.646     0.546
RAD            1.7683      0.504      3.509      0.000         0.778     2.758
TAX           -2.1879      0.483     -4.526      0.000        -3.138    -1.238
PTRATIO       -1.7374      0.219     -7.951      0.000        -2.167    -1.308
B              0.7231      0.186      3.889      0.000         0.358     1.088
LOGCRIM       -0.1224      0.209     -0.586      0.558        -0.533     0.288
LOGDIS        -3.9716      0.673     -5.905      0.000        -5.293    -2.650
LOGLSTAT      -6.9240      0.553    -12.527      0.000        -8.010    -5.838
==============================================================================
Omnibus:                       31.683   Durbin-Watson:                   1.199
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               49.222
Skew:                           0.471   Prob(JB):                     2.05e-11
Kurtosis:                       4.234   Cond. No.                         359.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.



In [26]:

    
# cross-validation
scores3 = cross_val_score(LinearRegression(), df3.ix[:,:-1], df3.ix[:,-1], cv=5)
scores3, scores3.mean(), scores3.std()
# 점수(R-square)도 높아지고 r-square의 분산도 줄어들었다.









    Out[26]:





(array([ 0.68959667,  0.78222319,  0.58690158,  0.13525116,  0.24691975]),
 0.48817847061802599,
 0.25280082986766383)



In [27]:

    
# Multicolinearity (다중공선성) 이 있음



In [28]:

    
sns.heatmap(np.corrcoef(df3.T), annot=True, xticklabels=df3.columns, yticklabels=df3.columns) 
# 0에 가까울 수록 좋다. 파란애가 좋고 빨간애가 안좋다. 빨간애들을 잘라내야 한다.









    Out[28]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f1be8381090>



In [136]:

    
df4 = df3.drop(["RAD","TAX","ZN","INDUS","AGE","LOGCRIM"], axis=1)
model4 = sm.OLS(df4.ix[:,-1],df4.ix[:,:-1])
result4 = model4.fit()
print(result4.summary())
# 변수가 빠졌음에도 R-squared: 는 그대로고, Adj. R-squared: 는 더 좋아짐









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   MEDV   R-squared:                       0.785
Model:                            OLS   Adj. R-squared:                  0.782
Method:                 Least Squares   F-statistic:                     251.8
Date:                Wed, 08 Jun 2016   Prob (F-statistic):          1.43e-156
Time:                        08:36:42   Log-Likelihood:                -1328.5
No. Observations:                 490   AIC:                             2673.
Df Residuals:                     482   BIC:                             2706.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const         28.6735      3.623      7.915      0.000        21.555    35.792
CHAS           0.3401      0.181      1.876      0.061        -0.016     0.696
NOX           -1.8905      0.318     -5.949      0.000        -2.515    -1.266
RM             2.1868      0.241      9.063      0.000         1.713     2.661
PTRATIO       -1.8452      0.189     -9.783      0.000        -2.216    -1.475
B              0.7873      0.181      4.345      0.000         0.431     1.143
LOGDIS        -3.6005      0.581     -6.194      0.000        -4.743    -2.458
LOGLSTAT      -7.1666      0.497    -14.422      0.000        -8.143    -6.190
==============================================================================
Omnibus:                       33.260   Durbin-Watson:                   1.203
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               55.759
Skew:                           0.464   Prob(JB):                     7.80e-13
Kurtosis:                       4.367   Cond. No.                         305.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.



In [137]:

    
scores4 = cross_val_score(LinearRegression(), df4.ix[:,:-1], df4.ix[:,-1], cv = 5)
scores4, scores4.mean(), scores4.std()
# R-square과 분산 수치가 좋아졌다
# grid search









    Out[137]:





(array([ 0.7160077 ,  0.7598543 ,  0.56232156,  0.23754958,  0.39348215]),
 0.53384305959027123,
 0.19624834931434537)



In [138]:

    
sns.heatmap(np.corrcoef(df4.T), annot=True, xticklabels=df4.columns, yticklabels=df4.columns)









    Out[138]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fc27cc29550>



In [139]:

    
# grid search로 찾자 anova 분석으로 줄이자
# PCA로 줄여야 한다. feature 공간을 회전시켜서 가장 떨어지는 조합을 제거한다.



In [ ]:

	const	CRIM	INDUS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT	MEDV
501	1	0.007292	1.740698	4.949763	9.392775	2.457236	1.178250	0.11496	1.621424	9.709612	4.297919	1.355480	22.4
502	1	0.005271	1.740698	4.949763	8.718911	2.727496	1.087407	0.11496	1.621424	9.709612	4.351754	1.272778	20.6
503	1	0.007075	1.740698	4.949763	9.938419	3.236012	1.030363	0.11496	1.621424	9.709612	4.351754	0.790580	23.9
504	1	0.012760	1.740698	4.949763	9.679131	3.175559	1.135609	0.11496	1.621424	9.709612	4.313927	0.908326	22.0
505	1	0.005520	1.740698	4.949763	8.590692	2.873294	1.190800	0.11496	1.621424	9.709612	4.351754	1.104569	11.9

	const	INDUS	NOX	RM	AGE	RAD	TAX	PTRATIO	B	LOGCRIM	LOGDIS	LOGLSTAT	MEDV
501	1	1.740698	4.949763	9.392775	2.457236	0.11496	1.621424	9.709612	4.297919	-4.920910	0.164030	0.304156	22.4
502	1	1.740698	4.949763	8.718911	2.727496	0.11496	1.621424	9.709612	4.351754	-5.245510	0.083796	0.241202	20.6
503	1	1.740698	4.949763	9.938419	3.236012	0.11496	1.621424	9.709612	4.351754	-4.951222	0.029911	-0.234988	23.9
504	1	1.740698	4.949763	9.679131	3.175559	0.11496	1.621424	9.709612	4.313927	-4.361408	0.127169	-0.096152	22.0
505	1	1.740698	4.949763	8.590692	2.873294	0.11496	1.621424	9.709612	4.351754	-5.199321	0.174625	0.099456	11.9

	df	sum_sq	mean_sq	F	PR(>F)
C(CHAS)	1.0	169.271183	169.271183	2.745998	0.098141
Residual	488.0	30081.716653	61.642862	NaN	NaN

	df_resid	ssr	df_diff	ss_diff	F	Pr(>F)
0	477.0	6734.172357	0.0	NaN	NaN	NaN
1	476.0	6728.844589	1.0	5.327768	0.376888	0.539567