In [2]:
from sklearn.datasets import load_boston
In [3]:
boston = load_boston()
In [4]:
X = boston.data
Y = boston.target
In [5]:
names = boston.feature_names
In [6]:
names
Out[6]:
In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)
In [8]:
# Put the scaled feature matrix into a labeled DataFrame.
dfX0 = pd.DataFrame(X_scaled,columns=names)
In [9]:
dfX0.head(1)
Out[9]:
In [10]:
# Prepend a constant column so the OLS fit includes an intercept term.
dfX = sm.add_constant(dfX0)
In [11]:
# Target vector as a one-column frame named after the Boston target (MEDV).
dfY = pd.DataFrame(Y, columns=["MEDV"])
In [12]:
# Features and target side by side: MEDV is the last column from here on.
df = pd.concat([dfX, dfY],axis=1)
In [13]:
df.tail()
Out[13]:
In [14]:
# 그림저장에 default가 svg로 저장이 되게 한다.
# png, jpg
%config InlineBackend.figure_format = "png"
In [22]:
sns.pairplot(df)
Out[22]:
In [15]:
sns.jointplot("RM","MEDV",data=df)
plt.show()
In [16]:
# CHAS 는 찰스강에 붙어잇냐 아니냐를 나타내는 카테고리 값이었다.
sns.jointplot("CHAS","MEDV" ,data=df)
plt.show()
In [17]:
regression = sm.OLS(dfY, dfX)
In [18]:
result = regression.fit()
In [19]:
print(result.summary())
In [80]:
model = sm.OLS(df.ix[:,-1], df.ix[:,:-1])
In [81]:
result = model.fit()
print(result.summary())
In [23]:
from sklearn.linear_model import LinearRegression
model_boston = LinearRegression().fit(df.ix[:,:-1],df.ix[:,-1])
In [24]:
model_boston.intercept_, model_boston.coef_
Out[24]:
In [29]:
sm.graphics.plot_fit(result, df.MEDV)
plt.show()
In [85]:
sns.distplot(result.resid)
Out[85]:
In [86]:
sns.distplot(df.MEDV)
Out[86]:
In [30]:
df2 = df.drop(df[df.MEDV >= df.MEDV.max()].index)
df2.head(1)
Out[30]:
In [31]:
sm_model2 = sm.OLS(df2.ix[:,-1],df2.ix[:,:-1])
result2 = sm_model2.fit()
print(result2.summary())
In [32]:
from sklearn.linear_model import LinearRegression
sk_model2 = LinearRegression().fit(df2.ix[:,:-1],df2.ix[:,-1])
sk_model2.coef_, sk_model2.intercept_
Out[32]:
In [ ]:
In [33]:
import statsmodels.api as sm
model_anova = sm.OLS.from_formula("MEDV ~ C(CHAS)", data=df2)
result_anova = model_anova.fit()
table_anova = sm.stats.anova_lm(result_anova)
table_anova
Out[33]:
In [100]:
model2 = LinearRegression()
In [102]:
fit_model2 = model2.fit(df2.ix[:,:-1],df2.ix[:,-1])
In [104]:
fit_model2.coef_, fit_model2.intercept_
Out[104]:
In [108]:
from sklearn.cross_validation import cross_val_score
scores2 = cross_val_score(model2, df2.ix[:,:-1], df2.ix[:,-1], cv = 5 ) # cv = 5 , k가 5인 k-fold cv를 시행.
In [110]:
scores2, scores2.mean(), scores2.std()
Out[110]:
In [112]:
# 이 모델의 점수는 0.418.. 입니당. 어떻게 바꿀까?
In [113]:
# MEDV 와 LSTAT이 반 비례의 2차함수같네??
# 이차항을 해주면 되겟네?
# 로그를 취해줘도 된다.
# CRIM, DIS(헤테로스키더스키? 하다. 갈수록 분산증가)
In [114]:
df3 = df2.drop(["CRIM","DIS","LSTAT","MEDV"], axis=1)
df3["LOGCRIM"] = np.log(df2.CRIM)
df3["LOGDIS"] = np.log(df2.DIS)
df3["LOGLSTAT"] = np.log(df2.LSTAT)
df3["MEDV"] = df2.MEDV
In [115]:
df3.tail()
Out[115]:
In [116]:
df2.tail()
Out[116]:
In [117]:
sns.jointplot("CRIM","MEDV", data=df2)
Out[117]:
In [118]:
sns.jointplot("LOGCRIM","MEDV", data=df3)
Out[118]:
In [119]:
sns.jointplot("DIS","MEDV", data=df2)
Out[119]:
In [120]:
sns.jointplot("LOGDIS","MEDV", data=df3)
Out[120]:
In [121]:
sns.jointplot("LSTAT","MEDV", data=df2)
Out[121]:
In [122]:
sns.jointplot("LOGLSTAT","MEDV", data=df3)
Out[122]:
In [123]:
model3 = sm.OLS(df3.ix[:,-1],df3.ix[:,:-1])
In [124]:
result = model3.fit()
In [125]:
print(result.summary())
In [126]:
scores3 = cross_val_score(LinearRegression(), df3.ix[:,:-1],df3.ix[:,-1],cv=5)
scores3, scores3.mean(), scores3.std()
Out[126]:
In [127]:
# 평균 점수도 높아지고 분산도 적어졌다!!!
In [128]:
# 지금 데이터셋에는 다중공선성이 있다.
In [130]:
# correlation 보기.
sns.heatmap(np.corrcoef(df3.T))
Out[130]:
In [131]:
# t-test 결과만 갖고 잘라내기 (원래는 F-test도 봐야하지만..)
df4 = df3.drop(["ZN","INDUS","AGE","LOGCRIM"],axis=1)
In [132]:
model4 = sm.OLS(df4.ix[:,-1],df4.ix[:,:-1])
result4 = model4.fit()
print(result4.summary())
In [133]:
score4 = cross_val_score(LinearRegression(), df4.ix[:,:-1],df4.ix[:,-1], cv=5)
score4, score4.mean(), score4.std()
Out[133]:
In [140]:
sns.heatmap(np.corrcoef(df4.T), xticklabels=df4.columns, yticklabels=df4.columns, annot=True)
Out[140]:
In [137]:
sns.heatmap?
In [141]:
df5 = df4.drop(["RAD"],axis=1)
In [142]:
model5 = sm.OLS(df5.ix[:,-1],df5.ix[:,:-1])
result5 = model5.fit()
print(result5.summary())
In [143]:
score5 = cross_val_score(LinearRegression(), df5.ix[:,:-1],df5.ix[:,-1], cv=5)
score5, score5.mean(), score5.std()
Out[143]:
In [144]:
df6 = df5.drop(["TAX"],axis=1)
model6 = sm.OLS(df6.ix[:,-1],df6.ix[:,:-1])
result6 = model6.fit()
print(result6.summary())
In [145]:
score6 = cross_val_score(LinearRegression(), df6.ix[:,:-1],df6.ix[:,-1], cv=5)
score6, score6.mean(), score6.std()
Out[145]:
In [146]:
sns.heatmap(np.corrcoef(df6.T), xticklabels=df6.columns, yticklabels=df6.columns, annot=True)
Out[146]:
In [ ]: