In [1]:
# Notebook-wide imports. Later cells use pd, np, sm and sns, none of which is
# imported anywhere in this notebook (presumably they came from an IPython
# startup profile). Importing them here lets the notebook run on a fresh kernel.
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.datasets import load_boston

# boston data loading
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this cell needs
# an older scikit-learn -- confirm the pinned environment.
boston_data = load_boston()
X = boston_data.data    # feature matrix
Y = boston_data.target  # regression target, stored later as the "MEDV" column
In [2]:
# scaling
# with_mean=False: features are scaled by their standard deviation but are not
# centred.
# NOTE(review): presumably chosen so the scaled values stay positive for the
# later np.log transforms of CRIM, DIS and LSTAT -- confirm.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(with_mean=False)
In [3]:
# Remember the column labels, then overwrite X with its scaled version
# (fit the scaler on X and apply it to the same data).
feature = boston_data.feature_names
X = scaler.fit(X).transform(X)
In [4]:
# Wrap the scaled features and the target in labelled DataFrames.
dfX = pd.DataFrame(data=X, columns=feature)
dfY = pd.DataFrame({"MEDV": Y})
In [5]:
# Put features and target side by side; MEDV ends up as the last column,
# which the later `[:, :-1]` / `[:, -1]` splits depend on.
df_boston = dfX.join(dfY)
In [6]:
# Add the intercept column expected by statsmodels OLS (add_constant prepends
# a "const" column by default). MEDV stays the last column of the frame.
df_boston = sm.add_constant(df_boston)
In [7]:
# Peek at the first row to check the column layout (constant, features, MEDV).
df_boston.head(1)
Out[7]:
In [8]:
# Ordinary least squares via scikit-learn, with default settings.
from sklearn.linear_model import LinearRegression
model = LinearRegression()
In [9]:
# Fit on every column except the last (features plus the constant column)
# against the last column (MEDV).
# `.ix` was removed in pandas 1.0; `.iloc` is the positional equivalent here.
# NOTE(review): the design matrix still contains the "const" column while
# LinearRegression fits its own intercept by default -- confirm this is intended.
result = model.fit(df_boston.iloc[:, :-1], df_boston.iloc[:, -1])
In [10]:
# Coefficients of the fitted scikit-learn model, one per design-matrix column.
result.coef_
Out[10]:
In [11]:
# Intercept estimated by the scikit-learn model.
result.intercept_
Out[11]:
In [ ]:
In [12]:
# Same regression with statsmodels OLS, which also reports standard errors,
# p-values and fit diagnostics. OLS takes (endog, exog): target first, then
# the design matrix (which already contains the "const" column).
# `.ix` was removed in pandas 1.0; `.iloc` selects the same columns by position.
model1 = sm.OLS(df_boston.iloc[:, -1], df_boston.iloc[:, :-1])
result1 = model1.fit()
print(result1.summary())
In [13]:
# Plot the distribution of the target (MEDV).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases
# (histplot/displot replace it) -- confirm against the installed version.
sns.distplot(df_boston.MEDV)
Out[13]:
In [14]:
# Remove every row whose MEDV equals the maximum observed value, then show
# the first remaining row.
medv_cap = df_boston.MEDV.max()
capped_rows = df_boston.loc[df_boston.MEDV >= medv_cap].index
df_boston2 = df_boston.drop(capped_rows)
df_boston2.head(1)
Out[14]:
In [15]:
# Refit the statsmodels OLS on the data with the max-MEDV rows removed.
# `.ix` was removed in pandas 1.0; `.iloc` selects the same columns by position
# (last column = MEDV target, all others = design matrix).
model2 = sm.OLS(df_boston2.iloc[:, -1], df_boston2.iloc[:, :-1])
result2 = model2.fit()
print(result2.summary())
In [16]:
# Bind the `sm` alias this cell actually uses; a bare `import statsmodels.api`
# only binds the name `statsmodels`.
import statsmodels.api as sm

# ANOVA for MEDV explained by INDUS plus CHAS treated as a categorical factor
# (the C(...) wrapper in the formula). The formula adds its own intercept.
model_anova = sm.OLS.from_formula("MEDV ~ INDUS + C(CHAS)", data=df_boston2)
result_anova = model_anova.fit()
table_anova = sm.stats.anova_lm(result_anova)
table_anova
Out[16]:
In [ ]:
In [17]:
# Fresh scikit-learn estimator for the data with the max-MEDV rows removed.
sk_model2 = LinearRegression()
In [18]:
# Fit on all columns but the last (design matrix) against the last (MEDV),
# then display the coefficients.
# `.ix` was removed in pandas 1.0; `.iloc` is the positional equivalent here.
sk_result = sk_model2.fit(df_boston2.iloc[:, :-1], df_boston2.iloc[:, -1])
sk_result.coef_
Out[18]:
In [19]:
# Intercept of the model fitted on the data with the max-MEDV rows removed.
sk_result.intercept_
Out[19]:
In [20]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# cross_val_score now lives in `sklearn.model_selection`.
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the linear model on the data with the max-MEDV
# rows removed. `.iloc` replaces the removed `.ix` indexer (all columns but the
# last = design matrix, last column = MEDV).
score = cross_val_score(sk_model2, df_boston2.iloc[:, :-1], df_boston2.iloc[:, -1], cv=5)
In [21]:
# Per-fold scores from the 5-fold cross-validation above.
score
Out[21]:
In [22]:
# Average score across the folds.
score.mean()
Out[22]:
In [23]:
# Spread of the score across the folds.
score.std()
Out[23]:
In [24]:
# Re-check the column layout before building the log-transformed frame.
df_boston2.head(1)
Out[24]:
In [25]:
# Build df_boston3: replace CRIM, DIS and LSTAT with their natural logs and
# re-append MEDV so the target is once again the last column.
# The log columns and MEDV are taken from dfX / dfY; the assignment aligns
# them on the row index of df_boston3.
log_cols = ["CRIM", "DIS", "LSTAT"]
df_boston3 = df_boston2.drop(log_cols + ["MEDV"], axis=1)
for col in log_cols:
    df_boston3[col] = np.log(dfX[col])
df_boston3["MEDV"] = dfY["MEDV"]
In [26]:
# statsmodels OLS on the frame with log-transformed CRIM, DIS and LSTAT.
# `.ix` was removed in pandas 1.0; `.iloc` selects the same columns by position
# (last column = MEDV target, all others = design matrix).
# Note: this rebinds `result`, which previously held the scikit-learn fit.
model3 = sm.OLS(df_boston3.iloc[:, -1], df_boston3.iloc[:, :-1])
result = model3.fit()
print(result.summary())
In [27]:
# 5-fold cross-validation of a linear model on the log-transformed frame.
# `.ix` was removed in pandas 1.0; `.iloc` is the positional equivalent here.
sk_model3 = LinearRegression()
scores3 = cross_val_score(sk_model3, df_boston3.iloc[:, :-1], df_boston3.iloc[:, -1], cv=5)
In [28]:
# Per-fold scores for the log-transformed feature set.
scores3
Out[28]:
In [29]:
# Average score across the folds.
scores3.mean()
Out[29]:
In [30]:
# Spread of the score across the folds.
scores3.std()
Out[30]:
In [31]:
# Correlation heatmap of the features and the target.
# The "const" column added for OLS has zero variance, so np.corrcoef yields NaN
# for its row and column (with a divide warning); leave it out of the plot.
# errors="ignore" keeps the cell working if the column is absent.
corr_frame = df_boston3.drop("const", axis=1, errors="ignore")
sns.heatmap(np.corrcoef(corr_frame.T), xticklabels=corr_frame.columns, yticklabels=corr_frame.columns, annot=True)
Out[31]:
In [32]:
# Drop ZN, INDUS, AGE and CRIM from the model frame.
# NOTE(review): presumably selected from the OLS summary / correlation heatmap
# above -- confirm the selection criterion.
# df_boston3 is reassigned from itself, so a second execution of this cell used
# to raise KeyError (columns already gone); errors="ignore" makes it re-runnable.
df_boston3 = df_boston3.drop(["ZN", "INDUS", "AGE", "CRIM"], axis=1, errors="ignore")
In [34]:
# statsmodels OLS on the reduced feature set.
# `.ix` was removed in pandas 1.0; `.iloc` selects the same columns by position
# (last column = MEDV target, all others = design matrix).
# Note: this rebinds `model` and `result`, which held earlier fits.
model = sm.OLS(df_boston3.iloc[:, -1], df_boston3.iloc[:, :-1])
result = model.fit()
print(result.summary())
In [35]:
# 5-fold cross-validation of a linear model on the reduced feature set.
# `.ix` was removed in pandas 1.0; `.iloc` is the positional equivalent here.
# Note: `result` is rebound to the array of fold scores.
sk_model = LinearRegression()
result = cross_val_score(sk_model, df_boston3.iloc[:, :-1], df_boston3.iloc[:, -1], cv=5)
In [36]:
# Per-fold scores for the reduced feature set (`result` now holds the
# cross_val_score output, not a fitted model).
result
Out[36]:
In [37]:
# Average score across the folds.
result.mean()
Out[37]:
In [38]:
# Spread of the score across the folds.
result.std()
Out[38]:
In [43]:
# Correlation heatmap of the current df_boston3 columns.
# The "const" column added for OLS has zero variance, so np.corrcoef yields NaN
# for its row and column (with a divide warning); leave it out of the plot.
# errors="ignore" keeps the cell working if the column is absent.
corr_frame = df_boston3.drop("const", axis=1, errors="ignore")
sns.heatmap(np.corrcoef(corr_frame.T), xticklabels=corr_frame.columns, yticklabels=corr_frame.columns, annot=True)
Out[43]:
In [ ]: