We will use the Boston Housing Dataset to explore how to build a linear regression model that predicts the median home value (`MEDV`) from the other columns.
In [1]:
%pylab inline
pylab.style.use('ggplot')
import numpy as np
import pandas as pd
In [2]:
# Load the Boston housing dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell presumably needs scikit-learn < 1.2 — confirm the pinned
# version before re-running.
from sklearn.datasets import load_boston
boston = load_boston()
In [3]:
# List the attributes available on the loaded dataset object.
dir(boston)
Out[3]:
In [5]:
# Print the description text stored on the dataset object.
print(boston.DESCR)
In [14]:
# Column labels for the combined table: every feature name, then the
# target column 'MEDV' last.
columns = [*boston.feature_names, 'MEDV']
columns
Out[14]:
In [23]:
# Append the target vector as one extra column to the right of the features.
data = np.c_[boston.data, boston.target]
In [24]:
# Wrap the stacked array in a labelled DataFrame (features + 'MEDV').
data_df = pd.DataFrame(data, columns=columns)
In [25]:
# Preview the first rows of the assembled frame.
data_df.head()
Out[25]:
In [27]:
# Check the dtype of every column.
data_df.dtypes
Out[27]:
In [28]:
# Distinct values taken by the RAD column.
data_df['RAD'].unique()
Out[28]:
In [29]:
# Distinct values taken by the CHAS column.
data_df['CHAS'].unique()
Out[29]:
In [36]:
# Summarise MEDV within each CHAS group.
gp_chas = data_df.groupby('CHAS')['MEDV']
# A list of function names is used instead of a {name: func} dict: the dict
# ("nested renamer") form on a SeriesGroupBy was deprecated in pandas 0.20 and
# raises SpecificationError from pandas 1.0 onward. The resulting columns are
# still named 'min', 'max' and 'mean'.
gp_chas.agg(['min', 'max', 'mean'])
Out[36]:
In [37]:
# Summarise MEDV within each RAD group.
gp_rad = data_df.groupby('RAD')['MEDV']
# A list of function names is used instead of a {name: func} dict: the dict
# ("nested renamer") form on a SeriesGroupBy was deprecated in pandas 0.20 and
# raises SpecificationError from pandas 1.0 onward. The resulting columns are
# still named 'min', 'max' and 'mean'.
gp_rad.agg(['min', 'max', 'mean'])
Out[37]:
In [38]:
# Pairwise correlation matrix restricted to CHAS and MEDV.
data_df.loc[:, ['CHAS', 'MEDV']].corr()
Out[38]:
In [39]:
# Scatter of MEDV against CHAS.
data_df.plot.scatter(x='CHAS', y='MEDV')
Out[39]:
In [68]:
# Correlate each remaining column with MEDV; CHAS and RAD are left out here
# (they are examined separately via group-wise summaries), and MEDV itself
# is the target.
predictors = data_df.drop(['CHAS', 'RAD', 'MEDV'], axis=1)
corrs_with_medv = predictors.corrwith(data_df['MEDV'])

# Show the signed correlations ordered by absolute strength, strongest first.
strongest_first = corrs_with_medv.abs().sort_values(ascending=False).index
corrs_with_medv.loc[strongest_first]
Out[68]:
In [41]:
# Scatter of MEDV against LSTAT.
data_df.plot.scatter(x='LSTAT', y='MEDV')
Out[41]:
In [42]:
# Scatter of MEDV against RM.
data_df.plot.scatter(x='RM', y='MEDV')
Out[42]:
In [51]:
# Formula interface of statsmodels; the `sm` alias is reused by later cells.
import statsmodels.formula.api as sm

# Ordinary least squares: MEDV explained by RM and LSTAT.
baseline_model = sm.ols('MEDV ~ RM + LSTAT', data=data_df)
result = baseline_model.fit()
result.summary()
Out[51]:
In [55]:
# Same model, with LSTAT entering on a log scale (np.log is evaluated inside
# the formula).
log_lstat_model = sm.ols('MEDV ~ RM + np.log(LSTAT)', data=data_df)
result = log_lstat_model.fit()
result.summary()
Out[55]:
In [56]:
# Scatter of MEDV against PTRATIO.
data_df.plot.scatter(x='PTRATIO', y='MEDV')
Out[56]:
In [61]:
# Add PTRATIO as a third regressor alongside RM and log(LSTAT).
ptratio_model = sm.ols('MEDV ~ PTRATIO + RM + np.log(LSTAT)', data=data_df)
result = ptratio_model.fit()
result.summary()
Out[61]:
In [63]:
# Histogram of the residuals of the most recently fitted model, 20 bins.
result.resid.plot.hist(bins=20)
Out[63]:
In [ ]: