Boston Housing Dataset

An exploratory analysis of the Boston Housing dataset: univariate and bivariate views, feature correlations and importances, and a cross-validated random forest baseline with grid-searched hyperparameters.


In [1]:
%pylab inline
pylab.style.use('ggplot')
import pandas as pd
import numpy as np
import seaborn as sns


Populating the interactive namespace from numpy and matplotlib

Getting the Data

The easiest way to get the data is to use the sklearn.datasets.load_boston function. (Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so this cell requires an older version; an alternative loader is sketched below.)


In [2]:
from sklearn.datasets import load_boston

boston_data = load_boston()
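
If you are running scikit-learn 1.2 or newer, load_boston is gone; the original data can be fetched from CMU StatLib instead. The loader below is the snippet scikit-learn's own deprecation notice suggested (it requires network access), followed by a rebuild of the same data_df used in the rest of this notebook:

data_url = "http://lib.stat.cmu.edu/datasets/boston"
# Each record spans two lines in the raw file: odd/even rows hold the two halves.
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
data_df = pd.DataFrame(data, columns=feature_names).assign(target=target)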

In [3]:
dir(boston_data)


Out[3]:
['DESCR', 'data', 'feature_names', 'target']

In [4]:
data_df = pd.DataFrame(boston_data.data, columns=boston_data.feature_names)

In [5]:
data_df = data_df.assign(target=boston_data.target)

In [6]:
data_df.head()


Out[6]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT target
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33 36.2

Univariate Analysis


In [7]:
data_df.shape[1]


Out[7]:
14

In [8]:
_, axes = pylab.subplots(5, 3, figsize=(20, 20))

# One histogram per column; the 5x3 grid leaves one unused slot for 14 columns.
for idx, colname in enumerate(data_df.columns):
    row, col = divmod(idx, 3)
    data_df.loc[:, colname].plot(kind='hist', ax=axes[row][col], title=colname)
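
The same grid can be produced in a single call with pandas' built-in helper (an equivalent sketch; the default of 10 bins matches the per-column plots above):

data_df.hist(figsize=(20, 20), layout=(5, 3))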


Bivariate Analysis


In [9]:
_, axes = pylab.subplots(5, 3, figsize=(20, 20))

# Scatter each feature against the target with a fitted regression line.
for idx, colname in enumerate(data_df.columns.drop('target')):
    row, col = divmod(idx, 3)
    sns.regplot(data=data_df, x=colname, y='target', ax=axes[row][col])


Feature Correlations


In [10]:
# Pearson correlation of each feature with the target.
f_corrs = data_df.drop('target', axis=1).corrwith(data_df.target)
f_corrs.plot(kind='bar')


Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x2314581aa20>
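
The bar plot mixes positive and negative bars; ranking by absolute value makes the strongest linear relationships easier to spot (a small follow-up reusing the f_corrs series above):

f_corrs.abs().sort_values(ascending=False).plot(kind='bar')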

In [11]:
_, ax = pylab.subplots(1, 1, figsize=(12, 10))
corrs = data_df.drop('target', axis=1).corr()
sns.heatmap(corrs, annot=True, ax=ax)


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x231458f4a58>

Feature Importances


In [12]:
from sklearn.feature_selection import mutual_info_regression

# Mutual information also picks up non-linear dependence that Pearson correlation misses.
importances = mutual_info_regression(data_df.drop('target', axis=1), data_df.target)
importances = pd.Series(data=importances, index=data_df.columns.drop('target'))

importances.sort_values(ascending=True).plot(kind='barh')


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x23144f602b0>
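
Since Pearson correlation measures only linear association while mutual information captures non-linear dependence as well, putting the two rankings side by side is instructive (a sketch reusing the f_corrs and importances series computed above):

comparison = pd.DataFrame({'abs_corr': f_corrs.abs(),
                           'mutual_info': importances})
comparison.sort_values('mutual_info', ascending=False)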

Random Forest Regressor


In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [14]:
def score_with_max_features(k):
    """Cross-validated R^2 scores for a forest considering at most k features per split."""

    model = RandomForestRegressor(n_estimators=50, max_features=k)
    features = data_df.drop('target', axis=1)
    target = data_df.target

    cv = KFold(n_splits=10, shuffle=True, random_state=1234) 

    scores = cross_val_score(estimator=model, 
                             X=features, 
                             y=target, 
                             cv=cv,
                             scoring='r2')

    scores = pd.Series(scores)    
    return scores

In [15]:
scores = {k: score_with_max_features(k) for k in (5, 9, 13)}
scores = pd.concat(scores, axis=1)

In [16]:
scores.plot(kind='bar')


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x23144a9c9b0>

In [17]:
scores.mean().plot(kind='bar')


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x23145068358>

In [18]:
scores.T


Out[18]:
0 1 2 3 4 5 6 7 8 9
5 0.874687 0.915860 0.891514 0.829263 0.920731 0.872104 0.930854 0.910429 0.815378 0.671340
9 0.882683 0.921308 0.843245 0.832220 0.902560 0.794787 0.938194 0.903834 0.792211 0.693847
13 0.882219 0.923986 0.830504 0.818592 0.905698 0.740840 0.937183 0.853072 0.764192 0.664189

In [19]:
scores.mean()


Out[19]:
5     0.863216
9     0.850489
13    0.832048
dtype: float64
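
The means alone hide fold-to-fold spread; a quick look at both (sketch):

scores.agg(['mean', 'std']).T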

Hyperparameter Tuning with Grid Search


In [20]:
from sklearn.model_selection import GridSearchCV

In [21]:
model = RandomForestRegressor()

param_grid = {
    'n_estimators' : [100, 120, 140, 160, 180, 200],
    'max_depth'    : [10],
    'max_features' : [5],    
}

cv = KFold(n_splits=10, shuffle=True, random_state=1234) 

grid_search = GridSearchCV(model, param_grid, cv=cv, verbose=1)

features = data_df.drop('target', axis=1)
target = data_df.target

grid_search = grid_search.fit(X=features, y=target)


Fitting 10 folds for each of 6 candidates, totalling 60 fits
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:   24.3s finished

In [22]:
grid_search.best_params_


Out[22]:
{'max_depth': 10, 'max_features': 5, 'n_estimators': 100}

In [23]:
grid_search.best_score_


Out[23]:
0.8699525055696834
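
With refit=True (the default), GridSearchCV retrains the best configuration on the full data, so its impurity-based feature importances can be inspected directly (a sketch reusing the features frame from the grid-search cell):

best_model = grid_search.best_estimator_
pd.Series(best_model.feature_importances_, index=features.columns) \
    .sort_values().plot(kind='barh')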
