In [2]:
import os
import tarfile
from six.moves import urllib

DL_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DL_ROOT + HOUSING_PATH + "/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

In [3]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

In [4]:
fetch_housing_data()

In [5]:
housing = load_housing_data()

In [7]:
housing.head()


Out[7]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY

In [8]:
housing.describe()


Out[8]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value
count 20640.000000 20640.000000 20640.000000 20640.000000 20433.000000 20640.000000 20640.000000 20640.000000 20640.000000
mean -119.569704 35.631861 28.639486 2635.763081 537.870553 1425.476744 499.539680 3.870671 206855.816909
std 2.003532 2.135952 12.585558 2181.615252 421.385070 1132.462122 382.329753 1.899822 115395.615874
min -124.350000 32.540000 1.000000 2.000000 1.000000 3.000000 1.000000 0.499900 14999.000000
25% -121.800000 33.930000 18.000000 1447.750000 296.000000 787.000000 280.000000 2.563400 119600.000000
50% -118.490000 34.260000 29.000000 2127.000000 435.000000 1166.000000 409.000000 3.534800 179700.000000
75% -118.010000 37.710000 37.000000 3148.000000 647.000000 1725.000000 605.000000 4.743250 264725.000000
max -114.310000 41.950000 52.000000 39320.000000 6445.000000 35682.000000 6082.000000 15.000100 500001.000000

In [9]:
%matplotlib inline
import matplotlib.pyplot as plt

In [11]:
housing.hist(bins=50, figsize=(20,15))
plt.show()



In [12]:
import numpy as np

def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [15]:
train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), "train +", len(test_set), "test")


16512 train + 4128 test

In [20]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [24]:
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.1)


Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x2a15c0615f8>

In [25]:
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.4, s=housing['population']/100, label='population', c='median_house_value', cmap=plt.get_cmap('jet'), colorbar=True)
plt.legend()


Out[25]:
<matplotlib.legend.Legend at 0x2a159e583c8>

In [27]:
corr_matrix = housing.corr()

In [28]:
corr_matrix['median_house_value'].sort_values(ascending=False)


Out[28]:
median_house_value    1.000000
median_income         0.688075
total_rooms           0.134153
housing_median_age    0.105623
households            0.065843
total_bedrooms        0.049686
population           -0.024650
longitude            -0.045967
latitude             -0.144160
Name: median_house_value, dtype: float64

In [31]:
from pandas.plotting import scatter_matrix

attributes = ['median_house_value', 'median_income', 'total_rooms', 'housing_median_age']

scatter_matrix(housing[attributes], figsize=(12,8))


Out[31]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000002A158F773C8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002A159F3BA90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002A159EA7A58>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002A159DC50B8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000002A159744C88>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002A159909128>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002A15A0A22E8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002A1594FB8D0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000002A1594FBD68>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002A159D3C7F0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002A159D46D68>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002A159D62320>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000002A159C3B898>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002A1599CBE10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002A159ED83C8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002A159EDC940>]],
      dtype=object)

In [106]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy = 'median')

housing_num = housing.drop(['ocean_proximity','median_house_value'], axis=1)

imputer.fit(housing_num)

X = imputer.transform(housing_num)

X


Out[106]:
array([[-1.2223e+02,  3.7880e+01,  4.1000e+01, ...,  3.2200e+02,
         1.2600e+02,  8.3252e+00],
       [-1.2222e+02,  3.7860e+01,  2.1000e+01, ...,  2.4010e+03,
         1.1380e+03,  8.3014e+00],
       [-1.2224e+02,  3.7850e+01,  5.2000e+01, ...,  4.9600e+02,
         1.7700e+02,  7.2574e+00],
       ...,
       [-1.2122e+02,  3.9430e+01,  1.7000e+01, ...,  1.0070e+03,
         4.3300e+02,  1.7000e+00],
       [-1.2132e+02,  3.9430e+01,  1.8000e+01, ...,  7.4100e+02,
         3.4900e+02,  1.8672e+00],
       [-1.2124e+02,  3.9370e+01,  1.6000e+01, ...,  1.3870e+03,
         5.3000e+02,  2.3886e+00]])

In [141]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

encoder = LabelEncoder()

housing_cat = housing['ocean_proximity']

housing_cat_encoded = encoder.fit_transform(housing_cat)

print(housing_cat_encoded)

print(encoder.classes_)

encoder = OneHotEncoder(categories='auto')

housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot.toarray()


[3 3 3 ... 1 1 1]
['<1H OCEAN' 'INLAND' 'ISLAND' 'NEAR BAY' 'NEAR OCEAN']
Out[141]:
array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [142]:
from sklearn.preprocessing import LabelBinarizer

encoder = LabelBinarizer()

housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot


Out[142]:
array([[0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       ...,
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0]])

In [78]:
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=True)

housing_extra_attribs = attr_adder.transform(housing.values)

print(housing_extra_attribs)


[[-122.23 37.88 41.0 ... 6.984126984126984 2.5555555555555554
  0.14659090909090908]
 [-122.22 37.86 21.0 ... 6.238137082601054 2.109841827768014
  0.15579659106916466]
 [-122.24 37.85 52.0 ... 8.288135593220339 2.8022598870056497
  0.12951601908657123]
 ...
 [-121.22 39.43 17.0 ... 5.20554272517321 2.325635103926097
  0.21517302573203195]
 [-121.32 39.43 18.0 ... 5.329512893982808 2.1232091690544412
  0.21989247311827956]
 [-121.24 39.37 16.0 ... 5.254716981132075 2.616981132075472
  0.22118491921005387]]

In [79]:
# Transformation Pipelines

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

housing_num_tr = num_pipeline.fit_transform(housing_num)
print(housing_num_tr.min(axis=1), housing_num_tr.max(axis=1))


[-1.32783522 -1.32284391 -1.33282653 ... -1.14259331 -1.05860847
 -1.01787803] [2.34476576 2.33223796 1.85618152 ... 1.77823747 1.77823747 1.75014627]

In [107]:
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X[self.attribute_names].values

class CustomLabelBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self, sparse_output=False):
        self.sparse_output = sparse_output
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        enc = LabelBinarizer(sparse_output=self.sparse_output)
        return enc.fit_transform(X)



num_attribs = list(housing_num)
cat_attribs = ['ocean_proximity']

print(num_attribs)

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_binarizer', CustomLabelBinarizer()),
])

full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

housing_prepared = full_pipeline.fit_transform(housing)

housing_prepared


['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']
Out[107]:
array([[-1.32783522,  1.05254828,  0.98214266, ...,  0.        ,
         1.        ,  0.        ],
       [-1.32284391,  1.04318455, -0.60701891, ...,  0.        ,
         1.        ,  0.        ],
       [-1.33282653,  1.03850269,  1.85618152, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.8237132 ,  1.77823747, -0.92485123, ...,  0.        ,
         0.        ,  0.        ],
       [-0.87362627,  1.77823747, -0.84539315, ...,  0.        ,
         0.        ,  0.        ],
       [-0.83369581,  1.75014627, -1.00430931, ...,  0.        ,
         0.        ,  0.        ]])

In [116]:
from sklearn.linear_model import LinearRegression

housing_X = housing.drop(["median_house_value"], axis=1)
housing_Y = housing["median_house_value"].values

print(housing_Y)

housing_prepared = full_pipeline.fit_transform(housing_X)


lin_reg = LinearRegression()

lin_reg.fit(housing_prepared, housing_Y)


from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_Y, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse


[452600. 358500. 352100. ...  92300.  84700.  89400.]
Out[116]:
68286.12607251322

In [120]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()

tree_reg.fit(housing_prepared, housing_Y)

housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_Y, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
print(tree_rmse)


0.0

In [123]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, housing_prepared, housing_Y, scoring='neg_mean_squared_error', cv=10)

rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    print('scores:', scores)
    print('mean:', scores.mean())
    print('std:', scores.std())

display_scores(rmse_scores)


scores: [121333.0112734   71419.78892793  85094.59633577  73819.63348344
  89581.3000502   77516.89924067  68190.9987994  101509.94460809
  92974.45416405  71805.53619816]
mean: 85324.6163081105
std: 15805.876505502598

In [125]:
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_Y, scoring='neg_mean_squared_error', cv=10)

lin_rmse_scores = np.sqrt(-lin_scores)

display_scores(lin_rmse_scores)


scores: [84183.66301514 61191.52853899 86743.60959739 62286.73445075
 80537.25795828 68918.58661112 52504.86407192 90904.22793667
 77675.08903006 53940.95369716]
mean: 71888.65149074617
std: 13247.67185583079

In [140]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators = 10)

forest_scores = cross_val_score(forest_reg, housing_prepared, housing_Y, scoring='neg_mean_squared_error', cv=10)

forest_rmse_scores = np.sqrt(-forest_scores)

display_scores(forest_rmse_scores)


scores: [97315.74872864 51974.93587501 68340.41993304 59494.25222907
 63209.00536231 62462.35967189 48357.97722972 80364.95096352
 77324.07599552 52186.86195507]
mean: 66103.05879438028
std: 14429.169983361297

In [128]:
# Fine-Tune Your Model

from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3,10,30], 'max_features': [2,4,6,8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2,3,4]},
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

grid_search.fit(housing_prepared, housing_Y)

grid_search.best_params_


Out[128]:
{'max_features': 4, 'n_estimators': 30}

In [130]:
grid_search.best_estimator_


Out[130]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=4, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=30,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [131]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)


81289.08766553094 {'max_features': 2, 'n_estimators': 3}
73158.21964765945 {'max_features': 2, 'n_estimators': 10}
69570.12652864677 {'max_features': 2, 'n_estimators': 30}
76662.59265367892 {'max_features': 4, 'n_estimators': 3}
70601.43435703327 {'max_features': 4, 'n_estimators': 10}
67811.30074349103 {'max_features': 4, 'n_estimators': 30}
79993.3334121623 {'max_features': 6, 'n_estimators': 3}
71821.2423340115 {'max_features': 6, 'n_estimators': 10}
67980.20110627243 {'max_features': 6, 'n_estimators': 30}
76038.93736973275 {'max_features': 8, 'n_estimators': 3}
70625.69804067428 {'max_features': 8, 'n_estimators': 10}
68404.89243928483 {'max_features': 8, 'n_estimators': 30}
82011.97781532306 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
71441.53551586885 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
80217.00255734805 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
72240.88881926336 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
78232.55070841077 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}
70128.10670649988 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}

In [136]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

extra_attribs = ['rooms_per_hhold', 'pop_per_hhold', 'bedrooms_per_room']

cat_one_hot_attribs = list(encoder.classes_)

attributes = num_attribs + extra_attribs + cat_one_hot_attribs

sorted(zip(feature_importances, attributes), reverse=True)


Out[136]:
[(0.29270142067458654, 'median_income'),
 (0.13834075931969428, 'INLAND'),
 (0.10346770966268978, 'pop_per_hhold'),
 (0.08400123962788562, 'latitude'),
 (0.08325908334643219, 'bedrooms_per_room'),
 (0.0789091468434949, 'longitude'),
 (0.07000444912239243, 'rooms_per_hhold'),
 (0.041020803986901645, 'housing_median_age'),
 (0.022083484376280636, 'population'),
 (0.02144979586263664, 'total_rooms'),
 (0.019866964809786548, 'total_bedrooms'),
 (0.019398534554527423, 'households'),
 (0.011759408345777841, '<1H OCEAN'),
 (0.007638554840163659, 'NEAR OCEAN'),
 (0.0059601278500213606, 'NEAR BAY'),
 (0.00013851677672851748, 'ISLAND')]

In [139]:
final_model = grid_search.best_estimator_

X_test = test_set.drop('median_house_value', axis=1)
Y_test = test_set['median_house_value']

X_test_prepared = full_pipeline.fit_transform(X_test)

final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(Y_test, final_predictions)

final_rmse = np.sqrt(final_mse)
print(final_rmse)


54855.264170511706