notebook.community

Edit and run



In [2]:

    
import os
import tarfile
from six.moves import urllib

DL_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DL_ROOT + HOUSING_PATH + "/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()



In [3]:

    
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)



In [4]:

    
fetch_housing_data()



In [5]:

    
housing = load_housing_data()



In [7]:

    
housing.head()









    Out[7]:







  
    
      
      longitude
      latitude
      housing_median_age
      total_rooms
      total_bedrooms
      population
      households
      median_income
      median_house_value
      ocean_proximity
    
  
  
    
      0
      -122.23
      37.88
      41.0
      880.0
      129.0
      322.0
      126.0
      8.3252
      452600.0
      NEAR BAY
    
    
      1
      -122.22
      37.86
      21.0
      7099.0
      1106.0
      2401.0
      1138.0
      8.3014
      358500.0
      NEAR BAY
    
    
      2
      -122.24
      37.85
      52.0
      1467.0
      190.0
      496.0
      177.0
      7.2574
      352100.0
      NEAR BAY
    
    
      3
      -122.25
      37.85
      52.0
      1274.0
      235.0
      558.0
      219.0
      5.6431
      341300.0
      NEAR BAY
    
    
      4
      -122.25
      37.85
      52.0
      1627.0
      280.0
      565.0
      259.0
      3.8462
      342200.0
      NEAR BAY



In [8]:

    
housing.describe()









    Out[8]:







  
    
      
      longitude
      latitude
      housing_median_age
      total_rooms
      total_bedrooms
      population
      households
      median_income
      median_house_value
    
  
  
    
      count
      20640.000000
      20640.000000
      20640.000000
      20640.000000
      20433.000000
      20640.000000
      20640.000000
      20640.000000
      20640.000000
    
    
      mean
      -119.569704
      35.631861
      28.639486
      2635.763081
      537.870553
      1425.476744
      499.539680
      3.870671
      206855.816909
    
    
      std
      2.003532
      2.135952
      12.585558
      2181.615252
      421.385070
      1132.462122
      382.329753
      1.899822
      115395.615874
    
    
      min
      -124.350000
      32.540000
      1.000000
      2.000000
      1.000000
      3.000000
      1.000000
      0.499900
      14999.000000
    
    
      25%
      -121.800000
      33.930000
      18.000000
      1447.750000
      296.000000
      787.000000
      280.000000
      2.563400
      119600.000000
    
    
      50%
      -118.490000
      34.260000
      29.000000
      2127.000000
      435.000000
      1166.000000
      409.000000
      3.534800
      179700.000000
    
    
      75%
      -118.010000
      37.710000
      37.000000
      3148.000000
      647.000000
      1725.000000
      605.000000
      4.743250
      264725.000000
    
    
      max
      -114.310000
      41.950000
      52.000000
      39320.000000
      6445.000000
      35682.000000
      6082.000000
      15.000100
      500001.000000



In [9]:

    
%matplotlib inline
import matplotlib.pyplot as plt



In [11]:

    
housing.hist(bins=50, figsize=(20,15))
plt.show()



In [12]:

    
import numpy as np

def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]



In [15]:

    
train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), "train +", len(test_set), "test")









    



16512 train + 4128 test



In [20]:

    
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)



In [24]:

    
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.1)









    Out[24]:





<matplotlib.axes._subplots.AxesSubplot at 0x2a15c0615f8>



In [25]:

    
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.4, s=housing['population']/100, label='population', c='median_house_value', cmap=plt.get_cmap('jet'), colorbar=True)
plt.legend()









    Out[25]:





<matplotlib.legend.Legend at 0x2a159e583c8>



In [27]:

    
corr_matrix = housing.corr()



In [28]:

    
corr_matrix['median_house_value'].sort_values(ascending=False)









    Out[28]:





median_house_value    1.000000
median_income         0.688075
total_rooms           0.134153
housing_median_age    0.105623
households            0.065843
total_bedrooms        0.049686
population           -0.024650
longitude            -0.045967
latitude             -0.144160
Name: median_house_value, dtype: float64



In [31]:

    
from pandas.plotting import scatter_matrix

attributes = ['median_house_value', 'median_income', 'total_rooms', 'housing_median_age']

scatter_matrix(housing[attributes], figsize=(12,8))









    Out[31]:





array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000002A158F773C8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002A159F3BA90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002A159EA7A58>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002A159DC50B8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000002A159744C88>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002A159909128>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002A15A0A22E8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002A1594FB8D0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000002A1594FBD68>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002A159D3C7F0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002A159D46D68>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002A159D62320>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000002A159C3B898>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002A1599CBE10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002A159ED83C8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002A159EDC940>]],
      dtype=object)



In [106]:

    
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy = 'median')

housing_num = housing.drop(['ocean_proximity','median_house_value'], axis=1)

imputer.fit(housing_num)

X = imputer.transform(housing_num)

X









    Out[106]:





array([[-1.2223e+02,  3.7880e+01,  4.1000e+01, ...,  3.2200e+02,
         1.2600e+02,  8.3252e+00],
       [-1.2222e+02,  3.7860e+01,  2.1000e+01, ...,  2.4010e+03,
         1.1380e+03,  8.3014e+00],
       [-1.2224e+02,  3.7850e+01,  5.2000e+01, ...,  4.9600e+02,
         1.7700e+02,  7.2574e+00],
       ...,
       [-1.2122e+02,  3.9430e+01,  1.7000e+01, ...,  1.0070e+03,
         4.3300e+02,  1.7000e+00],
       [-1.2132e+02,  3.9430e+01,  1.8000e+01, ...,  7.4100e+02,
         3.4900e+02,  1.8672e+00],
       [-1.2124e+02,  3.9370e+01,  1.6000e+01, ...,  1.3870e+03,
         5.3000e+02,  2.3886e+00]])



In [141]:

    
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

encoder = LabelEncoder()

housing_cat = housing['ocean_proximity']

housing_cat_encoded = encoder.fit_transform(housing_cat)

print(housing_cat_encoded)

print(encoder.classes_)

encoder = OneHotEncoder(categories='auto')

housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot.toarray()









    



[3 3 3 ... 1 1 1]
['<1H OCEAN' 'INLAND' 'ISLAND' 'NEAR BAY' 'NEAR OCEAN']






    Out[141]:





array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])



In [142]:

    
from sklearn.preprocessing import LabelBinarizer

encoder = LabelBinarizer()

housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot









    Out[142]:





array([[0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       ...,
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0]])



In [78]:

    
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=True)

housing_extra_attribs = attr_adder.transform(housing.values)

print(housing_extra_attribs)









    



[[-122.23 37.88 41.0 ... 6.984126984126984 2.5555555555555554
  0.14659090909090908]
 [-122.22 37.86 21.0 ... 6.238137082601054 2.109841827768014
  0.15579659106916466]
 [-122.24 37.85 52.0 ... 8.288135593220339 2.8022598870056497
  0.12951601908657123]
 ...
 [-121.22 39.43 17.0 ... 5.20554272517321 2.325635103926097
  0.21517302573203195]
 [-121.32 39.43 18.0 ... 5.329512893982808 2.1232091690544412
  0.21989247311827956]
 [-121.24 39.37 16.0 ... 5.254716981132075 2.616981132075472
  0.22118491921005387]]



In [79]:

    
# Transformation Pipelines

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

housing_num_tr = num_pipeline.fit_transform(housing_num)
print(housing_num_tr.min(axis=1), housing_num_tr.max(axis=1))









    



[-1.32783522 -1.32284391 -1.33282653 ... -1.14259331 -1.05860847
 -1.01787803] [2.34476576 2.33223796 1.85618152 ... 1.77823747 1.77823747 1.75014627]



In [107]:

    
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X[self.attribute_names].values

class CustomLabelBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self, sparse_output=False):
        self.sparse_output = sparse_output
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        enc = LabelBinarizer(sparse_output=self.sparse_output)
        return enc.fit_transform(X)



num_attribs = list(housing_num)
cat_attribs = ['ocean_proximity']

print(num_attribs)

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_binarizer', CustomLabelBinarizer()),
])

full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

housing_prepared = full_pipeline.fit_transform(housing)

housing_prepared









    



['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']






    Out[107]:





array([[-1.32783522,  1.05254828,  0.98214266, ...,  0.        ,
         1.        ,  0.        ],
       [-1.32284391,  1.04318455, -0.60701891, ...,  0.        ,
         1.        ,  0.        ],
       [-1.33282653,  1.03850269,  1.85618152, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.8237132 ,  1.77823747, -0.92485123, ...,  0.        ,
         0.        ,  0.        ],
       [-0.87362627,  1.77823747, -0.84539315, ...,  0.        ,
         0.        ,  0.        ],
       [-0.83369581,  1.75014627, -1.00430931, ...,  0.        ,
         0.        ,  0.        ]])



In [116]:

    
from sklearn.linear_model import LinearRegression

housing_X = housing.drop(["median_house_value"], axis=1)
housing_Y = housing["median_house_value"].values

print(housing_Y)

housing_prepared = full_pipeline.fit_transform(housing_X)


lin_reg = LinearRegression()

lin_reg.fit(housing_prepared, housing_Y)


from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_Y, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse









    



[452600. 358500. 352100. ...  92300.  84700.  89400.]






    Out[116]:





68286.12607251322



In [120]:

    
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()

tree_reg.fit(housing_prepared, housing_Y)

housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_Y, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
print(tree_rmse)

0.0



In [123]:

    
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, housing_prepared, housing_Y, scoring='neg_mean_squared_error', cv=10)

rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    print('scores:', scores)
    print('mean:', scores.mean())
    print('std:', scores.std())

display_scores(rmse_scores)









    



scores: [121333.0112734   71419.78892793  85094.59633577  73819.63348344
  89581.3000502   77516.89924067  68190.9987994  101509.94460809
  92974.45416405  71805.53619816]
mean: 85324.6163081105
std: 15805.876505502598



In [125]:

    
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_Y, scoring='neg_mean_squared_error', cv=10)

lin_rmse_scores = np.sqrt(-lin_scores)

display_scores(lin_rmse_scores)









    



scores: [84183.66301514 61191.52853899 86743.60959739 62286.73445075
 80537.25795828 68918.58661112 52504.86407192 90904.22793667
 77675.08903006 53940.95369716]
mean: 71888.65149074617
std: 13247.67185583079



In [140]:

    
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators = 10)

forest_scores = cross_val_score(forest_reg, housing_prepared, housing_Y, scoring='neg_mean_squared_error', cv=10)

forest_rmse_scores = np.sqrt(-forest_scores)

display_scores(forest_rmse_scores)









    



scores: [97315.74872864 51974.93587501 68340.41993304 59494.25222907
 63209.00536231 62462.35967189 48357.97722972 80364.95096352
 77324.07599552 52186.86195507]
mean: 66103.05879438028
std: 14429.169983361297



In [128]:

    
# Fine-Tune Your Model

from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3,10,30], 'max_features': [2,4,6,8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2,3,4]},
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

grid_search.fit(housing_prepared, housing_Y)

grid_search.best_params_









    Out[128]:





{'max_features': 4, 'n_estimators': 30}



In [130]:

    
grid_search.best_estimator_









    Out[130]:





RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=4, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=30,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)



In [131]:

    
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)









    



81289.08766553094 {'max_features': 2, 'n_estimators': 3}
73158.21964765945 {'max_features': 2, 'n_estimators': 10}
69570.12652864677 {'max_features': 2, 'n_estimators': 30}
76662.59265367892 {'max_features': 4, 'n_estimators': 3}
70601.43435703327 {'max_features': 4, 'n_estimators': 10}
67811.30074349103 {'max_features': 4, 'n_estimators': 30}
79993.3334121623 {'max_features': 6, 'n_estimators': 3}
71821.2423340115 {'max_features': 6, 'n_estimators': 10}
67980.20110627243 {'max_features': 6, 'n_estimators': 30}
76038.93736973275 {'max_features': 8, 'n_estimators': 3}
70625.69804067428 {'max_features': 8, 'n_estimators': 10}
68404.89243928483 {'max_features': 8, 'n_estimators': 30}
82011.97781532306 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
71441.53551586885 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
80217.00255734805 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
72240.88881926336 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
78232.55070841077 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}
70128.10670649988 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}



In [136]:

    
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

extra_attribs = ['rooms_per_hhold', 'pop_per_hhold', 'bedrooms_per_room']

cat_one_hot_attribs = list(encoder.classes_)

attributes = num_attribs + extra_attribs + cat_one_hot_attribs

sorted(zip(feature_importances, attributes), reverse=True)









    Out[136]:





[(0.29270142067458654, 'median_income'),
 (0.13834075931969428, 'INLAND'),
 (0.10346770966268978, 'pop_per_hhold'),
 (0.08400123962788562, 'latitude'),
 (0.08325908334643219, 'bedrooms_per_room'),
 (0.0789091468434949, 'longitude'),
 (0.07000444912239243, 'rooms_per_hhold'),
 (0.041020803986901645, 'housing_median_age'),
 (0.022083484376280636, 'population'),
 (0.02144979586263664, 'total_rooms'),
 (0.019866964809786548, 'total_bedrooms'),
 (0.019398534554527423, 'households'),
 (0.011759408345777841, '<1H OCEAN'),
 (0.007638554840163659, 'NEAR OCEAN'),
 (0.0059601278500213606, 'NEAR BAY'),
 (0.00013851677672851748, 'ISLAND')]



In [139]:

    
final_model = grid_search.best_estimator_

X_test = test_set.drop('median_house_value', axis=1)
Y_test = test_set['median_house_value']

X_test_prepared = full_pipeline.fit_transform(X_test)

final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(Y_test, final_predictions)

final_rmse = np.sqrt(final_mse)
print(final_rmse)









    



54855.264170511706

	longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income	median_house_value	ocean_proximity
0	-122.23	37.88	41.0	880.0	129.0	322.0	126.0	8.3252	452600.0	NEAR BAY
1	-122.22	37.86	21.0	7099.0	1106.0	2401.0	1138.0	8.3014	358500.0	NEAR BAY
2	-122.24	37.85	52.0	1467.0	190.0	496.0	177.0	7.2574	352100.0	NEAR BAY
3	-122.25	37.85	52.0	1274.0	235.0	558.0	219.0	5.6431	341300.0	NEAR BAY
4	-122.25	37.85	52.0	1627.0	280.0	565.0	259.0	3.8462	342200.0	NEAR BAY

	longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income	median_house_value
count	20640.000000	20640.000000	20640.000000	20640.000000	20433.000000	20640.000000	20640.000000	20640.000000	20640.000000
mean	-119.569704	35.631861	28.639486	2635.763081	537.870553	1425.476744	499.539680	3.870671	206855.816909
std	2.003532	2.135952	12.585558	2181.615252	421.385070	1132.462122	382.329753	1.899822	115395.615874
min	-124.350000	32.540000	1.000000	2.000000	1.000000	3.000000	1.000000	0.499900	14999.000000
25%	-121.800000	33.930000	18.000000	1447.750000	296.000000	787.000000	280.000000	2.563400	119600.000000
50%	-118.490000	34.260000	29.000000	2127.000000	435.000000	1166.000000	409.000000	3.534800	179700.000000
75%	-118.010000	37.710000	37.000000	3148.000000	647.000000	1725.000000	605.000000	4.743250	264725.000000
max	-114.310000	41.950000	52.000000	39320.000000	6445.000000	35682.000000	6082.000000	15.000100	500001.000000