In [1]:
import os
import tarfile
from six.moves import urllib
DOWNLOAD_ROOT="https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH="datasets/housing"
HOUSING_URL=DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"
def fetch_housing_data(housing_url = HOUSING_URL, housing_path = HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    print("Downloaded file.")
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path = housing_path)
    print("Extracted archive.")
    housing_tgz.close()
    print("Done.")
fetch_housing_data()
In [2]:
import pandas as pd
def load_housing_data(housing_path = HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
housing = load_housing_data()
housing.head()
Out[2]:
In [3]:
housing.info()
In [4]:
housing["ocean_proximity"].head()
Out[4]:
In [5]:
housing["ocean_proximity"].value_counts()
Out[5]:
In [6]:
housing.describe()
Out[6]:
In [7]:
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
plt.show()
In [8]:
import hashlib
import numpy as np # erratum, was missing
def test_set_check(identifier, test_ratio, hash):
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio
def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
    return data.loc[~in_test_set], data.loc[in_test_set]
# Using this approach, we need to make sure that new data always gets appended
# at the end of the dataset and that no row is ever deleted; a more stable ID
# built from immutable features is sketched after this cell.
housing_with_id = housing.reset_index() # add id column
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")
print("Train set:")
train_set.info()
print("Test set:")
test_set.info()
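# Sketch (assumption, not part of the cell above): the row index only works as
# an ID if new data is always appended and rows are never deleted. A more
# stable ID can be built from features that never change, e.g. a district's
# longitude and latitude, and fed to the same split helper.
housing_with_id["id"] = housing_with_id["longitude"] * 1000 + housing_with_id["latitude"]
id_train_set, id_test_set = split_train_test_by_id(housing_with_id, 0.2, "id")
print(len(id_train_set), "train /", len(id_test_set), "test")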
In [9]:
# Create income categories to be able to use stratified sampling
# across data (avoid sampling bias).
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)
housing["income_cat"].hist(bins=5, figsize=(8,8))
plt.show()
In [10]:
# Create stratified test / train set split using income categories
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
housing["income_cat"].value_counts() / len(housing)
Out[10]:
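# Sketch (assumption): compare the income-category proportions of the
# stratified test set with those of the full dataset to see how closely the
# stratified split preserves the distribution.
compare = pd.DataFrame({
    "overall": housing["income_cat"].value_counts() / len(housing),
    "stratified": strat_test_set["income_cat"].value_counts() / len(strat_test_set),
})
compare["% error"] = 100 * compare["stratified"] / compare["overall"] - 100
print(compare)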
In [11]:
# Remove temporary income category from sets
for set_ in (strat_test_set, strat_train_set):
    set_.drop(["income_cat"], axis=1, inplace=True)
In [12]:
# Create working set and visualize
%matplotlib inline
housing = strat_train_set.copy()
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1, figsize=(12,10))
Out[12]:
In [13]:
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
             s=housing["population"]/100, label="population",
             c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
             figsize=(12,10))
plt.legend()
Out[13]:
In [14]:
# Check correlation between columns
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
Out[14]:
In [15]:
# Create scatter matrix to visually check for correlations
from pandas.plotting import scatter_matrix  # pandas.tools.plotting in older pandas versions
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))
Out[15]:
In [16]:
housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1, figsize=(8,6))
Out[16]:
In [17]:
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
Out[17]:
In [18]:
# reset data
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
# Now handle n/a values using one of the following:
#housing.dropna(subset=["total_bedrooms"])   # 1) get rid of the affected districts
#housing.drop("total_bedrooms", axis=1)      # 2) get rid of the whole attribute
#median = housing["total_bedrooms"].median()
#housing["total_bedrooms"].fillna(median)    # 3) fill with the median value
# This is option 3) using scikit-learn's Imputer
from sklearn.preprocessing import Imputer
housing_num = housing.drop("ocean_proximity", axis=1)
imputer = Imputer(strategy="median")
imputer.fit(housing_num)
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns)
housing_tr.info()
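# Sketch (assumption): the medians learned by the imputer are kept in its
# statistics_ attribute and should match the medians computed directly on the
# DataFrame.
print("Imputer medians:   ", imputer.statistics_)
print("DataFrame medians: ", housing_num.median().values)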
In [19]:
# Use one-hot encoding for the ocean_proximity attribute
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
housing_cat = housing["ocean_proximity"]
housing_cat_1hot = encoder.fit_transform(housing_cat)
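# Sketch (assumption): LabelBinarizer keeps the category names in classes_,
# one per output column; housing_cat_1hot is a dense NumPy array here
# (sparse_output=True would return a SciPy sparse matrix instead).
print(encoder.classes_)
print(housing_cat_1hot[:5])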
In [20]:
# Custom class for adding further attributes
from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)
housing_extra_attribs
Out[20]:
In [21]:
# Creating a full pipeline with all transformations so far
# 1) Helper class to extract only given attributes
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X[self.attribute_names].values
# 2) Define pipeline with interim steps
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
num_pipeline = Pipeline([
("selector", DataFrameSelector(num_attribs)), # select only numeric attributes
("imputer", Imputer(strategy="median")), # fill n/a values with median
("attribs_added", CombinedAttributesAdder()), # add derived attributes
("std_scaler", StandardScaler()), # scale values to [-1,1]
])
cat_pipeline = Pipeline([
("selector", DataFrameSelector(cat_attribs)), # select only category attributes
("label_binarizer", LabelBinarizer()), # convert to 0/1 flags in columns
])
full_pipeline = FeatureUnion(transformer_list=[
("num_pipeline", num_pipeline),
("cat_pipeline", cat_pipeline),
])
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared.shape
Out[21]:
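# Sketch (assumption, not required for the results above): with scikit-learn
# >= 0.20 the same preparation can be written without DataFrameSelector,
# using ColumnTransformer, SimpleImputer and OneHotEncoder. Kept commented out
# because this notebook runs against the older API (Imputer, LabelBinarizer).
#from sklearn.compose import ColumnTransformer
#from sklearn.impute import SimpleImputer
#from sklearn.preprocessing import OneHotEncoder
#num_pipeline_new = Pipeline([
#    ("imputer", SimpleImputer(strategy="median")),
#    ("attribs_added", CombinedAttributesAdder()),
#    ("std_scaler", StandardScaler()),
#])
#full_pipeline_new = ColumnTransformer([
#    ("num", num_pipeline_new, num_attribs),
#    ("cat", OneHotEncoder(), cat_attribs),
#])
#housing_prepared_new = full_pipeline_new.fit_transform(housing)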
In [22]:
# Try linear regression as a first model
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data) # NO fit()!
print("Predictions:\t", lin_reg.predict(some_data_prepared))
print("Labels:\t\t", list(some_labels))
# calculate the RMSE (root mean squared error)
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
print("RMSE (linear): ", lin_rmse, " (calculated across the full training set, not only the sample above)")
In [23]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
print("RMSE (tree): ", tree_rmse, " (obviously overfitting)")
In [24]:
from sklearn.model_selection import cross_val_score
tree_scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-tree_scores)
def display_scores(scores):
print("Scores: ", scores)
print("Mean: ", scores.mean())
print("Standard deviation: ", scores.std())
print("Decision tree: ")
display_scores(tree_rmse_scores)
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
print("Linear regression: ")
display_scores(lin_rmse_scores)
In [25]:
# Next, train a RandomForestRegressor the same way as lin_reg and tree_reg above (p. 70/71)
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
print("RMSE (forest): ", forest_rmse)
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
print("Forest scores: ")
display_scores(forest_rmse_scores)
In [26]:
# If you want to save the learned model (p.71), use pickle or joblib
from sklearn.externals import joblib
# save via
#joblib.dump(lin_reg, "lin_reg.pkl")
# restore it via
#lin_reg_loaded = joblib.load("lin_reg.pkl")
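# Sketch (assumption): dumping the fitted preprocessing pipeline together with
# the model keeps the two in sync when the bundle is reloaded elsewhere.
# (Newer scikit-learn versions drop sklearn.externals; use "import joblib".)
#joblib.dump({"pipeline": full_pipeline, "model": lin_reg}, "lin_reg_bundle.pkl")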
In [27]:
# (p. 72) Using grid-search to fine-tune parameters for model
from sklearn.model_selection import GridSearchCV
param_grid = [
    { 'n_estimators' : [3, 10, 30], 'max_features' : [2, 4, 6, 8] },
    { 'bootstrap' : [False], 'n_estimators' : [3, 10], 'max_features' : [2, 3, 4] }
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)
print("Grid search, best params:\n--------------", grid_search.best_params_)
print("Grid search, best estimator:\n--------------", grid_search.best_estimator_)
print("Grid search, cross-evaluation scores:\n--------------")
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
#joblib.dump(grid_search, "grid_search.pkl")
Out[27]:
In [28]:
# TODO: fine-tune the model parameters further, or use RandomizedSearchCV instead of GridSearchCV (see the sketch after this cell)
# (p. 74) analyze best model features, here: median_income + INLAND + others
feature_importances = grid_search.best_estimator_.feature_importances_
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_one_hot_attribs = list(encoder.classes_)
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)
Out[28]:
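# Sketch for the RandomizedSearchCV TODO above (assumption: the parameter
# ranges and n_iter are illustrative, not tuned). It samples random parameter
# combinations instead of exhaustively trying every grid entry.
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
param_distribs = {
    "n_estimators": randint(low=1, high=200),
    "max_features": randint(low=1, high=8),
}
rnd_search = RandomizedSearchCV(RandomForestRegressor(random_state=42),
                                param_distributions=param_distribs,
                                n_iter=10, cv=5,
                                scoring="neg_mean_squared_error",
                                random_state=42)
rnd_search.fit(housing_prepared, housing_labels)
print("Randomized search, best params:", rnd_search.best_params_)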
In [29]:
# TODO: drop the less relevant features from the model here (see the sketch after this cell).
# (p. 75) After having fine-tuned the model etc, evaluate on the test set
final_model = grid_search.best_estimator_
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()
X_test_prepared = full_pipeline.transform(X_test) # DO NOT fit()!
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print("Final RMSE against test set: ", final_rmse)
In [30]:
print("Done! Next: cleanup, consolidate, automate, vary, try out on test set from other source.")
In [ ]: