In [2]:
import os
import tarfile
from six.moves import urllib
DL_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DL_ROOT + HOUSING_PATH + "/housing.tgz"
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
if not os.path.isdir(housing_path):
os.makedirs(housing_path)
tgz_path = os.path.join(housing_path, "housing.tgz")
urllib.request.urlretrieve(housing_url, tgz_path)
housing_tgz = tarfile.open(tgz_path)
housing_tgz.extractall(path=housing_path)
housing_tgz.close()
In [3]:
import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
csv_path = os.path.join(housing_path, "housing.csv")
return pd.read_csv(csv_path)
In [4]:
fetch_housing_data()
In [5]:
housing = load_housing_data()
In [7]:
housing.head()
Out[7]:
In [8]:
housing.describe()
Out[8]:
In [9]:
%matplotlib inline
import matplotlib.pyplot as plt
In [11]:
housing.hist(bins=50, figsize=(20,15))
plt.show()
In [12]:
import numpy as np
def split_train_test(data, test_ratio):
shuffled_indices = np.random.permutation(len(data))
test_set_size = int(len(data) * test_ratio)
test_indices = shuffled_indices[:test_set_size]
train_indices = shuffled_indices[test_set_size:]
return data.iloc[train_indices], data.iloc[test_indices]
In [15]:
train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), "train +", len(test_set), "test")
In [20]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
In [24]:
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.1)
Out[24]:
In [25]:
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.4, s=housing['population']/100, label='population', c='median_house_value', cmap=plt.get_cmap('jet'), colorbar=True)
plt.legend()
Out[25]:
In [27]:
corr_matrix = housing.corr()
In [28]:
corr_matrix['median_house_value'].sort_values(ascending=False)
Out[28]:
In [31]:
from pandas.plotting import scatter_matrix
attributes = ['median_house_value', 'median_income', 'total_rooms', 'housing_median_age']
scatter_matrix(housing[attributes], figsize=(12,8))
Out[31]:
In [106]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = 'median')
housing_num = housing.drop(['ocean_proximity','median_house_value'], axis=1)
imputer.fit(housing_num)
X = imputer.transform(housing_num)
X
Out[106]:
In [141]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
encoder = LabelEncoder()
housing_cat = housing['ocean_proximity']
housing_cat_encoded = encoder.fit_transform(housing_cat)
print(housing_cat_encoded)
print(encoder.classes_)
encoder = OneHotEncoder(categories='auto')
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot.toarray()
Out[141]:
In [142]:
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot
Out[142]:
In [78]:
from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
def __init__(self, add_bedrooms_per_room = True):
self.add_bedrooms_per_room = add_bedrooms_per_room
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
population_per_household = X[:, population_ix] / X[:, household_ix]
if self.add_bedrooms_per_room:
bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
else:
return np.c_[X, rooms_per_household, population_per_household]
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=True)
housing_extra_attribs = attr_adder.transform(housing.values)
print(housing_extra_attribs)
In [79]:
# Transformation Pipelines
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
('imputer', SimpleImputer(strategy='median')),
('std_scaler', StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)
print(housing_num_tr.min(axis=1), housing_num_tr.max(axis=1))
In [107]:
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator, TransformerMixin):
def __init__(self, attribute_names):
self.attribute_names = attribute_names
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
return X[self.attribute_names].values
class CustomLabelBinarizer(BaseEstimator, TransformerMixin):
def __init__(self, sparse_output=False):
self.sparse_output = sparse_output
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
enc = LabelBinarizer(sparse_output=self.sparse_output)
return enc.fit_transform(X)
num_attribs = list(housing_num)
cat_attribs = ['ocean_proximity']
print(num_attribs)
num_pipeline = Pipeline([
('selector', DataFrameSelector(num_attribs)),
('imputer', SimpleImputer(strategy='median')),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
('selector', DataFrameSelector(cat_attribs)),
('label_binarizer', CustomLabelBinarizer()),
])
full_pipeline = FeatureUnion(transformer_list=[
('num_pipeline', num_pipeline),
('cat_pipeline', cat_pipeline),
])
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared
Out[107]:
In [116]:
from sklearn.linear_model import LinearRegression
housing_X = housing.drop(["median_house_value"], axis=1)
housing_Y = housing["median_house_value"].values
print(housing_Y)
housing_prepared = full_pipeline.fit_transform(housing_X)
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_Y)
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_Y, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse
Out[116]:
In [120]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_Y)
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_Y, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
print(tree_rmse)
In [123]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_Y, scoring='neg_mean_squared_error', cv=10)
rmse_scores = np.sqrt(-scores)
def display_scores(scores):
print('scores:', scores)
print('mean:', scores.mean())
print('std:', scores.std())
display_scores(rmse_scores)
In [125]:
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_Y, scoring='neg_mean_squared_error', cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)
In [140]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor(n_estimators = 10)
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_Y, scoring='neg_mean_squared_error', cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)
In [128]:
# Fine-Tune Your Model
from sklearn.model_selection import GridSearchCV
param_grid = [
{'n_estimators': [3,10,30], 'max_features': [2,4,6,8]},
{'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2,3,4]},
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_Y)
grid_search.best_params_
Out[128]:
In [130]:
grid_search.best_estimator_
Out[130]:
In [131]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
print(np.sqrt(-mean_score), params)
In [136]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances
extra_attribs = ['rooms_per_hhold', 'pop_per_hhold', 'bedrooms_per_room']
cat_one_hot_attribs = list(encoder.classes_)
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)
Out[136]:
In [139]:
final_model = grid_search.best_estimator_
X_test = test_set.drop('median_house_value', axis=1)
Y_test = test_set['median_house_value']
X_test_prepared = full_pipeline.fit_transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print(final_rmse)