In [1]:
import os
import tarfile
from six.moves import urllib
DOWNLOAD_ROOT="https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH="datasets/housing"
HOUSING_URL=DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"
def fetch_housing_data(housing_url = HOUSING_URL, housing_path = HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    print("Downloaded file.")
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path = housing_path)
    print("Extracted archive.")
    housing_tgz.close()
    print("Done.")
fetch_housing_data()
In [2]:
import pandas as pd
def load_housing_data(housing_path = HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
housing = load_housing_data()
housing.head()
Out[2]:
In [3]:
housing.info()
In [4]:
housing["ocean_proximity"].head()
Out[4]:
In [5]:
housing["ocean_proximity"].value_counts()
Out[5]:
In [6]:
housing.describe()
Out[6]:
In [7]:
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
plt.show()
In [8]:
import hashlib
import numpy as np # erratum, was missing
def test_set_check(identifier, test_ratio, hash):
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio
def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
    return data.loc[~in_test_set], data.loc[in_test_set]
# Using this approach, we need to make sure that new data always gets appended
# at the end of the dataset and that no row is ever deleted; a more stable ID
# built from immutable features is sketched after this cell.
housing_with_id = housing.reset_index() # add id column
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")
print("Train set:")
train_set.info()
print("Test set:")
test_set.info()
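# Sketch (assumption, not part of the cell above): the row index only works as
# an ID if new data is always appended and rows are never deleted. A more
# stable ID can be built from features that never change, e.g. a district's
# longitude and latitude, and fed to the same split helper.
housing_with_id["id"] = housing_with_id["longitude"] * 1000 + housing_with_id["latitude"]
id_train_set, id_test_set = split_train_test_by_id(housing_with_id, 0.2, "id")
print(len(id_train_set), "train /", len(id_test_set), "test")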
In [9]:
# Create income categories to be able to use stratified sampling
# across data (avoid sampling bias).
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)
housing["income_cat"].hist(bins=5, figsize=(8,8))
plt.show()
In [10]:
# Create stratified test / train set split using income categories
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
housing["income_cat"].value_counts() / len(housing)
Out[10]:
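# Sketch (assumption): compare the income-category proportions of the
# stratified test set with those of the full dataset to see how closely the
# stratified split preserves the distribution.
compare = pd.DataFrame({
    "overall": housing["income_cat"].value_counts() / len(housing),
    "stratified": strat_test_set["income_cat"].value_counts() / len(strat_test_set),
})
compare["% error"] = 100 * compare["stratified"] / compare["overall"] - 100
print(compare)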
In [11]:
# Remove temporary income category from sets
for set_ in (strat_test_set, strat_train_set):
    set_.drop(["income_cat"], axis=1, inplace=True)
In [12]:
# Create working set and visualize
%matplotlib inline
housing = strat_train_set.copy()
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1, figsize=(12,10))
Out[12]:
In [13]:
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
             s=housing["population"]/100, label="population",
             c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
             figsize=(12,10))
plt.legend()
Out[13]:
In [14]:
# Check correlation between columns
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
Out[14]:
In [15]:
# Create scatter matrix to visually check for correlations
from pandas.plotting import scatter_matrix  # pandas.tools.plotting in older pandas versions
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))
Out[15]:
In [16]:
housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1, figsize=(8,6))
Out[16]:
In [17]:
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
Out[17]:
In [18]:
# reset data
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
# Now handle n/a values using one of the following:
#housing.dropna(subset=["total_bedrooms"])   # 1) get rid of the affected districts
#housing.drop("total_bedrooms", axis=1)      # 2) get rid of the whole attribute
#median = housing["total_bedrooms"].median()
#housing["total_bedrooms"].fillna(median)    # 3) fill with the median value
# This is option 3) using scikit-learn's Imputer
from sklearn.preprocessing import Imputer
housing_num = housing.drop("ocean_proximity", axis=1)
imputer = Imputer(strategy="median")
imputer.fit(housing_num)
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns)
housing_tr.info()
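# Sketch (assumption): the medians learned by the imputer are kept in its
# statistics_ attribute and should match the medians computed directly on the
# DataFrame.
print("Imputer medians:   ", imputer.statistics_)
print("DataFrame medians: ", housing_num.median().values)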
In [19]:
# Use one-hot encoding for the ocean_proximity attribute
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
housing_cat = housing["ocean_proximity"]
housing_cat_1hot = encoder.fit_transform(housing_cat)
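# Sketch (assumption): LabelBinarizer keeps the category names in classes_,
# one per output column; housing_cat_1hot is a dense NumPy array here
# (sparse_output=True would return a SciPy sparse matrix instead).
print(encoder.classes_)
print(housing_cat_1hot[:5])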
In [20]:
# Custom class for adding further attributes
from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)
housing_extra_attribs
Out[20]:
In [21]:
# Creating a full pipeline with all transformations so far
# 1) Helper class to extract only given attributes
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X[self.attribute_names].values
# 2) Define pipeline with interim steps
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
num_pipeline = Pipeline([
("selector", DataFrameSelector(num_attribs)), # select only numeric attributes
("imputer", Imputer(strategy="median")), # fill n/a values with median
("attribs_added", CombinedAttributesAdder()), # add derived attributes
("std_scaler", StandardScaler()), # scale values to [-1,1]
])
cat_pipeline = Pipeline([
("selector", DataFrameSelector(cat_attribs)), # select only category attributes
("label_binarizer", LabelBinarizer()), # convert to 0/1 flags in columns
])
full_pipeline = FeatureUnion(transformer_list=[
("num_pipeline", num_pipeline),
("cat_pipeline", cat_pipeline),
])
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared.shape
Out[21]:
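# Sketch (assumption, not required for the results above): with scikit-learn
# >= 0.20 the same preparation can be written without DataFrameSelector,
# using ColumnTransformer, SimpleImputer and OneHotEncoder. Kept commented out
# because this notebook runs against the older API (Imputer, LabelBinarizer).
#from sklearn.compose import ColumnTransformer
#from sklearn.impute import SimpleImputer
#from sklearn.preprocessing import OneHotEncoder
#num_pipeline_new = Pipeline([
#    ("imputer", SimpleImputer(strategy="median")),
#    ("attribs_added", CombinedAttributesAdder()),
#    ("std_scaler", StandardScaler()),
#])
#full_pipeline_new = ColumnTransformer([
#    ("num", num_pipeline_new, num_attribs),
#    ("cat", OneHotEncoder(), cat_attribs),
#])
#housing_prepared_new = full_pipeline_new.fit_transform(housing)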
In [22]:
# Try linear regression as a first model
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data) # NO fit()!
print("Predictions:\t", lin_reg.predict(some_data_prepared))
print("Labels:\t\t", list(some_labels))
# calculate the RMSE (root mean squared error)
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
print("RMSE (linear): ", lin_rmse, " (calculated across the full training set, not only the sample above)")
In [23]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
print("RMSE (tree): ", tree_rmse, " (obviously overfitting)")
In [24]:
from sklearn.model_selection import cross_val_score
tree_scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-tree_scores)
def display_scores(scores):
print("Scores: ", scores)
print("Mean: ", scores.mean())
print("Standard deviation: ", scores.std())
print("Decision tree: ")
display_scores(tree_rmse_scores)
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
print("Linear regression: ")
display_scores(lin_rmse_scores)
In [25]:
# Next, train a RandomForestRegressor the same way as lin_reg and tree_reg above (p. 70/71)
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
print("RMSE (forest): ", forest_rmse)
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
print("Forest scores: ")
display_scores(forest_rmse_scores)
In [26]:
# If you want to save the learned model (p.71), use pickle or joblib
from sklearn.externals import joblib
# save via
#joblib.dump(lin_reg, "lin_reg.pkl")
# restore it via
#lin_reg_loaded = joblib.load("lin_reg.pkl")
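# Sketch (assumption): dumping the fitted preprocessing pipeline together with
# the model keeps the two in sync when the bundle is reloaded elsewhere.
# (Newer scikit-learn versions drop sklearn.externals; use "import joblib".)
#joblib.dump({"pipeline": full_pipeline, "model": lin_reg}, "lin_reg_bundle.pkl")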
In [27]:
# (p. 72) Using grid-search to fine-tune parameters for model
from sklearn.model_selection import GridSearchCV
param_grid = [
    { 'n_estimators' : [3, 10, 30], 'max_features' : [2, 4, 6, 8] },
    { 'bootstrap' : [False], 'n_estimators' : [3, 10], 'max_features' : [2, 3, 4] }
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)
print("Grid search, best params:\n--------------", grid_search.best_params_)
print("Grid search, best estimator:\n--------------", grid_search.best_estimator_)
print("Grid search, cross-evaluation scores:\n--------------")
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
#joblib.dump(grid_search, "grid_search.pkl")
Out[27]:
In [28]:
# TODO: fine-tune the model parameters further, or use RandomizedSearchCV instead of GridSearchCV (see the sketch after this cell)
# (p. 74) analyze best model features, here: median_income + INLAND + others
feature_importances = grid_search.best_estimator_.feature_importances_
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_one_hot_attribs = list(encoder.classes_)
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)
Out[28]:
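# Sketch for the RandomizedSearchCV TODO above (assumption: the parameter
# ranges and n_iter are illustrative, not tuned). It samples random parameter
# combinations instead of exhaustively trying every grid entry.
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
param_distribs = {
    "n_estimators": randint(low=1, high=200),
    "max_features": randint(low=1, high=8),
}
rnd_search = RandomizedSearchCV(RandomForestRegressor(random_state=42),
                                param_distributions=param_distribs,
                                n_iter=10, cv=5,
                                scoring="neg_mean_squared_error",
                                random_state=42)
rnd_search.fit(housing_prepared, housing_labels)
print("Randomized search, best params:", rnd_search.best_params_)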
In [29]:
# TODO: drop the less relevant features from the model here (see the sketch after this cell).
# (p. 75) After having fine-tuned the model etc, evaluate on the test set
final_model = grid_search.best_estimator_
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()
X_test_prepared = full_pipeline.transform(X_test) # DO NOT fit()!
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print("Final RMSE against test set: ", final_rmse)
In [30]:
print("Done! Next: cleanup, consolidate, automate, vary, try out on test set from other source.")
In [ ]: