In [ ]:
print('Hello world!')

In [ ]:
import numpy as np
import pandas as pd

In [ ]:
import os
import tarfile
import urllib.request

In [ ]:
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    housing_csv_path = os.path.join(housing_path, 'housing.csv')
    housing_tgz_path = os.path.join(housing_path, 'housing.tgz')
    if os.path.isfile(housing_csv_path):
        print(f'Found {housing_csv_path}, nothing to do')
        return
    if not os.path.isfile(housing_tgz_path):
        # Neither the CSV nor the archive is present: download the archive first.
        print(f'Downloading {housing_url}')
        os.makedirs(housing_path, exist_ok=True)
        urllib.request.urlretrieve(housing_url, housing_tgz_path)
    print(f'Extracting {housing_tgz_path}')
    with tarfile.open(housing_tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

fetch_housing_data()

In [ ]:
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

In [ ]:
housing = load_housing_data()

In [ ]:
housing.head()

In [ ]:
housing.info()

In [ ]:
housing.ocean_proximity.value_counts()

In [ ]:
# Bracket indexing gives the same result as the attribute access above
housing['ocean_proximity'].value_counts()

In [ ]:
housing.describe()

In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
plt.show()

In [ ]:
housing.median_income.describe()

In [ ]:
housing.median_income.hist(bins=15)
plt.show()

In [ ]:
income_cat = np.ceil(housing.median_income / 1.5)

In [ ]:
# Cap everything above category 5 at 5 (inplace where() is deprecated in newer pandas)
income_cat = income_cat.where(income_cat < 5.0, 5.0)

In [ ]:
# The above operations can be replaced by the following
income_cat2 = np.ceil(housing.median_income / 1.5)
income_cat2[income_cat2 > 5.0] = 5.0
(income_cat2 == income_cat).all()
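
As a cross-check, the same bucketing can be written with pd.cut; a minimal sketch, with bin edges chosen to reproduce np.ceil(median_income / 1.5) capped at 5:

In [ ]:
# Right-closed bins (0, 1.5], (1.5, 3], ... reproduce the ceil-and-cap logic.
income_cat3 = pd.cut(housing.median_income,
                     bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
                     labels=[1, 2, 3, 4, 5])
(income_cat3.astype('float64') == income_cat).all()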

In [ ]:
income_cat.describe()

In [ ]:
income_cat.value_counts() / len(income_cat)

In [ ]:
from sklearn.model_selection import StratifiedShuffleSplit

In [ ]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

In [ ]:
housing['income_cat'] = income_cat

for train_index, test_index in split.split(housing, housing['income_cat']):
    strat_train_set = housing.loc[train_index]
    strat_test_set  = housing.loc[test_index]

In [ ]:
Stratified = strat_test_set['income_cat'].value_counts().sort_index() / len(strat_test_set)
Overall = housing['income_cat'].value_counts().sort_index() / len(housing)
data = pd.DataFrame({'Overall': Overall, 'Stratified' : Stratified})
data['Strat. %error'] = (data['Overall'] - data['Stratified']) / data['Overall'] * 100
data
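
To see what stratification buys, the same proportions can be computed for a plain random split; a quick sketch using train_test_split:

In [ ]:
# Compare against a purely random 80/20 split.
from sklearn.model_selection import train_test_split

_, rand_test_set = train_test_split(housing, test_size=0.2, random_state=42)
Random = rand_test_set['income_cat'].value_counts().sort_index() / len(rand_test_set)
data['Random'] = Random
data['Rand. %error'] = (data['Overall'] - data['Random']) / data['Overall'] * 100
data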

Visualizing Data


In [ ]:
strat_train_set_copy = strat_train_set.copy()

In [ ]:
housing.plot(kind="scatter", x='longitude', y='latitude')

In [ ]:
housing.plot(kind="scatter", x='longitude', y='latitude', alpha=0.1)

In [ ]:
strat_train_set_copy.plot(kind='scatter', x='longitude', y='latitude', alpha=0.4,
                          s=strat_train_set_copy.population/100,
                          c=strat_train_set_copy.median_house_value,
                          cmap=plt.get_cmap("jet"),
                          label="population", figsize=(15, 15),
                          colorbar=True)
plt.legend()

In [ ]:
# ocean_proximity is a string column, so newer pandas needs numeric_only=True
corr_matrix = strat_train_set_copy.corr(numeric_only=True)

In [ ]:
corr_matrix.median_house_value.sort_values(ascending=False)

In [ ]:
from pandas.plotting import scatter_matrix

In [ ]:
attributes = ["median_house_value", "median_income", "total_rooms",
"housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))

In [ ]:
strat_train_set_copy.plot.scatter(x="median_income", y="median_house_value", alpha=0.1)

Experimenting with Attribute Combinations


In [ ]:
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

In [ ]:
housing.info()

In [ ]:
corr_matrix = housing.corr(numeric_only=True)
corr_matrix['median_house_value'].sort_values(ascending=False)

2.5 Prepare the Data for Machine Learning Algorithms


In [ ]:
housing = strat_train_set.drop('median_house_value', axis=1)
housing_labels = strat_train_set['median_house_value'].copy()

In [ ]:
housing.info()

In [ ]:
# Option 1: drop the rows whose total_bedrooms is missing
housing.dropna(subset=['total_bedrooms']).info()

In [ ]:
# Option 2: drop the attribute entirely
housing.drop('total_bedrooms', axis=1).info()

In [ ]:
# Option 3: fill the missing values with the median
housing['total_bedrooms'].fillna(housing['total_bedrooms'].median()).describe()

In [ ]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)
imputer.statistics_

In [ ]:
imputer.strategy

In [ ]:
housing.drop("ocean_proximity", axis=1).median().values

In [ ]:
X = imputer.transform(housing_num)
X

In [ ]:
housing_tr = pd.DataFrame(X, columns=housing_num.columns)
housing_tr.head()
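
A small caveat: pd.DataFrame(X, ...) creates a fresh RangeIndex, so the original row labels are lost. Passing index=housing_num.index keeps housing_tr aligned with housing_labels:

In [ ]:
# Preserve the original row labels when converting back to a DataFrame.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)
housing_tr.head()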

Handling Text and Categorical Attributes


In [ ]:
from sklearn.preprocessing import LabelEncoder

In [ ]:
encoder = LabelEncoder()

In [ ]:
housing_cat = housing.ocean_proximity

In [ ]:
housing_cat.describe()

In [ ]:
housing_cat.value_counts()

In [ ]:
housing_cat_encoded = encoder.fit_transform(housing_cat)

In [ ]:
housing_cat_encoded

In [ ]:
type(housing_cat_encoded)

In [ ]:
print(encoder.classes_)

One-Hot Encoding


In [ ]:
from sklearn.preprocessing import OneHotEncoder

In [ ]:
encoder = OneHotEncoder()

In [ ]:
print(housing_cat_encoded.shape)
print(type(housing_cat_encoded))

In [ ]:
(housing_cat_encoded.reshape(-1, 1)).shape

In [ ]:
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))

In [ ]:
housing_cat_1hot

In [ ]:
type(housing_cat_1hot)

In [ ]:
housing_cat_1hot.toarray()
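
As an aside, OneHotEncoder in scikit-learn 0.20 and later accepts string categories directly, so the LabelEncoder step can be skipped; a sketch assuming such a version:

In [ ]:
# OneHotEncoder (scikit-learn >= 0.20) handles string columns directly.
cat_encoder = OneHotEncoder()
housing_cat_1hot_direct = cat_encoder.fit_transform(housing_cat.values.reshape(-1, 1))
print(cat_encoder.categories_)
housing_cat_1hot_direct.toarray()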

Combining Both Steps with LabelBinarizer


In [ ]:
from sklearn.preprocessing import LabelBinarizer

In [ ]:
encoder = LabelBinarizer(sparse_output=False)

In [ ]:
housing_cat_1hot = encoder.fit_transform(housing_cat)

In [ ]:
housing_cat_1hot

In [ ]:
type(housing_cat_1hot)

Custom Transformers


In [ ]:
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

In [ ]:
housing.head()

In [ ]:
housing.iloc[:, 3]

In [ ]:
X = housing.values

In [ ]:
# The same columns can be selected with .iloc (chain .values for a NumPy array)
housing.iloc[:, [rooms_ix, bedrooms_ix, households_ix, population_ix]].head()

In [ ]:
rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
population_per_household = X[:, population_ix] / X[:, households_ix]
bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
# Only the last expression in a cell is displayed; the second call also
# appends bedrooms_per_room.
np.c_[X, rooms_per_household, population_per_household]
np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]

In [ ]:
from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=False):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [ ]:
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(X)

In [ ]:
print(housing_extra_attribs.shape)
print(housing.shape)

In [ ]:
# Convert back to a data frame -- my way
new_columns = housing.columns.append(
    pd.Index(['rooms_per_household', 'population_per_household'])
)
housing_extra_attribs_df = pd.DataFrame(housing_extra_attribs, columns=new_columns)
housing_extra_attribs_df.head()

2.5.4 Feature Scaling


In [ ]:
housing.describe()

In [ ]:
housing.total_rooms.describe()

In [ ]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(housing["total_rooms"].values.reshape(-1, 1))
pd.DataFrame(scaler.transform(housing["total_rooms"].values.reshape(-1, 1)), columns=["total_rooms"])["total_rooms"].describe()

In [ ]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(housing["total_rooms"].values.reshape(-1, 1))
pd.DataFrame(scaler.transform(housing["total_rooms"].values.reshape(-1, 1)), columns=["total_rooms"])["total_rooms"].describe()

2.5.5 Transformation Pipeline


In [ ]:
from sklearn.pipeline import Pipeline

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attr_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler())
])
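
A quick smoke test that the three steps chain with a single call:

In [ ]:
# fit_transform runs imputation, attribute addition, and scaling in sequence.
housing_num_tr = num_pipeline.fit_transform(housing_num)
housing_num_tr.shape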

In [ ]:
# I want to verify that the pipelined version
# does the same thing as the separate steps

num_pipeline_stage1 = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
])

X_pipeline = num_pipeline_stage1.fit_transform(housing_num)
X = imputer.transform(housing_num)
X_pipeline
np.array_equal(X, X_pipeline)

In [ ]:
num_pipeline_stage2 = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attr_adder', CombinedAttributesAdder()),
])

Y = attr_adder.fit_transform(X)
Y_pipeline = num_pipeline_stage2.fit_transform(housing_num)
np.array_equal(Y, Y_pipeline)

In [ ]:
num_pipeline_stage3 = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attr_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler())
])

Z = scaler.fit_transform(Y)
Z.std(), Z.mean()
Z_pipeline = num_pipeline_stage3.fit_transform(housing_num)
np.array_equal(Z, Z_pipeline)

In [ ]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

class CustomizedLabelBinarizer(BaseEstimator, TransformerMixin):
    """Wrap LabelBinarizer so it fits the Pipeline interface: Pipeline calls
    fit(X, y), while LabelBinarizer.fit only accepts fit(y)."""
    def __init__(self, sparse_output=False):
        self.sparse_output = sparse_output
        self.encode = LabelBinarizer(sparse_output=sparse_output)
    def fit(self, X, y=None):
        self.encode.fit(X)
        return self  # fit must return self, not the wrapped encoder
    def transform(self, X):
        return self.encode.transform(X)


num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('attr_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_binarizer', CustomizedLabelBinarizer()),
])

# LabelBinarizer().fit_transform(DataFrameSelector(cat_attribs).fit_transform(housing))
# num_pipeline.fit_transform(housing)
# cat_pipeline.fit_transform(housing)

from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

housing_prepared = full_pipeline.fit_transform(housing)
print(housing_prepared.shape)
housing_prepared
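
For reference, scikit-learn 0.20 introduced ColumnTransformer, which makes DataFrameSelector and FeatureUnion unnecessary; a sketch under that assumption (the hard-coded indices in CombinedAttributesAdder still rely on housing_num's column order):

In [ ]:
# ColumnTransformer (scikit-learn >= 0.20) selects columns by name itself,
# so no DataFrameSelector is needed inside the sub-pipelines.
from sklearn.compose import ColumnTransformer

num_pipeline_ct = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attr_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

full_pipeline_ct = ColumnTransformer([
    ('num', num_pipeline_ct, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs),
])

housing_prepared_ct = full_pipeline_ct.fit_transform(housing)
housing_prepared_ct.shape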

2.6.1 Training and Evaluating on the Training Set


In [ ]:
from sklearn.linear_model import LinearRegression

In [ ]:
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

In [ ]:
some_data = housing[:5]
some_data

In [ ]:
some_labels = housing_labels[:5]
some_labels

In [ ]:
some_data_prepared = full_pipeline.transform(some_data)
some_data_prepared

In [ ]:
print(f'Predictions:\t{lin_reg.predict(some_data_prepared)}')
print(f'Labels:\t\t{list(some_labels)}')

In [ ]:
from sklearn.metrics import mean_squared_error

In [ ]:
housing_prediction = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_prediction)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

Tree Model


In [ ]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
tree_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, tree_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

2.6.2 Better Evaluation Using Cross-Validation


In [ ]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)

In [ ]:
rmse_scores = np.sqrt(-scores)
rmse_scores

In [ ]:
def display_scores(scores):
    print(f'Scores: {scores}')
    print(f'Mean: {scores.mean()}')
    print(f'STD: {scores.std()}')

display_scores(rmse_scores)
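
For comparison, the same 10-fold cross-validation on the linear model:

In [ ]:
# Same scoring and folds as the tree, but for the linear model.
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
display_scores(np.sqrt(-lin_scores))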

Random Forest


In [ ]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

In [ ]:
forest_prediction = forest_reg.predict(housing_prepared)
forest_rmse = np.sqrt(mean_squared_error(housing_labels, forest_prediction))
forest_rmse

Ex02

Try replacing GridSearchCV with RandomizedSearchCV.

See the official RandomizedSearchCV documentation for details.


In [ ]:
from sklearn.model_selection import RandomizedSearchCV

In [ ]:
from scipy.stats import uniform
from scipy.stats import randint

In [ ]:
distributions = {
    'n_estimators': randint(low=3, high=63),   # integers sampled from [3, 63)
    'max_features': randint(low=2, high=8),    # integers sampled from [2, 8)
    'bootstrap'   : [True, False],
}

In [ ]:
# n_iter defaults to 10, so ten parameter combinations are sampled
rand_search = RandomizedSearchCV(forest_reg, distributions, random_state=42, cv=5, scoring="neg_mean_squared_error")

In [ ]:
rand_search.fit(housing_prepared, housing_labels)

In [ ]:
cvres = rand_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
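
The best combination and its RMSE can be read off directly:

In [ ]:
# Best hyper-parameters found by the randomized search, with the
# corresponding RMSE (best_score_ is a negative MSE).
print(rand_search.best_params_)
print(np.sqrt(-rand_search.best_score_))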

These are the results I got, similar to those from GridSearchCV:

50865.19350215962 {'bootstrap': True, 'max_features': 5, 'n_estimators': 31}
53821.59623533819 {'bootstrap': True, 'max_features': 4, 'n_estimators': 10}
50296.03902259843 {'bootstrap': True, 'max_features': 6, 'n_estimators': 41}
50762.781955938386 {'bootstrap': False, 'max_features': 4, 'n_estimators': 25}
51365.68817153847 {'bootstrap': True, 'max_features': 4, 'n_estimators': 26}
50555.07511742977 {'bootstrap': True, 'max_features': 5, 'n_estimators': 42}
50750.81686321685 {'bootstrap': False, 'max_features': 4, 'n_estimators': 24}
52003.13807126158 {'bootstrap': True, 'max_features': 3, 'n_estimators': 26}
49766.656263124736 {'bootstrap': False, 'max_features': 7, 'n_estimators': 40}
50347.828040616296 {'bootstrap': False, 'max_features': 5, 'n_estimators': 23}

In [ ]:
# from sklearn.model_selection import GridSearchCV

# param_grid = [
#     {'n_estimators': [3, 10, 30], 'max_features': [2,4,6,8]},
#     {'bootstrap': [False], 'n_estimators': [3, 10, 30], 'max_features': [2,4,6,8]}
# ]

# forest_reg = RandomForestRegressor()

# grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring="neg_mean_squared_error")

# grid_search.fit(housing_prepared, housing_labels)

# grid_search.best_params_

# grid_search.best_estimator_

# cvres = grid_search.cv_results_
# for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
#     print(np.sqrt(-mean_score), params)

2.7.4 Analyze the Best Models and Their Errors


In [ ]:
# feature_importances = grid_search.best_estimator_.feature_importances_
# feature_importances

# extra_attribs = ['rooms_per_hhold', 'pop_per_hhold']

# cat_one_hot_attribs = list(encoder.classes_)
# cat_one_hot_attribs

# attributes = num_attribs + extra_attribs + cat_one_hot_attribs
# attributes, len(attributes)

# sorted(zip(feature_importances, attributes), reverse=True)
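
Since this notebook ran RandomizedSearchCV rather than GridSearchCV, the commented-out analysis can be adapted; a sketch assuming the fitted rand_search and the LabelBinarizer encoder from above:

In [ ]:
# Rank the features by the importances of the best randomized-search model.
feature_importances = rand_search.best_estimator_.feature_importances_
extra_attribs = ['rooms_per_hhold', 'pop_per_hhold']
cat_one_hot_attribs = list(encoder.classes_)
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)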

2.7.5 Evaluate Your System on the Test Set


In [ ]:
# final_model = grid_search.best_estimator_
# X_test = strat_test_set.drop("median_house_value", axis=1)
# y_test = strat_test_set.median_house_value.copy()
# X_test_prepared = full_pipeline.transform(X_test)

# final_predictions = final_model.predict(X_test_prepared)
# final_mse = mean_squared_error(final_predictions, y_test)
# final_rmse = np.sqrt(final_mse)
# final_rmse
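
Likewise, the test-set evaluation can be run with the randomized-search winner; a sketch assuming rand_search and full_pipeline from above:

In [ ]:
# Evaluate the best model on the held-out test set.
final_model = rand_search.best_estimator_
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()
X_test_prepared = full_pipeline.transform(X_test)

final_predictions = final_model.predict(X_test_prepared)
final_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))
final_rmse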