In [ ]:
print('Hello world!')
In [ ]:
import numpy as np
import pandas as pd
In [ ]:
import os
import tarfile
In [ ]:
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    housing_csv_path = os.path.join(housing_path, 'housing.csv')
    housing_tgz_path = os.path.join(housing_path, 'housing.tgz')
    if os.path.isfile(housing_csv_path):
        print(f'Found {housing_csv_path}; nothing to do')
        return
    if os.path.isfile(housing_tgz_path):
        print(f'Found {housing_tgz_path}; extracting it')
        housing_tgz = tarfile.open(housing_tgz_path)
        housing_tgz.extractall(path=housing_path)
        housing_tgz.close()
        return
    print(f'Cannot find {housing_csv_path}')
fetch_housing_data()
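Note that HOUSING_URL is defined above but never used: the function gives up when neither file is present. A minimal sketch of a download-and-extract step in the spirit of the book's version (assuming network access to HOUSING_URL; fetch_and_extract is a new name for illustration):
In [ ]:
import urllib.request
def fetch_and_extract(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    # create the target directory and download the tarball if it is missing
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, 'housing.tgz')
    if not os.path.isfile(tgz_path):
        urllib.request.urlretrieve(housing_url, tgz_path)
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)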
In [ ]:
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
In [ ]:
housing = load_housing_data()
In [ ]:
housing.head()
In [ ]:
housing.info()
In [ ]:
housing.ocean_proximity.value_counts()
In [ ]:
housing['ocean_proximity'].value_counts()
In [ ]:
housing.describe()
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
plt.show()
In [ ]:
housing.median_income.describe()
In [ ]:
housing.median_income.hist(bins=15)
plt.show()
In [ ]:
income_cat = np.ceil(housing.median_income / 1.5)
In [ ]:
income_cat.where(income_cat < 5.0, 5.0, inplace=True)
In [ ]:
# The above operations can be replaced by the following
income_cat2 = np.ceil(housing.median_income / 1.5)
income_cat2[income_cat2 > 5.0] = 5.0
(income_cat2 == income_cat).all()
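A newer pandas idiom expresses the same binning in one step with pd.cut; the bin edges below should reproduce ceil(x / 1.5) capped at 5 (income_cat3 is a new name for illustration):
In [ ]:
income_cat3 = pd.cut(housing.median_income,
                     bins=[0., 1.5, 3.0, 4.5, 6.0, np.inf],
                     labels=[1, 2, 3, 4, 5])
income_cat3.value_counts().sort_index()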
In [ ]:
income_cat.describe()
In [ ]:
income_cat.value_counts() / len(income_cat)
In [ ]:
from sklearn.model_selection import StratifiedShuffleSplit
In [ ]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
In [ ]:
housing['income_cat'] = income_cat
for train_index, test_index in split.split(housing, housing['income_cat']):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
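Alternatively, train_test_split's stratify argument performs the stratified split in one call (the exact rows drawn may differ from StratifiedShuffleSplit's; the _2 names are new, for illustration):
In [ ]:
from sklearn.model_selection import train_test_split
strat_train_set2, strat_test_set2 = train_test_split(
    housing, test_size=0.2, stratify=housing['income_cat'], random_state=42)
len(strat_train_set2), len(strat_test_set2)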
In [ ]:
stratified = strat_test_set['income_cat'].value_counts().sort_index() / len(strat_test_set)
overall = housing['income_cat'].value_counts().sort_index() / len(housing)
data = pd.DataFrame({'Overall': overall, 'Stratified': stratified})
data['Strat. %error'] = (data['Overall'] - data['Stratified']) / data['Overall'] * 100
data
In [ ]:
strat_train_set_copy = strat_train_set.copy()
In [ ]:
housing.plot(kind="scatter", x='longitude', y='latitude')
In [ ]:
housing.plot(kind="scatter", x='longitude', y='latitude', alpha=0.1)
In [ ]:
strat_train_set_copy.plot(kind='scatter', x='longitude', y='latitude', alpha=0.4,
                          s=strat_train_set_copy.population / 100,
                          c=strat_train_set_copy.median_house_value,
                          cmap=plt.get_cmap("jet"),
                          label="population", figsize=(15, 15),
                          colorbar=True)
plt.legend()
In [ ]:
corr_matrix = strat_train_set_copy.corr(numeric_only=True)  # skip ocean_proximity; required on pandas >= 2.0
In [ ]:
corr_matrix.median_house_value.sort_values(ascending=False)
In [ ]:
from pandas.plotting import scatter_matrix
In [ ]:
attributes = ["median_house_value", "median_income", "total_rooms",
"housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))
In [ ]:
strat_train_set_copy.plot.scatter(x="median_income", y="median_house_value", alpha=0.1)
In [ ]:
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]
In [ ]:
housing.info()
In [ ]:
corr_matrix = housing.corr(numeric_only=True)  # skip the non-numeric ocean_proximity column
corr_matrix['median_house_value'].sort_values(ascending=False)
In [ ]:
housing = strat_train_set.drop('median_house_value', axis=1)
housing_labels = strat_train_set['median_house_value'].copy()
In [ ]:
housing.info()
In [ ]:
housing.dropna(subset=['total_bedrooms']).info()
In [ ]:
housing.drop('total_bedrooms', axis=1).info()
In [ ]:
housing['total_bedrooms'].fillna(housing['total_bedrooms'].median()).describe()
In [ ]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)
imputer.statistics_
In [ ]:
imputer.strategy
In [ ]:
housing.drop("ocean_proximity", axis=1).median().values
In [ ]:
X = imputer.transform(housing_num)
X
In [ ]:
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)  # keep the original index so rows stay aligned with housing_labels
housing_tr.head()
In [ ]:
from sklearn.preprocessing import LabelEncoder
In [ ]:
encoder = LabelEncoder()
In [ ]:
housing_cat = housing.ocean_proximity
In [ ]:
housing_cat.describe()
In [ ]:
housing_cat.value_counts()
In [ ]:
housing_cat_encoded = encoder.fit_transform(housing_cat)
In [ ]:
housing_cat_encoded
In [ ]:
type(housing_cat_encoded)
In [ ]:
print(encoder.classes_)
In [ ]:
from sklearn.preprocessing import OneHotEncoder
In [ ]:
encoder = OneHotEncoder()
In [ ]:
print(housing_cat_encoded.shape)
print(type(housing_cat_encoded))
In [ ]:
(housing_cat_encoded.reshape(-1, 1)).shape
In [ ]:
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))
In [ ]:
housing_cat_1hot
In [ ]:
type(housing_cat_1hot)
In [ ]:
housing_cat_1hot.toarray()
In [ ]:
from sklearn.preprocessing import LabelBinarizer
In [ ]:
encoder = LabelBinarizer(sparse_output=False)
In [ ]:
housing_cat_1hot = encoder.fit_transform(housing_cat)
In [ ]:
housing_cat_1hot
In [ ]:
type(housing_cat_1hot)
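Since scikit-learn 0.20, OneHotEncoder accepts string categories directly, so the LabelEncoder/LabelBinarizer detour is no longer needed. A sketch (cat_encoder and housing_cat_1hot2 are new names for illustration):
In [ ]:
cat_encoder = OneHotEncoder()  # returns a sparse matrix by default
housing_cat_1hot2 = cat_encoder.fit_transform(housing[['ocean_proximity']])
cat_encoder.categories_, housing_cat_1hot2.toarray()[:5]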
In [ ]:
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6
In [ ]:
housing.head()
In [ ]:
housing.iloc[:, 3]
In [ ]:
X = housing.values
In [ ]:
# The same column selection can be done on the DataFrame with iloc (add .values for the array)
housing.iloc[:, [rooms_ix, bedrooms_ix, households_ix, population_ix]].head()
In [ ]:
rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
population_per_household = X[:, population_ix] / X[:, households_ix]
bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
np.c_[X, rooms_per_household, population_per_household]
np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
In [ ]:
from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=False):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
In [ ]:
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(X)
In [ ]:
print(housing_extra_attribs.shape)
print(housing.shape)
In [ ]:
# Convert back to data frame -- My way
new_columns = housing.columns.append(
    pd.Index(['rooms_per_household', 'population_per_household'])
)
new_columns
housing_extra_attribs_df = pd.DataFrame(housing_extra_attribs, columns=new_columns)
housing_extra_attribs_df.head()
In [ ]:
housing.describe()
In [ ]:
housing.total_rooms.describe()
In [ ]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(housing["total_rooms"].values.reshape(-1, 1))
pd.DataFrame(scaler.transform(housing["total_rooms"].values.reshape(-1, 1)), columns=["total_rooms"])["total_rooms"].describe()
In [ ]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(housing["total_rooms"].values.reshape(-1, 1))
pd.DataFrame(scaler.transform(housing["total_rooms"].values.reshape(-1, 1)), columns=["total_rooms"])["total_rooms"].describe()
In [ ]:
from sklearn.pipeline import Pipeline
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attr_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
In [ ]:
# I want to verify that the pipelined version
# does the same thing as the separate steps
num_pipeline_stage1 = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
])
X_pipeline = num_pipeline_stage1.fit_transform(housing_num)
X = imputer.transform(housing_num)
X_pipeline
np.array_equal(X, X_pipeline)
In [ ]:
num_pipeline_stage2 = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attr_adder', CombinedAttributesAdder()),
])
Y = attr_adder.fit_transform(X)
Y_pipeline = num_pipeline_stage2.fit_transform(housing_num)
np.array_equal(Y, Y_pipeline)
In [ ]:
num_pipeline_stage3 = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attr_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
Z = scaler.fit_transform(Y)
print(Z.std(), Z.mean())
Z_pipeline = num_pipeline_stage3.fit_transform(housing_num)
np.array_equal(Z, Z_pipeline)
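Since these are floating-point arrays, np.allclose is the more robust comparison than exact equality:
In [ ]:
np.allclose(Z, Z_pipeline)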
In [ ]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

class CustomizedLabelBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self, sparse_output=False):
        self.encode = LabelBinarizer(sparse_output=sparse_output)
    def fit(self, X, y=None):
        self.encode.fit(X)
        return self  # fit must return self, not the wrapped LabelBinarizer
    def transform(self, X):
        return self.encode.transform(X)

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('attr_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_binarizer', CustomizedLabelBinarizer()),
])
# LabelBinarizer().fit_transform(DataFrameSelector(cat_attribs).fit_transform(housing))
# num_pipeline.fit_transform(housing)
# cat_pipeline.fit_transform(housing)
from sklearn.pipeline import FeatureUnion
full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])
housing_prepared = full_pipeline.fit_transform(housing)
print(housing_prepared.shape)
housing_prepared
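On scikit-learn 0.20+, ColumnTransformer replaces the DataFrameSelector/FeatureUnion pattern. A sketch of the same preparation (full_pipeline_ct is a hypothetical name; it assumes OneHotEncoder handles the string column directly, and that CombinedAttributesAdder's column indices still line up within the numeric subset, which they do here):
In [ ]:
from sklearn.compose import ColumnTransformer
full_pipeline_ct = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attr_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ]), num_attribs),
    ('cat', OneHotEncoder(), cat_attribs),
])
housing_prepared_ct = full_pipeline_ct.fit_transform(housing)
housing_prepared_ct.shape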
In [ ]:
from sklearn.linear_model import LinearRegression
In [ ]:
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
In [ ]:
some_data = housing[:5]
some_data
In [ ]:
some_labels = housing_labels[:5]
some_labels
In [ ]:
some_data_prepared = full_pipeline.transform(some_data)
some_data_prepared
In [ ]:
print(f'Prediction:\t{lin_reg.predict(some_data_prepared)}')
print(f'Labels:\t\t{list(some_labels)}')
In [ ]:
from sklearn.metrics import mean_squared_error
In [ ]:
housing_prediction = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_prediction)  # (y_true, y_pred) order; MSE is symmetric anyway
lin_rmse = np.sqrt(lin_mse)
lin_rmse
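For comparison, the mean absolute error penalizes large errors less than RMSE; a quick check with scikit-learn's mean_absolute_error:
In [ ]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(housing_labels, housing_prediction)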
In [ ]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
tree_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, tree_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse
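A training-set RMSE of (nearly) zero here is a classic sign of severe overfitting, not of a perfect model; the cross-validation below gives a fairer estimate.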
In [ ]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
In [ ]:
rmse_scores = np.sqrt(-scores)
rmse_scores
In [ ]:
def display_scores(scores):
    print(f'Scores: {scores}')
    print(f'Mean: {scores.mean()}')
    print(f'STD: {scores.std()}')
display_scores(rmse_scores)
In [ ]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)
In [ ]:
forest_prediction = forest_reg.predict(housing_prepared)
forest_rmse = np.sqrt(mean_squared_error(housing_labels, forest_prediction))
forest_rmse
Try adding a transformer in the preparation pipeline to select only the most important attributes.
The importance of each feature is shown below (from section 2.7.4):
>>> sorted(zip(feature_importances, attributes), reverse=True)
[(0.32649798665134971, 'median_income'),
(0.15334491760305854, 'INLAND'),
(0.11305529021187399, 'pop_per_hhold'),
(0.07793247662544775, 'bedrooms_per_room'),
(0.071415642259275158, 'longitude'),
(0.067613918945568688, 'latitude'),
(0.060436577499703222, 'rooms_per_hhold'),
(0.04442608939578685, 'housing_median_age'),
(0.018240254462909437, 'population'),
(0.01663085833886218, 'total_rooms'),
(0.016607686091288865, 'total_bedrooms'),
(0.016345876147580776, 'households'),
(0.011216644219017424, '<1H OCEAN'),
(0.0034668118081117387, 'NEAR OCEAN'),
(0.0026848388432755429, 'NEAR BAY'),
(8.4130896890070617e-05, 'ISLAND')]
Based on the ranking, I will select the following 3:
median_income
INLAND
pop_per_hhold
In [ ]:
from sklearn.base import BaseEstimator, TransformerMixin

class EX3NumSelector(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.copy()  # avoid mutating the caller's DataFrame
        X['pop_per_hhold'] = X['population'] / X['households']
        return X[['median_income', 'pop_per_hhold', 'longitude']].values

class EX3CatSelector(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        # use the input X, not the global housing, and copy to avoid side effects
        Y = X['ocean_proximity'].copy()
        Y[Y != 'INLAND'] = 'NON_INLAND'
        return Y.values

num_sel = EX3NumSelector()
num_sel.fit_transform(housing)
cat_sel = EX3CatSelector()
cat_sel.fit_transform(housing)
In [ ]:
num_pipeline = Pipeline([
    ('selector', EX3NumSelector()),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
    ('selector', EX3CatSelector()),
    ('label_binarizer', CustomizedLabelBinarizer()),
])
full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])
housing_prepared = full_pipeline.fit_transform(housing)
print(housing_prepared.shape)
housing_prepared
In [ ]:
forest_reg.fit(housing_prepared, housing_labels)
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)
With only 3 features we do see some performance degradation. Adding one more feature, 'longitude', improves results a little, which makes sense.
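A more generic alternative is a transformer that keeps the k features with the highest importance, which could be appended to the full preparation pipeline. A sketch (assuming the feature_importances array from the grid search in 2.7.4 is available; TopFeatureSelector is a new name for illustration):
In [ ]:
class TopFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k
    def fit(self, X, y=None):
        # remember the column indices of the k most important features
        self.feature_indices_ = np.sort(np.argsort(self.feature_importances)[-self.k:])
        return self
    def transform(self, X):
        return X[:, self.feature_indices_]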
In [ ]:
# from sklearn.model_selection import GridSearchCV
# param_grid = [
#     {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
#     {'bootstrap': [False], 'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
# ]
# forest_reg = RandomForestRegressor()
# grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring="neg_mean_squared_error")
# grid_search.fit(housing_prepared, housing_labels)
# grid_search.best_params_
# grid_search.best_estimator_
# cvres = grid_search.cv_results_
# for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
#     print(np.sqrt(-mean_score), params)
In [ ]:
# feature_importances = grid_search.best_estimator_.feature_importances_
# feature_importances
# extra_attribs = ['rooms_per_hhold', 'pop_per_hhold']
# cat_one_hot_attribs = list(encoder.classes_)
# cat_one_hot_attribs
# attributes = num_attribs + extra_attribs + cat_one_hot_attribs
# attributes, len(attributes)
# sorted(zip(feature_importances, attributes), reverse=True)
In [ ]:
# final_model = grid_search.best_estimator_
# X_test = strat_test_set.drop("median_house_value", axis=1)
# y_test = strat_test_set.median_house_value.copy()
# X_test_prepared = full_pipeline.transform(X_test)
# final_predictions = final_model.predict(X_test_prepared)
# final_mse = mean_squared_error(final_predictions, y_test)
# final_rmse = np.sqrt(final_mse)
# final_rmse
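If the commented-out cell above is run, a 95% confidence interval for the test RMSE can be computed from the squared errors; a sketch using scipy.stats (commented out like the cell it depends on):
In [ ]:
# from scipy import stats
# confidence = 0.95
# squared_errors = (final_predictions - y_test) ** 2
# np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
#                          loc=squared_errors.mean(),
#                          scale=stats.sem(squared_errors)))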