In [50]:
# import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [51]:
import os
cwd = os.getcwd()
print(cwd)
In [52]:
# reading the data
df = pd.read_csv("datasets/housing/housing.csv")
df.head(5)
Out[52]:
In [53]:
df.info()
In [54]:
# checking for nulls
df.isnull().sum()
Out[54]:
In [55]:
df.describe()
Out[55]:
In [56]:
df.hist(bins=50, figsize=(20, 12))
Out[56]:
In [57]:
# We could use Scikit-Learn for this, but here is how to write the same split with plain NumPy
def train_test_split(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))  # random ordering of row positions
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
In [58]:
train_set, test_set = train_test_split(df, 0.2)
In [59]:
print("train_set:"+str(len(train_set))+" test set:"+str(len(test_set)))
But the problem with this implementation is that it generates a new random split on every run, so after a few runs the model will eventually have seen the whole dataset. We could fix this by seeding the random number generator, but that breaks as soon as the dataset is updated, since both the train and test sets would then change. A stable alternative is sketched below.
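One way to get a stable split is to decide each instance's set membership from a hash of its identifier, so an instance's assignment never changes even if the dataset is refreshed or new rows are appended. A minimal sketch, assuming the row index can serve as a stable unique identifier (a simplifying assumption; a real ID column is preferable):
In [ ]:
from zlib import crc32

def test_set_check(identifier, test_ratio):
    # send an instance to the test set if the hash of its id lands in the lowest test_ratio fraction
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio))
    return data.loc[~in_test_set], data.loc[in_test_set]

housing_with_id = df.reset_index()  # adds an "index" column to use as the id
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")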
In [60]:
# Using Scikit learn
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)
print("train_set:"+str(len(train_set))+" test set:"+str(len(test_set)))
income_cat - creating a new column that buckets median income into categories, capping the maximum category at 5.
We will use this column as the y argument of the stratified split below (the stratification variable).
In [61]:
df["income_cat"]=np.ceil(df["median_income"]/1.5)
df["income_cat"].where(df["income_cat"]<5,5.0,inplace=True)
In [62]:
df["income_cat"].hist(bins=50)
Out[62]:
As you can see, we have removed the long tail of median income and shrunk the data into the 1-5 range.
Now, using Scikit-Learn's StratifiedShuffleSplit to split the data
In [63]:
from sklearn.model_selection import StratifiedShuffleSplit
In [64]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(df, df["income_cat"]):  # obj.split(X, y)
    strat_train_set = df.loc[train_index]
    strat_test_set = df.loc[test_index]
In [65]:
# let's check the % data distribution in the income_cat column
df["income_cat"].value_counts()/len(df)
Out[65]:
In [66]:
# now drop the income_cat column from the stratified train and test sets
strat_train_set.drop("income_cat", axis=1, inplace=True)
strat_test_set.drop("income_cat", axis=1, inplace=True)
In [67]:
# keep a copy of the training set so the original stays intact during exploration
housing = strat_train_set.copy()
In [68]:
housing.plot(kind="scatter", x="longitude", y="latitude", figsize=(6, 6))
Out[68]:
In [69]:
# to visualize density, add the alpha option
housing.plot(kind="scatter", x="longitude", y="latitude", figsize=(6, 6), alpha=0.1)
Out[69]:
In [70]:
housing.plot(kind="scatter", x="longitude", y="latitude", figsize=(8, 8), alpha=0.4,
c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
s=housing["population"]/100, label="population",
grid=True, legend=True)
Out[70]:
In [71]:
corr_matrix = housing.corr()
corr_matrix
Out[71]:
In [72]:
# looking for linear relationships with the target
# The correlation coefficient only measures linear correlations ("if x goes up, then y generally goes up/down").
# It may completely miss nonlinear relationships (e.g., "if x is close to zero then y generally goes up").
corr_matrix["median_house_value"].sort_values(ascending=False)
Out[72]:
In [73]:
# Using the pandas scatter_matrix function
from pandas.plotting import scatter_matrix
attr = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attr], figsize=(15, 10))
Out[73]:
In [74]:
housing.plot(kind="scatter", y="median_house_value", x="median_income", alpha=0.4, figsize=(12,8))
Out[74]:
In [75]:
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"] = housing["population"]/housing["households"]
In [76]:
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
Out[76]:
Note that drop() returns a copy of the data and does not affect strat_train_set.
In [77]:
housing = strat_train_set.drop("median_house_value", axis=1)   # predictors
housing_labels = strat_train_set["median_house_value"].copy()  # target
# Three options for handling missing total_bedrooms values (none of these modify housing in place):
housing.drop("total_bedrooms", axis=1)           # option a: drop the whole attribute
housing.dropna(subset=["total_bedrooms"])        # option b: drop the rows with missing values
median = housing["total_bedrooms"].median()
housing["total_bedrooms"].fillna(median)         # option c: fill with the median
Keep in mind that whichever value you use to fill the missing entries (here the median) must also be applied to the test set, so you need to save it.
Scikit-Learn provides a cleaner way to do this: Imputer.
First create an Imputer instance, then fit it on the data (numerical columns only) to compute the medians. To use Imputer you need to drop all non-numerical columns.
In [78]:
housing_num = housing.drop("ocean_proximity", axis=1)  # ocean_proximity is the only text column
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy="median")
imputer.fit(housing_num)
Out[78]:
In [79]:
print(imputer.statistics_)
print(housing_num.median().values)
In [80]:
# convert this to pandas dataframe for better understanding
housing_median = pd.DataFrame(imputer.statistics_.reshape(1,8), columns=housing_num.columns)
housing_median
Out[80]:
In [81]:
# we have trained the imputer on dataset, now transform our dataset
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns)
In [82]:
# convert the text categories to integer codes so that ML algorithms can work with them
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded
Out[82]:
In [83]:
encoder.classes_
Out[83]:
The problem with this encoding is that ML algorithms assume two nearby values are more similar than two distant ones: they treat categories 1 and 4 as farther apart than 1 and 2.
That is not meaningful for a categorical value. To avoid this issue we use the OneHotEncoder method, which creates a single on/off bit for each category.
In [84]:
from sklearn.preprocessing import OneHotEncoder
onehot = OneHotEncoder()
housing_cat_1hot = onehot.fit_transform(housing_cat_encoded.reshape(-1, 1))
housing_cat_1hot
Out[84]:
The array displayed above is a SciPy sparse matrix. Using up tons of memory mostly to store zeros would be very wasteful, so instead a sparse matrix only stores the locations of the nonzero elements. You can use it mostly like a normal 2D array, but if you really want to convert it to a (dense) NumPy array, just call the toarray() method:
In [85]:
housing_cat_1hot.toarray()
Out[85]:
We can apply both transformations (from text categories to integer categories, then from integer categories to one-hot vectors) in one shot using the LabelBinarizer class:
In [86]:
from sklearn.preprocessing import LabelBinarizer
binarizer = LabelBinarizer()
housing_cat_1hot_encoded = binarizer.fit_transform(housing_cat)
housing_cat_1hot_encoded
Out[86]:
In [87]:
from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)
In [88]:
attr_adder.get_params()
Out[88]:
a. Min - Max Scaling (also called Normalization) - Use Scikit's MinMaxScaler
b. Standardization - Use Scikit's StandardScaler
Standardization is quite different: first it subtracts the mean value (so standardized values always have a zero mean), and then it divides by the standard deviation so that the resulting distribution has unit variance. Unlike min-max scaling, standardization does not bound values to a specific range, which may be a problem for some algorithms (e.g., neural networks often expect an input value ranging from 0 to 1). However, standardization is much less affected by outliers. For example, suppose a district had a median income equal to 100 (by mistake). Min-max scaling would then crush all the other values from 0-15 down to 0-0.15, whereas standardization would not be much affected. A quick comparison of the two scalers is sketched below.
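As a quick illustration (a side sketch, not part of the main pipeline), the two scalers can be compared on the median_income column of the training set; the exact numbers depend on the split:
In [ ]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

income = housing[["median_income"]].values            # 2D array expected by the scalers
minmax_scaled = MinMaxScaler().fit_transform(income)  # squeezed into the [0, 1] range
std_scaled = StandardScaler().fit_transform(income)   # zero mean, unit variance, unbounded

print("min-max scaled : min=%.3f, max=%.3f" % (minmax_scaled.min(), minmax_scaled.max()))
print("standardized   : mean=%.3f, std=%.3f" % (std_scaled.mean(), std_scaled.std()))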
In [89]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Pipeline (list) -- list of tuples (name, transformation)
# transformation must have fit_transform() method
num_pipeline = Pipeline([
    ('imputer', Imputer(strategy="median")),  # equivalent to: imputer = Imputer(strategy="median")
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)
We have implemented a Pipeline for the numerical columns only; we can do the same for the categorical/text columns.
With Scikit-Learn's FeatureUnion, we can run the numerical and categorical transformations in one go and combine their results into the final dataset.
Each sub-pipeline starts with a selector transformer: it simply transforms the data by selecting the desired attributes (numerical or categorical), dropping the rest, and converting the resulting DataFrame to a NumPy array.
In [90]:
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
In [91]:
from sklearn.pipeline import FeatureUnion
num_attribs = list(housing_num) # list of numerical columns
cat_attribs = ["ocean_proximity"] # list of categorical columns
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_binarizer', LabelBinarizer()),
])
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])
print(num_attribs)
print(cat_attribs)
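Note: depending on your Scikit-Learn version, putting LabelBinarizer directly into a Pipeline may raise a TypeError, because Pipeline calls fit_transform(X, y) while LabelBinarizer.fit_transform only accepts the labels argument. If you hit that, a thin wrapper such as the hypothetical one below (a sketch, not part of the original code) can be used in cat_pipeline instead:
In [ ]:
# Hypothetical workaround: wrap LabelBinarizer so it tolerates the extra y argument
# that Pipeline passes to fit/transform.
class PipelineLabelBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.encoder = LabelBinarizer()
    def fit(self, X, y=None):
        self.encoder.fit(X)
        return self
    def transform(self, X, y=None):
        return self.encoder.transform(X)

# If needed, swap it in:
# cat_pipeline = Pipeline([
#     ('selector', DataFrameSelector(cat_attribs)),
#     ('label_binarizer', PipelineLabelBinarizer()),
# ])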
In [92]:
#housing.tail(3)
housing_prepared = full_pipeline.fit_transform(housing)
In [93]:
# Linear Regression can be a base model
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
Out[93]:
In [94]:
housing_predictions = lin_reg.predict(housing_prepared)
In [95]:
from sklearn.metrics import mean_squared_error
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse
Out[95]:
Let's try another model/algorithm.
In [96]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
Out[96]:
In [97]:
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse
Out[97]:
As we can see, the error is zero, which indicates that this model has badly overfit the training data. Let's evaluate these models with cross-validation instead.
In [98]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
Scikit-Learn's cross-validation features expect a utility function (greater is better) rather than a cost function (lower is better), so the scoring function is actually the opposite of the MSE (i.e., a negative value), which is why the preceding code computes -scores before calculating the square root.
In [100]:
print(rmse_scores)
print(np.mean(rmse_scores))
In [102]:
# better build a function
def display_error(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
In [103]:
display_error(rmse_scores)
In [104]:
# let's do the same for linear regression
scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
display_error(rmse_scores)
In [105]:
#let's try another model
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
housing_predictions = forest_reg.predict(housing_prepared)
scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
display_error(rmse_scores)
In [106]:
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse
Out[106]:
It's a good idea to save your models, including their trained parameters and hyperparameters, as well as the cross-validation scores and actual predictions if possible.
In [107]:
from sklearn.externals import joblib
# dumping the model
joblib.dump(forest_reg, 'forest_reg.pkl')
joblib.dump(tree_reg, 'tree_reg.pkl')
#loading the model
tree_reg_loaded = joblib.load('tree_reg.pkl')
In [109]:
tree_reg_loaded.feature_importances_
Out[109]:
In [110]:
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)
Out[110]:
In [112]:
grid_search.best_params_
Out[112]:
In [113]:
grid_search.best_estimator_
Out[113]:
In [114]:
cvres = grid_search.cv_results_
for mean_scores, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_scores), params)
In [115]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances
Out[115]:
In [116]:
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_one_hot_attribs = list(encoder.classes_)
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)
Out[116]:
In [117]:
final_model = grid_search.best_estimator_
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
In [119]:
final_rmse
Out[119]:
In [ ]: