In [50]:
# import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [51]:
import os
cwd = os.getcwd()
print(cwd)


C:\learn\Git\MachineLearning\HandsOnML

In [52]:
# reading the data
df = pd.read_csv("datasets/housing/housing.csv")
df.head(5)


Out[52]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY

Getting some info about the data


In [53]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB

In [54]:
# checking for nulls
df.isnull().sum()


Out[54]:
longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [55]:
df.describe()


Out[55]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value
count 20640.000000 20640.000000 20640.000000 20640.000000 20433.000000 20640.000000 20640.000000 20640.000000 20640.000000
mean -119.569704 35.631861 28.639486 2635.763081 537.870553 1425.476744 499.539680 3.870671 206855.816909
std 2.003532 2.135952 12.585558 2181.615252 421.385070 1132.462122 382.329753 1.899822 115395.615874
min -124.350000 32.540000 1.000000 2.000000 1.000000 3.000000 1.000000 0.499900 14999.000000
25% -121.800000 33.930000 18.000000 1447.750000 296.000000 787.000000 280.000000 2.563400 119600.000000
50% -118.490000 34.260000 29.000000 2127.000000 435.000000 1166.000000 409.000000 3.534800 179700.000000
75% -118.010000 37.710000 37.000000 3148.000000 647.000000 1725.000000 605.000000 4.743250 264725.000000
max -114.310000 41.950000 52.000000 39320.000000 6445.000000 35682.000000 6082.000000 15.000100 500001.000000

Visualizing the data and checking the distribution of each attribute


In [56]:
df.hist(bins=50, figsize=(20, 12))


Out[56]:
(3x3 grid of matplotlib AxesSubplot objects; histograms of the numeric attributes rendered inline)

In [57]:
# We could use scikit-learn for this, but here is how to write the same split with plain NumPy

def train_test_split(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [58]:
train_set, test_set = train_test_split(df, 0.2)

In [59]:
print("train_set:"+str(len(train_set))+" test set:"+str(len(test_set)))


train_set:16512 test set:4128

The problem with this implementation is that it produces a different split on every run, so over many runs the model eventually gets to see the whole dataset. We can fix this by initializing the random seed with some value, but that breaks as soon as the dataset is updated, since the new data gets reshuffled into both the train and the test sets.
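One way to keep the split stable across runs and across dataset updates is to decide test-set membership from a hash of each instance's identifier: a row that once falls into the test set stays there even after new rows are appended. A minimal sketch (helper names like test_set_check and split_train_test_by_id are illustrative; np is already imported above):

import hashlib

def test_set_check(identifier, test_ratio, hash=hashlib.md5):
    # put the instance in the test set if the last byte of its hash is below test_ratio * 256
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio

def split_train_test_by_id(data, test_ratio, id_column):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio))
    return data.loc[~in_test_set], data.loc[in_test_set]

housing_with_id = df.reset_index()   # adds an `index` column to act as a stable identifier
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")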


In [60]:
# Using Scikit learn
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)
print("train_set:"+str(len(train_set))+" test set:"+str(len(test_set)))


train_set:16512 test set:4128

income_cat - creating a new column that buckets median_income into income categories, capping the category value at 5.
This column serves as the stratification variable (the y argument of the split below); the model's target is still median_house_value.


In [61]:
df["income_cat"]=np.ceil(df["median_income"]/1.5)
df["income_cat"].where(df["income_cat"]<5,5.0,inplace=True)

In [62]:
df["income_cat"].hist(bins=50)


Out[62]:
<matplotlib.axes._subplots.AxesSubplot at 0xea3e048>

As you can see, we have removed the long tail of median income and shrunk the data into the 1-5 range.
Now, using scikit-learn's StratifiedShuffleSplit to split the data


In [63]:
from sklearn.model_selection import StratifiedShuffleSplit

In [64]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index,test_index in split.split(df, df["income_cat"]):    # obj.split(X, y)
    strat_train_set = df.loc[train_index]
    strat_test_set = df.loc[test_index]

In [65]:
# let's check the % distribution of the income_cat column over the full dataset
df["income_cat"].value_counts()/len(df)


Out[65]:
3.0    0.350581
2.0    0.318847
4.0    0.176308
5.0    0.114438
1.0    0.039826
Name: income_cat, dtype: float64
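To confirm the stratification worked as intended, the same proportions can be checked on the stratified test set and compared with the full-dataset figures above (a quick sanity check, not run in this notebook):

strat_test_set["income_cat"].value_counts() / len(strat_test_set)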

In [66]:
# now dropping the income_cat column from the train and test sets
strat_train_set.drop("income_cat", axis=1, inplace=True)
strat_test_set.drop("income_cat", axis=1, inplace=True)

In [67]:
# keep a copy of the training set aside so it stays intact during our exploration steps
housing = strat_train_set.copy()

In [68]:
housing.plot(kind="scatter", x="longitude", y="latitude", figsize=(6, 6))


Out[68]:
<matplotlib.axes._subplots.AxesSubplot at 0x112c6ef0>

In [69]:
# to check the density, add the alpha option
housing.plot(kind="scatter", x="longitude", y="latitude", figsize=(6, 6), alpha=0.1)


Out[69]:
<matplotlib.axes._subplots.AxesSubplot at 0xb8c83c8>

In [70]:
housing.plot(kind="scatter", x="longitude", y="latitude", figsize=(8, 8), alpha=0.4,
            c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
             s=housing["population"]/100, label="population",
             grid=True,  legend=True)


Out[70]:
<matplotlib.axes._subplots.AxesSubplot at 0x111d4320>

Looking for correlations, if any


In [71]:
corr_matrix = housing.corr()
corr_matrix


Out[71]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value
longitude 1.000000 -0.924478 -0.105848 0.048871 0.076598 0.108030 0.063070 -0.019583 -0.047432
latitude -0.924478 1.000000 0.005766 -0.039184 -0.072419 -0.115222 -0.077647 -0.075205 -0.142724
housing_median_age -0.105848 0.005766 1.000000 -0.364509 -0.325047 -0.298710 -0.306428 -0.111360 0.114110
total_rooms 0.048871 -0.039184 -0.364509 1.000000 0.929379 0.855109 0.918392 0.200087 0.135097
total_bedrooms 0.076598 -0.072419 -0.325047 0.929379 1.000000 0.876320 0.980170 -0.009740 0.047689
population 0.108030 -0.115222 -0.298710 0.855109 0.876320 1.000000 0.904637 0.002380 -0.026920
households 0.063070 -0.077647 -0.306428 0.918392 0.980170 0.904637 1.000000 0.010781 0.064506
median_income -0.019583 -0.075205 -0.111360 0.200087 -0.009740 0.002380 0.010781 1.000000 0.687160
median_house_value -0.047432 -0.142724 0.114110 0.135097 0.047689 -0.026920 0.064506 0.687160 1.000000

In [72]:
# looking at how each attribute correlates with median_house_value
# The correlation coefficient only measures linear correlations ("if x goes up, then y generally goes up/down").
# It may completely miss nonlinear relationships (e.g., "if x is close to zero then y generally goes up")
corr_matrix["median_house_value"].sort_values(ascending=False)


Out[72]:
median_house_value    1.000000
median_income         0.687160
total_rooms           0.135097
housing_median_age    0.114110
households            0.064506
total_bedrooms        0.047689
population           -0.026920
longitude            -0.047432
latitude             -0.142724
Name: median_house_value, dtype: float64

In [73]:
# Using pandas scatter matrix function
attr = ["median_house_value","median_income","total_rooms","housing_median_age"]
pd.scatter_matrix(housing[attr], figsize=(15, 10))


Out[73]:
(4x4 grid of matplotlib AxesSubplot objects; the scatter matrix of the four attributes rendered inline)

In [74]:
housing.plot(kind="scatter", y="median_house_value", x="median_income", alpha=0.4, figsize=(12,8))


Out[74]:
<matplotlib.axes._subplots.AxesSubplot at 0x141304e0>

Experimenting with attribute combinations


In [75]:
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"] = housing["population"]/housing["households"]

In [76]:
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)


Out[76]:
median_house_value          1.000000
median_income               0.687160
rooms_per_household         0.146285
total_rooms                 0.135097
housing_median_age          0.114110
households                  0.064506
total_bedrooms              0.047689
population_per_household   -0.021985
population                 -0.026920
longitude                  -0.047432
latitude                   -0.142724
bedrooms_per_room          -0.259984
Name: median_house_value, dtype: float64

note that drop() creates a copy of the data and does not affect strat_train_set


In [77]:
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

Data Cleaning

Handling Numerical Data

There are 3 ways to handle the missing values -
a. Drop the whole feature (drop)
b. Drop only the rows with missing values (dropna)
c. Fill the NAs with some value such as the mean, the median, or something else (fillna)

housing.drop("total_bedrooms", axis=1)                  # option a
housing.dropna(subset=["total_bedrooms"])               # option b
median = housing["total_bedrooms"].median()
housing["total_bedrooms"].fillna(median, inplace=True)  # option c

Keep in mind that whatever fill value you compute on the training set must also be applied to the test set,
so you need to save this value to replace the NAs in the test data later, as shown in the sketch below.
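A rough sketch of doing this by hand with option c, reusing the training-set median on the test set (illustrative only; the notebook itself uses Imputer below):

median = housing["total_bedrooms"].median()                                          # computed on the training set only
housing["total_bedrooms"] = housing["total_bedrooms"].fillna(median)
strat_test_set["total_bedrooms"] = strat_test_set["total_bedrooms"].fillna(median)   # reuse the very same value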

Scikit-learn provides a cleaner way to do this: the Imputer class. First create an instance of Imputer, then fit it on the data (numerical columns only) so it computes the median of each column. To use Imputer you need to drop all non-numerical columns first.


In [78]:
housing_num = housing.drop("ocean_proximity", axis=1)  # ocean_proximity is a text (categorical) column

from sklearn.preprocessing import Imputer
imputer = Imputer(strategy="median")

imputer.fit(housing_num)


Out[78]:
Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)

In [79]:
print(imputer.statistics_)
print(housing_num.median().values)


[ -118.51      34.26      29.      2119.5      433.      1164.       408.
     3.5409]
[ -118.51      34.26      29.      2119.5      433.      1164.       408.
     3.5409]

In [80]:
# convert this to a pandas DataFrame for readability
housing_median = pd.DataFrame(imputer.statistics_.reshape(1,8), columns=housing_num.columns)
housing_median


Out[80]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income
0 -118.51 34.26 29.0 2119.5 433.0 1164.0 408.0 3.5409

In [81]:
# we have fitted the imputer on the training data, now transform the dataset
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns)

Handling Text and Categorical Data


In [82]:
# convert the "Text" categories to numeric codes so that ML algorithms can work with them

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)

housing_cat_encoded


Out[82]:
array([0, 0, 4, ..., 1, 0, 3], dtype=int64)

In [83]:
encoder.classes_


Out[83]:
array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'], dtype=object)

The problem with this encoding is that ML algorithms assume two nearby values are more similar than two distant ones, e.g. they treat categories 1 and 2 as closer than 1 and 4.
That is not the case here, since these are categorical values. To avoid this issue we use the OneHotEncoder method, which turns a single bit on for each category and leaves the rest off.


In [84]:
from sklearn.preprocessing import OneHotEncoder

onehot = OneHotEncoder()
housing_cat_1hot = onehot.fit_transform(housing_cat_encoded.reshape(-1, 1))

housing_cat_1hot


Out[84]:
<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

The output above is a SciPy sparse matrix. Using up tons of memory mostly to store zeros would be very wasteful, so instead a sparse matrix only stores the locations of the nonzero elements. You can use it mostly like a normal 2D array, but if you really want to convert it to a (dense) NumPy array, just call the toarray() method:


In [85]:
housing_cat_1hot.toarray()


Out[85]:
array([[ 1.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.],
       ..., 
       [ 0.,  1.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.]])

We can apply both transformations (from text categories to integer categories, then from integer categories to one-hot vectors) in one shot using the LabelBinarizer class:


In [86]:
from sklearn.preprocessing import LabelBinarizer

binarizer = LabelBinarizer()
housing_cat_1hot_encoded = binarizer.fit_transform(housing_cat)

housing_cat_1hot_encoded


Out[86]:
array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       ..., 
       [0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0]])

Transforming other features

a. Write a custom class that adds the combined attributes
b. Base it on BaseEstimator and TransformerMixin so it can be used inside a Pipeline


In [87]:
from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [88]:
attr_adder.get_params()


Out[88]:
{'add_bedrooms_per_room': False}

Feature Scaling

a. Min-max scaling (also called normalization) - use Scikit-Learn's MinMaxScaler
b. Standardization - use Scikit-Learn's StandardScaler

Standardization is quite different: first it subtracts the mean value (so standardized values always have a zero mean), and then it divides by the standard deviation so that the resulting distribution has unit variance. Unlike min-max scaling, standardization does not bound values to a specific range, which may be a problem for some algorithms (e.g., neural networks often expect an input value ranging from 0 to 1). However, standardization is much less affected by outliers. For example, suppose a district had a median income equal to 100 (by mistake). Min-max scaling would then crush all the other values from 0-15 down to 0-0.15, whereas standardization would not be much affected.
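As a minimal illustration (not part of this notebook's workflow, where the scaling happens inside the pipeline below), both scalers could be applied directly to the imputed numeric data housing_tr from above:

from sklearn.preprocessing import MinMaxScaler, StandardScaler

min_max_scaler = MinMaxScaler()
housing_num_minmax = min_max_scaler.fit_transform(housing_tr)   # every feature squeezed into [0, 1]

std_scaler = StandardScaler()
housing_num_std = std_scaler.fit_transform(housing_tr)          # zero mean, unit variance per feature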

Transformation Pipeline


In [89]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Pipeline (list)  -- list of tuples (name, transformation)
# transformation must have fit_transform() method
num_pipeline = Pipeline([
        ('imputer', Imputer(strategy="median")),   # this is equivalent to-> imputer = Imputer(strategy="median")
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
housing_num_tr = num_pipeline.fit_transform(housing_num)

So far we have implemented a Pipeline for the numerical columns only; we can do the same for the categorical/text columns.
With the help of scikit-learn's FeatureUnion class, we can run the numerical and the categorical transformations in one go and concatenate both results to produce the final dataset.

Each subpipeline starts with a selector transformer: it simply transforms the data by selecting the desired attributes (numerical or categorical), dropping the rest, and converting the resulting DataFrame to a NumPy array.


In [90]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [91]:
from sklearn.pipeline import FeatureUnion

num_attribs = list(housing_num)     # list of numerical columns
cat_attribs = ["ocean_proximity"]   # list of categorical columns

num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('label_binarizer', LabelBinarizer()),
    ])

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

print(num_attribs)
print(cat_attribs)


['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']
['ocean_proximity']

In [92]:
#housing.tail(3)
housing_prepared = full_pipeline.fit_transform(housing)

Train and Evaluate the model


In [93]:
# Linear Regression can serve as a baseline model

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)


Out[93]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [94]:
housing_predictions = lin_reg.predict(housing_prepared)

In [95]:
from sklearn.metrics import mean_squared_error

lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse


Out[95]:
68628.198198489219

Trying another model/algorithm


In [96]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)


Out[96]:
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

In [97]:
housing_predictions = tree_reg.predict(housing_prepared)

tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse


Out[97]:
0.0

As we can see there is no error at all, which indicates that this model badly overfits the training data, so let's evaluate these models with the cross-validation technique


In [98]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)

Scikit-Learn's cross-validation features expect a utility function (greater is better) rather than a cost function (lower is better), so the scoring function is actually the opposite of the MSE (i.e., a negative value), which is why the preceding code computes -scores before calculating the square root.


In [100]:
print(rmse_scores)
print(np.mean(rmse_scores))


[ 67784.52277265  68798.9718476   72039.67801005  68002.98232346
  72961.07025773  75668.03593291  69584.17937419  69960.7732639
  76281.91680959  69771.94322463]
71085.4073817

In [102]:
# better to build a function for this

def display_error(scores):
    print("Scores:", scores)
    print("Mean:", np.mean(scores))
    print("Standard deviation:", scores.std())

In [103]:
display_error(rmse_scores)


Scores: [ 67784.52277265  68798.9718476   72039.67801005  68002.98232346
  72961.07025773  75668.03593291  69584.17937419  69960.7732639
  76281.91680959  69771.94322463]
Mean: 71085.4073817
Standard deviation: 2886.20139187

In [104]:
# let's do the same for linear regression

scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
display_error(rmse_scores)


Scores: [ 66782.73843989  66960.118071    70347.95244419  74739.57052552
  68031.13388938  71193.84183426  64969.63056405  68281.61137997
  71552.91566558  67665.10082067]
Mean: 69052.4613635
Standard deviation: 2731.6740018

In [105]:
# let's try another model

from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
housing_predictions = forest_reg.predict(housing_prepared)

scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
display_error(rmse_scores)


Scores: [ 52670.18001482  50348.80677872  52892.83900992  54672.42876256
  52709.57283225  54948.96736875  52121.26339085  50498.48039838
  54773.10899716  54187.20603396]
Mean: 52982.2853587
Standard deviation: 1594.01831389

In [106]:
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse


Out[106]:
22695.712388029056

It's a good idea to save your model, its parameters and hyperparameters, the cross-validation scores, and the actual predictions if possible


In [107]:
from sklearn.externals import joblib

# dumping the model
joblib.dump(forest_reg, 'forest_reg.pkl')
joblib.dump(tree_reg, 'tree_reg.pkl')

#loading the model
tree_reg_loaded = joblib.load('tree_reg.pkl')

In [109]:
tree_reg_loaded.feature_importances_


Out[109]:
array([  6.62958649e-02,   5.61100747e-02,   4.03414845e-02,
         1.28410592e-02,   1.31259277e-02,   1.02776136e-02,
         1.12696542e-02,   4.72354615e-01,   2.79828675e-02,
         1.22657793e-01,   2.36225422e-02,   8.48678700e-04,
         1.40268030e-01,   0.00000000e+00,   4.17254879e-04,
         1.58654055e-03])

Fine-tune the model

Using Grid Search


In [110]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)


Out[110]:
GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'n_estimators': [3, 10], 'max_features': [2, 3, 4], 'bootstrap': [False]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [112]:
grid_search.best_params_


Out[112]:
{'max_features': 8, 'n_estimators': 30}

In [113]:
grid_search.best_estimator_


Out[113]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=8, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [114]:
cvres = grid_search.cv_results_

for mean_scores, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_scores), params)


65404.0888668 {'n_estimators': 3, 'max_features': 2}
55613.446117 {'n_estimators': 10, 'max_features': 2}
53192.1387609 {'n_estimators': 30, 'max_features': 2}
59755.096622 {'n_estimators': 3, 'max_features': 4}
53116.0895546 {'n_estimators': 10, 'max_features': 4}
50694.1639439 {'n_estimators': 30, 'max_features': 4}
59263.1177799 {'n_estimators': 3, 'max_features': 6}
52298.8277049 {'n_estimators': 10, 'max_features': 6}
50199.3075679 {'n_estimators': 30, 'max_features': 6}
59315.502987 {'n_estimators': 3, 'max_features': 8}
51729.6741884 {'n_estimators': 10, 'max_features': 8}
49888.1835241 {'n_estimators': 30, 'max_features': 8}
61852.3749068 {'n_estimators': 3, 'max_features': 2, 'bootstrap': False}
54126.063434 {'n_estimators': 10, 'max_features': 2, 'bootstrap': False}
60580.4889183 {'n_estimators': 3, 'max_features': 3, 'bootstrap': False}
52801.4653098 {'n_estimators': 10, 'max_features': 3, 'bootstrap': False}
58821.161113 {'n_estimators': 3, 'max_features': 4, 'bootstrap': False}
51876.7879822 {'n_estimators': 10, 'max_features': 4, 'bootstrap': False}

Analyze the best model


In [115]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances


Out[115]:
array([  6.52749031e-02,   6.40291613e-02,   4.39969150e-02,
         1.58179553e-02,   1.49377521e-02,   1.51419081e-02,
         1.49064223e-02,   3.83992123e-01,   5.30596760e-02,
         1.15412263e-01,   6.28743452e-02,   2.90580930e-03,
         1.42608311e-01,   1.17525167e-04,   2.22495395e-03,
         2.69997549e-03])

In [116]:
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_one_hot_attribs = list(encoder.classes_)
attributes = num_attribs + extra_attribs + cat_one_hot_attribs

sorted(zip(feature_importances, attributes), reverse=True)


Out[116]:
[(0.38399212319486603, 'median_income'),
 (0.1426083108433914, 'INLAND'),
 (0.11541226347731361, 'pop_per_hhold'),
 (0.065274903135532719, 'longitude'),
 (0.064029161327101694, 'latitude'),
 (0.062874345215446756, 'bedrooms_per_room'),
 (0.053059676030047707, 'rooms_per_hhold'),
 (0.043996915007104544, 'housing_median_age'),
 (0.015817955299075339, 'total_rooms'),
 (0.015141908134835678, 'population'),
 (0.014937752121754654, 'total_bedrooms'),
 (0.014906422305543099, 'households'),
 (0.00290580929823078, '<1H OCEAN'),
 (0.0026999754883498483, 'NEAR OCEAN'),
 (0.0022249539544208223, 'NEAR BAY'),
 (0.00011752516698529767, 'ISLAND')]

Evaluate the model on TEST Set


In [117]:
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [119]:
final_rmse


Out[119]:
48203.962654764393

In [ ]: