In [2]:
import os
import tarfile
from six.moves import urllib
from IPython.core.display import display, HTML

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)

    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()
    
fetch_housing_data()
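# Sanity check (illustrative sketch): confirm the extracted CSV landed where
# load_housing_data() below will look for it.
assert os.path.exists(os.path.join(HOUSING_PATH, "housing.csv"))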

import pandas as pd

def load_housing_data(housing_path = HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

housing = load_housing_data()
housing.head()


Out[2]:
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  population  households  median_income  median_house_value ocean_proximity
0    -122.23     37.88                41.0        880.0           129.0       322.0       126.0         8.3252            452600.0        NEAR BAY
1    -122.22     37.86                21.0       7099.0          1106.0      2401.0      1138.0         8.3014            358500.0        NEAR BAY
2    -122.24     37.85                52.0       1467.0           190.0       496.0       177.0         7.2574            352100.0        NEAR BAY
3    -122.25     37.85                52.0       1274.0           235.0       558.0       219.0         5.6431            341300.0        NEAR BAY
4    -122.25     37.85                52.0       1627.0           280.0       565.0       259.0         3.8462            342200.0        NEAR BAY

In [3]:
housing.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB

In [4]:
housing.size   # total number of values: 20640 rows x 10 columns


Out[4]:
206400

In [5]:
# What categories exist in ocean_proximity, and how many districts belong to each one?
housing["ocean_proximity"].value_counts()


Out[5]:
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [6]:
housing.describe()


Out[6]:
         longitude      latitude  housing_median_age   total_rooms  total_bedrooms    population    households  median_income  median_house_value
count  20640.000000  20640.000000        20640.000000  20640.000000    20433.000000  20640.000000  20640.000000   20640.000000        20640.000000
mean    -119.569704     35.631861           28.639486   2635.763081      537.870553   1425.476744    499.539680       3.870671       206855.816909
std        2.003532      2.135952           12.585558   2181.615252      421.385070   1132.462122    382.329753       1.899822       115395.615874
min     -124.350000     32.540000            1.000000      2.000000        1.000000      3.000000      1.000000       0.499900        14999.000000
25%     -121.800000     33.930000           18.000000   1447.750000      296.000000    787.000000    280.000000       2.563400       119600.000000
50%     -118.490000     34.260000           29.000000   2127.000000      435.000000   1166.000000    409.000000       3.534800       179700.000000
75%     -118.010000     37.710000           37.000000   3148.000000      647.000000   1725.000000    605.000000       4.743250       264725.000000
max     -114.310000     41.950000           52.000000  39320.000000     6445.000000  35682.000000   6082.000000      15.000100       500001.000000

In [7]:
%matplotlib inline 
import matplotlib.pyplot as plt
housing.hist(bins = 50, figsize=(20,15))
plt.show()



In [8]:
display(HTML('<h3>Create A Test Set</h3>'))


Create A Test Set


In [9]:
import numpy as np

def split_train_test(data, test_ratio):
    # Seeding the RNG first (e.g. np.random.seed(42)) makes the shuffle, and
    # therefore the split, reproducible across runs
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
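# Reproducibility sketch: seeding NumPy's RNG before each call produces the
# identical shuffle, hence the identical split.
np.random.seed(42)
train_a, test_a = split_train_test(housing, 0.2)
np.random.seed(42)
train_b, test_b = split_train_test(housing, 0.2)
assert train_a.index.equals(train_b.index)  # same seed -> same split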

In [10]:
train_set,test_set = split_train_test(housing,0.2)
print("The data has ",len(train_set)," training data and",len(test_set)," test data")


The data has  16512  training data and 4128  test data

In [11]:
# import hashlib
# hash = hashlib.md5
# a = hash(np.int64(100))
# # For hash(np.int64(100)), the digest's byte representation is b"\xbcr#\x9eE't\xbcW\x05K\x11\xac]Q*"
# # The last byte is '*', and its integer value is the ASCII code of '*' -> 42
# print(a.digest())

In [12]:
display(HTML('<h3>Use Hashing to create A Test Set</h3>'))


Use Hashing to create A Test Set


In [13]:
import hashlib

def test_set_check(identifier, test_ratio, hash):
    # The digest's last byte is roughly uniform over 0..255, so keeping the
    # identifiers whose byte is < 256*test_ratio selects ~test_ratio of the data,
    # and the selection stays stable even as the dataset grows
    return hash(np.int64(identifier)).digest()[-1] < 256*test_ratio

In [14]:
def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
    return data.loc[~in_test_set], data.loc[in_test_set]
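# Quick check (sketch): md5's last digest byte is ~uniform over 0..255, so for a
# ratio of 0.2 roughly 20% of identifiers should fall in the test set.
frac = np.mean([test_set_check(i, 0.2, hashlib.md5) for i in range(10000)])
print(frac)  # expect a value close to 0.2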

In [15]:
# Adding index column
housing_with_id = housing.reset_index()  
train_set,test_set = split_train_test_by_id(housing_with_id,0.2,"index")

In [16]:
# Adding an id based on longitude and latitude

# housing_with_id["id"] = housing["longitude"]*1000 +  housing["latitude"]
# train_set,test_set = split_train_test_by_id(housing_with_id,0.2,"id")
# print(train_set)
# print(test_set)

In [17]:
housing["income_cat"] = np.ceil(housing["median_income"]/1.5)

In [18]:
housing["income_cat"].where(housing["income_cat"]<5,5.0,inplace=True)

In [19]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits = 1,test_size = 0.2,random_state=42)

for train_index,test_index in split.split(housing,housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
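# Comparison sketch: the stratified test set should reproduce the income_cat
# proportions of the full dataset almost exactly - that is the point of
# StratifiedShuffleSplit.
print(housing["income_cat"].value_counts() / len(housing))
print(strat_test_set["income_cat"].value_counts() / len(strat_test_set))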

In [20]:
# income_cat was only needed for stratified sampling, so drop it from both sets
# (set_ avoids shadowing the built-in set)
for set_ in (strat_train_set, strat_test_set):
    set_.drop(["income_cat"], axis=1, inplace=True)

In [21]:
display(HTML('<h3>Visualize Data To Gain Insights</h3>'))


Visualize Data To Gain Insights


In [22]:
housing = strat_train_set.copy()
housing.plot(kind="scatter" , x = "longitude",y="latitude")
plt.show()



In [23]:
housing = strat_train_set.copy()
housing.plot(kind="scatter" , x = "longitude",y="latitude",alpha=0.1)
plt.show()



In [24]:
housing.plot(kind = "scatter", x = "longitude",y = "latitude",alpha=0.4,
             s=housing["population"]/100,label="population",c="median_house_value",
             cmap=plt.get_cmap("jet"),colorbar = True)
plt.legend()


Out[24]:
<matplotlib.legend.Legend at 0x10ec07438>

In [25]:
corr_matrix = housing.corr()
print(corr_matrix)


                    longitude  latitude  housing_median_age  total_rooms  \
longitude            1.000000 -0.924478           -0.105848     0.048871   
latitude            -0.924478  1.000000            0.005766    -0.039184   
housing_median_age  -0.105848  0.005766            1.000000    -0.364509   
total_rooms          0.048871 -0.039184           -0.364509     1.000000   
total_bedrooms       0.076598 -0.072419           -0.325047     0.929379   
population           0.108030 -0.115222           -0.298710     0.855109   
households           0.063070 -0.077647           -0.306428     0.918392   
median_income       -0.019583 -0.075205           -0.111360     0.200087   
median_house_value  -0.047432 -0.142724            0.114110     0.135097   

                    total_bedrooms  population  households  median_income  \
longitude                 0.076598    0.108030    0.063070      -0.019583   
latitude                 -0.072419   -0.115222   -0.077647      -0.075205   
housing_median_age       -0.325047   -0.298710   -0.306428      -0.111360   
total_rooms               0.929379    0.855109    0.918392       0.200087   
total_bedrooms            1.000000    0.876320    0.980170      -0.009740   
population                0.876320    1.000000    0.904637       0.002380   
households                0.980170    0.904637    1.000000       0.010781   
median_income            -0.009740    0.002380    0.010781       1.000000   
median_house_value        0.047689   -0.026920    0.064506       0.687160   

                    median_house_value  
longitude                    -0.047432  
latitude                     -0.142724  
housing_median_age            0.114110  
total_rooms                   0.135097  
total_bedrooms                0.047689  
population                   -0.026920  
households                    0.064506  
median_income                 0.687160  
median_house_value            1.000000  

In [26]:
corr_matrix["median_house_value"].sort_values(ascending=False)


Out[26]:
median_house_value    1.000000
median_income         0.687160
total_rooms           0.135097
housing_median_age    0.114110
households            0.064506
total_bedrooms        0.047689
population           -0.026920
longitude            -0.047432
latitude             -0.142724
Name: median_house_value, dtype: float64

In [27]:
display(HTML('<h3>Scatter Matrix</h3>'))


Scatter Matrix


In [28]:
from pandas.plotting import scatter_matrix

attributes = ["median_house_value","median_income","total_rooms","housing_median_age"]
scatter_matrix(housing[attributes],figsize=(12,8))
plt.show()



In [29]:
housing.plot(kind="scatter",x="median_income",y="median_house_value",alpha=0.1)
plt.show()



In [30]:
display(HTML('<h3>Experimenting with Attribute Combinations</h3>'))


Experimenting with Attribute Combinations


In [31]:
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"] = housing["population"]/housing["households"]

corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)


Out[31]:
median_house_value          1.000000
median_income               0.687160
rooms_per_household         0.146285
total_rooms                 0.135097
housing_median_age          0.114110
households                  0.064506
total_bedrooms              0.047689
population_per_household   -0.021985
population                 -0.026920
longitude                  -0.047432
latitude                   -0.142724
bedrooms_per_room          -0.259984
Name: median_house_value, dtype: float64

In [32]:
display(HTML('<h1>Preparing Data For Machine Learning Algorithm</h1>'))


Preparing Data For Machine Learning Algorithm


In [33]:
housing = strat_train_set.drop("median_house_value",axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

In [34]:
display(HTML('<h3>Data Cleaning</h3>'))


Data Cleaning


In [35]:
# Options to handle missing data:

# housing.dropna(subset=["total_bedrooms"])      # option 1: drop districts with missing values
# housing.drop("total_bedrooms", axis=1)         # option 2: drop the whole attribute
# median = housing["total_bedrooms"].median()    # option 3: fill with the median
# housing["total_bedrooms"].fillna(median)

# Handling missing data using Imputer
from sklearn.preprocessing import Imputer

# Creating an Imputer instance
imputer = Imputer(strategy="median")

# The median can only be computed on numerical attributes, so drop the text attribute
housing_num = housing.drop("ocean_proximity", axis=1)

# Fit the imputer instance to the training data
imputer.fit(housing_num)

# The imputer stores the learned medians in its statistics_ instance variable
imputer.statistics_


Out[35]:
array([ -118.51  ,    34.26  ,    29.    ,  2119.5   ,   433.    ,
        1164.    ,   408.    ,     3.5409])

In [36]:
housing_num.median().values


Out[36]:
array([ -118.51  ,    34.26  ,    29.    ,  2119.5   ,   433.    ,
        1164.    ,   408.    ,     3.5409])

In [37]:
# Use the "trained" imputer to transform the training set, replacing missing
# values with the learned medians

X = imputer.transform(housing_num)

In [38]:
# X is a plain NumPy array -> convert it back into a DataFrame

housing_tr = pd.DataFrame(X,columns = housing_num.columns)
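# Verification sketch: after imputation no column should contain missing values.
print(housing_tr.isnull().sum().sum())  # expect 0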

In [39]:
display(HTML('<h1>Handling Text and categorical Attributes</h1>'))


Handling Text and categorical Attributes


In [40]:
# Converting text labels to numbers -> transformer: LabelEncoder

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded


Out[40]:
array([0, 0, 4, ..., 1, 0, 3])

In [41]:
# '<1H OCEAN' is mapped to 0, 'INLAND' to 1, etc.
encoder.classes_


Out[41]:
array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'], dtype=object)

In [42]:
# But there is an issue with this type of representation: ML algorithms will assume
# that two nearby values are more similar than two distant values, which is not the
# case here (e.g. categories 0 and 4 are more similar than categories 0 and 1)

# So we use one-hot encoding instead - one attribute is 1 (hot) and the rest are 0 (cold)

# Reshape the 1D array into a 2D array, since fit_transform() expects a 2D array:
# housing_cat_encoded.reshape(-1,1)

from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot


Out[42]:
<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

In [43]:
# As shown above, the output is a SciPy sparse matrix, so only the non-zero
# values are stored in memory (to avoid wasting memory on thousands of zeros)

# To convert it into a dense array, call toarray()
housing_cat_1hot.toarray()


Out[43]:
array([[ 1.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.],
       ..., 
       [ 0.,  1.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.]])

In [44]:
# The two transformations (text categories -> integer categories -> one-hot
# vectors) can be applied in one shot using LabelBinarizer

from sklearn.preprocessing import LabelBinarizer

encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot


Out[44]:
array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       ..., 
       [0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0]])

In [45]:
display(HTML('<h1>Custom Transformers</h1>'))


Custom Transformers


In [46]:
# TransformerMixin provides fit_transform(); BaseEstimator provides get_params() and set_params()
from sklearn.base import BaseEstimator,TransformerMixin

# Column indices of the attributes we combine (in the NumPy array passed to transform)
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator,TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    
    def fit(self,X,y=None):
        return self
    
    def transform(self, X, y=None):
        rooms_per_household = X[:,rooms_ix] / X[:,household_ix]
        population_per_household = X[:,population_ix] / X[:,household_ix]
        
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:,bedrooms_ix] / X[:,rooms_ix]
            return np.c_[X,rooms_per_household,population_per_household,bedrooms_per_room]
            
        else:
            return np.c_[X,rooms_per_household,population_per_household]

In [47]:
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)
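# Optional sketch: wrap the transformed array back into a DataFrame; the two extra
# column names below are our own labels, not anything the transformer produces.
housing_extra_attribs = pd.DataFrame(
    housing_extra_attribs,
    columns=list(housing.columns) + ["rooms_per_household", "population_per_household"])
housing_extra_attribs.head()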

In [48]:
display(HTML('<h1>Feature Scaling</h1>'))


Feature Scaling


In [49]:
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
# fit() learns each column's mean and standard deviation; transform() then
# standardizes every column to mean 0 and standard deviation 1
std_scaler.fit(housing_tr.values)
std_scaler.transform(housing_tr.values)


Out[49]:
array([[-1.15604281,  0.77194962,  0.74333089, ..., -0.63621141,
        -0.42069842, -0.61493744],
       [-1.17602483,  0.6596948 , -1.1653172 , ..., -0.99833135,
        -1.02222705,  1.33645936],
       [ 1.18684903, -1.34218285,  0.18664186, ..., -0.43363936,
        -0.0933178 , -0.5320456 ],
       ..., 
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.60790363,
         0.71315642, -0.3167053 ],
       [ 0.78221312, -0.85106801,  0.18664186, ..., -0.05717804,
        -0.37545069,  0.09812139],
       [-1.43579109,  0.99645926,  1.85670895, ..., -0.13515931,
         0.3777909 , -0.15779865]])
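# Sanity check (sketch): each standardized column should now have mean ~0 and std ~1.
scaled = std_scaler.transform(housing_tr.values)
print(scaled.mean(axis=0).round(2))  # expect ~zeros
print(scaled.std(axis=0).round(2))   # expect ~ones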

In [50]:
display(HTML('<h1>Transformer Pipelines</h1>'))


Transformer Pipelines


In [51]:
# To handle many data transformation steps

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


num_pipeline = Pipeline([('imputer',Imputer(strategy="median")),
                         ('attribs_adder',CombinedAttributesAdder()),
                         ('std_scaler',StandardScaler())])

housing_num_tr = num_pipeline.fit_transform(housing_num)
housing_num_tr


Out[51]:
array([[-1.15604281,  0.77194962,  0.74333089, ..., -0.31205452,
        -0.08649871,  0.15531753],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.21768338,
        -0.03353391, -0.83628902],
       [ 1.18684903, -1.34218285,  0.18664186, ..., -0.46531516,
        -0.09240499,  0.4222004 ],
       ..., 
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.3469342 ,
        -0.03055414, -0.52177644],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.02499488,
         0.06150916, -0.30340741],
       [-1.43579109,  0.99645926,  1.85670895, ..., -0.22852947,
        -0.09586294,  0.10180567]])

In [52]:
# Transformer to convert a DataFrame to a NumPy array

In [53]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self,attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X[self.attribute_names].values
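# Usage sketch: the selector pulls named columns out of a DataFrame and returns
# a plain NumPy array, which is what the downstream sklearn transformers expect.
DataFrameSelector(["median_income"]).fit_transform(housing)[:5]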

In [54]:
#Combining two transformer pipelines -> FeatureUnion

In [58]:
from sklearn.base import BaseEstimator, TransformerMixin

class LabelBinarizer_Modified(TransformerMixin, BaseEstimator):
    # sklearn's LabelBinarizer.fit_transform() takes a single argument, so it
    # breaks inside a Pipeline, which calls fit_transform(X, y); this wrapper
    # gives it the standard transformer signature
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        encoder = LabelBinarizer()
        return encoder.fit_transform(X)
        

from sklearn.pipeline import FeatureUnion


num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([('selector',DataFrameSelector(num_attribs)),
                         ('imputer',Imputer(strategy="median")),
                         ('attribs_adder',CombinedAttributesAdder()),
                         ('std_scaler',StandardScaler())])

cat_pipeline = Pipeline([('selector',DataFrameSelector(cat_attribs)),
                         ('label_binarizer',LabelBinarizer_Modified())])

full_pipeline = FeatureUnion(transformer_list=[("num_pipeline",num_pipeline),
                                              ("cat_pipeline",cat_pipeline)])

In [56]:
import sys
# Debugging leftover: a recursion limit this high should not be needed for the pipeline
sys.setrecursionlimit(100000000)

sys.getrecursionlimit()


Out[56]:
100000000

In [60]:
housing_prepared = full_pipeline.fit_transform(housing)
print(housing_prepared)


[[-1.15604281  0.77194962  0.74333089 ...,  0.          0.          0.        ]
 [-1.17602483  0.6596948  -1.1653172  ...,  0.          0.          0.        ]
 [ 1.18684903 -1.34218285  0.18664186 ...,  0.          0.          1.        ]
 ..., 
 [ 1.58648943 -0.72478134 -1.56295222 ...,  0.          0.          0.        ]
 [ 0.78221312 -0.85106801  0.18664186 ...,  0.          0.          0.        ]
 [-1.43579109  0.99645926  1.85670895 ...,  0.          1.          0.        ]]
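# Shape check (sketch): 8 numeric attributes + 3 engineered ratios + 5 one-hot
# categories should give 16 feature columns for the 16512 training districts.
print(housing_prepared.shape)  # expect (16512, 16)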
