In [2]:
import os
import tarfile
from six.moves import urllib
from IPython.core.display import display, HTML
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DOWNLOAD_ROOT+HOUSING_PATH+"/housing.tgz"
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()
fetch_housing_data()
import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
housing = load_housing_data()
housing.head()
Out[2]:
In [3]:
housing.info()
In [4]:
housing.size
Out[4]:
In [5]:
# What categories exist in ocean_proximity, and how many districts belong to each category?
housing["ocean_proximity"].value_counts()
Out[5]:
In [6]:
housing.describe()
Out[6]:
In [7]:
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins = 50, figsize=(20,15))
plt.show()
In [8]:
display(HTML('<h3>Create a Test Set</h3>'))
In [9]:
import numpy as np
def split_train_test(data, test_ratio):
    # A seed can be set with np.random.seed(num) to generate the same shuffled indices each run
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
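In [ ]:
# A minimal sketch of the seeding idea noted above (the train_a/test_a names are
# illustrative): fixing NumPy's seed before each call makes the shuffle reproducible.
np.random.seed(42)
train_a, test_a = split_train_test(housing, 0.2)
np.random.seed(42)
train_b, test_b = split_train_test(housing, 0.2)
print(train_a.index.equals(train_b.index))  # expect True: identical splits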
In [10]:
train_set, test_set = split_train_test(housing, 0.2)
print("The data has", len(train_set), "training instances and", len(test_set), "test instances")
In [11]:
# import hashlib
# hash = hashlib.md5
# a = hash(np.int64(100))
# # For hash(np.int64(100)), the byte representation of the digest is:
# # b"\xbcr#\x9eE't\xbcW\x05K\x11\xac]Q*"
# # The last byte is '*', whose integer value is 42 (the ASCII code of '*')
# print(a.digest())
In [12]:
display(HTML('<h3>Use Hashing to Create a Test Set</h3>'))
In [13]:
import hashlib
def test_set_check(identifier, test_ratio, hash):
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio
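In [ ]:
# Worked example using the digest from the commented cell above: the last byte of
# md5(np.int64(100)) is 42, and 42 < 256 * 0.2 == 51.2, so identifier 100 would
# land in the test set.
print(test_set_check(100, 0.2, hashlib.md5))  # expect True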
In [14]:
def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
    return data.loc[~in_test_set], data.loc[in_test_set]
In [15]:
# Adding index column
housing_with_id = housing.reset_index()
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")
In [16]:
# Adding an id based on longitude and latitude
# housing_with_id["id"] = housing["longitude"]*1000 + housing["latitude"]
# train_set,test_set = split_train_test_by_id(housing_with_id,0.2,"id")
# print(train_set)
# print(test_set)
In [17]:
housing["income_cat"] = np.ceil(housing["median_income"]/1.5)
In [18]:
housing["income_cat"].where(housing["income_cat"]<5,5.0,inplace=True)
In [19]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
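In [ ]:
# Sanity check (sketch): the income-category proportions in the stratified test set
# should closely match those of the full dataset.
print(housing["income_cat"].value_counts() / len(housing))
print(strat_test_set["income_cat"].value_counts() / len(strat_test_set))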
In [20]:
# Remove income_cat so the data is back to its original attributes
for set_ in (strat_train_set, strat_test_set):
    set_.drop(["income_cat"], axis=1, inplace=True)
In [21]:
display(HTML('<h3>Visualize Data To Gain Insights</h3>'))
In [22]:
housing = strat_train_set.copy()
housing.plot(kind="scatter" , x = "longitude",y="latitude")
plt.legend()
In [23]:
housing = strat_train_set.copy()
housing.plot(kind="scatter" , x = "longitude",y="latitude",alpha=0.1)
plt.legend()
In [24]:
housing.plot(kind = "scatter", x = "longitude",y = "latitude",alpha=0.4,
s=housing["population"]/100,label="population",c="median_house_value",
cmap=plt.get_cmap("jet"),colorbar = True)
plt.legend()
Out[24]:
In [25]:
corr_matrix = housing.corr()
print(corr_matrix)
In [26]:
corr_matrix["median_house_value"].sort_values(ascending=False)
Out[26]:
In [27]:
display(HTML('<h3>Scatter Matrix</h3>'))
In [28]:
from pandas.plotting import scatter_matrix
attributes = ["median_house_value","median_income","total_rooms","housing_median_age"]
scatter_matrix(housing[attributes],figsize=(12,8))
plt.show()
In [29]:
housing.plot(kind="scatter",x="median_income",y="median_house_value",alpha=0.1)
plt.show()
In [30]:
display(HTML('<h3>Experimenting with Attribute Combinations</h3>'))
In [31]:
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"] = housing["population"]/housing["households"]
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
Out[31]:
In [32]:
display(HTML('<h1>Preparing the Data for Machine Learning Algorithms</h1>'))
In [33]:
housing = strat_train_set.drop("median_house_value",axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
In [34]:
display(HTML('<h3>Data Cleaning</h3>'))
In [35]:
# Options to handle missing data:
# housing.dropna(subset=["total_bedrooms"])    # drop the affected districts
# housing.drop("total_bedrooms", axis=1)       # drop the whole attribute
# median = housing["total_bedrooms"].median()  # fill missing values with the median
# housing["total_bedrooms"].fillna(median)
#Handling missing data using Imputer
from sklearn.preprocessing import Imputer
#Creating Imputer instance
imputer = Imputer(strategy = "median")
#Median can be computed only on numerical values, so drop text attribute
housing_num = housing.drop("ocean_proximity",axis = 1)
#Fit imputer instance to training data
imputer.fit(housing_num)
#Imputer stores medians in statistics_ instance variable
imputer.statistics_
Out[35]:
In [36]:
housing_num.median().values
Out[36]:
In [37]:
# Use the 'trained' imputer to transform the training set, replacing missing
# values with the learned medians
X = imputer.transform(housing_num)
In [38]:
#X is a plain array -> convert it into dataframe
housing_tr = pd.DataFrame(X,columns = housing_num.columns)
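In [ ]:
# Sanity check (sketch): the imputed DataFrame should contain no missing values.
print(housing_tr.isnull().sum().sum())  # expect 0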
In [39]:
display(HTML('<h3>Handling Text and Categorical Attributes</h3>'))
In [40]:
# Converting text labels to numbers --> transformer: LabelEncoder
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded
Out[40]:
In [41]:
# '<1H OCEAN' is mapped to 0, INLAND to 1, etc.
encoder.classes_
Out[41]:
In [42]:
# But there is an issue with this type of representation: ML algorithms will assume
# that two nearby values are more similar than two distant values, which is not the
# case here (e.g. categories 0 and 4 are more similar than categories 1 and 4).
# So we use one-hot encoding: 1 (hot) and 0 (cold).
# Reshape the 1D array to 2D, since fit_transform() accepts only 2D arrays:
# housing_cat_encoded.reshape(-1,1)
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot
Out[42]:
In [43]:
# As shown above, the output is a SciPy sparse matrix:
# only the non-zero values are stored in memory, to avoid wasting space.
# To convert it into a dense array, call toarray()
housing_cat_1hot.toarray()
Out[43]:
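In [ ]:
# Sketch: the sparse matrix stores only one non-zero entry per row, versus
# rows x categories values for the dense representation.
print(housing_cat_1hot.nnz, housing_cat_1hot.shape)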
In [44]:
# Both transformations (from text categories to integer categories, then from
# integer categories to one-hot vectors) can be applied in one shot with
# LabelBinarizer
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot
Out[44]:
In [45]:
display(HTML('<h3>Custom Transformers</h3>'))
In [46]:
# TransformerMixin provides fit_transform(); BaseEstimator provides get_params() and set_params()
from sklearn.base import BaseEstimator,TransformerMixin
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
In [47]:
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)
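In [ ]:
# Quick check (sketch): with add_bedrooms_per_room=False the transformer should
# append two engineered columns to the input array.
print(housing.values.shape, "->", housing_extra_attribs.shape)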
In [48]:
display(HTML('<h3>Feature Scaling</h3>'))
In [49]:
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
std_scaler.fit(housing_tr.values)
std_scaler.transform(housing_tr.values)
Out[49]:
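In [ ]:
# Sanity check (sketch): StandardScaler's output for the first column should match
# (x - mean) / std computed by hand (the scaler uses the population std, ddof=0).
scaled = std_scaler.transform(housing_tr.values)
col = housing_tr.values[:, 0]
print(np.allclose(scaled[:, 0], (col - col.mean()) / col.std()))  # expect True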
In [50]:
display(HTML('<h3>Transformer Pipelines</h3>'))
In [51]:
# A Pipeline chains many data transformation steps in sequence
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([('imputer', Imputer(strategy="median")),
                         ('attribs_adder', CombinedAttributesAdder()),
                         ('std_scaler', StandardScaler())])
housing_num_tr = num_pipeline.fit_transform(housing_num)
housing_num_tr
Out[51]:
In [52]:
# Transformer to convert a DataFrame to a NumPy array
In [53]:
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X[self.attribute_names].values
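In [ ]:
# Usage sketch (the `selector` name is illustrative): selecting the numerical
# columns yields a plain NumPy array.
selector = DataFrameSelector(list(housing_num))
print(selector.fit_transform(housing).shape)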
In [54]:
#Combining two transformer pipelines -> FeatureUnion
In [58]:
from sklearn.base import BaseEstimator, TransformerMixin
# LabelBinarizer's fit_transform() accepts a single argument, so it breaks when a
# Pipeline calls it with both X and y; this wrapper restores the two-argument API.
class LabelBinarizer_Modified(TransformerMixin, BaseEstimator):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        encoder = LabelBinarizer()
        return encoder.fit_transform(X)
from sklearn.pipeline import FeatureUnion
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
num_pipeline = Pipeline([('selector', DataFrameSelector(num_attribs)),
                         ('imputer', Imputer(strategy="median")),
                         ('attribs_adder', CombinedAttributesAdder()),
                         ('std_scaler', StandardScaler())])
cat_pipeline = Pipeline([('selector', DataFrameSelector(cat_attribs)),
                         ('label_binarizer', LabelBinarizer_Modified())])
full_pipeline = FeatureUnion(transformer_list=[("num_pipeline", num_pipeline),
                                               ("cat_pipeline", cat_pipeline)])
In [56]:
import sys
# Raise Python's recursion limit (likely a leftover workaround from debugging)
sys.setrecursionlimit(100000000)
sys.getrecursionlimit()
Out[56]:
In [60]:
housing_prepared = full_pipeline.fit_transform(housing)
print(housing_prepared)
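In [ ]:
# Quick check (sketch): 8 numeric attributes + 3 engineered attributes + 5 one-hot
# categories should give 16 columns per district.
print(housing_prepared.shape)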