In [3]:
# from p. 44 of the O'Reilly book (Géron, Hands-On Machine Learning)
import os
import tarfile
from six.moves import urllib
# and the usual stuff
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy as sci
import sklearn
import pandas as pd
In [4]:
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"
In [9]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
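If you only target Python 3, the six shim above isn't needed; here is a minimal sketch using the standard library directly (the function name is mine, the constants are the ones defined above):
In [ ]:
# Python 3 only variant of fetch_housing_data (sketch)
import urllib.request

def fetch_housing_data_py3(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)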
In [10]:
fetch_housing_data()
print('done')
In [13]:
housing = load_housing_data()
housing.head()
Out[13]:
In [15]:
housing.info()
In [16]:
housing["ocean_proximity"].value_counts()
Out[16]:
In [17]:
housing.describe()
Out[17]:
In [19]:
housing.hist(bins=50, figsize=(20, 15))
plt.show()
Out[19]:
In [23]:
def split_train_test(data, test_ratio, seed=42):
    np.random.seed(seed)
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
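A caveat: this split is only stable while the seed and the row order stay fixed, so fetching an updated dataset reshuffles everything. A quick sketch of the seed sensitivity:
In [ ]:
# different seeds (or a refreshed dataset) give different test sets,
# so rows can leak from test into train across runs
_, test_a = split_train_test(housing, 0.2, seed=42)
_, test_b = split_train_test(housing, 0.2, seed=43)
print(len(set(test_a.index) & set(test_b.index)), "rows shared between the two test sets")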
In [24]:
train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), "train +", len(test_set), "test")
In [25]:
import hashlib

def test_set_check(identifier, test_ratio, hash):
    # keep a row in the test set when the last byte of its id's hash falls
    # in the lowest test_ratio fraction of the 256 possible byte values
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio

def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
    return data.loc[~in_test_set], data.loc[in_test_set]
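A cryptographic hash is overkill here; an alternative sketch of the same idea using zlib.crc32 (a 32-bit hash space instead of one byte; function names are mine):
In [ ]:
from zlib import crc32

def test_set_check_crc32(identifier, test_ratio):
    # crc32 hashes the 8 bytes of the int64 id; keep the row in the test
    # set when the hash falls in the lowest test_ratio fraction of 2**32
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id_crc32(data, test_ratio, id_column):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check_crc32(id_, test_ratio))
    return data.loc[~in_test_set], data.loc[in_test_set]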
In [26]:
housing_with_id = housing.reset_index()
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")
In [27]:
housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")
In [28]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
In [29]:
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)
In [31]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
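For reference, train_test_split (imported earlier) can do the same stratified split directly via its stratify parameter; a one-cell sketch:
In [ ]:
# equivalent stratified split in a single call
strat_train_set, strat_test_set = train_test_split(
    housing, test_size=0.2, stratify=housing["income_cat"], random_state=42)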
In [32]:
housing["income_cat"].value_counts() / len(housing)
Out[32]:
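To see what stratification buys, compare the category proportions in the stratified test set against the full dataset; a small sketch (income_cat must still be present, so run this before the drop below):
In [ ]:
def income_cat_proportions(data):
    return data["income_cat"].value_counts() / len(data)

pd.DataFrame({
    "overall": income_cat_proportions(housing),
    "stratified test": income_cat_proportions(strat_test_set),
}).sort_index()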
In [33]:
# drop the income_cat attribute so the data is back to its original state
# (set_ rather than set, to avoid shadowing the built-in)
for set_ in (strat_train_set, strat_test_set):
    set_.drop(["income_cat"], axis=1, inplace=True)
In [34]:
housing = strat_train_set.copy()
housing.plot(kind="scatter",x="longitude",y="latitude")
Out[34]:
In [35]:
housing.plot(kind="scatter", x="longitude",y="latitude",alpha=0.1)
Out[35]:
In [36]:
housing.plot(kind="scatter", x="longitude",y="latitude",alpha=0.4,
s=housing["population"]/100, label="population",
c="median_house_value",cmap=plt.get_cmap("jet"), colorbar=True,)
plt.legend()
Out[36]:
In [38]:
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
Out[38]:
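Note that on pandas 2.0 and later, DataFrame.corr() raises on the non-numeric ocean_proximity column; passing numeric_only=True restores the old behavior:
In [ ]:
# needed on newer pandas, where corr() no longer silently drops non-numeric columns
corr_matrix = housing.corr(numeric_only=True)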
In [40]:
from pandas.plotting import scatter_matrix
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12,8))
Out[40]:
In [41]:
housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)
Out[41]: