In [3]:
# from p. 44 of O'Reilly
import os
import tarfile
from six.moves import urllib

# and the usual stuff
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy as sci
import sklearn
import pandas as pd

In [4]:
# Remote location of the dataset archive and the local directory it is stored in.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
# Full archive URL: root + relative dataset path + archive file name.
HOUSING_URL = "".join([DOWNLOAD_ROOT, HOUSING_PATH, "/housing.tgz"])

In [9]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and extract it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Local directory that receives both the downloaded ``housing.tgz``
        and its extracted contents. Created if it does not exist yet.

    Returns
    -------
    None
        The function is used for its side effects on the file system.
    """
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails.
    # NOTE(review): extractall() trusts member paths inside the downloaded
    # archive; only point housing_url at a source you trust.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)
    
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [10]:
# Download and unpack the dataset, then confirm completion.
fetch_housing_data()
# Parenthesized single-argument form prints the same text on Python 2 and 3.
print('done')


done

In [13]:
# Load the extracted CSV into a DataFrame and preview its first five rows.
housing = load_housing_data()
housing.head()


Out[13]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY

In [15]:
# Summarize the frame: number of entries, per-column non-null counts and dtypes.
housing.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB

In [16]:
# Count how many rows fall into each category of the one object-typed column.
housing["ocean_proximity"].value_counts()


Out[16]:
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
housing.describe()


Out[17]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value
count 20640.000000 20640.000000 20640.000000 20640.000000 20433.000000 20640.000000 20640.000000 20640.000000 20640.000000
mean -119.569704 35.631861 28.639486 2635.763081 537.870553 1425.476744 499.539680 3.870671 206855.816909
std 2.003532 2.135952 12.585558 2181.615252 421.385070 1132.462122 382.329753 1.899822 115395.615874
min -124.350000 32.540000 1.000000 2.000000 1.000000 3.000000 1.000000 0.499900 14999.000000
25% -121.800000 33.930000 18.000000 1447.750000 296.000000 787.000000 280.000000 2.563400 119600.000000
50% -118.490000 34.260000 29.000000 2127.000000 435.000000 1166.000000 409.000000 3.534800 179700.000000
75% -118.010000 37.710000 37.000000 3148.000000 647.000000 1725.000000 605.000000 4.743250 264725.000000
max -114.310000 41.950000 52.000000 39320.000000 6445.000000 35682.000000 6082.000000 15.000100 500001.000000

In [19]:
# Plot a 50-bin histogram for each numeric attribute of the dataset.
housing.hist(bins=50,figsize=(20,15))
# show must be *called*; a bare `plt.show` only echoes the function object.
plt.show()


Out[19]:
<function matplotlib.pyplot.show>

In [23]:
def split_train_test(data, test_ratio, seed=42):
    """Randomly split ``data`` into a training part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split; rows are selected by position.
    test_ratio : float
        Fraction of rows assigned to the test set. The test set size is
        ``int(len(data) * test_ratio)``, i.e. truncated toward zero.
    seed : int
        Seed for the shuffle, so repeated calls return the same split.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)`` in that order.
    """
    # A private generator yields the same permutation as seeding the global
    # numpy RNG would, but leaves the global random state untouched.
    rng = np.random.RandomState(seed)
    shuffled_indices = rng.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [24]:
# Hold out 20% of the rows as a test set and report both sizes.
train_set, test_set = split_train_test(housing, 0.2)
# A single formatted string prints identically on Python 2 and Python 3.
print("%d train + %d test" % (len(train_set), len(test_set)))


16512 train + 4128 test

In [25]:
import hashlib

def test_set_check(identifier, test_ratio, hash):
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio

def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    """Split ``data`` into train/test parts based on a hash of each row's id.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split.
    test_ratio : float
        Fraction of identifiers that should land in the test set.
    id_column : str
        Name of the column holding the row identifiers.
    hash : callable
        Hash constructor passed through to ``test_set_check``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)`` in that order.
    """
    ids = data[id_column]
    # Computed once; the mask decides test membership for each row.
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
    return data.loc[~in_test_set], data.loc[in_test_set]

In [26]:
# reset_index() exposes the row number as an "index" column, used here as the id.
housing_with_id = housing.reset_index()
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")

In [27]:
# Alternative identifier built from each row's coordinates instead of its row number.
housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")

In [28]:
from sklearn.model_selection import train_test_split

# Random 80/20 split via scikit-learn, with random_state fixed at 42.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [29]:
# Bucket median_income into categories of width 1.5 ...
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# ... and merge every category of 5 or more into category 5. The result is
# assigned back explicitly: an in-place where() on a column selection only
# changes the frame if that selection happens to be a view.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)

In [31]:
from sklearn.model_selection import StratifiedShuffleSplit

# One 80/20 split stratified on income_cat.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields positional row indices, so rows are selected with iloc;
# loc would only agree while the frame keeps its default 0..n-1 index.
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

In [32]:
# Share of rows in each income category over the full dataset.
housing["income_cat"].value_counts() / len(housing)


Out[32]:
3.0    0.350581
2.0    0.318847
4.0    0.176308
5.0    0.114438
1.0    0.039826
Name: income_cat, dtype: float64

In [33]:
# Drop the income_cat attribute so both splits are back to the original columns.
# The loop variable is not called `set`, which would shadow the builtin for
# every later cell run in this kernel.
for split_set in (strat_train_set, strat_test_set):
    split_set.drop(["income_cat"], axis=1, inplace=True)

In [34]:
# From here on `housing` names a copy of the stratified training set, not the full dataset.
housing = strat_train_set.copy()
# Scatter plot of the rows by longitude and latitude.
housing.plot(kind="scatter",x="longitude",y="latitude")


Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x10e6d0690>

In [35]:
# Same scatter with low alpha, so overlapping points show up as darker regions.
housing.plot(kind="scatter", x="longitude",y="latitude",alpha=0.1)


Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x1124a1ed0>

In [36]:
# Marker size scales with population (divided by 100); color encodes
# median_house_value using the "jet" colormap, with a colorbar.
housing.plot(kind="scatter", x="longitude",y="latitude",alpha=0.4,
            s=housing["population"]/100, label="population",
            c="median_house_value",cmap=plt.get_cmap("jet"), colorbar=True,)
plt.legend()


Out[36]:
<matplotlib.legend.Legend at 0x1124e68d0>

In [38]:
# Correlate the numeric attributes only: the frame also holds the object-typed
# ocean_proximity column, which newer pandas versions refuse to skip silently.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
# Rank every attribute by its correlation with the target, strongest first.
corr_matrix["median_house_value"].sort_values(ascending=False)


Out[38]:
median_house_value    1.000000
median_income         0.687160
total_rooms           0.135097
housing_median_age    0.114110
households            0.064506
total_bedrooms        0.047689
population           -0.026920
longitude            -0.047432
latitude             -0.142724
Name: median_house_value, dtype: float64

In [40]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of the target and the three attributes with the highest positive correlation to it.
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12,8))


Out[40]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x110b015d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1125bc3d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11163f450>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10ee3b110>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x10f163450>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10e686d10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10f009750>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10efc01d0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x10ef3da90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10efe8fd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10ebfd290>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10eb8b110>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x10ef71dd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10ee56d50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11018b510>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x112e09550>]], dtype=object)

In [41]:
# Zoom in on median_income, the attribute most correlated with median_house_value.
housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)


Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0x110abf910>

In [ ]: