In [3]:
# from p. 44 of the O'Reilly book (Géron, Hands-On Machine Learning)
import os
import tarfile
from six.moves import urllib
# and the usual stuff
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy as sci
import sklearn
import pandas as pd
In [4]:
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"
In [9]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
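If you only target Python 3, the six shim above isn't needed; here is a minimal sketch using the standard library directly (the function name is mine, the constants are the ones defined above):
In [ ]:
# Python 3 only variant of fetch_housing_data (sketch)
import urllib.request

def fetch_housing_data_py3(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)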
In [10]:
fetch_housing_data()
print('done')
In [13]:
housing = load_housing_data()
housing.head()
Out[13]:
In [15]:
housing.info()
In [16]:
housing["ocean_proximity"].value_counts()
Out[16]:
In [17]:
housing.describe()
Out[17]:
In [19]:
housing.hist(bins=50, figsize=(20, 15))
plt.show()
Out[19]:
In [23]:
def split_train_test(data, test_ratio, seed=42):
    np.random.seed(seed)
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
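A caveat: this split is only stable while the seed and the row order stay fixed, so fetching an updated dataset reshuffles everything. A quick sketch of the seed sensitivity:
In [ ]:
# different seeds (or a refreshed dataset) give different test sets,
# so rows can leak from test into train across runs
_, test_a = split_train_test(housing, 0.2, seed=42)
_, test_b = split_train_test(housing, 0.2, seed=43)
print(len(set(test_a.index) & set(test_b.index)), "rows shared between the two test sets")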
In [24]:
train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), "train +", len(test_set), "test")
In [25]:
import hashlib

def test_set_check(identifier, test_ratio, hash):
    # keep a row in the test set when the last byte of its id's hash falls
    # in the lowest test_ratio fraction of the 256 possible byte values
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio

def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
    return data.loc[~in_test_set], data.loc[in_test_set]
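A cryptographic hash is overkill here; an alternative sketch of the same idea using zlib.crc32 (a 32-bit hash space instead of one byte; function names are mine):
In [ ]:
from zlib import crc32

def test_set_check_crc32(identifier, test_ratio):
    # crc32 hashes the 8 bytes of the int64 id; keep the row in the test
    # set when the hash falls in the lowest test_ratio fraction of 2**32
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id_crc32(data, test_ratio, id_column):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check_crc32(id_, test_ratio))
    return data.loc[~in_test_set], data.loc[in_test_set]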
In [26]:
housing_with_id = housing.reset_index()
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")
In [27]:
housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")
In [28]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
In [29]:
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)
In [31]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
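For reference, train_test_split (imported earlier) can do the same stratified split directly via its stratify parameter; a one-cell sketch:
In [ ]:
# equivalent stratified split in a single call
strat_train_set, strat_test_set = train_test_split(
    housing, test_size=0.2, stratify=housing["income_cat"], random_state=42)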
In [32]:
housing["income_cat"].value_counts() / len(housing)
Out[32]:
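To see what stratification buys, compare the category proportions in the stratified test set against the full dataset; a small sketch (income_cat must still be present, so run this before the drop below):
In [ ]:
def income_cat_proportions(data):
    return data["income_cat"].value_counts() / len(data)

pd.DataFrame({
    "overall": income_cat_proportions(housing),
    "stratified test": income_cat_proportions(strat_test_set),
}).sort_index()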
In [33]:
# drop the income_cat attribute so the data is back to its original state
# (set_ rather than set, to avoid shadowing the built-in)
for set_ in (strat_train_set, strat_test_set):
    set_.drop(["income_cat"], axis=1, inplace=True)
In [34]:
housing = strat_train_set.copy()
housing.plot(kind="scatter",x="longitude",y="latitude")
Out[34]:
In [35]:
housing.plot(kind="scatter", x="longitude",y="latitude",alpha=0.1)
Out[35]:
In [36]:
housing.plot(kind="scatter", x="longitude",y="latitude",alpha=0.4,
s=housing["population"]/100, label="population",
c="median_house_value",cmap=plt.get_cmap("jet"), colorbar=True,)
plt.legend()
Out[36]:
In [38]:
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
Out[38]:
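Note that on pandas 2.0 and later, DataFrame.corr() raises on the non-numeric ocean_proximity column; passing numeric_only=True restores the old behavior:
In [ ]:
# needed on newer pandas, where corr() no longer silently drops non-numeric columns
corr_matrix = housing.corr(numeric_only=True)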
In [40]:
from pandas.plotting import scatter_matrix
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12,8))
Out[40]:
In [41]:
housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)
Out[41]: