In [1]:
import os 
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT="https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH="datasets/housing"
HOUSING_URL=DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"

def fetch_housing_data(housing_url = HOUSING_URL, housing_path = HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    print("Downloaded file.")
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path = housing_path)
    print("Extracted archive.")
    housing_tgz.close()
    print("Done.")

fetch_housing_data()


Downloaded file.
Extracted archive.
Done.

In [2]:
import pandas as pd

def load_housing_data(housing_path = HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

housing = load_housing_data()
housing.head()


Out[2]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY

In [3]:
housing.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB

In [4]:
housing["ocean_proximity"].head()


Out[4]:
0    NEAR BAY
1    NEAR BAY
2    NEAR BAY
3    NEAR BAY
4    NEAR BAY
Name: ocean_proximity, dtype: object

In [5]:
housing["ocean_proximity"].value_counts()


Out[5]:
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [6]:
housing.describe()


Out[6]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value
count 20640.000000 20640.000000 20640.000000 20640.000000 20433.000000 20640.000000 20640.000000 20640.000000 20640.000000
mean -119.569704 35.631861 28.639486 2635.763081 537.870553 1425.476744 499.539680 3.870671 206855.816909
std 2.003532 2.135952 12.585558 2181.615252 421.385070 1132.462122 382.329753 1.899822 115395.615874
min -124.350000 32.540000 1.000000 2.000000 1.000000 3.000000 1.000000 0.499900 14999.000000
25% -121.800000 33.930000 18.000000 1447.750000 296.000000 787.000000 280.000000 2.563400 119600.000000
50% -118.490000 34.260000 29.000000 2127.000000 435.000000 1166.000000 409.000000 3.534800 179700.000000
75% -118.010000 37.710000 37.000000 3148.000000 647.000000 1725.000000 605.000000 4.743250 264725.000000
max -114.310000 41.950000 52.000000 39320.000000 6445.000000 35682.000000 6082.000000 15.000100 500001.000000

In [7]:
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
plt.show()



In [8]:
import hashlib
import numpy as np # erratum, was missing

def test_set_check(identifier, test_ratio, hash):
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio

def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
    return data.loc[~in_test_set], data.loc[in_test_set]

# Using this approach, we need to make sure that new data
# gets added at the end of the dataset!
housing_with_id = housing.reset_index()    # add id column
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")

print("Train set:")
train_set.info()

print("Test set:")
test_set.info()


Train set:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 16362 entries, 0 to 20639
Data columns (total 11 columns):
index                 16362 non-null int64
longitude             16362 non-null float64
latitude              16362 non-null float64
housing_median_age    16362 non-null float64
total_rooms           16362 non-null float64
total_bedrooms        16195 non-null float64
population            16362 non-null float64
households            16362 non-null float64
median_income         16362 non-null float64
median_house_value    16362 non-null float64
ocean_proximity       16362 non-null object
dtypes: float64(9), int64(1), object(1)
memory usage: 1.5+ MB
Test set:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4278 entries, 4 to 20637
Data columns (total 11 columns):
index                 4278 non-null int64
longitude             4278 non-null float64
latitude              4278 non-null float64
housing_median_age    4278 non-null float64
total_rooms           4278 non-null float64
total_bedrooms        4238 non-null float64
population            4278 non-null float64
households            4278 non-null float64
median_income         4278 non-null float64
median_house_value    4278 non-null float64
ocean_proximity       4278 non-null object
dtypes: float64(9), int64(1), object(1)
memory usage: 401.1+ KB

In [9]:
# Create income categories to be able to use stratified sampling 
# across data (avoid sampling bias).
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)

housing["income_cat"].hist(bins=5, figsize=(8,8))
plt.show()



In [10]:
# Create stratified test / train set split using income categories
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
    
housing["income_cat"].value_counts() / len(housing)


Out[10]:
3.0    0.350581
2.0    0.318847
4.0    0.176308
5.0    0.114438
1.0    0.039826
Name: income_cat, dtype: float64

In [11]:
# Remove temporary income category from sets
for set in (strat_test_set, strat_train_set):
    set.drop(["income_cat"], axis=1, inplace=True)

In [12]:
# Create working set and visualize
%matplotlib inline
housing = strat_train_set.copy()
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1, figsize=(12,10))


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f993cf4e940>

In [13]:
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4, 
            s=housing["population"]/100, label="population",
            c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
            figsize=(12,10))
plt.legend()


Out[13]:
<matplotlib.legend.Legend at 0x7f993ca39b38>

In [14]:
# Check correlation between columns
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)


Out[14]:
median_house_value    1.000000
median_income         0.687160
total_rooms           0.135097
housing_median_age    0.114110
households            0.064506
total_bedrooms        0.047689
population           -0.026920
longitude            -0.047432
latitude             -0.142724
Name: median_house_value, dtype: float64

In [15]:
# Create scatter matrix to visually check for correlations
from pandas.tools.plotting import scatter_matrix

attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))


Out[15]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f993ce38c50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f993cad5d68>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f993a189a20>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f993caebcc0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f993cc59518>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f993a1ca9e8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f993a0c7390>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f993a17a908>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f993cd977b8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f993a0c1e10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f993a206898>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f993cae29b0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f993ca1e908>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f993ce032e8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f993cc442e8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f993cef2240>]], dtype=object)

In [16]:
housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1, figsize=(8,6))


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f993cef7f98>

In [17]:
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]

corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)


Out[17]:
median_house_value          1.000000
median_income               0.687160
rooms_per_household         0.146285
total_rooms                 0.135097
housing_median_age          0.114110
households                  0.064506
total_bedrooms              0.047689
population_per_household   -0.021985
population                 -0.026920
longitude                  -0.047432
latitude                   -0.142724
bedrooms_per_room          -0.259984
Name: median_house_value, dtype: float64

In [18]:
# reset data
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

# Now handle n/a values using one of the following:

#housing.dropna(subset=["total_bedrooms"]) # 1) get rid of districts
#housing.drop(["total_bedrooms"], axis=2)  # 2) get rid of whole attribute
#median = housing["total_bedrooms"].median()
#housing["total_bedrooms"].fillna(median)  # 3) fill with median value

# This is 3) using sklearn's library
from sklearn.preprocessing import Imputer

housing_num = housing.drop("ocean_proximity", axis=1)
imputer = Imputer(strategy="median")
imputer.fit(housing_num)
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns)
housing_tr.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16512 entries, 0 to 16511
Data columns (total 8 columns):
longitude             16512 non-null float64
latitude              16512 non-null float64
housing_median_age    16512 non-null float64
total_rooms           16512 non-null float64
total_bedrooms        16512 non-null float64
population            16512 non-null float64
households            16512 non-null float64
median_income         16512 non-null float64
dtypes: float64(8)
memory usage: 1.0 MB

In [19]:
# Use one-hot encoding got near ocean attribute
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()

housing_cat = housing["ocean_proximity"]
housing_cat_1hot = encoder.fit_transform(housing_cat)

In [20]:
# Custom class for adding further attributes
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        rooms_per_household = X[:, population_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
        
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)
housing_extra_attribs


Out[20]:
array([[-121.89, 37.29, 38.0, ..., '<1H OCEAN', 2.094395280235988,
        2.094395280235988],
       [-121.93, 37.05, 14.0, ..., '<1H OCEAN', 2.7079646017699117,
        2.7079646017699117],
       [-117.2, 32.77, 31.0, ..., 'NEAR OCEAN', 2.0259740259740258,
        2.0259740259740258],
       ..., 
       [-116.4, 34.09, 9.0, ..., 'INLAND', 2.742483660130719,
        2.742483660130719],
       [-118.01, 33.82, 31.0, ..., '<1H OCEAN', 3.808988764044944,
        3.808988764044944],
       [-122.45, 37.77, 52.0, ..., 'NEAR BAY', 1.9859154929577465,
        1.9859154929577465]], dtype=object)

In [21]:
# Creating a full pipeline with all transformations so far

# 1) Helper class to extract only given attributes
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X[self.attribute_names].values

# 2) Define pipeline with interim steps
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([
    ("selector", DataFrameSelector(num_attribs)),  # select only numeric attributes
    ("imputer", Imputer(strategy="median")),       # fill n/a values with median
    ("attribs_added", CombinedAttributesAdder()),  # add derived attributes
    ("std_scaler", StandardScaler()),              # scale values to [-1,1]
])

cat_pipeline = Pipeline([
    ("selector", DataFrameSelector(cat_attribs)),  # select only category attributes
    ("label_binarizer", LabelBinarizer()),         # convert to 0/1 flags in columns
])

full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared.shape


Out[21]:
(16512, 16)

In [22]:
# trying linear regression as first example
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)  # NO fit()!
print("Predictions:\t", lin_reg.predict(some_data_prepared))
print("Labels:\t\t", list(some_labels))

# calculate MRSE
from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_mrse = np.sqrt(lin_mse)
print("MRSE (linear): ", lin_mrse, " (calculated across all, not only sample above)")


Predictions:	 [ 210563.33661755  318260.76438827  211215.14630977   59114.79474222
  187121.77956263]
Labels:		 [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]
MRSE (linear):  68826.635851  (calculated across all, not only sample above)

In [23]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_mrse = np.sqrt(tree_mse)
print("MRSE (tree): ", tree_mrse, " (obviously overfitting)")


MRSE (tree):  0.0  (obviously overfitting)

In [24]:
from sklearn.model_selection import cross_val_score

tree_scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-tree_scores)

def display_scores(scores):
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Standard deviation: ", scores.std())

print("Decision tree: ")
display_scores(tree_rmse_scores)

lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)

print("Linear regression: ")
display_scores(lin_rmse_scores)


Decision tree: 
Scores:  [ 67251.23576485  68962.89751612  70025.62356522  69237.89254929
  71345.24640804  74707.79328983  71476.10527552  70963.85556754
  74607.66564682  70861.96266544]
Mean:  70944.0278249
Standard deviation:  2221.6061222
Linear regression: 
Scores:  [ 66899.81050358  67175.37517063  70150.13267841  74996.34376617
  67805.64981263  71294.59349397  65184.94156317  68217.48595344
  72578.14406325  67736.06602734]
Mean:  69203.8543033
Standard deviation:  2843.63683451

In [25]:
# Next, implement ForestRegressor prediction like lin_reg and tree above! p.70/71
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)

housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_mrse = np.sqrt(forest_mse)
print("MRSE (forest): ", forest_mrse)


forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)

print("Forest scores: ")
display_scores(forest_rmse_scores)


MRSE (forest):  22059.327924
Forest scores: 
Scores:  [ 52798.34263259  51313.08784562  54801.01660897  55602.06345358
  51978.53744617  55468.39400497  51691.72285024  49944.39222288
  55354.64610866  54063.8221069 ]
Mean:  53301.6025281
Standard deviation:  1919.6835931

In [26]:
# If you want to save the learned model (p.71), use pickle or joblib
from sklearn.externals import joblib

# save via
#joblib.dump(lin_reg, "lin_reg.pkl")

# restore it via
#lin_reg_loaded = joblib.load("lin_reg.pkl")

In [27]:
# (p. 72) Using grid-search to fine-tune parameters for model
from sklearn.model_selection import GridSearchCV

param_grid = [
    { 'n_estimators' : [3, 10, 30], 'max_features' : [2, 4, 6, 8] },
    { 'bootstrap' : [False], 'n_estimators' : [3, 10], 'max_features' : [2, 3, 4] }
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)

print("Grid search, best params:\n--------------", grid_search.best_params_)

print("Grid search, best estimator:\n--------------", grid_search.best_estimator_)

print("Grid search, cross-evaluation scores:\n--------------")
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

#joblib.dump(grid_search, "grid_search.pkl")


Grid search, best params:
-------------- {'n_estimators': 30, 'max_features': 6}
Grid search, best estimator:
-------------- RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=6, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
Grid search, cross-evaluation scores:
--------------
64609.5256662 {'n_estimators': 3, 'max_features': 2}
55755.0244917 {'n_estimators': 10, 'max_features': 2}
52871.9229124 {'n_estimators': 30, 'max_features': 2}
60622.0711223 {'n_estimators': 3, 'max_features': 4}
53379.0294788 {'n_estimators': 10, 'max_features': 4}
51258.1336416 {'n_estimators': 30, 'max_features': 4}
60711.2568896 {'n_estimators': 3, 'max_features': 6}
52593.5752989 {'n_estimators': 10, 'max_features': 6}
50541.6639541 {'n_estimators': 30, 'max_features': 6}
59101.0265536 {'n_estimators': 3, 'max_features': 8}
52280.167475 {'n_estimators': 10, 'max_features': 8}
50964.2834679 {'n_estimators': 30, 'max_features': 8}
62506.5504757 {'n_estimators': 3, 'max_features': 2, 'bootstrap': False}
54585.0342877 {'n_estimators': 10, 'max_features': 2, 'bootstrap': False}
60755.0352464 {'n_estimators': 3, 'max_features': 3, 'bootstrap': False}
53451.3104212 {'n_estimators': 10, 'max_features': 3, 'bootstrap': False}
58672.9636205 {'n_estimators': 3, 'max_features': 4, 'bootstrap': False}
52613.8694748 {'n_estimators': 10, 'max_features': 4, 'bootstrap': False}
Out[27]:
['grid_search.pkl']

In [28]:
# TODO: should fine tune parameters for model or use RandomizedSearchCV instead of Grid~

# (p. 74) analyze best model features, here: median_income + INLAND + others
feature_importances = grid_search.best_estimator_.feature_importances_
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_one_hot_attribs = list(encoder.classes_)
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)


Out[28]:
[(0.37316533281442332, 'median_income'),
 (0.15028020126424085, 'INLAND'),
 (0.080748613803678912, 'bedrooms_per_room'),
 (0.073523054745776223, 'pop_per_hhold'),
 (0.068061355901989959, 'longitude'),
 (0.065176993076601686, 'rooms_per_hhold'),
 (0.061658555764659638, 'latitude'),
 (0.042698438685555319, 'housing_median_age'),
 (0.018215667598756827, 'total_rooms'),
 (0.017377384479193327, 'population'),
 (0.017247102687009894, 'total_bedrooms'),
 (0.01667986510035473, 'households'),
 (0.0084198670234885333, '<1H OCEAN'),
 (0.0039352009852568351, 'NEAR OCEAN'),
 (0.0027057199869508696, 'NEAR BAY'),
 (0.00010664608206306942, 'ISLAND')]

In [29]:
# TODO: Should drop less relevant features from model here.

# (p. 75) After having fine-tuned the model etc, evaluate on the test set
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

X_test_prepared = full_pipeline.transform(X_test)  # DO NOT fit()!

final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

print("Final RMSE against test set: ", final_rmse)


Final RMSE against test set:  49145.6096665

In [30]:
print("Done! Next: cleanup, consolidate, automate, vary, try out on test set from other source.")


Done! Next: cleanup, consolidate, automate, vary, try out on test set from other source.

In [ ]: