The analysis requires a Python installation with scikit-learn, pandas, and matplotlib for full functionality. I highly recommend the Anaconda distribution for installing the dependencies. The exact versions of the major software used in my analysis are:
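If you want to record the versions in your own environment, the following unnumbered cell is a minimal sketch; its output will of course depend on your installation.
In [ ]:
# Sketch: print the interpreter and library versions in use.
import sys
import numpy, pandas, matplotlib, sklearn
print(sys.version)
for module in (numpy, pandas, matplotlib, sklearn):
    print(module.__name__, module.__version__)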
In [20]:
%matplotlib inline
In [21]:
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.svm import SVC, LinearSVC
from sklearn.decomposition import PCA
from sklearn.cross_validation import cross_val_score
from sklearn.learning_curve import learning_curve
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_squared_error
In [22]:
# load the CSV into pandas as a DataFrame
df = pd.read_csv('missouri-normal.csv')
# or
# df = pd.read_csv('seattle.csv')
In [23]:
# Subsample to get about 15000 entries
df = df[::int(len(df)/15000)]
In [24]:
# Some basic information about the dataset
# Note that almost all the features appear to be normal distributions
# centered on 0 with standard deviation approximately 1.
# However, there is a *lot* of missing data.
# There probably isn't a need to run through a scaler, but it doesn't do any harm.
df.describe()
Out[24]:
In [25]:
# Drop the unnamed index column picked up from the CSV
del df['Unnamed: 0']
df.columns
Out[25]:
In [26]:
for column in df.columns:
    print(column, df[column].dtype, len(pd.unique(df[column])))
In [27]:
# The 'target' column contains new construction vs. renovation, which is the target of the model
target = df['target']
del df['target']
if 'type' in df.columns:
    del df['type']
In [28]:
dfd = pd.get_dummies(df)
In [29]:
# Evaluate feasibility of extracting a subset of important features
# Looks like a good candidate for k-best feature selection
dfi = Imputer().fit_transform(dfd)
f, p = f_regression(dfi, target)
plt.plot(f, 'bx')
plt.gca().set_yscale('log')
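To make the plot easier to interpret, the optional sketch below ranks the features by their f_regression score (assuming the imputer did not drop any columns, so the scores line up with dfd.columns).
In [ ]:
# Optional sketch: rank features by f_regression score (highest first).
scores = pd.Series(f, index=dfd.columns).sort_values(ascending=False)
print(scores.head(10))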
In [30]:
# Evaluate feasibility of extracting a subset of important components
# Energy seems to be pretty evenly distributed among principal components
# Not a good candidate for extracting principal components
pca = PCA()
pca.fit(dfi)
plt.plot(pca.explained_variance_ratio_, 'bx')
Out[30]:
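As an optional check on the "evenly distributed" observation, the cumulative explained variance can be plotted as well; this is just a sketch using the pca object fitted above.
In [ ]:
# Optional sketch: cumulative explained variance of the principal components.
plt.plot(np.cumsum(pca.explained_variance_ratio_), 'b-')
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance ratio')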
In [31]:
# We can now conduct a grid search over the hyperparameters of several models
# to choose a tuned model. The None placeholders below are stand-ins that the
# grid search overrides via param_grid before fitting.
imputer = Imputer(strategy=None)
scaler = StandardScaler()
kbest = SelectKBest(f_regression, k=None)
rfr = RandomForestRegressor(n_estimators=None)
In [32]:
# Helper function to tune the pipeline and print out the tuned parameters
def tune_pipeline(fullname, shortname, model, params):
    pipeline = Pipeline([('imputer', imputer), ('scaler', scaler), ('kbest', kbest), (shortname, model)])
    grid_search = GridSearchCV(pipeline, param_grid=params, scoring='mean_squared_error')
    grid_search.fit(dfd, target)
    print("Results for {}".format(fullname))
    print(grid_search.best_params_)
    print(grid_search.best_score_)
    return grid_search
In [33]:
# start with a simple random forest as we try to determine the ideal imputer and number of features to select.
# Let's try to pick an imputer strategy first.
# Note that I hand-picked a few seed values here based on eyeballing earlier plots.
tuned = tune_pipeline('Random Forest Regressor', 'rfr', rfr,
                      params=dict(imputer__strategy=['mean', 'median', 'most_frequent'],
                                  kbest__k=[9],
                                  rfr__n_estimators=[9]))
In [34]:
# Okay, now let's try to determine the right number of features to select.
# We know approximately how many we want from the f_regression scores
tuned = tune_pipeline('Random Forest Regressor', 'rfr', rfr,
                      params=dict(imputer__strategy=[tuned.best_params_['imputer__strategy']],
                                  kbest__k=[1, 2, 3, 4, 5, 6, 7, 8, 9],
                                  rfr__n_estimators=[9]))
In [35]:
# Final comparison: learning curves for the SVC models and a more aggressive RFR
# This function is adapted from: http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
def plot_learning_curve(estimator, X, y, title="Learning Curve", ylim=None, cv=None, n_jobs=1):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    title : string
        Title for the chart.
    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum y-values plotted.
    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds (defaults to 3).
        Specific cross-validation objects can be passed, see
        sklearn.cross_validation module for the list of possible objects.
    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
In [36]:
imputer_prod = Imputer(strategy=tuned.best_params_['imputer__strategy'])
scaler_prod = StandardScaler()
kbest_prod = SelectKBest(f_regression, tuned.best_params_['kbest__k'])
rfr_prod = RandomForestRegressor(n_estimators=100)
svc_prod = LinearSVC()
svcr_prod = SVC(kernel='rbf')
In [37]:
prod_rfr = Pipeline([('imputer', imputer_prod), ('scaler', scaler_prod),
                     ('kbest', kbest_prod), ('rfr', rfr_prod)])
plot_learning_curve(prod_rfr, dfd, target)
In [38]:
prod_svc = Pipeline([('imputer', imputer_prod), ('scaler', scaler_prod),
                     ('kbest', kbest_prod), ('svc', svc_prod)])
plot_learning_curve(prod_svc, dfd, target)
In [39]:
prod_svcr = Pipeline([('imputer', imputer_prod), ('scaler', scaler_prod),
                      ('kbest', kbest_prod), ('svcr', svcr_prod)])
plot_learning_curve(prod_svcr, dfd, target)
In [40]:
# without increasing C, the MSE of the cross-validated fit to the training data is around 4
cross_val_score(prod_svc, dfd, target)
Out[40]:
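As a rough sketch of the effect of increasing the penalty parameter, one can clone the pipeline, override C, and re-score; the value C=10 below is purely illustrative, not a tuned result.
In [ ]:
# Sketch: re-score the linear SVC pipeline with a larger, illustrative C.
from sklearn.base import clone
svc_c10 = clone(prod_svc).set_params(svc__C=10)
print(cross_val_score(svc_c10, dfd, target))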
In [41]:
# This is the best pipeline we've found,
# * median or mean imputer (this makes sense, the median/mean of a normal distribution are the same),
# * standard scaler,
# * threshold to the 16 best features
# * Support Vector Regression using a radial-basis function and C=10
prod_svc.fit(dfd, target)
Out[41]:
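For reference, the pipeline described in the comment above can be written out as follows (a sketch only; SVR from sklearn.svm is assumed, and this is not the prod_svc object fitted in this cell).
In [ ]:
# Sketch of the pipeline described in the comment above (not the fitted prod_svc):
# median imputation, standard scaling, 16 best features, RBF SVR with C=10.
from sklearn.svm import SVR
described_svr = Pipeline([('imputer', Imputer(strategy='median')),
                          ('scaler', StandardScaler()),
                          ('kbest', SelectKBest(f_regression, k=16)),
                          ('svr', SVR(kernel='rbf', C=10))])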
In [42]:
# Let's review the selected features
selected_features = prod_svc.named_steps['kbest'].get_support()
print(dfd.columns[selected_features])
In [43]:
# The best pipeline fits its training data with an MSE of 1.33
# Assuming that the withheld data is a non-pathological sample,
# the MSE of the fit to the withheld data should be between 1.33 and 3.2
mean_squared_error(target, prod_svc.predict(dfd))
Out[43]:
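Since the number above is a training-set MSE, an explicit hold-out split gives a rougher but more honest check; the sketch below uses an arbitrary 25% test split.
In [ ]:
# Sketch: estimate generalization MSE on an explicit held-out split
# (test_size and random_state are arbitrary choices here).
from sklearn.base import clone
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dfd, target, test_size=0.25, random_state=0)
heldout_model = clone(prod_svc)
heldout_model.fit(X_train, y_train)
print(mean_squared_error(y_test, heldout_model.predict(X_test)))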
In [45]:
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
# create a mesh to plot in
x_min, x_max = dfd['year'].min() - 1, dfd['year'].max() + 1
y_min, y_max = dfd['permit_value'].min() - 1, dfd['permit_value'].max() + 1
print(x_min, x_max)
print(y_min, y_max)
In [59]:
svc_prod = LinearSVC()
svc_prod.fit(dfd[["permit_value", "year"]], target)
Out[59]:
In [60]:
xx, yy = np.meshgrid(np.arange(x_min, x_max, 1),
                     np.arange(y_min, y_max, 50))
# The model was fit on (permit_value, year), so order the mesh columns the same way.
Z = svc_prod.predict(np.c_[yy.ravel(), xx.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
plt.colorbar()
Out[60]: