In [1]:
from __future__ import print_function

from sklearn import __version__ as sklearn_version
print('Sklearn version:', sklearn_version)


Sklearn version: 0.18.1

Load data

Predict the California average house value


In [2]:
from sklearn import datasets

all_data = datasets.fetch_california_housing()

# Describe dataset
print(all_data.DESCR)
print(all_data.feature_names)


California housing dataset.

The original database is available from StatLib

    http://lib.stat.cmu.edu/

The data contains 20,640 observations on 9 variables.

This dataset contains the average house value as target variable
and the following input variables (features): average income,
housing average age, average rooms, average bedrooms, population,
average occupation, latitude, and longitude in that order.

References
----------

Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,
Statistics and Probability Letters, 33 (1997) 291-297.


['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']

In [3]:
# Print some data lines
print(all_data.data[:10])
print(all_data.target)


[[  8.32520000e+00   4.10000000e+01   6.98412698e+00   1.02380952e+00
    3.22000000e+02   2.55555556e+00   3.78800000e+01  -1.22230000e+02]
 [  8.30140000e+00   2.10000000e+01   6.23813708e+00   9.71880492e-01
    2.40100000e+03   2.10984183e+00   3.78600000e+01  -1.22220000e+02]
 [  7.25740000e+00   5.20000000e+01   8.28813559e+00   1.07344633e+00
    4.96000000e+02   2.80225989e+00   3.78500000e+01  -1.22240000e+02]
 [  5.64310000e+00   5.20000000e+01   5.81735160e+00   1.07305936e+00
    5.58000000e+02   2.54794521e+00   3.78500000e+01  -1.22250000e+02]
 [  3.84620000e+00   5.20000000e+01   6.28185328e+00   1.08108108e+00
    5.65000000e+02   2.18146718e+00   3.78500000e+01  -1.22250000e+02]
 [  4.03680000e+00   5.20000000e+01   4.76165803e+00   1.10362694e+00
    4.13000000e+02   2.13989637e+00   3.78500000e+01  -1.22250000e+02]
 [  3.65910000e+00   5.20000000e+01   4.93190661e+00   9.51361868e-01
    1.09400000e+03   2.12840467e+00   3.78400000e+01  -1.22250000e+02]
 [  3.12000000e+00   5.20000000e+01   4.79752705e+00   1.06182380e+00
    1.15700000e+03   1.78825348e+00   3.78400000e+01  -1.22250000e+02]
 [  2.08040000e+00   4.20000000e+01   4.29411765e+00   1.11764706e+00
    1.20600000e+03   2.02689076e+00   3.78400000e+01  -1.22260000e+02]
 [  3.69120000e+00   5.20000000e+01   4.97058824e+00   9.90196078e-01
    1.55100000e+03   2.17226891e+00   3.78400000e+01  -1.22250000e+02]]
[ 4.526  3.585  3.521 ...,  0.923  0.847  0.894]

In [4]:
# Shuffle, normalize, and split into train & test sets

from sklearn.utils import shuffle
X, y = shuffle(all_data.data, all_data.target, random_state=42)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Normalize the data (note: Normalizer rescales each sample to unit norm, not each feature)
from sklearn.preprocessing import Normalizer
normal = Normalizer()
X_train = normal.fit_transform(X_train)
X_test = normal.transform(X_test)
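
Note that Normalizer rescales each sample (row) to unit norm; it does not standardize the features. For linear models on this dataset, per-feature standardization is the more common choice. A minimal alternative sketch with StandardScaler (not what this notebook uses):

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)  # zero mean, unit variance per feature
X_test_std = scaler.transform(X_test)        # reuse the training-set statistics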

Model following the cheat-sheet recommendation


In [5]:
from sklearn import linear_model

reg = linear_model.Ridge()  # default regularization strength, alpha=1.0
reg.fit(X_train, y_train)

# Evaluate
from sklearn.metrics import mean_absolute_error

y_test_predict = reg.predict(X_test)
print('Mean absolute error ', mean_absolute_error(y_test, y_test_predict))
print('Variance score: ', reg.score(X_test, y_test))  # score() returns R^2


Mean absolute error  0.886655025074
Variance score:  0.0685566069894

In [6]:
# Scatter plot of actual vs. predicted values
import matplotlib.pyplot as plt
%matplotlib inline

plt.scatter(y_test, y_test_predict)


Out[6]:
<matplotlib.collections.PathCollection at 0x7effa5fdd550>
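
The scatter plot is easier to read with axis labels and a perfect-prediction reference line; a hedged variant (labels are illustrative, not from the original):

plt.scatter(y_test, y_test_predict, alpha=0.3)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')  # y = x reference
plt.xlabel('Actual house value')
plt.ylabel('Predicted house value')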

In [7]:
# Save model
from sklearn.externals import joblib

joblib.dump(reg, '/tmp/reg_model.pkl')


Out[7]:
['/tmp/reg_model.pkl']

In [8]:
# Load model
reg_loaded = joblib.load('/tmp/reg_model.pkl')
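
In scikit-learn releases after 0.20, sklearn.externals.joblib is deprecated (and later removed); the standalone joblib package offers the same dump/load API:

import joblib  # standalone package, drop-in replacement for sklearn.externals.joblib
reg_loaded = joblib.load('/tmp/reg_model.pkl')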

In [9]:
# View the coefficients
print('Coefficients:', reg_loaded.coef_)
print('Intercept: ', reg_loaded.intercept_ )


Coefficients: [ 21.64332905   4.39536291   3.13367919  -1.16447227   0.83220545
  -3.2404152   -6.26577343  -0.54926884]
Intercept:  1.17978496107

Improve the model's parametrization


In [10]:
# Use RidgeCV to select the best alpha via cross-validation
reg = linear_model.RidgeCV(alphas=[.1, 1., 10.])
reg.fit(X_train, y_train)

print('Best alpha: ', reg.alpha_)


Best alpha:  0.1
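
RidgeCV only evaluates the three alphas listed above. A finer search could be run with GridSearchCV; a sketch assuming a log-spaced grid is reasonable here:

import numpy as np
from sklearn.model_selection import GridSearchCV

param_grid = {'alpha': np.logspace(-3, 2, 20)}  # hypothetical grid, 0.001 to 100
search = GridSearchCV(linear_model.Ridge(), param_grid, cv=5)
search.fit(X_train, y_train)
print('Best alpha:', search.best_params_['alpha'])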

In [11]:
# Build a model with the recommended alpha
reg = linear_model.Ridge(alpha=0.1)
reg.fit(X_train, y_train)

y_test_predict = reg.predict(X_test)
print('Mean absolute error ', mean_absolute_error(y_test, y_test_predict))
print('Variance score: ', reg.score(X_test, y_test))

plt.scatter(y_test, y_test_predict)


Mean absolute error  0.810041467227
Variance score:  0.209383591131
Out[11]:
<matplotlib.collections.PathCollection at 0x7effa5eeee50>

Check the second cheat-sheet recommendation, a LinearSVR model


In [12]:
from sklearn import svm

reg_svr = svm.LinearSVR()
reg_svr.fit(X_train, y_train)

y_test_predict = reg_svr.predict(X_test)
print('Mean absolute error ', mean_absolute_error(y_test, y_test_predict))
print('Variance score: ', reg_svr.score(X_test, y_test))

plt.scatter(y_test, y_test_predict)


Mean absolute error  0.867453648
Variance score:  -0.00607433544938
Out[12]:
<matplotlib.collections.PathCollection at 0x7effa5e40a90>
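
When LinearSVR underperforms, the cheat sheet's next step is a kernelized SVR. A minimal sketch with the RBF kernel, hyperparameters left untuned (note: training is much slower than LinearSVR on 13k+ samples):

reg_rbf = svm.SVR(kernel='rbf')
reg_rbf.fit(X_train, y_train)
print('Variance score: ', reg_rbf.score(X_test, y_test))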

Build a decision tree regressor


In [13]:
# Basic regression tree
from sklearn import tree

dtree = tree.DecisionTreeRegressor()
dtree.fit(X_train, y_train)

y_test_predict = dtree.predict(X_test)
print('Mean absolute error ', mean_absolute_error(y_test, y_test_predict))
print('Variance score: ', dtree.score(X_test, y_test))

plt.scatter(y_test, y_test_predict)


Mean absolute error  0.657522387279
Variance score:  0.356019005722
Out[13]:
<matplotlib.collections.PathCollection at 0x7effa549af10>

In [14]:
# A second model, regularized by limiting the tree depth

dtree2 = tree.DecisionTreeRegressor(max_depth=5)
dtree2.fit(X_train, y_train)

y_test_predict = dtree2.predict(X_test)
print('Mean absolute error ', mean_absolute_error(y_test, y_test_predict))
print('Variance score: ', dtree2.score(X_test, y_test))

plt.scatter(y_test, y_test_predict)


Mean absolute error  0.626954839938
Variance score:  0.488285126735
Out[14]:
<matplotlib.collections.PathCollection at 0x7effa4266f50>
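
Which features the depth-limited tree actually relies on can be read off feature_importances_; a quick inspection:

for name, importance in zip(all_data.feature_names, dtree2.feature_importances_):
    print(name, round(importance, 3))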

In [15]:
# Plot the tree
import pydotplus

from IPython.display import Image  
dot_data = tree.export_graphviz(dtree2, out_file=None, 
                         feature_names=all_data.feature_names,  
                         filled=True, rounded=True,  
                         special_characters=True)  
graph = pydotplus.graph_from_dot_data(dot_data)  
Image(graph.create_png())


Out[15]:
[decision tree diagram rendered inline as PNG]
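
Instead of rendering inline, pydotplus can also write the image to disk (path is illustrative):

graph.write_png('/tmp/dtree2.png')  # hypothetical output path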