In [ ]:
from __future__ import print_function

import sklearn

# Report the installed scikit-learn release so results can be reproduced
sklearn_version = sklearn.__version__
print('Sklearn version:', sklearn_version)

Load data

Predict the average house value in California

In [ ]:
from sklearn import datasets

# Use the public loader exported by `sklearn.datasets`: the
# `datasets.california_housing` submodule is an implementation detail that is
# not importable in recent scikit-learn releases.
all_data = datasets.fetch_california_housing()

# Describe dataset
print(all_data.DESCR)

In [ ]:
# Print some data lines: feature names, then the first five rows and targets
print('Features:', all_data.feature_names)
print(all_data.data[:5])
print('Target:', all_data.target[:5])

In [ ]:
# Randomize, normalize and separate train & test

from sklearn.utils import shuffle
# Shuffle features and target together so that rows stay aligned
X, y = shuffle(all_data.data, all_data.target, random_state=42)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Normalize the data
from sklearn.preprocessing import Normalizer
# Define normalizer (rescales each sample, i.e. each row, to unit norm)
normalizer = Normalizer()
# Fit & transform over train
X_train = normalizer.fit_transform(X_train)
# Transform test with the normalizer fitted on train
X_test = normalizer.transform(X_test)

In [ ]:

Model with the recommendation of the cheat-sheet

- Based on the Sklearn algorithm cheat-sheet

In [ ]:
from sklearn import linear_model

# Select the correct linear model and fit it.
# Ridge regression is the linear model tuned in the following cells;
# fit() returns the fitted estimator, so the call can be chained.
reg = linear_model.Ridge().fit(X_train, y_train)

# Evaluate
from sklearn.metrics import mean_absolute_error

y_test_predict = reg.predict(X_test)
print('Mean absolute error ', mean_absolute_error(y_test, y_test_predict))
# score() of a regressor is the coefficient of determination (R^2)
print('Variance score: ', reg.score(X_test, y_test))

In [ ]:
# Plot a scaterplot real vs predict
import matplotlib.pyplot as plt
%matplotlib inline

# Plot the scatter plot real vs predict

In [ ]:
# Save model
# `sklearn.externals.joblib` is no longer shipped with recent scikit-learn
# releases; prefer the standalone package and fall back for old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

joblib.dump(reg, '/tmp/reg_model.pkl')

In [ ]:
# Restore the regressor that was persisted to disk
model_path = '/tmp/reg_model.pkl'
reg_loaded = joblib.load(model_path)

In [ ]:
# Inspect the fitted parameters of the reloaded model
coefficients, intercept = reg_loaded.coef_, reg_loaded.intercept_
print('Coeficients :', coefficients)
print('Intercept: ', intercept)

In [ ]:

Improve the model parametrization

In [ ]:
# Use the function RidgeCV to select the best alpha using cross validation

# Define the RidgeCV model. Test alpha over the values 0.1, 1 and 10
reg = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0]).fit(X_train, y_train)

# alpha_ holds the regularization strength selected by cross validation
print('Best alpha: ', reg.alpha_)

In [ ]:
# Build a model with the recommended alpha
# On a first run `reg` is the fitted RidgeCV (exposes `alpha_`); if this cell
# is re-run, `reg` is already the Ridge built here (exposes `alpha`), so the
# cell stays re-runnable.
best_alpha = reg.alpha_ if hasattr(reg, 'alpha_') else reg.alpha
reg = linear_model.Ridge(alpha=best_alpha).fit(X_train, y_train)

y_test_predict = reg.predict(X_test)
print('Mean absolute error ', mean_absolute_error(y_test, y_test_predict))
print('Variance score: ', reg.score(X_test, y_test))

plt.scatter(y_test, y_test_predict)

In [ ]:

Check the second cheat sheet recommendation

In [ ]:
from sklearn import svm

# Select the correct model and define it
# NOTE(review): the RBF kernel is assumed to be the intended "second
# recommendation" - confirm against the cheat-sheet.
reg_svr = svm.SVR(kernel='rbf').fit(X_train, y_train)

y_test_predict = reg_svr.predict(X_test)
print('Mean absolute error ', mean_absolute_error(y_test, y_test_predict))
print('Variance score: ', reg_svr.score(X_test, y_test))

plt.scatter(y_test, y_test_predict)

In [ ]:

Build a decision tree regressor

In [ ]:
# Import the regression tree function
from sklearn import tree

# Define the tree (fixed seed so tie-breaking between splits is reproducible)
dtree = tree.DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

y_test_predict = dtree.predict(X_test)
print('Mean absolute error ', mean_absolute_error(y_test, y_test_predict))
print('Variance score: ', dtree.score(X_test, y_test))

plt.scatter(y_test, y_test_predict)

In [ ]:
# A second model, regularized by controlling the depth
from sklearn import tree

# Build a second tree with a max depth of 5
dtree2 = tree.DecisionTreeRegressor(max_depth=5, random_state=42).fit(X_train, y_train)

y_test_predict = dtree2.predict(X_test)
print('Mean absolute error ', mean_absolute_error(y_test, y_test_predict))
print('Variance score: ', dtree2.score(X_test, y_test))

plt.scatter(y_test, y_test_predict)

In [ ]:
# Plot the tree
import pydotplus

from IPython.display import Image
# Export the fitted tree as Graphviz dot source (out_file=None returns a string)
dot_data = tree.export_graphviz(dtree2, out_file=None,
                         feature_names=all_data.feature_names,
                         filled=True, rounded=True,
                         special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
# Render the graph to PNG and show it as the cell output
Image(graph.create_png())

In [ ]: