In [ ]:
from __future__ import print_function

import sklearn

# Report the installed scikit-learn release so results can be tied to a version.
sklearn_version = sklearn.__version__
print('Sklearn version:', sklearn_version)

Load data

Predict the average (median) house value of California districts


In [ ]:
from sklearn import datasets

# Fetch through the public entry point of `sklearn.datasets`.
# The `datasets.california_housing` submodule path used previously was made
# private in scikit-learn 0.22 and is no longer importable in current
# releases, whereas `datasets.fetch_california_housing` works on both old
# and new versions.
# The first call downloads the data and caches it on disk.
all_data = datasets.fetch_california_housing()

# Describe dataset: full text description, then the feature column names
print(all_data.DESCR)
print(all_data.feature_names)

In [ ]:
# Print some data lines: the first 10 feature rows and their matching targets.
# Both arrays are sliced identically so each printed row can be paired with
# its target value (printing the whole target vector only shows an elided
# head/tail summary that does not line up with the rows above).
print(all_data.data[:10])
print(all_data.target[:10])

In [ ]:
# Randomize, separate train & test, then normalize.
# Exercise cell: the `...` placeholders below are meant to be replaced.

# Shuffle features and targets together with a fixed seed for reproducibility.
# NOTE(review): train_test_split also shuffles by default, so this explicit
# shuffle looks redundant -- confirm before removing.
from sklearn.utils import shuffle
X, y = shuffle(all_data.data, all_data.target, random_state=42)

# Hold out 33% of the samples as the test set (fixed seed again).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


# Normalize the data
# NOTE(review): Normalizer rescales each sample (row) to unit norm rather than
# scaling each feature column; if per-feature scaling is intended, a scaler
# such as StandardScaler may be the better fit -- confirm the intent.
from sklearn.preprocessing import Normalizer
# Define normalizer
...
# Fit & transform over train
...
# Transform test (reuse the object fitted on train; do not refit on test)
...

In [ ]:

Model with the recommendation of the cheat-sheet

- Based on the scikit-learn algorithm cheat-sheet


In [ ]:
from sklearn import linear_model

# Select the correct linear model and fit it
# Exercise placeholder: replace `...` with the chosen estimator from
# `linear_model` (the line is not valid Python until it is filled in).
reg = linear_model. ...
reg.fit(X_train, y_train)

# Evaluate on the held-out test split
from sklearn.metrics import mean_absolute_error

y_test_predict = reg.predict(X_test)
print('Mean absolute error ', mean_absolute_error(y_test, y_test_predict))
# NOTE(review): for scikit-learn regressors `score` is documented as the R^2
# coefficient of determination; the printed label says "Variance score" --
# confirm the wording once the estimator is chosen.
print('Variance score: ', reg.score(X_test, y_test))

In [ ]:
# Plot a scatter plot of real vs predicted values
import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Plot the scatter plot real vs predict
# Exercise placeholder: replace `...` with the plotting call.
...

In [ ]:
# Save model
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. joblib is a standalone package installed as a scikit-learn dependency,
# so import it directly and only fall back to the vendored copy on very old
# environments where the standalone package is missing.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted regressor; the load cell reads it back from this same path.
joblib.dump(reg, '/tmp/reg_model.pkl')

In [ ]:
# Load model
# Read the persisted estimator back from the path it was dumped to.
model_path = '/tmp/reg_model.pkl'
reg_loaded = joblib.load(model_path)

In [ ]:
# View the coefficients
# Print each fitted parameter of the reloaded model next to its label,
# in the same order as before: coefficients first, then the intercept.
for label, attr in (('Coeficients :', 'coef_'), ('Intercept: ', 'intercept_')):
    print(label, getattr(reg_loaded, attr))

In [ ]:

Improve the model parametrization


In [ ]:
# Use the function RidgeCV to select the best alpha using cross validation

# Define the RidgeCV model. Test alpha over the values 0.1, 1 and 10
# Exercise placeholder: replace `...` with the model definition.
# NOTE(review): the lines below read `reg`, so the placeholder presumably
# rebinds `reg` to the RidgeCV instance -- otherwise `reg.alpha_` would be
# looked up on the previously fitted model.
...
reg.fit(X_train, y_train)

# `alpha_` is read from the fitted model: the alpha selected by cross validation.
print('Best alpha: ', reg.alpha_)

In [ ]:
# Build a model with the recommended alpha
# Exercise placeholder: replace `...` with the alpha value reported by the
# RidgeCV cell (fitting fails while the placeholder is left in place).
reg = linear_model.Ridge (alpha = ...)
reg.fit(X_train, y_train)

# Evaluate on the held-out test split with the same metrics as the first model.
y_test_predict = reg.predict(X_test)
print('Mean absolute error ', mean_absolute_error(y_test, y_test_predict))
print('Variance score: ', reg.score(X_test, y_test))

# Real (x axis) vs predicted (y axis) target values.
plt.scatter(y_test, y_test_predict)

In [ ]:

Check the second cheat sheet recommendation


In [ ]:
from sklearn import svm

# Select the correct model and define it
# Exercise placeholder: replace `...` with an estimator (presumably from the
# `svm` module imported above -- confirm against the cheat-sheet).
reg_svr = ...

reg_svr.fit(X_train, y_train)

# Evaluate on the held-out test split with the same metrics as the linear models.
y_test_predict = reg_svr.predict(X_test)
print('Mean absolute error ', mean_absolute_error(y_test, y_test_predict))
print('Variance score: ', reg_svr.score(X_test, y_test))

# Real (x axis) vs predicted (y axis) target values.
plt.scatter(y_test, y_test_predict)

In [ ]:

Build a decision tree regressor


In [ ]:
# Import the regression tree function
# Exercise placeholder: replace `...` with the module to import (the line is
# not valid Python until it is filled in). The tree-plotting cell refers to
# the name `tree`, so that is presumably what should be imported here.
from sklearn import ...

# Define the tree
# Exercise placeholder: must bind the name `dtree`, which is fitted below.
... 

dtree.fit(X_train, y_train)

# Evaluate on the held-out test split with the same metrics as the other models.
y_test_predict = dtree.predict(X_test)
print('Mean absolute error ', mean_absolute_error(y_test, y_test_predict))
print('Variance score: ', dtree.score(X_test, y_test))

# Real (x axis) vs predicted (y axis) target values.
plt.scatter(y_test, y_test_predict)

In [ ]:
# A second model, regularized by controlling the depth

# Build a second tree with a max depth of 5
# Exercise placeholders: the two `...` lines must bind the name `dtree2`
# (used below) and presumably fit it on the training split.
...
...

# Evaluate on the held-out test split with the same metrics as the first tree.
y_test_predict = dtree2.predict(X_test)
print('Mean absolute error ', mean_absolute_error(y_test, y_test_predict))
print('Variance score: ', dtree2.score(X_test, y_test))

# Real (x axis) vs predicted (y axis) target values.
plt.scatter(y_test, y_test_predict)

In [ ]:
# Plot the tree
# Export the depth-limited tree as Graphviz DOT text, turn it into a graph
# object with pydotplus, and display the rendered PNG inline.
import pydotplus
from IPython.display import Image

dot_data = tree.export_graphviz(
    dtree2,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=all_data.feature_names,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
png_bytes = graph.create_png()
Image(png_bytes)

In [ ]: