In [1]:
from __future__ import print_function
from sklearn import __version__ as sklearn_version
print('Sklearn version:', sklearn_version)
In [2]:
from sklearn import datasets
all_data = datasets.fetch_california_housing()
# Describe dataset
print(all_data.DESCR)
print(all_data.feature_names)
In [3]:
# Print some data lines
print(all_data.data[:10])
print(all_data.target[:10])
In [4]:
# Shuffle, split into train & test sets, and scale the data
from sklearn.utils import shuffle
X, y = shuffle(all_data.data, all_data.target, random_state=42)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# Scale the data (note: Normalizer rescales each sample to unit norm)
from sklearn.preprocessing import Normalizer
normal = Normalizer()
X_train = normal.fit_transform(X_train)
X_test = normal.transform(X_test)
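Note that Normalizer rescales each sample (row) to unit norm rather than scaling each feature. If per-feature scaling is what is intended here, a minimal sketch with StandardScaler (the variable names below are illustrative) would look like:

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit the scaler on the training data only, then apply it to both splits
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)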
In [5]:
from sklearn import linear_model
reg = linear_model.Ridge()
reg.fit(X_train, y_train)
# Evaluate
from sklearn.metrics import mean_absolute_error
y_test_predict = reg.predict(X_test)
print('Mean absolute error:', mean_absolute_error(y_test, y_test_predict))
print('R2 score:', reg.score(X_test, y_test))
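As a sanity check, a constant mean predictor gives a floor to compare the MAE against; a minimal sketch:

from sklearn.dummy import DummyRegressor
# Always predicts the mean of y_train
baseline = DummyRegressor(strategy='mean')
baseline.fit(X_train, y_train)
print('Baseline MAE:', mean_absolute_error(y_test, baseline.predict(X_test)))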
In [6]:
# Scatter plot of actual vs. predicted values
import matplotlib.pyplot as plt
%matplotlib inline
plt.scatter(y_test, y_test_predict)
Out[6]: [scatter plot of actual vs. predicted values]
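Axis labels and a y = x reference line make the scatter plot easier to read; one possible sketch:

plt.scatter(y_test, y_test_predict)
# Perfect predictions would fall on this diagonal
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel('Actual target')
plt.ylabel('Predicted target')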
In [7]:
# Save model
import joblib  # sklearn.externals.joblib was removed in newer scikit-learn releases
joblib.dump(reg, '/tmp/reg_model.pkl')
Out[7]: [list of written file paths]
In [8]:
# Load model
reg_loaded = joblib.load('/tmp/reg_model.pkl')
In [9]:
# View the coefficients
print('Coefficients:', reg_loaded.coef_)
print('Intercept:', reg_loaded.intercept_)
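Pairing each coefficient with its feature name makes the values easier to interpret; a small sketch:

for name, coef in zip(all_data.feature_names, reg_loaded.coef_):
    print(name, ':', coef)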
In [10]:
# Use RidgeCV to select the best alpha via cross-validation
reg = linear_model.RidgeCV(alphas=[.1, 1., 10.])
reg.fit(X_train, y_train)
print('Best alpha: ', reg.alpha_)
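The grid above is coarse; a common refinement is a wider logarithmic grid (the range below is illustrative):

import numpy as np
reg_cv = linear_model.RidgeCV(alphas=np.logspace(-3, 3, 13))
reg_cv.fit(X_train, y_train)
print('Best alpha:', reg_cv.alpha_)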
In [11]:
# Build a model with the recommended alpha
reg = linear_model.Ridge(alpha=0.1)
reg.fit(X_train, y_train)
y_test_predict = reg.predict(X_test)
print('Mean absolute error:', mean_absolute_error(y_test, y_test_predict))
print('R2 score:', reg.score(X_test, y_test))
plt.scatter(y_test, y_test_predict)
Out[11]: [scatter plot of actual vs. predicted values]
In [12]:
from sklearn import svm
reg_svr = svm.LinearSVR()
reg_svr.fit(X_train, y_train)
y_test_predict = reg_svr.predict(X_test)
print('Mean absolute error:', mean_absolute_error(y_test, y_test_predict))
print('R2 score:', reg_svr.score(X_test, y_test))
plt.scatter(y_test, y_test_predict)
Out[12]: [scatter plot of actual vs. predicted values]
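LinearSVR can stop before converging and emit a ConvergenceWarning; raising max_iter is a common fix (the values below are illustrative):

reg_svr = svm.LinearSVR(max_iter=10000, random_state=42)
reg_svr.fit(X_train, y_train)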
In [13]:
# Basic regression tree
from sklearn import tree
dtree = tree.DecisionTreeRegressor()
dtree.fit(X_train, y_train)
y_test_predict = dtree.predict(X_test)
print('Mean absolute error:', mean_absolute_error(y_test, y_test_predict))
print('R2 score:', dtree.score(X_test, y_test))
plt.scatter(y_test, y_test_predict)
Out[13]: [scatter plot of actual vs. predicted values]
In [14]:
# A second model, regularized by limiting the tree depth
dtree2 = tree.DecisionTreeRegressor(max_depth=5)
dtree2.fit(X_train, y_train)
y_test_predict = dtree2.predict(X_test)
print('Mean absolute error:', mean_absolute_error(y_test, y_test_predict))
print('R2 score:', dtree2.score(X_test, y_test))
plt.scatter(y_test, y_test_predict)
Out[14]: [scatter plot of actual vs. predicted values]
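Rather than fixing max_depth=5 by hand, the depth can be tuned with cross-validation; a minimal sketch (grid and cv settings are illustrative):

from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(tree.DecisionTreeRegressor(random_state=42),
                    param_grid={'max_depth': range(2, 12)},
                    scoring='neg_mean_absolute_error',
                    cv=5)
grid.fit(X_train, y_train)
print('Best max_depth:', grid.best_params_['max_depth'])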
In [15]:
# Plot the tree
import pydotplus
from IPython.display import Image
dot_data = tree.export_graphviz(dtree2, out_file=None,
feature_names=all_data.feature_names,
filled=True, rounded=True,
special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
Out[15]: [rendered decision tree diagram]
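If pydotplus/graphviz is not installed, recent scikit-learn versions (0.21+) ship a matplotlib-based alternative; a sketch:

plt.figure(figsize=(20, 8))
# max_depth here only limits how much of the tree is drawn
tree.plot_tree(dtree2, feature_names=all_data.feature_names,
               filled=True, rounded=True, max_depth=2)
plt.show()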