In [1]:
# Make print() a function on Python 2 as well, then report the installed
# scikit-learn version so the results below can be tied to a library release.
from __future__ import print_function
from sklearn import __version__ as sklearn_version
print('Sklearn version:', sklearn_version)
In [2]:
from sklearn import datasets
# Use the public loader exported by sklearn.datasets. The original went through
# the `datasets.california_housing` submodule, which was deprecated in
# scikit-learn 0.22 and removed in 0.24; the public function exists in both old
# and new releases.
all_data = datasets.fetch_california_housing()
# Print the dataset description shipped with the loaded data.
print(all_data.DESCR)
In [3]:
# Randomize, separate train & test and normalize.
# Every name bound here (X, y, X_train, X_test, y_train, y_test, normal) is a
# notebook-level variable; X_train/X_test are overwritten in place by their
# normalized versions at the end of the cell.
from sklearn.utils import shuffle
# NOTE(review): train_test_split shuffles by default, so this explicit shuffle
# is probably redundant — confirm before removing, since it affects which rows
# land in each split.
X, y = shuffle(all_data.data, all_data.target, random_state=0)
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
# Normalize the data
# NOTE(review): sklearn's Normalizer rescales each sample (row) to unit norm;
# it does not standardize feature columns. If per-feature scaling was intended,
# StandardScaler is the usual choice — confirm the intent.
from sklearn.preprocessing import Normalizer
normal = Normalizer()
X_train = normal.fit_transform(X_train)
X_test = normal.transform(X_test)
In [4]:
# Baseline: a decision tree with default hyper-parameters, fitted on the
# training split and scored by mean absolute error on the test split.
from sklearn import tree
from sklearn.metrics import mean_absolute_error
# fit() returns the estimator itself, so construction and fitting can be chained.
clf = tree.DecisionTreeRegressor().fit(X_train, y_train)
mean_absolute_error(y_test, clf.predict(X_test))
Out[4]:
In [5]:
# Define a function to evaluate the error over models with different max_depth
def acc(md, random_state=0):
    '''
    Calculate the error of a tree with a specific max_depth.

    Despite the name, the returned value is an error (lower is better),
    not an accuracy.

    Parameters:
        md: max depth of the tree
        random_state: seed passed to DecisionTreeRegressor so that repeated
            calls with the same depth build the same tree (default 0)
    Returns:
        Mean absolute error of the fitted tree on (X_test, y_test)
    '''
    # Uses the notebook-level train/test split (X_train, y_train, X_test, y_test).
    # The tree is seeded because scikit-learn permutes features at every split,
    # so an unseeded tree can differ between runs when candidate splits tie.
    model = tree.DecisionTreeRegressor(max_depth=md, random_state=random_state)
    model.fit(X_train, y_train)
    return mean_absolute_error(y_test, model.predict(X_test))
# Evaluate from max_depth=1 to max_depth=30 (inclusive).
# range() excludes its stop value, so the stop must be 31 to reach depth 30;
# the original range(1,30) silently ended at depth 29.
index = []      # depths that were evaluated
accuracy = []   # error returned by acc() for the depth at the same position
for i in range(1, 31):
    accuracy_step = acc(i)
    index.append(i)
    accuracy.append(accuracy_step)
    print('Max depth - Error:', i, accuracy_step)
In [6]:
# Plot the error vs max_depth
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(index,accuracy)
Out[6]:
In [7]:
# Refit a tree limited to depth 9 and report its test-set mean absolute error.
# NOTE(review): 9 is presumably read off the error-vs-depth curve — confirm.
# fit() returns the estimator itself, so construction and fitting are chained.
clf = tree.DecisionTreeRegressor(max_depth=9).fit(X_train, y_train)
mean_absolute_error(y_test, clf.predict(X_test))
Out[7]:
In [8]:
# Scatter plot of the current model's test-set predictions against the actual
# targets. Axis labels are set first so plt.scatter remains the cell's last
# expression.
plt.xlabel('Actual (y_test)')
plt.ylabel('Predicted')
plt.scatter(y_test, clf.predict(X_test))
Out[8]:
In [ ]:
In [9]:
import numpy as np
from time import time
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
# Define estimator. No tuned parameters are set here; the seed only makes the
# tree construction repeatable.
clf = tree.DecisionTreeRegressor(random_state=0)
# Specify parameters and distributions to sample from.
# scipy's randint(low, high) draws integers from low up to high - 1.
param_dist = {"max_depth": randint(3, 20),
              "min_samples_leaf": randint(5, 50)}
# Define randomized search.
# random_state fixes which candidates are sampled, so the ranking reported
# afterwards is the same on every run (the rest of the notebook also seeds
# with 0).
n_iter_search = 30
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, random_state=0)
# Run the randomized search and time it
start = time()
random_search.fit(X_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates parameter settings." % ((time() - start), n_iter_search))
In [10]:
# Utility function to report best scores
def report(results, n_top=3):
    '''
    Print the candidates ranked 1..n_top from a hyper-parameter search.

    Parameters:
        results: mapping indexed by candidate, with 'rank_test_score' and
            'mean_test_score' (array-likes) and 'params' (sequence)
        n_top: highest rank to report (default 3)
    Returns:
        None; the report is written to standard output.
    '''
    ranks = np.asarray(results['rank_test_score'])
    for i in range(1, n_top + 1):
        # flatnonzero yields every candidate holding rank i, so tied candidates
        # are all listed. A rank that nobody holds (ranks are skipped after
        # ties) prints nothing, whereas np.argmax on an all-False mask returns
        # 0 and would wrongly report candidate 0.
        for candidate in np.flatnonzero(ranks == i):
            print("Model with rank: ", i)
            print("Mean validation score: ", results['mean_test_score'][candidate])
            print("Parameters: ", results['params'][candidate], "\n")
# Print the top-ranked candidates of the randomized search (n_top defaults to 3).
report(random_search.cv_results_)
In [11]:
# Build the tree with the optimal parametrization.
# The parameters are read from the fitted search (best_params_) rather than
# hard-coded, so this cell always matches whatever the search actually found
# instead of values copied from one earlier run.
clf = tree.DecisionTreeRegressor(**random_search.best_params_)
clf.fit(X_train, y_train)
# Predict once and reuse the result for both the metric and the plot.
y_pred = clf.predict(X_test)
print(mean_absolute_error(y_test, y_pred))
plt.scatter(y_test, y_pred)
Out[11]:
In [ ]: