In [ ]:
from __future__ import print_function
from sklearn import __version__ as sklearn_version
print('Sklearn version:', sklearn_version)
In [ ]:
from sklearn import datasets
# Load the California housing dataset through the public loader.
# The `datasets.california_housing` submodule path is a private implementation
# detail that newer scikit-learn releases no longer expose.
all_data = datasets.fetch_california_housing()
# Print the dataset description shipped with the data
print(all_data.DESCR)
In [ ]:
# Randomize, separate train & test and normalize
from sklearn.utils import shuffle
# Shuffle features and targets together; random_state=0 fixes the seed
X, y = shuffle(all_data.data, all_data.target, random_state=0)
from sklearn.model_selection import train_test_split
# Hold out test_size=0.33 of the samples for testing (seed fixed again with random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
# Normalize the data
# NOTE(review): per the scikit-learn docs, Normalizer rescales each sample (row)
# to unit norm instead of scaling each feature (column) -- confirm this is the
# intended preprocessing and not a per-feature scaler such as StandardScaler.
from sklearn.preprocessing import Normalizer
normal = Normalizer()
# Same transformer object is applied to both splits
X_train = normal.fit_transform(X_train)
X_test = normal.transform(X_test)
In [ ]:
# Baseline: a regression tree with default settings, fitted on the training
# split and scored with the mean absolute error on the test split
from sklearn import tree
from sklearn.metrics import mean_absolute_error
clf = tree.DecisionTreeRegressor().fit(X_train, y_train)
baseline_predictions = clf.predict(X_test)
mean_absolute_error(y_test, baseline_predictions)
In [ ]:
# Define a function to evaluate the error over models with different max_depth
def acc(md):
    '''
    Calculate the error of a tree with a specific max_depth

    Parameters:
        md: max depth of the tree
    Returns:
        Mean absolute error of the fitted tree

    NOTE: the `...` statements below are placeholders still to be filled in;
    as written, the function returns the Ellipsis object.
    '''
    # Define model
    ...
    # Fit model
    ...
    # Evaluate and return the error
    ...
    return ...
# Evaluate from max_depth=1 to max_depth=30 (inclusive)
index = []      # max_depth values tried
accuracy = []   # value returned by acc() for each max_depth
# range() excludes its stop value, so 31 is needed to include max_depth=30
for i in range(1, 31):
    accuracy_step = acc(i)
    index.append(i)
    accuracy.append(accuracy_step)
    print('Max depth - Error:', i, accuracy_step)
In [ ]:
# Plot the error vs max_depth
import matplotlib.pyplot as plt
# IPython magic (notebook only, not plain Python): draw figures inline
%matplotlib inline
# x-axis: the max_depth values collected in `index`;
# y-axis: the matching values returned by acc(), collected in `accuracy`
plt.plot(index,accuracy)
In [ ]:
# Define the model with the best parametrization
# (placeholder: `clf` is meant to be rebound here; until it is, the fit below
# reuses whatever `clf` is already bound to)
...
# Fit on the training split, then show the mean absolute error on the test split
clf.fit(X_train, y_train)
mean_absolute_error(y_test, clf.predict(X_test))
In [ ]:
# Plot the scatterplot: test targets (first argument) against the
# predictions of `clf` on the test features (second argument)
plt.scatter(y_test, clf.predict(X_test))
In [ ]:
In [ ]:
import numpy as np
from time import time
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
# Define estimator. No parameters
...
# specify parameters and distributions to sample from (COMPLETE)
param_dist = {"max_depth": randint(3, 20),
              "min_samples_leaf": ...}
# Define randomized search. Complete the function parameters
random_search = RandomizedSearchCV(...)
# Run the randomized search
start = time()
random_search.fit(X_train, y_train)
elapsed = time() - start
# `n_iter_search` was never defined before being printed (NameError). Derive it
# from the fitted search: cv_results_['params'] holds one entry per candidate
# evaluated, so the count is right however the search above is configured.
n_iter_search = len(random_search.cv_results_['params'])
print("RandomizedSearchCV took %.2f seconds for %d candidates parameter settings." % (elapsed, n_iter_search))
In [ ]:
# Utility function to report best scores
def report(results, n_top=3):
    """Print the candidates holding ranks 1..n_top in a search result.

    Parameters:
        results: mapping with the keys 'rank_test_score', 'mean_test_score'
            and 'params', indexed consistently per candidate (the call site
            passes the `cv_results_` of a fitted search).
        n_top: highest rank to report (inclusive).
    Returns:
        None. The report is written to standard output.

    Every candidate sharing a rank is printed, and a rank that no candidate
    holds (ranks are skipped after ties) prints nothing.
    """
    ranks = np.asarray(results['rank_test_score'])
    for i in range(1, n_top + 1):
        # flatnonzero gives the indices of *all* candidates with this rank and
        # an empty array when there is none; argmax on an all-False mask would
        # silently return index 0 and report the wrong candidate.
        for candidate in np.flatnonzero(ranks == i):
            print("Model with rank: ", i)
            print("Mean validation score: ", results['mean_test_score'][candidate])
            print("Parameters: ", results['params'][candidate], "\n")
report(random_search.cv_results_)
In [ ]:
# Build the tree with the optimal parametrization
# Define the model with the best parametrization
...
# Refit on the training split, predict the test split once, and reuse those
# predictions for both the printed mean absolute error and the scatter plot
clf.fit(X_train, y_train)
test_predictions = clf.predict(X_test)
print(mean_absolute_error(y_test, test_predictions))
plt.scatter(y_test, test_predictions)
In [ ]: