Sklearn overfitting control example

- Use the California housing dataset to show how to control overfitting by tuning the model parameters

In [ ]:
from __future__ import print_function

from sklearn import __version__ as sklearn_version
print('Sklearn version:', sklearn_version)

Load data


In [ ]:
from sklearn import datasets
all_data = datasets.fetch_california_housing()
print(all_data.DESCR)

In [ ]:
# Randomize, separate train & test and normalize

from sklearn.utils import shuffle
X, y = shuffle(all_data.data, all_data.target, random_state=0)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# Scale the data (note: Normalizer rescales each sample to unit norm; it is not per-feature standardization)
from sklearn.preprocessing import Normalizer
normal = Normalizer()
X_train = normal.fit_transform(X_train)
X_test = normal.transform(X_test)
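
If per-feature standardization is preferred over per-sample normalization, a StandardScaler could be swapped in; this is a sketch of the alternative, not part of the original exercise:

In [ ]:
# Alternative (sketch): standardize each feature to zero mean and unit variance
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)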

In [ ]:
# Create a basic decision tree
from sklearn import tree
from sklearn.metrics import mean_absolute_error

clf = tree.DecisionTreeRegressor()
clf.fit(X_train, y_train)
mean_absolute_error(y_test, clf.predict(X_test))
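
An unconstrained tree usually fits the training data almost perfectly while doing noticeably worse on the test set; comparing both errors makes the overfitting visible (a quick check reusing the objects defined above):

In [ ]:
# Compare train and test error of the unconstrained tree
print('Train MAE:', mean_absolute_error(y_train, clf.predict(X_train)))
print('Test MAE: ', mean_absolute_error(y_test, clf.predict(X_test)))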

In [ ]:
# Define a function to evaluate the error over models with different max_depth
def acc(md):
    '''
    Calculate the error of a tree with a specific max_depth

    Parameters:
        md: max depth of the tree
    
    Returns:
        Mean absolute error of the fitted tree
    '''
    # Define model
    ...
    # Fit model
    ...
    # Evaluate and return the error
    ...
    return ...



# Evaluate from max_depth=1 to max_depth=29
index = []
errors = []
for i in range(1, 30):
    error_step = acc(i)
    index += [i]
    errors += [error_step]
    print('Max depth - Error:', i, error_step)
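
One possible completion of the acc helper (a sketch, not the only valid answer; it assumes the blank is meant to train a DecisionTreeRegressor limited to max_depth=md and score it with mean absolute error on the test set):

In [ ]:
# Possible completion (sketch) of the acc helper
def acc(md):
    # Define model with the given max_depth
    model = tree.DecisionTreeRegressor(max_depth=md)
    # Fit model on the training data
    model.fit(X_train, y_train)
    # Evaluate and return the error on the test set
    return mean_absolute_error(y_test, model.predict(X_test))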

In [ ]:
# Plot the error vs max_depth
import matplotlib.pyplot as plt
%matplotlib inline

plt.plot(index, errors)
plt.xlabel('max_depth')
plt.ylabel('Mean absolute error')

Fit the best model


In [ ]:
# Define the model with the best parametrization
...

clf.fit(X_train, y_train)
mean_absolute_error(y_test, clf.predict(X_test))
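
One possible way to fill in the blank above (a sketch; it assumes the best parametrization is simply the max_depth that minimized the test error in the previous plot):

In [ ]:
# Possible completion (sketch): refit a tree constrained to the best depth found above
best_depth = index[errors.index(min(errors))]
clf = tree.DecisionTreeRegressor(max_depth=best_depth)
clf.fit(X_train, y_train)
mean_absolute_error(y_test, clf.predict(X_test))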

In [ ]:
# Scatter plot of true target values vs. predictions
plt.scatter(y_test, clf.predict(X_test))

In [ ]:

A better way. Use a model_selection tool: RandomizedSearchCV


In [ ]:
import numpy as np

from time import time
from scipy.stats import randint

from sklearn.model_selection import RandomizedSearchCV


# Define estimator. No parameters
...


# specify parameters and distributions to sample from (COMPLETE)
param_dist = {"max_depth": randint(3, 20), 
              "min_samples_leaf": ...}


# Define randomized search. Complete the function parameters
random_search = RandomizedSearchCV(...)


# Run the randomized search
start = time()
random_search.fit(X_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates parameter settings." % ((time() - start), n_iter_search))

In [ ]:
# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidate = np.argmax(results['rank_test_score'] == i)
        print("Model with rank: ", i)
        print("Mean validation score: ", results['mean_test_score'][candidate])
        print("Parameters: ", results['params'][candidate], "\n")
            
report(random_search.cv_results_)

In [ ]:
# Build the tree with the optimal parametrization

# Define the model with the best parametrization
...

clf.fit(X_train, y_train)
print(mean_absolute_error(y_test, clf.predict(X_test)))

plt.scatter(y_test, clf.predict(X_test))
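
One way to complete this final step (a sketch) is to reuse the parameters found by the randomized search:

In [ ]:
# Possible completion (sketch): rebuild the tree with the best parameters from the search
clf = tree.DecisionTreeRegressor(**random_search.best_params_)
clf.fit(X_train, y_train)
print(mean_absolute_error(y_test, clf.predict(X_test)))

plt.scatter(y_test, clf.predict(X_test))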

In [ ]: