In [15]:
import pandas as pd
import numpy as np
%matplotlib inline
from matplotlib import pyplot as plt

Import the data:

In this example we will use a dataset of politicians' affairs: each row describes one politician's attributes and the number of affairs they have had in the past.


In [16]:
path_to_data = r'E:\Universidade\Projects\DSSA\learning-units\units\16-tuning-hyper-parameters\data\affairs.csv'

data = pd.read_csv(path_to_data)
data.head(10)


Out[16]:
sex age ym child religious education occupation rate nbaffairs
0 male 37.0 10.00 no 3 18 7 4 0
1 female 27.0 4.00 no 4 14 6 4 0
2 female 32.0 15.00 yes 1 12 1 4 0
3 male 57.0 15.00 yes 5 18 6 5 0
4 male 22.0 0.75 no 2 17 6 3 0
5 female 32.0 1.50 no 2 17 5 5 0
6 female 22.0 0.75 no 2 12 1 3 0
7 male 57.0 15.00 yes 2 14 4 4 0
8 female 32.0 15.00 yes 4 16 1 2 0
9 male 22.0 1.50 no 4 14 4 5 0
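
Before encoding anything, it is worth confirming which columns are non-numeric and whether any values are missing. A quick sanity check:

# Inspect column types: 'sex' and 'child' are strings and will need encoding
print(data.dtypes)

# Confirm there are no missing values to worry about before mapping
print(data.isnull().sum())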

Import scikit-learn:


In [17]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import classification_report
from sklearn import tree

from scipy.stats import randint

Prepare the dataset for learning:


In [18]:
# Re-read the data so the encoding below starts from the raw strings
data = pd.read_csv(path_to_data)

# Encode the categorical columns as integers
data['child'] = data['child'].map({'no': 0, 'yes': 1}).astype(int)
data['sex']   = data['sex'].map({'male': 0, 'female': 1}).astype(int)

# Target: number of past affairs; features: all remaining columns
y = data['nbaffairs'].values
X = data.drop(['nbaffairs'], axis=1).values

data.head(10)


Out[18]:
sex age ym child religious education occupation rate nbaffairs
0 0 37.0 10.00 0 3 18 7 4 0
1 1 27.0 4.00 0 4 14 6 4 0
2 1 32.0 15.00 1 1 12 1 4 0
3 0 57.0 15.00 1 5 18 6 5 0
4 0 22.0 0.75 0 2 17 6 3 0
5 1 32.0 1.50 0 2 17 5 5 0
6 1 22.0 0.75 0 2 12 1 3 0
7 0 57.0 15.00 1 2 14 4 4 0
8 1 32.0 15.00 1 4 16 1 2 0
9 0 22.0 1.50 0 4 14 4 5 0

Split the data into train and test sets:


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
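
With test_size=0.5, half of the rows go to each split. Because nbaffairs is heavily skewed toward 0, it is worth checking that the label distribution looks similar in both halves; a quick check:

# Compare the label distribution in each split (labels are affair counts)
for name, labels in [('train', y_train), ('test', y_test)]:
    values, counts = np.unique(labels, return_counts=True)
    print(name, dict(zip(values, counts)))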

Hyperparameter tuning

Now we will use the training data to find the best hyperparameters with cross-validation.

First, we will try grid search. It exhaustively evaluates every combination in the parameter grid: here, 5 values of max_depth times 7 values of max_features, i.e. 35 candidates, each scored with 5-fold cross-validation.


In [20]:
# Define the parameter space
parameter_space = [{'max_depth': [1, 2, 3, 4, 5], 'max_features': range(1, 8)}]

# Choose the classifier
classifier = tree.DecisionTreeClassifier()

# Select grid search with cross validation
grid_search = GridSearchCV(classifier, parameter_space, cv=5)

grid_search.fit(X_train, y_train)


Out[20]:
GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'max_features': range(1, 8), 'max_depth': [1, 2, 3, 4, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [21]:
grid_search.best_params_


Out[21]:
{'max_depth': 3, 'max_features': 6}

In [22]:
grid_search.cv_results_['mean_test_score']


Out[22]:
array([ 0.75      ,  0.75      ,  0.75      ,  0.75      ,  0.75      ,
        0.75      ,  0.75      ,  0.74333333,  0.75      ,  0.74333333,
        0.75      ,  0.75      ,  0.74333333,  0.74      ,  0.73666667,
        0.74      ,  0.73333333,  0.73666667,  0.75      ,  0.76666667,
        0.74333333,  0.74666667,  0.74      ,  0.74666667,  0.74333333,
        0.74333333,  0.73      ,  0.74666667,  0.72333333,  0.71666667,
        0.72666667,  0.73333333,  0.71666667,  0.74      ,  0.73666667])
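
These 35 scores correspond to the 5 × 7 grid of (max_depth, max_features) candidates, and the matplotlib import from the top of the notebook can turn them into a heatmap. A sketch, assuming the candidates are ordered with max_depth varying slowest (verify against grid_search.cv_results_['params'] before trusting the axes):

# Reshape the 35 mean scores into a (max_depth, max_features) grid.
# NOTE: this assumes max_depth is the outer loop and max_features the
# inner one -- check grid_search.cv_results_['params'] to confirm.
scores = grid_search.cv_results_['mean_test_score'].reshape(5, 7)

plt.imshow(scores, cmap='viridis')
plt.xticks(range(7), list(range(1, 8)))
plt.yticks(range(5), [1, 2, 3, 4, 5])
plt.xlabel('max_features')
plt.ylabel('max_depth')
plt.colorbar(label='mean CV accuracy')
plt.show()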

In [23]:
grid_search.best_score_


Out[23]:
0.76666666666666672
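
Note that best_score_ is a cross-validated mean on the training half, not a test score. Because GridSearchCV refits the best estimator on all of X_train by default (refit=True), the search object can score the untouched test data directly:

# The refit best estimator scores the held-out test set directly
print(grid_search.score(X_test, y_test))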

Now we will repeat the same exercise with random search. Instead of enumerating every combination, RandomizedSearchCV samples n_iter parameter settings; here max_features is drawn from a uniform integer distribution over 1 to 7.


In [24]:
# Define the parameter space
parameter_space_dist = {"max_depth": [1, 2, 3, 4, 5], "max_features": randint(1, 8)}
             
# Choose the classifier
classifier = tree.DecisionTreeClassifier()

# Run randomized search with 5-fold cross-validation, sampling 500 settings
random_search = RandomizedSearchCV(classifier, parameter_space_dist, cv=5, n_iter=500)
                                   
random_search.fit(X_train, y_train)


Out[24]:
RandomizedSearchCV(cv=5, error_score='raise',
          estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          fit_params={}, iid=True, n_iter=500, n_jobs=1,
          param_distributions={'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000000C935AC8>, 'max_depth': [1, 2, 3, 4, 5]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=0)

In [25]:
random_search.best_params_


Out[25]:
{'max_depth': 4, 'max_features': 2}

In [26]:
random_search.best_score_


Out[26]:
0.75666666666666671
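
Both search objects expose their full results in cv_results_, which loads cleanly into a DataFrame for a side-by-side look at the sampled candidates; a minimal sketch:

# Collect the sampled candidates and their scores into a tidy table
results = pd.DataFrame(random_search.cv_results_)
cols = ['param_max_depth', 'param_max_features', 'mean_test_score', 'std_test_score']
print(results[cols].sort_values('mean_test_score', ascending=False).head())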

Training the classifier

Instantiate the classifier with the best max_depth found above (we use max_depth=4, the value suggested by the random search) and retrain it on the training set:


In [41]:
classifier = tree.DecisionTreeClassifier(max_depth=4)
classifier.fit(X_train, y_train)


Out[41]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [42]:
classifier.score(X_test, y_test)


Out[42]:
0.7441860465116279
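
Accuracy alone hides how the model behaves on the rarer classes (the labels here are affair counts, dominated by 0). The classification_report imported earlier gives a per-class breakdown:

# Per-class precision, recall and F1 on the held-out test set
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))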

In [46]:
tree.export_graphviz(classifier,
                     feature_names=data.drop(['nbaffairs'], axis=1).columns.tolist(),
                     filled=True,
                     out_file=r'E:\Universidade\Projects\DSSA\learning-units\units\16-tuning-hyper-parameters\tree.dot')
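
The .dot file can be converted to an image with the Graphviz command-line tool (for example, dot -Tpng tree.dot -o tree.png), or from Python if the graphviz package is installed. A minimal sketch, assuming both the package and the Graphviz binaries are available and the working directory contains tree.dot:

import graphviz

# Read the exported .dot source and render it to tree.png
with open('tree.dot') as f:
    graph = graphviz.Source(f.read())
graph.format = 'png'
graph.render('tree')  # writes tree.png alongside the .dot file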

In [48]:
classifier.get_params()


Out[48]:
{'class_weight': None,
 'criterion': 'gini',
 'max_depth': 4,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_split': 1e-07,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': False,
 'random_state': None,
 'splitter': 'best'}
