In [1]:
# %load nbinit.py
from IPython.display import HTML
HTML("<style>.container { width: 100% !important; padding-left: 1em; padding-right: 2em; } div.output_stderr { background: #FFA; }</style>")


Out[1]:

Decision Tree

Let's see how well a decision tree can classify the data. Hereby we need to consider

  1. the parameters to the classifier, and
  2. the features of the data set that will be used. We may just explore the impact of the maximum depth of the decision tree. Two of the 16 features ('day' and 'month') may not be useful because they reflect a date, and we're not looking for seasonal effects. So, it's fairly safe to take them out.

Once the dataset is loaded we will convert the categorical data into numeric values.

Finding the right parameters and features for the best performing classifier can be a challenge. The number of possible configurations grows quickly, and knowing how they perform requires training and testing with each of them.

We may also run the training and testing on a configuration multiple times with different random splits of the data set. The performance metrics will be avaraged over the iterations.

We use percision, recall, and the F1 score to evaluate each configuration.


In [29]:
### Load Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import sklearn.tree
import pydot_ng as pdot
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
import itertools

Reading Data


In [2]:
### Read data
DATAFILE = '/home/data/archive.ics.uci.edu/BankMarketing/bank.csv'
df = pd.read_csv(DATAFILE, sep=';')

In [3]:
### use sets and '-' difference operation 'A-B'. Also there is a symmetric different '^'
all_features = set(df.columns)-set(['y'])
num_features = set(df.describe().columns)
cat_features = all_features-num_features
print("All features:         ", ", ".join(all_features), "\nNumerical features:   ", ", ".join(num_features), "\nCategorical features: ", ", ".join(cat_features))


All features:          balance, day, education, previous, loan, contact, pdays, marital, duration, job, campaign, month, poutcome, age, default, housing 
Numerical features:    balance, day, duration, previous, campaign, age, pdays 
Categorical features:  job, education, month, loan, contact, poutcome, default, marital, housing

In [30]:
### convert to categorical variables to numeric ones
level_substitution = {}

def levels2index(levels):
    dct = {}
    for i in range(len(levels)):
        dct[levels[i]] = i
    return dct

df_num = df.copy()

for c in cat_features:
    level_substitution[c] = levels2index(df[c].unique())
    df_num[c].replace(level_substitution[c], inplace=True)

## same for target
df_num.y.replace({'no':0, 'yes':1}, inplace=True)
df_num


Out[30]:
age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome y
0 30 0 0 0 0 1787 0 0 0 19 0 79 1 -1 0 0 0
1 33 1 0 1 0 4789 1 1 0 11 1 220 1 339 4 1 0
2 35 2 1 2 0 1350 1 0 0 16 2 185 1 330 1 1 0
3 30 2 0 2 0 1476 1 1 1 3 3 199 4 -1 0 0 0
4 59 3 0 1 0 0 1 0 1 5 1 226 1 -1 0 0 0
5 35 2 1 2 0 747 0 0 0 23 4 141 2 176 3 1 0
6 36 4 0 2 0 307 1 0 0 14 1 341 1 330 2 2 0
7 39 5 0 1 0 147 1 0 0 6 1 151 2 -1 0 0 0
8 41 6 0 2 0 221 1 0 1 14 1 57 2 -1 0 0 0
9 43 1 0 0 0 -88 1 1 0 17 2 313 1 147 2 1 0
10 39 1 0 1 0 9374 1 0 1 20 1 273 1 -1 0 0 0
11 43 7 0 1 0 264 1 0 0 17 2 113 2 -1 0 0 0
12 36 5 0 2 0 1109 0 0 0 13 5 328 2 -1 0 0 0
13 20 8 1 1 0 502 0 0 0 30 2 261 1 -1 0 0 1
14 31 3 0 1 0 360 1 1 0 29 6 89 1 241 1 1 0
15 40 2 0 2 0 194 0 1 0 29 5 189 2 -1 0 0 0
16 56 5 0 1 0 4073 0 0 0 27 5 239 5 -1 0 0 0
17 37 7 1 2 0 2317 1 0 0 20 2 114 1 152 2 1 0
18 25 3 1 0 0 -221 1 0 1 23 1 250 1 -1 0 0 0
19 31 1 0 1 0 132 0 0 0 7 7 148 1 152 1 2 0
20 38 2 2 3 0 0 1 0 0 18 8 96 2 -1 0 0 0
21 42 2 2 2 0 16 0 0 0 19 8 140 3 -1 0 0 0
22 44 1 1 1 0 106 0 0 1 12 3 109 2 -1 0 0 0
23 44 6 0 1 0 93 0 0 0 7 7 125 2 -1 0 0 0
24 26 9 0 2 0 543 0 0 0 30 6 169 3 -1 0 0 0
25 41 2 0 2 0 5883 0 0 0 20 8 182 2 -1 0 0 0
26 55 3 0 0 0 627 1 0 1 5 1 247 1 -1 0 0 0
27 67 10 0 3 0 696 0 0 2 17 5 119 1 105 2 1 0
28 56 4 0 1 0 784 0 1 0 30 7 149 2 -1 0 0 0
29 53 7 0 1 0 105 0 1 0 21 5 74 2 -1 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4491 35 3 1 1 0 0 1 0 0 16 2 169 1 -1 0 0 0
4492 32 5 1 1 0 309 1 1 0 16 2 346 1 234 3 1 0
4493 28 5 1 2 0 0 1 0 1 4 3 205 6 -1 0 0 0
4494 26 5 1 1 0 668 1 0 1 28 1 576 3 -1 0 0 1
4495 48 2 0 2 0 1175 1 0 2 18 8 1476 3 -1 0 0 0
4496 30 3 1 1 0 363 0 0 0 28 7 171 3 -1 0 0 0
4497 31 6 1 2 0 38 0 0 0 20 8 185 2 -1 0 0 0
4498 31 2 0 2 0 1183 1 0 1 27 1 676 6 -1 0 0 0
4499 45 3 2 0 0 942 0 0 0 21 8 362 1 -1 0 0 0
4500 38 7 0 1 0 4196 1 0 0 12 1 193 2 -1 0 0 0
4501 34 2 0 2 0 297 1 0 0 26 5 63 4 -1 0 0 0
4502 42 1 0 1 0 -91 1 1 0 5 4 43 1 -1 0 0 0
4503 60 4 0 0 0 362 0 1 0 29 7 816 6 -1 0 0 1
4504 42 3 1 1 0 1080 1 1 0 13 1 951 3 370 4 1 1
4505 32 7 1 1 0 620 1 0 1 26 1 1234 3 -1 0 0 1
4506 42 0 2 2 0 -166 0 0 0 29 5 85 4 -1 0 0 0
4507 33 1 0 1 0 288 1 0 0 17 2 306 1 -1 0 0 0
4508 42 7 0 3 0 642 1 1 1 16 1 509 2 -1 0 0 0
4509 51 5 0 2 0 2506 0 0 0 30 8 210 3 -1 0 0 0
4510 36 5 2 1 0 566 1 0 1 20 1 129 2 -1 0 0 0
4511 46 3 0 1 0 668 1 0 1 15 1 1263 2 -1 0 0 1
4512 40 3 0 1 0 1100 1 0 1 29 1 660 2 -1 0 0 0
4513 49 3 0 1 0 322 0 0 0 14 5 356 2 -1 0 0 0
4514 38 3 0 1 0 1205 1 0 0 20 2 45 4 153 1 1 0
4515 32 1 1 1 0 473 1 0 0 7 7 624 5 -1 0 0 0
4516 33 1 0 1 0 -333 1 0 0 30 7 329 5 -1 0 0 0
4517 57 4 0 2 1 -3313 1 1 1 9 1 153 1 -1 0 0 0
4518 57 5 0 1 0 295 0 0 0 19 5 151 11 -1 0 0 0
4519 28 3 0 1 0 1137 0 0 0 6 4 129 4 211 3 2 0
4520 44 6 1 2 0 1136 1 1 0 3 2 345 2 249 7 2 0

4521 rows × 17 columns


In [33]:
### create feature matrix and target vector
X = df_num[list(all_features-set(['day', 'month']))].as_matrix()
y = df_num.y.as_matrix()
X, y


Out[33]:
(array([[1787,    0,    0, ...,   30,    0,    0],
        [4789,    1,    4, ...,   33,    0,    1],
        [1350,    2,    1, ...,   35,    0,    1],
        ..., 
        [ 295,    1,    0, ...,   57,    0,    0],
        [1137,    1,    3, ...,   28,    0,    0],
        [1136,    2,    7, ...,   44,    0,    1]]),
 array([0, 0, 0, ..., 0, 0, 0]))

Evaluation

Test how Maximum Depth of tree impacts performance


In [34]:
for d in [3, 5, 7, 11, 13]:
    clf = DecisionTreeClassifier(max_depth=d)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)
    clf.fit(X_train, y_train)
     = clf.predict(X_test)
    print('Depth %d' % d)
    print(classification_report(y_test, ))


Depth 3
             precision    recall  f1-score   support

          0       0.93      0.97      0.95      1620
          1       0.55      0.35      0.43       189

avg / total       0.89      0.90      0.89      1809

Depth 5
             precision    recall  f1-score   support

          0       0.93      0.96      0.94      1620
          1       0.51      0.34      0.41       189

avg / total       0.88      0.90      0.89      1809

Depth 7
             precision    recall  f1-score   support

          0       0.93      0.96      0.94      1620
          1       0.51      0.34      0.41       189

avg / total       0.88      0.90      0.89      1809

Depth 11
             precision    recall  f1-score   support

          0       0.93      0.94      0.93      1620
          1       0.41      0.38      0.39       189

avg / total       0.87      0.88      0.88      1809

Depth 13
             precision    recall  f1-score   support

          0       0.93      0.92      0.92      1620
          1       0.37      0.42      0.40       189

avg / total       0.87      0.86      0.87      1809

Two methods from sklearn.metrics can be helpful:

  1. confusion_matrix produces a confusion matrix
  2. precision_recall_fscore_support returns a matrix with values for each of them across all target levels.

In [36]:
cm = confusion_matrix(y_test, )
cm


Out[36]:
array([[1484,  136],
       [ 109,   80]])

In [37]:
prf1s = precision_recall_fscore_support(y_test, )
prf1s


Out[37]:
(array([ 0.93157564,  0.37037037]),
 array([ 0.91604938,  0.42328042]),
 array([ 0.92374728,  0.39506173]),
 array([1620,  189]))

In [10]:
perf = None
for i in range(100):
    if type(perf)!=type(None):
        perf = np.vstack((perf, np.array(prf1s).reshape(1,8)))
    else:
        perf = np.array(prf1s).reshape(1,8)
perf_agg = perf.mean(axis=0)
pd.DataFrame(perf_agg.reshape(1,8), columns=[[b for a in ['Precision', 'Recall', 'F1_score', 'Support'] for b in [a, a]], ['no', 'yes']*4])
##pd.DataFrame([5,5, 'a|b|c'] + list(perf.mean(axis=0)), columns=perf_df.columns)


Out[10]:
Precision Recall F1_score Support
no yes no yes no yes no yes
0 0.933374 0.462428 0.942593 0.42328 0.937961 0.441989 1620.0 189.0

In [14]:
performance_df = pd.DataFrame(columns=[
        ['Params']*3 + [b for a in ['Precision', 'Recall', 'F1_score', 'Support'] for b in [a, a]],
        ['MaxDepth', 'Nfeature', 'Features'] + ['no', 'yes']*4
    ])
tempdf = pd.concat([
        pd.DataFrame({'a': [1], 'b': [2], 'c': ['Hello']}),
        pd.DataFrame(np.zeros((1,8)))
    ], axis=1, ignore_index=True)

tempdf.columns=performance_df.columns
#performance_df
tempdf


Out[14]:
Params Precision Recall F1_score Support
MaxDepth Nfeature Features no yes no yes no yes no yes
0 1 2 Hello 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

In [274]:
pd.DataFrame(np.zeros(8).reshape(1,8))


Out[274]:
0 1 2 3 4 5 6 7
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

The Heavy Lifting

Now, let's run the performance evaluation across a number of configurations. We'll collect the results for each configuration into a dataframe.


In [41]:
# creating a template (i.e. empty table)
performance_template_df = pd.DataFrame(columns= [
        ['Params']*3 + [b for a in ['Precision', 'Recall', 'F1_score', 'Support'] for b in [a, a]],
        ['MaxDepth', 'Nfeature', 'Features'] + ['no', 'yes']*4
    ])
performance_template_df


Out[41]:
Params Precision Recall F1_score Support
MaxDepth Nfeature Features no yes no yes no yes no yes

The following code implements nested loops for MaxDepth, number and permutation of features. In addition, we have an internal loop to aggregate the performance metrics over a number of different random splits.

The outer two loops, however, only iterate over one value each. The commmented code shows how they should run...


In [42]:
%%time
performance_df = performance_template_df.copy() #-- always start fresh

for MaxDepth in [5]: ###range(5,9):
    for Nftr in [8]: ###[len(all_features) - k for k in range(len(all_features)-2))]:
        for ftrs in itertools.combinations(all_features-set(['day', 'month']), Nftr):
            X = df_num[list(ftrs)].as_matrix()
            clf = DecisionTreeClassifier(max_depth=MaxDepth)

            perf_arr = None    #-- this array will hold results for different random samples
            for i in range(10): ### running train and test on different random samples
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=i)
                clf.fit(X_train, y_train)
                 = clf.predict(X_test)
                #Prec, Recall, F1, Supp 
                prf1s = precision_recall_fscore_support(y_test, )

                ## 
                if type(perf_arr)!=type(None):
                    perf_arr = np.vstack((perf, np.array(prf1s).reshape(1,8)))
                else:
                    perf_arr = np.array(prf1s).reshape(1,8)
            perf_agg = perf_arr.mean(axis=0)  #-- mean over rows, for each column
            perf_df = pd.concat([    #-- creating a 1 row dataframe is a bit tricky because of the different data types
                        pd.DataFrame({'a': [MaxDepth], 'b': [Nftr], 'c': ['|'.join(list(ftrs))]}),
                        pd.DataFrame(perf_agg.reshape(1, 8))
                    ], axis=1, ignore_index=True)
            perf_df.columns=performance_df.columns
            performance_df = performance_df.append(perf_df, ignore_index=True)


/usr/lib64/python3.4/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
CPU times: user 2min 15s, sys: 13.7 ms, total: 2min 15s
Wall time: 2min 15s

In [43]:
performance_df


Out[43]:
Params Precision Recall F1_score Support
MaxDepth Nfeature Features no yes no yes no yes no yes
0 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.933046 0.463507 0.942820 0.422060 0.937899 0.441508 1611.732673 188.316832
1 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.932785 0.462509 0.943047 0.419750 0.937859 0.438769 1611.732673 188.316832
2 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.932806 0.463350 0.943060 0.419915 0.937876 0.439048 1611.732673 188.316832
3 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.932874 0.464323 0.943047 0.420492 0.937909 0.439919 1611.732673 188.316832
4 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.932799 0.463790 0.943085 0.419832 0.937883 0.438933 1611.732673 188.316832
5 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.932806 0.463350 0.943060 0.419915 0.937876 0.439048 1611.732673 188.316832
6 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.932779 0.463181 0.943085 0.419667 0.937872 0.438655 1611.732673 188.316832
7 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.933073 0.464120 0.942884 0.422225 0.937943 0.441793 1611.732673 188.316832
8 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.933084 0.464179 0.942884 0.422307 0.937949 0.441880 1611.732673 188.316832
9 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.933220 0.464450 0.942833 0.423380 0.937998 0.442813 1611.732673 188.316832
10 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.932946 0.464037 0.942972 0.421152 0.937915 0.440707 1611.732673 188.316832
11 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.933084 0.464179 0.942884 0.422307 0.937949 0.441880 1611.732673 188.316832
12 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.933125 0.464300 0.942871 0.422637 0.937965 0.442191 1611.732673 188.316832
13 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.932785 0.462509 0.943047 0.419750 0.937859 0.438769 1611.732673 188.316832
14 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.932907 0.465465 0.943085 0.420740 0.937944 0.440325 1611.732673 188.316832
15 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.932789 0.463507 0.943085 0.419750 0.937878 0.438795 1611.732673 188.316832
16 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.932785 0.462509 0.943047 0.419750 0.937859 0.438769 1611.732673 188.316832
17 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.932754 0.461150 0.943035 0.419502 0.937836 0.438346 1611.732673 188.316832
18 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.932864 0.464186 0.943047 0.420410 0.937903 0.439798 1611.732673 188.316832
19 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.932799 0.463790 0.943085 0.419832 0.937883 0.438933 1611.732673 188.316832
20 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.932793 0.462539 0.943035 0.419832 0.937858 0.438895 1611.732673 188.316832
21 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.932767 0.462093 0.943060 0.419585 0.937854 0.438499 1611.732673 188.316832
22 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.932925 0.465110 0.943060 0.420905 0.937943 0.440517 1611.732673 188.316832
23 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.932874 0.464323 0.943047 0.420492 0.937909 0.439919 1611.732673 188.316832
24 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.932865 0.464450 0.943060 0.420410 0.937909 0.439813 1611.732673 188.316832
25 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.932799 0.463790 0.943085 0.419832 0.937883 0.438933 1611.732673 188.316832
26 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.932782 0.464150 0.943111 0.419667 0.937885 0.438671 1611.732673 188.316832
27 5.0 8.0 balance|education|previous|loan|contact|pdays|... 0.932767 0.462093 0.943060 0.419585 0.937854 0.438499 1611.732673 188.316832
28 5.0 8.0 balance|education|previous|loan|contact|marita... 0.932927 0.462350 0.942783 0.421152 0.937818 0.440442 1611.732673 188.316832
29 5.0 8.0 balance|education|previous|loan|contact|marita... 0.932941 0.462706 0.942820 0.421235 0.937843 0.440589 1611.732673 188.316832
... ... ... ... ... ... ... ... ... ... ... ...
2973 5.0 8.0 contact|pdays|marital|job|campaign|poutcome|ag... 0.932915 0.465019 0.943060 0.420822 0.937937 0.440404 1611.732673 188.316832
2974 5.0 8.0 contact|pdays|marital|job|campaign|poutcome|ag... 0.932860 0.465666 0.943111 0.420327 0.937929 0.439750 1611.732673 188.316832
2975 5.0 8.0 contact|pdays|marital|job|campaign|poutcome|de... 0.932830 0.464838 0.943098 0.420080 0.937906 0.439347 1611.732673 188.316832
2976 5.0 8.0 contact|pdays|marital|job|campaign|age|default... 0.932773 0.463790 0.943111 0.419585 0.937880 0.438527 1611.732673 188.316832
2977 5.0 8.0 contact|pdays|marital|job|poutcome|age|default... 0.932849 0.465145 0.943098 0.420245 0.937917 0.439607 1611.732673 188.316832
2978 5.0 8.0 contact|pdays|marital|campaign|poutcome|age|de... 0.932859 0.465275 0.943098 0.420327 0.937923 0.439734 1611.732673 188.316832
2979 5.0 8.0 contact|pdays|duration|job|campaign|poutcome|a... 0.933086 0.463589 0.942795 0.422390 0.937909 0.441804 1611.732673 188.316832
2980 5.0 8.0 contact|pdays|duration|job|campaign|poutcome|a... 0.933086 0.463589 0.942795 0.422390 0.937909 0.441804 1611.732673 188.316832
2981 5.0 8.0 contact|pdays|duration|job|campaign|poutcome|d... 0.933165 0.464224 0.942833 0.422967 0.937969 0.442435 1611.732673 188.316832
2982 5.0 8.0 contact|pdays|duration|job|campaign|age|defaul... 0.933107 0.463625 0.942783 0.422555 0.937914 0.441945 1611.732673 188.316832
2983 5.0 8.0 contact|pdays|duration|job|poutcome|age|defaul... 0.933087 0.463673 0.942808 0.422390 0.937916 0.441826 1611.732673 188.316832
2984 5.0 8.0 contact|pdays|duration|campaign|poutcome|age|d... 0.933086 0.463589 0.942795 0.422390 0.937909 0.441804 1611.732673 188.316832
2985 5.0 8.0 contact|pdays|job|campaign|poutcome|age|defaul... 0.932870 0.465770 0.943111 0.420410 0.937935 0.439876 1611.732673 188.316832
2986 5.0 8.0 contact|marital|duration|job|campaign|poutcome... 0.933036 0.464323 0.942934 0.421895 0.937946 0.441527 1611.732673 188.316832
2987 5.0 8.0 contact|marital|duration|job|campaign|poutcome... 0.933058 0.463664 0.942833 0.422142 0.937912 0.441617 1611.732673 188.316832
2988 5.0 8.0 contact|marital|duration|job|campaign|poutcome... 0.933138 0.463712 0.942770 0.422802 0.937925 0.442159 1611.732673 188.316832
2989 5.0 8.0 contact|marital|duration|job|campaign|age|defa... 0.932975 0.463166 0.942846 0.421482 0.937873 0.440913 1611.732673 188.316832
2990 5.0 8.0 contact|marital|duration|job|poutcome|age|defa... 0.933090 0.463850 0.942833 0.422390 0.937929 0.441871 1611.732673 188.316832
2991 5.0 8.0 contact|marital|duration|campaign|poutcome|age... 0.933058 0.463664 0.942833 0.422142 0.937912 0.441617 1611.732673 188.316832
2992 5.0 8.0 contact|marital|job|campaign|poutcome|age|defa... 0.932830 0.464838 0.943098 0.420080 0.937906 0.439347 1611.732673 188.316832
2993 5.0 8.0 contact|duration|job|campaign|poutcome|age|def... 0.933098 0.463732 0.942808 0.422472 0.937921 0.441908 1611.732673 188.316832
2994 5.0 8.0 pdays|marital|duration|job|campaign|poutcome|a... 0.933100 0.463908 0.942833 0.422472 0.937934 0.441954 1611.732673 188.316832
2995 5.0 8.0 pdays|marital|duration|job|campaign|poutcome|a... 0.933100 0.463908 0.942833 0.422472 0.937934 0.441954 1611.732673 188.316832
2996 5.0 8.0 pdays|marital|duration|job|campaign|poutcome|d... 0.933062 0.464058 0.942884 0.422142 0.937938 0.441706 1611.732673 188.316832
2997 5.0 8.0 pdays|marital|duration|job|campaign|age|defaul... 0.933120 0.463846 0.942808 0.422637 0.937933 0.442071 1611.732673 188.316832
2998 5.0 8.0 pdays|marital|duration|job|poutcome|age|defaul... 0.933100 0.463908 0.942833 0.422472 0.937934 0.441954 1611.732673 188.316832
2999 5.0 8.0 pdays|marital|duration|campaign|poutcome|age|d... 0.933100 0.463908 0.942833 0.422472 0.937934 0.441954 1611.732673 188.316832
3000 5.0 8.0 pdays|marital|job|campaign|poutcome|age|defaul... 0.932862 0.466100 0.943123 0.420327 0.937935 0.439765 1611.732673 188.316832
3001 5.0 8.0 pdays|duration|job|campaign|poutcome|age|defau... 0.932995 0.463234 0.942833 0.421647 0.937878 0.441081 1611.732673 188.316832
3002 5.0 8.0 marital|duration|job|campaign|poutcome|age|def... 0.933068 0.463728 0.942833 0.422225 0.937917 0.441702 1611.732673 188.316832

3003 rows × 11 columns

That took a while (about 2 minutes). Once computations take that long we should look at a different way to implement them ... outside the notebook .

Let's see what the best performing configuration with respect to the F1-score of 'yes' is:


In [61]:
best = performance_df.F1_score.yes.argmax()
print(performance_df.iloc[best])
print("\nFeatures: ", ', '.join([ '"%s"'%f for f in performance_df.iloc[best].Params.Features.split('|') ], ))


Params     MaxDepth                                                    5
           Nfeature                                                    8
           Features    balance|education|previous|loan|contact|pdays|...
Precision  no                                                    0.93322
           yes                                                   0.46445
Recall     no                                                   0.942833
           yes                                                   0.42338
F1_score   no                                                   0.937998
           yes                                                  0.442813
Support    no                                                    1611.73
           yes                                                   188.317
Name: 9, dtype: object

Features:  "balance", "education", "previous", "loan", "contact", "pdays", "duration", "poutcome"

In [ ]: