In [2]:
import pandas as pd
from sklearn import * 
import numpy as np
import matplotlib.pyplot as plt

In [4]:
df = pd.read_csv("/data/bank-full.csv", sep=";")
df.head()


Out[4]:
age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome y
0 58 management married tertiary no 2143 yes no unknown 5 may 261 1 -1 0 unknown no
1 44 technician single secondary no 29 yes no unknown 5 may 151 1 -1 0 unknown no
2 33 entrepreneur married secondary no 2 yes yes unknown 5 may 76 1 -1 0 unknown no
3 47 blue-collar married unknown no 1506 yes no unknown 5 may 92 1 -1 0 unknown no
4 33 unknown single unknown no 1 no no unknown 5 may 198 1 -1 0 unknown no

In [5]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB

In [6]:
df.y.value_counts()


Out[6]:
no     39922
yes     5289
Name: y, dtype: int64

In [7]:
df.y.value_counts()/len(df)


Out[7]:
no     0.883015
yes    0.116985
Name: y, dtype: float64

In [8]:
df_dummy = pd.get_dummies(df, drop_first=True)
df_dummy.head()


Out[8]:
age balance day duration campaign pdays previous job_blue-collar job_entrepreneur job_housemaid ... month_jun month_mar month_may month_nov month_oct month_sep poutcome_other poutcome_success poutcome_unknown y_yes
0 58 2143 5 261 1 -1 0 0 0 0 ... 0 0 1 0 0 0 0 0 1 0
1 44 29 5 151 1 -1 0 0 0 0 ... 0 0 1 0 0 0 0 0 1 0
2 33 2 5 76 1 -1 0 0 1 0 ... 0 0 1 0 0 0 0 0 1 0
3 47 1506 5 92 1 -1 0 1 0 0 ... 0 0 1 0 0 0 0 0 1 0
4 33 1 5 198 1 -1 0 0 0 0 ... 0 0 1 0 0 0 0 0 1 0

5 rows × 43 columns


In [11]:
target = "y_yes"

In [15]:
y = df_dummy[target]
X = df_dummy.drop(columns=target)

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.3, random_state = 1)

cls = tree.DecisionTreeClassifier(
    max_depth=4, 
    min_samples_leaf= 100,
    min_samples_split= 250
)

cls.fit(X_train, y_train)


Out[15]:
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=4, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=100, min_samples_split=250,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [34]:
y_train_pred = cls.predict(X_train)
y_test_pred = cls.predict(X_test)

y_test_prob = cls.predict_proba(X_test)[:, 1]

print("training accuracy: ", metrics.accuracy_score(y_train, y_train_pred))
print("test accuracy: ", metrics.accuracy_score(y_test, y_test_pred))
print("training precision: ", metrics.precision_score(y_train, y_train_pred))
print("test precision: ", metrics.precision_score(y_test, y_test_pred))
print("training recall: ", metrics.recall_score(y_train, y_train_pred))
print("test recall: ", metrics.recall_score(y_test, y_test_pred))


training accuracy:  0.9016652447309381
test accuracy:  0.9009879091713359
training precision:  0.6382508833922261
test precision:  0.608786610878661
training recall:  0.38657035848047083
test recall:  0.37524177949709864

In [21]:
1 - 0.9009879091713359


Out[21]:
0.09901209082866413

In [23]:
m = metrics.confusion_matrix(y_test, y_test_pred)
m


Out[23]:
array([[11639,   374],
       [  969,   582]])

In [27]:
TN = m[0][0]
FP = m[0][1]
FN = m[1][0]
TP = m[1][1]
TN, TP, FP, FN


Out[27]:
(11639, 582, 374, 969)

In [30]:
recall = TP / (TP + FN)
recall


Out[30]:
0.37524177949709864

In [31]:
precision = TP / (TP + FP)
precision


Out[31]:
0.608786610878661

In [35]:
y_test_pred = np.where(y_test_prob>0.5, 1, 0)

print("training accuracy: ", metrics.accuracy_score(y_train, y_train_pred))
print("test accuracy: ", metrics.accuracy_score(y_test, y_test_pred))
print("training precision: ", metrics.precision_score(y_train, y_train_pred))
print("test precision: ", metrics.precision_score(y_test, y_test_pred))
print("training recall: ", metrics.recall_score(y_train, y_train_pred))
print("test recall: ", metrics.recall_score(y_test, y_test_pred))


training accuracy:  0.9016652447309381
test accuracy:  0.9009879091713359
training precision:  0.6382508833922261
test precision:  0.608786610878661
training recall:  0.38657035848047083
test recall:  0.37524177949709864

In [41]:
y_test_pred = np.where(y_test_prob>0.1, 1, 0)

print("training accuracy: ", metrics.accuracy_score(y_train, y_train_pred))
print("test accuracy: ", metrics.accuracy_score(y_test, y_test_pred))
print("training precision: ", metrics.precision_score(y_train, y_train_pred))
print("test precision: ", metrics.precision_score(y_test, y_test_pred))
print("training recall: ", metrics.recall_score(y_train, y_train_pred))
print("test recall: ", metrics.recall_score(y_test, y_test_pred))
print("training f1: ", metrics.f1_score(y_train, y_train_pred))
print("test f1: ", metrics.f1_score(y_test, y_test_pred))


training accuracy:  0.9016652447309381
test accuracy:  0.8561633736360955
training precision:  0.6382508833922261
test precision:  0.41928974979822436
training recall:  0.38657035848047083
test recall:  0.6698903932946486
training f1:  0.4815061646117961
test f1:  0.5157607346736163

f1 = 2 p r / (p + r)


In [43]:
np.unique(y_test_prob)


Out[43]:
array([0.04592271, 0.18364611, 0.18471338, 0.19745223, 0.36373166,
       0.41666667, 0.44907407, 0.51394422, 0.56441718, 0.57439446,
       0.57843137, 0.66743119, 0.75502008, 0.79850746])

In [44]:
len(np.unique(y_test_prob))


Out[44]:
14

In [47]:
from sklearn.tree import export_graphviz
export_graphviz(cls, out_file = "tree.dot", feature_names = X.columns, filled=True)
!dot -Tpng tree.dot -o tree.png

In [52]:
694/1908


Out[52]:
0.3637316561844864

In [50]:
291/436


Out[50]:
0.6674311926605505

In [51]:
332/578


Out[51]:
0.5743944636678201

In [54]:
param_grid = {
    "max_depth": np.arange(2, 6),
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": np.arange(5, 20)
}


gsearch =model_selection.GridSearchCV(cls, param_grid=param_grid, scoring="accuracy"
                                      , cv = 5, verbose = True, n_jobs= 8)
gsearch.fit(X_train, y_train)


Fitting 5 folds for each of 120 candidates, totalling 600 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.4s
[Parallel(n_jobs=8)]: Done 320 tasks      | elapsed:    5.2s
[Parallel(n_jobs=8)]: Done 585 out of 600 | elapsed:    8.5s remaining:    0.2s
[Parallel(n_jobs=8)]: Done 600 out of 600 | elapsed:    8.7s finished
Out[54]:
GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=4,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=100,
                                              min_samples_split=250,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=8,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': array([2, 3, 4, 5]),
                         'min_samples_leaf': array([ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=True)

In [55]:
gsearch.score(X_train, y_train), gsearch.best_score_


Out[55]:
(0.9023288147375739, 0.9006223415023051)

In [56]:
gsearch.best_params_


Out[56]:
{'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 16}

In [ ]: