In [2]:
# Explicit imports instead of `from sklearn import *` — the wildcard pollutes
# the namespace and hides where model_selection / tree / metrics come from.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import metrics, model_selection, tree
In [4]:
# Load the bank marketing dataset (semicolon-delimited CSV).
# NOTE(review): hardcoded absolute path — assumes /data/bank-full.csv exists
# on this machine; consider a configurable DATA_DIR.
df = pd.read_csv("/data/bank-full.csv", sep=";")
df.head()
Out[4]:
In [5]:
# Column dtypes and non-null counts.
df.info()
In [6]:
# Raw counts of the two target classes.
df.y.value_counts()
Out[6]:
In [7]:
# Class proportions — shows how imbalanced the target is.
df.y.value_counts()/len(df)
Out[7]:
In [8]:
# One-hot encode every categorical column; drop_first removes the redundant
# first level and turns the target `y` into a single `y_yes` indicator.
df_dummy = pd.get_dummies(df, drop_first=True)
df_dummy.head()
Out[8]:
In [11]:
# Name of the one-hot encoded target column produced by get_dummies above.
target = "y_yes"
In [15]:
# Separate features from the target, hold out 30% of rows for testing
# (fixed seed so the split is reproducible), and fit a shallow, regularized
# decision tree.
X = df_dummy.drop(columns=target)
y = df_dummy[target]
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=1
)
cls = tree.DecisionTreeClassifier(
    max_depth=4, min_samples_leaf=100, min_samples_split=250
)
cls.fit(X_train, y_train)
Out[15]:
In [34]:
# Hard predictions on both splits, plus positive-class probabilities
# (used later for threshold tuning).
y_train_pred = cls.predict(X_train)
y_test_pred = cls.predict(X_test)
y_test_prob = cls.predict_proba(X_test)[:, 1]

# Print each metric on train and test side by side.
for _name, _score in (
    ("accuracy", metrics.accuracy_score),
    ("precision", metrics.precision_score),
    ("recall", metrics.recall_score),
):
    print("training " + _name + ": ", _score(y_train, y_train_pred))
    print("test " + _name + ": ", _score(y_test, y_test_pred))
In [21]:
# Implied error rate (1 - accuracy).
# NOTE(review): 0.9009... is presumably the test accuracy printed above —
# recompute from the metric rather than pasting the constant.
1 - 0.9009879091713359
Out[21]:
In [23]:
# Confusion matrix on the test set. sklearn convention: rows = true labels,
# columns = predicted labels, so for 0/1 targets m[0][0] is TN and m[1][1] is TP.
m = metrics.confusion_matrix(y_test, y_test_pred)
m
Out[23]:
In [27]:
# Unpack the 2x2 confusion matrix. ravel() flattens it row-major, which for
# sklearn's (true, predicted) layout yields exactly (TN, FP, FN, TP).
TN, FP, FN, TP = m.ravel()
TN, TP, FP, FN
Out[27]:
In [30]:
# Recall = TP / (TP + FN): fraction of actual positives that were caught.
recall = TP / (TP + FN)
recall
Out[30]:
In [31]:
# Precision = TP / (TP + FP): fraction of positive predictions that are correct.
precision = TP / (TP + FP)
precision
Out[31]:
In [35]:
# Re-derive test predictions by thresholding the probabilities at 0.5
# (equivalent to the classifier's default predict()).
# NOTE(review): y_train_pred is NOT recomputed in this cell, so the
# "training" lines below still reflect the previous cell's predictions.
y_test_pred = np.where(y_test_prob>0.5, 1, 0)
print("training accuracy: ", metrics.accuracy_score(y_train, y_train_pred))
print("test accuracy: ", metrics.accuracy_score(y_test, y_test_pred))
print("training precision: ", metrics.precision_score(y_train, y_train_pred))
print("test precision: ", metrics.precision_score(y_test, y_test_pred))
print("training recall: ", metrics.recall_score(y_train, y_train_pred))
print("test recall: ", metrics.recall_score(y_test, y_test_pred))
In [41]:
# Lower the decision threshold to 0.1, trading precision for recall on the
# minority positive class.
# NOTE(review): the "training" lines still use the unthresholded
# y_train_pred from the earlier cell — only the test predictions change here.
y_test_pred = np.where(y_test_prob>0.1, 1, 0)
print("training accuracy: ", metrics.accuracy_score(y_train, y_train_pred))
print("test accuracy: ", metrics.accuracy_score(y_test, y_test_pred))
print("training precision: ", metrics.precision_score(y_train, y_train_pred))
print("test precision: ", metrics.precision_score(y_test, y_test_pred))
print("training recall: ", metrics.recall_score(y_train, y_train_pred))
print("test recall: ", metrics.recall_score(y_test, y_test_pred))
print("training f1: ", metrics.f1_score(y_train, y_train_pred))
print("test f1: ", metrics.f1_score(y_test, y_test_pred))
# F1 is the harmonic mean of precision (p) and recall (r):
#   f1 = 2 * p * r / (p + r)
# (the original bare line `f1 = 2 p r / (p + r)` was a SyntaxError and
# would have aborted this cell)
In [43]:
# Distinct predicted probabilities — a depth-limited tree outputs one score
# per leaf, so only a handful of unique values are possible.
np.unique(y_test_prob)
Out[43]:
In [44]:
# How many distinct scores there are (bounded by the number of leaves).
len(np.unique(y_test_prob))
Out[44]:
In [47]:
# Export the fitted tree to Graphviz .dot format and render it to PNG.
# NOTE(review): the `!dot` shell call requires the Graphviz binary to be
# installed on the host; it will fail silently in the notebook otherwise.
from sklearn.tree import export_graphviz
export_graphviz(cls, out_file = "tree.dot", feature_names = X.columns, filled=True)
!dot -Tpng tree.dot -o tree.png
In [52]:
# NOTE(review): manual spot-checks — presumably class fractions read off
# individual nodes of the rendered tree (positive count / node samples);
# verify the numerators/denominators against tree.png.
694/1908
Out[52]:
In [50]:
291/436
Out[50]:
In [51]:
332/578
Out[51]:
In [54]:
# Grid search over tree depth, split criterion, and minimum leaf size,
# scored by accuracy with 5-fold cross-validation.
# NOTE(review): accuracy on this imbalanced target (~88% "no") can be
# misleading — consider scoring="f1" or "roc_auc" instead.
param_grid = {
    "max_depth": np.arange(2, 6),
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": np.arange(5, 20),
}
gsearch = model_selection.GridSearchCV(
    cls,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    verbose=True,
    n_jobs=8,
)
gsearch.fit(X_train, y_train)
Out[54]:
In [55]:
# Accuracy of the refit best estimator on the full training set, next to the
# best mean cross-validation score found during the search.
gsearch.score(X_train, y_train), gsearch.best_score_
Out[55]:
In [56]:
# Hyperparameters of the winning CV candidate.
gsearch.best_params_
Out[56]:
In [ ]: