In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline'fivethirtyeight')
In [7]:
# Load the historical loan data; the path is relative to the notebook's working directory.
df = pd.read_csv('data/historical_loan.csv')
In [8]:
In [9]:
# Fill the gaps in the `years` column with that column's mean.
df['years'] = df['years'].fillna(df['years'].mean())
In [10]:
#Load the preprocessing module
from sklearn import preprocessing
In [11]:
# Names of every column stored with the `object` dtype; these are the columns
# that get label-encoded in the loop below.
categorical_variables = [name for name, dtype in df.dtypes.items() if dtype == "object"]
In [12]:
In [13]:
# Replace each categorical column by integer codes. A separate encoder is
# fitted per column, so the codes are only meaningful within that column.
for col in categorical_variables:
    lbl = preprocessing.LabelEncoder()
    # The encoder has to learn the column's distinct values before it can
    # transform them.
    lbl.fit(list(df[col]))
    df[col] = lbl.transform(df[col])
In [14]:
In [15]:
# Feature matrix: the columns at positions 1 through 7 (the slice end, 8, is exclusive).
X = df.iloc[:,1:8]
In [16]:
# Target vector: the column at position 0.
# NOTE(review): assumes the first CSV column holds the class label - confirm against the data.
y = df.iloc[:,0]
The most basic evaluation metric is the accuracy score. If $\hat{y}_i$ is the predicted value of the $i$-th sample and $y_i$ is the corresponding true value, then the fraction of correct predictions over $n_\text{samples}$ is defined as
$$ \text{accuracy}(y, \hat{y}) = \frac{1}{n_\text{samples}} \sum_{i=0}^{n_\text{samples}-1} 1(\hat{y}_i = y_i) $$
where $1(\cdot)$ is the indicator function.

A confusion matrix evaluates the quality of the output of a classifier.
| | Predicted - Yes | Predicted - No |
|---|---|---|
| Actual - Yes | True Positive | False Negative |
| Actual - No | False Positive | True Negative |
The diagonal elements represent the number of points for which the predicted label is equal to the true label, while off-diagonal elements are those that are mislabeled by the classifier. The higher the diagonal values of the confusion matrix the better, indicating many correct predictions.
The precision is the ratio TP / (TP + FP) where TP is the number of true positives and FP the number of false positives. The precision is intuitively the ability of the classifier not to label as positive a sample that is negative. The best value is 1 and the worst value is 0.
The recall is the ratio TP / (TP + FN) where TP is the number of true positives and FN the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples. The best value is 1 and the worst value is 0.
“A receiver operating characteristic (ROC), or simply ROC curve, is a graphical plot which illustrates the performance of a binary classifier system as its discrimination threshold is varied. It is created by plotting the fraction of true positives out of the positives (TPR = true positive rate) vs. the fraction of false positives out of the negatives (FPR = false positive rate), at various threshold settings.”
The AUC (also written AUROC) is the area under the receiver operating characteristic (ROC) curve. Computing the area under the ROC curve summarizes the curve information in a single number.
In [17]:
from sklearn import tree
from sklearn import metrics
In [18]:
def model_evaluation(data, target, model, model_name):
    """Fit ``model`` on ``data``/``target`` and report its scores on that same data.

    Because the model is scored on the data it was fitted to, the printed
    numbers are training-set scores.

    Parameters
    ----------
    data
        Feature matrix passed to ``model.fit``, ``model.predict`` and
        ``model.predict_proba``.
    target
        True labels. Column 1 of ``predict_proba`` is used as the score for
        the ROC curve, so a two-class target is expected.
    model
        Estimator exposing ``fit`` (returning the fitted estimator),
        ``predict`` and ``predict_proba``.
    model_name : str
        Label printed at the top of the report.

    Returns
    -------
    float
        Area under the ROC curve.
    """
    model_fit = model.fit(data, target)
    pred = model_fit.predict(data)
    proba = model_fit.predict_proba(data)

    # The second probability column is the score of the positive class.
    fpr, tpr, _ = metrics.roc_curve(target, proba[:, 1])
    roc_auc = metrics.auc(fpr, tpr)

    print("Model: %s" % model_name)

    # Scores for the model. Each label is paired with the metric of the same
    # name (recall_score for "recall", precision_score for "precision").
    print("accuracy: %.3f" % metrics.accuracy_score(target, pred))
    print("recall: %.3f" % metrics.recall_score(target, pred))
    print("precision: %.3f" % metrics.precision_score(target, pred))
    print(metrics.confusion_matrix(target, pred))
    print("auc: %.3f" % roc_auc)

    # ROC curve
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')

    return roc_auc
In [19]:
# Baseline model: a decision tree restricted to depth 1.
benchmark = tree.DecisionTreeClassifier(max_depth = 1)
In [20]:
In [21]:
# Report the benchmark's scores on the full data set (X, y).
model_evaluation(X, y, benchmark, "benchmark")
In [22]:
# Decision tree limited to depth 10.
Shallow = tree.DecisionTreeClassifier(max_depth=10)
In [23]:
In [24]:
# Report the depth-10 tree's scores on the full data set (X, y).
model_evaluation(X, y, Shallow, "Shallow")
In [25]:
# Decision tree built with the library defaults; no max_depth is passed.
Full = tree.DecisionTreeClassifier()
In [26]:
In [27]:
# Report the default-settings tree's scores on the full data set (X, y).
model_evaluation(X, y, Full, "Full")
So far we have been evaluating our metrics on the training data. However, there is an important modelling lesson: you should never evaluate a model on the same data it was fit to, because the model will look better than it really is — it has already seen the answers. Instead, it's better to divide the data up and use one piece to fit the model and the other piece to evaluate it. A popular technique for this is called k-fold cross validation: the data is split into k parts (folds), and each fold in turn is held out for evaluation while the model is fit to the rest. Repeating the fit over all folds matters because a single split is subject to random variation.
In [28]:
from sklearn.model_selection import StratifiedKFold

# `scipy.interp` was only an alias of `numpy.interp` and is no longer shipped
# by recent SciPy releases; fall back to NumPy so this cell runs either way.
try:
    from scipy import interp
except ImportError:
    from numpy import interp
In [29]:
def model_evaluation_crossval(data, target, model, model_name, n_splits=5):
    """Plot per-fold and mean ROC curves from stratified k-fold cross-validation.

    For every fold the model is re-fitted on the training part and its
    positive-class probabilities on the held-out part are turned into a ROC
    curve. The fold curves are interpolated onto a common grid of false
    positive rates and averaged to obtain the mean curve.

    Parameters
    ----------
    data
        Feature matrix; converted to a NumPy array so it can be indexed with
        the integer fold indices.
    target
        True labels; converted to a NumPy array. Column 1 of
        ``predict_proba`` is used as the score, so a two-class target is
        expected.
    model
        Estimator exposing ``fit`` (returning the fitted estimator) and
        ``predict_proba``. It is re-fitted once per fold.
    model_name : str
        Accepted for signature parity with ``model_evaluation``; it is not
        used in the output.
    n_splits : int, optional
        Number of folds, and therefore of per-fold curves and colours.
        Defaults to 5.

    Returns
    -------
    float
        Area under the mean ROC curve.
    """
    data = np.asarray(data)
    target = np.asarray(target)
    cv = StratifiedKFold(n_splits=n_splits)

    # One colour per fold, evenly spaced across the colormap.
    cmap = plt.get_cmap('viridis')
    colors = [cmap(int(i)) for i in np.linspace(0, cmap.N - 1, n_splits)]

    # Common false-positive-rate grid on which the fold curves are averaged.
    mean_fpr = np.linspace(0, 1, 100)
    mean_tpr = np.zeros_like(mean_fpr)

    # Initiate plot
    plt.figure(figsize=(8, 8))

    for fold, (train, test) in enumerate(cv.split(data, target)):
        probas_ = model.fit(data[train], target[train]).predict_proba(data[test])

        # ROC curve and its area for this fold
        fpr, tpr, _ = metrics.roc_curve(target[test], probas_[:, 1])
        mean_tpr += np.interp(mean_fpr, fpr, tpr)
        roc_auc = metrics.auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=2, color=colors[fold],
                 label='ROC fold %d (area = %0.2f)' % (fold, roc_auc))

    # Mean ROC curve, with its end points pinned to (0, 0) and (1, 1).
    mean_tpr[0] = 0.0
    mean_tpr /= n_splits
    mean_tpr[-1] = 1.0
    mean_auc = metrics.auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
             label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

    # Diagonal reference line for a random classifier
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k', label='random')

    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')

    return mean_auc
In [30]:
# Cross-validated ROC curves for the depth-10 tree.
model_evaluation_crossval(X, y, Shallow, "Shallow")
