This is the historical data that the bank has provided. It has the following columns:

Application Attributes:
- years: Number of years the applicant has been employed
- ownership: Whether the applicant owns a house or not
- income: Annual income of the applicant
- age: Age of the applicant

Behavioural Attributes:
- grade: Credit grade of the applicant

Outcome Variables:
- amount: Amount of loan provided to the applicant
- default: Whether the applicant has defaulted or not
- interest: Interest rate charged to the applicant

You are provided with the following data: loan_data.csv
There is also a cleaned-up dataset, loan_data_clean.csv, in which outliers have been removed and missing values have been treated.
Let us build some intuition around the Loan Data
In [1]:
#Load the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
#Default Variables
%matplotlib inline
plt.rcParams['figure.figsize'] = (16,9)
plt.rcParams['font.size'] = 18
plt.style.use('fivethirtyeight')
pd.set_option('display.float_format', lambda x: '%.2f' % x)
In [3]:
#Load the dataset
df = pd.read_csv("data/loan_data_clean.csv")
In [4]:
df.head()
Out[4]:
In [5]:
# Distribution of default
df.default.hist()
Out[5]:
In [6]:
df.default.sum()/df.default.count()
Out[6]:
In [7]:
# Plot three variables - income vs interest, coloured by default
plt.scatter(df.income, df.interest, c=df.default, alpha=0.4)
plt.xlabel("income")
plt.ylabel("interest")
Out[7]:
In [8]:
plt.scatter(np.log(df.income), df.interest, c=df.default, alpha=0.4)
Out[8]:
In [9]:
# Get the module
from sklearn.linear_model import LogisticRegression
In [10]:
# Define the features
df['incomeLog'] = np.log(df.income)
X2 = df[['incomeLog', 'interest']]
In [11]:
# Define the target
y = df.default
In [12]:
# Instantiate the model
clf = LogisticRegression()
In [13]:
# Fit the model
clf.fit(X2,y)
Out[13]:
In [14]:
# Calculate the Accuracy Score
clf.score(X2,y)
Out[14]:
In [15]:
# Calculate the predictions
clf.predict(X2)
Out[15]:
In [16]:
clf.predict(X2).sum()
Out[16]:
In [17]:
# Calculate the probabilities
clf.predict_proba(X2)
Out[17]:
In [18]:
# Histogram of the predicted probability of class 0 (no default)
plt.hist(clf.predict_proba(X2)[:,0], bins=100)
#plt.hist(clf.predict_proba(X2)[:,1], bins=100)
plt.show()
In [19]:
def plot_classifier(X, y, clf):
    # Build a 100x100 grid spanning the range of the two features
    x1_min, x1_max = X.iloc[:,0].min(), X.iloc[:,0].max()
    x2_min, x2_max = X.iloc[:,1].min(), X.iloc[:,1].max()
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, (x1_max - x1_min)/100),
                           np.arange(x2_min, x2_max, (x2_max - x2_min)/100))
    # Predicted probability of class 0 (no default) at every grid point
    Z = clf.predict_proba(np.c_[xx1.ravel(), xx2.ravel()])[:,0]
    Z = Z.reshape(xx1.shape)
    # Shade the probability surface and overlay the actual observations
    cs = plt.contourf(xx1, xx2, Z, cmap="magma", alpha=0.3)
    plt.scatter(x=X.iloc[:,0], y=X.iloc[:,1], c=y, s=50, cmap="viridis", alpha=0.3)
    plt.colorbar(cs)
In [20]:
plot_classifier(X2,y,clf)
Exercise: What is the range of the predicted probabilities?
In [ ]:
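One way to check this, sketched here against the clf and X2 objects fitted above (this cell is not part of the original notebook):
In [ ]:
# Sketch: minimum and maximum predicted probability for each class
proba = clf.predict_proba(X2)
proba.min(axis=0), proba.max(axis=0)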
Exercise: What is the accuracy measure if you change the cut-off threshold?
In [ ]:
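A possible sketch, using an assumed cut-off of 0.3 on the predicted default probability (any other threshold can be substituted):
In [ ]:
# Sketch: accuracy at a custom cut-off instead of the default 0.5
threshold = 0.3
pred_custom = (clf.predict_proba(X2)[:, 1] >= threshold).astype(int)
(pred_custom == y).mean()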
In [21]:
from sklearn.tree import DecisionTreeClassifier
In [22]:
clf_dt = DecisionTreeClassifier()
In [23]:
clf_dt.fit(X2,y)
Out[23]:
In [24]:
plot_classifier(X2,y,clf_dt)
In [25]:
import pydotplus
from IPython.display import Image
In [26]:
from sklearn import tree
In [27]:
# dot_data = tree.export_graphviz(clf_dt, out_file='tree.dot', feature_names=X2.columns,
# class_names=['0', '1'], filled=True,
# rounded=True, special_characters=True)
In [28]:
# graph = pydotplus.graph_from_dot_file('tree.dot')
In [29]:
# Image(graph.create_png())
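If Graphviz and pydotplus are not installed, here is a sketch of an alternative using scikit-learn's built-in tree plotting (available from scikit-learn 0.21 onwards; the figure size and max_depth are only assumptions to keep the plot readable):
In [ ]:
# Sketch: visualise the fitted decision tree without Graphviz
plt.figure(figsize=(20, 10))
tree.plot_tree(clf_dt, feature_names=list(X2.columns), class_names=['0', '1'],
               filled=True, rounded=True, max_depth=3, fontsize=10)
plt.show()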
In [30]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold
In [31]:
clf_LogCV = LogisticRegressionCV(Cs=10, cv=StratifiedKFold(5),scoring="accuracy")
In [32]:
clf_LogCV.fit(X2,y)
Out[32]:
In [33]:
clf_LogCV.Cs_
Out[33]:
In [34]:
clf_LogCV.C_
Out[34]:
In [35]:
clf_LogCV.predict(X2)
Out[35]:
In [36]:
clf_LogCV.score(X2,y)
Out[36]:
In [11]:
# Preprocess the data
In [ ]:
In [ ]:
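A minimal sketch of one possible preprocessing step, assuming the categorical columns ownership and grade are present in loan_data_clean.csv as described above (the column choices here are only an example):
In [ ]:
# Sketch: log-transform income and one-hot encode the categorical columns
df['incomeLog'] = np.log(df.income)
X_full = pd.get_dummies(df[['incomeLog', 'age', 'years', 'amount', 'interest', 'ownership', 'grade']],
                        columns=['ownership', 'grade'])
y = df.default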
In [65]:
# Build the Model
In [ ]:
In [ ]:
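A sketch of one way to build the model on the preprocessed features from the previous cell:
In [ ]:
# Sketch: fit a logistic regression on the full feature set
clf_full = LogisticRegression(max_iter=1000)  # higher max_iter to help convergence
clf_full.fit(X_full, y)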
In [1]:
# Choose a Threshold
In [ ]:
In [ ]:
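A sketch of choosing a cut-off on the predicted default probability (0.25 is only an illustrative value):
In [ ]:
# Sketch: classify as default when the predicted default probability exceeds the threshold
threshold = 0.25
pred_threshold = (clf_full.predict_proba(X_full)[:, 1] >= threshold).astype(int)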
In [2]:
# Calculate the accuracy
In [ ]:
In [ ]:
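A sketch of the corresponding accuracy at the chosen threshold:
In [ ]:
# Sketch: share of applicants classified correctly at the chosen cut-off
(pred_threshold == y).mean()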