Let us build some intuition around the Loan Data
In [1]:
#Load the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
#Default Variables
%matplotlib inline
plt.rcParams['figure.figsize'] = (16,9)
plt.rcParams['font.size'] = 18
plt.style.use('fivethirtyeight')
pd.set_option('display.float_format', lambda x: '%.2f' % x)
In [24]:
#Load the dataset
df = pd.read_csv("../data/loan_data_clean.csv")
In [25]:
df.head()
Out[25]:
In [6]:
from sklearn.linear_model import LogisticRegression
In [7]:
# Define the features
X = df.loc[:, ['age', 'years']]
In [8]:
# Define the target
y = df['default']
In [9]:
# Instantiate the model
clf_logistic_2 = LogisticRegression()
In [10]:
#Fit the model
clf_logistic_2.fit(X,y)
Out[10]:
In [11]:
# Calculate the Accuracy Score
clf_logistic_2.score(X,y)
Out[11]:
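Note that scoring on the training data itself gives an optimistic estimate. A minimal sketch of a held-out evaluation (the 30% split and the random_state are arbitrary choices, not part of the original workflow):
In [ ]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

clf_holdout = LogisticRegression()
clf_holdout.fit(X_train, y_train)

# Accuracy on rows the model has never seen
clf_holdout.score(X_test, y_test)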
In [12]:
# Calculate the predictions
y_pred = clf_logistic_2.predict(X)
In [13]:
# Predicted probabilities for class 0 (no default);
# column 1 of predict_proba gives the probability of default
y_proba = clf_logistic_2.predict_proba(X)[:,0]
In [14]:
# Build a 100x100 grid spanning the observed range of both features
x1_min, x1_max = X.iloc[:, 0].min(), X.iloc[:, 0].max()
x2_min, x2_max = X.iloc[:, 1].min(), X.iloc[:, 1].max()
xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, (x1_max - x1_min) / 100),
                       np.arange(x2_min, x2_max, (x2_max - x2_min) / 100))
In [15]:
# Predicted probability of class 0 over the grid, drawn as a filled contour,
# with the actual observations scattered on top
Z = clf_logistic_2.predict_proba(np.c_[xx1.ravel(), xx2.ravel()])[:, 0]
Z = Z.reshape(xx1.shape)
cs = plt.contourf(xx1, xx2, Z, cmap=plt.cm.viridis, alpha=0.3)
plt.scatter(x=X.iloc[:, 0], y=X.iloc[:, 1], c=y, s=50, cmap=plt.cm.magma)
plt.colorbar(cs)
plt.xlabel('age')
plt.ylabel('years')
Out[15]:
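The filled contours show the model's predicted probability of class 0 (no default) across the age/years plane; the scatter points are the observed loans, colored by their actual default label.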
Exercise: What is the range of the predicted probabilities?
In [ ]:
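One possible check, as a minimal sketch using the y_proba array computed above:
In [ ]:
# Smallest and largest predicted probabilities for class 0
y_proba.min(), y_proba.max()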
Exercise: How does the accuracy change if you move the cut-off threshold?
In [ ]:
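A minimal sketch of one approach, thresholding the probability of default (column 1 of predict_proba) at an arbitrary cut-off of 0.3 instead of the default 0.5:
In [ ]:
from sklearn.metrics import accuracy_score

# Probability of the positive class (default = 1)
p_default = clf_logistic_2.predict_proba(X)[:, 1]

# Classify with a custom cut-off instead of the default 0.5
threshold = 0.3
y_pred_custom = (p_default >= threshold).astype(int)

accuracy_score(y, y_pred_custom)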
In [64]:
# Preprocess the data
In [41]:
df = pd.read_csv("../data/loan_data_clean.csv")
In [42]:
df.head()
Out[42]:
In [43]:
from sklearn.preprocessing import LabelEncoder
In [44]:
le = LabelEncoder()
In [45]:
# Encode the letter grades as integers
df.grade = le.fit_transform(df.grade)
In [46]:
le.classes_
Out[46]:
In [47]:
df.head()
Out[47]:
In [48]:
# Encode the ownership categories; note this refits le, overwriting the grade classes
df.ownership = le.fit_transform(df.ownership)
In [49]:
le.classes_
Out[49]:
In [51]:
df.head()
Out[51]:
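Because the single le object was refit, le.classes_ now only remembers the ownership categories; the grade mapping has been lost. A minimal sketch that keeps one fitted encoder per column (the encoders dict and the fresh raw frame are illustrative, not part of the original workflow):
In [ ]:
# Re-read the raw data so the categorical columns are still strings
raw = pd.read_csv("../data/loan_data_clean.csv")

# One LabelEncoder per column, so each mapping can be inverted later
encoders = {}
for col in ['grade', 'ownership']:
    encoders[col] = LabelEncoder()
    raw[col] = encoders[col].fit_transform(raw[col])

encoders['grade'].classes_, encoders['ownership'].classes_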
In [65]:
# Build the Model
In [65]:
df['amount_log'] = np.log10(df.amount)
In [66]:
df['income_log'] = np.log10(df.income)
In [78]:
df['age_log'] = np.log10(df.age)
In [83]:
# Add 1 so that years == 0 maps cleanly to log10(1) = 0
df['years_log'] = np.log10(df.years + 1)
In [84]:
df.years_log.hist()
Out[84]:
In [85]:
df.head()
Out[85]:
In [87]:
X = df[['amount_log', 'interest', 'grade', 'years_log', 'ownership', 'income_log', 'age_log']]
In [88]:
y = df.default
In [95]:
from sklearn.linear_model import LogisticRegressionCV
In [138]:
clf = LogisticRegressionCV(Cs=[0.001, 0.01, 0.1, 1, 10], class_weight="balanced",
                           penalty='l1', cv=5, solver="liblinear", verbose=1)
In [139]:
#from sklearn.feature_selection import SelectFromModel
In [140]:
#model = SelectFromModel(clf)
In [141]:
clf.fit(X,y)
Out[141]:
In [142]:
clf.C_
Out[142]:
In [143]:
clf.coef_
Out[143]:
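To read the fitted coefficients against their feature names, a small sketch:
In [ ]:
# Pair each coefficient with its feature name; the zeros were driven out by the l1 penalty
pd.Series(clf.coef_[0], index=X.columns)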
In [186]:
y_pred = clf.predict(X)
In [187]:
y_pred
Out[187]:
In [145]:
X.head()
Out[145]:
In [147]:
clf.predict_proba(X)
Out[147]:
In [165]:
# Logistic (sigmoid) function: maps a linear score z to a probability in (0, 1)
def P(z):
    return 1 / (1 + np.exp(-z))
In [175]:
# Hand-rolled linear score using approximate coefficients read off clf.coef_ above;
# the omitted features were shrunk to zero by the l1 penalty
z = 0.12 * df.interest + 0.12 * df.grade - 0.33 * df.income_log
#z = 0.12 * df.interest + 0.12 * df.grade + 0.02 * df.ownership - 0.33 * df.income_log
In [188]:
y_pred_easy = P(z)
In [190]:
y_pred_easy.tail()
Out[190]:
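As a sanity check, the hand-rolled scores can be compared with the model's own probabilities for the positive class; exact agreement is not expected, since z uses rounded coefficients and omits the intercept:
In [ ]:
# Model's own probability of default for each row
p_model = clf.predict_proba(X)[:, 1]

# Correlation between the manual approximation and sklearn's probabilities
np.corrcoef(y_pred_easy, p_model)[0, 1]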
In [2]:
# Calculate the accuracy
In [191]:
from sklearn import metrics
In [192]:
# roc_auc_score expects (y_true, y_score); AUC is more informative when
# computed on predicted probabilities rather than hard 0/1 labels
metrics.roc_auc_score(y, y_pred)
Out[192]:
In [193]:
metrics.accuracy_score(y, y_pred)
Out[193]:
In [194]:
metrics.confusion_matrix(y_pred=y_pred, y_true=y)
Out[194]:
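The four cells of the confusion matrix convert directly into precision and recall; a minimal sketch:
In [ ]:
# For a binary problem, ravel() returns the cells in the order tn, fp, fn, tp
tn, fp, fn, tp = metrics.confusion_matrix(y_true=y, y_pred=y_pred).ravel()

precision = tp / (tp + fp)   # of the predicted defaults, how many were real
recall = tp / (tp + fn)      # of the actual defaults, how many were caught
precision, recall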
In [195]:
y.value_counts()
Out[195]: