In [13]:
import pandas as pd
import numpy as np

In [14]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [15]:
loans = pd.read_csv("loan_data.csv")

In [16]:
loans.head()


Out[16]:
credit.policy purpose int.rate installment log.annual.inc dti fico days.with.cr.line revol.bal revol.util inq.last.6mths delinq.2yrs pub.rec not.fully.paid
0 1 debt_consolidation 0.1189 829.10 11.350407 19.48 737 5639.958333 28854 52.1 0 0 0 0
1 1 credit_card 0.1071 228.22 11.082143 14.29 707 2760.000000 33623 76.7 0 0 0 0
2 1 debt_consolidation 0.1357 366.86 10.373491 11.63 682 4710.000000 3511 25.6 1 0 0 0
3 1 debt_consolidation 0.1008 162.34 11.350407 8.10 712 2699.958333 33667 73.2 1 0 0 0
4 1 credit_card 0.1426 102.92 11.299732 14.97 667 4066.000000 4740 39.5 0 1 0 0

In [17]:
loans.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
credit.policy        9578 non-null int64
purpose              9578 non-null object
int.rate             9578 non-null float64
installment          9578 non-null float64
log.annual.inc       9578 non-null float64
dti                  9578 non-null float64
fico                 9578 non-null int64
days.with.cr.line    9578 non-null float64
revol.bal            9578 non-null int64
revol.util           9578 non-null float64
inq.last.6mths       9578 non-null int64
delinq.2yrs          9578 non-null int64
pub.rec              9578 non-null int64
not.fully.paid       9578 non-null int64
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB

In [18]:
loans.describe()


Out[18]:
credit.policy int.rate installment log.annual.inc dti fico days.with.cr.line revol.bal revol.util inq.last.6mths delinq.2yrs pub.rec not.fully.paid
count 9578.000000 9578.000000 9578.000000 9578.000000 9578.000000 9578.000000 9578.000000 9.578000e+03 9578.000000 9578.000000 9578.000000 9578.000000 9578.000000
mean 0.804970 0.122640 319.089413 10.932117 12.606679 710.846314 4560.767197 1.691396e+04 46.799236 1.577469 0.163708 0.062122 0.160054
std 0.396245 0.026847 207.071301 0.614813 6.883970 37.970537 2496.930377 3.375619e+04 29.014417 2.200245 0.546215 0.262126 0.366676
min 0.000000 0.060000 15.670000 7.547502 0.000000 612.000000 178.958333 0.000000e+00 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1.000000 0.103900 163.770000 10.558414 7.212500 682.000000 2820.000000 3.187000e+03 22.600000 0.000000 0.000000 0.000000 0.000000
50% 1.000000 0.122100 268.950000 10.928884 12.665000 707.000000 4139.958333 8.596000e+03 46.300000 1.000000 0.000000 0.000000 0.000000
75% 1.000000 0.140700 432.762500 11.291293 17.950000 737.000000 5730.000000 1.824950e+04 70.900000 2.000000 0.000000 0.000000 0.000000
max 1.000000 0.216400 940.140000 14.528354 29.960000 827.000000 17639.958330 1.207359e+06 119.000000 33.000000 13.000000 5.000000 1.000000

In [19]:
plt.figure(figsize=(10,6))
loans[loans['credit.policy']==1]['fico'].hist(alpha=0.5,color='blue',
                                              bins=30,label='Credit.Policy=1')
loans[loans['credit.policy']==0]['fico'].hist(alpha=0.5,color='red',
                                              bins=30,label='Credit.Policy=0')
plt.legend()
plt.xlabel('FICO')


Out[19]:
<matplotlib.text.Text at 0x7f3f529478d0>

In [20]:
plt.figure(figsize=(10,6))
loans[loans['not.fully.paid']==1]['fico'].hist(alpha=0.5,color='blue',
                                              bins=30,label='not.fully.paid=1')
loans[loans['not.fully.paid']==0]['fico'].hist(alpha=0.5,color='red',
                                              bins=30,label='not.fully.paid=0')
plt.legend()
plt.xlabel('FICO')


Out[20]:
<matplotlib.text.Text at 0x7f3f5269cfd0>

In [22]:
sns.countplot(x="purpose", hue="not.fully.paid", data=loans)


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3f5292dda0>

In [23]:
sns.jointplot(x='fico', y='int.rate', data=loans)


Out[23]:
<seaborn.axisgrid.JointGrid at 0x7f3f52868588>

In [24]:
# lmplot is figure-level, so set the size via height/aspect instead of plt.figure
sns.lmplot(y='int.rate',x='fico',data=loans,hue='credit.policy',
           col='not.fully.paid',palette='Set1',height=7)


Out[24]:
<seaborn.axisgrid.FacetGrid at 0x7f3f5209d550>

In [25]:
loans.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
credit.policy        9578 non-null int64
purpose              9578 non-null object
int.rate             9578 non-null float64
installment          9578 non-null float64
log.annual.inc       9578 non-null float64
dti                  9578 non-null float64
fico                 9578 non-null int64
days.with.cr.line    9578 non-null float64
revol.bal            9578 non-null int64
revol.util           9578 non-null float64
inq.last.6mths       9578 non-null int64
delinq.2yrs          9578 non-null int64
pub.rec              9578 non-null int64
not.fully.paid       9578 non-null int64
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB

In [28]:
cat_feats = ["purpose"]

In [30]:
final_data = pd.get_dummies(loans, columns = cat_feats, drop_first = True)
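
A quick sanity check that the encoding behaved as expected; a minimal sketch (the exact purpose_* column names depend on the categories present in the data):

In [ ]:
# After get_dummies there should be no object columns left, only the
# original numeric features plus the purpose_* dummy columns
final_data.info()
final_data.columns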

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
# Separate the features from the target and hold out 30% of the rows for testing
X = final_data.drop("not.fully.paid", axis=1)
y = final_data["not.fully.paid"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [33]:
from sklearn.tree import DecisionTreeClassifier

In [34]:
tree = DecisionTreeClassifier()

In [35]:
tree.fit(X_train, y_train)


Out[35]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [36]:
tree_pred = tree.predict(X_test)

In [37]:
from sklearn.metrics import confusion_matrix, classification_report

In [38]:
print(confusion_matrix(y_test, tree_pred))
print("\n")
print(classification_report(y_test, tree_pred))


[[2018  413]
 [ 355   88]]


             precision    recall  f1-score   support

          0       0.85      0.83      0.84      2431
          1       0.18      0.20      0.19       443

avg / total       0.75      0.73      0.74      2874


In [39]:
from sklearn.ensemble import RandomForestClassifier

In [40]:
rfc = RandomForestClassifier(n_estimators=600)

In [41]:
rfc.fit(X_train, y_train)


Out[41]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=600, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [42]:
rfc_pred = rfc.predict(X_test)

In [43]:
print(confusion_matrix(y_test, rfc_pred))
print("\n")
print(classification_report(y_test, rfc_pred))


[[2417   14]
 [ 438    5]]


             precision    recall  f1-score   support

          0       0.85      0.99      0.91      2431
          1       0.26      0.01      0.02       443

avg / total       0.76      0.84      0.78      2874


Finding the optimal number of trees

In [45]:
# Measure the test-set error rate for forests of increasing size
error_rates = []
for i in range(50, 1000, 50):
    c = RandomForestClassifier(n_estimators=i)
    c.fit(X_train, y_train)
    i_pred = c.predict(X_test)
    error = np.mean(y_test != i_pred)
    error_rates.append(error)
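
To read off where the error bottoms out, the smallest entry in error_rates can be looked up directly; a sketch reusing the same 50-step grid as the loop above:

In [ ]:
# n_estimators value with the lowest test-set error on this particular split
n_grid = list(range(50, 1000, 50))
best_n = n_grid[int(np.argmin(error_rates))]
best_n, min(error_rates)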

In [48]:
plt.figure(figsize=(10, 6))
plt.plot(range(50, 1000, 50), error_rates, color='blue', linestyle='dashed',
         marker='o', markerfacecolor='red', markersize=10)
plt.title("Error Rate vs Number of Trees")
plt.xlabel("Number of trees (n_estimators)")
plt.ylabel("Error rate")


Out[48]:
<matplotlib.text.Text at 0x7f3f3cc639b0>

In [49]:
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
print(confusion_matrix(y_test, rfc_pred))
print("\n")
print(classification_report(y_test, rfc_pred))


[[2411   20]
 [ 432   11]]


             precision    recall  f1-score   support

          0       0.85      0.99      0.91      2431
          1       0.35      0.02      0.05       443

avg / total       0.77      0.84      0.78      2874

Error rates barely change as n_estimators ranges from 50 to 1000, so a forest of around 100 trees performs about as well as the 600-tree model above.
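
The spread of the measured errors backs this up; a quick check (a sketch, reusing error_rates from the loop above):

In [ ]:
# Smallest and largest test-set error across all forest sizes tried, and their gap
print(min(error_rates), max(error_rates), max(error_rates) - min(error_rates))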
