In [1]:
#importing the analysis and plotting libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
#reading data
df = pd.read_csv('loan_data.csv')

In [4]:
#previewing the first few rows
df.head()


Out[4]:
credit.policy purpose int.rate installment log.annual.inc dti fico days.with.cr.line revol.bal revol.util inq.last.6mths delinq.2yrs pub.rec not.fully.paid
0 1 debt_consolidation 0.1189 829.10 11.350407 19.48 737 5639.958333 28854 52.1 0 0 0 0
1 1 credit_card 0.1071 228.22 11.082143 14.29 707 2760.000000 33623 76.7 0 0 0 0
2 1 debt_consolidation 0.1357 366.86 10.373491 11.63 682 4710.000000 3511 25.6 1 0 0 0
3 1 debt_consolidation 0.1008 162.34 11.350407 8.10 712 2699.958333 33667 73.2 1 0 0 0
4 1 credit_card 0.1426 102.92 11.299732 14.97 667 4066.000000 4740 39.5 0 1 0 0

In [5]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
credit.policy        9578 non-null int64
purpose              9578 non-null object
int.rate             9578 non-null float64
installment          9578 non-null float64
log.annual.inc       9578 non-null float64
dti                  9578 non-null float64
fico                 9578 non-null int64
days.with.cr.line    9578 non-null float64
revol.bal            9578 non-null int64
revol.util           9578 non-null float64
inq.last.6mths       9578 non-null int64
delinq.2yrs          9578 non-null int64
pub.rec              9578 non-null int64
not.fully.paid       9578 non-null int64
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB

In [6]:
df.describe()


Out[6]:
credit.policy int.rate installment log.annual.inc dti fico days.with.cr.line revol.bal revol.util inq.last.6mths delinq.2yrs pub.rec not.fully.paid
count 9578.000000 9578.000000 9578.000000 9578.000000 9578.000000 9578.000000 9578.000000 9.578000e+03 9578.000000 9578.000000 9578.000000 9578.000000 9578.000000
mean 0.804970 0.122640 319.089413 10.932117 12.606679 710.846314 4560.767197 1.691396e+04 46.799236 1.577469 0.163708 0.062122 0.160054
std 0.396245 0.026847 207.071301 0.614813 6.883970 37.970537 2496.930377 3.375619e+04 29.014417 2.200245 0.546215 0.262126 0.366676
min 0.000000 0.060000 15.670000 7.547502 0.000000 612.000000 178.958333 0.000000e+00 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1.000000 0.103900 163.770000 10.558414 7.212500 682.000000 2820.000000 3.187000e+03 22.600000 0.000000 0.000000 0.000000 0.000000
50% 1.000000 0.122100 268.950000 10.928884 12.665000 707.000000 4139.958333 8.596000e+03 46.300000 1.000000 0.000000 0.000000 0.000000
75% 1.000000 0.140700 432.762500 11.291293 17.950000 737.000000 5730.000000 1.824950e+04 70.900000 2.000000 0.000000 0.000000 0.000000
max 1.000000 0.216400 940.140000 14.528354 29.960000 827.000000 17639.958330 1.207359e+06 119.000000 33.000000 13.000000 5.000000 1.000000
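
One thing worth noting in the summary: the mean of not.fully.paid is about 0.16, so only ~16% of loans fall in the positive class. A quick explicit check of that balance:

In [ ]:
#check the class balance of the target: ~84% fully paid vs ~16% not
df['not.fully.paid'].value_counts(normalize=True)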

In [7]:
#initial exploration: FICO distributions split by credit.policy (1 = meets underwriting criteria)
plt.figure(figsize=(10,6))
plt.hist(df[df['credit.policy']==1]['fico'],bins=30,alpha=0.5,edgecolor='black')
plt.hist(df[df['credit.policy']==0]['fico'],color='red',bins=30,alpha=0.5,edgecolor='black')
plt.legend(['credit.policy=1','credit.policy=0'])
plt.xlabel("FICO")


Out[7]:
[Figure: overlaid FICO histograms for credit.policy=1 vs credit.policy=0]

In [8]:
#visualization of loans not paid back
plt.figure(figsize=(10,6))
plt.hist(df[df['not.fully.paid']==1]['fico'],bins=30,alpha=0.5,edgecolor='black')
plt.hist(df[df['not.fully.paid']==0]['fico'],color='red',bins=30,alpha=0.5,edgecolor='black')
plt.legend(['not.fully.paid=1','not.fully.paid=0'])
plt.xlabel("FICO")


Out[8]:
[Figure: overlaid FICO histograms for not.fully.paid=1 vs not.fully.paid=0]

In [10]:
#seaborn countplot of loan purpose, split by repayment status
plt.figure(figsize=(12,6))
sns.countplot(x='purpose',hue='not.fully.paid',data=df)


Out[10]:
[Figure: counts of loans by purpose, split by not.fully.paid]
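
Raw counts can hide rate differences between purposes, so a quick supplementary check (not part of the original flow) is the default rate within each category:

In [ ]:
#supplementary: fraction of loans not fully paid within each purpose
df.groupby('purpose')['not.fully.paid'].mean().sort_values(ascending=False)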

In [12]:
#plotting relationship between fico score and interest rate
sns.jointplot(x='fico',y='int.rate',data=df,color='purple',size=8,ratio=3,space=0.4)


Out[12]:
[Figure: joint distribution of FICO score vs interest rate]

In [13]:
#FICO vs interest rate, colored by credit.policy and faceted by not.fully.paid
sns.lmplot(x='fico',y='int.rate',data=df,hue='credit.policy',col='not.fully.paid',size=8)


Out[13]:
[Figure: FICO vs interest rate with regression fits, hue=credit.policy, faceted by not.fully.paid]
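
As an optional extra EDA step, a correlation heatmap of the numeric columns summarizes these pairwise relationships in one view (a sketch; 'purpose' is dropped because it is non-numeric):

In [ ]:
#optional sketch: correlation heatmap over the numeric columns only
plt.figure(figsize=(12,8))
sns.heatmap(df.drop('purpose',axis=1).corr(),cmap='viridis')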

In [14]:
#predictive model: decision trees
#let's set it up by re-checking the column types first
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
credit.policy        9578 non-null int64
purpose              9578 non-null object
int.rate             9578 non-null float64
installment          9578 non-null float64
log.annual.inc       9578 non-null float64
dti                  9578 non-null float64
fico                 9578 non-null int64
days.with.cr.line    9578 non-null float64
revol.bal            9578 non-null int64
revol.util           9578 non-null float64
inq.last.6mths       9578 non-null int64
delinq.2yrs          9578 non-null int64
pub.rec              9578 non-null int64
not.fully.paid       9578 non-null int64
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB

In [16]:
#let's look at the categorical features
#'purpose' is the only one; transforming it to dummy variables
cat_feats = ['purpose']
final_data = pd.get_dummies(df,columns=cat_feats,drop_first=True)

In [17]:
#let's look at the transformed data
final_data.head()


Out[17]:
credit.policy int.rate installment log.annual.inc dti fico days.with.cr.line revol.bal revol.util inq.last.6mths delinq.2yrs pub.rec not.fully.paid purpose_credit_card purpose_debt_consolidation purpose_educational purpose_home_improvement purpose_major_purchase purpose_small_business
0 1 0.1189 829.10 11.350407 19.48 737 5639.958333 28854 52.1 0 0 0 0 0 1 0 0 0 0
1 1 0.1071 228.22 11.082143 14.29 707 2760.000000 33623 76.7 0 0 0 0 1 0 0 0 0 0
2 1 0.1357 366.86 10.373491 11.63 682 4710.000000 3511 25.6 1 0 0 0 0 1 0 0 0 0
3 1 0.1008 162.34 11.350407 8.10 712 2699.958333 33667 73.2 1 0 0 0 0 1 0 0 0 0
4 1 0.1426 102.92 11.299732 14.97 667 4066.000000 4740 39.5 0 1 0 0 1 0 0 0 0 0

In [18]:
#train-test splitting
X = final_data.drop('not.fully.paid',axis=1)
y = final_data['not.fully.paid']

In [19]:
#forgot to import! oops
#note: sklearn.cross_validation is deprecated (removed in 0.20); use model_selection
from sklearn.model_selection import train_test_split

In [20]:
#70-30 split: you can experiment with any split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=101)
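
Since the positive class is rare (~16%), a stratified split keeps the class ratio identical in train and test. A sketch of that variant, using hypothetical Xs_*/ys_* names so the results below (which use the plain split) are unchanged:

In [ ]:
#sketch of a stratified variant; Xs_*/ys_* are hypothetical names, not used below
Xs_train, Xs_test, ys_train, ys_test = train_test_split(
    X, y, test_size=0.3, random_state=101, stratify=y)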

In [21]:
#training decision trees
from sklearn.tree import DecisionTreeClassifier
dTree = DecisionTreeClassifier()
dTree.fit(X_train,y_train)


Out[21]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
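
A fully grown tree like the one above tends to overfit. One common regularization is to cap the depth; a sketch with an arbitrary, untuned max_depth (shallow_tree is a hypothetical name):

In [ ]:
#sketch: depth-limited tree as a regularized alternative (max_depth=5 is arbitrary)
shallow_tree = DecisionTreeClassifier(max_depth=5,random_state=101)
shallow_tree.fit(X_train,y_train)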

In [22]:
#predictions and evaluation
predictions = dTree.predict(X_test)
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,predictions))


             precision    recall  f1-score   support

          0       0.86      0.82      0.84      2431
          1       0.20      0.24      0.21       443

avg / total       0.75      0.73      0.74      2874


In [23]:
#printing confusion matrix
print(confusion_matrix(y_test,predictions))


[[2002  429]
 [ 338  105]]
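
The bare matrix is easy to misread; wrapping it in a labeled DataFrame (purely cosmetic) makes the rows and columns explicit:

In [ ]:
#cosmetic: label the confusion matrix (rows = actual, columns = predicted)
pd.DataFrame(confusion_matrix(y_test,predictions),
             index=['actual 0','actual 1'],
             columns=['predicted 0','predicted 1'])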

In [24]:
#let's try random forests - an ensemble that usually outperforms a single decision tree
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=300)
rfc.fit(X_train,y_train)


Out[24]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
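
Fitted forests expose feature_importances_; ranking them (a quick sketch) shows which inputs drive the splits:

In [ ]:
#sketch: rank features by importance in the fitted forest
pd.Series(rfc.feature_importances_,index=X.columns).sort_values(ascending=False).head(10)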

In [25]:
#predictions and evaluation for the random forest
rfc_pred = rfc.predict(X_test)
print(classification_report(y_test,rfc_pred))


             precision    recall  f1-score   support

          0       0.85      1.00      0.92      2431
          1       0.40      0.02      0.03       443

avg / total       0.78      0.84      0.78      2874


In [26]:
#confusion matrix from random forests
print(confusion_matrix(y_test,rfc_pred))


[[2419   12]
 [ 435    8]]
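
The forest almost never predicts class 1 (8 true positives out of 443), a symptom of the class imbalance noted earlier. One possible remedy is to reweight the classes; a sketch (rfc_bal is a hypothetical name, results not shown):

In [ ]:
#sketch: reweight classes so the forest pays more attention to the rare positives
rfc_bal = RandomForestClassifier(n_estimators=300,class_weight='balanced')
rfc_bal.fit(X_train,y_train)
print(classification_report(y_test,rfc_bal.predict(X_test)))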
