In [1]:
#importing the analysis and plotting libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
#reading data
df = pd.read_csv('loan_data.csv')

In [4]:
#previewing the first few rows
df.head()


Out[4]:
credit.policy purpose int.rate installment log.annual.inc dti fico days.with.cr.line revol.bal revol.util inq.last.6mths delinq.2yrs pub.rec not.fully.paid
0 1 debt_consolidation 0.1189 829.10 11.350407 19.48 737 5639.958333 28854 52.1 0 0 0 0
1 1 credit_card 0.1071 228.22 11.082143 14.29 707 2760.000000 33623 76.7 0 0 0 0
2 1 debt_consolidation 0.1357 366.86 10.373491 11.63 682 4710.000000 3511 25.6 1 0 0 0
3 1 debt_consolidation 0.1008 162.34 11.350407 8.10 712 2699.958333 33667 73.2 1 0 0 0
4 1 credit_card 0.1426 102.92 11.299732 14.97 667 4066.000000 4740 39.5 0 1 0 0

In [5]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
credit.policy        9578 non-null int64
purpose              9578 non-null object
int.rate             9578 non-null float64
installment          9578 non-null float64
log.annual.inc       9578 non-null float64
dti                  9578 non-null float64
fico                 9578 non-null int64
days.with.cr.line    9578 non-null float64
revol.bal            9578 non-null int64
revol.util           9578 non-null float64
inq.last.6mths       9578 non-null int64
delinq.2yrs          9578 non-null int64
pub.rec              9578 non-null int64
not.fully.paid       9578 non-null int64
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB

In [6]:
df.describe()


Out[6]:
credit.policy int.rate installment log.annual.inc dti fico days.with.cr.line revol.bal revol.util inq.last.6mths delinq.2yrs pub.rec not.fully.paid
count 9578.000000 9578.000000 9578.000000 9578.000000 9578.000000 9578.000000 9578.000000 9.578000e+03 9578.000000 9578.000000 9578.000000 9578.000000 9578.000000
mean 0.804970 0.122640 319.089413 10.932117 12.606679 710.846314 4560.767197 1.691396e+04 46.799236 1.577469 0.163708 0.062122 0.160054
std 0.396245 0.026847 207.071301 0.614813 6.883970 37.970537 2496.930377 3.375619e+04 29.014417 2.200245 0.546215 0.262126 0.366676
min 0.000000 0.060000 15.670000 7.547502 0.000000 612.000000 178.958333 0.000000e+00 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1.000000 0.103900 163.770000 10.558414 7.212500 682.000000 2820.000000 3.187000e+03 22.600000 0.000000 0.000000 0.000000 0.000000
50% 1.000000 0.122100 268.950000 10.928884 12.665000 707.000000 4139.958333 8.596000e+03 46.300000 1.000000 0.000000 0.000000 0.000000
75% 1.000000 0.140700 432.762500 11.291293 17.950000 737.000000 5730.000000 1.824950e+04 70.900000 2.000000 0.000000 0.000000 0.000000
max 1.000000 0.216400 940.140000 14.528354 29.960000 827.000000 17639.958330 1.207359e+06 119.000000 33.000000 13.000000 5.000000 1.000000
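
One thing worth noting in the summary: the mean of not.fully.paid is about 0.16, so only ~16% of loans fall in the positive class. A quick explicit check of that balance:

In [ ]:
#check the class balance of the target: ~84% fully paid vs ~16% not
df['not.fully.paid'].value_counts(normalize=True)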

In [7]:
#initial exploration: FICO distributions split by credit.policy (1 = meets underwriting criteria)
plt.figure(figsize=(10,6))
plt.hist(df[df['credit.policy']==1]['fico'],bins=30,alpha=0.5,edgecolor='black')
plt.hist(df[df['credit.policy']==0]['fico'],color='red',bins=30,alpha=0.5,edgecolor='black')
plt.legend(['credit.policy=1','credit.policy=0'])
plt.xlabel("FICO")


Out[7]:
[Figure: overlaid FICO histograms for credit.policy=1 vs credit.policy=0]

In [8]:
#visualization of loans not paid back
plt.figure(figsize=(10,6))
plt.hist(df[df['not.fully.paid']==1]['fico'],bins=30,alpha=0.5,edgecolor='black')
plt.hist(df[df['not.fully.paid']==0]['fico'],color='red',bins=30,alpha=0.5,edgecolor='black')
plt.legend(['not.fully.paid=1','not.fully.paid=0'])
plt.xlabel("FICO")


Out[8]:
[Figure: overlaid FICO histograms for not.fully.paid=1 vs not.fully.paid=0]

In [10]:
#seaborn countplot of loan purpose, split by repayment status
plt.figure(figsize=(12,6))
sns.countplot(x='purpose',hue='not.fully.paid',data=df)


Out[10]:
[Figure: counts of loans by purpose, split by not.fully.paid]
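
Raw counts can hide rate differences between purposes, so a quick supplementary check (not part of the original flow) is the default rate within each category:

In [ ]:
#supplementary: fraction of loans not fully paid within each purpose
df.groupby('purpose')['not.fully.paid'].mean().sort_values(ascending=False)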

In [12]:
#plotting relationship between fico score and interest rate
sns.jointplot(x='fico',y='int.rate',data=df,color='purple',size=8,ratio=3,space=0.4)


Out[12]:
[Figure: joint distribution of FICO score vs interest rate]

In [13]:
#FICO vs interest rate, colored by credit.policy and faceted by not.fully.paid
sns.lmplot(x='fico',y='int.rate',data=df,hue='credit.policy',col='not.fully.paid',size=8)


Out[13]:
[Figure: FICO vs interest rate with regression fits, hue=credit.policy, faceted by not.fully.paid]
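
As an optional extra EDA step, a correlation heatmap of the numeric columns summarizes these pairwise relationships in one view (a sketch; 'purpose' is dropped because it is non-numeric):

In [ ]:
#optional sketch: correlation heatmap over the numeric columns only
plt.figure(figsize=(12,8))
sns.heatmap(df.drop('purpose',axis=1).corr(),cmap='viridis')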

In [14]:
#predictive model: decision trees
#let's set it up by re-checking the column types first
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
credit.policy        9578 non-null int64
purpose              9578 non-null object
int.rate             9578 non-null float64
installment          9578 non-null float64
log.annual.inc       9578 non-null float64
dti                  9578 non-null float64
fico                 9578 non-null int64
days.with.cr.line    9578 non-null float64
revol.bal            9578 non-null int64
revol.util           9578 non-null float64
inq.last.6mths       9578 non-null int64
delinq.2yrs          9578 non-null int64
pub.rec              9578 non-null int64
not.fully.paid       9578 non-null int64
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB

In [16]:
#let's look at the categorical features
#'purpose' is the only one; transforming it to dummy variables
cat_feats = ['purpose']
final_data = pd.get_dummies(df,columns=cat_feats,drop_first=True)

In [17]:
#let's look at the transformed data
final_data.head()


Out[17]:
credit.policy int.rate installment log.annual.inc dti fico days.with.cr.line revol.bal revol.util inq.last.6mths delinq.2yrs pub.rec not.fully.paid purpose_credit_card purpose_debt_consolidation purpose_educational purpose_home_improvement purpose_major_purchase purpose_small_business
0 1 0.1189 829.10 11.350407 19.48 737 5639.958333 28854 52.1 0 0 0 0 0 1 0 0 0 0
1 1 0.1071 228.22 11.082143 14.29 707 2760.000000 33623 76.7 0 0 0 0 1 0 0 0 0 0
2 1 0.1357 366.86 10.373491 11.63 682 4710.000000 3511 25.6 1 0 0 0 0 1 0 0 0 0
3 1 0.1008 162.34 11.350407 8.10 712 2699.958333 33667 73.2 1 0 0 0 0 1 0 0 0 0
4 1 0.1426 102.92 11.299732 14.97 667 4066.000000 4740 39.5 0 1 0 0 1 0 0 0 0 0

In [18]:
#train-test splitting
X = final_data.drop('not.fully.paid',axis=1)
y = final_data['not.fully.paid']

In [19]:
#forgot to import! oops
#note: sklearn.cross_validation is deprecated (removed in 0.20); use model_selection
from sklearn.model_selection import train_test_split

In [20]:
#70-30 split: you can experiment with any split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=101)
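
Since the positive class is rare (~16%), a stratified split keeps the class ratio identical in train and test. A sketch of that variant, using hypothetical Xs_*/ys_* names so the results below (which use the plain split) are unchanged:

In [ ]:
#sketch of a stratified variant; Xs_*/ys_* are hypothetical names, not used below
Xs_train, Xs_test, ys_train, ys_test = train_test_split(
    X, y, test_size=0.3, random_state=101, stratify=y)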

In [21]:
#training decision trees
from sklearn.tree import DecisionTreeClassifier
dTree = DecisionTreeClassifier()
dTree.fit(X_train,y_train)


Out[21]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
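
A fully grown tree like the one above tends to overfit. One common regularization is to cap the depth; a sketch with an arbitrary, untuned max_depth (shallow_tree is a hypothetical name):

In [ ]:
#sketch: depth-limited tree as a regularized alternative (max_depth=5 is arbitrary)
shallow_tree = DecisionTreeClassifier(max_depth=5,random_state=101)
shallow_tree.fit(X_train,y_train)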

In [22]:
#predictions and evaluation
predictions = dTree.predict(X_test)
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,predictions))


             precision    recall  f1-score   support

          0       0.86      0.82      0.84      2431
          1       0.20      0.24      0.21       443

avg / total       0.75      0.73      0.74      2874


In [23]:
#printing confusion matrix
print(confusion_matrix(y_test,predictions))


[[2002  429]
 [ 338  105]]
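
The bare matrix is easy to misread; wrapping it in a labeled DataFrame (purely cosmetic) makes the rows and columns explicit:

In [ ]:
#cosmetic: label the confusion matrix (rows = actual, columns = predicted)
pd.DataFrame(confusion_matrix(y_test,predictions),
             index=['actual 0','actual 1'],
             columns=['predicted 0','predicted 1'])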

In [24]:
#let's try random forests - an ensemble that usually outperforms a single decision tree
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=300)
rfc.fit(X_train,y_train)


Out[24]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
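
Fitted forests expose feature_importances_; ranking them (a quick sketch) shows which inputs drive the splits:

In [ ]:
#sketch: rank features by importance in the fitted forest
pd.Series(rfc.feature_importances_,index=X.columns).sort_values(ascending=False).head(10)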

In [25]:
#predictions and evaluation for the random forest
rfc_pred = rfc.predict(X_test)
print(classification_report(y_test,rfc_pred))


             precision    recall  f1-score   support

          0       0.85      1.00      0.92      2431
          1       0.40      0.02      0.03       443

avg / total       0.78      0.84      0.78      2874


In [26]:
#confusion matrix from random forests
print(confusion_matrix(y_test,rfc_pred))


[[2419   12]
 [ 435    8]]
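
The forest almost never predicts class 1 (8 true positives out of 443), a symptom of the class imbalance noted earlier. One possible remedy is to reweight the classes; a sketch (rfc_bal is a hypothetical name, results not shown):

In [ ]:
#sketch: reweight classes so the forest pays more attention to the rare positives
rfc_bal = RandomForestClassifier(n_estimators=300,class_weight='balanced')
rfc_bal.fit(X_train,y_train)
print(classification_report(y_test,rfc_bal.predict(X_test)))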
