In [13]:
import pandas as pd
import numpy as np

In [14]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [15]:
loans = pd.read_csv("loan_data.csv")

In [16]:
loans.head()


Out[16]:
credit.policy purpose int.rate installment log.annual.inc dti fico days.with.cr.line revol.bal revol.util inq.last.6mths delinq.2yrs pub.rec not.fully.paid
0 1 debt_consolidation 0.1189 829.10 11.350407 19.48 737 5639.958333 28854 52.1 0 0 0 0
1 1 credit_card 0.1071 228.22 11.082143 14.29 707 2760.000000 33623 76.7 0 0 0 0
2 1 debt_consolidation 0.1357 366.86 10.373491 11.63 682 4710.000000 3511 25.6 1 0 0 0
3 1 debt_consolidation 0.1008 162.34 11.350407 8.10 712 2699.958333 33667 73.2 1 0 0 0
4 1 credit_card 0.1426 102.92 11.299732 14.97 667 4066.000000 4740 39.5 0 1 0 0

In [17]:
loans.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
credit.policy        9578 non-null int64
purpose              9578 non-null object
int.rate             9578 non-null float64
installment          9578 non-null float64
log.annual.inc       9578 non-null float64
dti                  9578 non-null float64
fico                 9578 non-null int64
days.with.cr.line    9578 non-null float64
revol.bal            9578 non-null int64
revol.util           9578 non-null float64
inq.last.6mths       9578 non-null int64
delinq.2yrs          9578 non-null int64
pub.rec              9578 non-null int64
not.fully.paid       9578 non-null int64
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB

In [18]:
loans.describe()


Out[18]:
credit.policy int.rate installment log.annual.inc dti fico days.with.cr.line revol.bal revol.util inq.last.6mths delinq.2yrs pub.rec not.fully.paid
count 9578.000000 9578.000000 9578.000000 9578.000000 9578.000000 9578.000000 9578.000000 9.578000e+03 9578.000000 9578.000000 9578.000000 9578.000000 9578.000000
mean 0.804970 0.122640 319.089413 10.932117 12.606679 710.846314 4560.767197 1.691396e+04 46.799236 1.577469 0.163708 0.062122 0.160054
std 0.396245 0.026847 207.071301 0.614813 6.883970 37.970537 2496.930377 3.375619e+04 29.014417 2.200245 0.546215 0.262126 0.366676
min 0.000000 0.060000 15.670000 7.547502 0.000000 612.000000 178.958333 0.000000e+00 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1.000000 0.103900 163.770000 10.558414 7.212500 682.000000 2820.000000 3.187000e+03 22.600000 0.000000 0.000000 0.000000 0.000000
50% 1.000000 0.122100 268.950000 10.928884 12.665000 707.000000 4139.958333 8.596000e+03 46.300000 1.000000 0.000000 0.000000 0.000000
75% 1.000000 0.140700 432.762500 11.291293 17.950000 737.000000 5730.000000 1.824950e+04 70.900000 2.000000 0.000000 0.000000 0.000000
max 1.000000 0.216400 940.140000 14.528354 29.960000 827.000000 17639.958330 1.207359e+06 119.000000 33.000000 13.000000 5.000000 1.000000

In [19]:
plt.figure(figsize=(10,6))
loans[loans['credit.policy']==1]['fico'].hist(alpha=0.5,color='blue',
                                              bins=30,label='Credit.Policy=1')
loans[loans['credit.policy']==0]['fico'].hist(alpha=0.5,color='red',
                                              bins=30,label='Credit.Policy=0')
plt.legend()
plt.xlabel('FICO')


Out[19]:
<matplotlib.text.Text at 0x7f3f529478d0>

In [20]:
plt.figure(figsize=(10,6))
loans[loans['not.fully.paid']==1]['fico'].hist(alpha=0.5,color='blue',
                                              bins=30,label='not.fully.paid=1')
loans[loans['not.fully.paid']==0]['fico'].hist(alpha=0.5,color='red',
                                              bins=30,label='not.fully.paid=0')
plt.legend()
plt.xlabel('FICO')


Out[20]:
<matplotlib.text.Text at 0x7f3f5269cfd0>

In [22]:
sns.countplot(x="purpose", hue="not.fully.paid", data=loans)


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3f5292dda0>

In [23]:
sns.jointplot(x='fico', y='int.rate', data=loans)


Out[23]:
<seaborn.axisgrid.JointGrid at 0x7f3f52868588>

In [24]:
# lmplot is figure-level, so set the size via height/aspect instead of plt.figure
sns.lmplot(y='int.rate',x='fico',data=loans,hue='credit.policy',
           col='not.fully.paid',palette='Set1',height=7)


Out[24]:
<seaborn.axisgrid.FacetGrid at 0x7f3f5209d550>

In [25]:
loans.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
credit.policy        9578 non-null int64
purpose              9578 non-null object
int.rate             9578 non-null float64
installment          9578 non-null float64
log.annual.inc       9578 non-null float64
dti                  9578 non-null float64
fico                 9578 non-null int64
days.with.cr.line    9578 non-null float64
revol.bal            9578 non-null int64
revol.util           9578 non-null float64
inq.last.6mths       9578 non-null int64
delinq.2yrs          9578 non-null int64
pub.rec              9578 non-null int64
not.fully.paid       9578 non-null int64
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB

In [28]:
cat_feats = ["purpose"]

In [30]:
final_data = pd.get_dummies(loans, columns = cat_feats, drop_first = True)
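
A quick sanity check that the encoding behaved as expected; a minimal sketch (the exact purpose_* column names depend on the categories present in the data):

In [ ]:
# After get_dummies there should be no object columns left, only the
# original numeric features plus the purpose_* dummy columns
final_data.info()
final_data.columns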

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
# Separate the features from the target and hold out 30% of the rows for testing
X = final_data.drop("not.fully.paid", axis=1)
y = final_data["not.fully.paid"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [33]:
from sklearn.tree import DecisionTreeClassifier

In [34]:
tree = DecisionTreeClassifier()

In [35]:
tree.fit(X_train, y_train)


Out[35]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [36]:
tree_pred = tree.predict(X_test)

In [37]:
from sklearn.metrics import confusion_matrix, classification_report

In [38]:
print(confusion_matrix(y_test, tree_pred))
print("\n")
print(classification_report(y_test, tree_pred))


[[2018  413]
 [ 355   88]]


             precision    recall  f1-score   support

          0       0.85      0.83      0.84      2431
          1       0.18      0.20      0.19       443

avg / total       0.75      0.73      0.74      2874


In [39]:
from sklearn.ensemble import RandomForestClassifier

In [40]:
rfc = RandomForestClassifier(n_estimators=600)

In [41]:
rfc.fit(X_train, y_train)


Out[41]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=600, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [42]:
rfc_pred = rfc.predict(X_test)

In [43]:
print(confusion_matrix(y_test, rfc_pred))
print("\n")
print(classification_report(y_test, rfc_pred))


[[2417   14]
 [ 438    5]]


             precision    recall  f1-score   support

          0       0.85      0.99      0.91      2431
          1       0.26      0.01      0.02       443

avg / total       0.76      0.84      0.78      2874


Finding the optimal number of trees

In [45]:
# Measure the test-set error rate for forests of increasing size
error_rates = []
for i in range(50, 1000, 50):
    c = RandomForestClassifier(n_estimators=i)
    c.fit(X_train, y_train)
    i_pred = c.predict(X_test)
    error = np.mean(y_test != i_pred)
    error_rates.append(error)
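
To read off where the error bottoms out, the smallest entry in error_rates can be looked up directly; a sketch reusing the same 50-step grid as the loop above:

In [ ]:
# n_estimators value with the lowest test-set error on this particular split
n_grid = list(range(50, 1000, 50))
best_n = n_grid[int(np.argmin(error_rates))]
best_n, min(error_rates)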

In [48]:
plt.figure(figsize=(10, 6))
plt.plot(range(50, 1000, 50), error_rates, color='blue', linestyle='dashed',
         marker='o', markerfacecolor='red', markersize=10)
plt.title("Error Rate vs Number of Trees")
plt.xlabel("Number of trees (n_estimators)")
plt.ylabel("Error rate")


Out[48]:
<matplotlib.text.Text at 0x7f3f3cc639b0>

In [49]:
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
print(confusion_matrix(y_test, rfc_pred))
print("\n")
print(classification_report(y_test, rfc_pred))


[[2411   20]
 [ 432   11]]


             precision    recall  f1-score   support

          0       0.85      0.99      0.91      2431
          1       0.35      0.02      0.05       443

avg / total       0.77      0.84      0.78      2874

Error rates barely change as n_estimators ranges from 50 to 1000, so a forest of around 100 trees performs about as well as the 600-tree model above.
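
The spread of the measured errors backs this up; a quick check (a sketch, reusing error_rates from the loop above):

In [ ]:
# Smallest and largest test-set error across all forest sizes tried, and their gap
print(min(error_rates), max(error_rates), max(error_rates) - min(error_rates))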
