In [2]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.model_selection import cross_val_score

In [3]:
df = pd.read_csv('LoanStats3d_securev1.csv.zip', compression='zip',header=1)


/Users/Tinoargentino/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (0,19,59) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [4]:
df.head()


Out[4]:
id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade ... num_tl_90g_dpd_24m num_tl_op_past_12m pct_tl_nvr_dlq percent_bc_gt_75 pub_rec_bankruptcies tax_liens tot_hi_cred_lim total_bal_ex_mort total_bc_limit total_il_high_credit_limit
0 68615169 NaN 16000.0 16000.0 16000.0 60 months 8.49% 328.19 B B1 ... 0.0 3.0 87.5 50.0 0.0 0.0 307343.0 64371.0 23500.0 59526.0
1 68537564 NaN 8000.0 8000.0 8000.0 36 months 10.78% 261.08 B B4 ... 0.0 4.0 97.1 100.0 0.0 0.0 168819.0 148154.0 5800.0 136996.0
2 68536799 NaN 10000.0 10000.0 10000.0 36 months 10.78% 326.35 B B4 ... 0.0 2.0 100.0 33.3 0.0 0.0 190694.0 40681.0 13900.0 29512.0
3 68355089 NaN 24700.0 24700.0 24700.0 36 months 11.99% 820.28 C C1 ... 0.0 2.0 97.4 7.7 0.0 0.0 314017.0 39475.0 79300.0 24667.0
4 68506885 NaN 10000.0 10000.0 10000.0 60 months 11.99% 222.40 C C1 ... 0.0 1.0 100.0 50.0 0.0 0.0 61054.0 41166.0 13100.0 46854.0

5 rows × 115 columns


In [5]:
defaulters=['Default','Charged Off', 'Late (31-120 days)']
non_defaulters=['Fully Paid']
uncertain = ['Current','Late (16-30 days)','In Grace Period', 'none']

In [6]:
df.loan_status.unique()


Out[6]:
array(['Current', 'Fully Paid', 'Default', 'Charged Off',
       'Late (16-30 days)', 'Late (31-120 days)', 'In Grace Period', nan], dtype=object)

In [7]:
df['Target']= 2 ## uncertain
df.loc[df.loan_status.isin(defaulters),'Target'] = 0  ## defaulters
df.loc[df.loan_status.isin(non_defaulters),'Target'] = 1  ## paid -- (and to whom to issue the loan)

In [8]:
df.head()


Out[8]:
id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade ... num_tl_op_past_12m pct_tl_nvr_dlq percent_bc_gt_75 pub_rec_bankruptcies tax_liens tot_hi_cred_lim total_bal_ex_mort total_bc_limit total_il_high_credit_limit Target
0 68615169 NaN 16000.0 16000.0 16000.0 60 months 8.49% 328.19 B B1 ... 3.0 87.5 50.0 0.0 0.0 307343.0 64371.0 23500.0 59526.0 2
1 68537564 NaN 8000.0 8000.0 8000.0 36 months 10.78% 261.08 B B4 ... 4.0 97.1 100.0 0.0 0.0 168819.0 148154.0 5800.0 136996.0 2
2 68536799 NaN 10000.0 10000.0 10000.0 36 months 10.78% 326.35 B B4 ... 2.0 100.0 33.3 0.0 0.0 190694.0 40681.0 13900.0 29512.0 2
3 68355089 NaN 24700.0 24700.0 24700.0 36 months 11.99% 820.28 C C1 ... 2.0 97.4 7.7 0.0 0.0 314017.0 39475.0 79300.0 24667.0 1
4 68506885 NaN 10000.0 10000.0 10000.0 60 months 11.99% 222.40 C C1 ... 1.0 100.0 50.0 0.0 0.0 61054.0 41166.0 13100.0 46854.0 2

5 rows × 116 columns


In [9]:
df.loc[df['Target'] == 0].describe()


Out[9]:
member_id loan_amnt funded_amnt funded_amnt_inv installment annual_inc dti delinq_2yrs fico_range_low fico_range_high ... num_tl_op_past_12m pct_tl_nvr_dlq percent_bc_gt_75 pub_rec_bankruptcies tax_liens tot_hi_cred_lim total_bal_ex_mort total_bc_limit total_il_high_credit_limit Target
count 0.0 42302.000000 42302.000000 42302.000000 42302.000000 4.230200e+04 42302.000000 42302.000000 42302.000000 42302.000000 ... 42302.000000 42302.000000 41790.000000 42302.000000 42302.000000 4.230200e+04 42302.000000 42302.000000 42302.000000 42302.0
mean NaN 15658.597702 15658.597702 15650.406461 461.664684 7.023571e+04 21.319409 0.376294 685.131436 689.131483 ... 2.714080 93.982022 52.155906 0.157321 0.066545 1.429021e+05 51433.688738 17917.678242 43445.640135 0.0
std NaN 8555.295964 8555.295964 8550.764991 250.791773 8.017445e+04 49.315800 0.982413 23.979229 23.979545 ... 2.067767 8.685771 35.785969 0.417973 0.576476 1.443945e+05 44800.858670 18031.898258 41308.078088 0.0
min NaN 1000.000000 1000.000000 900.000000 30.540000 0.000000e+00 0.000000 0.000000 660.000000 664.000000 ... 0.000000 12.500000 0.000000 0.000000 0.000000 2.584000e+03 0.000000 0.000000 0.000000 0.0
25% NaN 9500.000000 9500.000000 9500.000000 279.720000 4.300000e+04 14.470000 0.000000 665.000000 669.000000 ... 1.000000 90.900000 22.200000 0.000000 0.000000 4.567700e+04 23353.250000 6500.000000 16575.500000 0.0
50% NaN 14475.000000 14475.000000 14450.000000 403.035000 6.000000e+04 20.940000 0.000000 680.000000 684.000000 ... 2.000000 97.300000 50.000000 0.000000 0.000000 8.670500e+04 40000.000000 12500.000000 33714.000000 0.0
75% NaN 20400.000000 20400.000000 20400.000000 596.340000 8.400000e+04 27.740000 0.000000 695.000000 699.000000 ... 4.000000 100.000000 83.300000 0.000000 0.000000 1.991648e+05 65288.250000 23000.000000 58323.750000 0.0
max NaN 35000.000000 35000.000000 35000.000000 1445.460000 8.900060e+06 9999.000000 27.000000 845.000000 850.000000 ... 22.000000 100.000000 100.000000 8.000000 85.000000 2.319146e+06 893784.000000 326300.000000 757807.000000 0.0

8 rows × 90 columns


In [10]:
df.loc[df['Target'] == 1].describe()


Out[10]:
member_id loan_amnt funded_amnt funded_amnt_inv installment annual_inc dti delinq_2yrs fico_range_low fico_range_high ... num_tl_op_past_12m pct_tl_nvr_dlq percent_bc_gt_75 pub_rec_bankruptcies tax_liens tot_hi_cred_lim total_bal_ex_mort total_bc_limit total_il_high_credit_limit Target
count 0.0 94968.000000 94968.000000 94968.000000 94968.000000 9.496800e+04 94968.000000 94968.000000 94968.000000 94968.000000 ... 94968.000000 94968.000000 93915.000000 94968.000000 94968.000000 9.496800e+04 9.496800e+04 94968.000000 9.496800e+04 94968.0
mean NaN 14678.662813 14678.662813 14673.312853 437.941518 7.944094e+04 17.906022 0.322066 696.294278 700.294457 ... 2.412507 94.198152 42.624899 0.155926 0.050175 1.864274e+05 5.150747e+04 22252.687547 4.336753e+04 1.0
std NaN 8680.966801 8680.966801 8677.636591 254.684005 6.976620e+04 8.553780 0.903981 32.486469 32.487292 ... 1.949052 8.496625 35.786696 0.409449 0.342618 1.824717e+05 4.883338e+04 22438.665164 4.387513e+04 0.0
min NaN 1000.000000 1000.000000 975.000000 30.540000 5.000000e+03 0.000000 0.000000 660.000000 664.000000 ... 0.000000 16.700000 0.000000 0.000000 0.000000 2.500000e+03 0.000000e+00 0.000000 0.000000e+00 1.0
25% NaN 8000.000000 8000.000000 8000.000000 251.360000 4.880000e+04 11.550000 0.000000 670.000000 674.000000 ... 1.000000 91.300000 0.000000 0.000000 0.000000 5.399600e+04 2.213300e+04 7800.000000 1.579275e+04 1.0
50% NaN 12525.000000 12525.000000 12525.000000 377.040000 6.800000e+04 17.300000 0.000000 690.000000 694.000000 ... 2.000000 97.600000 40.000000 0.000000 0.000000 1.268690e+05 3.898000e+04 15300.000000 3.312550e+04 1.0
75% NaN 20000.000000 20000.000000 20000.000000 583.500000 9.500000e+04 23.700000 0.000000 710.000000 714.000000 ... 3.000000 100.000000 66.700000 0.000000 0.000000 2.693190e+05 6.492800e+04 28900.000000 5.819925e+04 1.0
max NaN 35000.000000 35000.000000 35000.000000 1409.990000 7.600000e+06 380.530000 39.000000 845.000000 850.000000 ... 25.000000 100.000000 100.000000 7.000000 15.000000 4.214831e+06 1.684313e+06 684000.000000 2.101913e+06 1.0

8 rows × 90 columns


In [11]:
df.loc[df['Target'] == 0][['funded_amnt','total_pymnt']].describe()


Out[11]:
funded_amnt total_pymnt
count 42302.000000 42302.000000
mean 15658.597702 5534.268757
std 8555.295964 3859.052983
min 1000.000000 0.000000
25% 9500.000000 2781.667500
50% 14475.000000 4660.085000
75% 20400.000000 7337.890000
max 35000.000000 48307.240000

In [12]:
df.loc[df['Target'] == 1][['funded_amnt','total_pymnt']].describe()


Out[12]:
funded_amnt total_pymnt
count 94968.000000 94968.000000
mean 14678.662813 16161.004698
std 8680.966801 9740.942874
min 1000.000000 1000.010000
25% 8000.000000 8569.170000
50% 12525.000000 13994.355000
75% 20000.000000 22034.527500
max 35000.000000 50312.207800

In [13]:
df.loc[df['Target'] == 0][['funded_amnt','total_pymnt']].head(10)


Out[13]:
funded_amnt total_pymnt
14 27300.0 8727.52
21 23975.0 3928.86
28 23100.0 5305.12
50 16000.0 4382.17
101 24250.0 4124.42
113 7200.0 3043.61
136 16000.0 3109.00
139 5000.0 1320.07
143 8000.0 3449.23
179 25825.0 3028.09

In [14]:
C=0.0820849986238988

In [15]:
A=df.loc[df['Target'] == 0][['funded_amnt','total_pymnt']].describe()

In [24]:
B=df.loc[df['Target'] == 1][['funded_amnt','total_pymnt']].describe()

In [25]:
Funded=A['funded_amnt']['count']*A['funded_amnt']['mean']+B['funded_amnt']['count']*B['funded_amnt']['mean']

In [26]:
Funded


Out[26]:
2056393250.0

In [27]:
Collected=A['total_pymnt']['count']*A['total_pymnt']['mean']+B['total_pymnt']['count']*B['total_pymnt']['mean']

In [30]:
Collected


Out[30]:
1768888931.1485128

In [31]:
Collected-Funded


Out[31]:
-287504318.85148716

In [ ]: