In [ ]:
%load_ext Cython
pd.options.display.max_columns = 999
In [ ]:
import math
import dir_constants as dc
import re
from tqdm import tqdm_notebook
In [ ]:
platform = 'lendingclub'
store = pd.HDFStore(
dc.home_path+'/justin_tinkering/data_science/lendingclub/{0}_store.h5'.
format(platform),
append=True)
loan_info = store['loan_info_merged']
store.close()
In [ ]:
#turn all date columns into pandas timestamp
month_dict = {
'jan': '1',
'feb': '2',
'mar': '3',
'apr': '4',
'may': '5',
'jun': '6',
'jul': '7',
'aug': '8',
'sep': '9',
'oct': '10',
'nov': '11',
'dec': '12'
}
# date cols
date_cols = [
'issue_d', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d',
'next_pymnt_d', 'sec_app_earliest_cr_line'
]
for col in date_cols:
loan_info[col] = loan_info[col].str.lower()
loan_info[col] = pd.to_datetime(
loan_info[col].str[:3].str.lower().replace(month_dict) +
loan_info[col].str[3:],
format='%m-%Y')
In [ ]:
# Cleanups
# term
#loan_info['term'] = loan_info['term'].str.strip().str[:2].astype(float)
# int_rate
loan_info['int_rate'] = loan_info['int_rate'] / 100
# installment funded _________________________________________________________
rename_dict = {'installment': 'installment_funded'}
loan_info.rename_axis(rename_dict, inplace=True, axis=1)
# installment_funded_amnt and installment_loan_amnt
# pretty sure installment is installment_funded_amnt
# loan_info.rename(columns = {'installment': 'installment_funded_amnt'}, inplace = True)
# loan_info['installment_loan_amnt'] = loan_info.apply(funded_amnt_installment_loan_amnt, axis = 1)
# grade_n
# allGrades = ['A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5',
# 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5',
# 'G1', 'G2', 'G3', 'G4', 'G5']
# loan_info['grade_n'] = loan_info['sub_grade'].apply(lambda x: allGrades.index(x))+1
# emp_title
loan_info['emp_title'] = loan_info['emp_title'].str.lower()
# emp_length
# dic_emp_length = {'10+ years': 10, '< 1 year': 0, '1 year': 1, '2 years': 2, '3 years': 3,
# '4 years': 4, '5 years': 5, '6 years': 6, '7 years': 7,
# '8 years': 8, '9 years': 9,'n/a': 0, '': 0}
# loan_info['emp_length'] = loan_info['emp_length'].replace(dic_emp_length)
# home_ownership
dic_home_ownership = {
'admin_us': 'none',
'mortgage': 'mortgage',
'rent': 'rent',
'own': 'own',
'other': 'other',
'none': 'none',
'any': 'none'
}
loan_info['home_ownership'] = loan_info['home_ownership'].str.lower().replace(
dic_home_ownership)
# verification_status and verification_status_joint
dic_verification_status = {
'VERIFIED - income': 'platform',
'VERIFIED - income source': 'source',
'not verified': 'none',
'Source Verified': 'source',
'Not Verified': 'none',
'Verified': 'platform'
}
loan_info['verification_status'] = loan_info['verification_status'].replace(
dic_verification_status)
loan_info['verification_status_joint'] = loan_info[
'verification_status_joint'].replace(dic_verification_status)
# status
dic_status = {
'Current': 'current',
'Charged Off': 'charged_off',
'Fully Paid': 'paid',
'Late (31-120 days)': 'late_120',
'In Grace Period': 'grace_15',
'Late (16-30 days)': 'late_30',
'Default': 'defaulted',
'Issued': 'current'
}
loan_info['loan_status'] = loan_info['loan_status'].apply(
lambda x: re.sub('Does not meet the credit policy. Status:', '', x))
loan_info['loan_status'] = loan_info['loan_status'].apply(
lambda x: re.sub('Does not meet the credit policy. Status:', '', x))
loan_info['loan_status'] = loan_info['loan_status'].replace(dic_status)
#title
loan_info['title'] = loan_info['title'].str.lower()
#application_type
loan_info['application_type'] = loan_info['application_type'].str.lower()
# consider at a future date, not turning the below into percents. ________
#revol_util
loan_info['revol_util'] = loan_info['revol_util'].apply(
lambda x: float(x.strip('%')) / 100 if pd.notnull(x) else np.nan)
#all_util
loan_info['all_util'] = loan_info['all_util'] / 100.
# pct_tl_nvr_dlq
loan_info['pct_tl_nvr_dlq'] = loan_info['pct_tl_nvr_dlq'] / 100.
# percent_bc_gt_75
loan_info['percent_bc_gt_75'] = loan_info['percent_bc_gt_75'] / 100.
In [ ]:
# out_prncp is messed up. Fix it
loan_info['out_prncp'] = loan_info['funded_amnt'] - loan_info['total_rec_prncp']
loan_info['out_prncp'] = np.where(loan_info['out_prncp'] <= 0.019, 0, loan_info['out_prncp'])
loan_info['out_prncp'] = loan_info['out_prncp'].round(2)
# fix installment_funded as well
loan_info['installment_funded'] = np.pmt(loan_info['int_rate']/12, loan_info['term'], -loan_info['funded_amnt'])
In [ ]:
# some date setting stuff for cleanups
max_date = loan_info['last_pymnt_d'].max()
# end_d to me means the date we can stop tracking things about the loan. Should be defunct
def applyEndD(status, group):
if status == 'charged_off':
#split the group into two groups, one which has paid something, and other which has paid nothing
never_paid = group[group['last_pymnt_d'].isnull()]
has_paid = group[group['last_pymnt_d'].notnull()]
never_paid['end_d'] = never_paid['issue_d'] + pd.DateOffset(months=+5)
has_paid['end_d'] = has_paid['last_pymnt_d'] + pd.DateOffset(months=+5)
group.ix[never_paid.index.values, 'end_d'] = never_paid['end_d']
group.ix[has_paid.index.values, 'end_d'] = has_paid['end_d']
return group['end_d']
elif status == 'paid':
return group['last_pymnt_d']
else:
return pd.Series([max_date] * len(group), index=group.index.values)
In [ ]:
# status_grouped
status_grouped = loan_info.groupby('loan_status')
In [ ]:
# columns that need to be added
# # specify platform
# loan_info['platform'] = 'LC'
# loan_info['days_late'] = 0 #placeholder column, filling from pmt_loan_infost
end_d_series = pd.Series([])
for status, group in status_grouped:
end_d_series = end_d_series.append(
applyEndD(status, group), verify_integrity=True)
loan_info['end_d'] = end_d_series
# # next_pymnt_d
# def applyNextPmtD(status, group):
# if status in ['charged_off', 'paid']:
# return pd.Series([np.nan]*len(group), index = group.index.values)
# else:
# return group['issue_d'] + pd.DateOffset(months=+1)
# next_pymnt_d_series = pd.Series([])
# for status, group in status_grouped:
# next_pymnt_d_series = next_pymnt_d_series.append(applyNextPmtD(status, group), verify_integrity=True)
# loan_info['next_pymnt_d'] = next_pymnt_d_series
# adding line_history_d, line_history_m, and line_history_y, there's prob a more accurate way to do this
loan_info['line_history_d'] = (
loan_info['issue_d'] - loan_info['earliest_cr_line']) / np.timedelta64(
1, 'D')
loan_info['line_history_m'] = (loan_info['line_history_d'] *
(12 / 365.25)).round()
loan_info['line_history_y'] = (loan_info['line_history_m']) / 12.
#credit_score
loan_info['fico'] = (
loan_info['fico_range_high'] + loan_info['fico_range_low']) / 2
# maturity_time
# put in maturity time so I can look at old data for historic default int_rates
loan_info['months_paid'] = ((
max_date - loan_info['issue_d']) / np.timedelta64(1, 'D') *
(12 / 365.25)).round()
loan_info['maturity_time'] = loan_info['months_paid'] / loan_info['term']
loan_info['maturity_time'] = np.where(loan_info['maturity_time'] >= 1, 1,
loan_info['maturity_time'])
In [ ]:
%%cython
cimport numpy as np
import numpy as np
cpdef rem_to_be_paid(double out_prncp,
double install,
double int_rate):
cdef double m_rate
cdef double to_be_paid
m_rate = int_rate/12
to_be_paid = 0.0
k = 0
while out_prncp > 0:
k += 1
out_prncp = (1+m_rate) * out_prncp
out_prncp -= install
to_be_paid += install
# the break was added to figure out what was wrong with infinite while; it was due to installment funded
# being INCORRECTLY REPORTED by lending club
if k >= 100:
print(to_be_paid)
break
if out_prncp < 0:
to_be_paid -= abs(out_prncp)
return to_be_paid
cpdef np.ndarray[double] apply_rem_to_be_paid(np.ndarray col_out_prncp,
np.ndarray col_install,
np.ndarray col_int_rate):
assert (col_out_prncp.dtype == np.float and col_install.dtype == np.float and col_int_rate.dtype == np.float)
cdef Py_ssize_t i, n = len(col_out_prncp)
assert (len(col_out_prncp) == len(col_install) == n)
cdef np.ndarray[double] res = np.empty(n)
for i in xrange(n):
res[i] = rem_to_be_paid(col_out_prncp[i],
col_install[i],
col_int_rate[i])
return res
In [ ]:
# add maturity paid
loan_info['rem_to_be_paid'] = apply_rem_to_be_paid(
loan_info['out_prncp'].values, loan_info['installment_funded'].values,
loan_info['int_rate'].values)
In [ ]:
loan_info['maturity_paid'] = loan_info['total_pymnt'] / (
loan_info['total_pymnt'] + loan_info['rem_to_be_paid'])
# mat_paid has nans because some charged_off loans have 0 rem_to_be_paid
# which is inconsistent with the majority treatment. quick fix is fillna
loan_info['maturity_paid'] = loan_info['maturity_paid'].fillna(0)
adjust maturity paid by status, with late-status notes being adjusted by loan status migration rates below https://www.lendingclub.com/info/demand-and-credit-profile.action
In [ ]:
# We have 25% of grace period notes reaching charged-off 9 months later.
# How do I translate this into a confidence about how done-paid wise it is?
# Done paid wise = how sure we are that no more non-recovery money is coming?
# The chart depicts % outstanding principal recovered....
# Without putting much thought into it, I'll increment all percents by 10%
# Because I do know that everything in default is pretty much sold
# at 10 cents on the dollar hence the 10% recovery under the default
# column
In [ ]:
# grace = 35%, late_30 = 64%, late_120 = 98%,
# ********************to be completed later after reading how to best deal with dataset shift**********
# loan_info['maturity_paid_stat_adj'] = np.where(loan_info['loan_status'] == '')
In [ ]:
loan_info['target_loose'] = np.where(loan_info['loan_status'].isin(['charged_off', 'defaulted']), 1, 0)
In [ ]:
# going to drop for now, could use desc at later point with NLP
loan_info.drop('desc', axis=1, inplace=True)
loan_info.drop('url', axis=1, inplace=True)
loan_info.drop('title', axis=1, inplace=True)
In [ ]:
# home_ownership: none should be other
loan_info['home_ownership'].replace({'none': 'other'}, inplace=True)
# annual_income has 4 nulls. Just fill with 0
loan_info['annual_inc'].replace({np.nan: 0.0}, inplace=True)
# drop the one null zip_code
loan_info = loan_info[loan_info['zip_code'].notnull()]
# drop the loans where earliest_cr_line is null
loan_info = loan_info[loan_info['earliest_cr_line'].notnull()]
# drop null chargeoff_within_12_mths
loan_info = loan_info[loan_info['chargeoff_within_12_mths'].notnull()]
# drop null tax_liens
loan_info = loan_info[loan_info['tax_liens'].notnull()]
# add installment_amount
loan_info['installment_amount'] = np.pmt(loan_info['int_rate']/12, loan_info['term'], -loan_info['loan_amnt'])
# drop loans that have this null
loan_info = loan_info[loan_info['inq_last_6mths'].notnull()]
In [ ]:
# More Data Cleaning Steps ____________________________________________________
# Drop columns before cleanup
# Dropping these since I don't want them and they might confuse me.
# There is no reason why I care about money that went just to investors rather
# than to lending club as well when they top off loans.
loan_info.drop(['funded_amnt_inv',
'out_prncp_inv'], axis = 1, inplace = True)
# Dropping these because I'm getting them from data_pmt_history
# loan_info.drop(['home_ownership'], axis = 1, inplace = True)
# loan_info.rename(columns = {'total_pymnt': 'total_pymnt_rec_recov',
# 'total_pymnt_recov': 'total_pymnt_rec_recov'}, inplace=True)
loan_info['orig_amt_due'] = loan_info['term'] * loan_info['installment_amount']
loan_info['roi_simple'] = loan_info['total_pymnt']/loan_info['funded_amnt']
In [ ]:
# fix loans with no record at all for a specific month ________________________
store.open()
pmt_hist_ids = store['pmt_hist_ids'].astype(int)
max_id = pmt_hist_ids.max()
chunksize = 800
n_chunks = len(pmt_hist_ids)//chunksize + 1
bad_statuses = set(['late_120', 'defaulted', 'charged_off', 'late_30'])
# m_disc_rate = .08/12
target_strict_dict = {}
nvps_8_dict = {}
for n in tqdm_notebook(np.arange(n_chunks)):
if n == 0:
left_bound = 0
else:
left_bound = pmt_hist_ids[n*chunksize]
if n == (n_chunks - 1):
right_bound = max_id
else:
right_bound = pmt_hist_ids[(n+1)*chunksize]
chunk = pd.read_hdf(
store,
'pmt_hist_intermediary_2',
where='(loan_id_num > left_bound) & (loan_id_num <= right_bound)')
id_grouped = chunk.groupby('loan_id')
for ids, group in id_grouped:
statuses = set(group['status_period_end'])
if len(statuses.intersection(bad_statuses)) > 0:
target_strict_dict[ids] = 1
else:
target_strict_dict[ids] = 0
In [ ]:
loan_info['target_strict'] = pd.Series(target_strict_dict)
In [ ]:
loan_info.head()
In [ ]:
store.open()
store['loan_info_clean'] = loan_info
In [ ]:
store.close()
In [ ]:
# code used for tweaking speed ups
# test = loan_info.tail(1000)
# loan_info.tail(30).apply(lambda x: rem_to_be_paid(x['out_prncp'],
# x['installment_funded'],
# x['int_rate']), axis=1)
# apply_rem_to_be_paid(test['out_prncp'].values, test['installment_funded'].values,
# test['int_rate'].values)
# %timeit apply_rem_to_be_paid(test['out_prncp'].values, test['installment_funded'].values,test['int_rate'].values)
# %timeit loan_info.tail(1000).apply(lambda x: rem_to_be_paid(x['out_prncp'],x['installment_funded'],x['int_rate']), axis=1)
# %prun -l 10 apply_rem_to_be_paid(test['out_prncp'].values, test['installment_funded'].values,test['int_rate'].values)
# %timeit apply_rem_to_be_paid(test['out_prncp'].values, test['installment_funded'].values,test['int_rate'].values)