Author: Justin Hsi

The prepare script for all platforms


In [ ]:
%load_ext Cython
pd.options.display.max_columns = 999

In [ ]:
import math
import dir_constants as dc
import re
from tqdm import tqdm_notebook

In [ ]:
platform = 'lendingclub'

store = pd.HDFStore(
    dc.home_path+'/justin_tinkering/data_science/lendingclub/{0}_store.h5'.
    format(platform),
    append=True)
loan_info = store['loan_info_merged']
store.close()

Cleaning the loan_info


In [ ]:
#turn all date columns into pandas timestamp
month_dict = {
    'jan': '1',
    'feb': '2',
    'mar': '3',
    'apr': '4',
    'may': '5',
    'jun': '6',
    'jul': '7',
    'aug': '8',
    'sep': '9',
    'oct': '10',
    'nov': '11',
    'dec': '12'
}
# date cols
date_cols = [
    'issue_d', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d',
    'next_pymnt_d', 'sec_app_earliest_cr_line'
]

for col in date_cols:
    loan_info[col] = loan_info[col].str.lower()
    loan_info[col] = pd.to_datetime(
        loan_info[col].str[:3].str.lower().replace(month_dict) +
        loan_info[col].str[3:],
        format='%m-%Y')

In [ ]:
# Cleanups

# term
#loan_info['term'] = loan_info['term'].str.strip().str[:2].astype(float)

# int_rate
loan_info['int_rate'] = loan_info['int_rate'] / 100

# installment funded _________________________________________________________
rename_dict = {'installment': 'installment_funded'}
loan_info.rename_axis(rename_dict, inplace=True, axis=1)

# installment_funded_amnt and installment_loan_amnt
# pretty sure installment is installment_funded_amnt
# loan_info.rename(columns = {'installment': 'installment_funded_amnt'}, inplace = True)
# loan_info['installment_loan_amnt'] = loan_info.apply(funded_amnt_installment_loan_amnt, axis = 1)

# grade_n
# allGrades = ['A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5',
#              'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5',
#              'G1', 'G2', 'G3', 'G4', 'G5']
# loan_info['grade_n'] = loan_info['sub_grade'].apply(lambda x: allGrades.index(x))+1

# emp_title
loan_info['emp_title'] = loan_info['emp_title'].str.lower()

# emp_length
# dic_emp_length = {'10+ years': 10, '< 1 year': 0, '1 year': 1, '2 years': 2, '3 years': 3,
#                   '4 years': 4, '5 years': 5, '6 years': 6, '7 years': 7,
#                   '8 years': 8, '9 years': 9,'n/a': 0, '': 0}
# loan_info['emp_length'] = loan_info['emp_length'].replace(dic_emp_length)

# home_ownership
dic_home_ownership = {
    'admin_us': 'none',
    'mortgage': 'mortgage',
    'rent': 'rent',
    'own': 'own',
    'other': 'other',
    'none': 'none',
    'any': 'none'
}
loan_info['home_ownership'] = loan_info['home_ownership'].str.lower().replace(
    dic_home_ownership)

# verification_status and verification_status_joint
dic_verification_status = {
    'VERIFIED - income': 'platform',
    'VERIFIED - income source': 'source',
    'not verified': 'none',
    'Source Verified': 'source',
    'Not Verified': 'none',
    'Verified': 'platform'
}
loan_info['verification_status'] = loan_info['verification_status'].replace(
    dic_verification_status)
loan_info['verification_status_joint'] = loan_info[
    'verification_status_joint'].replace(dic_verification_status)

# status
dic_status = {
    'Current': 'current',
    'Charged Off': 'charged_off',
    'Fully Paid': 'paid',
    'Late (31-120 days)': 'late_120',
    'In Grace Period': 'grace_15',
    'Late (16-30 days)': 'late_30',
    'Default': 'defaulted',
    'Issued': 'current'
}
loan_info['loan_status'] = loan_info['loan_status'].apply(
    lambda x: re.sub('Does not meet the credit policy.  Status:', '', x))
loan_info['loan_status'] = loan_info['loan_status'].apply(
    lambda x: re.sub('Does not meet the credit policy. Status:', '', x))
loan_info['loan_status'] = loan_info['loan_status'].replace(dic_status)

#title
loan_info['title'] = loan_info['title'].str.lower()

#application_type
loan_info['application_type'] = loan_info['application_type'].str.lower()

# consider at a future date, not turning the below into percents. ________

#revol_util
loan_info['revol_util'] = loan_info['revol_util'].apply(
    lambda x: float(x.strip('%')) / 100 if pd.notnull(x) else np.nan)

#all_util
loan_info['all_util'] = loan_info['all_util'] / 100.

# pct_tl_nvr_dlq
loan_info['pct_tl_nvr_dlq'] = loan_info['pct_tl_nvr_dlq'] / 100.

# percent_bc_gt_75
loan_info['percent_bc_gt_75'] = loan_info['percent_bc_gt_75'] / 100.

In [ ]:
# out_prncp is messed up. Fix it
loan_info['out_prncp'] = loan_info['funded_amnt'] - loan_info['total_rec_prncp']
loan_info['out_prncp'] = np.where(loan_info['out_prncp'] <= 0.019, 0, loan_info['out_prncp'])
loan_info['out_prncp'] = loan_info['out_prncp'].round(2)

# fix installment_funded as well
loan_info['installment_funded'] = np.pmt(loan_info['int_rate']/12, loan_info['term'], -loan_info['funded_amnt'])

In [ ]:
# some date setting stuff for cleanups
max_date = loan_info['last_pymnt_d'].max()

# end_d to me means the date we can stop tracking things about the loan. Should be defunct
def applyEndD(status, group):
    if status == 'charged_off':
        #split the group into two groups, one which has paid something, and other which has paid nothing
        never_paid = group[group['last_pymnt_d'].isnull()]
        has_paid = group[group['last_pymnt_d'].notnull()]

        never_paid['end_d'] = never_paid['issue_d'] + pd.DateOffset(months=+5)
        has_paid['end_d'] = has_paid['last_pymnt_d'] + pd.DateOffset(months=+5)

        group.ix[never_paid.index.values, 'end_d'] = never_paid['end_d']
        group.ix[has_paid.index.values, 'end_d'] = has_paid['end_d']
        return group['end_d']
    elif status == 'paid':
        return group['last_pymnt_d']
    else:
        return pd.Series([max_date] * len(group), index=group.index.values)

In [ ]:
# status_grouped 
status_grouped = loan_info.groupby('loan_status')

In [ ]:
# columns that need to be added

# # specify platform
# loan_info['platform'] = 'LC'

# loan_info['days_late'] = 0 #placeholder column, filling from pmt_loan_infost

end_d_series = pd.Series([])
for status, group in status_grouped:
    end_d_series = end_d_series.append(
        applyEndD(status, group), verify_integrity=True)

loan_info['end_d'] = end_d_series

# # next_pymnt_d
# def applyNextPmtD(status, group):
#     if status in ['charged_off', 'paid']:
#         return pd.Series([np.nan]*len(group), index = group.index.values)
#     else:
#         return group['issue_d'] + pd.DateOffset(months=+1)

# next_pymnt_d_series = pd.Series([])
# for status, group in status_grouped:
#     next_pymnt_d_series = next_pymnt_d_series.append(applyNextPmtD(status, group), verify_integrity=True)

# loan_info['next_pymnt_d'] = next_pymnt_d_series

# adding line_history_d, line_history_m, and line_history_y, there's prob a more accurate way to do this
loan_info['line_history_d'] = (
    loan_info['issue_d'] - loan_info['earliest_cr_line']) / np.timedelta64(
        1, 'D')
loan_info['line_history_m'] = (loan_info['line_history_d'] *
                               (12 / 365.25)).round()
loan_info['line_history_y'] = (loan_info['line_history_m']) / 12.

#credit_score
loan_info['fico'] = (
    loan_info['fico_range_high'] + loan_info['fico_range_low']) / 2

# maturity_time
# put in maturity time so I can look at old data for historic default int_rates
loan_info['months_paid'] = ((
    max_date - loan_info['issue_d']) / np.timedelta64(1, 'D') *
                            (12 / 365.25)).round()
loan_info['maturity_time'] = loan_info['months_paid'] / loan_info['term']
loan_info['maturity_time'] = np.where(loan_info['maturity_time'] >= 1, 1,
                                      loan_info['maturity_time'])

In [ ]:
%%cython
cimport numpy as np
import numpy as np

cpdef rem_to_be_paid(double out_prncp,
                     double install,
                     double int_rate):
    cdef double m_rate
    cdef double to_be_paid
    m_rate = int_rate/12
    to_be_paid = 0.0
    k = 0
    while out_prncp > 0:
        k += 1
        out_prncp = (1+m_rate) * out_prncp
        out_prncp -= install
        to_be_paid += install
        # the break was added to figure out what was wrong with infinite while; it was due to installment funded
        # being INCORRECTLY REPORTED by lending club
        if k >= 100:
            print(to_be_paid)
            break
        if out_prncp < 0:
            to_be_paid -= abs(out_prncp)
    return to_be_paid

cpdef np.ndarray[double] apply_rem_to_be_paid(np.ndarray col_out_prncp,
                                              np.ndarray col_install,
                                              np.ndarray col_int_rate):
    assert (col_out_prncp.dtype == np.float and col_install.dtype == np.float and col_int_rate.dtype == np.float)
    cdef Py_ssize_t i, n = len(col_out_prncp)
    assert (len(col_out_prncp) == len(col_install) == n)
    cdef np.ndarray[double] res = np.empty(n)
    for i in xrange(n):
        res[i] = rem_to_be_paid(col_out_prncp[i],
                                col_install[i],
                                col_int_rate[i])
    return res

Somewhere, apply rem_to_be_paid stalls. Chunk it and iterate to see which chunk?


In [ ]:
# add maturity paid
loan_info['rem_to_be_paid'] = apply_rem_to_be_paid(
    loan_info['out_prncp'].values, loan_info['installment_funded'].values,
    loan_info['int_rate'].values)

In [ ]:
loan_info['maturity_paid'] = loan_info['total_pymnt'] / (
    loan_info['total_pymnt'] + loan_info['rem_to_be_paid'])

# mat_paid has nans because some charged_off loans have 0 rem_to_be_paid
# which is inconsistent with the majority treatment. quick fix is fillna

loan_info['maturity_paid'] = loan_info['maturity_paid'].fillna(0)

adjust maturity paid by status, with late-status notes being adjusted by loan status migration rates below https://www.lendingclub.com/info/demand-and-credit-profile.action


In [ ]:
# We have 25% of grace period notes reaching charged-off 9 months later. 
# How do I translate this into a confidence about how done-paid wise it is?
# Done paid wise = how sure we are that no more non-recovery money is coming?
# The chart depicts % outstanding principal recovered....
# Without putting much thought into it, I'll increment all percents by 10%
# Because I do know that everything in default is pretty much sold
# at 10 cents on the dollar hence the 10% recovery under the default
# column

In [ ]:
# grace = 35%, late_30 = 64%, late_120 = 98%, 
# ********************to be completed later after reading how to best deal with dataset shift**********
# loan_info['maturity_paid_stat_adj'] = np.where(loan_info['loan_status'] == '')

In [ ]:
loan_info['target_loose'] = np.where(loan_info['loan_status'].isin(['charged_off', 'defaulted']), 1, 0)

In [ ]:
# going to drop for now, could use desc at later point with NLP
loan_info.drop('desc', axis=1, inplace=True)
loan_info.drop('url', axis=1, inplace=True)
loan_info.drop('title', axis=1, inplace=True)

Additional cleaning based off information from api

https://www.lendingclub.com/developers/listed-loans.action


In [ ]:
# home_ownership: none should be other
loan_info['home_ownership'].replace({'none': 'other'}, inplace=True)

# annual_income has 4 nulls. Just fill with 0
loan_info['annual_inc'].replace({np.nan: 0.0}, inplace=True)

# drop the one null zip_code
loan_info = loan_info[loan_info['zip_code'].notnull()]

# drop the loans where earliest_cr_line is null
loan_info = loan_info[loan_info['earliest_cr_line'].notnull()]

# drop null chargeoff_within_12_mths
loan_info = loan_info[loan_info['chargeoff_within_12_mths'].notnull()]

# drop null tax_liens
loan_info = loan_info[loan_info['tax_liens'].notnull()]

# add installment_amount
loan_info['installment_amount'] = np.pmt(loan_info['int_rate']/12, loan_info['term'], -loan_info['loan_amnt'])

# drop loans that have this null
loan_info = loan_info[loan_info['inq_last_6mths'].notnull()]

In [ ]:
# More Data Cleaning Steps ____________________________________________________

# Drop columns before cleanup
# Dropping these since I don't want them and they might confuse me.
# There is no reason why I care about money that went just to investors rather
# than to lending club as well when they top off loans.
loan_info.drop(['funded_amnt_inv',
                'out_prncp_inv'], axis = 1, inplace = True)

# Dropping these because I'm getting them from data_pmt_history
# loan_info.drop(['home_ownership'], axis = 1, inplace = True)


# loan_info.rename(columns = {'total_pymnt': 'total_pymnt_rec_recov',
#                             'total_pymnt_recov': 'total_pymnt_rec_recov'}, inplace=True)

loan_info['orig_amt_due'] = loan_info['term'] * loan_info['installment_amount']

loan_info['roi_simple'] = loan_info['total_pymnt']/loan_info['funded_amnt']

Adding target strict, which is flag a loan that has ever been late, found via payment history


In [ ]:
# fix loans with no record at all for a specific month ________________________
store.open()
pmt_hist_ids = store['pmt_hist_ids'].astype(int)
max_id = pmt_hist_ids.max()
chunksize = 800
n_chunks = len(pmt_hist_ids)//chunksize + 1
bad_statuses = set(['late_120', 'defaulted', 'charged_off', 'late_30'])
# m_disc_rate = .08/12

target_strict_dict = {}
nvps_8_dict = {}
for n in tqdm_notebook(np.arange(n_chunks)):
    if n == 0:
        left_bound = 0
    else:
        left_bound = pmt_hist_ids[n*chunksize]
    if n == (n_chunks - 1):
        right_bound = max_id
    else:
        right_bound = pmt_hist_ids[(n+1)*chunksize]
    
    chunk = pd.read_hdf(
        store,
        'pmt_hist_intermediary_2',
        where='(loan_id_num > left_bound) & (loan_id_num <= right_bound)')
    
    id_grouped = chunk.groupby('loan_id')
    for ids, group in id_grouped:
        statuses = set(group['status_period_end'])
        if len(statuses.intersection(bad_statuses)) > 0:
            target_strict_dict[ids] = 1
        else:
            target_strict_dict[ids] = 0

In [ ]:
loan_info['target_strict'] = pd.Series(target_strict_dict)

Examine loan_info


In [ ]:
loan_info.head()

Store


In [ ]:
store.open()
store['loan_info_clean'] = loan_info

In [ ]:
store.close()

Stuff below can be deleted?


In [ ]:
# code used for tweaking speed ups

# test = loan_info.tail(1000)

# loan_info.tail(30).apply(lambda x: rem_to_be_paid(x['out_prncp'],
#                                                   x['installment_funded'],
#                                                   x['int_rate']), axis=1)

# apply_rem_to_be_paid(test['out_prncp'].values, test['installment_funded'].values,
#                      test['int_rate'].values)

# %timeit apply_rem_to_be_paid(test['out_prncp'].values, test['installment_funded'].values,test['int_rate'].values)

# %timeit loan_info.tail(1000).apply(lambda x: rem_to_be_paid(x['out_prncp'],x['installment_funded'],x['int_rate']), axis=1)

# %prun -l 10 apply_rem_to_be_paid(test['out_prncp'].values, test['installment_funded'].values,test['int_rate'].values)

# %timeit apply_rem_to_be_paid(test['out_prncp'].values, test['installment_funded'].values,test['int_rate'].values)