Generate the npv_rois data first; this notebook can only be run after that.



In [ ]:

    
%matplotlib inline
pd.set_option('display.max_rows',500)



In [ ]:

    
# `dc.home_path` is used below to build the path of the HDF store.
import dir_constants as dc
# Fixed random seed; passed as `random_state` when the dataset is shuffled
# for the train/validate/test split.
seed = 42



In [ ]:

    
# Load the cleaned loan table and the per-loan NPV ROI table from the
# platform's HDF5 store.
platform = 'lendingclub'

store_path = (
    dc.home_path
    + '/justin_tinkering/data_science/lendingclub/{0}_store.h5'.format(platform)
)
# NOTE(review): `append` is not a named HDFStore parameter; it is forwarded
# as an extra keyword argument. Kept as-is to preserve behaviour -- confirm
# whether it is actually needed.
store = pd.HDFStore(store_path, append=True)
try:
    loan_info = store['loan_info_clean']
    npv_rois = store['loan_npv_rois']
finally:
    # Always release the file handle, even if one of the reads fails.
    # `store` itself stays bound: it is reopened later to save the datasets.
    store.close()



In [ ]:

    
# Columns to exclude when building `keep_cols` (the "filtered columns"
# variant of each saved dataset). Names that are commented out are not part
# of the list, so those columns are NOT excluded.
# NOTE(review): presumably these are identifiers and post-issuance/outcome
# fields that should not be used as model inputs -- confirm.
dont_keep_cols = [
    'id',
    'member_id',
    'funded_amnt',
    'installment_funded',
    'emp_title',
    #'issue_d',
    'loan_status',
    'pymnt_plan',
    'fico_range_low',
    'fico_range_high',
    'initial_list_status',
    'out_prncp',
    'total_pymnt',
    'total_pymnt_inv',
    'total_rec_prncp',
    'total_rec_int',
    'total_rec_late_fee',
    'recoveries',
    'collection_recovery_fee',
    'last_pymnt_d',
    'last_pymnt_amnt',
    'next_pymnt_d',
    'last_credit_pull_d',
    'last_fico_range_high',
    'last_fico_range_low',
    'policy_code',
    'end_d',
    #'line_history_m',
    'line_history_y',
    'months_paid',
    'earliest_cr_line',
    'sec_app_earliest_cr_line',
#     'maturity_time',
    'rem_to_be_paid',
#     'maturity_paid',
    'roi_simple',
#     'target_strict',
#     'target_loose',
]



In [ ]:

    
# Restrict the analysis to loans whose status is final (paid, defaulted or
# charged off), and keep only the matching rows of the NPV ROI table.
done_statuses = ['paid', 'defaulted', 'charged_off']
is_done = loan_info['loan_status'].isin(done_statuses)
loan_info = loan_info[is_done]
npv_rois = npv_rois.loc[loan_info.index, :]



In [ ]:

    
# Examine what the loans "look" like: attach the `0.1` column of npv_rois
# to each loan as `npv_roi_10`.
# NOTE(review): presumably 0.1 is the 10% discount rate -- confirm.
# `.assign` builds a new frame (aligned on the index) instead of writing a
# column into `loan_info`, which at this point is the result of a boolean
# filter; this avoids pandas' SettingWithCopyWarning.
loan_info = loan_info.assign(npv_roi_10=npv_rois[0.1])

# NPV ROIs above this cap look unrealistic, so those loans are dropped.
npv_roi_cap = 0.9
loan_info = loan_info[loan_info['npv_roi_10'] <= npv_roi_cap]

# Only keep loans whose NPV ROI is not null (none should be null; NaN
# already fails the `<=` comparison above, so this is a safety net).
loan_info = loan_info[loan_info['npv_roi_10'].notnull()]



In [ ]:

    
# Split the loans into those that were paid and all the others
# (every status other than 'paid' is counted as defaulted here).
loan_status = loan_info['loan_status']
paid = loan_info.loc[loan_status == 'paid']
defaulted = loan_info.loc[~loan_status.isin(['paid'])]



In [ ]:

    
# Histogram (50 bins) of the npv_roi_10 values of the remaining loans.
loan_info['npv_roi_10'].hist(bins=50)



In [ ]:

    
# Columns that survive the explicit exclusion list.
candidate_cols = [
    name for name in loan_info.columns.values if name not in dont_keep_cols
]
# A column is useless when it holds at most one distinct value
# (missing values count as a value of their own).
useless_cols = [
    name for name in candidate_cols
    if len(loan_info[name].value_counts(dropna=False)) <= 1
]
# Final feature columns: candidates minus the constant ones.
keep_cols = [name for name in candidate_cols if name not in useless_cols]



In [ ]:

    
# Shuffle all loans reproducibly, then cut the shuffled frame into
# train (first 70%), validate (next 10%) and test (last 20%).
# Plain positional slicing is used instead of np.split: np.split on a
# DataFrame goes through DataFrame.swapaxes, which is deprecated in recent
# pandas releases. The resulting rows are the same.
shuffled = loan_info.sample(frac=1, random_state=seed)
n_rows = len(shuffled)
train_end = int(.7 * n_rows)
validate_end = int(.8 * n_rows)

train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

# Every loan must land in exactly one of the three subsets.
assert len(train) + len(validate) + len(test) == n_rows



In [ ]:

    
# Save the full dataset and the three splits back to the HDF store, each in
# two variants: all columns, and only the columns in `keep_cols`.
store.open()
try:
    for prefix, frame in [
        ('base_dataset', loan_info),
        ('train', train),
        ('validate', validate),
        ('test', test),
    ]:
        store[prefix + '_all_columns'] = frame
        store[prefix + '_filtered_columns'] = frame[keep_cols]
finally:
    # Close the file even if one of the writes fails.
    store.close()



In [ ]: