In [ ]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.externals import joblib  # NOTE(review): deprecated — prefer `import joblib` on sklearn>=0.23
from tqdm import tqdm_notebook

import modeling_utils.data_prep as data_prep

In [ ]:
# Platform key used to locate this data source's HDF5 store.
platform = 'lendingclub'

# Open the notebook's HDF5 store (holds train/test splits, per-loan NPV ROI
# tables, and the accumulated model results).
# NOTE(review): hardcoded absolute local path — consider a configurable DATA_DIR.
store = pd.HDFStore(
    '/Users/justinhsi/justin_tinkering/data_science/lendingclub/{0}_store.h5'.
    format(platform),
    append=True)

Make sure no loan in the test set was also in the train set


In [ ]:
# Read everything needed for evaluation out of the HDF5 store, closing it as
# soon as we are done so the file handle is released.
store.open()
results = store['results']
loan_npv_rois = store['loan_npv_rois']
train = store['train_filtered_columns']
test = store['test_filtered_columns']
default_series = test['target_strict']
store.close()

# Guard against leakage: no loan id may appear in both train and test.
train_ids = set(train.index.values)
test_ids = set(test.index.values)
assert train_ids.isdisjoint(test_ids)

Examine performance on the test set


In [ ]:
# Build the model's feature matrix and targets for the test split, then score
# it with the persisted regressor.
test_X, test_y = data_prep.process_data_test(test)
# Evaluate against the 10%-discount-rate NPV ROI target.
test_y = test_y['npv_roi_10'].values
# NOTE(review): sklearn.externals.joblib is deprecated; also, unpickling is
# only safe for files you produced yourself.
regr = joblib.load('model_dump/model_0.1.0.pkl')
regr_version = '0.1.0'
test_yhat = regr.predict(test_X)

Short digression on examining feature importances


In [ ]:
test['0.1.0_scores'] = test_yhat

In [ ]:
percentiles = np.arange(0,100,1)

In [ ]:
# Mean realized NPV ROI within each 1%-wide score-percentile bucket.
# Bucket `perc` covers scores in [perc-th percentile, (perc+1)-th percentile).
results_dict = {}
for perc in tqdm_notebook(percentiles):
    low_bound = np.percentile(test['0.1.0_scores'], perc)
    up_bound = np.percentile(test['0.1.0_scores'], perc + 1)
    in_bucket = (test['0.1.0_scores'] >= low_bound)
    if perc == percentiles[-1]:
        # BUG FIX: the top bucket must be inclusive at its upper edge —
        # its up_bound is the 100th percentile (the maximum score), so a
        # strict `<` dropped the highest-scoring loan from every bucket.
        in_bucket &= (test['0.1.0_scores'] <= up_bound)
    else:
        in_bucket &= (test['0.1.0_scores'] < up_bound)
    results_dict[perc] = test[in_bucket]['npv_roi_10'].mean()

In [ ]:
pd.Series(results_dict)

In [ ]:
# Predicted score vs. realized NPV ROI across the test set.
# NOTE(review): '-' connects points in row order (unsorted by score), which
# draws back-and-forth segments — '.' markers may read better.
fig, ax = plt.subplots(figsize=(12, 9))
ax.plot(test['0.1.0_scores'], test['npv_roi_10'], '-')
plt.show()

In [ ]:
def eval_models(trials, port_size, available_loans, regr, regr_version, test, loan_npv_rois,
                default_series):
    """Simulate `trials` random loan portfolios and summarize their returns.

    Each trial draws `available_loans` loan ids at random (without
    replacement) from `test`, scores them with `regr`, and "invests" in the
    `port_size` highest-scoring loans.

    Parameters
    ----------
    trials : int
        Number of simulated portfolios.
    port_size : int
        Loans picked per portfolio.
    available_loans : int
        Loans sampled per trial to pick from.
    regr : fitted estimator
        Anything exposing ``.predict`` over rows of `test`.
    regr_version : str
        Model version tag (currently unused; kept for interface stability).
    test : DataFrame
        Feature matrix indexed by loan id.
    loan_npv_rois : DataFrame
        Realized NPV ROI per loan (indexed by loan id), one column per
        discount rate.
    default_series : Series
        Default indicator per loan (indexed by loan id).

    Returns
    -------
    DataFrame
        One row per trial: mean NPV ROI of the picks at each discount rate,
        plus 'pct_def', the fraction of picks that defaulted.
    """
    results = {}
    pct_default = {}
    test_copy = test.copy()
    for trial in tqdm_notebook(np.arange(trials)):
        loan_ids = np.random.choice(
            test_copy.index.values, available_loans, replace=False)
        loans_to_pick_from = test_copy.loc[loan_ids, :]
        scores = regr.predict(loans_to_pick_from)
        scores_series = pd.Series(dict(zip(loan_ids, scores)))
        scores_series.sort_values(ascending=False, inplace=True)
        # BUG FIX: a leftover debugging `return scores_series` here aborted
        # after the first trial and skipped all of the portfolio bookkeeping
        # below (which was commented out). Also use `port_size` instead of
        # the previously hardcoded 900 so the parameter actually takes effect.
        picks = scores_series[:port_size].index.values
        results[trial] = loan_npv_rois.loc[picks, :].mean().to_dict()
        pct_default[trial] = (default_series.loc[picks].sum()) / port_size
    pct_default_series = pd.Series(pct_default)
    results_df = pd.DataFrame(results).T
    results_df['pct_def'] = pct_default_series
    return results_df

In [ ]:
test_X

In [ ]:
# Simulation parameters, mirroring the baseline-model setup: 20k random
# portfolios, each picking 900 loans from 3000 available per trial.
trials = 20000
port_size = 900
available_loans = 3000
# NOTE(review): passes test_X (processed features) as eval_models' `test`
# argument — confirm regr.predict expects exactly these columns.
model_results = eval_models(trials, port_size, available_loans, regr, regr_version, test_X, loan_npv_rois, default_series)

In [ ]:
model_results

In [ ]:
# Pair every results column with the model version, forming the tuples for a
# (discount_rate, model) MultiIndex.
multi_index = [(col, regr_version) for col in model_results.columns.values]

In [ ]:
# Re-label the results with a (discount_rate, model) MultiIndex so they can
# be merged into the cumulative `results` table.
# BUG FIX: the original aliased model_results (`append_results = model_results`),
# so assigning .columns also silently mutated model_results. Copy instead.
append_results = model_results.copy()
append_results.columns = pd.MultiIndex.from_tuples(multi_index, names=['discount_rate', 'model'])

In [ ]:
# Add this model's columns to the cumulative results table. join() raises
# ValueError when the columns already exist (model evaluated before); in that
# case overwrite that model's slice in place instead.
try:
    results = results.join(append_results)
except ValueError:
    # BUG FIX: the overwrite slice was hardcoded to '0.2.1' while this
    # notebook evaluates regr_version ('0.1.0'); use the actual version so
    # re-runs update the correct model's columns.
    results.loc[:, (slice(None), slice(regr_version, regr_version))] = append_results
results.sort_index(axis=1, inplace=True)

In [ ]:
# Persist the updated results table and pull model metadata for display below.
store.open()
store['results'] = results
model_info = store['model_info']
store.close()

In [ ]:
results.describe()

In [ ]:
model_info

In [ ]: