In [1]:
import numpy as np
import pandas as pd
from transfers.fileio import load_from_pickle
from helpers.preprocessing import process_features, process_payment
from model.validate import actual_IRR

In [2]:
%reload_ext rpy2.ipython

In [3]:
%%R
library(ggplot2)
library(scales)
library(grid)

In [4]:
df_3c = pd.read_csv('data/LoanStats3c_securev1.csv', header=True).iloc[:-2, :]
df_3b = pd.read_csv('data/LoanStats3b_securev1.csv', header=True).iloc[:-2, :]
df_3a = pd.read_csv('data/LoanStats3a_securev1.csv', header=True).iloc[:-2, :]

df_raw = pd.concat((df_3a, df_3b, df_3c), axis=0)
df_raw = df_raw[df_raw['issue_d'].str.contains('20', na=False)]
df_raw['year'] = df_raw['issue_d'].map(lambda x: str(x).split('-')[1])


/Users/savarin/anaconda/envs/py27/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (0,19) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)
/Users/savarin/anaconda/envs/py27/lib/python2.7/site-packages/pandas/io/parsers.py:1139: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)

In [5]:
df_risk = df_3a.copy()

df_risk.rename(columns={'sub_grade': 'subgrade'}, inplace=True)

df_risk['rate'] = df_risk['int_rate'].map(lambda x: float(str(x).strip('%')) / 100)

df_risk['default'] = df_risk['loan_status'].map({
                             'Fully Paid': 0, 
                             'Does not meet the credit policy.  Status:Fully Paid': 0, 
                             'Current': 0,
                             'Does not meet the credit policy.  Status:Current': 0, 
                             'Charged Off': 1, 
                             'Does not meet the credit policy.  Status:Charged Off': 1,
                             'In Grace Period': 0,
                             'Does not meet the credit policy.  Status:In Grace Period': 1,
                             'Late (16-30 days)': 1,
                             'Does not meet the credit policy.  Status:Late (16-30 days)': 1,
                             'Late (31-120 days)': 1,
                             'Does not meet the credit policy.  Status:Late (31-120 days)': 1,
                             'Default': 1})

df_risk = df_risk[df_risk['grade'].isin(['A', 'B', 'C', 'D'])]\
                                  .groupby('subgrade', as_index=False)\
                                  .mean()[['rate', 'default', 'subgrade']]

%Rpush df_risk

In [6]:
%%R -w 800

multiplot = function(..., plotlist=NULL, file, cols=1, layout=NULL) {
    require(grid)

    # Make a list from the ... arguments and plotlist
    plots = c(list(...), plotlist)

    numPlots = length(plots)

    # If layout is NULL, then use 'cols' to determine layout
    if (is.null(layout)) {
        # Make the panel
        # ncol: Number of columns of plots
        # nrow: Number of rows needed, calculated from # of cols
        layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                        ncol = cols, nrow = ceiling(numPlots/cols))
    }

    if (numPlots==1) {
        print(plots[[1]])
    } else {
        # Set up the page
        grid.newpage()
        pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

        # Make each plot, in the correct location
        for (i in 1:numPlots) {
          # Get the i,j matrix positions of the regions that contain this subplot
          matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

          print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                          layout.pos.col = matchidx$col))
      }
    }
}


p1 = ggplot(df_risk, aes(x = factor(subgrade), y = rate, fill=factor(subgrade))) + 
geom_bar(stat = "identity") +
scale_x_discrete(limits = rev(levels(df_risk$subgrade))) +
scale_y_reverse(labels=percent) +
ylab("Interest rate") +
guides(fill=FALSE) + 
theme(text = element_text(size=15), axis.title.y = element_blank()) +
coord_flip() 

p2 = ggplot(df_risk, aes(x = factor(subgrade), y = default, fill=factor(subgrade))) + 
geom_bar(stat = "identity") + 
scale_x_discrete(limits = rev(levels(df_risk$subgrade))) +
scale_y_continuous(labels=percent) + 
ylab("Default rate") +
guides(fill=FALSE) + 
theme(text = element_text(size=15), axis.title.y = element_blank()) +
coord_flip()

multiplot(p1, p2, cols=2)



In [7]:
df_volume = df_raw.groupby('year', as_index=False).sum()[['year', 'loan_amnt']]

%Rpush df_volume

In [8]:
%%R -w 800
volume_formatter = function(x) {h = sprintf('$%d bn',floor(x/1000000000))}

ggplot(df_volume, aes(x = factor(year), y = loan_amnt, fill=factor(year))) + 
geom_bar(stat = "identity") +
scale_fill_brewer() +
scale_y_continuous("Total issuance on Lending Club", label=volume_formatter) + 
guides(fill=FALSE) + 
theme(text = element_text(size=15), axis.title.x = element_blank())



In [9]:
print "Loading model..."
try:
    model = load_from_pickle('pickle/model.pkl')
except (OSError, IOError):
    print "Model not found. Initializing training process, this might take some time..."
    model = initialize_model()

print "Calculating expected IRR of loans that have matured..."
df_expected = process_features(df_3a.copy(), restrict_date=True, current_loans=False)

# To make a fair comparison, the same set of interest rates are used. The 
# enclosed set is from Dec 2014.
int_rate_dict = {'A1':0.0603, 'A2':0.0649, 'A3':0.0699, 'A4':0.0749, 'A5':0.0819,
                 'B1':0.0867, 'B2':0.0949, 'B3':0.1049, 'B4':0.1144, 'B5':0.1199,
                 'C1':0.1239, 'C2':0.1299, 'C3':0.1366, 'C4':0.1431, 'C5':0.1499,
                 'D1':0.1559, 'D2':0.1599, 'D3':0.1649, 'D4':0.1714, 'D5':0.1786}

IRR_predict = model.expected_IRR(df_expected, actual_rate=False, rate_dict=int_rate_dict)

print "Calculating actual IRR of loans that have matured..."
df_actual = process_payment(df_3a.copy())
IRR_true = actual_IRR(df_actual, actual_rate=False, rate_dict=int_rate_dict)

print "Collecting results..."
df_result = df_expected[['id', 'sub_grade']].copy()

df_result['IRR_predict'] = IRR_predict
df_result['IRR_true'] = IRR_true


Loading model...
Calculating expected IRR of loans that have matured...
helpers/preprocessing.py:138: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
  'Charged Off': 0.})
helpers/preprocessing.py:140: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
  df['int_rate'] = df['int_rate'].map(lambda x: float(str(x).strip('%')) / 100)
helpers/preprocessing.py:142: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
  df['term'] = df['term'].map(lambda x: int(str(x).strip(' months')))
helpers/preprocessing.py:146: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
  df['emp_length'] = df['emp_length'].map(lambda x: '0.5 years' if x == '< 1 year' else x)
helpers/preprocessing.py:147: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
  df['emp_length'] = df['emp_length'].map(lambda x: '10 years' if x == '10+ years' else x)
helpers/preprocessing.py:148: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
  df['emp_length'] = df['emp_length'].map(lambda x: '-1 years' if x == 'n/a' else x)
helpers/preprocessing.py:149: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
  df['emp_length'] = df['emp_length'].map(lambda x: float(x.strip(' years')))
helpers/preprocessing.py:154: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
  df['emp_length'] = df['emp_length'].map(lambda x: emp_length_mean if x < 0 else x)
helpers/preprocessing.py:156: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
  df['annual_inc'] = df['annual_inc'].map(lambda x: float(x) / 12)
/Users/savarin/anaconda/envs/py27/lib/python2.7/site-packages/pandas/core/frame.py:2302: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame
  **kwargs)
helpers/preprocessing.py:159: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
  df['fico_range_low'] = (df['fico_range_low'] + df['fico_range_high']) / 2.
helpers/preprocessing.py:167: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
  else x['earliest_cr_line'], axis=1)
helpers/preprocessing.py:171: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
  '%b-%Y')).days / 30, axis=1)
helpers/preprocessing.py:175: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
  df['revol_util'] = df['revol_util'].map(lambda x: float(str(x).strip('%')) / 100)
helpers/preprocessing.py:180: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
  df['revol_util'] = df['revol_util'].fillna(revol_util_mean)
helpers/preprocessing.py:185: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
  df['last_delinq'] = df['last_delinq'].map(lambda x: -1 if x == 'n/a' else x)
helpers/preprocessing.py:187: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
  df['last_record'] = df['last_record'].map(lambda x: -1 if x == 'n/a' else x)
helpers/preprocessing.py:189: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
  df['last_derog'] = df['last_derog'].map(lambda x: -1 if x == 'n/a' else x)
Calculating actual IRR of loans that have matured...
Collecting results...

In [10]:
sub_grade_range = ['A1', 'A2', 'A3', 'A4', 'A5', 
                   'B1', 'B2', 'B3', 'B4', 'B5',
                   'C1', 'C2', 'C3', 'C4', 'C5',
                   'D1', 'D2', 'D3', 'D4', 'D5']

average_list = []
top_list = []
bottom_list = []

for sub_grade in sub_grade_range:
    df_sub_grade = df_result[df_result['sub_grade'] == sub_grade]
    df_sub_grade['quartile'] = pd.qcut(df_sub_grade['IRR_predict'], 4, False)
    
    id_top = df_sub_grade[df_sub_grade['quartile'] == 3]['id'].values
    id_bottom = df_sub_grade[df_sub_grade['quartile'] == 0]['id'].values
    
    average_IRR = df_sub_grade.mean()['IRR_true']
    top_IRR = df_result[df_result['id'].isin(id_top)]['IRR_true'].values.mean()
    bottom_IRR = df_result[df_result['id'].isin(id_bottom)]['IRR_true'].values.mean()
    
    average_list.append(average_IRR)
    top_list.append(top_IRR)
    bottom_list.append(bottom_IRR)

In [11]:
df_strategy = pd.DataFrame({'sub_grade':xrange(60), 'IRR':top_list + average_list + bottom_list})
df_strategy['group'] = df_strategy['sub_grade'].map(lambda x: x/20).map({0:'top quartile', 1:'mean', 2:'bottom quartile'})
df_strategy['sub_grade'] = df_strategy['sub_grade'].map(lambda x: x%20) \
                                                   .map({0:'A1', 1:'A2', 2:'A3', 3:'A4', 4:'A5', 
                                                         5:'B1', 6:'B2', 7:'B3', 8:'B4', 9:'B5',
                                                         10:'C1', 11:'C2', 12:'C3', 13:'C4', 14:'C5',
                                                         15:'D1', 16:'D2', 17:'D3', 18:'D4', 19:'D5'})
    
%Rpush df_strategy

In [12]:
%%R -w 800
ggplot(data=df_strategy, aes(x=factor(sub_grade), y=IRR, group=group, color=group)) + 
geom_line() + 
geom_point() + 
scale_y_continuous("Rate of return", breaks=pretty_breaks(5), labels=percent) +
theme(text = element_text(size=15), axis.title.x = element_blank(), legend.position=c(0.9, 0.125)) + 
guides(col = guide_legend(reverse = TRUE))



In [13]:
feature_importance = np.zeros(36,)

for grade in ['A', 'B', 'C', 'D']:
    for submodel in model.model_dict[grade]:
        feature_importance += submodel.feature_importances_

weights = []
features = []
for item in sorted(zip(feature_importance/144, model.features), reverse=True)[:10]:
    weights.append(item[0])
    features.append(item[1])
    
df_weights = pd.DataFrame({'weights':weights, 'features':features})

%Rpush df_weights

In [14]:
%%R -w 800

df_weights$features = factor(df_weights$features, levels = df_weights$features[rev(order(df_weights$weights))])

ggplot(df_weights, aes(x = factor(features), y = weights, fill=factor(features))) + geom_bar(stat = "identity") +
scale_y_continuous("Feature importance weights", labels=percent) + 
theme(text = element_text(size=15), axis.title.x = element_blank(), legend.position=c(0.885, 0.74), axis.text.x=element_blank())



In [15]:
model.expected_IRR(df_expected.iloc[0:1,:], actual_rate=False, rate_dict=int_rate_dict)

X = df_expected.iloc[:1,:][model.features].values
X_sub_grade = df_expected.iloc[:1,:]['sub_grade'].values

expected_payout = model.get_expected_payout(X, X_sub_grade)
df_probability = pd.DataFrame({'period':xrange(1,37), 'probability':expected_payout.ravel()})

%Rpush df_probability

In [16]:
%%R -w 800
period_formatter = function(x) {h = sprintf('%d mth', x)}

ggplot(df_probability, aes(x=period, y=probability, fill=period, width=0.948)) + 
geom_bar(stat="identity") +
scale_y_continuous("Probability of receiving payment") + 
scale_x_continuous(breaks=seq(0, 36, 12), label=period_formatter) +
theme(text=element_text(size=15), axis.title.x=element_blank(), legend.position=c(0.95, 0.85))



In [17]:
IRR_predict_mean = list(df_result.groupby('sub_grade', as_index=False).mean()['IRR_predict'].values)
IRR_true_mean = list(df_result.groupby('sub_grade', as_index=False).mean()['IRR_true'].values)
headline_rate = sorted(int_rate_dict.values())

df_compare = pd.DataFrame({'sub_grade':xrange(60), 'IRR':IRR_predict_mean + IRR_true_mean + headline_rate})
df_compare['group'] = df_compare['sub_grade'].map(lambda x: x/20).map({0:'predict', 1:'true', 2:'headline'})
df_compare['sub_grade'] = df_compare['sub_grade'].map(lambda x: x%20).map({0:'A1', 1:'A2', 2:'A3', 3:'A4', 4:'A5', 
                                                                           5:'B1', 6:'B2', 7:'B3', 8:'B4', 9:'B5',
                                                                           10:'C1', 11:'C2', 12:'C3', 13:'C4', 14:'C5',
                                                                           15:'D1', 16:'D2', 17:'D3', 18:'D4', 19:'D5'})

%Rpush df_compare

In [18]:
%%R -w 800
ggplot(data=df_compare, aes(x=factor(sub_grade), y=IRR, group=group, color=group)) + 
geom_line() + 
geom_point() + 
scale_y_continuous("Rate of return", breaks=pretty_breaks(6), labels=percent) + 
theme(text = element_text(size=15), axis.title.x = element_blank(), legend.position=c(0.925, 0.125))



In [18]: