In [1]:

    
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)

# This line will hide code by default when the notebook is exported as HTML
di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

# This line will add a button to toggle visibility of code blocks, for use with the HTML export version
di.display_html('''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">Toggle code</button>''', raw=True)

Lending Club Data Explorations

Preprocess Data



In [2]:

    
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib



In [1]:

    
# Import libraries
from __future__ import absolute_import, division, print_function

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.externals import joblib

# Graphing Libraries
import matplotlib.pyplot as pyplt
import seaborn as sns
sns.set_style("white")



In [4]:

    
def get_item_description(label):
    """
    Print the description for an item label based on the data dictionary
    """
    print(df_data_dictionary[df_data_dictionary.Item == label].Description.values[0])



In [5]:

    
import datetime
import calendar

def convert_date_months(col):
    """
    Convert the date to difference from this month
    
    Attributes
    ------------
    col: pandas series
    """
    today = datetime.date.today()
    tmp_2 = pd.to_datetime(col)
    tmp_2 = today - tmp_2
    tmp_2 = tmp_2.astype('timedelta64[M]').astype(int)
    
    return tmp_2

def convert_date_days(col):
    """
    Convert the date to difference from this day
    
    Attributes
    ------------
    col: pandas series
    """
    today = datetime.date.today()
    tmp_2 = pd.to_datetime(col)
    tmp_2 = today - tmp_2
    tmp_2 = tmp_2.astype('timedelta64[D]').astype(int)
    
    return tmp_2


def convert_num_day_of_week(x):
    """
    Convert the date to numeric day of the week
    
    Attributes
    ------------
    x: datetime obj
    """
    my_date = pd.to_datetime(x) 
    return my_date.weekday()

def convert_day_of_week(x):
    """
    Convert the date to string day of the week
    
    Attributes
    ------------
    x: datetime obj
    """
    my_date = pd.to_datetime(x) 
    return calendar.day_name[my_date.weekday()]

def convert_week_of_year(x):
    """
    Convert the date to the week of the year
    
    Attributes
    ------------
    x: datetime obj
    """
    my_date = pd.to_datetime(x) 
    return my_date.isocalendar()[1]



In [6]:

    
import operator

def find_missing_values(df):
    """
    df: pandas dataframe

    Returns sorted list of features by the percentage missing values
    """
    count_nan = (len(df) - df.count()) / len(df)
    percentage_nan = {}

    for item in range (0, len(count_nan)):
        if count_nan[item] > 0:
            percentage_nan[count_nan.index[item]] = round(count_nan[item], 4)
    
    return sorted(percentage_nan.items(), key=operator.itemgetter(1))



In [7]:

    
def get_var_type(df):
    """
    Automatically determine variable type
    Attribute:
    ------------
    df: pandas dataframe
    """
    
    continuous_vars = []
    rest = []
    for cols in df.columns:
        a = type(df.ix[0, cols])== np.float64 and pd.notnull(df.ix[0, cols])
        b = type(df.ix[0, cols])== float and pd.notnull(df.ix[0, cols])
        if (a) or (b):
            continuous_vars.append(cols)
        else:
            rest.append(cols)
            
    return (continuous_vars, rest)

Load Data



In [2]:

    
dataPath = 'data'
df = pd.read_csv(dataPath+'/LoanStats3a.csv')
df_b = pd.read_csv(dataPath+'/LoanStats3b.csv')
df_c = pd.read_csv(dataPath+'/LoanStats3c.csv')

df = df.append(df_b)
df = df.append(df_c)
df = df.reset_index()
print ("Dataset has {} samples with {} features each.".format(*df.shape))









    



Dataset has 379058 samples with 97 features each.

Load data dictionary



In [9]:

    
xls_file = pd.ExcelFile(dataPath+'/LCDataDictionary.xlsx')
xls_file.sheet_names









    Out[9]:





[u'LoanStats', u'browseNotes', u'RejectStats']



In [10]:

    
df_data_dictionary = xls_file.parse('LoanStats')
df_data_dictionary.columns = [u'Item', u'Description']
df_ = xls_file.parse('browseNotes')
df_.columns = df_data_dictionary.columns
df_data_dictionary = df_data_dictionary.append(df_)



In [11]:

    
df_data_dictionary.head()









    Out[11]:






  
    
      
      Item
      Description
    
  
  
    
      0
      acc_now_delinq
      The number of accounts on which the borrower i...
    
    
      1
      acc_open_past_24mths
      Number of trades opened in past 24 months.
    
    
      2
      accept_d
      The date which the borrower accepted the offer
    
    
      3
      addr_city
      The  city provided by the borrower in the loan...
    
    
      4
      addr_state
      The state provided by the borrower in the loan...



In [12]:

    
df.head()









    Out[12]:






  
    
      
      index
      id
      member_id
      loan_amnt
      funded_amnt
      funded_amnt_inv
      term
      int_rate
      installment
      grade
      ...
      num_il_tl
      mo_sin_old_il_acct
      num_actv_rev_tl
      mo_sin_old_rev_tl_op
      mo_sin_rcnt_rev_tl_op
      total_rev_hi_lim
      num_rev_tl_bal_gt_0
      num_op_rev_tl
      tot_coll_amt
      policy_code
    
  
  
    
      0
      0
      1077501
      1296599.0
      5000.0
      5000.0
      4975.0
      36 months
      10.65%
      162.87
      B
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      1.0
    
    
      1
      1
      1077430
      1314167.0
      2500.0
      2500.0
      2500.0
      60 months
      15.27%
      59.83
      C
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      1.0
    
    
      2
      2
      1077175
      1313524.0
      2400.0
      2400.0
      2400.0
      36 months
      15.96%
      84.33
      C
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      1.0
    
    
      3
      3
      1076863
      1277178.0
      10000.0
      10000.0
      10000.0
      36 months
      13.49%
      339.31
      C
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      1.0
    
    
      4
      4
      1075358
      1311748.0
      3000.0
      3000.0
      3000.0
      60 months
      12.69%
      67.79
      B
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      1.0
    
  

5 rows × 97 columns

The first thing I am going to do is find the percentage of missing values in my dataset



In [13]:

    
missing_df = find_missing_values(df)

cols = [x[0] for x in missing_df]
vals = [x[1] for x in missing_df]

pyplt.rcParams['figure.figsize'] = (8, 16)
percentage_nan_frame = pd.DataFrame({ 'Percentage': vals,'Feature': cols })
ax = percentage_nan_frame.plot(kind = 'barh',  width = 0.4, x = 'Feature', color = 'slateblue', 
                               title = "Features Mapped By Percentage Missing Values")
pyplt.grid(True)
pyplt.savefig('report/figures/missing_features.png', format='png', dpi=200)

# reset figure size
pyplt.rcParams['figure.figsize'] = (6, 4)

To make my analysis tractable given the time in which this analysis must be completed, I am going to restrict myself to features for which less than 20% are missing.

Am dropping url, index, id, member_id, emp_title, title



In [14]:

    
drop_list = ['url', 'index', 'id', 'member_id', 'emp_title', 'title']
for col, val in missing_df:
    if val > 0.2:
        drop_list.append(col)



In [15]:

    
df.drop(drop_list, axis=1, inplace=True)



In [16]:

    
df.head()









    Out[16]:






  
    
      
      loan_amnt
      funded_amnt
      funded_amnt_inv
      term
      int_rate
      installment
      grade
      sub_grade
      emp_length
      home_ownership
      ...
      recoveries
      collection_recovery_fee
      last_pymnt_d
      last_pymnt_amnt
      last_credit_pull_d
      pub_rec_bankruptcies
      chargeoff_within_12_mths
      collections_12_mths_ex_med
      tax_liens
      policy_code
    
  
  
    
      0
      5000.0
      5000.0
      4975.0
      36 months
      10.65%
      162.87
      B
      B2
      10+ years
      RENT
      ...
      0.00
      0.00
      2014-08-11
      162.87
      2014-08-21
      0.0
      0.0
      0.0
      0.0
      1.0
    
    
      1
      2500.0
      2500.0
      2500.0
      60 months
      15.27%
      59.83
      C
      C4
      < 1 year
      RENT
      ...
      117.08
      1.11
      2013-04-30
      119.66
      2013-09-23
      0.0
      0.0
      0.0
      0.0
      1.0
    
    
      2
      2400.0
      2400.0
      2400.0
      36 months
      15.96%
      84.33
      C
      C5
      10+ years
      RENT
      ...
      0.00
      0.00
      2014-06-10
      649.91
      2014-08-21
      0.0
      0.0
      0.0
      0.0
      1.0
    
    
      3
      10000.0
      10000.0
      10000.0
      36 months
      13.49%
      339.31
      C
      C1
      10+ years
      RENT
      ...
      0.00
      0.00
      2014-08-08
      339.31
      2014-08-21
      0.0
      0.0
      0.0
      0.0
      1.0
    
    
      4
      3000.0
      3000.0
      3000.0
      60 months
      12.69%
      67.79
      B
      B5
      1 year
      RENT
      ...
      0.00
      0.00
      2014-08-08
      67.79
      2014-08-21
      0.0
      0.0
      0.0
      0.0
      1.0
    
  

5 rows × 50 columns

Automatically detect column type



In [17]:

    
continuous_vars, rest = get_var_type(df)

Categorical variable

'term',
'grade',
'sub_grade',
'emp_length',
'home_ownership',
'is_inc_v',
'loan_status',
'pymnt_plan',
'purpose',
'addr_city',
'addr_state',
'revol_util',
'initial_list_status',

Dates

'accept_d',
'exp_d',
'list_d',
'issue_d',
'earliest_cr_line'
'last_pymnt_d',
'last_credit_pull_d'



In [18]:

    
display(df[continuous_vars[0:10]].head(1))
display(df[continuous_vars[10:20]].head(1))
display(df[continuous_vars[20:]].head(1))









    






  
    
      
      loan_amnt
      funded_amnt
      funded_amnt_inv
      installment
      annual_inc
      acc_now_delinq
      dti
      delinq_2yrs
      delinq_amnt
      inq_last_6mths
    
  
  
    
      0
      5000.0
      5000.0
      4975.0
      162.87
      24000.0
      0.0
      27.65
      0.0
      0.0
      1.0
    
  








    






  
    
      
      open_acc
      pub_rec
      revol_bal
      total_acc
      out_prncp
      out_prncp_inv
      total_pymnt
      total_pymnt_inv
      total_rec_prncp
      total_rec_int
    
  
  
    
      0
      3.0
      0.0
      13648.0
      9.0
      801.87
      797.81
      5037.97
      5012.83
      4198.13
      839.84
    
  








    






  
    
      
      total_rec_late_fee
      recoveries
      collection_recovery_fee
      last_pymnt_amnt
      pub_rec_bankruptcies
      chargeoff_within_12_mths
      collections_12_mths_ex_med
      tax_liens
      policy_code
    
  
  
    
      0
      0.0
      0.0
      0.0
      162.87
      0.0
      0.0
      0.0
      0.0
      1.0

Data Preprocessing



In [19]:

    
get_item_description('revol_util')









    



Revolving line utilization rate, or the amount of credit the borrower is using relative to all available revolving credit.



In [20]:

    
df['int_rate'] = df['int_rate'].str.extract('(\d+.\d+)')
df['int_rate'] = df['int_rate'].map(lambda x: float(x))



In [21]:

    
df['revol_util'] = df['revol_util'].str.extract('(\d+.\d+)')
df['revol_util'] = df['revol_util'].map(lambda x: float(x))
df['revol_util'] = df['revol_util'].fillna(-1) # that is 25% missing, so filling with -1 to indicate missing



In [22]:

    
df.shape









    Out[22]:





(379058, 50)



In [23]:

    
df = df.dropna()
df.shape









    Out[23]:





(331476, 50)



In [24]:

    
categorical_var = ['term','grade','sub_grade',
 'emp_length','home_ownership','is_inc_v','pymnt_plan','purpose',
 'addr_city','addr_state','revol_util','initial_list_status']

date_var = ['accept_d','list_d','exp_d','issue_d',
'earliest_cr_line','last_pymnt_d','last_credit_pull_d']

Deal with dates



In [25]:

    
for cols in date_var:
    print (cols)
    df[cols+'_months'] = convert_date_months(df[cols])
    df[cols+'_days'] = convert_date_days(df[cols])









    



accept_d
list_d
exp_d
issue_d
earliest_cr_line
last_pymnt_d
last_credit_pull_d

Deal with categorical variables



In [26]:

    
display(df[categorical_var[0:10]].head(1))
display(df[categorical_var[10:]].head(1))









    






  
    
      
      term
      grade
      sub_grade
      emp_length
      home_ownership
      is_inc_v
      pymnt_plan
      purpose
      addr_city
      addr_state
    
  
  
    
      0
      36 months
      B
      B2
      10+ years
      RENT
      Verified
      n
      credit_card
      FLAGSTAFF
      AZ
    
  








    






  
    
      
      revol_util
      initial_list_status
    
  
  
    
      0
      83.7
      f



In [27]:

    
from sklearn.preprocessing import LabelEncoder


number = LabelEncoder()
df['term_old'] = df['term']  
df['term'] = number.fit_transform(df['term'].astype(str))


df['addr_city_old'] = df['addr_city']
df['addr_city'] = number.fit_transform(df['addr_city'].astype(str))

df['addr_state_old'] = df['addr_state']
df['addr_state'] = number.fit_transform(df['addr_state'].astype(str))

df['home_ownership_old'] = df['home_ownership']
df['home_ownership'] = number.fit_transform(df['home_ownership'].astype(str))

df['purpose_old'] = df['purpose']
df['purpose'] = number.fit_transform(df['purpose'].astype(str))

df['pymnt_plan_old'] = df['pymnt_plan']
df['pymnt_plan'] = number.fit_transform(df['pymnt_plan'].astype(str))

df['is_inc_v_old'] = df['is_inc_v']
df['is_inc_v'] = number.fit_transform(df['is_inc_v'].astype(str))

df['initial_list_status_old'] = df['initial_list_status']
df['initial_list_status'] = number.fit_transform(df['initial_list_status'].astype(str))


df['emp_length_old'] = df['emp_length']
df['emp_length'] = number.fit_transform(df['emp_length'].astype(str))

df['grade_old'] = df['grade']
df['grade'] = number.fit_transform(df['grade'].astype(str))

df['sub_grade_old'] = df['sub_grade']
df['sub_grade'] = number.fit_transform(df['sub_grade'].astype(str))



In [28]:

    
new_grade_var = [x for x in df.columns if 'grade' in x]
display(df[new_grade_var].head(1))









    






  
    
      
      grade
      sub_grade
      grade_old
      sub_grade_old
    
  
  
    
      0
      1
      6
      B
      B2

Derive Features

loan_rank: 1 for bad loans, 0 for good loans
- Good loans are the ones where the status is
  - current
  - fully paid
- Bad loans are the ones where the status is
  - default
  - in grace period
  - late
  - charged off
Extract day of the week from dates
Extract week of the year from dates

Derive Features

Loan Rank



In [29]:

    
data = df['loan_status']
data.dropna(inplace = True)



In [30]:

    
df['loan_status'].value_counts()









    Out[30]:





Current                                                        245953
Fully Paid                                                      62790
Charged Off                                                     13451
Late (31-120 days)                                               3876
In Grace Period                                                  2001
Does not meet the credit policy.  Status:Fully Paid              1444
Late (16-30 days)                                                 981
Does not meet the credit policy.  Status:Charged Off              523
Default                                                           356
Does not meet the credit policy.  Status:Current                   92
Does not meet the credit policy.  Status:Late (31-120 days)         6
Does not meet the credit policy.  Status:In Grace Period            1
Does not meet the credit policy.  Status:Late (16-30 days)          1
Does not meet the credit policy.  Status:Default                    1
Name: loan_status, dtype: int64



In [31]:

    
index_ = df['loan_status'][df['loan_status'].str.contains('(Does not meet the credit*)')].index

I did some research and discovered that loans with the prefix "Does not meet the credit.." in loan_status, are past loans that were granted before Lending Club did a major change to their lending policy. These represent less than 1% of all loans in the dataset. Moving forward, I will discard these loans from my analysis.



In [32]:

    
df.ix[index_, 'loan_status'].count() / len(df) * 100









    Out[32]:





0.62387623840036688



In [33]:

    
df.drop(df.index[index_], inplace=True)



In [34]:

    
# Create a set of dummy variables from the 'loan_status' variable
df_loan_status = pd.get_dummies(df['loan_status'])



In [35]:

    
df['loan_rank'] =  df_loan_status[u'Default'] +  df_loan_status[u'In Grace Period'] + \
                   df_loan_status[u'Late (16-30 days)'] + df_loan_status[ u'Late (31-120 days)'] + \
                   df_loan_status[u'Charged Off']
    
df.drop(['loan_status'], axis = 1, inplace = True)

Derive Features

Extract day of the week from dates
Extract week of the year from dates



In [36]:

    
for cols in date_var:
    print (cols)
    df[cols+'_num_day'] = df[cols].apply(lambda x: convert_num_day_of_week(x)) 
    df[cols+'_day'] = df[cols].apply(lambda x: convert_day_of_week(x))
    df[cols+'_week_of_year'] = df[cols].apply(lambda x: convert_week_of_year(x))









    



accept_d
list_d
exp_d
issue_d
earliest_cr_line
last_pymnt_d
last_credit_pull_d

Save Dataset to file



In [37]:

    
joblib.dump(df, dataPath+'/df_cleaned.pkl')









    Out[37]:





['data/df_cleaned.pkl']

	Item	Description
0	acc_now_delinq	The number of accounts on which the borrower i...
1	acc_open_past_24mths	Number of trades opened in past 24 months.
2	accept_d	The date which the borrower accepted the offer
3	addr_city	The city provided by the borrower in the loan...
4	addr_state	The state provided by the borrower in the loan...

	index	id	member_id	loan_amnt	funded_amnt	funded_amnt_inv	term	int_rate	installment	grade	...	num_il_tl	mo_sin_old_il_acct	num_actv_rev_tl	mo_sin_old_rev_tl_op	mo_sin_rcnt_rev_tl_op	total_rev_hi_lim	num_rev_tl_bal_gt_0	num_op_rev_tl	tot_coll_amt	policy_code
0	0	1077501	1296599.0	5000.0	5000.0	4975.0	36 months	10.65%	162.87	B	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1.0
1	1	1077430	1314167.0	2500.0	2500.0	2500.0	60 months	15.27%	59.83	C	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1.0
2	2	1077175	1313524.0	2400.0	2400.0	2400.0	36 months	15.96%	84.33	C	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1.0
3	3	1076863	1277178.0	10000.0	10000.0	10000.0	36 months	13.49%	339.31	C	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1.0
4	4	1075358	1311748.0	3000.0	3000.0	3000.0	60 months	12.69%	67.79	B	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1.0