In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os
import sys
import sklearn
import sqlite3
import matplotlib
import numpy as np
import pandas as pd
import enchant as en
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)
%aimport data
from data import make_dataset as md
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (16.0, 6.0)
plt.rcParams['legend.markerscale'] = 3
matplotlib.rcParams['font.size'] = 16.0
In [8]:
spell_check = en.Dict('en_US')
In [3]:
DIR = os.getcwd() + "/../data/"
df = pd.read_csv(DIR + 'raw/lending-club-loan-data/loan.csv', low_memory=False)
df.head()
Out[3]:
In [5]:
# Apply the project's cleaning step (defined in src/data/make_dataset.py);
# the exact transformations live there and are opaque from this notebook.
t = md.clean_data(df)
t.head()
Out[5]:
In [6]:
# Impute missing values on the cleaned frame (project helper).
t2 = md.impute_missing(t)
t2.head()
Out[6]:
In [9]:
s = 'my spej is bad'
string_list = str(s).split()
errors_list = [spell_check.check(x) for x in string_list]
errors_list
Out[9]:
In [14]:
float(errors_list.count(False)) / len(errors_list)
Out[14]:
In [16]:
# Peek at the string (object-dtype) columns — the spell-check candidates.
t2.select_dtypes(include = ['object']).head()
Out[16]:
In [19]:
# Project helper: adds *_percent_misspelled columns for the free-text fields.
t3 = md.spelling_mistakes(t2)
t3.head()
Out[19]:
In [20]:
# Show the text columns next to their misspelling rates.
t3[['emp_title', 'emp_title_percent_misspelled', 'title', 'title_percent_misspelled']].head()
Out[20]:
In [22]:
# Summary statistics of the misspelling rates.
t3[['emp_title_percent_misspelled', 'title_percent_misspelled']].describe()
Out[22]:
Out[22]:
In [25]:
plt.hist(t3['emp_title_percent_misspelled'], label = 'Employee title')
plt.hist(t3['title_percent_misspelled'], label = Title')
plt.show()
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [25]:
plt.hist(df['annual_inc'].dropna(), bins = 50)
plt.show()
In [52]:
# Drop the people making over $400k a year
df2 = df[df['annual_inc'] <= 400000].copy()
df2.shape, df.shape
Out[52]:
In [58]:
plt.hist(df2['annual_inc'], bins = 50)
plt.axvline(df2['annual_inc'].mean(), label = "${:.2f}".format(df2['annual_inc'].mean()))
plt.title('Annual income (outlier millionaires removed)')
plt.xlabel('USD/year')
plt.legend()
plt.show()
In [63]:
# Since it's just 25 out of 800k, I'll drop
df2['delinq_2yrs'].value_counts(dropna = False)
df3 = df2[df2['delinq_2yrs'].isnull() == False]
df3.shape, df2.shape
Out[63]:
In [67]:
df3['inq_last_6mths'].isnull().value_counts()
Out[67]:
In [72]:
df3['revol_util'].isnull().value_counts()
Out[72]:
In [76]:
# Distribution of revolving credit utilization.
plt.hist(df3['revol_util'].dropna(), bins = 50)
plt.show()
In [83]:
# Fraction of borrowers using more than 100% of their revolving credit.
# NOTE(review): this expression's value is discarded — it is not the
# cell's last line, so it never displays.
float(df3[df3['revol_util'] > 100].shape[0]) / df3.shape[0]
# Let's drop people using > 100% their credit rate
# NOTE(review): the comment says "> 100" but the filter below keeps rows up
# to 130 — confirm which threshold is intended. Rows with NaN revol_util
# are also dropped here, since NaN fails the <= comparison.
df4 = df3[df3['revol_util'] <= 130].copy()
df4.shape, df3.shape
Out[83]:
In [84]:
# Utilization distribution after the trim.
df4['revol_util'].hist(bins = 50)
Out[84]:
In [90]:
df5 = df4[df4['collections_12_mths_ex_med'].isnull() == False].copy()
df5.shape, df4.shape
Out[90]:
In [105]:
# Imputing using the mean
df5['tot_coll_amt'].fillna(value = df5['tot_coll_amt'].mean(), inplace = True)
In [110]:
df5['tot_cur_bal'].hist(bins = 50)
Out[110]:
In [115]:
df5[df5['tot_cur_bal'].isnull() == True][cols_to_keep].tail()
Out[115]:
In [116]:
# I think it makes sense to set these to zero - these are people who's current balance is NaN
# because their loan is finished/paid off
df5['tot_cur_bal'].fillna(value = 0, inplace = True)
In [8]:
# Let's just drop this column
cols_to_keep.remove('total_rev_hi_lim')
len(cols_to_keep)
Out[8]:
In [10]:
print cols_to_keep
In [ ]:
for i in cols_to_keep:
if df5[i]
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [5]:
# Share of emp_title values that pass/fail the spell check.
# NOTE(review): enchant's check() requires a string — this will raise if
# emp_title contains NaN floats; confirm the column was cleaned first.
df['emp_title'].apply(lambda x: spell_check.check(x)).value_counts() / len(df['emp_title'])
Out[5]:
In [18]:
# NOTE(review): `objects` is not defined anywhere above — hidden kernel
# state; presumably a list of the object-dtype column names. Verify.
df[objects].head()
Out[18]:
In [19]:
df['num_spell_errors'] = 0
for s in ['emp_title', 'title']:
df['num_spell_errors'] += df[s].apply(lambda x: spell_check.check(x)) == True
df[['emp_title', 'title', 'num_spell_errors']].head()
Out[19]:
In [24]:
df[['emp_title', 'title', 'num_spell_errors']].head(20)
Out[24]:
In [28]:
df[['title', df['title'].apply(lambda x: spell_check.check(x))]].head()
In [26]:
df['purpose'].value_counts()
Out[26]:
In [ ]:
In [ ]:
In [ ]:
In [4]:
# Feature lists for the interest-rate model.
# NOTE(review): "catagorical"/"continous" are misspelled, but later cells
# reference these names, so they are kept as-is.
catagorical_cols = [
'application_type', 'initial_list_status',
'purpose', 'pymnt_plan', 'verification_status',
'emp_length', 'term'
]
continous_cols = [
'loan_amnt','funded_amnt','funded_amnt_inv','installment',
'dti','revol_bal'
]
# Target column (interest rate).
y_col = ['int_rate']
In [5]:
df_data = df[catagorical_cols + continous_cols]
In [6]:
# Converted columns to floating point
for feature_name in continous_cols:
df_data[feature_name] = df_data[feature_name].astype(float)
In [7]:
data = pd.get_dummies(df_data)
In [8]:
data.tail(3)
Out[8]:
In [9]:
x = data.values[:, :]
y = df[y_col].values[:,-1]
In [10]:
# NOTE(review): superseded manual LabelEncoder/OneHotEncoder pipeline,
# replaced by pd.get_dummies above — kept commented for reference.
# def encode_categorical(array):
# if not array.dtype == np.dtype('float64'):
# return LabelEncoder().fit_transform(array)
# else:
# return array
# # Categorical columns for use in one-hot encoder
# categorical = (df_data.dtypes.values != np.dtype('float64'))
# # Encode all labels
# data = df_data.apply(encode_categorical)
# # Get numpy array from data
# x = data.values[:, :-1]
# y = data.values[:, -1]
# # Apply one hot encoding
# encoder = OneHotEncoder(categorical_features=categorical[:-1], sparse=False) # Last value in mask is y
# x = encoder.fit_transform(x)
In [11]:
plt.hist(y, bins=10) # plt.hist passes it's arguments to np.histogram
plt.axvline(np.mean(y), color='black', linestyle='-', lw=6, label='Mean Interest rate')
plt.axvline(np.mean(y) - np.std(y), color='black', linestyle='--', lw=2, label='Std')
plt.axvline(np.mean(y) + np.std(y), color='black', linestyle='--', lw=2)
plt.title("Histogram of Interest Rates, Mean of {:0.2f}%".format(np.mean(y)))
plt.legend()
plt.show()