In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os
import sys
import sklearn
import sqlite3
import matplotlib

import numpy as np
import pandas as pd
import enchant as en
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)
%aimport data
from data import make_dataset as md

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (16.0, 6.0)
plt.rcParams['legend.markerscale'] = 3
matplotlib.rcParams['font.size'] = 16.0


/usr/local/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')
/usr/local/lib/python2.7/site-packages/matplotlib/__init__.py:878: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

In [8]:
spell_check = en.Dict('en_US')

In [3]:
DIR = os.getcwd() + "/../data/"
df = pd.read_csv(DIR + 'raw/lending-club-loan-data/loan.csv', low_memory=False)
df.head()


Out[3]:
id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade ... total_bal_il il_util open_rv_12m open_rv_24m max_bal_bc all_util total_rev_hi_lim inq_fi total_cu_tl inq_last_12m
0 1077501 1296599 5000.0 5000.0 4975.0 36 months 10.65 162.87 B B2 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 1077430 1314167 2500.0 2500.0 2500.0 60 months 15.27 59.83 C C4 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 1077175 1313524 2400.0 2400.0 2400.0 36 months 15.96 84.33 C C5 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 1076863 1277178 10000.0 10000.0 10000.0 36 months 13.49 339.31 C C1 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 1075358 1311748 3000.0 3000.0 3000.0 60 months 12.69 67.79 B B5 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 74 columns

Clean, impute missing values


In [5]:
t = md.clean_data(df)
t.head()


Now cleaning data.
Out[5]:
id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade ... last_pymnt_amnt next_pymnt_d last_credit_pull_d collections_12_mths_ex_med policy_code application_type acc_now_delinq tot_coll_amt tot_cur_bal total_rev_hi_lim
0 1077501 1296599 5000.0 5000.0 4975.0 36 months 10.65 162.87 B B2 ... 171.62 NaT 2016-01-01 0.0 1.0 INDIVIDUAL 0.0 NaN NaN NaN
1 1077430 1314167 2500.0 2500.0 2500.0 60 months 15.27 59.83 C C4 ... 119.66 NaT 2013-09-01 0.0 1.0 INDIVIDUAL 0.0 NaN NaN NaN
2 1077175 1313524 2400.0 2400.0 2400.0 36 months 15.96 84.33 C C5 ... 649.91 NaT 2016-01-01 0.0 1.0 INDIVIDUAL 0.0 NaN NaN NaN
3 1076863 1277178 10000.0 10000.0 10000.0 36 months 13.49 339.31 C C1 ... 357.48 NaT 2015-01-01 0.0 1.0 INDIVIDUAL 0.0 NaN NaN NaN
4 1075358 1311748 3000.0 3000.0 3000.0 60 months 12.69 67.79 B B5 ... 67.79 2016-02-01 2016-01-01 0.0 1.0 INDIVIDUAL 0.0 NaN NaN NaN

5 rows × 53 columns


In [6]:
t2 = md.impute_missing(t)
t2.head()


/usr/local/lib/python2.7/site-packages/pandas/core/frame.py:2824: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)
Out[6]:
id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade ... last_pymnt_amnt next_pymnt_d last_credit_pull_d collections_12_mths_ex_med policy_code application_type acc_now_delinq tot_coll_amt tot_cur_bal total_rev_hi_lim
0 1077501 1296599 5000.0 5000.0 4975.0 36 months 10.65 162.87 B B2 ... 171.62 NaT 2016-01-01 0.0 1.0 INDIVIDUAL 0.0 NaN NaN NaN
1 1077430 1314167 2500.0 2500.0 2500.0 60 months 15.27 59.83 C C4 ... 119.66 NaT 2013-09-01 0.0 1.0 INDIVIDUAL 0.0 NaN NaN NaN
2 1077175 1313524 2400.0 2400.0 2400.0 36 months 15.96 84.33 C C5 ... 649.91 NaT 2016-01-01 0.0 1.0 INDIVIDUAL 0.0 NaN NaN NaN
3 1076863 1277178 10000.0 10000.0 10000.0 36 months 13.49 339.31 C C1 ... 357.48 NaT 2015-01-01 0.0 1.0 INDIVIDUAL 0.0 NaN NaN NaN
4 1075358 1311748 3000.0 3000.0 3000.0 60 months 12.69 67.79 B B5 ... 67.79 2016-02-01 2016-01-01 0.0 1.0 INDIVIDUAL 0.0 NaN NaN NaN

5 rows × 53 columns


In [9]:
s = 'my spej is bad'
string_list = str(s).split()
errors_list = [spell_check.check(x) for x in string_list]
errors_list


Out[9]:
[True, False, True, True]

In [14]:
float(errors_list.count(False)) / len(errors_list)


Out[14]:
0.25

In [16]:
t2.select_dtypes(include = ['object']).head()


Out[16]:
term grade sub_grade emp_title emp_length home_ownership verification_status issue_d loan_status pymnt_plan url purpose title zip_code addr_state earliest_cr_line initial_list_status application_type
0 36 months B B2 NaN 10+ years RENT Verified Dec-2011 Fully Paid n https://www.lendingclub.com/browse/loanDetail.... credit_card Computer 860xx AZ Jan-1985 f INDIVIDUAL
1 60 months C C4 Ryder < 1 year RENT Source Verified Dec-2011 Charged Off n https://www.lendingclub.com/browse/loanDetail.... car bike 309xx GA Apr-1999 f INDIVIDUAL
2 36 months C C5 NaN 10+ years RENT Not Verified Dec-2011 Fully Paid n https://www.lendingclub.com/browse/loanDetail.... small_business real estate business 606xx IL Nov-2001 f INDIVIDUAL
3 36 months C C1 AIR RESOURCES BOARD 10+ years RENT Source Verified Dec-2011 Fully Paid n https://www.lendingclub.com/browse/loanDetail.... other personel 917xx CA Feb-1996 f INDIVIDUAL
4 60 months B B5 University Medical Group 1 year RENT Source Verified Dec-2011 Current n https://www.lendingclub.com/browse/loanDetail.... other Personal 972xx OR Jan-1996 f INDIVIDUAL

In [19]:
t3 = md.spelling_mistakes(t2)
t3.head()


Now calculating spelling mistakes.
This may take a while.
Out[19]:
id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade ... last_credit_pull_d collections_12_mths_ex_med policy_code application_type acc_now_delinq tot_coll_amt tot_cur_bal total_rev_hi_lim emp_title_percent_misspelled title_percent_misspelled
0 1077501 1296599 5000.0 5000.0 4975.0 36 months 10.65 162.87 B B2 ... 2016-01-01 0.0 1.0 INDIVIDUAL 0.0 NaN NaN NaN 1.0 0.0
1 1077430 1314167 2500.0 2500.0 2500.0 60 months 15.27 59.83 C C4 ... 2013-09-01 0.0 1.0 INDIVIDUAL 0.0 NaN NaN NaN 0.0 0.0
2 1077175 1313524 2400.0 2400.0 2400.0 36 months 15.96 84.33 C C5 ... 2016-01-01 0.0 1.0 INDIVIDUAL 0.0 NaN NaN NaN 1.0 0.0
3 1076863 1277178 10000.0 10000.0 10000.0 36 months 13.49 339.31 C C1 ... 2015-01-01 0.0 1.0 INDIVIDUAL 0.0 NaN NaN NaN 0.0 1.0
4 1075358 1311748 3000.0 3000.0 3000.0 60 months 12.69 67.79 B B5 ... 2016-01-01 0.0 1.0 INDIVIDUAL 0.0 NaN NaN NaN 0.0 0.0

5 rows × 55 columns


In [20]:
t3[['emp_title', 'emp_title_percent_misspelled', 'title', 'title_percent_misspelled']].head()


Out[20]:
emp_title emp_title_percent_misspelled title title_percent_misspelled
0 NaN 1.0 Computer 0.0
1 Ryder 0.0 bike 0.0
2 NaN 1.0 real estate business 0.0
3 AIR RESOURCES BOARD 0.0 personel 1.0
4 University Medical Group 0.0 Personal 0.0

In [22]:
t3[['emp_title_percent_misspelled', 'title_percent_misspelled']].describe()


Out[22]:
emp_title_percent_misspelled title_percent_misspelled
count 884766.000000 884766.000000
mean 0.174831 0.021598
std 0.335104 0.128298
min 0.000000 0.000000
25% 0.000000 0.000000
50% 0.000000 0.000000
75% 0.200000 0.000000
max 1.000000 1.000000

In [25]:
plt.hist(t3['emp_title_percent_misspelled'], label = 'Employee title')
plt.hist(t3['title_percent_misspelled'], label = Title')
plt.show()



In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:

Imputing missing values: float64


In [25]:
plt.hist(df['annual_inc'].dropna(), bins = 50)
plt.show()



In [52]:
# Drop the people making over $400k a year
df2 = df[df['annual_inc'] <= 400000].copy()
df2.shape, df.shape


Out[52]:
((885412, 74), (887379, 74))

In [58]:
plt.hist(df2['annual_inc'], bins = 50)
plt.axvline(df2['annual_inc'].mean(), label = "${:.2f}".format(df2['annual_inc'].mean()))
plt.title('Annual income (outlier millionaires removed)')
plt.xlabel('USD/year')
plt.legend()
plt.show()



In [63]:
# Since it's just 25 out of 800k, I'll drop
df2['delinq_2yrs'].value_counts(dropna = False)
df3 = df2[df2['delinq_2yrs'].isnull() == False]
df3.shape, df2.shape


Out[63]:
((885387, 74), (885412, 74))

In [67]:
df3['inq_last_6mths'].isnull().value_counts()


Out[67]:
False    885387
Name: inq_last_6mths, dtype: int64

In [72]:
df3['revol_util'].isnull().value_counts()


Out[72]:
False    884920
True        467
Name: revol_util, dtype: int64

In [76]:
plt.hist(df3['revol_util'].dropna(), bins = 50)
plt.show()



In [83]:
float(df3[df3['revol_util'] > 100].shape[0]) / df3.shape[0]
# Let's drop people using > 100% their credit rate

df4 = df3[df3['revol_util'] <= 130].copy()
df4.shape, df3.shape


Out[83]:
((884878, 74), (885387, 74))

In [84]:
df4['revol_util'].hist(bins = 50)


Out[84]:
<matplotlib.axes._subplots.AxesSubplot at 0x1196ff0d0>

In [90]:
df5 = df4[df4['collections_12_mths_ex_med'].isnull() == False].copy()
df5.shape, df4.shape


Out[90]:
((884766, 74), (884878, 74))

In [105]:
# Imputing using the mean
df5['tot_coll_amt'].fillna(value = df5['tot_coll_amt'].mean(), inplace = True)

In [110]:
df5['tot_cur_bal'].hist(bins = 50)


Out[110]:
<matplotlib.axes._subplots.AxesSubplot at 0x11d3bb2d0>

In [115]:
df5[df5['tot_cur_bal'].isnull() == True][cols_to_keep].tail()


Out[115]:
id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade ... last_pymnt_amnt next_pymnt_d last_credit_pull_d collections_12_mths_ex_med policy_code application_type acc_now_delinq tot_coll_amt tot_cur_bal total_rev_hi_lim
230652 1062400 1294262 20500.0 20500.0 20500.0 36 months 16.77 728.54 D D2 ... 10.21 NaN Nov-2015 0.0 1.0 INDIVIDUAL 0.0 225.960795 NaN NaN
230653 1059394 1291010 15000.0 15000.0 15000.0 36 months 15.27 521.97 C C4 ... 541.95 NaN Dec-2014 0.0 1.0 INDIVIDUAL 0.0 225.960795 NaN NaN
230654 1059224 1290827 35000.0 35000.0 35000.0 36 months 15.96 1229.81 C C5 ... 1244.72 NaN Dec-2014 0.0 1.0 INDIVIDUAL 0.0 225.960795 NaN NaN
230655 1058722 1290521 12000.0 12000.0 12000.0 36 months 16.29 423.61 D D1 ... 423.61 NaN Sep-2012 0.0 1.0 INDIVIDUAL 0.0 225.960795 NaN NaN
230656 1058291 1289878 12000.0 7775.0 7775.0 60 months 15.27 186.08 C C4 ... 186.08 Feb-2016 Jan-2016 0.0 1.0 INDIVIDUAL 0.0 225.960795 NaN NaN

5 rows × 53 columns


In [116]:
# I think it makes sense to set these to zero - these are people who's current balance is NaN
# because their loan is finished/paid off
df5['tot_cur_bal'].fillna(value = 0, inplace = True)

In [8]:
# Let's just drop this column
cols_to_keep.remove('total_rev_hi_lim')
len(cols_to_keep)


Out[8]:
52

In [10]:
print cols_to_keep


['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'url', 'purpose', 'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med', 'policy_code', 'application_type', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal']

In [ ]:
for i in cols_to_keep:
    if df5[i]

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [5]:
df['emp_title'].apply(lambda x: spell_check.check(x)).value_counts() / len(df['emp_title'])


Out[5]:
False    0.801466
True     0.198534
Name: emp_title, dtype: float64

In [18]:
df[objects].head()


Out[18]:
term grade sub_grade emp_title emp_length home_ownership verification_status issue_d loan_status pymnt_plan ... purpose title zip_code addr_state earliest_cr_line initial_list_status last_pymnt_d next_pymnt_d last_credit_pull_d application_type
0 36 months B B2 NaN 10+ years RENT Verified Dec-2011 Fully Paid n ... credit_card Computer 860xx AZ Jan-1985 f Jan-2015 NaN Jan-2016 INDIVIDUAL
1 60 months C C4 Ryder < 1 year RENT Source Verified Dec-2011 Charged Off n ... car bike 309xx GA Apr-1999 f Apr-2013 NaN Sep-2013 INDIVIDUAL
2 36 months C C5 NaN 10+ years RENT Not Verified Dec-2011 Fully Paid n ... small_business real estate business 606xx IL Nov-2001 f Jun-2014 NaN Jan-2016 INDIVIDUAL
3 36 months C C1 AIR RESOURCES BOARD 10+ years RENT Source Verified Dec-2011 Fully Paid n ... other personel 917xx CA Feb-1996 f Jan-2015 NaN Jan-2015 INDIVIDUAL
4 60 months B B5 University Medical Group 1 year RENT Source Verified Dec-2011 Current n ... other Personal 972xx OR Jan-1996 f Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
5 36 months A A4 Veolia Transportaton 3 years RENT Source Verified Dec-2011 Fully Paid n ... wedding My wedding loan I promise to pay back 852xx AZ Nov-2004 f Jan-2015 NaN Sep-2015 INDIVIDUAL
6 60 months C C5 Southern Star Photography 8 years RENT Not Verified Dec-2011 Current n ... debt_consolidation Loan 280xx NC Jul-2005 f Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
7 36 months E E1 MKC Accounting 9 years RENT Source Verified Dec-2011 Fully Paid n ... car Car Downpayment 900xx CA Jan-2007 f Jan-2015 NaN Dec-2014 INDIVIDUAL
8 60 months F F2 NaN 4 years OWN Source Verified Dec-2011 Charged Off n ... small_business Expand Business & Buy Debt Portfolio 958xx CA Apr-2004 f Apr-2012 NaN Aug-2012 INDIVIDUAL
9 60 months B B5 Starbucks < 1 year RENT Verified Dec-2011 Charged Off n ... other Building my credit history. 774xx TX Sep-2004 f Nov-2012 NaN Mar-2013 INDIVIDUAL
10 60 months C C3 Southwest Rural metro 5 years OWN Not Verified Dec-2011 Fully Paid n ... debt_consolidation High intrest Consolidation 853xx AZ Jan-1998 f Jun-2013 NaN Dec-2015 INDIVIDUAL
11 36 months B B5 UCLA 10+ years OWN Source Verified Dec-2011 Fully Paid n ... debt_consolidation Consolidation 913xx CA Oct-1989 f Sep-2013 NaN Aug-2013 INDIVIDUAL
12 36 months C C1 Va. Dept of Conservation/Recreation < 1 year RENT Source Verified Dec-2011 Charged Off n ... debt_consolidation freedom 245xx VA Apr-2004 f Jul-2012 NaN Nov-2012 INDIVIDUAL
13 36 months B B1 Target 3 years RENT Source Verified Dec-2011 Fully Paid n ... credit_card citicard fund 606xx IL Jul-2003 f Jan-2015 NaN Jan-2016 INDIVIDUAL
14 36 months B B2 SFMTA 3 years RENT Source Verified Dec-2011 Charged Off n ... other Other Loan 951xx CA May-1991 f Oct-2013 NaN Mar-2014 INDIVIDUAL
15 36 months D D1 Internal revenue Service < 1 year RENT Not Verified Dec-2011 Fully Paid n ... debt_consolidation Debt Consolidation Loan 641xx MO Sep-2007 f Jan-2015 NaN Jan-2016 INDIVIDUAL
16 36 months C C4 Chin's Restaurant 4 years RENT Not Verified Dec-2011 Fully Paid n ... home_improvement Home 921xx CA Oct-1998 f Jan-2015 NaN Apr-2015 INDIVIDUAL
17 36 months A A1 Duracell 10+ years MORTGAGE Not Verified Dec-2011 Fully Paid n ... major_purchase Holiday 067xx CT Aug-1993 f May-2013 NaN May-2014 INDIVIDUAL
18 36 months B B3 Connection Inspection 1 year MORTGAGE Verified Dec-2011 Fully Paid n ... medical Medical 890xx UT Oct-2003 f Feb-2015 NaN Jul-2015 INDIVIDUAL
19 36 months A A1 Network Interpreting Service 6 years RENT Not Verified Dec-2011 Fully Paid n ... debt_consolidation lowerratemeanseasiertogetoutofdebt! 921xx CA Jan-2001 f Jul-2012 NaN Jul-2012 INDIVIDUAL
20 60 months C C4 Archdiocese of Galveston Houston 3 years RENT Verified Dec-2011 Fully Paid n ... debt_consolidation Freedom From Credit Cards 770xx TX Nov-1997 f Aug-2015 NaN Dec-2015 INDIVIDUAL
21 36 months B B4 Osram Sylvania 10+ years RENT Verified Dec-2011 Charged Off n ... debt_consolidation Debt Cleanup 335xx FL Feb-1983 f Sep-2013 NaN Jan-2016 INDIVIDUAL
22 36 months B B3 Value Air 10+ years OWN Source Verified Dec-2011 Fully Paid n ... credit_card Credit Card Loan 799xx TX Jul-1985 f Jan-2015 NaN Jan-2016 INDIVIDUAL
23 36 months B B3 Wells Fargo Bank 5 years RENT Not Verified Dec-2011 Fully Paid n ... debt_consolidation Debt Consolidation 917xx CA Apr-2003 f Oct-2013 NaN Mar-2014 INDIVIDUAL
24 36 months B B3 bmg-educational 1 year RENT Not Verified Dec-2011 Charged Off n ... major_purchase cash 900xx CA Jun-2001 f Oct-2012 NaN Mar-2013 INDIVIDUAL
25 36 months B B1 Winfield Pathology Consultants 2 years MORTGAGE Verified Dec-2011 Fully Paid n ... credit_card No more credit card debt! 605xx IL Feb-2002 f Sep-2012 NaN Sep-2012 INDIVIDUAL
26 36 months C C2 nyc transit 9 years RENT Not Verified Dec-2011 Charged Off n ... debt_consolidation consolidation 103xx NY Oct-2003 f NaN NaN Jan-2016 INDIVIDUAL
27 60 months D D2 Frito Lay 2 years RENT Not Verified Dec-2011 Charged Off n ... other Other 150xx PA Oct-2003 f Dec-2012 NaN May-2013 INDIVIDUAL
28 36 months B B3 Shands Hospital at the University of Fl 10+ years MORTGAGE Not Verified Dec-2011 Fully Paid n ... debt_consolidation Debt Consolidation 326xx FL Aug-1984 f Apr-2013 NaN Jan-2016 INDIVIDUAL
29 36 months B B3 Oakridge homes < 1 year RENT Source Verified Dec-2011 Fully Paid n ... credit_card Credit card repayment loan 564xx MN Nov-2006 f Dec-2014 NaN Jan-2015 INDIVIDUAL
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
887349 36 months A A3 Mission info Tech 2 years MORTGAGE Source Verified Jan-2015 Current n ... debt_consolidation Debt consolidation 325xx FL Mar-1998 f Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
887350 60 months C C3 Business agent 10+ years RENT Source Verified Jan-2015 Current n ... debt_consolidation Debt consolidation 925xx CA Aug-2002 f Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
887351 36 months D D2 supervisor 10+ years MORTGAGE Verified Jan-2015 Charged Off n ... medical Medical expenses 810xx CO Aug-1990 f Aug-2015 NaN Jan-2016 INDIVIDUAL
887352 36 months B B5 Support Scientist II < 1 year RENT Verified Jan-2015 Current n ... debt_consolidation Debt consolidation 209xx MD Dec-2006 w Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
887353 60 months B B4 VP of Operations 4 years RENT Source Verified Jan-2015 Current n ... debt_consolidation Other 339xx FL Jan-1989 w Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
887354 36 months D D5 Operator 10+ years MORTGAGE Verified Jan-2015 Current n ... debt_consolidation Debt consolidation 308xx GA Jun-2001 f Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
887355 36 months A A5 NaN n/a RENT Verified Jan-2015 Current n ... debt_consolidation Debt consolidation 840xx UT Jun-1993 f Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
887356 60 months D D1 Assistant Maint Manager 10+ years MORTGAGE Source Verified Jan-2015 Current n ... credit_card Credit card refinancing 597xx MT Dec-1994 w Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
887357 60 months D D5 Field Supervisor 10+ years RENT Source Verified Jan-2015 Current n ... debt_consolidation Debt consolidation 412xx KY Jul-2002 w Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
887358 60 months D D2 Owner 3 years RENT Source Verified Jan-2015 Current n ... debt_consolidation Debt consolidation 282xx NC May-2003 w Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
887359 60 months D D1 NaN n/a OWN Not Verified Jan-2015 Current n ... debt_consolidation Debt consolidation 550xx MN May-2003 f Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
887360 60 months B B1 Business Manager 10+ years MORTGAGE Verified Jan-2015 Current n ... debt_consolidation Debt consolidation 138xx NY Aug-1983 w Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
887361 36 months A A1 Director 10+ years MORTGAGE Source Verified Jan-2015 Current n ... credit_card Credit card refinancing 021xx MA Oct-1994 f Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
887362 36 months A A5 Sr. Tech 9 years MORTGAGE Source Verified Jan-2015 Current n ... debt_consolidation Debt consolidation 871xx NM Sep-2003 f Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
887363 36 months B B2 Bartender 4 years RENT Source Verified Jan-2015 Current n ... debt_consolidation Debt consolidation 891xx NV Oct-2006 f Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
887364 36 months A A1 Coordinator of RSVP < 1 year RENT Not Verified Jan-2015 Fully Paid n ... debt_consolidation Debt consolidation 325xx FL Nov-1975 w Jul-2015 NaN Jun-2015 INDIVIDUAL
887365 36 months C C4 machining Cell Lead 10+ years RENT Verified Jan-2015 Current n ... debt_consolidation Debt consolidation 471xx IN Nov-2006 w Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
887366 36 months D D3 Painter 2 years RENT Source Verified Jan-2015 Fully Paid n ... debt_consolidation Debt consolidation 330xx FL Feb-2011 f Dec-2015 NaN Dec-2015 INDIVIDUAL
887367 36 months A A2 Area Director 1 year RENT Verified Jan-2015 Current n ... debt_consolidation Debt consolidation 333xx FL Mar-2000 w Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
887368 60 months C C5 Team Leader 8 years MORTGAGE Verified Jan-2015 Current n ... major_purchase Major purchase 286xx NC May-1994 w Jan-2016 Feb-2016 Mar-2015 INDIVIDUAL
887369 36 months B B1 Lead Custodian 10+ years MORTGAGE Verified Jan-2015 Fully Paid n ... car Car financing 956xx CA Sep-2002 f Jun-2015 NaN Nov-2015 INDIVIDUAL
887370 36 months B B2 Owner 8 years RENT Verified Jan-2015 Current n ... debt_consolidation Debt consolidation 531xx WI Aug-1994 f Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
887371 36 months E E2 NaN n/a OWN Verified Jan-2015 Charged Off n ... debt_consolidation Debt consolidation 430xx OH Jun-2002 f Aug-2015 NaN Jan-2016 INDIVIDUAL
887372 36 months B B5 KYC Business Analyst < 1 year MORTGAGE Verified Jan-2015 Current n ... credit_card Credit card refinancing 432xx OH Jun-1995 f Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
887373 36 months D D1 Manager 10+ years RENT Verified Jan-2015 Current n ... credit_card Credit card refinancing 791xx TX Jul-1999 f Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
887374 36 months B B5 Office Assistant 8 years RENT Verified Jan-2015 Current n ... debt_consolidation Debt consolidation 905xx CA Sep-2004 f Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
887375 36 months B B5 Director of Maintenance 10+ years MORTGAGE Verified Jan-2015 Current n ... home_improvement Home improvement 072xx NJ Mar-1974 f Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
887376 60 months D D2 Radiologist Technologist 5 years RENT Verified Jan-2015 Current n ... debt_consolidation Debt consolidation 378xx TN Sep-2003 w Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
887377 60 months E E3 Painter 1 year RENT Source Verified Jan-2015 Current n ... debt_consolidation Debt consolidation 010xx MA Oct-2003 w Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL
887378 36 months B B5 Manager Hotel Operations Oasis 10+ years RENT Verified Jan-2015 Current n ... credit_card Credit card refinancing 331xx FL Dec-2001 f Jan-2016 Feb-2016 Jan-2016 INDIVIDUAL

887379 rows × 21 columns


In [19]:
df['num_spell_errors'] = 0

for s in ['emp_title', 'title']:
    df['num_spell_errors'] += df[s].apply(lambda x: spell_check.check(x)) == True
    
df[['emp_title', 'title', 'num_spell_errors']].head()


Out[19]:
emp_title title purpose num_spell_errors
0 NaN Computer credit_card 1
1 Ryder bike car 3
2 NaN real estate business small_business 0
3 AIR RESOURCES BOARD personel other 1
4 University Medical Group Personal other 2

In [24]:
df[['emp_title', 'title', 'num_spell_errors']].head(20)


Out[24]:
emp_title title purpose num_spell_errors
0 NaN Computer credit_card 1
1 Ryder bike car 3
2 NaN real estate business small_business 0
3 AIR RESOURCES BOARD personel other 1
4 University Medical Group Personal other 2
5 Veolia Transportaton My wedding loan I promise to pay back wedding 1
6 Southern Star Photography Loan debt_consolidation 1
7 MKC Accounting Car Downpayment car 1
8 NaN Expand Business & Buy Debt Portfolio small_business 0
9 Starbucks Building my credit history. other 1
10 Southwest Rural metro High intrest Consolidation debt_consolidation 0
11 UCLA Consolidation debt_consolidation 2
12 Va. Dept of Conservation/Recreation freedom debt_consolidation 1
13 Target citicard fund credit_card 1
14 SFMTA Other Loan other 1
15 Internal revenue Service Debt Consolidation Loan debt_consolidation 0
16 Chin's Restaurant Home home_improvement 1
17 Duracell Holiday major_purchase 2
18 Connection Inspection Medical medical 2
19 Network Interpreting Service lowerratemeanseasiertogetoutofdebt! debt_consolidation 0

In [28]:
df[['title', df['title'].apply(lambda x: spell_check.check(x))]].head()


------------------------------------------------------------------------
ValueError                             Traceback (most recent call last)
<ipython-input-28-353303856aed> in <module>()
----> 1 df[['title', df['title'].apply(lambda x: spell_check.check(x))]].head()

/usr/local/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   2049         if isinstance(key, (Series, np.ndarray, Index, list)):
   2050             # either boolean or fancy integer index
-> 2051             return self._getitem_array(key)
   2052         elif isinstance(key, DataFrame):
   2053             return self._getitem_frame(key)

/usr/local/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_array(self, key)
   2076     def _getitem_array(self, key):
   2077         # also raises Exception if object array with NA values
-> 2078         if com.is_bool_indexer(key):
   2079             # warning here just in case -- previously __setitem__ was
   2080             # reindexing but __getitem__ was not; it seems more reasonable to

/usr/local/lib/python2.7/site-packages/pandas/core/common.pyc in is_bool_indexer(key)
    198     elif isinstance(key, list):
    199         try:
--> 200             arr = np.asarray(key)
    201             return arr.dtype == np.bool_ and len(arr) == len(key)
    202         except TypeError:  # pragma: no cover

/usr/local/lib/python2.7/site-packages/numpy/core/numeric.pyc in asarray(a, dtype, order)
    480 
    481     """
--> 482     return array(a, dtype, copy=False, order=order)
    483 
    484 def asanyarray(a, dtype=None, order=None):

ValueError: cannot set an array element with a sequence

In [26]:
df['purpose'].value_counts()


Out[26]:
debt_consolidation    524215
credit_card           206182
home_improvement       51829
other                  42894
major_purchase         17277
small_business         10377
car                     8863
medical                 8540
moving                  5414
vacation                4736
house                   3707
wedding                 2347
renewable_energy         575
educational              423
Name: purpose, dtype: int64

In [ ]:


In [ ]:


In [ ]:


In [4]:
catagorical_cols = [
    'application_type', 'initial_list_status',
    'purpose', 'pymnt_plan', 'verification_status',
    'emp_length', 'term'
]

continous_cols =  [
 'loan_amnt','funded_amnt','funded_amnt_inv','installment',
 'dti','revol_bal'
]

y_col = ['int_rate']

In [5]:
df_data = df[catagorical_cols + continous_cols]

In [6]:
# Converted columns to floating point
for feature_name in continous_cols:
    df_data[feature_name] = df_data[feature_name].astype(float)


/usr/local/lib/python3.5/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()

In [7]:
data = pd.get_dummies(df_data)

In [8]:
data.tail(3)


Out[8]:
loan_amnt funded_amnt funded_amnt_inv installment dti revol_bal application_type_INDIVIDUAL application_type_JOINT initial_list_status_f initial_list_status_w ... emp_length_4 years emp_length_5 years emp_length_6 years emp_length_7 years emp_length_8 years emp_length_9 years emp_length_< 1 year emp_length_n/a term_ 36 months term_ 60 months
887376 13000.0 13000.0 13000.0 316.07 30.90 11031.0 1.0 0.0 0.0 1.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
887377 12000.0 12000.0 12000.0 317.86 27.19 8254.0 1.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
887378 20000.0 20000.0 20000.0 664.20 10.83 33266.0 1.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0

3 rows × 43 columns


In [9]:
x = data.values[:, :]
y = df[y_col].values[:,-1]

In [10]:
# def encode_categorical(array):
#     if not array.dtype == np.dtype('float64'):
#         return LabelEncoder().fit_transform(array) 
#     else:
#         return array
    
# # Categorical columns for use in one-hot encoder
# categorical = (df_data.dtypes.values != np.dtype('float64'))

# # Encode all labels
# data = df_data.apply(encode_categorical)

# # Get numpy array from data
# x = data.values[:, :-1]
# y = data.values[:, -1]

# # Apply one hot endcoing
# encoder = OneHotEncoder(categorical_features=categorical[:-1], sparse=False)  # Last value in mask is y
# x = encoder.fit_transform(x)

In [11]:
plt.hist(y, bins=10)  # plt.hist passes it's arguments to np.histogram
plt.axvline(np.mean(y), color='black', linestyle='-', lw=6, label='Mean Interest rate')
plt.axvline(np.mean(y) - np.std(y), color='black', linestyle='--', lw=2, label='Std')
plt.axvline(np.mean(y) + np.std(y), color='black', linestyle='--', lw=2)
plt.title("Histogram of Interest Rates, Mean of {:0.2f}%".format(np.mean(y)))
plt.legend()
plt.show()