In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn.svm import LinearSVC
from sklearn.svm import LinearSVR
import seaborn as sns
import matplotlib.pyplot as pl
%matplotlib inline
loan_status -- Current status of the loan
loan_amnt -- The listed amount of the loan applied for by the borrower. If at some point in time, the credit department reduces the loan amount, then it will be reflected in this value.
int_rate -- Interest rate on the loan
sub_grade -- LC assigned loan subgrade (grade -- LC assigned loan grade) -- dummy
purpose -- A category provided by the borrower for the loan request. -- dummy
annual_inc -- The self-reported annual income provided by the borrower during registration.
emp_length -- Employment length in years. Possible values are between 0 and 10, where 0 means less than one year and 10 means ten or more years. -- dummy
fico_range_low -- Lower boundary of the borrower's FICO range at loan origination
fico_range_high -- Upper boundary of the borrower's FICO range at loan origination
home_ownership -- The home ownership status provided by the borrower during registration or obtained from the credit report. Our values are: RENT, OWN, MORTGAGE, OTHER -- dummy
tot_cur_bal -- Total current balance of all accounts
num_actv_bc_tl -- Number of currently active bankcard accounts (avg_cur_bal -- average current balance of all accounts)
mort_acc -- Number of mortgage accounts
num_actv_rev_tl -- Number of currently active revolving trades
dti -- A ratio calculated using the borrower's total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower's self-reported monthly income.
pub_rec_bankruptcies -- Number of public record bankruptcies
In [3]:
# skipfooter drops the two summary lines at the end of the file
# (negative values in skiprows are ignored by read_csv, so skipfooter is needed here;
#  skipfooter requires the python engine, which does not accept low_memory)
df_app_2015 = pd.read_csv('LoanStats3d_securev1.csv.zip', compression='zip',
                          header=1, skipfooter=2, engine='python')
In [4]:
df_app_2015.head(3)
Out[4]:
In [5]:
# Pre-select columns
df = df_app_2015.loc[:, ['loan_status', 'loan_amnt', 'int_rate', 'sub_grade',
                         'purpose',
                         'annual_inc', 'emp_length', 'home_ownership',
                         'fico_range_low', 'fico_range_high',
                         'num_actv_bc_tl', 'tot_cur_bal', 'mort_acc', 'num_actv_rev_tl',
                         'pub_rec_bankruptcies', 'dti']]
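Before dropping rows, it helps to see how many values are missing in each pre-selected column, since that determines how much data dropna() will discard. A minimal check, assuming only the columns selected above:
# count missing values per pre-selected column, largest first
print(df.isnull().sum().sort_values(ascending=False))
print('rows before dropping NaNs:', len(df))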
In [6]:
## in Nehal and Kay's notebooks
In [7]:
df_app_2015.tail(3)
Out[7]:
In [8]:
df.head(3)
Out[8]:
In [9]:
df.loan_status.unique()
Out[9]:
In [10]:
df = df.dropna()
In [11]:
len(df)
Out[11]:
In [12]:
#df.loan_status.fillna('none', inplace=True) ## there is no nan
In [13]:
df.loan_status.unique()
Out[13]:
In [14]:
defaulters=['Default','Charged Off', 'Late (31-120 days)']
non_defaulters=['Fully Paid']
uncertain = ['Current','Late (16-30 days)','In Grace Period', 'none']
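A quick sanity check that the three status groups defined above cover every loan_status value present in the data; a minimal sketch using the lists as defined:
# distribution of loan_status, and any statuses not covered by the three groups
print(df.loan_status.value_counts())
covered = set(defaulters) | set(non_defaulters) | set(uncertain)
print(set(df.loan_status.unique()) - covered)   # should be empty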
In [15]:
len(df[df.loan_status.isin(uncertain)].loan_status)
Out[15]:
In [16]:
df.info()
In [18]:
## label defaulters, non-defaulters, and uncertain loans
df2 = df.copy()
df2['Target'] = 2  ## uncertain
df2.loc[df2.loan_status.isin(defaulters), 'Target'] = 0  ## defaulters
df2.loc[df2.loan_status.isin(non_defaulters), 'Target'] = 1  ## fully paid -- the borrowers a lender would want to lend to
print('Target value for non-defaulters')
print(df2.loc[df2.loan_status.isin(non_defaulters)].Target.unique())
print(len(df2[df2['Target'] == 1]))
print('Target value for defaulters')
print(df2.loc[df2.loan_status.isin(defaulters)].Target.unique())
print(len(df2[df2['Target'] == 0]))
print('Target value for uncertain -- unlabeled loans to predict')
print(df2.loc[df2.loan_status.isin(uncertain)].Target.unique())
print(len(df2[df2['Target'] == 2]))
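The class balance of the labeled portion matters for the classifier later; a minimal check of the Target codes assigned above:
# absolute and relative frequency of each Target class
print(df2['Target'].value_counts())
print(df2['Target'].value_counts(normalize=True).round(3))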
In [1]:
42302/94968  # presumably the defaulter count divided by the fully paid count printed above
Out[1]:
In [20]:
# function to create dummies
def create_dummies(column_name, df):
    temp = pd.get_dummies(df[column_name], prefix=column_name)
    df = pd.concat([df, temp], axis=1)
    return df
In [21]:
dummy_list = ['emp_length', 'home_ownership', 'purpose', 'sub_grade']
for col in dummy_list:
    df2 = create_dummies(col, df2)
for col in dummy_list:
    df2 = df2.drop(col, axis=1)
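A quick way to confirm the dummy columns were created and the source columns dropped, relying on the prefix naming produced by create_dummies above:
# dummy columns carry their source column's name as a prefix
print([c for c in df2.columns if c.startswith('sub_grade_')][:5])
print([c for c in dummy_list if c in df2.columns])   # should be empty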
In [22]:
# strip the '%' sign and surrounding whitespace, then convert the interest rate to float
temp = df2['int_rate'].astype(str).str.replace('%', '').str.strip().astype(float)
df2 = df2.drop('int_rate', axis=1)
df2 = pd.concat([df2, temp], axis=1)
df2 = df2.drop('loan_status', axis=1)
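A short sanity check that int_rate is now numeric and in a plausible range (values are percentages):
# int_rate should now be a float column expressed in percent
print(df2['int_rate'].dtype)
print(df2['int_rate'].describe())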
In [23]:
for col in df2.columns:
    print(col, df2[col].dtype)
In [24]:
df2.shape
Out[24]:
In [25]:
# peek at a few randomly chosen loan_amnt values (positional indexing, since the index is no longer contiguous after dropna)
df2['loan_amnt'].iloc[sorted(np.random.randint(0, high=10, size=5))]
Out[25]:
In [26]:
# Reference:
# http://stackoverflow.com/questions/22354094/pythonic-way-of-detecting-outliers-in-one-dimensional-observation-data
def main(df, col, thres):
    ind = sorted(np.random.randint(0, high=len(df), size=5000))  # randomly pick instances from the dataframe
    x = df[col].iloc[ind]  # select the sampled values of the column
    num = len(ind)
    outliers = plot(x, col, num, thres)  # plot the distribution and collect the detected outliers
    pl.show()
    return outliers

def mad_based_outlier(points, thresh):
    points = np.asarray(points, dtype=float)
    if len(points.shape) == 1:
        points = points[:, None]
    median = np.median(points, axis=0)
    diff = np.sqrt(np.sum((points - median)**2, axis=-1))
    med_abs_deviation = np.median(diff)
    modified_z_score = 0.6745 * diff / med_abs_deviation
    return modified_z_score > thresh

def plot(x, col, num, thres):
    fig, ax = pl.subplots(nrows=1, figsize=(10, 3))
    sns.distplot(x, ax=ax, rug=True, hist=False)
    outliers = np.asarray(x[mad_based_outlier(x, thres)])
    ax.plot(outliers, np.zeros_like(outliers), 'ro', clip_on=False)
    fig.suptitle('MAD-based outlier test on {} sampled {} values'.format(num, col), size=20)
    return outliers
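The modified z-score above scales each point's absolute deviation from the median by the median absolute deviation (MAD); the constant 0.6745 makes the score roughly comparable to an ordinary z-score for normally distributed data. A minimal sketch on toy numbers (the 3.5 threshold here is just a common default, not the thresholds used below):
demo = np.array([10., 11., 12., 11., 10., 200.])   # one obvious outlier
med = np.median(demo)
mad = np.median(np.abs(demo - med))                # median absolute deviation
modified_z = 0.6745 * np.abs(demo - med) / mad
print(modified_z.round(2))                         # the 200. point scores far above the rest
print(demo[modified_z > 3.5])                      # -> [200.]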
In [27]:
### Find outliers with the MAD-based test
boundaries = []
outliers_loan = main(df2, 'loan_amnt', thres=2.2)
In [28]:
boundaries.append(outliers_loan.min())
In [29]:
## annual income
outliers_inc = main(df2, 'annual_inc', 8)
In [30]:
boundaries.append(outliers_inc.min())
In [31]:
## For total current balance of bank accounts
outliers_bal = main(df2, 'tot_cur_bal', 8)
In [32]:
boundaries.append(outliers_bal.min())
In [33]:
columns = ['loan_amnt', 'annual_inc', 'tot_cur_bal']
df2_r = df2.copy()
for col, bound in zip(columns, boundaries):
    print('Lower bound of detected outliers for {}: {}'.format(col, bound))
    # use the outlier boundary to trim the dataframe, applying each column's cut in turn
    df2_r = df2_r[df2_r[col] <= bound]
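A quick before/after comparison of row counts shows how much data the outlier cut removed (df2_r as built in the loop above):
# rows before and after trimming the three columns at their outlier boundaries
print('before:', len(df2), ' after:', len(df2_r))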
In [106]:
# df2_r.info()
In [34]:
df2_r.shape
Out[34]:
In [35]:
#### Alternative considered: fill NaN with 'none' instead of dropping ####
#df_filled = df2.fillna(value='none')
#df_filled.head(3)
df2_r = df2_r.dropna()
print(len(df2_r))
In [109]:
# df2_r.to_csv('approved_loan_2015_clean.csv')