In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os
import sys
import sklearn
import sqlite3
import matplotlib
import numpy as np
import pandas as pd
import enchant as en
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)
%aimport data
from data import make_dataset as md
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (16.0, 6.0)
plt.rcParams['legend.markerscale'] = 3
matplotlib.rcParams['font.size'] = 16.0
In [8]:
spell_check = en.Dict('en_US')
In [3]:
DIR = os.getcwd() + "/../data/"
df = pd.read_csv(DIR + 'raw/lending-club-loan-data/loan.csv', low_memory=False)
df.head()
Out[3]:
In [5]:
# Apply the project's cleaning step (defined in src/data/make_dataset.py);
# the exact transformations live there and are opaque from this notebook.
t = md.clean_data(df)
t.head()
Out[5]:
In [6]:
# Impute missing values on the cleaned frame (project helper).
t2 = md.impute_missing(t)
t2.head()
Out[6]:
In [9]:
s = 'my spej is bad'
string_list = str(s).split()
errors_list = [spell_check.check(x) for x in string_list]
errors_list
Out[9]:
In [14]:
float(errors_list.count(False)) / len(errors_list)
Out[14]:
In [16]:
# Peek at the string (object-dtype) columns — the spell-check candidates.
t2.select_dtypes(include = ['object']).head()
Out[16]:
In [19]:
# Project helper: adds *_percent_misspelled columns for the free-text fields.
t3 = md.spelling_mistakes(t2)
t3.head()
Out[19]:
In [20]:
# Show the text columns next to their misspelling rates.
t3[['emp_title', 'emp_title_percent_misspelled', 'title', 'title_percent_misspelled']].head()
Out[20]:
In [22]:
# Summary statistics of the misspelling rates.
t3[['emp_title_percent_misspelled', 'title_percent_misspelled']].describe()
Out[22]:
Out[22]:
In [25]:
plt.hist(t3['emp_title_percent_misspelled'], label = 'Employee title')
plt.hist(t3['title_percent_misspelled'], label = Title')
plt.show()
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [25]:
plt.hist(df['annual_inc'].dropna(), bins = 50)
plt.show()
In [52]:
# Drop the people making over $400k a year
df2 = df[df['annual_inc'] <= 400000].copy()
df2.shape, df.shape
Out[52]:
In [58]:
plt.hist(df2['annual_inc'], bins = 50)
plt.axvline(df2['annual_inc'].mean(), label = "${:.2f}".format(df2['annual_inc'].mean()))
plt.title('Annual income (outlier millionaires removed)')
plt.xlabel('USD/year')
plt.legend()
plt.show()
In [63]:
# Since it's just 25 out of 800k, I'll drop
df2['delinq_2yrs'].value_counts(dropna = False)
df3 = df2[df2['delinq_2yrs'].isnull() == False]
df3.shape, df2.shape
Out[63]:
In [67]:
df3['inq_last_6mths'].isnull().value_counts()
Out[67]:
In [72]:
df3['revol_util'].isnull().value_counts()
Out[72]:
In [76]:
# Distribution of revolving credit utilization.
plt.hist(df3['revol_util'].dropna(), bins = 50)
plt.show()
In [83]:
# Fraction of borrowers using more than 100% of their revolving credit.
# NOTE(review): this expression's value is discarded — it is not the
# cell's last line, so it never displays.
float(df3[df3['revol_util'] > 100].shape[0]) / df3.shape[0]
# Let's drop people using > 100% their credit rate
# NOTE(review): the comment says "> 100" but the filter below keeps rows up
# to 130 — confirm which threshold is intended. Rows with NaN revol_util
# are also dropped here, since NaN fails the <= comparison.
df4 = df3[df3['revol_util'] <= 130].copy()
df4.shape, df3.shape
Out[83]:
In [84]:
# Utilization distribution after the trim.
df4['revol_util'].hist(bins = 50)
Out[84]:
In [90]:
df5 = df4[df4['collections_12_mths_ex_med'].isnull() == False].copy()
df5.shape, df4.shape
Out[90]:
In [105]:
# Imputing using the mean
df5['tot_coll_amt'].fillna(value = df5['tot_coll_amt'].mean(), inplace = True)
In [110]:
df5['tot_cur_bal'].hist(bins = 50)
Out[110]:
In [115]:
df5[df5['tot_cur_bal'].isnull() == True][cols_to_keep].tail()
Out[115]:
In [116]:
# I think it makes sense to set these to zero - these are people who's current balance is NaN
# because their loan is finished/paid off
df5['tot_cur_bal'].fillna(value = 0, inplace = True)
In [8]:
# Let's just drop this column
cols_to_keep.remove('total_rev_hi_lim')
len(cols_to_keep)
Out[8]:
In [10]:
print cols_to_keep
In [ ]:
for i in cols_to_keep:
if df5[i]
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [5]:
# Share of emp_title values that pass/fail the spell check.
# NOTE(review): enchant's check() requires a string — this will raise if
# emp_title contains NaN floats; confirm the column was cleaned first.
df['emp_title'].apply(lambda x: spell_check.check(x)).value_counts() / len(df['emp_title'])
Out[5]:
In [18]:
# NOTE(review): `objects` is not defined anywhere above — hidden kernel
# state; presumably a list of the object-dtype column names. Verify.
df[objects].head()
Out[18]:
In [19]:
df['num_spell_errors'] = 0
for s in ['emp_title', 'title']:
df['num_spell_errors'] += df[s].apply(lambda x: spell_check.check(x)) == True
df[['emp_title', 'title', 'num_spell_errors']].head()
Out[19]:
In [24]:
df[['emp_title', 'title', 'num_spell_errors']].head(20)
Out[24]:
In [28]:
df[['title', df['title'].apply(lambda x: spell_check.check(x))]].head()
In [26]:
df['purpose'].value_counts()
Out[26]:
In [ ]:
In [ ]:
In [ ]:
In [4]:
# Feature lists for the interest-rate model.
# NOTE(review): "catagorical"/"continous" are misspelled, but later cells
# reference these names, so they are kept as-is.
catagorical_cols = [
'application_type', 'initial_list_status',
'purpose', 'pymnt_plan', 'verification_status',
'emp_length', 'term'
]
continous_cols = [
'loan_amnt','funded_amnt','funded_amnt_inv','installment',
'dti','revol_bal'
]
# Target column (interest rate).
y_col = ['int_rate']
In [5]:
df_data = df[catagorical_cols + continous_cols]
In [6]:
# Converted columns to floating point
for feature_name in continous_cols:
df_data[feature_name] = df_data[feature_name].astype(float)
In [7]:
data = pd.get_dummies(df_data)
In [8]:
data.tail(3)
Out[8]:
In [9]:
x = data.values[:, :]
y = df[y_col].values[:,-1]
In [10]:
# NOTE(review): superseded manual LabelEncoder/OneHotEncoder pipeline,
# replaced by pd.get_dummies above — kept commented for reference.
# def encode_categorical(array):
# if not array.dtype == np.dtype('float64'):
# return LabelEncoder().fit_transform(array)
# else:
# return array
# # Categorical columns for use in one-hot encoder
# categorical = (df_data.dtypes.values != np.dtype('float64'))
# # Encode all labels
# data = df_data.apply(encode_categorical)
# # Get numpy array from data
# x = data.values[:, :-1]
# y = data.values[:, -1]
# # Apply one hot encoding
# encoder = OneHotEncoder(categorical_features=categorical[:-1], sparse=False) # Last value in mask is y
# x = encoder.fit_transform(x)
In [11]:
plt.hist(y, bins=10) # plt.hist passes it's arguments to np.histogram
plt.axvline(np.mean(y), color='black', linestyle='-', lw=6, label='Mean Interest rate')
plt.axvline(np.mean(y) - np.std(y), color='black', linestyle='--', lw=2, label='Std')
plt.axvline(np.mean(y) + np.std(y), color='black', linestyle='--', lw=2)
plt.title("Histogram of Interest Rates, Mean of {:0.2f}%".format(np.mean(y)))
plt.legend()
plt.show()