In [48]:
## load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing
from sklearn import neighbors, tree, naive_bayes
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
In [49]:
data = pd.read_csv("loan.csv", low_memory=False)
In [50]:
# sample 5% of the rows without replacement
data = data.sample(frac=0.05, replace=False, random_state=123)
In [51]:
data.shape
Out[51]:
In [52]:
data.head(n=5)
Out[52]:
In [53]:
data.columns
Out[53]:
The loan_status column is the target!
In [54]:
pd.unique(data['loan_status'].values.ravel())
Out[54]:
In [55]:
print("Amount of Classes: ", len(pd.unique(data['loan_status'].values.ravel())))
In [56]:
len(pd.unique(data['zip_code'].values.ravel())) # check that zip_code is not unique to every row
Out[56]:
In [57]:
len(pd.unique(data['url'].values.ravel())) # drop url
Out[57]:
In [58]:
len(pd.unique(data['last_pymnt_d'].values.ravel()))
Out[58]:
In [59]:
len(pd.unique(data['next_pymnt_d'].values.ravel()))
Out[59]:
In [60]:
for col in data.select_dtypes(include=['object']).columns:
    print("Column {} has {} unique instances".format(col, len(data[col].unique())))
In [62]:
len(pd.unique(data['member_id'].values.ravel())) == data.shape[0]
Out[62]:
In [63]:
# drop identifiers, free-text fields, and high-cardinality or date-like columns
data = data.drop(['id', 'member_id', 'url', 'purpose', 'title', 'zip_code',
                  'emp_title', 'earliest_cr_line', 'term', 'sub_grade',
                  'last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d',
                  'issue_d', 'desc', 'addr_state'], axis=1)
In [65]:
data.shape
Out[65]:
In [66]:
# the remaining object columns now have a manageable number of levels
for col in data.select_dtypes(include=['object']).columns:
    print("Column {} has {} unique instances".format(col, len(data[col].unique())))
In [70]:
data['loan_amnt'].plot(kind="hist", bins=10)
Out[70]:
In [71]:
data['grade'].value_counts().plot(kind='bar')
Out[71]:
In [72]:
data['emp_length'].value_counts().plot(kind='bar')
Out[72]:
In [16]:
data['loan_status'].value_counts().plot(kind='bar')
Out[16]:
In [76]:
data.select_dtypes(include=[np.number]).columns
Out[76]:
In [77]:
"There are {} numeric columns in the data set".format(len(data._get_numeric_data().columns) )
Out[77]:
In [78]:
data.select_dtypes(include=['object']).columns
Out[78]:
In [79]:
"There are {} Character columns in the data set (minus the target)".format(len(data.select_dtypes(include=['object']).columns) -1)
Out[79]:
In [80]:
X = data.drop("loan_status", axis=1, inplace = False)
y = data.loan_status
In [81]:
y.head()
Out[81]:
In [83]:
def model_matrix(df, columns):
    dummified_cols = pd.get_dummies(df[columns])
    df = df.drop(columns, axis=1, inplace=False)
    df_new = df.join(dummified_cols)
    return df_new

X = model_matrix(X, ['grade', 'emp_length', 'home_ownership', 'verification_status',
                     'pymnt_plan', 'initial_list_status', 'application_type',
                     'verification_status_joint'])
# 'issue_d' 'desc' 'addr_state'
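As a side note, pandas can produce the same result without the helper by passing the column list straight to get_dummies. A sketch of the equivalent call, applied to the original X from two cells above (X_alt is a hypothetical name and is not used later):
# equivalent one-liner: dummify only the listed columns, keep the rest unchanged
X_alt = pd.get_dummies(X, columns=['grade', 'emp_length', 'home_ownership',
                                   'verification_status', 'pymnt_plan',
                                   'initial_list_status', 'application_type',
                                   'verification_status_joint'])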
In [84]:
X.head()
Out[84]:
In [85]:
X.shape
Out[85]:
In [86]:
# fill missing values with 0 for now
X2 = X.fillna(value = 0)
X2.head()
Out[86]:
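Filling every NaN with 0 is a quick placeholder. If a less blunt fill is wanted later, a minimal sketch using per-column medians might look like this (X_median is a hypothetical name and is not used below):
# alternative: impute numeric NaNs with each column's median instead of 0
X_median = X.fillna(X.median(numeric_only=True))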
In [87]:
from sklearn.preprocessing import MinMaxScaler

# scale all numeric columns to the [0, 1] range
numeric_cols = ['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'int_rate',
                'installment', 'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths',
                'mths_since_last_delinq', 'mths_since_last_record', 'open_acc',
                'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'out_prncp',
                'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
                'total_rec_int', 'total_rec_late_fee', 'recoveries',
                'collection_recovery_fee', 'last_pymnt_amnt',
                'collections_12_mths_ex_med', 'mths_since_last_major_derog',
                'policy_code', 'annual_inc_joint', 'dti_joint', 'acc_now_delinq',
                'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m', 'open_il_6m',
                'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il',
                'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util',
                'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m']

scaler = MinMaxScaler()
X2[numeric_cols] = scaler.fit_transform(X2[numeric_cols])
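One caveat: the scaler above is fit on all rows before the train/test split, so test-set statistics leak into the scaling. A hedged sketch of the leakage-free version (left commented out here because it assumes the x_train/x_test split created two cells below):
# alternative: scale after splitting, fitting the scaler on the training data only
# scaler = MinMaxScaler()
# x_train[numeric_cols] = scaler.fit_transform(x_train[numeric_cols])
# x_test[numeric_cols] = scaler.transform(x_test[numeric_cols])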
In [88]:
X2.head()
Out[88]:
In [89]:
x_train, x_test, y_train, y_test = train_test_split(X2, y, test_size=.3, random_state=123)
In [90]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
In [91]:
# start with k equal to the number of classes
data_knn = KNeighborsClassifier(n_neighbors = 10, metric='euclidean')
data_knn
Out[91]:
In [92]:
data_knn.fit(x_train, y_train)
Out[92]:
In [93]:
data_knn.predict(x_test)
Out[93]:
In [94]:
# classification accuracy (mean accuracy from .score) on training and test data
accuracy_train = data_knn.score(x_train, y_train)
accuracy_test = data_knn.score(x_test, y_test)
print('Training data accuracy:')
print(accuracy_train)
print('Test data accuracy:')
print(accuracy_test)
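Ten neighbors is only a starting guess. A small sketch of how n_neighbors could be tuned with cross-validation on the training data (the candidate values below are arbitrary):
from sklearn.model_selection import GridSearchCV

# try a few neighbourhood sizes with 5-fold cross-validation
param_grid = {'n_neighbors': [5, 10, 15, 25]}
knn_search = GridSearchCV(KNeighborsClassifier(metric='euclidean'), param_grid, cv=5)
knn_search.fit(x_train, y_train)
print(knn_search.best_params_, knn_search.best_score_)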
In [96]:
# confusion matrix
from sklearn.metrics import confusion_matrix
knn_confusion_matrix = confusion_matrix(y_true = y_test, y_pred = data_knn.predict(x_test))
print("The Confusion matrix:\n", knn_confusion_matrix)
In [99]:
# visualize the confusion matrix
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
plt.matshow(knn_confusion_matrix, cmap=plt.cm.Blues)
plt.title("KNN Confusion Matrix\n")
plt.ylabel('True label')
plt.xlabel('Predicted label')
# annotate each cell with its count (loop variables renamed so they do not shadow y)
for row in range(knn_confusion_matrix.shape[0]):
    for col in range(knn_confusion_matrix.shape[1]):
        plt.text(col, row, '{}'.format(knn_confusion_matrix[row, col]),
                 horizontalalignment='center',
                 verticalalignment='center')
plt.show()
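Recent scikit-learn releases (1.0 and later) also ship a built-in display for this plot; a minimal sketch, assuming such a version is installed:
from sklearn.metrics import ConfusionMatrixDisplay

# draws the same matrix with labelled axes directly from the fitted classifier
ConfusionMatrixDisplay.from_estimator(data_knn, x_test, y_test, cmap=plt.cm.Blues)
plt.show()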
In [98]:
#Generate the classification report
from sklearn.metrics import classification_report
knn_classify_report = classification_report(y_true = y_test,
y_pred = data_knn.predict(x_test))
print(knn_classify_report)
fin.
In [ ]: