In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
In [2]:
# Access to data: read LoanStats3d.csv into the working DataFrame.
# header=1 takes the column names from the second line of the file, and
# skipinitialspace=True discards whitespace that follows each delimiter.
# NOTE(review): the absolute path only resolves on the original author's
# machine -- consider a configurable data directory.
y2015 = pd.read_csv(
    'D:\\Users\\Borja.gonzalez\\Desktop\\Thinkful-DataScience-Borja\\LoanStats3d.csv',
    header=1,
    skipinitialspace=True,
)
In [3]:
# Data cleaning: for every object-dtype (text) column, print its name
# followed by its number of distinct values, so that high-cardinality
# columns can be spotted.
categorical = y2015.select_dtypes(include=['object'])
for col_name in categorical.columns:
    print(col_name)
    print(categorical[col_name].nunique())
In [4]:
# Drop other columns with many unique values; they are removed before the
# frame is one-hot encoded further down.
# The labels are passed through `columns=`: the former positional axis
# argument (`1`) is rejected with a TypeError by pandas >= 2.0.
y2015.drop(
    columns=['url', 'emp_title', 'zip_code', 'earliest_cr_line', 'revol_util',
             'sub_grade', 'addr_state', 'desc'],
    inplace=True,
)
In [5]:
# Display the last five rows (the default for tail()) to inspect the end of the file.
y2015.tail()
Out[5]:
In [6]:
# The final two rows are summary lines rather than real records, so keep
# every row except those two.
# NOTE: this cell is not idempotent -- each re-run trims two more rows.
y2015 = y2015.iloc[:-2]
In [7]:
# Convert ID and Interest Rate to numeric.
# `errors='coerce'` turns any value that cannot be parsed into NaN; the
# interest rate has its '%' characters stripped before parsing.
# Building a new frame with `assign` (instead of setting columns on the
# frame produced by the earlier row slice) avoids pandas'
# SettingWithCopyWarning while yielding the same columns in the same order.
y2015 = y2015.assign(
    id=pd.to_numeric(y2015['id'], errors='coerce'),
    int_rate=pd.to_numeric(y2015['int_rate'].str.strip('%'), errors='coerce'),
)
In [8]:
# Preview the one-hot encoded frame (first five rows); the result is only displayed, not stored.
pd.get_dummies(y2015).head()
Out[8]:
In [9]:
# Random forest classifier with scikit-learn's default hyperparameters.
# NOTE(review): no random_state is set, so scores vary from run to run.
rfc = ensemble.RandomForestClassifier()

# Features are every column except the target. `columns=` replaces the
# positional axis argument (`1`), which pandas >= 2.0 rejects.
X = y2015.drop(columns='loan_status')
Y = y2015['loan_status']

# One-hot encode the remaining text columns, then discard every feature
# column that contains at least one missing value.
X = pd.get_dummies(X)
X = X.dropna(axis=1)
In [10]:
# Preview the first five rows after dropping rows with missing values; X itself is not modified.
X.dropna().head()
Out[10]:
In [11]:
# Per-column check: True for each column that still contains at least one missing value.
X.isnull().any()
Out[11]:
In [12]:
# Feature selection: X1 is X without the columns named below.
# Columns are removed by name in a single drop() call. A missing name raises
# KeyError, so a change in the encoded column set is noticed immediately.
# The labels are passed directly through `columns=` rather than by first
# selecting a sub-frame of X just to enumerate its column names.
X1 = X.drop(columns=[
    'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'total_acc',
    'last_credit_pull_d_Apr-2016',
    'last_credit_pull_d_Aug-2015',
    'last_credit_pull_d_Aug-2016',
    'last_credit_pull_d_Dec-2014',
    'last_credit_pull_d_Dec-2015',
    'last_credit_pull_d_Dec-2016',
    'last_credit_pull_d_Feb-2015',
    'last_credit_pull_d_Feb-2016',
    'last_credit_pull_d_Jan-2015',
    'last_credit_pull_d_Jan-2016',
    'last_credit_pull_d_Jan-2017',
    'last_credit_pull_d_Jul-2015',
    'last_credit_pull_d_Jul-2016',
    'last_credit_pull_d_Jun-2015',
    'last_credit_pull_d_Jun-2016',
    'last_credit_pull_d_Mar-2015',
    'last_credit_pull_d_Mar-2016',
    'last_credit_pull_d_May-2015',
    'last_credit_pull_d_May-2016',
    'last_credit_pull_d_Nov-2015',
    'last_credit_pull_d_Nov-2016',
    'last_credit_pull_d_Oct-2015',
    'last_credit_pull_d_Oct-2016',
    'last_credit_pull_d_Sep-2015',
    'last_credit_pull_d_Sep-2016',
    'application_type_INDIVIDUAL',
    'application_type_JOINT',
    'verification_status_joint_Not Verified',
    'verification_status_joint_Source Verified',
    'verification_status_joint_Verified',
    'last_pymnt_d_Apr-2015',
    'last_pymnt_d_Apr-2016',
    'last_pymnt_d_Aug-2015',
    'last_pymnt_d_Aug-2016',
    'last_pymnt_d_Dec-2015',
    'last_pymnt_d_Dec-2016',
    'last_pymnt_d_Feb-2015',
    'last_pymnt_d_Feb-2016',
    'last_pymnt_d_Jan-2015',
    'last_pymnt_d_Jan-2016',
    'last_pymnt_d_Jan-2017',
    'last_pymnt_d_Jul-2015',
    'last_pymnt_d_Jul-2016',
    'last_pymnt_d_Jun-2015',
    'last_pymnt_d_Jun-2016',
    'last_pymnt_d_Mar-2015',
    'last_pymnt_d_Mar-2016',
    'last_pymnt_d_May-2015',
    'last_pymnt_d_May-2016',
    'last_pymnt_d_Nov-2015',
    'last_pymnt_d_Nov-2016',
    'last_pymnt_d_Oct-2015',
    'last_pymnt_d_Oct-2016',
    'last_pymnt_d_Sep-2015',
    'last_pymnt_d_Sep-2016',
    'next_pymnt_d_Feb-2017',
    'next_pymnt_d_Jan-2017',
    'next_pymnt_d_Jul-2016',
    'next_pymnt_d_Mar-2017',
    'last_credit_pull_d_Apr-2015',
    'purpose_wedding',
    'title_Business',
    'title_Car financing',
    'title_Credit Card/Auto Repair',
    'title_Credit card refinancing',
    'title_Debt consolidation',
    'title_DebtC',
    'title_Green loan',
    'title_Home buying',
    'title_Home improvement',
    'title_Learning and training',
    'title_Major purchase',
    'title_Medical expenses',
    'title_Moving and relocation',
    'title_New Baby and New House (CC Consolidate)',
    'title_Other',
    'title_Pay off Lowes Card',
    'title_Paying off higher interest cards & auto',
    'title_Prescription Drug and Medical Costs',
    'title_SAVE',
    'title_Simple Loan Until Contract Is Completed',
    'title_Student Loan',
    'title_Trying to come back to reality!',
    'title_Vacation',
    'title_considerate',
    'title_new day',
    'title_new kitchen for momma!',
    'title_odymeds',
    'initial_list_status_f',
    'initial_list_status_w',
    'home_ownership_RENT',
    'verification_status_Not Verified',
    'verification_status_Source Verified',
    'verification_status_Verified',
    'issue_d_Apr-2015',
    'issue_d_Aug-2015',
    'issue_d_Dec-2015',
    'issue_d_Feb-2015',
    'issue_d_Jan-2015',
    'issue_d_Jul-2015',
    'issue_d_Jun-2015',
    'issue_d_Mar-2015',
    'issue_d_May-2015',
    'issue_d_Nov-2015',
    'issue_d_Oct-2015',
    'issue_d_Sep-2015',
    'pymnt_plan_n',
    'purpose_car',
    'purpose_credit_card',
    'purpose_debt_consolidation',
    'purpose_educational',
    'purpose_home_improvement',
    'purpose_house',
    'purpose_major_purchase',
    'purpose_medical',
    'purpose_moving',
    'purpose_other',
    'purpose_renewable_energy',
    'purpose_small_business',
    'purpose_vacation',
    'term_ 36 months',
    'term_ 60 months',
    'grade_A',
    'grade_B',
    'grade_C',
    'grade_D',
    'grade_E',
    'grade_F',
    'grade_G',
    'emp_length_1 year',
    'emp_length_10+ years',
    'emp_length_2 years',
    'emp_length_3 years',
    'emp_length_4 years',
    'emp_length_5 years',
    'emp_length_6 years',
    'emp_length_7 years',
    'emp_length_8 years',
    'emp_length_9 years',
    'emp_length_< 1 year',
    'emp_length_n/a',
    'home_ownership_ANY',
    'home_ownership_MORTGAGE',
    'home_ownership_OWN',
    'num_accts_ever_120_pd',
    'num_actv_bc_tl',
    'num_actv_rev_tl',
    'num_bc_sats',
    'num_bc_tl',
    'num_il_tl',
    'num_op_rev_tl',
    'num_rev_tl_bal_gt_0',
    'num_sats',
    'num_tl_30dpd',
    'num_tl_90g_dpd_24m',
    'num_tl_op_past_12m',
    'pct_tl_nvr_dlq',
    'pub_rec_bankruptcies',
    'tax_liens',
    'tot_hi_cred_lim',
    'out_prncp',
])
In [13]:
# Make the correlation matrix: pairwise Pearson correlation (the pandas
# default, spelled out here) between the columns of X1, printed as text.
corrmat = X1.corr(method='pearson')
print(corrmat)
In [14]:
# 10-fold cross-validation of the random forest on the reduced feature set; displays the per-fold scores.
cross_val_score(rfc, X1, Y, cv=10)
Out[14]:
In [15]:
import time

# time.clock() was removed in Python 3.8 and raises AttributeError there;
# time.perf_counter() is the standard-library clock for measuring intervals.
start_time = time.perf_counter()
# NOTE(review): no work happens between the two clock reads, so this prints
# a value close to zero. To time the cross-validation, the timed call must
# sit between them (or use the %%time cell magic on that cell).
print(time.perf_counter() - start_time, "seconds")
In [16]:
# Mean of the 10 fold scores. This call re-runs the whole cross-validation rather than reusing earlier results.
cross_val_score(rfc, X1, Y, cv=10).mean()
Out[16]:
In [17]:
# time.clock() was removed in Python 3.8 and raises AttributeError there;
# time.perf_counter() is the standard-library clock for measuring intervals.
start_time = time.perf_counter()
# NOTE(review): the timer is restarted and read back immediately, so the
# printed duration is close to zero and does not cover any model fitting.
print(time.perf_counter() - start_time, "seconds")
In [18]:
# This is the model we'll be using.
from sklearn import tree
# A convenience for displaying visualizations.
from IPython.display import Image
# Packages for rendering our tree.
import pydotplus
import graphviz

# Initialize and train our tree: entropy criterion, a single candidate
# feature per split, depth capped at 4, fixed seed for repeatable output.
decision_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=1,
    max_depth=4,
    random_state=1337,
)
decision_tree.fit(X1, Y)

# Render our tree.
# feature_names must describe the columns the tree was fitted on (X1); the
# previous reference to `customers` is not defined anywhere in this notebook.
# class_names must follow the fitted estimator's class order, so the labels
# are taken from `classes_` instead of a hand-written list that could be
# incomplete or in the wrong order.
dot_data = tree.export_graphviz(
    decision_tree,
    out_file=None,
    feature_names=list(X1.columns),
    class_names=[str(label) for label in decision_tree.classes_],
    filled=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
In [ ]:
import time

# Wall-clock timing of a single main() call.
# NOTE(review): `main` is not defined in any visible cell of this notebook --
# confirm where it is supposed to come from before running this cell.
start_time = time.time()
main()
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)
In [ ]:
In [ ]: