The company deals in all kinds of home loans and has a presence across urban, semi-urban and rural areas. A customer first applies for a home loan; the company then validates the customer's eligibility for the loan.
The company wants to automate the loan eligibility process (in real time) based on the customer details provided while filling in the online application form. These details are Gender, Marital Status, Education, Number of Dependents, Income, Loan Amount, Credit History and others. To automate this process, they have posed the problem of identifying the customer segments that are eligible for a loan, so that they can specifically target these customers.
| Variable | Description |
|---|---|
| Loan_ID | Unique Loan ID |
| Gender | Male/ Female |
| Married | Applicant married (Y/N) |
| Dependents | Number of dependents |
| Education | Applicant Education (Graduate/ Under Graduate) |
| Self_Employed | Self employed (Y/N) |
| ApplicantIncome | Applicant income |
| CoapplicantIncome | Coapplicant income |
| LoanAmount | Loan amount in thousands |
| Loan_Amount_Term | Term of loan in months |
| Credit_History | credit history meets guidelines |
| Property_Area | Urban/ Semi Urban/ Rural |
| Loan_Status | Loan approved (Y/N) |
In [ ]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [ ]:
df = pd.read_csv("data/loan_prediction_train.csv")
In [ ]:
df.head()
In [ ]:
df.apply(lambda x: sum(x.isnull()), axis=0)
In [ ]:
df = df.dropna()
df.apply(lambda x: sum(x.isnull()), axis=0)
In [ ]:
df.describe()
In [ ]:
df['Property_Area'].value_counts()
In [ ]:
# Histogram of applicant income, split into 50 bins.
df['ApplicantIncome'].hist(bins=50)
In [ ]:
# Box plot of applicant income over all rows.
df.boxplot(column='ApplicantIncome')
In [ ]:
# Box plot of applicant income, one box per Education value.
df.boxplot(column='ApplicantIncome', by='Education')
In [ ]:
# Histogram of the loan amount, split into 50 bins.
df['LoanAmount'].hist(bins=50)
In [ ]:
# Box plot of the loan amount over all rows.
df.boxplot(column='LoanAmount')
In [ ]:
# Show the first rows of the table again.
df.head()
In [ ]:
# Data type of every column.
df.dtypes
In [ ]:
# Show the first rows of the table once more.
df.head()
In [ ]:
from sklearn.preprocessing import LabelEncoder
# Columns whose values are replaced by integer codes.
var_mod = [
    'Gender', 'Married', 'Dependents', 'Education',
    'Self_Employed', 'Property_Area', 'Loan_Status',
]
# One encoder object is re-fitted on every column in turn, so after the loop
# `le` only holds the fit for the last column in `var_mod`.
le = LabelEncoder()
for var in var_mod:
    encoded = le.fit_transform(df[var])
    df[var] = encoded
In [ ]:
df.head()
In [ ]:
df.corr()
In [ ]:
plt.scatter(df['Credit_History'], df['Loan_Status'], alpha=0.1)
In [ ]:
noise1 = np.random.normal(0, 0.1, len(df))
noise2 = np.random.normal(0, 0.1, len(df))
plt.scatter(df['Credit_History']+noise1, df['Loan_Status'] + noise2, alpha=0.1)
In [ ]:
from sklearn.linear_model import LogisticRegression
In [ ]:
logit_model = LogisticRegression()
logit_model
In [ ]:
predictors = df[['Credit_History']]
In [ ]:
logit_model.fit(df[['Credit_History']], df['Loan_Status'])
In [ ]:
from sklearn.cross_validation import KFold
In [ ]:
df.shape
In [ ]:
kf = KFold(len(df), n_folds=5)
error = []
for train, test in kf:
train_predictors = df[['Credit_History']].iloc[train,:]
train_target = df['Loan_Status'].iloc[train]
logit_model.fit(train_predictors, train_target)
error.append(logit_model.score(df[['Credit_History']].iloc[test,:], df['Loan_Status'].iloc[test]))
print("Cross-Validation Score ", np.mean(error))
In [ ]:
def fit_model(model, data, predictors, outcome, num_fold=5):
    """Cross-validate ``model`` on ``data``, then refit it on every row.

    The rows are split into ``num_fold`` contiguous, unshuffled folds. When
    the row count is not divisible by ``num_fold`` the first folds get one
    extra row, which is the same partition an unshuffled scikit-learn KFold
    produces. The folds are generated with NumPy so the function does not
    depend on the ``KFold(n, n_folds=...)`` API of ``sklearn.cross_validation``,
    which was removed in scikit-learn 0.20.

    Prints the mean of the per-fold ``model.score`` values and the score of
    the final model on the full data it was fitted on.

    Args:
        model: Object exposing ``fit(X, y)`` and ``score(X, y)``.
        data: pandas DataFrame containing the predictor and outcome columns.
        predictors: List of column names used as features.
        outcome: Name of the target column.
        num_fold: Number of cross-validation folds.

    Returns:
        The same ``model`` object, fitted on all rows of ``data``.

    Raises:
        ValueError: If ``num_fold`` is smaller than 2 or larger than the
            number of rows in ``data``.
    """
    n_rows = data.shape[0]
    if not 2 <= num_fold <= n_rows:
        raise ValueError(
            "num_fold must be between 2 and the number of rows "
            "(%d), got %r" % (n_rows, num_fold)
        )

    features = data[predictors]
    target = data[outcome]
    all_rows = np.arange(n_rows)

    scores = []
    for test_rows in np.array_split(all_rows, num_fold):
        # Every row outside the current test fold is used for training.
        train_rows = np.setdiff1d(all_rows, test_rows)
        model.fit(features.iloc[train_rows, :], target.iloc[train_rows])
        scores.append(model.score(features.iloc[test_rows, :], target.iloc[test_rows]))
    print("Cross-Validation Score :", np.mean(scores))

    # Final fit on all rows; this score is measured on the training data itself.
    model.fit(features, target)
    accuracy = model.score(features, target)
    print("Accuracy: ", accuracy)
    return model
In [ ]:
logit_model = LogisticRegression()
logit_model = fit_model(logit_model, df, ['Credit_History'], 'Loan_Status')
In [ ]:
df.columns
In [ ]:
predictor_list = ['Gender', 'Married', 'Dependents', 'Education',
'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome',
'LoanAmount','Loan_Amount_Term', 'Credit_History', 'Property_Area']
logit_model = fit_model(logit_model, df, predictor_list, 'Loan_Status')
In [ ]:
from sklearn.tree import DecisionTreeClassifier
decision_tree_model = DecisionTreeClassifier()
predictor_var = ['Credit_History', 'Gender', 'Married', 'Education']
outcome_var = 'Loan_Status'
decision_tree_model = fit_model(decision_tree_model, df, predictor_var, outcome_var)
In [ ]:
predictor_var = ['Credit_History', 'Loan_Amount_Term', 'LoanAmount']
decision_tree_model = fit_model(decision_tree_model, df, predictor_var, outcome_var)