In [ ]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
%%html
<style> /* left-align markdown tables rendered in this notebook */ table {float:left} </style>


Data Analysis - Problem Statement

About Company:

Dream Housing Finance company deals in all home loans. They have a presence across all urban, semi-urban and rural areas. Customers first apply for a home loan, after which the company validates the customer's eligibility for the loan.

The company wants to automate the loan eligibility process (in real time) based on customer details provided while filling in the online application form. These details are Gender, Marital Status, Education, Number of Dependents, Income, Loan Amount, Credit History and others. To automate this process, they have posed the problem of identifying the customer segments that are eligible for a loan amount, so that they can specifically target these customers. Here they have provided a partial data set.

Variable Description

Variable Description
Loan_ID Unique Loan ID
Gender Male/ Female
Married Applicant married (Y/N)
Dependents Number of dependents
Education Applicant Education (Graduate/ Under Graduate)
Self_Employed Self employed (Y/N)
ApplicantIncome Applicant income
CoapplicantIncome Coapplicant income
LoanAmount Loan amount in thousands
Loan_Amount_Term Term of loan in months
Credit_History credit history meets guidelines
Property_Area Urban/ Semi Urban/ Rural
Loan_Status Loan approved (Y/N)

In [ ]:
# Read train_loan_predection.csv into a pandas DataFrame.
# NOTE: the 'predection' spelling matches the actual file on disk — do not "fix" the path.
loan_df = pd.read_csv('./data/loan/train_loan_predection.csv')

Let's look at the data


In [ ]:
# Preview the first few rows
loan_df.head()

In [ ]:
# Total number of rows in the training set
len(loan_df)

In [ ]:
# Column dtypes and non-null counts
loan_df.info()

Set the index (this is not mandatory)


In [ ]:
# Loan_ID is unique for every row, so it is safe to use as the index
len(loan_df.Loan_ID.unique())

In [ ]:
# Use Loan_ID as the DataFrame index
loan_df = loan_df.set_index('Loan_ID')

In [ ]:
loan_df.head(2)

In [ ]:
# Look up a single application by its Loan_ID
loan_df.loc['LP001008']

Check for Null values


In [ ]:
# Count missing values per column
loan_df.isnull().sum()

In [ ]:
# Rows with a missing Gender
loan_df[loan_df.Gender.isnull()]

In [ ]:
# Rows with a missing Married flag
loan_df[loan_df.Married.isnull()]

Let's drop every row that has a null value in any cell


In [ ]:
len(loan_df)

In [ ]:
# Drop every row containing at least one null value.
# NOTE(review): this discards rows rather than imputing — acceptable for
# exploration, but consider imputation before final modelling.
loan_df = loan_df.dropna(how='any')

In [ ]:
# Row count after dropping — compare with the count above to see how many were lost
len(loan_df)

In [ ]:
# Confirm no nulls remain
loan_df.isnull().sum()

Verify column data types and convert if necessary


In [ ]:
loan_df.columns

In [ ]:
loan_df.dtypes

In [ ]:
# Dependents is stored as strings, including the open-ended category '3+'
loan_df.Dependents.value_counts()

In [ ]:
# Confirm every value in Dependents is a Python str
loan_df.Dependents.apply(lambda x:str(type(x))).value_counts()

In [ ]:
# Convert Dependents column to integers.
# NOTE(review): '3+' is mapped to 3, which collapses "3 or more" into exactly 3.
loan_df['Dependents'] = loan_df.Dependents.apply(lambda x: int(x.replace('+','')))

In [ ]:
# Now the Dependents column has discrete integer values
loan_df.Dependents.value_counts()

In [ ]:
loan_df.dtypes

In [ ]:
# Add dummy column 'Count' (always 1); it will be summed in groupby aggregations below
loan_df['Count'] = 1

Analyze numeric columns


In [ ]:
# Summary statistics for applicant income
loan_df.ApplicantIncome.describe()

In [ ]:
# Box plots to visualise income spread and outliers
columns = ['ApplicantIncome', 'CoapplicantIncome']
loan_df[columns].plot(kind='box',figsize=(20,6))

In [ ]:
# Overlaid histograms of the three numeric amount columns
columns = ['ApplicantIncome', 'CoapplicantIncome','LoanAmount']
loan_df[columns].plot(kind='hist',bins=20,grid=True,alpha=0.3)

Analyze categorical columns


In [ ]:
# Percentage distribution of Gender across all applications
column_name = 'Gender'
pd.DataFrame(loan_df[column_name].value_counts()*100/len(loan_df)).plot(kind='bar')

In [ ]:
# Choose one categorical column at a time (uncomment to switch) and plot a
# stacked bar of approval (Y/N) share within each category.
column_name = "Married"
#column_name = "Gender"
#column_name = "Dependents"
#column_name = "Loan_Amount_Term"
#column_name = "Self_Employed"
#column_name = "Education"
#column_name = 'Credit_History'

columns = [column_name,'Loan_Status','Count']

# Sum the dummy Count column per (category, Loan_Status) pair, then unstack
# Loan_Status into columns and convert counts to percentages of all rows.
grp_df = loan_df[columns].groupby([column_name,'Loan_Status']).sum()
grp_df = grp_df.unstack()*100/len(loan_df)
grp_df.plot(kind='bar',stacked=True,figsize=(8,4))

Loans have been sanctioned to very few people who have Credit_History equal to 0


In [ ]:
# Loans sanctioned to a few applicants even though their credit history is 0
# (i.e. does not meet guidelines)
loan_df[(loan_df.Credit_History==0) & (loan_df.Loan_Status=='Y')]

Addition of more fields


In [ ]:
# Encode Loan_Status as an integer (Y -> 1, N -> 0)
loan_status_map = {'Y':1,'N':0}
loan_df['Loan_Status_int'] = loan_df.Loan_Status.map(loan_status_map)

In [ ]:
# Loan amount as a percentage of applicant income
loan_df['LoanAmount_pct'] = loan_df.LoanAmount *100 / loan_df.ApplicantIncome

In [ ]:
# Bucket the ratio to inspect its distribution
pd.cut(loan_df['LoanAmount_pct'],[0,1,2,3,4,5,6,8,10,20,100]).value_counts().sort_index()

In [ ]:
# Distribution of loan terms (in months)
loan_df.Loan_Amount_Term.value_counts()

In [ ]:
# Ratio vs. applicant income scatter plot
plt.scatter(loan_df.LoanAmount_pct,loan_df.ApplicantIncome)
plt.show()

In [ ]:
# Applications where the loan amount exceeds 10% of applicant income
loan_df[loan_df.LoanAmount_pct > 10]

In [ ]:
loan_df.LoanAmount_pct.plot(kind='hist',bins=20)

In [ ]:
# Combined household income of applicant and co-applicant
loan_df['TotalIncome'] = loan_df.ApplicantIncome + loan_df.CoapplicantIncome

In [ ]:
# Income distribution for approved applications
loan_df[loan_df.Loan_Status=='Y'].TotalIncome.plot(kind='hist',bins=20)

In [ ]:
# Income distribution for rejected applications
loan_df[loan_df.Loan_Status=='N'].TotalIncome.plot(kind='hist',bins=20)

In [ ]:
# High-income applications that were still rejected
loan_df[(loan_df.TotalIncome > 30000) & (loan_df.Loan_Status=='N')]

In [ ]:
# Applications rejected despite a good credit history
temp_df = loan_df[(loan_df.Credit_History==1) & (loan_df.Loan_Status=='N')]
print(len(temp_df))
temp_df.head()

In [ ]:
# Per-column summary for rejected applications
loan_df[loan_df.Loan_Status=='N'].describe(include='all')

In [ ]:
# Per-column summary for approved applications
loan_df[loan_df.Loan_Status=='Y'].describe(include='all')

Let's prepare the data for building the model


In [ ]:
loan_df.columns

In [ ]:
columns_retained = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status']

columns_retained = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status']

In [ ]:
columns = ['Gender','Married','Education','Self_Employed','Property_Area','Loan_Status']
df = loan_df[columns_retained]
df = pd.get_dummies(df,columns=columns,drop_first=True)

In [ ]:
df.head()

In [ ]:
import sklearn

In [ ]:
from sklearn.model_selection import train_test_split

In [ ]:
# 80/20 train/test split; random_state fixed for reproducibility
X_train,X_test = train_test_split(df,test_size=0.2, random_state=42)

In [ ]:
# Separate the target (Loan_Status_Y, created by get_dummies) from the features
y_train = X_train['Loan_Status_Y']
y_test = X_test['Loan_Status_Y']
X_train = X_train.drop('Loan_Status_Y',axis=1)
X_test = X_test.drop('Loan_Status_Y',axis=1)

In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,accuracy_score

In [ ]:
# 500 trees; max_features=None means every split considers all features
clf = RandomForestClassifier(n_estimators=500,n_jobs=-1,max_features=None)

In [ ]:
clf.fit(X_train,y_train)

In [ ]:
d = dict(zip(clf.feature_importances_,X_test.columns))

In [ ]:
for k in sorted(d.keys(),reverse=True):
    print(d[k])

In [ ]:
# Predict on the held-out split
y_pred = clf.predict(X_test)

In [ ]:
# Confusion matrix: rows = actual, columns = predicted
confusion_matrix(y_pred=y_pred,y_true=y_test)

In [ ]:
# Overall fraction of correct predictions
accuracy_score(y_pred=y_pred,y_true=y_test)

In [ ]:
df_test[df_test.LoanAmount.isnull()]

In [ ]:
df_test = df_test.dropna(how='any')

In [ ]:
len(df_test)

In [ ]:
df_test['Dependents'] = df_test.Dependents.apply(lambda x: int(x.replace('+','')))

In [ ]:
columns_retained = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area']

In [ ]:
columns = ['Gender','Married','Education','Self_Employed','Property_Area']
df_test = df_test[columns_retained]
df_test = pd.get_dummies(df_test,columns=columns,drop_first=True)

In [ ]:
y_test_pred = clf.predict(df_test)

In [ ]:
df_test['Loan_Status_Y'] = y_test_pred

In [ ]:
df_test[df_test.Loan_Status_Y==0]

In [ ]:
x =loan_df.LoanAmount*100/loan_df.ApplicantIncome
b = loan_df[(loan_df.LoanAmount*100/loan_df.ApplicantIncome > 8) ]

In [ ]:
x.head()

In [ ]:
b[b.Loan_Status=='N']

In [ ]: