In [ ]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
%%html
<style> table {float:left} </style>
About Company:
Dream Housing Finance company deals in all kinds of home loans. They have a presence across urban, semi-urban and rural areas. A customer first applies for a home loan; after that, the company validates the customer's eligibility for the loan.
The company wants to automate the loan eligibility process (in real time) based on the customer details provided while filling in the online application form. These details are Gender, Marital Status, Education, Number of Dependents, Income, Loan Amount, Credit History and others. To automate this process, they have posed the problem of identifying the customer segments that are eligible for a loan, so that they can specifically target these customers. They have provided a partial data set.
Variable Description
| Variable | Description |
|---|---|
| Loan_ID | Unique Loan ID |
| Gender | Male/ Female |
| Married | Applicant married (Y/N) |
| Dependents | Number of dependents |
| Education | Applicant Education (Graduate/ Under Graduate) |
| Self_Employed | Self employed (Y/N) |
| ApplicantIncome | Applicant income |
| CoapplicantIncome | Coapplicant income |
| LoanAmount | Loan amount in thousands |
| Loan_Amount_Term | Term of loan in months |
| Credit_History | credit history meets guidelines |
| Property_Area | Urban/ Semi Urban/ Rural |
| Loan_Status | Loan approved (Y/N) |
In [ ]:
# Load the training data (train_loan_predection.csv) into a pandas DataFrame.
# The path is relative to the directory the notebook is run from.
loan_df = pd.read_csv('./data/loan/train_loan_predection.csv')
In [ ]:
loan_df.head()
In [ ]:
len(loan_df)
In [ ]:
loan_df.info()
In [ ]:
# Number of distinct Loan_ID values (a missing ID, if any, counts as one value).
loan_df['Loan_ID'].nunique(dropna=False)
In [ ]:
loan_df = loan_df.set_index('Loan_ID')
In [ ]:
loan_df.head(2)
In [ ]:
loan_df.loc['LP001008']
In [ ]:
loan_df.isnull().sum()
In [ ]:
loan_df[loan_df.Gender.isnull()]
In [ ]:
loan_df[loan_df.Married.isnull()]
In [ ]:
len(loan_df)
In [ ]:
# Drop every row that has at least one missing value.
# .copy() makes the result an independent DataFrame, so the column
# assignments in later cells (Dependents, Count, ...) write to this frame
# directly and cannot raise pandas' SettingWithCopyWarning.
loan_df = loan_df.dropna(how='any').copy()
In [ ]:
len(loan_df)
In [ ]:
loan_df.isnull().sum()
In [ ]:
loan_df.columns
In [ ]:
loan_df.dtypes
In [ ]:
loan_df.Dependents.value_counts()
In [ ]:
loan_df.Dependents.apply(lambda x:str(type(x))).value_counts()
In [ ]:
# Turn the Dependents strings into integers: strip the literal '+'
# and cast what is left.
dependents_text = loan_df['Dependents'].str.replace('+', '', regex=False)
loan_df['Dependents'] = dependents_text.astype('int64')
In [ ]:
# The Dependents column now holds discrete integer values.
loan_df.Dependents.value_counts()
In [ ]:
loan_df.dtypes
In [ ]:
# add Dummy column 'Count', will be used for aggregations
loan_df['Count'] = 1
In [ ]:
loan_df.ApplicantIncome.describe()
In [ ]:
# Box plots of both income columns side by side on one wide figure.
columns = ['ApplicantIncome', 'CoapplicantIncome']
loan_df.loc[:, columns].plot.box(figsize=(20, 6))
In [ ]:
# Overlaid, semi-transparent histograms of the incomes and the loan amount.
columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
loan_df.loc[:, columns].plot.hist(bins=20, grid=True, alpha=0.3)
In [ ]:
# Bar chart of each Gender value as a percentage of all rows.
column_name = 'Gender'
value_share = loan_df[column_name].value_counts() * 100 / len(loan_df)
value_share.to_frame().plot.bar()
In [ ]:
# Stacked bars: for every value of `column_name`, the share (in % of all
# rows) of approved and rejected loans. The helper column 'Count' is summed
# per (column_name, Loan_Status) group to obtain the row counts.
# Other columns to try: "Gender", "Dependents", "Loan_Amount_Term",
# "Self_Employed", "Education", 'Credit_History'.
column_name = "Married"
columns = [column_name, 'Loan_Status', 'Count']
counts_by_status = loan_df[columns].groupby([column_name, 'Loan_Status']).sum()
grp_df = counts_by_status.unstack() * 100 / len(loan_df)
grp_df.plot.bar(stacked=True, figsize=(8, 4))
Loans have been sanctioned to very few applicants whose Credit_History is 0.
In [ ]:
# Applications that were approved (Loan_Status == 'Y') even though
# Credit_History is 0.
loan_df[(loan_df.Credit_History==0) & (loan_df.Loan_Status=='Y')]
In [ ]:
# Numeric copy of the target column: 'Y' -> 1, 'N' -> 0.
loan_status_map = dict(Y=1, N=0)
loan_df['Loan_Status_int'] = loan_df['Loan_Status'].map(loan_status_map)
In [ ]:
# Loan amount relative to the applicant's income, scaled by 100.
# NOTE(review): the variable table gives LoanAmount in thousands, while the
# unit of ApplicantIncome is not stated -- confirm before reading this
# column as a true percentage.
loan_df['LoanAmount_pct'] = loan_df['LoanAmount'].mul(100).div(loan_df['ApplicantIncome'])
In [ ]:
# Count the rows per LoanAmount_pct interval, listed in interval order.
# Values outside (0, 100] fall into no interval and are not counted.
pct_bin_edges = [0, 1, 2, 3, 4, 5, 6, 8, 10, 20, 100]
pct_bins = pd.cut(loan_df['LoanAmount_pct'], pct_bin_edges)
pct_bins.value_counts().sort_index()
In [ ]:
loan_df.Loan_Amount_Term.value_counts()
In [ ]:
plt.scatter(loan_df.LoanAmount_pct,loan_df.ApplicantIncome)
plt.show()
In [ ]:
loan_df[loan_df.LoanAmount_pct > 10]
In [ ]:
loan_df.LoanAmount_pct.plot(kind='hist',bins=20)
In [ ]:
loan_df['TotalIncome'] = loan_df.ApplicantIncome + loan_df.CoapplicantIncome
In [ ]:
loan_df[loan_df.Loan_Status=='Y'].TotalIncome.plot(kind='hist',bins=20)
In [ ]:
loan_df[loan_df.Loan_Status=='N'].TotalIncome.plot(kind='hist',bins=20)
In [ ]:
loan_df[(loan_df.TotalIncome > 30000) & (loan_df.Loan_Status=='N')]
In [ ]:
# Rejected applications (Loan_Status == 'N') whose Credit_History is 1:
# print how many there are, then show the first few.
has_credit_history = loan_df['Credit_History'] == 1
was_rejected = loan_df['Loan_Status'] == 'N'
temp_df = loan_df[has_credit_history & was_rejected]
print(len(temp_df))
temp_df.head()
In [ ]:
loan_df[loan_df.Loan_Status=='N'].describe(include='all')
In [ ]:
loan_df[loan_df.Loan_Status=='Y'].describe(include='all')
In [ ]:
loan_df.columns
In [ ]:
# Columns kept for modelling: the original input features plus the target
# (Loan_Status). Helper columns added during exploration (Count,
# Loan_Status_int, LoanAmount_pct, TotalIncome) are left out.
# The list was previously assigned twice with identical content; one
# assignment is enough.
columns_retained = [
    'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
    'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
    'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status',
]
In [ ]:
# One-hot encode the categorical columns of the retained feature set.
# drop_first=True drops the first indicator of every encoded column; the
# encoded target is read as 'Loan_Status_Y' in a later cell.
columns = [
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Property_Area',
    'Loan_Status',
]
df = pd.get_dummies(loan_df[columns_retained], columns=columns, drop_first=True)
In [ ]:
df.head()
In [ ]:
import sklearn
In [ ]:
from sklearn.model_selection import train_test_split
In [ ]:
# Hold out 20% of the rows for evaluation; random_state=42 fixes the shuffle.
# Both frames still contain the target column at this point; it is split
# off in the next cell.
X_train,X_test = train_test_split(df,test_size=0.2, random_state=42)
In [ ]:
# Separate the target column from the features in both partitions.
target_column = 'Loan_Status_Y'
y_train, y_test = X_train[target_column], X_test[target_column]
X_train = X_train.drop(columns=target_column)
X_test = X_test.drop(columns=target_column)
In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,accuracy_score
In [ ]:
# Random forest with 500 trees, all features considered at every split
# (max_features=None), trained on all CPU cores (n_jobs=-1).
# random_state is fixed, like the train/test split above, so that the
# fitted model, the feature ranking and the scores are reproducible
# between runs.
clf = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
    max_features=None,
    random_state=42,
)
In [ ]:
clf.fit(X_train,y_train)
In [ ]:
# Print the feature names from most to least important.
# The mapping is keyed by feature name. Keying it by the importance value
# (as before) made features with the same importance -- e.g. several
# features at 0.0 -- overwrite each other, so they vanished from the list.
d = dict(zip(X_test.columns, clf.feature_importances_))
for feature_name in sorted(d, key=d.get, reverse=True):
    print(feature_name)
In [ ]:
y_pred = clf.predict(X_test)
In [ ]:
confusion_matrix(y_pred=y_pred,y_true=y_test)
In [ ]:
accuracy_score(y_pred=y_pred,y_true=y_test)
In [ ]:
# NOTE(review): `df_test` is not defined in any cell visible above -- the
# cell that loads the test data seems to be missing; confirm before running.
# Rows of the test data that have no LoanAmount.
df_test[df_test.LoanAmount.isnull()]
In [ ]:
# Drop every test row that has at least one missing value.
# .copy() makes the result an independent DataFrame, so the Dependents
# assignment in a later cell cannot raise pandas' SettingWithCopyWarning.
df_test = df_test.dropna(how='any').copy()
In [ ]:
len(df_test)
In [ ]:
# Same conversion as for the training data: strip the literal '+' from the
# Dependents strings and cast to integers.
test_dependents_text = df_test['Dependents'].str.replace('+', '', regex=False)
df_test['Dependents'] = test_dependents_text.astype('int64')
In [ ]:
# Feature columns kept for the test data (no Loan_Status entry here).
columns_retained = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
    'Property_Area',
]
In [ ]:
# One-hot encode the test features the same way as the training features.
columns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']
df_test = df_test[columns_retained]
df_test = pd.get_dummies(df_test, columns=columns, drop_first=True)
# get_dummies only creates indicators for categories present in *this*
# frame, so the test columns can differ in set or order from the matrix the
# model was fitted on. Align them with X_train: indicators missing here are
# added as all-zero columns, and the column order is made identical.
df_test = df_test.reindex(columns=X_train.columns, fill_value=0)
In [ ]:
y_test_pred = clf.predict(df_test)
In [ ]:
# Store the predictions as a new column named like the training target.
# NOTE(review): this adds a column to the very frame that was passed to
# clf.predict, so re-running the predict cell afterwards would hand the
# model one extra column.
df_test['Loan_Status_Y'] = y_test_pred
In [ ]:
df_test[df_test.Loan_Status_Y==0]
In [ ]:
# x: LoanAmount * 100 / ApplicantIncome for every row.
# b: the rows where that ratio is above 8.
x = loan_df['LoanAmount'] * 100 / loan_df['ApplicantIncome']
b = loan_df[x > 8]
In [ ]:
x.head()
In [ ]:
b[b.Loan_Status=='N']
In [ ]: