In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.metrics import accuracy_score

import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier



In [2]:
#Load Data
X_train=pd.read_csv('Train.csv')
X_test=pd.read_csv('Test.csv')

In [3]:
#Inspect dtypes and non-null counts to spot missing values in all columns
X_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
Loan_ID              614 non-null object
Gender               601 non-null object
Married              611 non-null object
Dependents           599 non-null float64
Education            614 non-null object
Self_Employed        582 non-null object
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           592 non-null float64
Loan_Amount_Term     600 non-null float64
Credit_History       564 non-null float64
Property_Area        614 non-null object
Loan_Status          614 non-null object
dtypes: float64(5), int64(1), object(7)
memory usage: 62.4+ KB

In [4]:
X_test.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
Loan_ID              367 non-null object
Gender               356 non-null object
Married              367 non-null object
Dependents           357 non-null float64
Education            367 non-null object
Self_Employed        344 non-null object
ApplicantIncome      367 non-null int64
CoapplicantIncome    367 non-null int64
LoanAmount           362 non-null float64
Loan_Amount_Term     361 non-null float64
Credit_History       338 non-null float64
Property_Area        367 non-null object
dtypes: float64(4), int64(2), object(6)
memory usage: 34.5+ KB

In [5]:
#Target variable: Loan_Status
#Encode labels as binary (Y -> 1, N -> 0)
X_train.Loan_Status=X_train.Loan_Status.map({'Y':1,'N':0})

In [6]:
#Gender

In [7]:
#impute gender by mode to fill NaNs
X_train['Gender']=X_train['Gender'].fillna(X_train['Gender'].value_counts().index[0])
X_test['Gender']=X_test['Gender'].fillna(X_test['Gender'].value_counts().index[0])

#Recheck for NaNs as above
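One way to do the recheck mentioned here (a minimal sketch, not part of the original run):

#Count remaining NaNs; the imputed column should report 0 in both sets
print(X_train['Gender'].isnull().sum(), X_test['Gender'].isnull().sum())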

In [8]:
#Married

In [9]:
#impute Married by mode to fill NaNs
X_train['Married']=X_train['Married'].fillna(X_train['Married'].value_counts().index[0])
#No imputation needed for the test set: Married has no missing values there (367 non-null in X_test.info() above)

#Recheck for NaNs as above

In [10]:
#Dependents

In [11]:
#impute Dependents by mode to fill NaNs
X_train['Dependents']=X_train['Dependents'].fillna(X_train['Dependents'].value_counts().index[0])
X_test['Dependents']=X_test['Dependents'].fillna(X_test['Dependents'].value_counts().index[0])

In [12]:
#impute Self_Employed by mode to fill NaNs
X_train['Self_Employed']=X_train['Self_Employed'].fillna(X_train['Self_Employed'].value_counts().index[0])
X_test['Self_Employed']=X_test['Self_Employed'].fillna(X_test['Self_Employed'].value_counts().index[0])

#recheck for NaNs as above

In [13]:
#Credit_History

In [14]:
#impute Credit_History by mode to fill NaNs
X_train['Credit_History']=X_train['Credit_History'].fillna(X_train['Credit_History'].value_counts().index[0])
X_test['Credit_History']=X_test['Credit_History'].fillna(X_test['Credit_History'].value_counts().index[0])

#recheck for NaNs as above

In [15]:
#Use LabelEncoder to map the remaining categorical variables to numeric codes

#Note: here every feature has the same set of categories in the train and test sets, so an encoder fitted on the
#training set yields the same mapping for the test set.
#If a feature had different categories in train and test, it would need different handling,
#e.g. fitting the encoder on the row-bound union of the two sets (see the sketch after this cell).

var_mod = ['Gender','Married','Education','Self_Employed','Property_Area']
le = LabelEncoder()
for i in var_mod:
    X_train[i] = le.fit_transform(X_train[i])
    X_test[i] = le.transform(X_test[i])
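If a feature's categories did differ between the two sets, a minimal sketch of the row-binding approach mentioned above ('SomeFeature' is a hypothetical column name, not one from this dataset):

#Fit the encoder on the concatenation of both sets so every category is seen once
combined = pd.concat([X_train['SomeFeature'], X_test['SomeFeature']])
le_union = LabelEncoder().fit(combined)
X_train['SomeFeature'] = le_union.transform(X_train['SomeFeature'])
X_test['SomeFeature'] = le_union.transform(X_test['SomeFeature'])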

In [16]:
#Some applicants have a high ApplicantIncome while others have a high CoapplicantIncome,
#so it may be useful to combine the two into a single Total_Income feature (feature engineering)
X_train['Total_Income']=(X_train.ApplicantIncome + X_train.CoapplicantIncome)
X_test['Total_Income']=(X_test.ApplicantIncome + X_test.CoapplicantIncome)

In [17]:
#Dropping ApplicantIncome and CoapplicantIncome
X_train=X_train.drop(['ApplicantIncome','CoapplicantIncome'],axis=1)
X_test=X_test.drop(['ApplicantIncome','CoapplicantIncome'],axis=1)

In [18]:
#LoanAmount

In [19]:
#Build pivot tables of mean LoanAmount grouped by Education and Self_Employed for the train and test sets
impute_grps_train = X_train.pivot_table(values=["LoanAmount"], index=["Education","Self_Employed"], aggfunc=np.mean)
impute_grps_test = X_test.pivot_table(values=["LoanAmount"], index=["Education","Self_Employed"], aggfunc=np.mean)
impute_grps_train


Out[19]:
                         LoanAmount
Education Self_Employed
0         0              149.724566
          1              182.241935
1         0              115.900000
          1              134.647059

In [20]:
#Iterate only over rows with a missing LoanAmount and impute using the group means computed above
#(a vectorized alternative is sketched after this cell)
for i,row in X_train.loc[X_train['LoanAmount'].isnull(),:].iterrows():
    ind = (row['Education'], row['Self_Employed'])
    X_train.loc[i,'LoanAmount'] = impute_grps_train.loc[ind].values[0]

for i,row in X_test.loc[X_test['LoanAmount'].isnull(),:].iterrows():
    ind = (row['Education'], row['Self_Employed'])
    X_test.loc[i,'LoanAmount'] = impute_grps_test.loc[ind].values[0]

#Recheck for NaNs as above
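The same group-mean imputation can be written without iterrows(); a vectorized sketch under the same Education/Self_Employed grouping:

#Fill each NaN with the mean LoanAmount of its (Education, Self_Employed) group
X_train['LoanAmount'] = X_train.groupby(['Education','Self_Employed'])['LoanAmount'].transform(lambda s: s.fillna(s.mean()))
X_test['LoanAmount'] = X_test.groupby(['Education','Self_Employed'])['LoanAmount'].transform(lambda s: s.fillna(s.mean()))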

In [21]:
#Loan_Amount_Term

In [22]:
#impute by mode to fill NaNs
X_train['Loan_Amount_Term']=X_train['Loan_Amount_Term'].fillna(X_train['Loan_Amount_Term'].value_counts().index[0])
X_test['Loan_Amount_Term']=X_test['Loan_Amount_Term'].fillna(X_test['Loan_Amount_Term'].value_counts().index[0])

In [23]:
#Convert Loan_Amount_Term from float to int
X_train['Loan_Amount_Term'] = X_train['Loan_Amount_Term'].astype(int)
X_test['Loan_Amount_Term'] = X_test['Loan_Amount_Term'].astype(int)

In [24]:
#Feature Engineering
#Add an EMI feature: EMI = [P x R x (1+R)^N] / [(1+R)^N - 1]
#where P is LoanAmount (given), N is the repayment period in months (given), and R is the per-period interest rate, assumed here to be 9%

X_train['EMI']=(X_train.LoanAmount*0.09*(1.09**X_train.Loan_Amount_Term))/((1.09**X_train.Loan_Amount_Term)-1)
X_test['EMI']=(X_test.LoanAmount*0.09*(1.09**X_test.Loan_Amount_Term))/((1.09**X_test.Loan_Amount_Term)-1)
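As a sanity check on the formula, a standalone version with purely illustrative inputs (not part of the original run):

def emi(P, R, N):
    #EMI = P*R*(1+R)^N / ((1+R)^N - 1), with R the per-period rate
    return P * R * (1 + R)**N / ((1 + R)**N - 1)

print(emi(100, 0.09, 360))   #~9.0: for large N, (1+R)^N dominates and EMI approaches P*R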

In [25]:
#Banks use the EMI-to-income ratio when evaluating applications: the higher this ratio, the lower the chances of approval
X_train['EMI/Income']=X_train.EMI/X_train.Total_Income
X_test['EMI/Income']=X_test.EMI/X_test.Total_Income

In [26]:
#EMI/Income is derived from EMI, LoanAmount, Loan_Amount_Term and Total_Income, so these features are highly correlated.
#We retain EMI/Income and drop the features it is derived from; this also removes the need for log transforms to handle their outliers.
X_train.drop(labels=['Total_Income','LoanAmount','Loan_Amount_Term','EMI'], inplace=True, axis=1)
X_test.drop(labels=['Total_Income','LoanAmount','Loan_Amount_Term','EMI'], inplace=True, axis=1)

In [27]:
#Separate out labels
y_train= X_train['Loan_Status'].copy()
X_train.drop(labels=['Loan_Status'], inplace=True, axis=1)

In [28]:
#Drop Loan_ID (an identifier, not a predictor) and Gender; keep the test-set IDs for the submission file
X_train.drop(labels=['Loan_ID','Gender'], inplace=True, axis=1)

loan_ids=X_test['Loan_ID'].copy()
X_test.drop(labels=['Loan_ID','Gender'], inplace=True, axis=1)

In [29]:
X_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 7 columns):
Married           614 non-null int64
Dependents        614 non-null float64
Education         614 non-null int64
Self_Employed     614 non-null int64
Credit_History    614 non-null float64
Property_Area     614 non-null int64
EMI/Income        614 non-null float64
dtypes: float64(3), int64(4)
memory usage: 33.7 KB

In [30]:
X_test.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 7 columns):
Married           367 non-null int64
Dependents        367 non-null float64
Education         367 non-null int64
Self_Employed     367 non-null int64
Credit_History    367 non-null float64
Property_Area     367 non-null int64
EMI/Income        367 non-null float64
dtypes: float64(3), int64(4)
memory usage: 20.1 KB

In [31]:
#Convert the DataFrames to NumPy arrays
X_train=X_train.values
X_test=X_test.values

In [32]:
#Standardize features: fit the scaler on the training set only, then apply the same transform to the test set
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [33]:
#Logistic Regression 0.7778
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_logreg=logreg.predict(X_train)
accuracy = accuracy_score(y_train,y_logreg)
print("Accuracy (Training Data) : %s" % "{0:.2%}".format(accuracy))

print("Cross Validation Score : %s" % "{0:.2%}".format((cross_val_score(logreg, X_train, y_train, cv=5).mean())))


Accuracy (Training Data) : 80.78%
Cross Validation Score : 80.79%

In [34]:
#Random Forest Classification 0.784
rf = RandomForestClassifier(max_features=2, min_samples_split=5, max_depth=3, n_estimators=100, criterion='gini', random_state=0)
rf.fit(X_train, y_train)

y_rf=rf.predict(X_train)

accuracy = accuracy_score(y_train,y_rf)
print("Accuracy (Training Data) : %s" % "{0:.2%}".format(accuracy))

print("Cross Validation Score : %s" % "{0:.2%}".format((cross_val_score(rf, X_train, y_train, cv=5).mean())))


Accuracy (Training Data) : 82.25%
Cross Validation Score : 81.12%

In [35]:
#XGBoost 0.8125
gbm = xgb.XGBClassifier(n_estimators=180)
gbm.fit(X_train, y_train)

y_xgb=gbm.predict(X_train)

accuracy = accuracy_score(y_train,y_xgb)
print("Accuracy (Training Data) : %s" % "{0:.2%}".format(accuracy))

print("Cross Validation Score : %s" % "{0:.2%}".format((cross_val_score(gbm, X_train, y_train, cv=5).mean())))


Accuracy (Training Data) : 85.67%
Cross Validation Score : 77.04%

In [36]:
# KNeighborsClassifier 0.7778
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train, y_train)
y_knn=knn.predict(X_train)

accuracy = accuracy_score(y_train,y_knn)
print("Accuracy (Training Data) : %s" % "{0:.2%}".format(accuracy))
print("Cross Validation Score : %s" % "{0:.2%}".format((cross_val_score(knn,X_train, y_train, cv=5).mean())))


Accuracy (Training Data) : 82.57%
Cross Validation Score : 77.55%

In [37]:
#Predict on the test set with the XGBoost model and write the submission file
pred=gbm.predict(X_test)

submission = pd.DataFrame({"Loan_ID": loan_ids,"Loan_Status": pred})
submission.Loan_Status = submission.Loan_Status.map({1:'Y', 0:'N'})
submission.to_csv('submission.csv',sep=',',index=False)