In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
In [2]:
#Load Data
X_train=pd.read_csv('Train.csv')
X_test=pd.read_csv('Test.csv')
In [3]:
#Check dtypes and non-null counts to spot columns with missing values
X_train.info()
In [4]:
X_test.info()
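In [ ]:
#info() reports non-null counts; isnull().sum() lists the missing-value count
#per column directly (the "recheck for NaNs" steps below can reuse this cell)
print(X_train.isnull().sum())
print(X_test.isnull().sum())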
In [5]:
#Target variable: Loan_Status
#Encode the labels as binary (Y -> 1, N -> 0)
X_train.Loan_Status=X_train.Loan_Status.map({'Y':1,'N':0})
In [6]:
#Gender
In [7]:
#impute gender by mode to fill NaNs
X_train['Gender']=X_train['Gender'].fillna(X_train['Gender'].value_counts().index[0])
X_test['Gender']=X_test['Gender'].fillna(X_test['Gender'].value_counts().index[0])
#Recheck for NaNs as above
In [8]:
#Married
In [9]:
#Impute missing Married values with the mode
#(the test set has no missing Married values, so it is left untouched)
X_train['Married']=X_train['Married'].fillna(X_train['Married'].value_counts().index[0])
#Recheck for NaNs as above
In [10]:
#Dependents
In [11]:
#impute Dependents by mode to fill NaNs
X_train['Dependents']=X_train['Dependents'].fillna(X_train['Dependents'].value_counts().index[0])
X_test['Dependents']=X_test['Dependents'].fillna(X_test['Dependents'].value_counts().index[0])
In [12]:
#impute Self_Employed by mode to fill NaNs
X_train['Self_Employed']=X_train['Self_Employed'].fillna(X_train['Self_Employed'].value_counts().index[0])
X_test['Self_Employed']=X_test['Self_Employed'].fillna(X_test['Self_Employed'].value_counts().index[0])
#recheck for NaNs as above
In [13]:
#Credit_History
In [14]:
#impute Credit_History by mode to fill NaNs
X_train['Credit_History']=X_train['Credit_History'].fillna(X_train['Credit_History'].value_counts().index[0])
X_test['Credit_History']=X_test['Credit_History'].fillna(X_test['Credit_History'].value_counts().index[0])
#recheck for NaNs as above
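In [ ]:
#Equivalent compact alternative (a sketch, not the author's original code):
#the per-column cells above all apply the same mode imputation, so one loop
#covers them; .mode()[0] plays the same role as value_counts().index[0].
#Re-running it here is a harmless no-op since the NaNs are already filled.
for cat_col in ['Gender','Married','Dependents','Self_Employed','Credit_History']:
    X_train[cat_col] = X_train[cat_col].fillna(X_train[cat_col].mode()[0])
    X_test[cat_col] = X_test[cat_col].fillna(X_test[cat_col].mode()[0])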
In [15]:
#Use LabelEncoder to map all categorical variables to numeric codes
#Note: each of these features has the same set of categories in the train and
#test sets, so an encoder fitted on the training set yields the same mapping
#for the test set. A feature whose categories differ between the two sets
#would need different handling (e.g. row-binding train and test before
#fitting the encoder -- see the sketch after this cell)
var_mod = ['Gender','Married','Education','Self_Employed','Property_Area']
le = LabelEncoder()
for i in var_mod:
    X_train[i] = le.fit_transform(X_train[i])
    X_test[i] = le.transform(X_test[i])
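In [ ]:
#Sketch of the row-binding approach mentioned above (not needed for this data):
#when a feature's categories differ between train and test, fit one encoder on
#the concatenated column so both sets share a single mapping. Shown on
#throwaway copies so the already-encoded frames stay untouched.
demo_col = pd.concat([X_train['Property_Area'], X_test['Property_Area']])
le_joint = LabelEncoder().fit(demo_col)
train_enc = le_joint.transform(X_train['Property_Area'])
test_enc = le_joint.transform(X_test['Property_Area'])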
In [16]:
#Some applicants have a high ApplicantIncome while others have a high
#CoapplicantIncome, so combining the two into a single Total_Income feature
#is a natural feature-engineering step
X_train['Total_Income']=(X_train.ApplicantIncome + X_train.CoapplicantIncome)
X_test['Total_Income']=(X_test.ApplicantIncome + X_test.CoapplicantIncome)
In [17]:
#Dropping ApplicantIncome and CoapplicantIncome
X_train=X_train.drop(['ApplicantIncome','CoapplicantIncome'],axis=1)
X_test=X_test.drop(['ApplicantIncome','CoapplicantIncome'],axis=1)
In [18]:
#LoanAmount
In [19]:
#Mean LoanAmount for each (Education, Self_Employed) group in the train and test sets
impute_grps_train = X_train.pivot_table(values=["LoanAmount"], index=["Education","Self_Employed"], aggfunc="mean")
impute_grps_test = X_test.pivot_table(values=["LoanAmount"], index=["Education","Self_Employed"], aggfunc="mean")
impute_grps_train
Out[19]:
In [20]:
#iterate only through rows with missing LoanAmount
#Use means of these groups for LoanAmount imputation
for i,row in X_train.loc[X_train['LoanAmount'].isnull(),:].iterrows():
    ind = tuple([row['Education'],row['Self_Employed']])
    X_train.loc[i,'LoanAmount'] = impute_grps_train.loc[ind].values[0]
for i,row in X_test.loc[X_test['LoanAmount'].isnull(),:].iterrows():
    ind = tuple([row['Education'],row['Self_Employed']])
    X_test.loc[i,'LoanAmount'] = impute_grps_test.loc[ind].values[0]
#Recheck for NaNs as above
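In [ ]:
#Equivalent vectorized alternative (a sketch): groupby().transform('mean')
#fills each missing LoanAmount with its (Education, Self_Employed) group mean
#in one step, without iterrows. A no-op here because the loop above has
#already imputed every NaN.
for df in (X_train, X_test):
    group_means = df.groupby(['Education','Self_Employed'])['LoanAmount'].transform('mean')
    df['LoanAmount'] = df['LoanAmount'].fillna(group_means)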
In [21]:
#Loan_Amount_Term
In [22]:
#impute by mode to fill NaNs
X_train['Loan_Amount_Term']=X_train['Loan_Amount_Term'].fillna(X_train['Loan_Amount_Term'].value_counts().index[0])
X_test['Loan_Amount_Term']=X_test['Loan_Amount_Term'].fillna(X_test['Loan_Amount_Term'].value_counts().index[0])
In [23]:
#Convert Loan_Amount_Term from float to int
X_train['Loan_Amount_Term'] = X_train['Loan_Amount_Term'].astype(int)
X_test['Loan_Amount_Term'] = X_test['Loan_Amount_Term'].astype(int)
In [24]:
#Feature engineering
#Add an EMI (equated monthly instalment) feature: EMI = [P x R x (1+R)^N] / [(1+R)^N - 1]
#where P is LoanAmount (given), N is the repayment period in months (given),
#and R is the interest rate per period, assumed here to be 9%
X_train['EMI']=(X_train.LoanAmount*0.09*(1.09**X_train.Loan_Amount_Term))/((1.09**X_train.Loan_Amount_Term)-1)
X_test['EMI']=(X_test.LoanAmount*0.09*(1.09**X_test.Loan_Amount_Term))/((1.09**X_test.Loan_Amount_Term)-1)
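In [ ]:
#Sanity check of the EMI formula on one hypothetical loan (figures are
#illustrative only): P = 128, N = 360 months, R = 0.09 per period as above.
#Note the formula expects R per repayment period; for a 9% annual rate
#compounded monthly, R = 0.09/12 would be used instead.
P, N, R = 128.0, 360, 0.09
emi = (P * R * (1 + R)**N) / ((1 + R)**N - 1)
print("EMI for P=%.0f, N=%d, R=%.2f : %.4f" % (P, N, R, emi))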
In [25]:
#Banks use the EMI-to-income ratio when evaluating loan applications:
#the higher this ratio, the lower the chance of approval
X_train['EMI/Income']=X_train.EMI/X_train.Total_Income
X_test['EMI/Income']=X_test.EMI/X_test.Total_Income
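In [ ]:
#Quick check of the correlation claimed in the next cell (a sketch):
#EMI/Income is derived from the four other features below, so the matrix
#should show strong relationships among them
X_train[['EMI/Income','EMI','LoanAmount','Loan_Amount_Term','Total_Income']].corr()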
In [26]:
#EMI/Income is derived from EMI, LoanAmount, Loan_Amount_Term and Total_Income,
#so it is highly correlated with all of them. We retain EMI/Income and drop the
#features it is derived from, which also removes the need for log
#transformations to handle their outliers
X_train.drop(labels=['Total_Income','LoanAmount','Loan_Amount_Term','EMI'], inplace=True, axis=1)
X_test.drop(labels=['Total_Income','LoanAmount','Loan_Amount_Term','EMI'], inplace=True, axis=1)
In [27]:
#Separate out labels
y_train= X_train['Loan_Status'].copy()
X_train.drop(labels=['Loan_Status'], inplace=True, axis=1)
In [28]:
#Drop Loan_ID (an identifier, not a predictor) and Gender;
#keep the test-set IDs for the submission file
X_train.drop(labels=['Loan_ID','Gender'], inplace=True, axis=1)
loan_ids=X_test['Loan_ID'].copy()
X_test.drop(labels=['Loan_ID','Gender'], inplace=True, axis=1)
In [29]:
X_train.info()
In [30]:
X_test.info()
In [31]:
X_train=X_train.values
X_test=X_test.values
In [32]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
In [33]:
#Logistic Regression 0.7778
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_logreg=logreg.predict(X_train)
accuracy = accuracy_score(y_train,y_logreg)
print("Accuracy (Training Data) : %s" % "{0:.2%}".format(accuracy))
print("Cross Validation Score : %s" % "{0:.2%}".format((cross_val_score(logreg, X_train, y_train, cv=5).mean())))
In [34]:
#Random Forest Classification 0.784
rf = RandomForestClassifier(max_features=2, min_samples_split=5, max_depth=3, n_estimators=100, criterion='gini', random_state=0)
rf.fit(X_train, y_train)
y_rf=rf.predict(X_train)
accuracy = accuracy_score(y_train,y_rf)
print("Accuracy (Training Data) : %s" % "{0:.2%}".format(accuracy))
print("Cross Validation Score : %s" % "{0:.2%}".format((cross_val_score(rf, X_train, y_train, cv=5).mean())))
In [35]:
#XGBoost 0.8125
gbm = xgb.XGBClassifier(n_estimators=180)
gbm.fit(X_train, y_train)
y_xgb=gbm.predict(X_train)
accuracy = accuracy_score(y_train,y_xgb)
print("Accuracy (Training Data) : %s" % "{0:.2%}".format(accuracy))
print("Cross Validation Score : %s" % "{0:.2%}".format((cross_val_score(gbm, X_train, y_train, cv=5).mean())))
In [36]:
# KNeighborsClassifier 0.7778
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train, y_train)
y_knn=knn.predict(X_train)
accuracy = accuracy_score(y_train,y_knn)
print("Accuracy (Training Data) : %s" % "{0:.2%}".format(accuracy))
print("Cross Validation Score : %s" % "{0:.2%}".format((cross_val_score(knn,X_train, y_train, cv=5).mean())))
In [37]:
pred=gbm.predict(X_test)
submission = pd.DataFrame({"Loan_ID": loan_ids,"Loan_Status": pred})
submission.Loan_Status = submission.Loan_Status.map({1:'Y', 0:'N'})
submission.to_csv('submission.csv',sep=',',index=False)