In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
# Download the training and test splits from their hosted CSV files.
# The generator keeps the two reads in lock-step with the two target names.
train, test = (
    pd.read_csv(csv_url)
    for csv_url in (
        "https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv",
        "https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv",
    )
)
In [4]:
# Combine train and test so that preprocessing is applied to both at once.
# Each frame is first tagged with its origin in a 'Type' column so the rows
# can be separated again later.
train['Type'], test['Type'] = 'Train', 'Test'
# Stack the tagged frames row-wise (train rows first, then test rows).
fullData = pd.concat([train, test], axis=0)
# Cell output: number of missing values in each column of the combined data.
fullData.isnull().sum()
Out[4]:
In [5]:
#Identify categorical and continuous variables
ID_col = ['Loan_ID']
target_col = ["Loan_Status"]
cat_cols = ['Credit_History','Dependents','Gender','Married','Education','Property_Area','Self_Employed']
other_col=['Type'] #Test and Train Data set identifier
# Every remaining column is treated as continuous. The list is built by
# walking fullData.columns in order (instead of list(set - set ...)) so that
# num_cols has a stable, reproducible order on every run.
excluded = set(cat_cols) | set(ID_col) | set(target_col) | set(other_col)
num_cols = [col for col in fullData.columns if col not in excluded]
In [6]:
#Imputing Missing values with mean for continuous variable
# NOTE: fillna(..., inplace=True) returns None, so assigning its result back
# would overwrite the columns with None. Use the returned (filled) copy instead.
fullData[num_cols] = fullData[num_cols].fillna(fullData[num_cols].mean())
#Imputing Missing values with mode for categorical variables
# mode() can return several rows when there are ties; the first row is used,
# giving one fill value per categorical column (indexed by column name).
cat_imput = fullData[cat_cols].mode().iloc[0]
fullData[cat_cols] = fullData[cat_cols].fillna(cat_imput)
In [7]:
#Create a new column as Total Income
fullData['TotalIncome']=fullData['ApplicantIncome']+fullData['CoapplicantIncome']
#Take a log of TotalIncome + 1; adding 1 deals with zeros of TotalIncome if they exist
# (np.log(0) would yield -inf, which the downstream models cannot consume).
fullData['Log_TotalIncome']=np.log(fullData['TotalIncome'] + 1)
In [8]:
#create label encoders for categorical features
# Each categorical column is cast to str and replaced by integer codes.
# A fresh encoder is fitted per column; the per-column encoders are not kept.
for var in cat_cols:
    number = LabelEncoder()
    fullData[var] = number.fit_transform(fullData[var].astype('str'))
# Split back into the original train / test rows using the 'Type' flag.
# .copy() makes these independent frames, so the column assignments performed
# on them below and in later cells do not raise SettingWithCopyWarning.
train_modified = fullData[fullData['Type']=='Train'].copy()
test_modified = fullData[fullData['Type']=='Test'].copy()
# Encode the target with its own encoder. `number` is deliberately left bound
# to this target encoder: the prediction cells call number.inverse_transform()
# to map predicted codes back to the original Loan_Status labels.
number = LabelEncoder()
train_modified["Loan_Status"] = number.fit_transform(train_modified["Loan_Status"].astype('str'))
In [9]:
from sklearn.linear_model import LogisticRegression
# Feature subset used by the model fitted in the next cell.
predictors = ['Credit_History', 'Education', 'Gender']
# Encoded target for the training rows.
y_train = train_modified["Loan_Status"].values
# Feature matrices as plain NumPy arrays, same column order for both splits.
x_train, x_test = (
    split_frame[list(predictors)].values
    for split_frame in (train_modified, test_modified)
)
In [10]:
# Create logistic regression object
model = LogisticRegression()
# Train the model using the training sets
model.fit(x_train, y_train)
#Predict Output (integer-encoded class labels)
predicted = model.predict(x_test)
#Reverse encoding for predicted outcome
# `number` must still be the encoder that was fitted on Loan_Status.
predicted = number.inverse_transform(predicted)
#Store it to test dataset
# assign() builds a new frame rather than writing into what may be a slice of
# fullData, which avoids SettingWithCopyWarning.
test_modified = test_modified.assign(Loan_Status=predicted)
#Output file to make submission
# index=False keeps the file to exactly the two requested columns; otherwise
# the row index is written as an extra unnamed leading column.
test_modified.to_csv("Submission1.csv", columns=['Loan_ID','Loan_Status'], index=False)
In [11]:
# Feature subset used by the model fitted in the next cell.
predictors = ['Credit_History', 'Education', 'Gender']
# Encoded target for the training rows.
y_train = train_modified["Loan_Status"].values
# Feature matrices as plain NumPy arrays, same column order for both splits.
x_train, x_test = (
    split_frame[list(predictors)].values
    for split_frame in (train_modified, test_modified)
)
In [12]:
from sklearn.tree import DecisionTreeClassifier
# Create Decision Tree object
# A fixed random_state makes the fitted tree (and the submission) reproducible
# across runs.
model = DecisionTreeClassifier(random_state=0)
# Train the model using the training sets
model.fit(x_train, y_train)
#Predict Output (integer-encoded class labels)
predicted = model.predict(x_test)
#Reverse encoding for predicted outcome
# `number` must still be the encoder that was fitted on Loan_Status.
predicted = number.inverse_transform(predicted)
#Store it to test dataset
# assign() builds a new frame rather than writing into what may be a slice of
# fullData, which avoids SettingWithCopyWarning.
test_modified = test_modified.assign(Loan_Status=predicted)
#Output file to make submission
# index=False keeps the file to exactly the two requested columns; otherwise
# the row index is written as an extra unnamed leading column.
test_modified.to_csv("Submission2.csv", columns=['Loan_ID','Loan_Status'], index=False)
In [13]:
from sklearn.linear_model import LogisticRegression
# Full feature set: raw numeric columns, encoded categoricals and the two
# engineered income columns.
predictors = [
    'ApplicantIncome', 'CoapplicantIncome', 'Credit_History', 'Dependents',
    'Education', 'Gender', 'LoanAmount', 'Loan_Amount_Term', 'Married',
    'Property_Area', 'Self_Employed', 'TotalIncome', 'Log_TotalIncome',
]
# Encoded target for the training rows.
y_train = train_modified["Loan_Status"].values
# Feature matrices as plain NumPy arrays, same column order for both splits.
x_train, x_test = (
    split_frame[list(predictors)].values
    for split_frame in (train_modified, test_modified)
)
In [14]:
from sklearn.ensemble import RandomForestClassifier
# Create Random Forest object
# A fixed random_state makes the forest, its predictions and the feature
# importances read from `model` afterwards reproducible across runs.
model = RandomForestClassifier(random_state=0)
# Train the model using the training sets
model.fit(x_train, y_train)
#Predict Output (integer-encoded class labels)
predicted = model.predict(x_test)
#Reverse encoding for predicted outcome
# `number` must still be the encoder that was fitted on Loan_Status.
predicted = number.inverse_transform(predicted)
#Store it to test dataset
# assign() builds a new frame rather than writing into what may be a slice of
# fullData, which avoids SettingWithCopyWarning.
test_modified = test_modified.assign(Loan_Status=predicted)
#Output file to make submission
# index=False keeps the file to exactly the two requested columns; otherwise
# the row index is written as an extra unnamed leading column.
test_modified.to_csv("Submission3.csv", columns=['Loan_ID','Loan_Status'], index=False)
In [15]:
#Create a series with feature importances:
# One importance value per entry of `predictors`, taken from the most recently
# fitted `model`, sorted from most to least important.
featimp = pd.Series(model.feature_importances_, index=predictors).sort_values(ascending=False)
# print(...) with parentheses is valid on both Python 2 and Python 3;
# the bare `print featimp` statement is a SyntaxError on Python 3.
print(featimp)
In [16]:
# Stand-alone demonstration of label encoding on the raw training frame.
# NOTE(review): this rebinds `number`, which the prediction cells use for
# inverse_transform on Loan_Status; re-running those cells after this one
# would decode with the Gender encoder instead.
number = LabelEncoder()
gender_as_text = train['Gender'].astype('str')
train['Gender'] = number.fit_transform(gender_as_text)
In [17]:
# Cell output: the integer-encoded Gender column of the training frame.
train['Gender']
Out[17]:
In [ ]: