preprocessing of data set


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

train = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv")
test = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv")

In [4]:
#Combining both train and test dataset

train['Type']='Train' #Create a flag for Train and Test Data set
test['Type']='Test'
fullData = pd.concat([train,test],axis=0)

#Look at the available missing values in the dataset
fullData.isnull().sum()


Out[4]:
ApplicantIncome        0
CoapplicantIncome      0
Credit_History        79
Dependents            25
Education              0
Gender                24
LoanAmount            27
Loan_Amount_Term      20
Loan_ID                0
Loan_Status          367
Married                3
Property_Area          0
Self_Employed         55
Type                   0
dtype: int64

In [5]:
#Identify categorical and continuous variables
ID_col = ['Loan_ID']
target_col = ["Loan_Status"]
cat_cols = ['Credit_History','Dependents','Gender','Married','Education','Property_Area','Self_Employed']

other_col=['Type'] #Test and Train Data set identifier
num_cols= list(set(list(fullData.columns))-set(cat_cols)-set(ID_col)-set(target_col)-set(other_col))

In [6]:
#Imputing Missing values with mean for continuous variable
fullData[num_cols] = fullData[num_cols].fillna(fullData[num_cols].mean(),inplace=True)


#Imputing Missing values with mode for categorical variables
cat_imput=pd.Series(fullData[cat_cols].mode().values[0])
cat_imput.index=cat_cols
fullData[cat_cols] = fullData[cat_cols].fillna(cat_imput,inplace=True)


C:\Users\abc\Anaconda2\lib\site-packages\pandas\core\generic.py:3178: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)

In [7]:
#Create a new column as Total Income

fullData['TotalIncome']=fullData['ApplicantIncome']+fullData['CoapplicantIncome']

#Take a log of TotalIncome + 1, adding 1 to deal with zeros of TotalIncome it it exists
fullData['Log_TotalIncome']=np.log(fullData['TotalIncome'])

In [8]:
#create label encoders for categorical features
for var in cat_cols:
    number = LabelEncoder()
    fullData[var] = number.fit_transform(fullData[var].astype('str'))

train_modified=fullData[fullData['Type']=='Train']
test_modified=fullData[fullData['Type']=='Test']
train_modified["Loan_Status"] = number.fit_transform(train_modified["Loan_Status"].astype('str'))


C:\Users\abc\Anaconda2\lib\site-packages\ipykernel\__main__.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Building Logistic Regression


In [9]:
from sklearn.linear_model import LogisticRegression


predictors=['Credit_History','Education','Gender']

x_train = train_modified[list(predictors)].values
y_train = train_modified["Loan_Status"].values

x_test=test_modified[list(predictors)].values

In [10]:
# Create logistic regression object
model = LogisticRegression()

# Train the model using the training sets
model.fit(x_train, y_train)

#Predict Output
predicted= model.predict(x_test)

#Reverse encoding for predicted outcome
predicted = number.inverse_transform(predicted)

#Store it to test dataset
test_modified['Loan_Status']=predicted

#Output file to make submission
test_modified.to_csv("Submission1.csv",columns=['Loan_ID','Loan_Status'])


C:\Users\abc\Anaconda2\lib\site-packages\ipykernel\__main__.py:14: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Building Decision Tree Classifier


In [11]:
predictors=['Credit_History','Education','Gender']

x_train = train_modified[list(predictors)].values
y_train = train_modified["Loan_Status"].values

x_test=test_modified[list(predictors)].values

In [12]:
from sklearn.tree import DecisionTreeClassifier

# Create Decision Tree object
model = DecisionTreeClassifier()

# Train the model using the training sets
model.fit(x_train, y_train)

#Predict Output
predicted= model.predict(x_test)

#Reverse encoding for predicted outcome
predicted = number.inverse_transform(predicted)

#Store it to test dataset
test_modified['Loan_Status']=predicted

#Output file to make submission
test_modified.to_csv("Submission2.csv",columns=['Loan_ID','Loan_Status'])


C:\Users\abc\Anaconda2\lib\site-packages\ipykernel\__main__.py:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Building Random Forest Classifier


In [13]:
from sklearn.linear_model import LogisticRegression


predictors=['ApplicantIncome', 'CoapplicantIncome', 'Credit_History','Dependents', 'Education', 'Gender', 'LoanAmount',
            'Loan_Amount_Term', 'Married', 'Property_Area', 'Self_Employed', 'TotalIncome','Log_TotalIncome']

x_train = train_modified[list(predictors)].values
y_train = train_modified["Loan_Status"].values

x_test=test_modified[list(predictors)].values

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Create Decision Tree object
model = RandomForestClassifier()

# Train the model using the training sets
model.fit(x_train, y_train)

#Predict Output
predicted= model.predict(x_test)

#Reverse encoding for predicted outcome
predicted = number.inverse_transform(predicted)

#Store it to test dataset
test_modified['Loan_Status']=predicted

#Output file to make submission
test_modified.to_csv("Submission3.csv",columns=['Loan_ID','Loan_Status'])


C:\Users\abc\Anaconda2\lib\site-packages\ipykernel\__main__.py:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [15]:
#Create a series with feature importances:
featimp = pd.Series(model.feature_importances_, index=predictors).sort_values(ascending=False)
print featimp


Credit_History       0.232724
TotalIncome          0.146955
LoanAmount           0.128687
ApplicantIncome      0.114424
Log_TotalIncome      0.113866
CoapplicantIncome    0.082272
Dependents           0.038125
Property_Area        0.036118
Loan_Amount_Term     0.032650
Married              0.022713
Self_Employed        0.022481
Education            0.016459
Gender               0.012527
dtype: float64

In [16]:
number = LabelEncoder()
train['Gender'] = number.fit_transform(train['Gender'].astype('str'))

In [17]:
train.Gender


Out[17]:
0      1
1      1
2      1
3      1
4      1
5      1
6      1
7      1
8      1
9      1
10     1
11     1
12     1
13     1
14     1
15     1
16     1
17     0
18     1
19     1
20     1
21     1
22     1
23     2
24     1
25     1
26     1
27     1
28     1
29     0
      ..
584    1
585    1
586    1
587    0
588    2
589    1
590    1
591    1
592    2
593    1
594    1
595    1
596    1
597    1
598    1
599    1
600    0
601    1
602    1
603    1
604    0
605    1
606    1
607    1
608    1
609    0
610    1
611    1
612    1
613    0
Name: Gender, dtype: int64

In [ ]: