In [1]:
    
# Import library pandas
import pandas as pd
# Read the training CSV from its S3 location into train
train_url = "https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv"
train = pd.read_csv(train_url)
# Read the testing CSV from its S3 location into test
test_url = "https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv"
test = pd.read_csv(test_url)
    
In [2]:
    
# Print the first five observations of the training dataset
# (head() returns five rows when called without an argument)
print(train.head())
    
    
In [3]:
    
# Row count of the training dataset (first entry of its shape)
train_length = train.shape[0]
# Column count of the testing dataset (second entry of its shape)
test_col = test.shape[1]
    
In [4]:
    
# Summary statistics of the train data set, kept in df and printed as text
df = train.describe()
print(df)
    
    
In [5]:
    
# Frequency of each distinct Property_Area value, kept in df1 and printed as text
df1 = train['Property_Area'].value_counts()
print(df1)
    
    
In [6]:
    
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline
# Plot histogram for variable LoanAmount (default binning)
train['LoanAmount'].hist()
    
    Out[6]:
    
In [7]:
    
# Plot a box plot for variable LoanAmount by variable Gender of training data set
# (rows are grouped on Gender and LoanAmount is drawn for each group)
train.boxplot(column='LoanAmount', by = 'Gender')
    
    Out[7]:
    
In [8]:
    
# Loan approval rates in absolute numbers: the count of rows whose Loan_Status is 'Y'
loan_approval = train['Loan_Status'].value_counts()['Y']
# Use the print() call form: the bare `print "..."` statement is a SyntaxError on
# Python 3, while a single parenthesised argument prints identically on Python 2.
print("%d number of loans were approved." % loan_approval)
    
    
In [9]:
    
# Two-way comparison: Credit History (rows) against Loan Status (columns),
# with row/column totals added by margins=True
pd.crosstab(
    index=train["Credit_History"],
    columns=train["Loan_Status"],
    margins=True,
)
    
    Out[9]:
In [10]:
    
#Function to output percentage row wise in a cross table
def percentageConvert(ser):
    """Divide every entry of ``ser`` by its last entry.

    Intended for ``DataFrame.apply(..., axis=1)`` on a cross table whose last
    column holds the row total, so each row becomes fractions of that total.

    Parameters
    ----------
    ser : pandas.Series or array-like
        One row of the table; its final element is used as the divisor.

    Returns
    -------
    Same kind of object as ``ser / float`` yields, with every entry divided
    by the last one.
    """
    # Select the last element by position. ``ser[-1]`` on a Series is a label
    # lookup first: it only works through a deprecated positional fallback and
    # raises KeyError when the row is integer-labelled.
    total = ser.iloc[-1] if hasattr(ser, "iloc") else ser[-1]
    return ser / float(total)
# Two-way comparison: Loan approval rate for customers having Credit_History (1)
# percentageConvert is applied to each row (axis=1), dividing the row by its last
# entry, which is the row total added by margins=True.
df = pd.crosstab(train["Credit_History"], train["Loan_Status"], margins=True).apply(percentageConvert, axis=1)
# Fraction of the Credit_History == 1 row that falls in the 'Y' column
loan_approval_with_Credit_1 = df['Y'][1]
# NOTE(review): the value is the share of Credit_History == 1 applicants whose loan was
# approved, while the message wording reads as the reverse conditional; text kept as-is.
# print() call form replaces the Python 2 print statement (SyntaxError on Python 3).
print("%f percent of the applicants whose loans were approved have Credit_History equals to 1." % (loan_approval_with_Credit_1 * 100))
    
    
In [11]:
    
# Display the 'Y' column of df (the row-normalised crosstab, assuming cells ran in file order)
df['Y']
    
    Out[11]:
In [12]:
    
# Count of missing values in each column of train
# (isna is the same check as isnull under its other name)
variables_missing_value = train.isna().sum()
variables_missing_value
    
    Out[12]:
In [13]:
    
# Impute missing value of Loan_Amount_Term with median
# Assign the filled column back instead of fillna(inplace=True) on train[...]: that form is
# chained assignment, which warns in pandas 2.x and leaves train unchanged under Copy-on-Write.
train['Loan_Amount_Term'] = train['Loan_Amount_Term'].fillna(train['Loan_Amount_Term'].median())
    
In [14]:
    
# Impute missing value of Self_Employed with 'No'
# (presumably the more frequent category — verify with value_counts())
# Assign the filled column back instead of fillna(inplace=True) on train[...]: that form is
# chained assignment, which warns in pandas 2.x and leaves train unchanged under Copy-on-Write.
train['Self_Employed'] = train['Self_Employed'].fillna('No')
    
In [15]:
    
# Add both ApplicantIncome and CoapplicantIncome to TotalIncome
train['TotalIncome'] = train['ApplicantIncome'] + train['CoapplicantIncome']
# Plot a 20-bin histogram of LoanAmount
# NOTE(review): the previous comment said this looks at the distribution of TotalIncome,
# but the column plotted is LoanAmount — confirm which one was intended.
train['LoanAmount'].hist(bins=20)
    
    Out[15]:
    
In [16]:
    
import numpy as np
# Perform log transformation of LoanAmount (presumably to make it closer to normal)
# and store it as LoanAmount_log
# NOTE(review): the previous comments named TotalIncome, but the code reads and writes
# LoanAmount columns. Any missing LoanAmount stays NaN here; the file's imputation of
# LoanAmount appears in a later cell — confirm the intended order.
train['LoanAmount_log'] = np.log(train['LoanAmount'])
# Plot a 20-bin histogram of LoanAmount_log
train['LoanAmount_log'].hist(bins=20)
    
    Out[16]:
    
In [18]:
    
from sklearn import preprocessing
# Impute missing values for Gender, Married, Dependents and Credit_History
# with each column's most frequent value (first entry of mode()).
# The filled column is assigned back instead of calling fillna(inplace=True) on
# train[...]: that form is chained assignment, which warns in pandas 2.x and leaves
# train unchanged under Copy-on-Write. Unfilled NaNs would then become the string
# 'nan' in the astype('str') step below and be encoded as a category of their own.
for var in ['Gender', 'Married', 'Dependents', 'Credit_History']:
    train[var] = train[var].fillna(train[var].mode()[0])
# Convert all non-numeric values to number
cat_col = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area']
for var in cat_col:
    # A fresh encoder per column; values are cast to str before fitting
    le = preprocessing.LabelEncoder()
    train[var] = le.fit_transform(train[var].astype('str'))
    
In [30]:
    
# Number of missing entries left in Credit_History
train['Credit_History'].isna().sum()
    
    Out[30]:
In [33]:
    
# True when at least one LoanAmount entry is missing
LoanAmount_have_missing_value = train['LoanAmount'].isnull().any()
    
In [34]:
    
# Display the flag computed in the previous cell (whether LoanAmount has missing values)
LoanAmount_have_missing_value
    
    Out[34]:
In [11]:
    
# Count of missing values in each column of train
train.isna().sum()
    
    Out[11]:
In [45]:
    
# Display summary statistics of train as the cell output
train.describe()
    
    Out[45]:
In [3]:
    
# Impute missing value of LoanAmount with 168
# Assign the filled column back instead of fillna(inplace=True) on train[...]: that form is
# chained assignment, which warns in pandas 2.x and leaves train unchanged under Copy-on-Write.
train['LoanAmount'] = train['LoanAmount'].fillna(168)
# Alternative (disabled): impute missing value of LoanAmount with median
#train['LoanAmount'] = train['LoanAmount'].fillna(train['LoanAmount'].median())
    
In [52]:
    
# Display the median of LoanAmount (pandas skips missing values by default)
train['LoanAmount'].median()
    
    Out[52]:
In [6]:
    
# Impute missing values of Gender with 'Male'.
# Assign the filled column back instead of fillna(inplace=True) on train[...]: that form is
# chained assignment, which warns in pandas 2.x and leaves train unchanged under Copy-on-Write.
train['Gender'] = train['Gender'].fillna('Male')
    
In [9]:
    
# Display the frequency of each distinct Credit_History value
# (missing values are excluded by value_counts() by default)
train['Credit_History'].value_counts()
    
    Out[9]:
In [10]:
    
# Impute missing values of Credit_History with 1.
# Assign the filled column back instead of fillna(inplace=True) on train[...]: that form is
# chained assignment, which warns in pandas 2.x and leaves train unchanged under Copy-on-Write.
train['Credit_History'] = train['Credit_History'].fillna(1)
    
In [ ]: