In [1]:
# Import library pandas
import pandas as pd
# Load the training split into `train`
train = pd.read_csv(
    filepath_or_buffer="https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv"
)
# Load the testing split into `test`
test = pd.read_csv(
    filepath_or_buffer="https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv"
)
In [2]:
# Show the first five rows of the training dataset
print(train.head(n=5))
In [3]:
# Row count of the training dataset (first entry of its shape)
train_length = train.shape[0]
# Column count of the testing dataset (second entry of its shape)
test_col = test.shape[1]
In [4]:
# Summary statistics of the numerical columns in the training dataset
df = train.describe()
print(df)
In [5]:
# Frequency of each distinct Property_Area value
df1 = train.Property_Area.value_counts()
print(df1)
In [6]:
%matplotlib inline
# Histogram of LoanAmount (bins spelled out at the pandas default of 10)
train['LoanAmount'].hist(bins=10)
Out[6]:
In [7]:
# Box plot of LoanAmount in the training dataset, one box per Gender value
train.boxplot(by='Gender', column='LoanAmount')
Out[7]:
In [8]:
# Loan approvals in absolute numbers: count of rows with Loan_Status == 'Y'
loan_approval = train['Loan_Status'].value_counts()['Y']
# print() with a single pre-formatted argument runs on both Python 2 and Python 3;
# the bare `print "..."` statement is a SyntaxError on Python 3.
print("%d number of loans were approved." % loan_approval)
In [9]:
# Contingency table of Credit_History (rows) against Loan_Status (columns),
# with "All" margins on both axes
pd.crosstab(
    index=train["Credit_History"],
    columns=train["Loan_Status"],
    margins=True,
)
Out[9]:
In [10]:
# Function to output percentage row wise in a cross table
def percentageConvert(ser):
    """Scale a Series by its last element.

    Intended for ``DataFrame.apply(..., axis=1)`` on a crosstab built with
    ``margins=True``, where the last entry of every row is the row total.

    Parameters
    ----------
    ser : pandas.Series
        One row of the table; the final element is used as the divisor.

    Returns
    -------
    pandas.Series
        ``ser`` divided by ``float(ser.iloc[-1])``, same index as the input.
    """
    # .iloc[-1] is explicitly positional. Plain ser[-1] only works through the
    # deprecated positional fallback of Series.__getitem__ and raises KeyError
    # on an integer-labelled Series.
    return ser / float(ser.iloc[-1])
# Two-way comparison: Loan approval rate for customers having Credit_History (1)
# Every crosstab row is divided by its own "All" margin, so each value is a
# share within one Credit_History group.
df = pd.crosstab(train["Credit_History"], train["Loan_Status"], margins=True).apply(percentageConvert, axis=1)
# Single label-based lookup (row Credit_History == 1, column 'Y') instead of
# the chained df['Y'][1].
loan_approval_with_Credit_1 = df.loc[1, 'Y']
# The table is row-normalised, so this is the approval share among applicants
# with Credit_History 1 (not the share of approved applicants having it).
print("%f percent of the applicants with Credit_History equal to 1 had their loans approved." % (loan_approval_with_Credit_1 * 100))
In [11]:
# Display the 'Y' column of the row-normalised crosstab
df.loc[:, 'Y']
Out[11]:
In [12]:
# Count of missing values in each column of the training dataset
variables_missing_value = train.isna().sum()
variables_missing_value
Out[12]:
In [13]:
# Impute missing values of Loan_Amount_Term with the column median.
# The result is assigned back to the frame: fillna(..., inplace=True) on
# train[...] acts on an intermediate Series and, under pandas Copy-on-Write,
# leaves `train` unchanged.
train['Loan_Amount_Term'] = train['Loan_Amount_Term'].fillna(train['Loan_Amount_Term'].median())
In [14]:
# Impute missing values of Self_Employed with the category 'No'.
# Assigned back explicitly; a chained fillna(..., inplace=True) does not
# update `train` under pandas Copy-on-Write.
train['Self_Employed'] = train['Self_Employed'].fillna('No')
In [15]:
# Add both ApplicantIncome and CoapplicantIncome to TotalIncome
train['TotalIncome'] = train['ApplicantIncome'] + train['CoapplicantIncome']
# Look at the distribution of TotalIncome, the column created just above
# (the previous code plotted LoanAmount here, contradicting this comment)
train['TotalIncome'].hist(bins=20)
Out[15]:
In [16]:
import numpy as np
# Natural-log transform of LoanAmount, stored in a new LoanAmount_log column
# NOTE(review): no LoanAmount imputation appears above this cell; any missing
# values stay NaN in LoanAmount_log - confirm this ordering is intended.
train['LoanAmount_log'] = np.log(train['LoanAmount'])
# Look at the distribution of LoanAmount_log
train['LoanAmount_log'].hist(bins=20)
Out[16]:
In [18]:
from sklearn import preprocessing
# Each imputation below assigns the filled Series back to the frame.
# fillna(..., inplace=True) on train[...] is chained assignment and does not
# update `train` under pandas Copy-on-Write.
# Impute missing values for Gender with the most frequent value
train['Gender'] = train['Gender'].fillna(train['Gender'].mode()[0])
# Impute missing values for Married with the most frequent value
train['Married'] = train['Married'].fillna(train['Married'].mode()[0])
# Impute missing values for Dependents with the most frequent value
train['Dependents'] = train['Dependents'].fillna(train['Dependents'].mode()[0])
# Impute missing values for Credit_History with the most frequent value
train['Credit_History'] = train['Credit_History'].fillna(train['Credit_History'].mode()[0])
# Convert all non-numeric values to number
cat_col = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area']
for var in cat_col:
    # A fresh encoder per column; values are cast to str before fitting
    le = preprocessing.LabelEncoder()
    train[var] = le.fit_transform(train[var].astype('str'))
In [30]:
# Number of missing values left in Credit_History
train['Credit_History'].isna().sum()
Out[30]:
In [33]:
# True when at least one LoanAmount entry is missing
LoanAmount_have_missing_value = train['LoanAmount'].isnull().any()
In [34]:
# Display the flag computed in the previous cell
LoanAmount_have_missing_value
Out[34]:
In [11]:
# Per-column count of missing values in the training dataset
train.isna().sum()
Out[11]:
In [45]:
# Summary statistics of the training dataset (default quartiles spelled out)
train.describe(percentiles=[0.25, 0.5, 0.75])
Out[45]:
In [3]:
# Impute missing values of LoanAmount with the fixed value 168.
# Assigned back explicitly; a chained fillna(..., inplace=True) does not
# update `train` under pandas Copy-on-Write.
train['LoanAmount'] = train['LoanAmount'].fillna(168)
# Alternative kept for reference: impute with the column median instead
# train['LoanAmount'] = train['LoanAmount'].fillna(train['LoanAmount'].median())
In [52]:
# Median of LoanAmount, ignoring missing values (the pandas default)
train['LoanAmount'].median(skipna=True)
Out[52]:
In [6]:
# Impute missing values of Gender with 'Male'.
# Assigned back explicitly; a chained fillna(..., inplace=True) does not
# update `train` under pandas Copy-on-Write.
train['Gender'] = train['Gender'].fillna('Male')
In [9]:
# Frequency of each Credit_History value; missing entries are excluded
train['Credit_History'].value_counts(dropna=True)
Out[9]:
In [10]:
# Impute missing values of Credit_History with 1.
# Assigned back explicitly; a chained fillna(..., inplace=True) does not
# update `train` under pandas Copy-on-Write.
train['Credit_History'] = train['Credit_History'].fillna(1)
In [ ]: