In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
In [2]:
#Load data:
train = pd.read_csv('Train_nyOWmfK.csv')
test = pd.read_csv('Test_bCtAN1w.csv')
In [3]:
train.shape, test.shape
Out[3]:
In [4]:
train.dtypes
Out[4]:
In [5]:
#Combine into data:
train['source']= 'train'
test['source'] = 'test'
data=pd.concat([train, test],ignore_index=True)
data.shape
Out[5]:
In [6]:
data.apply(lambda x: sum(x.isnull()))
Out[6]:
In [7]:
var = ['Gender','Salary_Account','Mobile_Verified','Var1','Filled_Form','Device_Type','Var2','Source']
for v in var:
print '\nFrequency count for variable %s'%v
print data[v].value_counts()
In [8]:
len(data['City'].unique())
#drop city because too many unique
data.drop('City',axis=1,inplace=True)
In [9]:
data['DOB'].head()
Out[9]:
In [10]:
#Create age variable:
data['Age'] = data['DOB'].apply(lambda x: 115 - int(x[-2:]))
data['Age'].head()
Out[10]:
In [11]:
#drop DOB:
data.drop('DOB',axis=1,inplace=True)
In [12]:
data.boxplot(column=['EMI_Loan_Submitted'],return_type='axes')
Out[12]:
In [13]:
#Majority values missing so I'll create a new variable stating whether this is missing or note:
data['EMI_Loan_Submitted_Missing'] = data['EMI_Loan_Submitted'].apply(lambda x: 1 if pd.isnull(x) else 0)
data[['EMI_Loan_Submitted','EMI_Loan_Submitted_Missing']].head(10)
Out[13]:
In [14]:
#drop original vaiables:
data.drop('EMI_Loan_Submitted',axis=1,inplace=True)
In [15]:
len(data['Employer_Name'].value_counts())
Out[15]:
In [16]:
#I'll drop the variable because too many unique values. Another option could be to categorize them manually
data.drop('Employer_Name',axis=1,inplace=True)
In [17]:
data.boxplot(column='Existing_EMI',return_type='axes')
Out[17]:
In [18]:
data['Existing_EMI'].describe()
Out[18]:
In [19]:
#Impute by median (0) because just 111 missing:
data['Existing_EMI'].fillna(0, inplace=True)
In [20]:
#Majority values missing so I'll create a new variable stating whether this is missing or note:
data['Interest_Rate_Missing'] = data['Interest_Rate'].apply(lambda x: 1 if pd.isnull(x) else 0)
print data[['Interest_Rate','Interest_Rate_Missing']].head(10)
In [21]:
data.drop('Interest_Rate',axis=1,inplace=True)
In [22]:
#Drop this variable because doesn't appear to affect much intuitively
data.drop('Lead_Creation_Date',axis=1,inplace=True)
In [23]:
#Impute with median because only 111 missing:
data['Loan_Amount_Applied'].fillna(data['Loan_Amount_Applied'].median(),inplace=True)
data['Loan_Tenure_Applied'].fillna(data['Loan_Tenure_Applied'].median(),inplace=True)
In [24]:
#High proportion missing so create a new var whether present or not
data['Loan_Amount_Submitted_Missing'] = data['Loan_Amount_Submitted'].apply(lambda x: 1 if pd.isnull(x) else 0)
data['Loan_Tenure_Submitted_Missing'] = data['Loan_Tenure_Submitted'].apply(lambda x: 1 if pd.isnull(x) else 0)
In [25]:
#Remove old vars
data.drop(['Loan_Amount_Submitted','Loan_Tenure_Submitted'],axis=1,inplace=True)
In [26]:
data.drop('LoggedIn',axis=1,inplace=True)
In [27]:
#Salary account has mnay banks which have to be manually grouped
data.drop('Salary_Account',axis=1,inplace=True)
In [28]:
#High proportion missing so create a new var whether present or not
data['Processing_Fee_Missing'] = data['Processing_Fee'].apply(lambda x: 1 if pd.isnull(x) else 0)
#drop old
data.drop('Processing_Fee',axis=1,inplace=True)
In [29]:
data['Source'] = data['Source'].apply(lambda x: 'others' if x not in ['S122','S133'] else x)
data['Source'].value_counts()
Out[29]:
In [30]:
data.apply(lambda x: sum(x.isnull()))
Out[30]:
In [31]:
data.dtypes
Out[31]:
In [32]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
var_to_encode = ['Device_Type','Filled_Form','Gender','Var1','Var2','Mobile_Verified','Source']
for col in var_to_encode:
data[col] = le.fit_transform(data[col])
In [33]:
data = pd.get_dummies(data, columns=var_to_encode)
data.columns
Out[33]:
In [34]:
train = data.loc[data['source']=='train']
test = data.loc[data['source']=='test']
In [35]:
train.drop('source',axis=1,inplace=True)
test.drop(['source','Disbursed'],axis=1,inplace=True)
In [36]:
train.to_csv('train_modified.csv',index=False)
test.to_csv('test_modified.csv',index=False)