In [ ]:
#https://github.com/aarshayj/Analytics_Vidhya/blob/master/Articles/Parameter_Tuning_GBM_with_Example/data_preparation.ipynb
In [ ]:
#https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/
In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
In [4]:
# Load the raw test set and peek at two rows to sanity-check the parse.
test = pd.read_csv('Test_bCtAN1w.csv')
test.head(2)
Out[4]:
In [5]:
# Load the raw training set.
train = pd.read_csv('Train_my.csv')
In [7]:
# Quick look at row/column counts of both frames.
train.shape, test.shape
Out[7]:
In [8]:
# NOTE(review): this maps every non-null Disbursed to 1 and every null to 0 —
# it encodes "target value present", not the target's own 0/1 value.
# Confirm that is the intent for this dataset.
train['Disbursed'] = train['Disbursed'].notnull().astype('int64')
train.dtypes
Out[8]:
In [10]:
# Frequency table for Var5 (dropna=True is the pandas default: NaNs excluded).
train['Var5'].value_counts(dropna=True)
Out[10]:
In [11]:
# Peek at the first two training rows.
train.head(2)
Out[11]:
In [12]:
# Tag each frame with its origin, then stack them so cleaning is done once.
for frame, origin in ((train, 'train'), (test, 'test')):
    frame['source'] = origin
data = pd.concat([train, test], ignore_index=True)
data.shape
Out[12]:
In [7]:
## Check missing values per column.
# Vectorized isnull().sum() replaces the column-wise apply(lambda ...) —
# identical result, one vectorized pass instead of a Python-level loop.
data.isnull().sum()
Out[7]:
In [14]:
## Look at categories of all object variables.
# Fix: the loop body was unindented in the original, which is a SyntaxError.
var = ['Gender', 'Salary_Account', 'Mobile_Verified', 'Var1', 'Filled_Form',
       'Device_Type', 'Var2', 'Source']
for v in var:
    print("\n\nFrequency count for variable:", v)
    print(data[v].value_counts(dropna=False))
Handle Individual Variables:
In [ ]:
## City variable: 724 distinct values — too high-cardinality to encode, drop it.
len(data.City.unique())
data = data.drop(columns='City')
In [10]:
## Determine Age from DOB
# Inspect raw DOB strings (the next cell assumes they end in a two-digit year).
data.DOB.head()
Out[10]:
In [11]:
# Create Age from the last two characters of DOB.
# The magic number 117 is the reference year 2017 in two-digit-year
# arithmetic (e.g. DOB ending '85' -> 117 - 85 = 32). Named here and the
# per-row lambda replaced with the vectorized .str accessor (same result).
# NOTE(review): births in 2000+ ('00'-'16') would produce ages near 117 —
# confirm the DOB range in this dataset before trusting the feature.
AGE_REF_2DIGIT = 117  # i.e. the year 2017
data['Age'] = AGE_REF_2DIGIT - data['DOB'].str[-2:].astype(int)
data.Age.head()
Out[11]:
In [12]:
# DOB is fully captured by Age now; drop the raw column.
data = data.drop('DOB', axis=1)
In [13]:
# Boxplot of EMI_Loan_Submitted to eyeball its spread and outliers.
data.boxplot(column=['EMI_Loan_Submitted'], return_type = 'axes')
In [14]:
# Majority of values are missing, so keep a 1/0 indicator of whether the
# value is missing or not, instead of the value itself.
data['EMI_Loan_Submitted_Missing'] = data['EMI_Loan_Submitted'].isnull().astype('int64')
data[['EMI_Loan_Submitted', 'EMI_Loan_Submitted_Missing']].head(10)
Out[14]:
In [15]:
# Drop the original variable — superseded by the missing-indicator.
data = data.drop('EMI_Loan_Submitted', axis=1)
In [16]:
# Count distinct employer names to gauge cardinality.
len(data['Employer_Name'].value_counts())
Out[16]:
In [17]:
# Too many unique employer names to encode; drop the column. (Manually
# grouping them into categories would be the alternative.)
data = data.drop(columns='Employer_Name')
In [15]:
# Distribution check on Existing_EMI before deciding how to impute it.
data.boxplot(column='Existing_EMI',return_type='axes')
Out[15]:
In [18]:
# Summary statistics for Existing_EMI (count = non-null rows, so it also
# shows how many values are missing).
data['Existing_EMI'].describe()
Out[18]:
In [19]:
# Median of Existing_EMI — the candidate imputation value.
data['Existing_EMI'].median()
Out[19]:
In [20]:
# Impute by the median (0) because just 111 values are missing.
# Fix: direct assignment instead of fillna(..., inplace=True) on a column
# selection — the chained-inplace form is deprecated in modern pandas and
# can silently operate on a temporary copy.
data['Existing_EMI'] = data['Existing_EMI'].fillna(0)
In [21]:
## Interest Rate:
# Majority of values are missing, so keep only a 1/0 missing-indicator.
data['Interest_Rate_Missing'] = data['Interest_Rate'].isnull().astype('int64')
print(data[['Interest_Rate','Interest_Rate_Missing']].head(10))
In [22]:
data = data.drop('Interest_Rate', axis=1)  # superseded by the indicator
In [23]:
## Lead Creation Date:
# Dropped — intuitively unlikely to affect the outcome much.
data = data.drop(columns='Lead_Creation_Date')
Loan Amount and Tenure applied:
In [24]:
# Impute both with their medians because only 111 rows are missing.
# Fix: direct assignment instead of the deprecated
# fillna(..., inplace=True)-on-a-selection pattern, which can act on a
# temporary copy in modern pandas.
data['Loan_Amount_Applied'] = data['Loan_Amount_Applied'].fillna(
    data['Loan_Amount_Applied'].median())
data['Loan_Tenure_Applied'] = data['Loan_Tenure_Applied'].fillna(
    data['Loan_Tenure_Applied'].median())
In [25]:
## Loan Amount and Tenure selected
# High proportion missing, so keep 1/0 presence indicators instead.
for raw_col in ('Loan_Amount_Submitted', 'Loan_Tenure_Submitted'):
    data[raw_col + '_Missing'] = data[raw_col].isnull().astype('int64')
In [26]:
# Remove the old variables — superseded by the indicators above.
data = data.drop(['Loan_Amount_Submitted', 'Loan_Tenure_Submitted'], axis=1)
In [27]:
## Remove logged-in
data = data.drop(columns='LoggedIn')
In [28]:
## Remove salary account
# Salary_Account lists many banks that would have to be grouped manually.
data = data.drop(columns='Salary_Account')
In [29]:
## Processing_Fee
# High proportion missing — keep only an indicator, then drop the raw column.
data['Processing_Fee_Missing'] = data['Processing_Fee'].isnull().astype('int64')
data = data.drop(columns='Processing_Fee')
In [17]:
## Source
# Collapse every source code except S122 and S133 into 'others'.
# (where + isin keeps listed codes and replaces everything else, exactly
# like the original `x not in [...]` lambda; NaN is also mapped to 'others'.)
keep_codes = ['S122', 'S133']
data['Source'] = data['Source'].where(data['Source'].isin(keep_codes), 'others')
data['Source'].value_counts(dropna=False)
Out[17]:
In [31]:
## Final Data: per-column missing counts.
# Vectorized isnull().sum() replaces the column-wise apply(lambda ...) —
# identical result, no Python-level loop.
data.isnull().sum()
Out[31]:
In [33]:
# Var1 frequencies before imputing its missing values.
data.Var1.value_counts()
Out[33]:
In [34]:
# Treat missing Var1 as its own level 'HBXX'. fillna only touches NaNs,
# matching the original null-check lambda exactly.
data['Var1'] = data['Var1'].fillna('HBXX')
In [35]:
# Var1 frequencies after imputation — 'HBXX' now absorbs the former NaNs.
data.Var1.value_counts()
Out[35]:
In [36]:
## Final Data: re-check missing counts after all imputations/drops.
# Vectorized form of the original apply(lambda ...) — identical result.
data.isnull().sum()
Out[36]:
In [37]:
# Column dtypes — shows which columns still need encoding.
data.dtypes
Out[37]:
In [38]:
# Var5 frequency table (left out of the encoding list below).
data['Var5'].value_counts()
Out[38]:
In [39]:
## Numerical coding of the categorical columns.
# Fix: the loop body was unindented in the original, which is a SyntaxError.
# Reusing one LabelEncoder is fine here because fit_transform refits it for
# every column.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
var_to_encode = ['Device_Type','Filled_Form','Gender','Mobile_Verified','Source','Var1','Var2']
for col in var_to_encode:
    data[col] = le.fit_transform(data[col])
In [40]:
# Gender is now integer-coded; check the class balance.
data['Gender'].value_counts()
Out[40]:
In [ ]:
##data['Var5'] = data['Var5'].apply(lambda x : 0 if pd.isnull(x) else x)
#data['']
In [41]:
## One-Hot Coding
# Expand each label-encoded column into indicator (dummy) columns.
data = pd.get_dummies(data, columns=var_to_encode)
data.columns
Out[41]:
In [42]:
## Separate back into train & test.
# Fix: .copy() detaches each split from `data`, so the in-place drops in the
# next cell operate on independent frames rather than slice views (avoids
# SettingWithCopyWarning and possibly-lost modifications).
train = data.loc[data['source'] == 'train'].copy()
test = data.loc[data['source'] == 'test'].copy()
In [43]:
# Drop the bookkeeping 'source' column; test also sheds 'Disbursed', which
# is NaN for its rows (test lacked the column before the concat).
# Fix: reassignment instead of inplace=True — in-place drop on a frame
# sliced from `data` raises SettingWithCopyWarning and may not stick.
train = train.drop('source', axis=1)
test = test.drop(['source', 'Disbursed'], axis=1)
In [44]:
# Persist the cleaned splits for the downstream modeling step.
train.to_csv('train_modified.csv',index=False)
test.to_csv('test_modified.csv', index=False)
In [ ]: