In [ ]:
#https://github.com/aarshayj/Analytics_Vidhya/blob/master/Articles/Parameter_Tuning_GBM_with_Example/data_preparation.ipynb
In [ ]:
#https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/
In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
In [4]:
# Load the raw test set and peek at two rows to sanity-check the parse.
test = pd.read_csv('Test_bCtAN1w.csv')
test.head(2)
Out[4]:
In [5]:
# Load the raw training set.
train = pd.read_csv('Train_my.csv')
In [7]:
# Quick look at row/column counts of both frames.
train.shape, test.shape
Out[7]:
In [8]:
# NOTE(review): this maps every non-null Disbursed to 1 and every null to 0 —
# it encodes "target value present", not the target's own 0/1 value.
# Confirm that is the intent for this dataset.
train['Disbursed'] = train['Disbursed'].notnull().astype('int64')
train.dtypes
Out[8]:
In [10]:
# Frequency table for Var5 (dropna=True is the pandas default: NaNs excluded).
train['Var5'].value_counts(dropna=True)
Out[10]:
In [11]:
# Peek at the first two training rows.
train.head(2)
Out[11]:
In [12]:
# Tag each frame with its origin, then stack them so cleaning is done once.
for frame, origin in ((train, 'train'), (test, 'test')):
    frame['source'] = origin
data = pd.concat([train, test], ignore_index=True)
data.shape
Out[12]:
In [7]:
## Check missing values per column.
# Vectorized isnull().sum() replaces the column-wise apply(lambda ...) —
# identical result, one vectorized pass instead of a Python-level loop.
data.isnull().sum()
Out[7]:
In [14]:
## Look at categories of all object variables.
# Fix: the loop body was unindented in the original, which is a SyntaxError.
var = ['Gender', 'Salary_Account', 'Mobile_Verified', 'Var1', 'Filled_Form',
       'Device_Type', 'Var2', 'Source']
for v in var:
    print("\n\nFrequency count for variable:", v)
    print(data[v].value_counts(dropna=False))
Handle Individual Variables:
In [ ]:
## City variable: 724 distinct values — too high-cardinality to encode, drop it.
len(data.City.unique())
data = data.drop(columns='City')
In [10]:
## Determine Age from DOB
# Inspect raw DOB strings (the next cell assumes they end in a two-digit year).
data.DOB.head()
Out[10]:
In [11]:
# Create Age from the last two characters of DOB.
# The magic number 117 is the reference year 2017 in two-digit-year
# arithmetic (e.g. DOB ending '85' -> 117 - 85 = 32). Named here and the
# per-row lambda replaced with the vectorized .str accessor (same result).
# NOTE(review): births in 2000+ ('00'-'16') would produce ages near 117 —
# confirm the DOB range in this dataset before trusting the feature.
AGE_REF_2DIGIT = 117  # i.e. the year 2017
data['Age'] = AGE_REF_2DIGIT - data['DOB'].str[-2:].astype(int)
data.Age.head()
Out[11]:
In [12]:
# DOB is fully captured by Age now; drop the raw column.
data = data.drop('DOB', axis=1)
In [13]:
# Boxplot of EMI_Loan_Submitted to eyeball its spread and outliers.
data.boxplot(column=['EMI_Loan_Submitted'], return_type = 'axes')
In [14]:
# Majority of values are missing, so keep a 1/0 indicator of whether the
# value is missing or not, instead of the value itself.
data['EMI_Loan_Submitted_Missing'] = data['EMI_Loan_Submitted'].isnull().astype('int64')
data[['EMI_Loan_Submitted', 'EMI_Loan_Submitted_Missing']].head(10)
Out[14]:
In [15]:
# Drop the original variable — superseded by the missing-indicator.
data = data.drop('EMI_Loan_Submitted', axis=1)
In [16]:
# Count distinct employer names to gauge cardinality.
len(data['Employer_Name'].value_counts())
Out[16]:
In [17]:
# Too many unique employer names to encode; drop the column. (Manually
# grouping them into categories would be the alternative.)
data = data.drop(columns='Employer_Name')
In [15]:
# Distribution check on Existing_EMI before deciding how to impute it.
data.boxplot(column='Existing_EMI',return_type='axes')
Out[15]:
In [18]:
# Summary statistics for Existing_EMI (count = non-null rows, so it also
# shows how many values are missing).
data['Existing_EMI'].describe()
Out[18]:
In [19]:
# Median of Existing_EMI — the candidate imputation value.
data['Existing_EMI'].median()
Out[19]:
In [20]:
# Impute by the median (0) because just 111 values are missing.
# Fix: direct assignment instead of fillna(..., inplace=True) on a column
# selection — the chained-inplace form is deprecated in modern pandas and
# can silently operate on a temporary copy.
data['Existing_EMI'] = data['Existing_EMI'].fillna(0)
In [21]:
## Interest Rate:
# Majority of values are missing, so keep only a 1/0 missing-indicator.
data['Interest_Rate_Missing'] = data['Interest_Rate'].isnull().astype('int64')
print(data[['Interest_Rate','Interest_Rate_Missing']].head(10))
In [22]:
data = data.drop('Interest_Rate', axis=1)  # superseded by the indicator
In [23]:
## Lead Creation Date:
# Dropped — intuitively unlikely to affect the outcome much.
data = data.drop(columns='Lead_Creation_Date')
Loan Amount and Tenure applied:
In [24]:
# Impute both with their medians because only 111 rows are missing.
# Fix: direct assignment instead of the deprecated
# fillna(..., inplace=True)-on-a-selection pattern, which can act on a
# temporary copy in modern pandas.
data['Loan_Amount_Applied'] = data['Loan_Amount_Applied'].fillna(
    data['Loan_Amount_Applied'].median())
data['Loan_Tenure_Applied'] = data['Loan_Tenure_Applied'].fillna(
    data['Loan_Tenure_Applied'].median())
In [25]:
## Loan Amount and Tenure selected
# High proportion missing, so keep 1/0 presence indicators instead.
for raw_col in ('Loan_Amount_Submitted', 'Loan_Tenure_Submitted'):
    data[raw_col + '_Missing'] = data[raw_col].isnull().astype('int64')
In [26]:
# Remove the old variables — superseded by the indicators above.
data = data.drop(['Loan_Amount_Submitted', 'Loan_Tenure_Submitted'], axis=1)
In [27]:
## Remove logged-in
data = data.drop(columns='LoggedIn')
In [28]:
## Remove salary account
# Salary_Account lists many banks that would have to be grouped manually.
data = data.drop(columns='Salary_Account')
In [29]:
## Processing_Fee
# High proportion missing — keep only an indicator, then drop the raw column.
data['Processing_Fee_Missing'] = data['Processing_Fee'].isnull().astype('int64')
data = data.drop(columns='Processing_Fee')
In [17]:
## Source
# Collapse every source code except S122 and S133 into 'others'.
# (where + isin keeps listed codes and replaces everything else, exactly
# like the original `x not in [...]` lambda; NaN is also mapped to 'others'.)
keep_codes = ['S122', 'S133']
data['Source'] = data['Source'].where(data['Source'].isin(keep_codes), 'others')
data['Source'].value_counts(dropna=False)
Out[17]:
In [31]:
## Final Data: per-column missing counts.
# Vectorized isnull().sum() replaces the column-wise apply(lambda ...) —
# identical result, no Python-level loop.
data.isnull().sum()
Out[31]:
In [33]:
# Var1 frequencies before imputing its missing values.
data.Var1.value_counts()
Out[33]:
In [34]:
# Treat missing Var1 as its own level 'HBXX'. fillna only touches NaNs,
# matching the original null-check lambda exactly.
data['Var1'] = data['Var1'].fillna('HBXX')
In [35]:
# Var1 frequencies after imputation — 'HBXX' now absorbs the former NaNs.
data.Var1.value_counts()
Out[35]:
In [36]:
## Final Data: re-check missing counts after all imputations/drops.
# Vectorized form of the original apply(lambda ...) — identical result.
data.isnull().sum()
Out[36]:
In [37]:
# Column dtypes — shows which columns still need encoding.
data.dtypes
Out[37]:
In [38]:
# Var5 frequency table (left out of the encoding list below).
data['Var5'].value_counts()
Out[38]:
In [39]:
## Numerical coding of the categorical columns.
# Fix: the loop body was unindented in the original, which is a SyntaxError.
# Reusing one LabelEncoder is fine here because fit_transform refits it for
# every column.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
var_to_encode = ['Device_Type','Filled_Form','Gender','Mobile_Verified','Source','Var1','Var2']
for col in var_to_encode:
    data[col] = le.fit_transform(data[col])
In [40]:
# Gender is now integer-coded; check the class balance.
data['Gender'].value_counts()
Out[40]:
In [ ]:
##data['Var5'] = data['Var5'].apply(lambda x : 0 if pd.isnull(x) else x)
#data['']
In [41]:
## One-Hot Coding
# Expand each label-encoded column into indicator (dummy) columns.
data = pd.get_dummies(data, columns=var_to_encode)
data.columns
Out[41]:
In [42]:
## Separate back into train & test.
# Fix: .copy() detaches each split from `data`, so the in-place drops in the
# next cell operate on independent frames rather than slice views (avoids
# SettingWithCopyWarning and possibly-lost modifications).
train = data.loc[data['source'] == 'train'].copy()
test = data.loc[data['source'] == 'test'].copy()
In [43]:
# Drop the bookkeeping 'source' column; test also sheds 'Disbursed', which
# is NaN for its rows (test lacked the column before the concat).
# Fix: reassignment instead of inplace=True — in-place drop on a frame
# sliced from `data` raises SettingWithCopyWarning and may not stick.
train = train.drop('source', axis=1)
test = test.drop(['source', 'Disbursed'], axis=1)
In [44]:
# Persist the cleaned splits for the downstream modeling step.
train.to_csv('train_modified.csv',index=False)
test.to_csv('test_modified.csv', index=False)
In [ ]: