In [78]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline  
import os
from sklearn import ensemble
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score as auc
import time
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif,chi2
from sklearn.preprocessing import Binarizer, scale

In [7]:
ROOT_PATH = os.path.join("/media", "shabou", "7D04-9B6D", "WS", "data", "datasets", "santander")

In [8]:
train_df = pd.read_csv(os.path.join(ROOT_PATH,'train.csv'))

In [9]:
train_df.head(2)


Out[9]:
ID var3 var15 imp_ent_var16_ult1 imp_op_var39_comer_ult1 imp_op_var39_comer_ult3 imp_op_var40_comer_ult1 imp_op_var40_comer_ult3 imp_op_var40_efect_ult1 imp_op_var40_efect_ult3 ... saldo_medio_var33_hace2 saldo_medio_var33_hace3 saldo_medio_var33_ult1 saldo_medio_var33_ult3 saldo_medio_var44_hace2 saldo_medio_var44_hace3 saldo_medio_var44_ult1 saldo_medio_var44_ult3 var38 TARGET
0 1 2 23 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 39205.17 0
1 3 2 34 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 49278.03 0

2 rows × 371 columns


In [10]:
train_df.shape


Out[10]:
(76020, 371)

In [69]:
train_df_describe = train_df.describe()
pd.set_option('display.max_rows', train_df_describe.shape[1])
pd.options.display.float_format = '{:10.5f}'.format
print train_df_describe.loc[['min', 'max', 'mean']].T
#problem in min var3


                                        min              max           mean
ID                                  1.00000     151838.00000    75964.05072
var3                          -999999.00000        238.00000    -1523.19928
var15                               5.00000        105.00000       33.21287
imp_ent_var16_ult1                  0.00000     210000.00000       86.20827
imp_op_var39_comer_ult1             0.00000      12888.03000       72.36307
imp_op_var39_comer_ult3             0.00000      21024.81000      119.52963
imp_op_var40_comer_ult1             0.00000       8237.82000        3.55913
imp_op_var40_comer_ult3             0.00000      11073.57000        6.47270
imp_op_var40_efect_ult1             0.00000       6600.00000        0.41295
imp_op_var40_efect_ult3             0.00000       6600.00000        0.56735
imp_op_var40_ult1                   0.00000       8237.82000        3.16072
imp_op_var41_comer_ult1             0.00000      12888.03000       68.80394
imp_op_var41_comer_ult3             0.00000      16566.81000      113.05693
imp_op_var41_efect_ult1             0.00000      45990.00000       68.20514
imp_op_var41_efect_ult3             0.00000     131100.00000      113.22506
imp_op_var41_ult1                   0.00000      47598.09000      137.24276
imp_op_var39_efect_ult1             0.00000      45990.00000       68.61809
imp_op_var39_efect_ult3             0.00000     131100.00000      113.79241
imp_op_var39_ult1                   0.00000      47598.09000      140.40348
imp_sal_var16_ult1                  0.00000     105000.00000        5.47768
ind_var1_0                          0.00000          1.00000        0.01146
ind_var1                            0.00000          1.00000        0.00376
ind_var2_0                          0.00000          0.00000        0.00000
ind_var2                            0.00000          0.00000        0.00000
ind_var5_0                          0.00000          1.00000        0.95802
ind_var5                            0.00000          1.00000        0.66376
ind_var6_0                          0.00000          1.00000        0.00011
ind_var6                            0.00000          1.00000        0.00003
ind_var8_0                          0.00000          1.00000        0.03283
ind_var8                            0.00000          1.00000        0.02860
ind_var12_0                         0.00000          1.00000        0.06752
ind_var12                           0.00000          1.00000        0.04546
ind_var13_0                         0.00000          1.00000        0.05225
ind_var13_corto_0                   0.00000          1.00000        0.04294
ind_var13_corto                     0.00000          1.00000        0.04148
ind_var13_largo_0                   0.00000          1.00000        0.01017
ind_var13_largo                     0.00000          1.00000        0.01000
ind_var13_medio_0                   0.00000          1.00000        0.00003
ind_var13_medio                     0.00000          1.00000        0.00003
ind_var13                           0.00000          1.00000        0.05086
ind_var14_0                         0.00000          1.00000        0.02365
ind_var14                           0.00000          1.00000        0.00530
ind_var17_0                         0.00000          1.00000        0.00180
ind_var17                           0.00000          1.00000        0.00145
ind_var18_0                         0.00000          1.00000        0.00003
ind_var18                           0.00000          1.00000        0.00003
ind_var19                           0.00000          1.00000        0.00420
ind_var20_0                         0.00000          1.00000        0.00363
ind_var20                           0.00000          1.00000        0.00270
ind_var24_0                         0.00000          1.00000        0.04237
ind_var24                           0.00000          1.00000        0.03788
ind_var25_cte                       0.00000          1.00000        0.02643
ind_var26_0                         0.00000          1.00000        0.02464
ind_var26_cte                       0.00000          1.00000        0.02756
ind_var26                           0.00000          1.00000        0.02464
ind_var25_0                         0.00000          1.00000        0.02364
ind_var25                           0.00000          1.00000        0.02364
ind_var27_0                         0.00000          0.00000        0.00000
ind_var28_0                         0.00000          0.00000        0.00000
ind_var28                           0.00000          0.00000        0.00000
ind_var27                           0.00000          0.00000        0.00000
ind_var29_0                         0.00000          1.00000        0.00011
ind_var29                           0.00000          1.00000        0.00003
ind_var30_0                         0.00000          1.00000        0.99549
ind_var30                           0.00000          1.00000        0.73283
ind_var31_0                         0.00000          1.00000        0.00428
ind_var31                           0.00000          1.00000        0.00367
ind_var32_cte                       0.00000          1.00000        0.00121
ind_var32_0                         0.00000          1.00000        0.00108
ind_var32                           0.00000          1.00000        0.00108
ind_var33_0                         0.00000          1.00000        0.00075
ind_var33                           0.00000          1.00000        0.00063
ind_var34_0                         0.00000          1.00000        0.00003
ind_var34                           0.00000          1.00000        0.00003
ind_var37_cte                       0.00000          1.00000        0.07230
ind_var37_0                         0.00000          1.00000        0.06526
ind_var37                           0.00000          1.00000        0.06526
ind_var39_0                         0.00000          1.00000        0.88076
ind_var40_0                         0.00000          1.00000        0.01142
ind_var40                           0.00000          1.00000        0.00372
ind_var41_0                         0.00000          1.00000        0.87928
ind_var41                           0.00000          0.00000        0.00000
ind_var39                           0.00000          1.00000        0.00372
ind_var44_0                         0.00000          1.00000        0.00188
ind_var44                           0.00000          1.00000        0.00170
ind_var46_0                         0.00000          0.00000        0.00000
ind_var46                           0.00000          0.00000        0.00000
num_var1_0                          0.00000          6.00000        0.03445
num_var1                            0.00000          6.00000        0.01133
num_var4                            0.00000          7.00000        1.07944
num_var5_0                          0.00000         15.00000        2.89404
num_var5                            0.00000         15.00000        1.99917
num_var6_0                          0.00000          3.00000        0.00032
num_var6                            0.00000          3.00000        0.00008
num_var8_0                          0.00000          6.00000        0.09854
num_var8                            0.00000          3.00000        0.08579
num_var12_0                         0.00000        111.00000        0.21125
num_var12                           0.00000         15.00000        0.13800
num_var13_0                         0.00000         18.00000        0.16772
num_var13_corto_0                   0.00000          6.00000        0.13031
num_var13_corto                     0.00000          6.00000        0.12451
num_var13_largo_0                   0.00000         18.00000        0.03733
num_var13_largo                     0.00000         18.00000        0.03524
num_var13_medio_0                   0.00000          3.00000        0.00008
num_var13_medio                     0.00000          3.00000        0.00008
num_var13                           0.00000         18.00000        0.15983
num_var14_0                         0.00000        111.00000        0.07269
num_var14                           0.00000         12.00000        0.01614
num_var17_0                         0.00000         36.00000        0.01188
num_var17                           0.00000         27.00000        0.00888
num_var18_0                         0.00000          3.00000        0.00008
num_var18                           0.00000          3.00000        0.00008
num_var20_0                         0.00000          3.00000        0.01089
num_var20                           0.00000          3.00000        0.00809
num_var24_0                         0.00000          9.00000        0.12766
num_var24                           0.00000          6.00000        0.11377
num_var26_0                         0.00000         33.00000        0.08938
num_var26                           0.00000         33.00000        0.08938
num_var25_0                         0.00000         33.00000        0.08516
num_var25                           0.00000         33.00000        0.08516
num_op_var40_hace2                  0.00000        117.00000        0.02024
num_op_var40_hace3                  0.00000         48.00000        0.00103
num_op_var40_ult1                   0.00000        234.00000        0.05702
num_op_var40_ult3                   0.00000        351.00000        0.07830
num_op_var41_hace2                  0.00000        249.00000        1.60114
num_op_var41_hace3                  0.00000         81.00000        0.09392
num_op_var41_ult1                   0.00000        468.00000        2.85884
num_op_var41_ult3                   0.00000        468.00000        4.55391
num_op_var39_hace2                  0.00000        249.00000        1.62139
num_op_var39_hace3                  0.00000         81.00000        0.09495
num_op_var39_ult1                   0.00000        468.00000        2.91586
num_op_var39_ult3                   0.00000        468.00000        4.63220
num_var27_0                         0.00000          0.00000        0.00000
num_var28_0                         0.00000          0.00000        0.00000
num_var28                           0.00000          0.00000        0.00000
num_var27                           0.00000          0.00000        0.00000
num_var29_0                         0.00000          3.00000        0.00032
num_var29                           0.00000          3.00000        0.00008
num_var30_0                         0.00000        114.00000        3.37186
num_var30                           0.00000         33.00000        2.38287
num_var31_0                         0.00000         36.00000        0.02013
num_var31                           0.00000         27.00000        0.01606
num_var32_0                         0.00000         12.00000        0.00422
num_var32                           0.00000         12.00000        0.00422
num_var33_0                         0.00000         12.00000        0.00257
num_var33                           0.00000          6.00000        0.00209
num_var34_0                         0.00000          3.00000        0.00008
num_var34                           0.00000          3.00000        0.00008
num_var35                           0.00000         36.00000        3.29937
num_var37_med_ult2                  0.00000        105.00000        0.26354
num_var37_0                         0.00000        114.00000        0.41878
num_var37                           0.00000        114.00000        0.41878
num_var39_0                         0.00000         33.00000        2.72494
num_var40_0                         0.00000          6.00000        0.03429
num_var40                           0.00000          3.00000        0.01117
num_var41_0                         0.00000         33.00000        2.69925
num_var41                           0.00000          0.00000        0.00000
num_var39                           0.00000          3.00000        0.01117
num_var42_0                         0.00000        114.00000        3.20414
num_var42                           0.00000         18.00000        2.21800
num_var44_0                         0.00000          6.00000        0.00568
num_var44                           0.00000          3.00000        0.00509
num_var46_0                         0.00000          0.00000        0.00000
num_var46                           0.00000          0.00000        0.00000
saldo_var1                         -0.90000    3000000.00000       48.44911
saldo_var5                      -2895.72000     619329.15000     1028.46824
saldo_var6                          0.00000      19531.80000        0.41448
saldo_var8                      -4942.26000     240045.00000      141.22678
saldo_var12                         0.00000    3008077.32000     6021.61590
saldo_var13_corto                   0.00000     450000.00000     4993.75297
saldo_var13_largo                   0.00000    1500000.00000     1493.68227
saldo_var13_medio                   0.00000      30000.00000        0.51302
saldo_var13                         0.00000    1500000.00000     6487.94826
saldo_var14                         0.00000     450000.00000       69.09620
saldo_var17                         0.00000    6119500.14000      183.40590
saldo_var18                         0.00000    3000000.00000       43.40963
saldo_var20                         0.00000     455858.16000       27.39946
saldo_var24                         0.00000    3008077.32000     5925.12024
saldo_var26                         0.00000      69756.72000       76.08163
saldo_var25                         0.00000      69756.72000       72.73569
saldo_var28                         0.00000          0.00000        0.00000
saldo_var27                         0.00000          0.00000        0.00000
saldo_var29                         0.00000      19531.80000        0.41448
saldo_var30                     -4942.26000    3458077.32000    13679.67366
saldo_var31                         0.00000    6119500.14000      292.29097
saldo_var32                         0.00000      12210.78000        3.34594
saldo_var33                         0.00000     142078.80000       12.53233
saldo_var34                         0.00000      36000.00000        0.67088
saldo_var37                         0.00000      60000.00000       36.90719
saldo_var40                        -0.90000       8192.61000        4.36860
saldo_var41                         0.00000          0.00000        0.00000
saldo_var42                     -4942.26000    3008077.32000     7191.72540
saldo_var44                         0.00000     740006.61000       96.35274
saldo_var46                         0.00000          0.00000        0.00000
var36                               0.00000         99.00000       40.44908
delta_imp_amort_var18_1y3           0.00000 9999999999.00000   263088.66085
delta_imp_amort_var34_1y3           0.00000 9999999999.00000   263088.66085
delta_imp_aport_var13_1y3          -1.00000 9999999999.00000 48671402.23587
delta_imp_aport_var17_1y3          -1.00000 9999999999.00000  5130228.88643
delta_imp_aport_var33_1y3          -1.00000 9999999999.00000   131544.33014
delta_imp_compra_var44_1y3         -1.00000 9999999999.00000  9208103.12972
delta_imp_reemb_var13_1y3           0.00000 9999999999.00000  4998684.55620
delta_imp_reemb_var17_1y3          -1.00000 9999999999.00000  2630886.60851
delta_imp_reemb_var33_1y3           0.00000 9999999999.00000   131544.33043
delta_imp_trasp_var17_in_1y3       -1.00000 9999999999.00000   526177.32168
delta_imp_trasp_var17_out_1y3       0.00000 9999999999.00000   526177.32170
delta_imp_trasp_var33_in_1y3       -1.00000 9999999999.00000   657721.65205
delta_imp_trasp_var33_out_1y3       0.00000 9999999999.00000   131544.33043
delta_imp_venta_var44_1y3          -1.00000 9999999999.00000  5524861.87795
delta_num_aport_var13_1y3          -1.00000 9999999999.00000 48671402.23569
delta_num_aport_var17_1y3          -1.00000 9999999999.00000  5130228.88643
delta_num_aport_var33_1y3          -1.00000 9999999999.00000   131544.33020
delta_num_compra_var44_1y3         -1.00000 9999999999.00000  9208103.12977
delta_num_reemb_var13_1y3           0.00000 9999999999.00000  4998684.55620
delta_num_reemb_var17_1y3          -1.00000 9999999999.00000  2630886.60851
delta_num_reemb_var33_1y3           0.00000 9999999999.00000   131544.33043
delta_num_trasp_var17_in_1y3       -1.00000 9999999999.00000   526177.32168
delta_num_trasp_var17_out_1y3       0.00000 9999999999.00000   526177.32170
delta_num_trasp_var33_in_1y3       -1.00000 9999999999.00000   657721.65205
delta_num_trasp_var33_out_1y3       0.00000 9999999999.00000   131544.33043
delta_num_venta_var44_1y3          -1.00000 9999999999.00000  5524861.87799
imp_amort_var18_hace3               0.00000          0.00000        0.00000
imp_amort_var18_ult1                0.00000      15691.80000        0.23119
imp_amort_var34_hace3               0.00000          0.00000        0.00000
imp_amort_var34_ult1                0.00000       1096.02000        0.01805
imp_aport_var13_hace3               0.00000     840000.00000     2823.94909
imp_aport_var13_ult1                0.00000     450000.00000      619.58501
imp_aport_var17_hace3               0.00000    6083691.87000       98.78877
imp_aport_var17_ult1                0.00000     432457.32000       31.10532
imp_aport_var33_hace3               0.00000      36000.00000        2.98579
imp_aport_var33_ult1                0.00000       1260.00000        0.04815
imp_var7_emit_ult1                  0.00000     145384.92000        2.72145
imp_var7_recib_ult1                 0.00000    1039260.00000      127.69821
imp_compra_var44_hace3              0.00000     210001.35000       13.96458
imp_compra_var44_ult1               0.00000    3410058.66000      116.78253
imp_reemb_var13_hace3               0.00000          0.00000        0.00000
imp_reemb_var13_ult1                0.00000     450000.00000       46.18024
imp_reemb_var17_hace3               0.00000      12027.15000        0.15821
imp_reemb_var17_ult1                0.00000     182132.97000       12.56940
imp_reemb_var33_hace3               0.00000          0.00000        0.00000
imp_reemb_var33_ult1                0.00000       1200.00000        0.01579
imp_var43_emit_ult1                 0.00000    1155003.00000      854.12075
imp_trans_var37_ult1                0.00000    2310003.00000     1932.95443
imp_trasp_var17_in_hace3            0.00000      96781.44000        1.87481
imp_trasp_var17_in_ult1             0.00000     133730.58000        2.51268
imp_trasp_var17_out_hace3           0.00000          0.00000        0.00000
imp_trasp_var17_out_ult1            0.00000      69622.29000        1.91362
imp_trasp_var33_in_hace3            0.00000      49581.27000        2.78977
imp_trasp_var33_in_ult1             0.00000      13207.32000        0.31470
imp_trasp_var33_out_hace3           0.00000          0.00000        0.00000
imp_trasp_var33_out_ult1            0.00000       3000.00000        0.03946
imp_venta_var44_hace3               0.00000     209834.40000        3.78714
imp_venta_var44_ult1                0.00000    2754476.46000       81.43383
ind_var7_emit_ult1                  0.00000          1.00000        0.00004
ind_var7_recib_ult1                 0.00000          1.00000        0.00270
ind_var10_ult1                      0.00000          1.00000        0.08087
ind_var10cte_ult1                   0.00000          1.00000        0.09216
ind_var9_cte_ult1                   0.00000          1.00000        0.09687
ind_var9_ult1                       0.00000          1.00000        0.08591
ind_var43_emit_ult1                 0.00000          1.00000        0.06659
ind_var43_recib_ult1                0.00000          1.00000        0.12931
var21                               0.00000      30000.00000       32.54933
num_var2_0_ult1                     0.00000          0.00000        0.00000
num_var2_ult1                       0.00000          0.00000        0.00000
num_aport_var13_hace3               0.00000         24.00000        0.07589
num_aport_var13_ult1                0.00000         30.00000        0.01796
num_aport_var17_hace3               0.00000         12.00000        0.00154
num_aport_var17_ult1                0.00000         21.00000        0.00339
num_aport_var33_hace3               0.00000         12.00000        0.00107
num_aport_var33_ult1                0.00000          6.00000        0.00032
num_var7_emit_ult1                  0.00000          3.00000        0.00012
num_var7_recib_ult1                 0.00000         24.00000        0.01030
num_compra_var44_hace3              0.00000          9.00000        0.00185
num_compra_var44_ult1               0.00000         39.00000        0.00754
num_ent_var16_ult1                  0.00000         60.00000        0.18796
num_var22_hace2                     0.00000        123.00000        1.29870
num_var22_hace3                     0.00000        108.00000        1.18489
num_var22_ult1                      0.00000         96.00000        0.56066
num_var22_ult3                      0.00000        234.00000        3.04424
num_med_var22_ult3                  0.00000         78.00000        0.63587
num_med_var45_ult3                  0.00000        267.00000        4.02466
num_meses_var5_ult3                 0.00000          3.00000        1.97998
num_meses_var8_ult3                 0.00000          3.00000        0.05360
num_meses_var12_ult3                0.00000          3.00000        0.10205
num_meses_var13_corto_ult3          0.00000          3.00000        0.09892
num_meses_var13_largo_ult3          0.00000          3.00000        0.01740
num_meses_var13_medio_ult3          0.00000          2.00000        0.00005
num_meses_var17_ult3                0.00000          3.00000        0.00296
num_meses_var29_ult3                0.00000          2.00000        0.00011
num_meses_var33_ult3                0.00000          3.00000        0.00151
num_meses_var39_vig_ult3            0.00000          3.00000        1.59279
num_meses_var44_ult3                0.00000          3.00000        0.00358
num_op_var39_comer_ult1             0.00000        438.00000        2.19479
num_op_var39_comer_ult3             0.00000        600.00000        3.60706
num_op_var40_comer_ult1             0.00000        210.00000        0.07498
num_op_var40_comer_ult3             0.00000        582.00000        0.14483
num_op_var40_efect_ult1             0.00000         24.00000        0.00249
num_op_var40_efect_ult3             0.00000         24.00000        0.00367
num_op_var41_comer_ult1             0.00000        438.00000        2.11981
num_op_var41_comer_ult3             0.00000        438.00000        3.46223
num_op_var41_efect_ult1             0.00000         90.00000        0.71942
num_op_var41_efect_ult3             0.00000        156.00000        1.21215
num_op_var39_efect_ult1             0.00000         90.00000        0.72190
num_op_var39_efect_ult3             0.00000        156.00000        1.21582
num_reemb_var13_hace3               0.00000          0.00000        0.00000
num_reemb_var13_ult1                0.00000          3.00000        0.00150
num_reemb_var17_hace3               0.00000          3.00000        0.00004
num_reemb_var17_ult1                0.00000         21.00000        0.00118
num_reemb_var33_hace3               0.00000          0.00000        0.00000
num_reemb_var33_ult1                0.00000          3.00000        0.00004
num_sal_var16_ult1                  0.00000         15.00000        0.00493
num_var43_emit_ult1                 0.00000        180.00000        0.39282
num_var43_recib_ult1                0.00000        264.00000        0.81500
num_trasp_var11_ult1                0.00000         93.00000        0.12068
num_trasp_var17_in_hace3            0.00000          6.00000        0.00012
num_trasp_var17_in_ult1             0.00000          3.00000        0.00016
num_trasp_var17_out_hace3           0.00000          0.00000        0.00000
num_trasp_var17_out_ult1            0.00000          3.00000        0.00016
num_trasp_var33_in_hace3            0.00000          3.00000        0.00024
num_trasp_var33_in_ult1             0.00000          6.00000        0.00024
num_trasp_var33_out_hace3           0.00000          0.00000        0.00000
num_trasp_var33_out_ult1            0.00000          3.00000        0.00004
num_venta_var44_hace3               0.00000          6.00000        0.00016
num_venta_var44_ult1                0.00000         39.00000        0.00442
num_var45_hace2                     0.00000        342.00000        5.39321
num_var45_hace3                     0.00000        339.00000        3.89440
num_var45_ult1                      0.00000        510.00000        4.36350
num_var45_ult3                      0.00000        801.00000       13.65110
saldo_var2_ult1                     0.00000          0.00000        0.00000
saldo_medio_var5_hace2           -128.37000     812137.26000     1579.13531
saldo_medio_var5_hace3             -8.04000    1542339.36000      891.36586
saldo_medio_var5_ult1            -922.38000     601428.60000     1077.25676
saldo_medio_var5_ult3            -476.07000     544365.57000     1048.85645
saldo_medio_var8_hace2           -287.67000     231351.99000       68.27545
saldo_medio_var8_hace3              0.00000      77586.21000        9.50529
saldo_medio_var8_ult1           -3401.34000     228031.80000      124.62096
saldo_medio_var8_ult3           -1844.52000     177582.00000      110.02658
saldo_medio_var12_hace2             0.00000    3000538.14000     3997.02333
saldo_medio_var12_hace3             0.00000     668335.32000      613.53444
saldo_medio_var12_ult1              0.00000    3004185.60000     5703.00817
saldo_medio_var12_ult3              0.00000    2272859.43000     4401.00244
saldo_medio_var13_corto_hace2       0.00000     450000.00000     3639.41994
saldo_medio_var13_corto_hace3       0.00000     304838.70000      556.18418
saldo_medio_var13_corto_ult1        0.00000     450000.00000     4852.26181
saldo_medio_var13_corto_ult3        0.00000     450000.00000     3857.84854
saldo_medio_var13_largo_hace2       0.00000     840000.00000      771.22745
saldo_medio_var13_largo_hace3       0.00000     534000.00000      162.17044
saldo_medio_var13_largo_ult1        0.00000    1500000.00000      956.95021
saldo_medio_var13_largo_ult3        0.00000    1034482.74000      750.95627
saldo_medio_var13_medio_hace2       0.00000       7741.95000        0.17532
saldo_medio_var13_medio_hace3       0.00000          0.00000        0.00000
saldo_medio_var13_medio_ult1        0.00000      30000.00000        0.51302
saldo_medio_var13_medio_ult3        0.00000      18870.99000        0.34417
saldo_medio_var17_hace2            -0.03000    4210084.23000       91.17181
saldo_medio_var17_hace3             0.00000    2368558.95000       36.46318
saldo_medio_var17_ult1              0.00000    3998687.46000      131.03157
saldo_medio_var17_ult3              0.00000    3525776.88000      109.21694
saldo_medio_var29_hace2             0.00000      10430.01000        0.21307
saldo_medio_var29_hace3             0.00000        145.20000        0.00191
saldo_medio_var29_ult1              0.00000      13793.67000        0.25391
saldo_medio_var29_ult3              0.00000       7331.34000        0.18663
saldo_medio_var33_hace2             0.00000      50003.88000        7.93582
saldo_medio_var33_hace3             0.00000      20385.72000        1.36515
saldo_medio_var33_ult1              0.00000     138831.63000       12.21558
saldo_medio_var33_ult3              0.00000      91778.73000        8.78407
saldo_medio_var44_hace2             0.00000     438329.22000       31.50532
saldo_medio_var44_hace3             0.00000      24650.01000        1.85857
saldo_medio_var44_ult1              0.00000     681462.90000       76.02617
saldo_medio_var44_ult3              0.00000     397884.30000       56.61435
var38                            5163.75000   22034738.76000   117235.80943
TARGET                              0.00000          1.00000        0.03957

In [70]:
#remove outliers rows
train_df_clean1 = train_df.replace(9999999999, np.nan).replace(-999999, np.nan).dropna()
print train_df.shape[0], train_df_clean1.shape[0], train_df.shape[0]-train_df_clean1.shape[0]


76020 75369 651

In [72]:
#non numerical columns
train_df_clean1.select_dtypes(include=['object']).shape[1]


Out[72]:
0

In [73]:
#remove constant columns
cols_to_remove = []
for col in train_df_clean1.columns:
    if train_df_clean1[col].std() == 0:
        cols_to_remove.append(col)
train_df_clean2 = train_df_clean1.drop(cols_to_remove, axis=1)
cols_to_remove, len(cols_to_remove),  train_df_clean2.shape[1], train_df_clean1.shape[1]


Out[73]:
(['ind_var2_0',
  'ind_var2',
  'ind_var18_0',
  'ind_var18',
  'ind_var27_0',
  'ind_var28_0',
  'ind_var28',
  'ind_var27',
  'ind_var34_0',
  'ind_var34',
  'ind_var41',
  'ind_var46_0',
  'ind_var46',
  'num_var18_0',
  'num_var18',
  'num_var27_0',
  'num_var28_0',
  'num_var28',
  'num_var27',
  'num_var34_0',
  'num_var34',
  'num_var41',
  'num_var46_0',
  'num_var46',
  'saldo_var18',
  'saldo_var28',
  'saldo_var27',
  'saldo_var34',
  'saldo_var41',
  'saldo_var46',
  'delta_imp_amort_var18_1y3',
  'delta_imp_amort_var34_1y3',
  'delta_imp_reemb_var13_1y3',
  'delta_imp_reemb_var33_1y3',
  'delta_imp_trasp_var17_out_1y3',
  'delta_imp_trasp_var33_out_1y3',
  'delta_num_reemb_var13_1y3',
  'delta_num_reemb_var33_1y3',
  'delta_num_trasp_var17_out_1y3',
  'delta_num_trasp_var33_out_1y3',
  'imp_amort_var18_hace3',
  'imp_amort_var18_ult1',
  'imp_amort_var34_hace3',
  'imp_amort_var34_ult1',
  'imp_reemb_var13_hace3',
  'imp_reemb_var13_ult1',
  'imp_reemb_var17_ult1',
  'imp_reemb_var33_hace3',
  'imp_reemb_var33_ult1',
  'imp_trasp_var17_in_ult1',
  'imp_trasp_var17_out_hace3',
  'imp_trasp_var17_out_ult1',
  'imp_trasp_var33_in_ult1',
  'imp_trasp_var33_out_hace3',
  'imp_trasp_var33_out_ult1',
  'num_var2_0_ult1',
  'num_var2_ult1',
  'num_reemb_var13_hace3',
  'num_reemb_var13_ult1',
  'num_reemb_var17_ult1',
  'num_reemb_var33_hace3',
  'num_reemb_var33_ult1',
  'num_trasp_var17_in_ult1',
  'num_trasp_var17_out_hace3',
  'num_trasp_var17_out_ult1',
  'num_trasp_var33_in_ult1',
  'num_trasp_var33_out_hace3',
  'num_trasp_var33_out_ult1',
  'saldo_var2_ult1',
  'saldo_medio_var13_medio_hace3'],
 70,
 301,
 371)

In [74]:
#remove duplicate columns
#train_df_clean2 = train_df_clean1.T.drop_duplicates().T #does not work, a lot of rows --> add sub-set recursive
cols_to_remove = []
columns = train_df_clean2.columns
for i in xrange(len(columns)-1):
    v = train_df_clean2[columns[i]].values
    for j in xrange(i+1,len(columns)):
        if np.array_equal(v,train_df_clean2[columns[j]].values):
            cols_to_remove.append(columns[j])
train_df_clean3 = train_df_clean2.drop(cols_to_remove, axis=1)
len(cols_to_remove), train_df_clean3.shape[1], train_df_clean2.shape[1]


Out[74]:
(28, 275, 301)

In [75]:
for col in train_df_clean3.columns:
    dict = train_df_clean3[col].value_counts()
    if dict.shape[0]<10:
        print dict
#TODO: check if removing the rows with value frequency==1 improves the accuracy


0    74528
1      841
Name: ind_var1_0, dtype: int64
0    75092
1      277
Name: ind_var1, dtype: int64
1    72209
0     3160
Name: ind_var5_0, dtype: int64
1    50085
0    25284
Name: ind_var5, dtype: int64
0    75362
1        7
Name: ind_var6_0, dtype: int64
0    75367
1        2
Name: ind_var6, dtype: int64
0    72901
1     2468
Name: ind_var8_0, dtype: int64
0    73223
1     2146
Name: ind_var8, dtype: int64
0    70396
1     4973
Name: ind_var12_0, dtype: int64
0    71990
1     3379
Name: ind_var12, dtype: int64
0    71808
1     3561
Name: ind_var13_0, dtype: int64
0    72425
1     2944
Name: ind_var13_corto_0, dtype: int64
0    72507
1     2862
Name: ind_var13_corto, dtype: int64
0    74701
1      668
Name: ind_var13_largo_0, dtype: int64
0    74711
1      658
Name: ind_var13_largo, dtype: int64
0    75367
1        2
Name: ind_var13_medio_0, dtype: int64
0    71886
1     3483
Name: ind_var13, dtype: int64
0    73677
1     1692
Name: ind_var14_0, dtype: int64
0    75000
1      369
Name: ind_var14, dtype: int64
0    75285
1       84
Name: ind_var17_0, dtype: int64
0    75301
1       68
Name: ind_var17, dtype: int64
0    75067
1      302
Name: ind_var19, dtype: int64
0    75093
1      276
Name: ind_var20_0, dtype: int64
0    75164
1      205
Name: ind_var20, dtype: int64
0    72204
1     3165
Name: ind_var24_0, dtype: int64
0    72532
1     2837
Name: ind_var24, dtype: int64
0    73385
1     1984
Name: ind_var25_cte, dtype: int64
0    73520
1     1849
Name: ind_var26_0, dtype: int64
0    73299
1     2070
Name: ind_var26_cte, dtype: int64
0    73596
1     1773
Name: ind_var25_0, dtype: int64
1    75030
0      339
Name: ind_var30_0, dtype: int64
1    55163
0    20206
Name: ind_var30, dtype: int64
0    75180
1      189
Name: ind_var31_0, dtype: int64
0    75205
1      164
Name: ind_var31, dtype: int64
0    75277
1       92
Name: ind_var32_cte, dtype: int64
0    75287
1       82
Name: ind_var32_0, dtype: int64
0    75321
1       48
Name: ind_var33_0, dtype: int64
0    75328
1       41
Name: ind_var33, dtype: int64
0    69921
1     5448
Name: ind_var37_cte, dtype: int64
0    70451
1     4918
Name: ind_var37_0, dtype: int64
1    66364
0     9005
Name: ind_var39_0, dtype: int64
1    66254
0     9115
Name: ind_var41_0, dtype: int64
0    75306
1       63
Name: ind_var44_0, dtype: int64
0    75309
1       60
Name: ind_var44, dtype: int64
0    74528
3      840
6        1
Name: num_var1_0, dtype: int64
0    75092
3      277
Name: num_var1, dtype: int64
1    37974
0    19432
2    12438
3     4311
4      995
5      185
6       32
7        2
Name: num_var4, dtype: int64
3     71722
0      3160
6       478
9         7
15        2
Name: num_var5_0, dtype: int64
3     49895
0     25284
6       186
9         3
15        1
Name: num_var5, dtype: int64
0    75362
3        7
Name: num_var6_0, dtype: int64
0    75367
3        2
Name: num_var6, dtype: int64
0    72901
3     2467
6        1
Name: num_var8_0, dtype: int64
0    73223
3     2146
Name: num_var8, dtype: int64
0      70396
3       4797
6        171
9          3
111        1
15         1
Name: num_var12_0, dtype: int64
0     71990
3      3342
6        36
15        1
Name: num_var12, dtype: int64
0     71808
3      3373
6       161
9        18
12        6
15        2
18        1
Name: num_var13_0, dtype: int64
0    72425
3     2913
6       31
Name: num_var13_corto_0, dtype: int64
0    72507
3     2860
6        2
Name: num_var13_corto, dtype: int64
0     74701
3       555
6        92
9        15
12        3
15        2
18        1
Name: num_var13_largo_0, dtype: int64
0     74711
3       568
6        78
9         7
15        2
12        2
18        1
Name: num_var13_largo, dtype: int64
0    75367
3        2
Name: num_var13_medio_0, dtype: int64
0     71886
3      3356
6       112
9         9
12        3
15        2
18        1
Name: num_var13, dtype: int64
0      73677
3       1685
6          5
111        1
12         1
Name: num_var14_0, dtype: int64
0     75000
3       365
6         3
12        1
Name: num_var14, dtype: int64
0     75285
3        43
6        19
9         9
12        6
15        3
18        2
36        1
27        1
Name: num_var17_0, dtype: int64
0     75301
3        34
6        18
9         8
12        5
27        1
18        1
15        1
Name: num_var17, dtype: int64
0    75093
3      276
Name: num_var20_0, dtype: int64
0    75164
3      205
Name: num_var20, dtype: int64
0    72204
3     3153
6       11
9        1
Name: num_var24_0, dtype: int64
0    72532
3     2835
6        2
Name: num_var24, dtype: int64
0     73520
3      1550
6       240
9        42
12       12
15        2
33        1
27        1
21        1
Name: num_var26_0, dtype: int64
0     73596
3      1501
6       216
9        40
12       11
15        2
33        1
27        1
21        1
Name: num_var25_0, dtype: int64
0     75362
3         3
6         2
48        1
9         1
Name: num_op_var40_hace3, dtype: int64
3     51198
0     20206
6      3658
9       276
12       23
15        5
33        1
21        1
18        1
Name: num_var30, dtype: int64
0     75180
3       141
6        23
9        12
15        5
12        4
18        2
36        1
27        1
Name: num_var31_0, dtype: int64
0     75205
3       125
6        20
9        11
12        4
15        2
27        1
18        1
Name: num_var31, dtype: int64
0     75287
3        60
6        20
12        1
9         1
Name: num_var32_0, dtype: int64
0    75321
3       43
6        5
Name: num_var33_0, dtype: int64
0    75328
3       37
6        4
Name: num_var33, dtype: int64
3     64464
0      9005
6      1777
9       109
12        9
15        2
33        1
21        1
18        1
Name: num_var39_0, dtype: int64
3     64829
0      9115
6      1340
9        74
12        6
15        2
33        1
21        1
18        1
Name: num_var41_0, dtype: int64
3      69762
6       4965
0        431
9        199
12         8
15         2
114        1
18         1
Name: num_var42_0, dtype: int64
3     51698
0     21680
6      1958
9        29
12        2
18        1
15        1
Name: num_var42, dtype: int64
0    75306
3       62
6        1
Name: num_var44_0, dtype: int64
0    75309
3       60
Name: num_var44, dtype: int64
   0.00000     75367
11976.60000        1
19531.80000        1
Name: saldo_var6, dtype: int64
0        75367
30000        1
9000         1
Name: saldo_var13_medio, dtype: int64
99    29995
3     22155
1     14115
2      8694
0       410
Name: var36, dtype: int64
 0.00000    75350
-1.00000       17
 1.00005        1
 1.50000        1
Name: delta_imp_aport_var17_1y3, dtype: int64
 0.00000    75346
-1.00000       17
-0.66667        1
-0.70000        1
-0.95000        1
-0.94048        1
-0.91600        1
-0.50000        1
Name: delta_imp_aport_var33_1y3, dtype: int64
 0.00000    75347
-1.00000       16
 0.02562        1
-0.47574        1
-0.63073        1
 2.65055        1
-0.93403        1
 0.33361        1
Name: delta_imp_compra_var44_1y3, dtype: int64
 0.00000    75368
-1.00000        1
Name: delta_imp_reemb_var17_1y3, dtype: int64
 0.00000    75367
-1.00000        2
Name: delta_imp_trasp_var17_in_1y3, dtype: int64
 0.00000    75363
-1.00000        6
Name: delta_imp_trasp_var33_in_1y3, dtype: int64
 0.00000    75366
-1.00000        1
 5.41710        1
-0.50577        1
Name: delta_imp_venta_var44_1y3, dtype: int64
 0.00000    73707
-1.00000     1657
 1.00000        3
-0.33333        1
-0.50000        1
Name: delta_num_aport_var13_1y3, dtype: int64
 0.00000    75351
-1.00000       17
 1.00000        1
Name: delta_num_aport_var17_1y3, dtype: int64
 0.00000    75351
-1.00000       17
-0.50000        1
Name: delta_num_aport_var33_1y3, dtype: int64
 0.00000    75350
-1.00000       16
-0.66667        1
 2.50000        1
 0.50000        1
Name: delta_num_compra_var44_1y3, dtype: int64
 0.00000    75366
-1.00000        1
 8.00000        1
-0.50000        1
Name: delta_num_venta_var44_1y3, dtype: int64
   0.00000     75366
7500.00000         1
33000.90000        1
3000.00000         1
Name: imp_aport_var17_ult1, dtype: int64
0       75363
450         2
750         1
1260        1
300         1
150         1
Name: imp_aport_var33_ult1, dtype: int64
   0.00000      75366
145384.92000        1
43500.00000         1
18000.00000         1
Name: imp_var7_emit_ult1, dtype: int64
   0.00000     75363
78019.20000        1
6743.25000         1
8972.10000         1
  99.66000         1
21550.02000        1
28728.00000        1
Name: imp_compra_var44_ult1, dtype: int64
   0.00000     75368
12027.15000        1
Name: imp_reemb_var17_hace3, dtype: int64
   0.00000     75367
96781.44000        1
45741.48000        1
Name: imp_trasp_var17_in_hace3, dtype: int64
   0.00000     75363
33079.83000        1
33744.48000        1
25501.80000        1
44251.08000        1
49581.27000        1
25920.03000        1
Name: imp_trasp_var33_in_hace3, dtype: int64
   0.00000      75366
78040.59000         1
209834.40000        1
  23.16000          1
Name: imp_venta_var44_hace3, dtype: int64
   0.00000      75367
103705.77000        1
 148.62000          1
Name: imp_venta_var44_ult1, dtype: int64
0    75366
1        3
Name: ind_var7_emit_ult1, dtype: int64
0    75180
1      189
Name: ind_var7_recib_ult1, dtype: int64
0    69273
1     6096
Name: ind_var10_ult1, dtype: int64
0    68430
1     6939
Name: ind_var10cte_ult1, dtype: int64
0    68086
1     7283
Name: ind_var9_cte_ult1, dtype: int64
0    68905
1     6464
Name: ind_var9_ult1, dtype: int64
0    70458
1     4911
Name: ind_var43_emit_ult1, dtype: int64
0    65858
1     9511
Name: ind_var43_recib_ult1, dtype: int64
0     73683
3      1545
6        97
9        25
12       14
24        2
18        2
15        1
Name: num_aport_var13_hace3, dtype: int64
0    75340
3       25
6        4
Name: num_aport_var13_ult1, dtype: int64
0     75349
3        11
6         6
12        3
Name: num_aport_var17_hace3, dtype: int64
0    75366
3        2
6        1
Name: num_aport_var17_ult1, dtype: int64
0     75346
3        21
12        1
6         1
Name: num_aport_var33_hace3, dtype: int64
0    75363
3        5
6        1
Name: num_aport_var33_ult1, dtype: int64
0    75366
3        3
Name: num_var7_emit_ult1, dtype: int64
0     75180
3       154
6        27
9         5
12        3
Name: num_var7_recib_ult1, dtype: int64
0    75347
3       15
6        5
9        2
Name: num_compra_var44_hace3, dtype: int64
0     75363
3         4
21        1
9         1
Name: num_compra_var44_ult1, dtype: int64
3    42546
0    20463
2     9147
1     3213
Name: num_meses_var5_ult3, dtype: int64
0    73182
2      900
1      818
3      469
Name: num_meses_var8_ult3, dtype: int64
0    71967
2     1686
3     1273
1      443
Name: num_meses_var12_ult3, dtype: int64
0    72429
2     1557
3     1314
1       69
Name: num_meses_var13_corto_ult3, dtype: int64
0    74887
3      294
2      176
1       12
Name: num_meses_var13_largo_ult3, dtype: int64
0    75367
2        2
Name: num_meses_var13_medio_ult3, dtype: int64
0    75296
2       51
3       15
1        7
Name: num_meses_var17_ult3, dtype: int64
0    75365
2        2
1        2
Name: num_meses_var29_ult3, dtype: int64
0    75328
3       24
2       16
1        1
Name: num_meses_var33_ult3, dtype: int64
2    54120
1    10837
0    10067
3      345
Name: num_meses_var39_vig_ult3, dtype: int64
0    75306
2       40
3       21
1        2
Name: num_meses_var44_ult3, dtype: int64
0     75331
3        26
6         8
24        1
18        1
12        1
9         1
Name: num_op_var40_efect_ult1, dtype: int64
0     75324
3        25
6        12
24        2
9         2
21        1
18        1
15        1
12        1
Name: num_op_var40_efect_ult3, dtype: int64
0    75368
3        1
Name: num_reemb_var17_hace3, dtype: int64
0     75275
3        71
6        20
15        1
12        1
9         1
Name: num_sal_var16_ult1, dtype: int64
0    75367
6        1
3        1
Name: num_trasp_var17_in_hace3, dtype: int64
0    75363
3        6
Name: num_trasp_var33_in_hace3, dtype: int64
0    75366
3        2
6        1
Name: num_venta_var44_hace3, dtype: int64
0     75367
27        1
3         1
Name: num_venta_var44_ult1, dtype: int64
   0.00000    75367
5586.21000        1
7741.95000        1
Name: saldo_medio_var13_medio_hace2, dtype: int64
   0.00000     75367
7293.12000         1
18870.99000        1
Name: saldo_medio_var13_medio_ult3, dtype: int64
   0.00000     75366
10430.01000        1
 868.98000         1
  59.97000         1
Name: saldo_medio_var29_hace2, dtype: int64
  0.00000    75368
145.20000        1
Name: saldo_medio_var29_hace3, dtype: int64
   0.00000     75367
13793.67000        1
3365.46000         1
Name: saldo_medio_var29_ult1, dtype: int64
   0.00000    75367
7331.34000        1
3365.46000        1
Name: saldo_medio_var29_ult3, dtype: int64
0    72373
1     2996
Name: TARGET, dtype: int64

In [76]:
train_labels = train_df_clean3['TARGET']
train_features = train_df_clean3.drop(['ID','TARGET'], axis=1)
train_features.shape, train_labels.shape


Out[76]:
((75369, 273), (75369,))

In [77]:
#single features prediciton power
model = ensemble.GradientBoostingClassifier(n_estimators=10, 
                                            max_features=1, 
                                            max_depth=3, 
                                            min_samples_leaf=100,
                                            learning_rate=0.3, 
                                            subsample=0.65, 
                                            loss='deviance',
                                            verbose=0,
                                            random_state=1)

X_train, X_valid, y_train, y_valid = cross_validation.train_test_split(train_features, 
                                                                       train_labels, 
                                                                       test_size=0.5, 
                                                                       random_state=1)
        
time_start = time.time()
single_var_AUC_list = []
single_var_AUC_dict = {}
for i, feature in enumerate(X_train.columns):    
    train = X_train[feature].values.reshape(-1,1)
    valid = X_valid[feature].values.reshape(-1,1)
    model.fit(train, y_train)
    
    train_AUC = auc(y_train, model.predict_proba(train)[:,1])
    valid_AUC = auc(y_valid, model.predict_proba(valid)[:,1])
        
    single_var_AUC_list.append(valid_AUC)
    single_var_AUC_dict[feature] = valid_AUC
    print "feature %d/%d: AUC=%f" %(i, len(X_train.columns),valid_AUC)
        
valid_AUC = np.array(single_var_AUC_list)
tim_spent = (time.time()-time_start)/60
print("(min,mean,max) AUC = (%.3f,%.3f,%.3f). took %.2f minutes" %(valid_AUC.min(),valid_AUC.mean(),valid_AUC.max(), tim_spent))

# show the scatter plot of the individual feature performance 
plt.figure()
plt.hist(valid_AUC, 50, normed=1, facecolor='blue', alpha=0.75)
plt.xlabel('AUC'); 
plt.ylabel('frequency'); 
plt.title('single feature AUC histogram'); 
plt.show()

sorted_vars_df = pd.DataFrame(index=range(len(single_var_AUC_dict.keys())), columns=['feature','AUC'])
for k,key in enumerate(single_var_AUC_dict):
    sorted_vars_df.ix[k,'feature'] = key
    sorted_vars_df.ix[k,'AUC'] = single_var_AUC_dict[key]
sorted_vars_df = sorted_vars_df.sort_values(by='AUC', axis=0, ascending=False).reset_index(drop=True)

sorted_vars_df.head(10)


feature 0/273: AUC=0.501831
feature 1/273: AUC=0.715859
feature 2/273: AUC=0.497597
feature 3/273: AUC=0.513979
feature 4/273: AUC=0.507052
feature 5/273: AUC=0.500526
feature 6/273: AUC=0.501089
feature 7/273: AUC=0.500000
feature 8/273: AUC=0.500000
feature 9/273: AUC=0.500000
feature 10/273: AUC=0.510227
feature 11/273: AUC=0.504226
feature 12/273: AUC=0.513789
feature 13/273: AUC=0.513591
feature 14/273: AUC=0.529833
feature 15/273: AUC=0.510620
feature 16/273: AUC=0.513173
feature 17/273: AUC=0.529167
feature 18/273: AUC=0.500000
feature 19/273: AUC=0.500562
feature 20/273: AUC=0.500000
feature 21/273: AUC=0.518035
feature 22/273: AUC=0.670083
feature 23/273: AUC=0.500000
feature 24/273: AUC=0.500000
feature 25/273: AUC=0.522827
feature 26/273: AUC=0.514460
feature 27/273: AUC=0.525320
feature 28/273: AUC=0.518286
feature 29/273: AUC=0.520845
feature 30/273: AUC=0.516542
feature 31/273: AUC=0.516327
feature 32/273: AUC=0.504690
feature 33/273: AUC=0.504607
feature 34/273: AUC=0.500000
feature 35/273: AUC=0.520644
feature 36/273: AUC=0.508648
feature 37/273: AUC=0.501640
feature 38/273: AUC=0.500000
feature 39/273: AUC=0.500000
feature 40/273: AUC=0.502072
feature 41/273: AUC=0.501795
feature 42/273: AUC=0.500000
feature 43/273: AUC=0.515672
feature 44/273: AUC=0.515472
feature 45/273: AUC=0.512714
feature 46/273: AUC=0.510368
feature 47/273: AUC=0.513077
feature 48/273: AUC=0.510273
feature 49/273: AUC=0.502107
feature 50/273: AUC=0.673763
feature 51/273: AUC=0.500000
feature 52/273: AUC=0.500000
feature 53/273: AUC=0.500000
feature 54/273: AUC=0.500000
feature 55/273: AUC=0.500000
feature 56/273: AUC=0.500000
feature 57/273: AUC=0.499124
feature 58/273: AUC=0.499120
feature 59/273: AUC=0.532554
feature 60/273: AUC=0.532063
feature 61/273: AUC=0.500000
feature 62/273: AUC=0.500000
feature 63/273: AUC=0.500562
feature 64/273: AUC=0.500000
feature 65/273: AUC=0.696806
feature 66/273: AUC=0.516297
feature 67/273: AUC=0.670083
feature 68/273: AUC=0.500000
feature 69/273: AUC=0.500000
feature 70/273: AUC=0.522827
feature 71/273: AUC=0.514460
feature 72/273: AUC=0.525320
feature 73/273: AUC=0.518286
feature 74/273: AUC=0.520845
feature 75/273: AUC=0.516542
feature 76/273: AUC=0.516327
feature 77/273: AUC=0.504690
feature 78/273: AUC=0.504607
feature 79/273: AUC=0.500000
feature 80/273: AUC=0.520644
feature 81/273: AUC=0.508648
feature 82/273: AUC=0.501640
feature 83/273: AUC=0.500000
feature 84/273: AUC=0.500000
feature 85/273: AUC=0.501795
feature 86/273: AUC=0.500000
feature 87/273: AUC=0.515672
feature 88/273: AUC=0.515472
feature 89/273: AUC=0.510369
feature 90/273: AUC=0.510274
feature 91/273: AUC=0.500000
feature 92/273: AUC=0.500000
feature 93/273: AUC=0.500000
feature 94/273: AUC=0.500000
feature 95/273: AUC=0.511658
feature 96/273: AUC=0.501902
feature 97/273: AUC=0.513153
feature 98/273: AUC=0.509917
feature 99/273: AUC=0.511676
feature 100/273: AUC=0.501874
feature 101/273: AUC=0.514139
feature 102/273: AUC=0.511153
feature 103/273: AUC=0.534945
feature 104/273: AUC=0.679165
feature 105/273: AUC=0.500000
feature 106/273: AUC=0.500000
feature 107/273: AUC=0.500000
feature 108/273: AUC=0.500000
feature 109/273: AUC=0.500000
feature 110/273: AUC=0.697177
feature 111/273: AUC=0.502484
feature 112/273: AUC=0.499041
feature 113/273: AUC=0.529272
feature 114/273: AUC=0.529983
feature 115/273: AUC=0.524541
feature 116/273: AUC=0.670048
feature 117/273: AUC=0.500000
feature 118/273: AUC=0.500000
feature 119/273: AUC=0.500000
feature 120/273: AUC=0.697327
feature 121/273: AUC=0.500000
feature 122/273: AUC=0.515448
feature 123/273: AUC=0.518397
feature 124/273: AUC=0.516320
feature 125/273: AUC=0.504609
feature 126/273: AUC=0.500000
feature 127/273: AUC=0.520607
feature 128/273: AUC=0.501642
feature 129/273: AUC=0.500000
feature 130/273: AUC=0.500000
feature 131/273: AUC=0.515431
feature 132/273: AUC=0.504406
feature 133/273: AUC=0.504198
feature 134/273: AUC=0.719565
feature 135/273: AUC=0.500000
feature 136/273: AUC=0.500000
feature 137/273: AUC=0.500000
feature 138/273: AUC=0.494535
feature 139/273: AUC=0.705306
feature 140/273: AUC=0.500000
feature 141/273: AUC=0.646810
feature 142/273: AUC=0.511181
feature 143/273: AUC=0.500000
feature 144/273: AUC=0.500000
feature 145/273: AUC=0.500000
feature 146/273: AUC=0.500000
feature 147/273: AUC=0.500000
feature 148/273: AUC=0.500000
feature 149/273: AUC=0.500000
feature 150/273: AUC=0.511084
feature 151/273: AUC=0.500000
feature 152/273: AUC=0.500000
feature 153/273: AUC=0.500000
feature 154/273: AUC=0.500000
feature 155/273: AUC=0.511327
feature 156/273: AUC=0.500000
feature 157/273: AUC=0.500000
feature 158/273: AUC=0.500000
feature 159/273: AUC=0.500000
feature 160/273: AUC=0.500000
feature 161/273: AUC=0.500000
feature 162/273: AUC=0.500000
feature 163/273: AUC=0.500000
feature 164/273: AUC=0.500000
feature 165/273: AUC=0.500000
feature 166/273: AUC=0.503177
feature 167/273: AUC=0.510052
feature 168/273: AUC=0.500000
feature 169/273: AUC=0.500000
feature 170/273: AUC=0.500000
feature 171/273: AUC=0.500000
feature 172/273: AUC=0.500000
feature 173/273: AUC=0.500000
feature 174/273: AUC=0.507704
feature 175/273: AUC=0.507564
feature 176/273: AUC=0.506607
feature 177/273: AUC=0.506513
feature 178/273: AUC=0.506698
feature 179/273: AUC=0.520004
feature 180/273: AUC=0.503319
feature 181/273: AUC=0.511333
feature 182/273: AUC=0.500000
feature 183/273: AUC=0.500000
feature 184/273: AUC=0.500000
feature 185/273: AUC=0.500000
feature 186/273: AUC=0.500000
feature 187/273: AUC=0.500000
feature 188/273: AUC=0.500000
feature 189/273: AUC=0.500000
feature 190/273: AUC=0.500000
feature 191/273: AUC=0.500357
feature 192/273: AUC=0.505611
feature 193/273: AUC=0.508789
feature 194/273: AUC=0.506811
feature 195/273: AUC=0.517106
feature 196/273: AUC=0.514046
feature 197/273: AUC=0.538084
feature 198/273: AUC=0.693430
feature 199/273: AUC=0.514844
feature 200/273: AUC=0.517777
feature 201/273: AUC=0.516575
feature 202/273: AUC=0.503618
feature 203/273: AUC=0.500000
feature 204/273: AUC=0.500000
feature 205/273: AUC=0.500000
feature 206/273: AUC=0.500000
feature 207/273: AUC=0.555588
feature 208/273: AUC=0.500000
feature 209/273: AUC=0.509288
feature 210/273: AUC=0.505691
feature 211/273: AUC=0.500525
feature 212/273: AUC=0.500236
feature 213/273: AUC=0.500000
feature 214/273: AUC=0.500000
feature 215/273: AUC=0.512061
feature 216/273: AUC=0.508815
feature 217/273: AUC=0.506851
feature 218/273: AUC=0.503388
feature 219/273: AUC=0.507524
feature 220/273: AUC=0.503287
feature 221/273: AUC=0.500000
feature 222/273: AUC=0.500000
feature 223/273: AUC=0.501326
feature 224/273: AUC=0.520019
feature 225/273: AUC=0.504493
feature 226/273: AUC=0.500000
feature 227/273: AUC=0.500000
feature 228/273: AUC=0.500000
feature 229/273: AUC=0.500000
feature 230/273: AUC=0.550952
feature 231/273: AUC=0.510978
feature 232/273: AUC=0.544896
feature 233/273: AUC=0.531257
feature 234/273: AUC=0.700980
feature 235/273: AUC=0.672831
feature 236/273: AUC=0.697150
feature 237/273: AUC=0.694594
feature 238/273: AUC=0.510465
feature 239/273: AUC=0.502153
feature 240/273: AUC=0.515063
feature 241/273: AUC=0.516814
feature 242/273: AUC=0.515697
feature 243/273: AUC=0.506930
feature 244/273: AUC=0.517425
feature 245/273: AUC=0.517581
feature 246/273: AUC=0.516332
feature 247/273: AUC=0.508875
feature 248/273: AUC=0.516334
feature 249/273: AUC=0.516330
feature 250/273: AUC=0.503563
feature 251/273: AUC=0.500000
feature 252/273: AUC=0.503591
feature 253/273: AUC=0.503591
feature 254/273: AUC=0.500000
feature 255/273: AUC=0.500000
feature 256/273: AUC=0.500000
feature 257/273: AUC=0.500000
feature 258/273: AUC=0.500000
feature 259/273: AUC=0.500000
feature 260/273: AUC=0.500000
feature 261/273: AUC=0.500000
feature 262/273: AUC=0.500000
feature 263/273: AUC=0.500000
feature 264/273: AUC=0.500000
feature 265/273: AUC=0.500000
feature 266/273: AUC=0.500000
feature 267/273: AUC=0.500000
feature 268/273: AUC=0.500000
feature 269/273: AUC=0.500000
feature 270/273: AUC=0.500000
feature 271/273: AUC=0.500000
feature 272/273: AUC=0.591160
(min,mean,max) AUC = (0.495,0.518,0.720). took 0.53 minutes
Out[77]:
feature AUC
0 saldo_var30 0.71957
1 var15 0.71586
2 saldo_var42 0.70531
3 saldo_medio_var5_hace2 0.70098
4 saldo_var5 0.69733
5 num_var35 0.69718
6 saldo_medio_var5_ult1 0.69715
7 num_var4 0.69681
8 saldo_medio_var5_ult3 0.69459
9 num_meses_var5_ult3 0.69343

In [79]:
#features binarization
train_features_bin = Binarizer().fit_transform(scale(train_features))

In [82]:
#features selection chi2
selectChi2 = SelectPercentile(chi2, percentile=75).fit(train_features_bin, train_labels)
chi2_selected = selectChi2.get_support()
chi2_selected_features = [ f for i,f in enumerate(train_features.columns) if chi2_selected[i]]
print len(chi2_selected_features)


204

In [83]:
#features selection F_classif
selectF_classif = SelectPercentile(f_classif, percentile=75).fit(train_features_bin, train_labels)
f_classif_selected = selectF_classif.get_support()
f_classif_selected_features = [ f for i,f in enumerate(train_features.columns) if f_classif_selected[i]]
print len(f_classif_selected_features)


204

In [85]:
#intesection of selected features
selected = chi2_selected & f_classif_selected
selected_features = [ f for f,s in zip(train_features.columns, selected) if s]
print len(selected_features)


199

In [86]:
X_train, X_valid, y_train, y_valid= cross_validation.train_test_split(train_features[selected_features], 
                                                                     train_labels, 
                                                                     random_state=1301, 
                                                                     stratify=train_labels, 
                                                                     test_size=0.3)

In [ ]: