In [78]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from sklearn import ensemble
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score as auc
import time
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif,chi2
from sklearn.preprocessing import Binarizer, scale
In [7]:
ROOT_PATH = os.path.join("/media", "shabou", "7D04-9B6D", "WS", "data", "datasets", "santander")
In [8]:
train_df = pd.read_csv(os.path.join(ROOT_PATH,'train.csv'))
In [9]:
train_df.head(2)
Out[9]:
ID
var3
var15
imp_ent_var16_ult1
imp_op_var39_comer_ult1
imp_op_var39_comer_ult3
imp_op_var40_comer_ult1
imp_op_var40_comer_ult3
imp_op_var40_efect_ult1
imp_op_var40_efect_ult3
...
saldo_medio_var33_hace2
saldo_medio_var33_hace3
saldo_medio_var33_ult1
saldo_medio_var33_ult3
saldo_medio_var44_hace2
saldo_medio_var44_hace3
saldo_medio_var44_ult1
saldo_medio_var44_ult3
var38
TARGET
0
1
2
23
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
39205.17
0
1
3
2
34
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
49278.03
0
2 rows × 371 columns
In [10]:
train_df.shape
Out[10]:
(76020, 371)
In [69]:
train_df_describe = train_df.describe()
pd.set_option('display.max_rows', train_df_describe.shape[1])
pd.options.display.float_format = '{:10.5f}'.format
print train_df_describe.loc[['min', 'max', 'mean']].T
#problem in min var3
min max mean
ID 1.00000 151838.00000 75964.05072
var3 -999999.00000 238.00000 -1523.19928
var15 5.00000 105.00000 33.21287
imp_ent_var16_ult1 0.00000 210000.00000 86.20827
imp_op_var39_comer_ult1 0.00000 12888.03000 72.36307
imp_op_var39_comer_ult3 0.00000 21024.81000 119.52963
imp_op_var40_comer_ult1 0.00000 8237.82000 3.55913
imp_op_var40_comer_ult3 0.00000 11073.57000 6.47270
imp_op_var40_efect_ult1 0.00000 6600.00000 0.41295
imp_op_var40_efect_ult3 0.00000 6600.00000 0.56735
imp_op_var40_ult1 0.00000 8237.82000 3.16072
imp_op_var41_comer_ult1 0.00000 12888.03000 68.80394
imp_op_var41_comer_ult3 0.00000 16566.81000 113.05693
imp_op_var41_efect_ult1 0.00000 45990.00000 68.20514
imp_op_var41_efect_ult3 0.00000 131100.00000 113.22506
imp_op_var41_ult1 0.00000 47598.09000 137.24276
imp_op_var39_efect_ult1 0.00000 45990.00000 68.61809
imp_op_var39_efect_ult3 0.00000 131100.00000 113.79241
imp_op_var39_ult1 0.00000 47598.09000 140.40348
imp_sal_var16_ult1 0.00000 105000.00000 5.47768
ind_var1_0 0.00000 1.00000 0.01146
ind_var1 0.00000 1.00000 0.00376
ind_var2_0 0.00000 0.00000 0.00000
ind_var2 0.00000 0.00000 0.00000
ind_var5_0 0.00000 1.00000 0.95802
ind_var5 0.00000 1.00000 0.66376
ind_var6_0 0.00000 1.00000 0.00011
ind_var6 0.00000 1.00000 0.00003
ind_var8_0 0.00000 1.00000 0.03283
ind_var8 0.00000 1.00000 0.02860
ind_var12_0 0.00000 1.00000 0.06752
ind_var12 0.00000 1.00000 0.04546
ind_var13_0 0.00000 1.00000 0.05225
ind_var13_corto_0 0.00000 1.00000 0.04294
ind_var13_corto 0.00000 1.00000 0.04148
ind_var13_largo_0 0.00000 1.00000 0.01017
ind_var13_largo 0.00000 1.00000 0.01000
ind_var13_medio_0 0.00000 1.00000 0.00003
ind_var13_medio 0.00000 1.00000 0.00003
ind_var13 0.00000 1.00000 0.05086
ind_var14_0 0.00000 1.00000 0.02365
ind_var14 0.00000 1.00000 0.00530
ind_var17_0 0.00000 1.00000 0.00180
ind_var17 0.00000 1.00000 0.00145
ind_var18_0 0.00000 1.00000 0.00003
ind_var18 0.00000 1.00000 0.00003
ind_var19 0.00000 1.00000 0.00420
ind_var20_0 0.00000 1.00000 0.00363
ind_var20 0.00000 1.00000 0.00270
ind_var24_0 0.00000 1.00000 0.04237
ind_var24 0.00000 1.00000 0.03788
ind_var25_cte 0.00000 1.00000 0.02643
ind_var26_0 0.00000 1.00000 0.02464
ind_var26_cte 0.00000 1.00000 0.02756
ind_var26 0.00000 1.00000 0.02464
ind_var25_0 0.00000 1.00000 0.02364
ind_var25 0.00000 1.00000 0.02364
ind_var27_0 0.00000 0.00000 0.00000
ind_var28_0 0.00000 0.00000 0.00000
ind_var28 0.00000 0.00000 0.00000
ind_var27 0.00000 0.00000 0.00000
ind_var29_0 0.00000 1.00000 0.00011
ind_var29 0.00000 1.00000 0.00003
ind_var30_0 0.00000 1.00000 0.99549
ind_var30 0.00000 1.00000 0.73283
ind_var31_0 0.00000 1.00000 0.00428
ind_var31 0.00000 1.00000 0.00367
ind_var32_cte 0.00000 1.00000 0.00121
ind_var32_0 0.00000 1.00000 0.00108
ind_var32 0.00000 1.00000 0.00108
ind_var33_0 0.00000 1.00000 0.00075
ind_var33 0.00000 1.00000 0.00063
ind_var34_0 0.00000 1.00000 0.00003
ind_var34 0.00000 1.00000 0.00003
ind_var37_cte 0.00000 1.00000 0.07230
ind_var37_0 0.00000 1.00000 0.06526
ind_var37 0.00000 1.00000 0.06526
ind_var39_0 0.00000 1.00000 0.88076
ind_var40_0 0.00000 1.00000 0.01142
ind_var40 0.00000 1.00000 0.00372
ind_var41_0 0.00000 1.00000 0.87928
ind_var41 0.00000 0.00000 0.00000
ind_var39 0.00000 1.00000 0.00372
ind_var44_0 0.00000 1.00000 0.00188
ind_var44 0.00000 1.00000 0.00170
ind_var46_0 0.00000 0.00000 0.00000
ind_var46 0.00000 0.00000 0.00000
num_var1_0 0.00000 6.00000 0.03445
num_var1 0.00000 6.00000 0.01133
num_var4 0.00000 7.00000 1.07944
num_var5_0 0.00000 15.00000 2.89404
num_var5 0.00000 15.00000 1.99917
num_var6_0 0.00000 3.00000 0.00032
num_var6 0.00000 3.00000 0.00008
num_var8_0 0.00000 6.00000 0.09854
num_var8 0.00000 3.00000 0.08579
num_var12_0 0.00000 111.00000 0.21125
num_var12 0.00000 15.00000 0.13800
num_var13_0 0.00000 18.00000 0.16772
num_var13_corto_0 0.00000 6.00000 0.13031
num_var13_corto 0.00000 6.00000 0.12451
num_var13_largo_0 0.00000 18.00000 0.03733
num_var13_largo 0.00000 18.00000 0.03524
num_var13_medio_0 0.00000 3.00000 0.00008
num_var13_medio 0.00000 3.00000 0.00008
num_var13 0.00000 18.00000 0.15983
num_var14_0 0.00000 111.00000 0.07269
num_var14 0.00000 12.00000 0.01614
num_var17_0 0.00000 36.00000 0.01188
num_var17 0.00000 27.00000 0.00888
num_var18_0 0.00000 3.00000 0.00008
num_var18 0.00000 3.00000 0.00008
num_var20_0 0.00000 3.00000 0.01089
num_var20 0.00000 3.00000 0.00809
num_var24_0 0.00000 9.00000 0.12766
num_var24 0.00000 6.00000 0.11377
num_var26_0 0.00000 33.00000 0.08938
num_var26 0.00000 33.00000 0.08938
num_var25_0 0.00000 33.00000 0.08516
num_var25 0.00000 33.00000 0.08516
num_op_var40_hace2 0.00000 117.00000 0.02024
num_op_var40_hace3 0.00000 48.00000 0.00103
num_op_var40_ult1 0.00000 234.00000 0.05702
num_op_var40_ult3 0.00000 351.00000 0.07830
num_op_var41_hace2 0.00000 249.00000 1.60114
num_op_var41_hace3 0.00000 81.00000 0.09392
num_op_var41_ult1 0.00000 468.00000 2.85884
num_op_var41_ult3 0.00000 468.00000 4.55391
num_op_var39_hace2 0.00000 249.00000 1.62139
num_op_var39_hace3 0.00000 81.00000 0.09495
num_op_var39_ult1 0.00000 468.00000 2.91586
num_op_var39_ult3 0.00000 468.00000 4.63220
num_var27_0 0.00000 0.00000 0.00000
num_var28_0 0.00000 0.00000 0.00000
num_var28 0.00000 0.00000 0.00000
num_var27 0.00000 0.00000 0.00000
num_var29_0 0.00000 3.00000 0.00032
num_var29 0.00000 3.00000 0.00008
num_var30_0 0.00000 114.00000 3.37186
num_var30 0.00000 33.00000 2.38287
num_var31_0 0.00000 36.00000 0.02013
num_var31 0.00000 27.00000 0.01606
num_var32_0 0.00000 12.00000 0.00422
num_var32 0.00000 12.00000 0.00422
num_var33_0 0.00000 12.00000 0.00257
num_var33 0.00000 6.00000 0.00209
num_var34_0 0.00000 3.00000 0.00008
num_var34 0.00000 3.00000 0.00008
num_var35 0.00000 36.00000 3.29937
num_var37_med_ult2 0.00000 105.00000 0.26354
num_var37_0 0.00000 114.00000 0.41878
num_var37 0.00000 114.00000 0.41878
num_var39_0 0.00000 33.00000 2.72494
num_var40_0 0.00000 6.00000 0.03429
num_var40 0.00000 3.00000 0.01117
num_var41_0 0.00000 33.00000 2.69925
num_var41 0.00000 0.00000 0.00000
num_var39 0.00000 3.00000 0.01117
num_var42_0 0.00000 114.00000 3.20414
num_var42 0.00000 18.00000 2.21800
num_var44_0 0.00000 6.00000 0.00568
num_var44 0.00000 3.00000 0.00509
num_var46_0 0.00000 0.00000 0.00000
num_var46 0.00000 0.00000 0.00000
saldo_var1 -0.90000 3000000.00000 48.44911
saldo_var5 -2895.72000 619329.15000 1028.46824
saldo_var6 0.00000 19531.80000 0.41448
saldo_var8 -4942.26000 240045.00000 141.22678
saldo_var12 0.00000 3008077.32000 6021.61590
saldo_var13_corto 0.00000 450000.00000 4993.75297
saldo_var13_largo 0.00000 1500000.00000 1493.68227
saldo_var13_medio 0.00000 30000.00000 0.51302
saldo_var13 0.00000 1500000.00000 6487.94826
saldo_var14 0.00000 450000.00000 69.09620
saldo_var17 0.00000 6119500.14000 183.40590
saldo_var18 0.00000 3000000.00000 43.40963
saldo_var20 0.00000 455858.16000 27.39946
saldo_var24 0.00000 3008077.32000 5925.12024
saldo_var26 0.00000 69756.72000 76.08163
saldo_var25 0.00000 69756.72000 72.73569
saldo_var28 0.00000 0.00000 0.00000
saldo_var27 0.00000 0.00000 0.00000
saldo_var29 0.00000 19531.80000 0.41448
saldo_var30 -4942.26000 3458077.32000 13679.67366
saldo_var31 0.00000 6119500.14000 292.29097
saldo_var32 0.00000 12210.78000 3.34594
saldo_var33 0.00000 142078.80000 12.53233
saldo_var34 0.00000 36000.00000 0.67088
saldo_var37 0.00000 60000.00000 36.90719
saldo_var40 -0.90000 8192.61000 4.36860
saldo_var41 0.00000 0.00000 0.00000
saldo_var42 -4942.26000 3008077.32000 7191.72540
saldo_var44 0.00000 740006.61000 96.35274
saldo_var46 0.00000 0.00000 0.00000
var36 0.00000 99.00000 40.44908
delta_imp_amort_var18_1y3 0.00000 9999999999.00000 263088.66085
delta_imp_amort_var34_1y3 0.00000 9999999999.00000 263088.66085
delta_imp_aport_var13_1y3 -1.00000 9999999999.00000 48671402.23587
delta_imp_aport_var17_1y3 -1.00000 9999999999.00000 5130228.88643
delta_imp_aport_var33_1y3 -1.00000 9999999999.00000 131544.33014
delta_imp_compra_var44_1y3 -1.00000 9999999999.00000 9208103.12972
delta_imp_reemb_var13_1y3 0.00000 9999999999.00000 4998684.55620
delta_imp_reemb_var17_1y3 -1.00000 9999999999.00000 2630886.60851
delta_imp_reemb_var33_1y3 0.00000 9999999999.00000 131544.33043
delta_imp_trasp_var17_in_1y3 -1.00000 9999999999.00000 526177.32168
delta_imp_trasp_var17_out_1y3 0.00000 9999999999.00000 526177.32170
delta_imp_trasp_var33_in_1y3 -1.00000 9999999999.00000 657721.65205
delta_imp_trasp_var33_out_1y3 0.00000 9999999999.00000 131544.33043
delta_imp_venta_var44_1y3 -1.00000 9999999999.00000 5524861.87795
delta_num_aport_var13_1y3 -1.00000 9999999999.00000 48671402.23569
delta_num_aport_var17_1y3 -1.00000 9999999999.00000 5130228.88643
delta_num_aport_var33_1y3 -1.00000 9999999999.00000 131544.33020
delta_num_compra_var44_1y3 -1.00000 9999999999.00000 9208103.12977
delta_num_reemb_var13_1y3 0.00000 9999999999.00000 4998684.55620
delta_num_reemb_var17_1y3 -1.00000 9999999999.00000 2630886.60851
delta_num_reemb_var33_1y3 0.00000 9999999999.00000 131544.33043
delta_num_trasp_var17_in_1y3 -1.00000 9999999999.00000 526177.32168
delta_num_trasp_var17_out_1y3 0.00000 9999999999.00000 526177.32170
delta_num_trasp_var33_in_1y3 -1.00000 9999999999.00000 657721.65205
delta_num_trasp_var33_out_1y3 0.00000 9999999999.00000 131544.33043
delta_num_venta_var44_1y3 -1.00000 9999999999.00000 5524861.87799
imp_amort_var18_hace3 0.00000 0.00000 0.00000
imp_amort_var18_ult1 0.00000 15691.80000 0.23119
imp_amort_var34_hace3 0.00000 0.00000 0.00000
imp_amort_var34_ult1 0.00000 1096.02000 0.01805
imp_aport_var13_hace3 0.00000 840000.00000 2823.94909
imp_aport_var13_ult1 0.00000 450000.00000 619.58501
imp_aport_var17_hace3 0.00000 6083691.87000 98.78877
imp_aport_var17_ult1 0.00000 432457.32000 31.10532
imp_aport_var33_hace3 0.00000 36000.00000 2.98579
imp_aport_var33_ult1 0.00000 1260.00000 0.04815
imp_var7_emit_ult1 0.00000 145384.92000 2.72145
imp_var7_recib_ult1 0.00000 1039260.00000 127.69821
imp_compra_var44_hace3 0.00000 210001.35000 13.96458
imp_compra_var44_ult1 0.00000 3410058.66000 116.78253
imp_reemb_var13_hace3 0.00000 0.00000 0.00000
imp_reemb_var13_ult1 0.00000 450000.00000 46.18024
imp_reemb_var17_hace3 0.00000 12027.15000 0.15821
imp_reemb_var17_ult1 0.00000 182132.97000 12.56940
imp_reemb_var33_hace3 0.00000 0.00000 0.00000
imp_reemb_var33_ult1 0.00000 1200.00000 0.01579
imp_var43_emit_ult1 0.00000 1155003.00000 854.12075
imp_trans_var37_ult1 0.00000 2310003.00000 1932.95443
imp_trasp_var17_in_hace3 0.00000 96781.44000 1.87481
imp_trasp_var17_in_ult1 0.00000 133730.58000 2.51268
imp_trasp_var17_out_hace3 0.00000 0.00000 0.00000
imp_trasp_var17_out_ult1 0.00000 69622.29000 1.91362
imp_trasp_var33_in_hace3 0.00000 49581.27000 2.78977
imp_trasp_var33_in_ult1 0.00000 13207.32000 0.31470
imp_trasp_var33_out_hace3 0.00000 0.00000 0.00000
imp_trasp_var33_out_ult1 0.00000 3000.00000 0.03946
imp_venta_var44_hace3 0.00000 209834.40000 3.78714
imp_venta_var44_ult1 0.00000 2754476.46000 81.43383
ind_var7_emit_ult1 0.00000 1.00000 0.00004
ind_var7_recib_ult1 0.00000 1.00000 0.00270
ind_var10_ult1 0.00000 1.00000 0.08087
ind_var10cte_ult1 0.00000 1.00000 0.09216
ind_var9_cte_ult1 0.00000 1.00000 0.09687
ind_var9_ult1 0.00000 1.00000 0.08591
ind_var43_emit_ult1 0.00000 1.00000 0.06659
ind_var43_recib_ult1 0.00000 1.00000 0.12931
var21 0.00000 30000.00000 32.54933
num_var2_0_ult1 0.00000 0.00000 0.00000
num_var2_ult1 0.00000 0.00000 0.00000
num_aport_var13_hace3 0.00000 24.00000 0.07589
num_aport_var13_ult1 0.00000 30.00000 0.01796
num_aport_var17_hace3 0.00000 12.00000 0.00154
num_aport_var17_ult1 0.00000 21.00000 0.00339
num_aport_var33_hace3 0.00000 12.00000 0.00107
num_aport_var33_ult1 0.00000 6.00000 0.00032
num_var7_emit_ult1 0.00000 3.00000 0.00012
num_var7_recib_ult1 0.00000 24.00000 0.01030
num_compra_var44_hace3 0.00000 9.00000 0.00185
num_compra_var44_ult1 0.00000 39.00000 0.00754
num_ent_var16_ult1 0.00000 60.00000 0.18796
num_var22_hace2 0.00000 123.00000 1.29870
num_var22_hace3 0.00000 108.00000 1.18489
num_var22_ult1 0.00000 96.00000 0.56066
num_var22_ult3 0.00000 234.00000 3.04424
num_med_var22_ult3 0.00000 78.00000 0.63587
num_med_var45_ult3 0.00000 267.00000 4.02466
num_meses_var5_ult3 0.00000 3.00000 1.97998
num_meses_var8_ult3 0.00000 3.00000 0.05360
num_meses_var12_ult3 0.00000 3.00000 0.10205
num_meses_var13_corto_ult3 0.00000 3.00000 0.09892
num_meses_var13_largo_ult3 0.00000 3.00000 0.01740
num_meses_var13_medio_ult3 0.00000 2.00000 0.00005
num_meses_var17_ult3 0.00000 3.00000 0.00296
num_meses_var29_ult3 0.00000 2.00000 0.00011
num_meses_var33_ult3 0.00000 3.00000 0.00151
num_meses_var39_vig_ult3 0.00000 3.00000 1.59279
num_meses_var44_ult3 0.00000 3.00000 0.00358
num_op_var39_comer_ult1 0.00000 438.00000 2.19479
num_op_var39_comer_ult3 0.00000 600.00000 3.60706
num_op_var40_comer_ult1 0.00000 210.00000 0.07498
num_op_var40_comer_ult3 0.00000 582.00000 0.14483
num_op_var40_efect_ult1 0.00000 24.00000 0.00249
num_op_var40_efect_ult3 0.00000 24.00000 0.00367
num_op_var41_comer_ult1 0.00000 438.00000 2.11981
num_op_var41_comer_ult3 0.00000 438.00000 3.46223
num_op_var41_efect_ult1 0.00000 90.00000 0.71942
num_op_var41_efect_ult3 0.00000 156.00000 1.21215
num_op_var39_efect_ult1 0.00000 90.00000 0.72190
num_op_var39_efect_ult3 0.00000 156.00000 1.21582
num_reemb_var13_hace3 0.00000 0.00000 0.00000
num_reemb_var13_ult1 0.00000 3.00000 0.00150
num_reemb_var17_hace3 0.00000 3.00000 0.00004
num_reemb_var17_ult1 0.00000 21.00000 0.00118
num_reemb_var33_hace3 0.00000 0.00000 0.00000
num_reemb_var33_ult1 0.00000 3.00000 0.00004
num_sal_var16_ult1 0.00000 15.00000 0.00493
num_var43_emit_ult1 0.00000 180.00000 0.39282
num_var43_recib_ult1 0.00000 264.00000 0.81500
num_trasp_var11_ult1 0.00000 93.00000 0.12068
num_trasp_var17_in_hace3 0.00000 6.00000 0.00012
num_trasp_var17_in_ult1 0.00000 3.00000 0.00016
num_trasp_var17_out_hace3 0.00000 0.00000 0.00000
num_trasp_var17_out_ult1 0.00000 3.00000 0.00016
num_trasp_var33_in_hace3 0.00000 3.00000 0.00024
num_trasp_var33_in_ult1 0.00000 6.00000 0.00024
num_trasp_var33_out_hace3 0.00000 0.00000 0.00000
num_trasp_var33_out_ult1 0.00000 3.00000 0.00004
num_venta_var44_hace3 0.00000 6.00000 0.00016
num_venta_var44_ult1 0.00000 39.00000 0.00442
num_var45_hace2 0.00000 342.00000 5.39321
num_var45_hace3 0.00000 339.00000 3.89440
num_var45_ult1 0.00000 510.00000 4.36350
num_var45_ult3 0.00000 801.00000 13.65110
saldo_var2_ult1 0.00000 0.00000 0.00000
saldo_medio_var5_hace2 -128.37000 812137.26000 1579.13531
saldo_medio_var5_hace3 -8.04000 1542339.36000 891.36586
saldo_medio_var5_ult1 -922.38000 601428.60000 1077.25676
saldo_medio_var5_ult3 -476.07000 544365.57000 1048.85645
saldo_medio_var8_hace2 -287.67000 231351.99000 68.27545
saldo_medio_var8_hace3 0.00000 77586.21000 9.50529
saldo_medio_var8_ult1 -3401.34000 228031.80000 124.62096
saldo_medio_var8_ult3 -1844.52000 177582.00000 110.02658
saldo_medio_var12_hace2 0.00000 3000538.14000 3997.02333
saldo_medio_var12_hace3 0.00000 668335.32000 613.53444
saldo_medio_var12_ult1 0.00000 3004185.60000 5703.00817
saldo_medio_var12_ult3 0.00000 2272859.43000 4401.00244
saldo_medio_var13_corto_hace2 0.00000 450000.00000 3639.41994
saldo_medio_var13_corto_hace3 0.00000 304838.70000 556.18418
saldo_medio_var13_corto_ult1 0.00000 450000.00000 4852.26181
saldo_medio_var13_corto_ult3 0.00000 450000.00000 3857.84854
saldo_medio_var13_largo_hace2 0.00000 840000.00000 771.22745
saldo_medio_var13_largo_hace3 0.00000 534000.00000 162.17044
saldo_medio_var13_largo_ult1 0.00000 1500000.00000 956.95021
saldo_medio_var13_largo_ult3 0.00000 1034482.74000 750.95627
saldo_medio_var13_medio_hace2 0.00000 7741.95000 0.17532
saldo_medio_var13_medio_hace3 0.00000 0.00000 0.00000
saldo_medio_var13_medio_ult1 0.00000 30000.00000 0.51302
saldo_medio_var13_medio_ult3 0.00000 18870.99000 0.34417
saldo_medio_var17_hace2 -0.03000 4210084.23000 91.17181
saldo_medio_var17_hace3 0.00000 2368558.95000 36.46318
saldo_medio_var17_ult1 0.00000 3998687.46000 131.03157
saldo_medio_var17_ult3 0.00000 3525776.88000 109.21694
saldo_medio_var29_hace2 0.00000 10430.01000 0.21307
saldo_medio_var29_hace3 0.00000 145.20000 0.00191
saldo_medio_var29_ult1 0.00000 13793.67000 0.25391
saldo_medio_var29_ult3 0.00000 7331.34000 0.18663
saldo_medio_var33_hace2 0.00000 50003.88000 7.93582
saldo_medio_var33_hace3 0.00000 20385.72000 1.36515
saldo_medio_var33_ult1 0.00000 138831.63000 12.21558
saldo_medio_var33_ult3 0.00000 91778.73000 8.78407
saldo_medio_var44_hace2 0.00000 438329.22000 31.50532
saldo_medio_var44_hace3 0.00000 24650.01000 1.85857
saldo_medio_var44_ult1 0.00000 681462.90000 76.02617
saldo_medio_var44_ult3 0.00000 397884.30000 56.61435
var38 5163.75000 22034738.76000 117235.80943
TARGET 0.00000 1.00000 0.03957
In [70]:
#remove outliers rows
train_df_clean1 = train_df.replace(9999999999, np.nan).replace(-999999, np.nan).dropna()
print train_df.shape[0], train_df_clean1.shape[0], train_df.shape[0]-train_df_clean1.shape[0]
76020 75369 651
In [72]:
#non numerical columns
train_df_clean1.select_dtypes(include=['object']).shape[1]
Out[72]:
0
In [73]:
#remove constant columns
cols_to_remove = []
for col in train_df_clean1.columns:
if train_df_clean1[col].std() == 0:
cols_to_remove.append(col)
train_df_clean2 = train_df_clean1.drop(cols_to_remove, axis=1)
cols_to_remove, len(cols_to_remove), train_df_clean2.shape[1], train_df_clean1.shape[1]
Out[73]:
(['ind_var2_0',
'ind_var2',
'ind_var18_0',
'ind_var18',
'ind_var27_0',
'ind_var28_0',
'ind_var28',
'ind_var27',
'ind_var34_0',
'ind_var34',
'ind_var41',
'ind_var46_0',
'ind_var46',
'num_var18_0',
'num_var18',
'num_var27_0',
'num_var28_0',
'num_var28',
'num_var27',
'num_var34_0',
'num_var34',
'num_var41',
'num_var46_0',
'num_var46',
'saldo_var18',
'saldo_var28',
'saldo_var27',
'saldo_var34',
'saldo_var41',
'saldo_var46',
'delta_imp_amort_var18_1y3',
'delta_imp_amort_var34_1y3',
'delta_imp_reemb_var13_1y3',
'delta_imp_reemb_var33_1y3',
'delta_imp_trasp_var17_out_1y3',
'delta_imp_trasp_var33_out_1y3',
'delta_num_reemb_var13_1y3',
'delta_num_reemb_var33_1y3',
'delta_num_trasp_var17_out_1y3',
'delta_num_trasp_var33_out_1y3',
'imp_amort_var18_hace3',
'imp_amort_var18_ult1',
'imp_amort_var34_hace3',
'imp_amort_var34_ult1',
'imp_reemb_var13_hace3',
'imp_reemb_var13_ult1',
'imp_reemb_var17_ult1',
'imp_reemb_var33_hace3',
'imp_reemb_var33_ult1',
'imp_trasp_var17_in_ult1',
'imp_trasp_var17_out_hace3',
'imp_trasp_var17_out_ult1',
'imp_trasp_var33_in_ult1',
'imp_trasp_var33_out_hace3',
'imp_trasp_var33_out_ult1',
'num_var2_0_ult1',
'num_var2_ult1',
'num_reemb_var13_hace3',
'num_reemb_var13_ult1',
'num_reemb_var17_ult1',
'num_reemb_var33_hace3',
'num_reemb_var33_ult1',
'num_trasp_var17_in_ult1',
'num_trasp_var17_out_hace3',
'num_trasp_var17_out_ult1',
'num_trasp_var33_in_ult1',
'num_trasp_var33_out_hace3',
'num_trasp_var33_out_ult1',
'saldo_var2_ult1',
'saldo_medio_var13_medio_hace3'],
70,
301,
371)
In [74]:
#remove duplicate columns
#train_df_clean2 = train_df_clean1.T.drop_duplicates().T #does not work, a lot of rows --> add sub-set recursive
cols_to_remove = []
columns = train_df_clean2.columns
for i in xrange(len(columns)-1):
v = train_df_clean2[columns[i]].values
for j in xrange(i+1,len(columns)):
if np.array_equal(v,train_df_clean2[columns[j]].values):
cols_to_remove.append(columns[j])
train_df_clean3 = train_df_clean2.drop(cols_to_remove, axis=1)
len(cols_to_remove), train_df_clean3.shape[1], train_df_clean2.shape[1]
Out[74]:
(28, 275, 301)
In [75]:
for col in train_df_clean3.columns:
dict = train_df_clean3[col].value_counts()
if dict.shape[0]<10:
print dict
#TODO: check if removing the rows with value frequency==1 improves the accuracy
0 74528
1 841
Name: ind_var1_0, dtype: int64
0 75092
1 277
Name: ind_var1, dtype: int64
1 72209
0 3160
Name: ind_var5_0, dtype: int64
1 50085
0 25284
Name: ind_var5, dtype: int64
0 75362
1 7
Name: ind_var6_0, dtype: int64
0 75367
1 2
Name: ind_var6, dtype: int64
0 72901
1 2468
Name: ind_var8_0, dtype: int64
0 73223
1 2146
Name: ind_var8, dtype: int64
0 70396
1 4973
Name: ind_var12_0, dtype: int64
0 71990
1 3379
Name: ind_var12, dtype: int64
0 71808
1 3561
Name: ind_var13_0, dtype: int64
0 72425
1 2944
Name: ind_var13_corto_0, dtype: int64
0 72507
1 2862
Name: ind_var13_corto, dtype: int64
0 74701
1 668
Name: ind_var13_largo_0, dtype: int64
0 74711
1 658
Name: ind_var13_largo, dtype: int64
0 75367
1 2
Name: ind_var13_medio_0, dtype: int64
0 71886
1 3483
Name: ind_var13, dtype: int64
0 73677
1 1692
Name: ind_var14_0, dtype: int64
0 75000
1 369
Name: ind_var14, dtype: int64
0 75285
1 84
Name: ind_var17_0, dtype: int64
0 75301
1 68
Name: ind_var17, dtype: int64
0 75067
1 302
Name: ind_var19, dtype: int64
0 75093
1 276
Name: ind_var20_0, dtype: int64
0 75164
1 205
Name: ind_var20, dtype: int64
0 72204
1 3165
Name: ind_var24_0, dtype: int64
0 72532
1 2837
Name: ind_var24, dtype: int64
0 73385
1 1984
Name: ind_var25_cte, dtype: int64
0 73520
1 1849
Name: ind_var26_0, dtype: int64
0 73299
1 2070
Name: ind_var26_cte, dtype: int64
0 73596
1 1773
Name: ind_var25_0, dtype: int64
1 75030
0 339
Name: ind_var30_0, dtype: int64
1 55163
0 20206
Name: ind_var30, dtype: int64
0 75180
1 189
Name: ind_var31_0, dtype: int64
0 75205
1 164
Name: ind_var31, dtype: int64
0 75277
1 92
Name: ind_var32_cte, dtype: int64
0 75287
1 82
Name: ind_var32_0, dtype: int64
0 75321
1 48
Name: ind_var33_0, dtype: int64
0 75328
1 41
Name: ind_var33, dtype: int64
0 69921
1 5448
Name: ind_var37_cte, dtype: int64
0 70451
1 4918
Name: ind_var37_0, dtype: int64
1 66364
0 9005
Name: ind_var39_0, dtype: int64
1 66254
0 9115
Name: ind_var41_0, dtype: int64
0 75306
1 63
Name: ind_var44_0, dtype: int64
0 75309
1 60
Name: ind_var44, dtype: int64
0 74528
3 840
6 1
Name: num_var1_0, dtype: int64
0 75092
3 277
Name: num_var1, dtype: int64
1 37974
0 19432
2 12438
3 4311
4 995
5 185
6 32
7 2
Name: num_var4, dtype: int64
3 71722
0 3160
6 478
9 7
15 2
Name: num_var5_0, dtype: int64
3 49895
0 25284
6 186
9 3
15 1
Name: num_var5, dtype: int64
0 75362
3 7
Name: num_var6_0, dtype: int64
0 75367
3 2
Name: num_var6, dtype: int64
0 72901
3 2467
6 1
Name: num_var8_0, dtype: int64
0 73223
3 2146
Name: num_var8, dtype: int64
0 70396
3 4797
6 171
9 3
111 1
15 1
Name: num_var12_0, dtype: int64
0 71990
3 3342
6 36
15 1
Name: num_var12, dtype: int64
0 71808
3 3373
6 161
9 18
12 6
15 2
18 1
Name: num_var13_0, dtype: int64
0 72425
3 2913
6 31
Name: num_var13_corto_0, dtype: int64
0 72507
3 2860
6 2
Name: num_var13_corto, dtype: int64
0 74701
3 555
6 92
9 15
12 3
15 2
18 1
Name: num_var13_largo_0, dtype: int64
0 74711
3 568
6 78
9 7
15 2
12 2
18 1
Name: num_var13_largo, dtype: int64
0 75367
3 2
Name: num_var13_medio_0, dtype: int64
0 71886
3 3356
6 112
9 9
12 3
15 2
18 1
Name: num_var13, dtype: int64
0 73677
3 1685
6 5
111 1
12 1
Name: num_var14_0, dtype: int64
0 75000
3 365
6 3
12 1
Name: num_var14, dtype: int64
0 75285
3 43
6 19
9 9
12 6
15 3
18 2
36 1
27 1
Name: num_var17_0, dtype: int64
0 75301
3 34
6 18
9 8
12 5
27 1
18 1
15 1
Name: num_var17, dtype: int64
0 75093
3 276
Name: num_var20_0, dtype: int64
0 75164
3 205
Name: num_var20, dtype: int64
0 72204
3 3153
6 11
9 1
Name: num_var24_0, dtype: int64
0 72532
3 2835
6 2
Name: num_var24, dtype: int64
0 73520
3 1550
6 240
9 42
12 12
15 2
33 1
27 1
21 1
Name: num_var26_0, dtype: int64
0 73596
3 1501
6 216
9 40
12 11
15 2
33 1
27 1
21 1
Name: num_var25_0, dtype: int64
0 75362
3 3
6 2
48 1
9 1
Name: num_op_var40_hace3, dtype: int64
3 51198
0 20206
6 3658
9 276
12 23
15 5
33 1
21 1
18 1
Name: num_var30, dtype: int64
0 75180
3 141
6 23
9 12
15 5
12 4
18 2
36 1
27 1
Name: num_var31_0, dtype: int64
0 75205
3 125
6 20
9 11
12 4
15 2
27 1
18 1
Name: num_var31, dtype: int64
0 75287
3 60
6 20
12 1
9 1
Name: num_var32_0, dtype: int64
0 75321
3 43
6 5
Name: num_var33_0, dtype: int64
0 75328
3 37
6 4
Name: num_var33, dtype: int64
3 64464
0 9005
6 1777
9 109
12 9
15 2
33 1
21 1
18 1
Name: num_var39_0, dtype: int64
3 64829
0 9115
6 1340
9 74
12 6
15 2
33 1
21 1
18 1
Name: num_var41_0, dtype: int64
3 69762
6 4965
0 431
9 199
12 8
15 2
114 1
18 1
Name: num_var42_0, dtype: int64
3 51698
0 21680
6 1958
9 29
12 2
18 1
15 1
Name: num_var42, dtype: int64
0 75306
3 62
6 1
Name: num_var44_0, dtype: int64
0 75309
3 60
Name: num_var44, dtype: int64
0.00000 75367
11976.60000 1
19531.80000 1
Name: saldo_var6, dtype: int64
0 75367
30000 1
9000 1
Name: saldo_var13_medio, dtype: int64
99 29995
3 22155
1 14115
2 8694
0 410
Name: var36, dtype: int64
0.00000 75350
-1.00000 17
1.00005 1
1.50000 1
Name: delta_imp_aport_var17_1y3, dtype: int64
0.00000 75346
-1.00000 17
-0.66667 1
-0.70000 1
-0.95000 1
-0.94048 1
-0.91600 1
-0.50000 1
Name: delta_imp_aport_var33_1y3, dtype: int64
0.00000 75347
-1.00000 16
0.02562 1
-0.47574 1
-0.63073 1
2.65055 1
-0.93403 1
0.33361 1
Name: delta_imp_compra_var44_1y3, dtype: int64
0.00000 75368
-1.00000 1
Name: delta_imp_reemb_var17_1y3, dtype: int64
0.00000 75367
-1.00000 2
Name: delta_imp_trasp_var17_in_1y3, dtype: int64
0.00000 75363
-1.00000 6
Name: delta_imp_trasp_var33_in_1y3, dtype: int64
0.00000 75366
-1.00000 1
5.41710 1
-0.50577 1
Name: delta_imp_venta_var44_1y3, dtype: int64
0.00000 73707
-1.00000 1657
1.00000 3
-0.33333 1
-0.50000 1
Name: delta_num_aport_var13_1y3, dtype: int64
0.00000 75351
-1.00000 17
1.00000 1
Name: delta_num_aport_var17_1y3, dtype: int64
0.00000 75351
-1.00000 17
-0.50000 1
Name: delta_num_aport_var33_1y3, dtype: int64
0.00000 75350
-1.00000 16
-0.66667 1
2.50000 1
0.50000 1
Name: delta_num_compra_var44_1y3, dtype: int64
0.00000 75366
-1.00000 1
8.00000 1
-0.50000 1
Name: delta_num_venta_var44_1y3, dtype: int64
0.00000 75366
7500.00000 1
33000.90000 1
3000.00000 1
Name: imp_aport_var17_ult1, dtype: int64
0 75363
450 2
750 1
1260 1
300 1
150 1
Name: imp_aport_var33_ult1, dtype: int64
0.00000 75366
145384.92000 1
43500.00000 1
18000.00000 1
Name: imp_var7_emit_ult1, dtype: int64
0.00000 75363
78019.20000 1
6743.25000 1
8972.10000 1
99.66000 1
21550.02000 1
28728.00000 1
Name: imp_compra_var44_ult1, dtype: int64
0.00000 75368
12027.15000 1
Name: imp_reemb_var17_hace3, dtype: int64
0.00000 75367
96781.44000 1
45741.48000 1
Name: imp_trasp_var17_in_hace3, dtype: int64
0.00000 75363
33079.83000 1
33744.48000 1
25501.80000 1
44251.08000 1
49581.27000 1
25920.03000 1
Name: imp_trasp_var33_in_hace3, dtype: int64
0.00000 75366
78040.59000 1
209834.40000 1
23.16000 1
Name: imp_venta_var44_hace3, dtype: int64
0.00000 75367
103705.77000 1
148.62000 1
Name: imp_venta_var44_ult1, dtype: int64
0 75366
1 3
Name: ind_var7_emit_ult1, dtype: int64
0 75180
1 189
Name: ind_var7_recib_ult1, dtype: int64
0 69273
1 6096
Name: ind_var10_ult1, dtype: int64
0 68430
1 6939
Name: ind_var10cte_ult1, dtype: int64
0 68086
1 7283
Name: ind_var9_cte_ult1, dtype: int64
0 68905
1 6464
Name: ind_var9_ult1, dtype: int64
0 70458
1 4911
Name: ind_var43_emit_ult1, dtype: int64
0 65858
1 9511
Name: ind_var43_recib_ult1, dtype: int64
0 73683
3 1545
6 97
9 25
12 14
24 2
18 2
15 1
Name: num_aport_var13_hace3, dtype: int64
0 75340
3 25
6 4
Name: num_aport_var13_ult1, dtype: int64
0 75349
3 11
6 6
12 3
Name: num_aport_var17_hace3, dtype: int64
0 75366
3 2
6 1
Name: num_aport_var17_ult1, dtype: int64
0 75346
3 21
12 1
6 1
Name: num_aport_var33_hace3, dtype: int64
0 75363
3 5
6 1
Name: num_aport_var33_ult1, dtype: int64
0 75366
3 3
Name: num_var7_emit_ult1, dtype: int64
0 75180
3 154
6 27
9 5
12 3
Name: num_var7_recib_ult1, dtype: int64
0 75347
3 15
6 5
9 2
Name: num_compra_var44_hace3, dtype: int64
0 75363
3 4
21 1
9 1
Name: num_compra_var44_ult1, dtype: int64
3 42546
0 20463
2 9147
1 3213
Name: num_meses_var5_ult3, dtype: int64
0 73182
2 900
1 818
3 469
Name: num_meses_var8_ult3, dtype: int64
0 71967
2 1686
3 1273
1 443
Name: num_meses_var12_ult3, dtype: int64
0 72429
2 1557
3 1314
1 69
Name: num_meses_var13_corto_ult3, dtype: int64
0 74887
3 294
2 176
1 12
Name: num_meses_var13_largo_ult3, dtype: int64
0 75367
2 2
Name: num_meses_var13_medio_ult3, dtype: int64
0 75296
2 51
3 15
1 7
Name: num_meses_var17_ult3, dtype: int64
0 75365
2 2
1 2
Name: num_meses_var29_ult3, dtype: int64
0 75328
3 24
2 16
1 1
Name: num_meses_var33_ult3, dtype: int64
2 54120
1 10837
0 10067
3 345
Name: num_meses_var39_vig_ult3, dtype: int64
0 75306
2 40
3 21
1 2
Name: num_meses_var44_ult3, dtype: int64
0 75331
3 26
6 8
24 1
18 1
12 1
9 1
Name: num_op_var40_efect_ult1, dtype: int64
0 75324
3 25
6 12
24 2
9 2
21 1
18 1
15 1
12 1
Name: num_op_var40_efect_ult3, dtype: int64
0 75368
3 1
Name: num_reemb_var17_hace3, dtype: int64
0 75275
3 71
6 20
15 1
12 1
9 1
Name: num_sal_var16_ult1, dtype: int64
0 75367
6 1
3 1
Name: num_trasp_var17_in_hace3, dtype: int64
0 75363
3 6
Name: num_trasp_var33_in_hace3, dtype: int64
0 75366
3 2
6 1
Name: num_venta_var44_hace3, dtype: int64
0 75367
27 1
3 1
Name: num_venta_var44_ult1, dtype: int64
0.00000 75367
5586.21000 1
7741.95000 1
Name: saldo_medio_var13_medio_hace2, dtype: int64
0.00000 75367
7293.12000 1
18870.99000 1
Name: saldo_medio_var13_medio_ult3, dtype: int64
0.00000 75366
10430.01000 1
868.98000 1
59.97000 1
Name: saldo_medio_var29_hace2, dtype: int64
0.00000 75368
145.20000 1
Name: saldo_medio_var29_hace3, dtype: int64
0.00000 75367
13793.67000 1
3365.46000 1
Name: saldo_medio_var29_ult1, dtype: int64
0.00000 75367
7331.34000 1
3365.46000 1
Name: saldo_medio_var29_ult3, dtype: int64
0 72373
1 2996
Name: TARGET, dtype: int64
In [76]:
train_labels = train_df_clean3['TARGET']
train_features = train_df_clean3.drop(['ID','TARGET'], axis=1)
train_features.shape, train_labels.shape
Out[76]:
((75369, 273), (75369,))
In [77]:
#single features prediciton power
model = ensemble.GradientBoostingClassifier(n_estimators=10,
max_features=1,
max_depth=3,
min_samples_leaf=100,
learning_rate=0.3,
subsample=0.65,
loss='deviance',
verbose=0,
random_state=1)
X_train, X_valid, y_train, y_valid = cross_validation.train_test_split(train_features,
train_labels,
test_size=0.5,
random_state=1)
time_start = time.time()
single_var_AUC_list = []
single_var_AUC_dict = {}
for i, feature in enumerate(X_train.columns):
train = X_train[feature].values.reshape(-1,1)
valid = X_valid[feature].values.reshape(-1,1)
model.fit(train, y_train)
train_AUC = auc(y_train, model.predict_proba(train)[:,1])
valid_AUC = auc(y_valid, model.predict_proba(valid)[:,1])
single_var_AUC_list.append(valid_AUC)
single_var_AUC_dict[feature] = valid_AUC
print "feature %d/%d: AUC=%f" %(i, len(X_train.columns),valid_AUC)
valid_AUC = np.array(single_var_AUC_list)
tim_spent = (time.time()-time_start)/60
print("(min,mean,max) AUC = (%.3f,%.3f,%.3f). took %.2f minutes" %(valid_AUC.min(),valid_AUC.mean(),valid_AUC.max(), tim_spent))
# show the scatter plot of the individual feature performance
plt.figure()
plt.hist(valid_AUC, 50, normed=1, facecolor='blue', alpha=0.75)
plt.xlabel('AUC');
plt.ylabel('frequency');
plt.title('single feature AUC histogram');
plt.show()
sorted_vars_df = pd.DataFrame(index=range(len(single_var_AUC_dict.keys())), columns=['feature','AUC'])
for k,key in enumerate(single_var_AUC_dict):
sorted_vars_df.ix[k,'feature'] = key
sorted_vars_df.ix[k,'AUC'] = single_var_AUC_dict[key]
sorted_vars_df = sorted_vars_df.sort_values(by='AUC', axis=0, ascending=False).reset_index(drop=True)
sorted_vars_df.head(10)
feature 0/273: AUC=0.501831
feature 1/273: AUC=0.715859
feature 2/273: AUC=0.497597
feature 3/273: AUC=0.513979
feature 4/273: AUC=0.507052
feature 5/273: AUC=0.500526
feature 6/273: AUC=0.501089
feature 7/273: AUC=0.500000
feature 8/273: AUC=0.500000
feature 9/273: AUC=0.500000
feature 10/273: AUC=0.510227
feature 11/273: AUC=0.504226
feature 12/273: AUC=0.513789
feature 13/273: AUC=0.513591
feature 14/273: AUC=0.529833
feature 15/273: AUC=0.510620
feature 16/273: AUC=0.513173
feature 17/273: AUC=0.529167
feature 18/273: AUC=0.500000
feature 19/273: AUC=0.500562
feature 20/273: AUC=0.500000
feature 21/273: AUC=0.518035
feature 22/273: AUC=0.670083
feature 23/273: AUC=0.500000
feature 24/273: AUC=0.500000
feature 25/273: AUC=0.522827
feature 26/273: AUC=0.514460
feature 27/273: AUC=0.525320
feature 28/273: AUC=0.518286
feature 29/273: AUC=0.520845
feature 30/273: AUC=0.516542
feature 31/273: AUC=0.516327
feature 32/273: AUC=0.504690
feature 33/273: AUC=0.504607
feature 34/273: AUC=0.500000
feature 35/273: AUC=0.520644
feature 36/273: AUC=0.508648
feature 37/273: AUC=0.501640
feature 38/273: AUC=0.500000
feature 39/273: AUC=0.500000
feature 40/273: AUC=0.502072
feature 41/273: AUC=0.501795
feature 42/273: AUC=0.500000
feature 43/273: AUC=0.515672
feature 44/273: AUC=0.515472
feature 45/273: AUC=0.512714
feature 46/273: AUC=0.510368
feature 47/273: AUC=0.513077
feature 48/273: AUC=0.510273
feature 49/273: AUC=0.502107
feature 50/273: AUC=0.673763
feature 51/273: AUC=0.500000
feature 52/273: AUC=0.500000
feature 53/273: AUC=0.500000
feature 54/273: AUC=0.500000
feature 55/273: AUC=0.500000
feature 56/273: AUC=0.500000
feature 57/273: AUC=0.499124
feature 58/273: AUC=0.499120
feature 59/273: AUC=0.532554
feature 60/273: AUC=0.532063
feature 61/273: AUC=0.500000
feature 62/273: AUC=0.500000
feature 63/273: AUC=0.500562
feature 64/273: AUC=0.500000
feature 65/273: AUC=0.696806
feature 66/273: AUC=0.516297
feature 67/273: AUC=0.670083
feature 68/273: AUC=0.500000
feature 69/273: AUC=0.500000
feature 70/273: AUC=0.522827
feature 71/273: AUC=0.514460
feature 72/273: AUC=0.525320
feature 73/273: AUC=0.518286
feature 74/273: AUC=0.520845
feature 75/273: AUC=0.516542
feature 76/273: AUC=0.516327
feature 77/273: AUC=0.504690
feature 78/273: AUC=0.504607
feature 79/273: AUC=0.500000
feature 80/273: AUC=0.520644
feature 81/273: AUC=0.508648
feature 82/273: AUC=0.501640
feature 83/273: AUC=0.500000
feature 84/273: AUC=0.500000
feature 85/273: AUC=0.501795
feature 86/273: AUC=0.500000
feature 87/273: AUC=0.515672
feature 88/273: AUC=0.515472
feature 89/273: AUC=0.510369
feature 90/273: AUC=0.510274
feature 91/273: AUC=0.500000
feature 92/273: AUC=0.500000
feature 93/273: AUC=0.500000
feature 94/273: AUC=0.500000
feature 95/273: AUC=0.511658
feature 96/273: AUC=0.501902
feature 97/273: AUC=0.513153
feature 98/273: AUC=0.509917
feature 99/273: AUC=0.511676
feature 100/273: AUC=0.501874
feature 101/273: AUC=0.514139
feature 102/273: AUC=0.511153
feature 103/273: AUC=0.534945
feature 104/273: AUC=0.679165
feature 105/273: AUC=0.500000
feature 106/273: AUC=0.500000
feature 107/273: AUC=0.500000
feature 108/273: AUC=0.500000
feature 109/273: AUC=0.500000
feature 110/273: AUC=0.697177
feature 111/273: AUC=0.502484
feature 112/273: AUC=0.499041
feature 113/273: AUC=0.529272
feature 114/273: AUC=0.529983
feature 115/273: AUC=0.524541
feature 116/273: AUC=0.670048
feature 117/273: AUC=0.500000
feature 118/273: AUC=0.500000
feature 119/273: AUC=0.500000
feature 120/273: AUC=0.697327
feature 121/273: AUC=0.500000
feature 122/273: AUC=0.515448
feature 123/273: AUC=0.518397
feature 124/273: AUC=0.516320
feature 125/273: AUC=0.504609
feature 126/273: AUC=0.500000
feature 127/273: AUC=0.520607
feature 128/273: AUC=0.501642
feature 129/273: AUC=0.500000
feature 130/273: AUC=0.500000
feature 131/273: AUC=0.515431
feature 132/273: AUC=0.504406
feature 133/273: AUC=0.504198
feature 134/273: AUC=0.719565
feature 135/273: AUC=0.500000
feature 136/273: AUC=0.500000
feature 137/273: AUC=0.500000
feature 138/273: AUC=0.494535
feature 139/273: AUC=0.705306
feature 140/273: AUC=0.500000
feature 141/273: AUC=0.646810
feature 142/273: AUC=0.511181
feature 143/273: AUC=0.500000
feature 144/273: AUC=0.500000
feature 145/273: AUC=0.500000
feature 146/273: AUC=0.500000
feature 147/273: AUC=0.500000
feature 148/273: AUC=0.500000
feature 149/273: AUC=0.500000
feature 150/273: AUC=0.511084
feature 151/273: AUC=0.500000
feature 152/273: AUC=0.500000
feature 153/273: AUC=0.500000
feature 154/273: AUC=0.500000
feature 155/273: AUC=0.511327
feature 156/273: AUC=0.500000
feature 157/273: AUC=0.500000
feature 158/273: AUC=0.500000
feature 159/273: AUC=0.500000
feature 160/273: AUC=0.500000
feature 161/273: AUC=0.500000
feature 162/273: AUC=0.500000
feature 163/273: AUC=0.500000
feature 164/273: AUC=0.500000
feature 165/273: AUC=0.500000
feature 166/273: AUC=0.503177
feature 167/273: AUC=0.510052
feature 168/273: AUC=0.500000
feature 169/273: AUC=0.500000
feature 170/273: AUC=0.500000
feature 171/273: AUC=0.500000
feature 172/273: AUC=0.500000
feature 173/273: AUC=0.500000
feature 174/273: AUC=0.507704
feature 175/273: AUC=0.507564
feature 176/273: AUC=0.506607
feature 177/273: AUC=0.506513
feature 178/273: AUC=0.506698
feature 179/273: AUC=0.520004
feature 180/273: AUC=0.503319
feature 181/273: AUC=0.511333
feature 182/273: AUC=0.500000
feature 183/273: AUC=0.500000
feature 184/273: AUC=0.500000
feature 185/273: AUC=0.500000
feature 186/273: AUC=0.500000
feature 187/273: AUC=0.500000
feature 188/273: AUC=0.500000
feature 189/273: AUC=0.500000
feature 190/273: AUC=0.500000
feature 191/273: AUC=0.500357
feature 192/273: AUC=0.505611
feature 193/273: AUC=0.508789
feature 194/273: AUC=0.506811
feature 195/273: AUC=0.517106
feature 196/273: AUC=0.514046
feature 197/273: AUC=0.538084
feature 198/273: AUC=0.693430
feature 199/273: AUC=0.514844
feature 200/273: AUC=0.517777
feature 201/273: AUC=0.516575
feature 202/273: AUC=0.503618
feature 203/273: AUC=0.500000
feature 204/273: AUC=0.500000
feature 205/273: AUC=0.500000
feature 206/273: AUC=0.500000
feature 207/273: AUC=0.555588
feature 208/273: AUC=0.500000
feature 209/273: AUC=0.509288
feature 210/273: AUC=0.505691
feature 211/273: AUC=0.500525
feature 212/273: AUC=0.500236
feature 213/273: AUC=0.500000
feature 214/273: AUC=0.500000
feature 215/273: AUC=0.512061
feature 216/273: AUC=0.508815
feature 217/273: AUC=0.506851
feature 218/273: AUC=0.503388
feature 219/273: AUC=0.507524
feature 220/273: AUC=0.503287
feature 221/273: AUC=0.500000
feature 222/273: AUC=0.500000
feature 223/273: AUC=0.501326
feature 224/273: AUC=0.520019
feature 225/273: AUC=0.504493
feature 226/273: AUC=0.500000
feature 227/273: AUC=0.500000
feature 228/273: AUC=0.500000
feature 229/273: AUC=0.500000
feature 230/273: AUC=0.550952
feature 231/273: AUC=0.510978
feature 232/273: AUC=0.544896
feature 233/273: AUC=0.531257
feature 234/273: AUC=0.700980
feature 235/273: AUC=0.672831
feature 236/273: AUC=0.697150
feature 237/273: AUC=0.694594
feature 238/273: AUC=0.510465
feature 239/273: AUC=0.502153
feature 240/273: AUC=0.515063
feature 241/273: AUC=0.516814
feature 242/273: AUC=0.515697
feature 243/273: AUC=0.506930
feature 244/273: AUC=0.517425
feature 245/273: AUC=0.517581
feature 246/273: AUC=0.516332
feature 247/273: AUC=0.508875
feature 248/273: AUC=0.516334
feature 249/273: AUC=0.516330
feature 250/273: AUC=0.503563
feature 251/273: AUC=0.500000
feature 252/273: AUC=0.503591
feature 253/273: AUC=0.503591
feature 254/273: AUC=0.500000
feature 255/273: AUC=0.500000
feature 256/273: AUC=0.500000
feature 257/273: AUC=0.500000
feature 258/273: AUC=0.500000
feature 259/273: AUC=0.500000
feature 260/273: AUC=0.500000
feature 261/273: AUC=0.500000
feature 262/273: AUC=0.500000
feature 263/273: AUC=0.500000
feature 264/273: AUC=0.500000
feature 265/273: AUC=0.500000
feature 266/273: AUC=0.500000
feature 267/273: AUC=0.500000
feature 268/273: AUC=0.500000
feature 269/273: AUC=0.500000
feature 270/273: AUC=0.500000
feature 271/273: AUC=0.500000
feature 272/273: AUC=0.591160
(min,mean,max) AUC = (0.495,0.518,0.720). took 0.53 minutes
Out[77]:
feature
AUC
0
saldo_var30
0.71957
1
var15
0.71586
2
saldo_var42
0.70531
3
saldo_medio_var5_hace2
0.70098
4
saldo_var5
0.69733
5
num_var35
0.69718
6
saldo_medio_var5_ult1
0.69715
7
num_var4
0.69681
8
saldo_medio_var5_ult3
0.69459
9
num_meses_var5_ult3
0.69343
In [79]:
#features binarization
train_features_bin = Binarizer().fit_transform(scale(train_features))
In [82]:
#features selection chi2
selectChi2 = SelectPercentile(chi2, percentile=75).fit(train_features_bin, train_labels)
chi2_selected = selectChi2.get_support()
chi2_selected_features = [ f for i,f in enumerate(train_features.columns) if chi2_selected[i]]
print len(chi2_selected_features)
204
In [83]:
#features selection F_classif
selectF_classif = SelectPercentile(f_classif, percentile=75).fit(train_features_bin, train_labels)
f_classif_selected = selectF_classif.get_support()
f_classif_selected_features = [ f for i,f in enumerate(train_features.columns) if f_classif_selected[i]]
print len(f_classif_selected_features)
204
In [85]:
#intesection of selected features
selected = chi2_selected & f_classif_selected
selected_features = [ f for f,s in zip(train_features.columns, selected) if s]
print len(selected_features)
199
In [86]:
X_train, X_valid, y_train, y_valid= cross_validation.train_test_split(train_features[selected_features],
train_labels,
random_state=1301,
stratify=train_labels,
test_size=0.3)
In [ ]:
Content source: aymen82/kaggler-competitions-scripts
Similar notebooks: