In [1]:

    
from IPython.display import Image
Image(filename='images/phd053104s.png')









    Out[1]:

Paseando por pandas



In [2]:

    
from IPython.display import IFrame
IFrame('http://pandas.pydata.org/', width=900, height=350)









    Out[2]:

Importamos las bibliotecas a usar ...



In [7]:

    
import pandas as pd
import numpy as np



In [8]:

    
more data/train.csv



In [9]:

    
df = pd.read_csv('data/train.csv')
# Reemplazamos nuestros valores de 'y' con valores enteros.
types = np.sort(np.unique(df['target']))
new_values = dict(zip(types, range(types.shape[0])))
df['target'] = df['target'].map(new_values).astype(np.int32)

**También mirar: read_excel, read_clipboard, read_fwf, read_html, read_json, read_sql**

Ahora demos un vistazo de cómo esta compuesto nuestro dataset ...



In [10]:

    
df.head(5)









    Out[10]:






  
    
      
      id
      feat_1
      feat_2
      feat_3
      feat_4
      feat_5
      feat_6
      feat_7
      feat_8
      feat_9
      ...
      feat_85
      feat_86
      feat_87
      feat_88
      feat_89
      feat_90
      feat_91
      feat_92
      feat_93
      target
    
  
  
    
      0
      1
      1
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1
      2
      0
      0
      0
      0
      0
      0
      0
      1
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      2
      3
      0
      0
      0
      0
      0
      0
      0
      1
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3
      4
      1
      0
      0
      1
      6
      1
      5
      0
      0
      ...
      0
      1
      2
      0
      0
      0
      0
      0
      0
      0
    
    
      4
      5
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
    
  

5 rows × 95 columns

Para tener una idea básica del comportamiento de nuestros datos podemos pedir información estadística básica a pandas con la sig. función



In [11]:

    
df.describe()









    Out[11]:






  
    
      
      id
      feat_1
      feat_2
      feat_3
      feat_4
      feat_5
      feat_6
      feat_7
      feat_8
      feat_9
      ...
      feat_85
      feat_86
      feat_87
      feat_88
      feat_89
      feat_90
      feat_91
      feat_92
      feat_93
      target
    
  
  
    
      count
      61878.000000
      61878.00000
      61878.000000
      61878.000000
      61878.000000
      61878.000000
      61878.000000
      61878.000000
      61878.000000
      61878.000000
      ...
      61878.000000
      61878.000000
      61878.000000
      61878.000000
      61878.000000
      61878.000000
      61878.000000
      61878.000000
      61878.000000
      61878.000000
    
    
      mean
      30939.500000
      0.38668
      0.263066
      0.901467
      0.779081
      0.071043
      0.025696
      0.193704
      0.662433
      1.011296
      ...
      0.532306
      1.128576
      0.393549
      0.874915
      0.457772
      0.812421
      0.264941
      0.380119
      0.126135
      3.842335
    
    
      std
      17862.784315
      1.52533
      1.252073
      2.934818
      2.788005
      0.438902
      0.215333
      1.030102
      2.255770
      3.474822
      ...
      1.900438
      2.681554
      1.575455
      2.115466
      1.527385
      4.597804
      2.045646
      0.982385
      1.201720
      2.510794
    
    
      min
      1.000000
      0.00000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      ...
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      15470.250000
      0.00000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      ...
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      1.000000
    
    
      50%
      30939.500000
      0.00000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      ...
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      4.000000
    
    
      75%
      46408.750000
      0.00000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      1.000000
      0.000000
      ...
      0.000000
      1.000000
      0.000000
      1.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      6.000000
    
    
      max
      61878.000000
      61.00000
      51.000000
      64.000000
      70.000000
      19.000000
      10.000000
      38.000000
      76.000000
      43.000000
      ...
      55.000000
      65.000000
      67.000000
      30.000000
      61.000000
      130.000000
      52.000000
      19.000000
      87.000000
      8.000000
    
  

8 rows × 95 columns



In [12]:

    
df.corr()









    Out[12]:






  
    
      
      id
      feat_1
      feat_2
      feat_3
      feat_4
      feat_5
      feat_6
      feat_7
      feat_8
      feat_9
      ...
      feat_85
      feat_86
      feat_87
      feat_88
      feat_89
      feat_90
      feat_91
      feat_92
      feat_93
      target
    
  
  
    
      id
      1.000000
      0.070691
      0.185463
      0.138980
      1.460200e-01
      0.030707
      0.033410
      0.136501
      0.197418
      -0.156082
      ...
      -0.084507
      -0.096484
      0.097087
      -0.215878
      0.111519
      0.188895
      0.139078
      0.131737
      0.047944
      0.977144
    
    
      feat_1
      0.070691
      1.000000
      0.031332
      -0.027807
      -2.752941e-02
      0.042973
      0.043603
      0.298952
      0.056321
      -0.032285
      ...
      -0.008739
      0.107947
      0.089374
      0.020830
      0.096851
      0.010310
      0.037264
      0.054777
      0.081783
      0.072984
    
    
      feat_2
      0.185463
      0.031332
      1.000000
      0.082573
      1.349870e-01
      0.020926
      0.041343
      0.222386
      0.019815
      -0.025630
      ...
      -0.006764
      -0.039090
      0.047451
      -0.047035
      0.105527
      0.515022
      0.026383
      -0.008219
      0.054593
      0.191739
    
    
      feat_3
      0.138980
      -0.027807
      0.082573
      1.000000
      5.835232e-01
      0.010880
      0.004288
      0.001294
      -0.053462
      -0.063551
      ...
      -0.048626
      -0.096093
      -0.009838
      -0.082336
      0.174781
      -0.015068
      -0.012417
      0.066921
      0.006814
      0.148729
    
    
      feat_4
      0.146020
      -0.027529
      0.134987
      0.583523
      1.000000e+00
      0.017290
      0.014059
      0.014490
      -0.046184
      -0.046250
      ...
      -0.033153
      -0.071029
      0.005055
      -0.067484
      0.183715
      0.009454
      -0.010312
      0.087631
      0.015746
      0.153458
    
    
      feat_5
      0.030707
      0.042973
      0.020926
      0.010880
      1.729026e-02
      1.000000
      0.145355
      0.075047
      0.035861
      -0.024708
      ...
      0.034062
      0.013879
      0.013999
      -0.019201
      0.119951
      0.004842
      0.012012
      0.065331
      0.002038
      0.036034
    
    
      feat_6
      0.033410
      0.043603
      0.041343
      0.004288
      1.405895e-02
      0.145355
      1.000000
      0.088014
      0.012867
      -0.009373
      ...
      0.004290
      0.010455
      0.015256
      -0.015437
      0.035042
      0.054034
      0.012465
      0.015479
      0.008521
      0.037116
    
    
      feat_7
      0.136501
      0.298952
      0.222386
      0.001294
      1.448981e-02
      0.075047
      0.088014
      1.000000
      0.038121
      -0.027146
      ...
      0.037874
      -0.009169
      0.089574
      -0.033646
      0.063511
      0.129578
      0.068506
      -0.032261
      0.034912
      0.142172
    
    
      feat_8
      0.197418
      0.056321
      0.019815
      -0.053462
      -4.618407e-02
      0.035861
      0.012867
      0.038121
      1.000000
      -0.039281
      ...
      -0.003416
      -0.029395
      0.059929
      -0.050931
      0.007974
      0.026807
      0.095990
      0.013608
      0.005131
      0.209725
    
    
      feat_9
      -0.156082
      -0.032285
      -0.025630
      -0.063551
      -4.624977e-02
      -0.024708
      -0.009373
      -0.027146
      -0.039281
      1.000000
      ...
      -0.031462
      -0.019144
      -0.016925
      0.001160
      -0.019147
      -0.020698
      -0.014742
      -0.069707
      -0.006038
      -0.175350
    
    
      feat_10
      0.096127
      0.097776
      0.051925
      0.036944
      5.951396e-02
      0.091324
      0.041940
      0.194258
      -0.000023
      -0.024323
      ...
      0.086758
      0.159447
      0.077421
      0.054635
      0.061498
      0.049908
      0.024025
      -0.006869
      0.041316
      0.095383
    
    
      feat_11
      0.179164
      -0.042928
      0.118534
      0.596243
      3.894092e-01
      0.004882
      0.014504
      0.012418
      -0.065923
      -0.075820
      ...
      -0.074293
      -0.123339
      -0.032969
      -0.114491
      0.137374
      0.045074
      -0.029511
      0.013179
      0.003326
      0.189551
    
    
      feat_12
      0.079170
      0.056934
      0.090153
      0.050037
      5.743356e-02
      0.036668
      0.028588
      0.056230
      0.091424
      -0.021885
      ...
      0.019283
      -0.007214
      0.016089
      -0.024324
      0.082220
      0.062721
      0.063965
      0.063922
      0.012722
      0.082567
    
    
      feat_13
      0.184629
      0.139254
      0.157467
      0.013870
      2.897317e-02
      0.059081
      0.036293
      0.199142
      0.095365
      -0.040164
      ...
      0.002594
      0.004850
      0.093870
      -0.036259
      0.062990
      0.107722
      0.044338
      0.071953
      0.038989
      0.194200
    
    
      feat_14
      -0.346308
      0.063517
      -0.070057
      -0.111105
      -9.921490e-02
      -0.037607
      -0.027350
      -0.044671
      -0.061799
      -0.110188
      ...
      -0.021455
      0.145787
      -0.020229
      0.323089
      -0.038881
      -0.060240
      -0.038444
      -0.040133
      -0.018127
      -0.365092
    
    
      feat_15
      -0.245562
      -0.045738
      -0.048798
      -0.065285
      -5.122155e-02
      -0.007000
      -0.018328
      -0.035721
      -0.056960
      0.009858
      ...
      0.246847
      -0.002529
      -0.023191
      0.010840
      0.029547
      -0.046616
      -0.034402
      -0.018206
      -0.020369
      -0.259047
    
    
      feat_16
      -0.004622
      0.027086
      0.108046
      0.221426
      2.110780e-01
      0.062877
      0.021934
      0.043957
      -0.004659
      -0.082664
      ...
      0.110850
      0.003610
      0.077770
      -0.007257
      0.248364
      0.016863
      0.048494
      0.210499
      0.031467
      -0.004812
    
    
      feat_17
      0.122884
      0.053004
      0.074902
      -0.023093
      -7.553867e-03
      0.062197
      0.015488
      0.127245
      0.173912
      -0.028709
      ...
      0.015559
      0.049102
      0.214221
      -0.034139
      0.035390
      0.045218
      0.088508
      -0.006538
      0.056695
      0.132741
    
    
      feat_18
      0.189280
      0.084856
      0.242716
      0.115655
      2.148952e-01
      0.052186
      0.048710
      0.098972
      0.087777
      -0.043642
      ...
      -0.001555
      -0.029295
      0.126886
      -0.035981
      0.247462
      0.094336
      0.037275
      0.126640
      0.058100
      0.200805
    
    
      feat_19
      0.127893
      0.002302
      0.176655
      -0.012228
      -3.519107e-07
      -0.008556
      0.038493
      0.058071
      0.019387
      -0.000167
      ...
      -0.008292
      -0.014560
      0.000412
      -0.018485
      0.011116
      0.450925
      0.004085
      -0.027662
      0.014243
      0.131549
    
    
      feat_20
      0.213630
      0.070511
      0.449160
      -0.011069
      4.465657e-02
      0.046200
      0.057813
      0.364972
      0.062595
      -0.023397
      ...
      0.084570
      0.016850
      0.220475
      0.004081
      0.111231
      0.370282
      0.079181
      -0.018715
      0.110054
      0.219909
    
    
      feat_21
      0.012886
      -0.027026
      0.014113
      0.354925
      2.329227e-01
      0.003288
      0.008046
      -0.022908
      -0.041095
      -0.028409
      ...
      -0.006180
      -0.045562
      -0.016862
      -0.030401
      0.105392
      -0.033193
      -0.019779
      0.058008
      -0.007677
      0.013486
    
    
      feat_22
      0.190144
      0.063283
      0.215106
      0.251082
      2.477378e-01
      0.075161
      0.038939
      0.162620
      0.029032
      -0.062348
      ...
      0.044396
      -0.018347
      0.219974
      -0.045439
      0.244779
      0.098595
      0.104921
      0.200593
      0.113276
      0.200545
    
    
      feat_23
      0.102980
      0.048686
      0.162065
      -0.002427
      3.062225e-02
      0.017281
      0.043651
      0.186462
      0.012774
      0.006940
      ...
      0.056994
      0.121170
      0.111837
      -0.014039
      0.059743
      0.141869
      0.010438
      -0.031837
      0.084945
      0.104259
    
    
      feat_24
      0.215890
      0.067255
      0.253684
      -0.031596
      3.727726e-03
      0.075222
      0.082124
      0.244813
      0.161848
      0.073618
      ...
      -0.018990
      0.015444
      0.123298
      -0.043479
      0.023581
      0.357270
      0.090833
      -0.024375
      0.089200
      0.226936
    
    
      feat_25
      -0.276763
      0.187237
      -0.096366
      -0.157459
      -1.342306e-01
      -0.003610
      -0.023319
      -0.048820
      -0.036939
      -0.025279
      ...
      0.021119
      0.263924
      -0.011294
      0.207974
      -0.012866
      -0.088187
      -0.045759
      0.030135
      -0.015708
      -0.294079
    
    
      feat_26
      0.133917
      -0.022813
      0.064856
      0.268112
      3.657567e-01
      0.025116
      0.004680
      -0.008782
      -0.041599
      -0.066414
      ...
      -0.048889
      -0.072464
      0.015937
      -0.078470
      0.094521
      -0.021565
      -0.018447
      0.199974
      0.016709
      0.141617
    
    
      feat_27
      0.097531
      -0.038826
      0.037841
      0.508370
      3.086287e-01
      0.002098
      0.001943
      -0.015429
      -0.050272
      -0.042531
      ...
      -0.046053
      -0.082510
      -0.028097
      -0.070194
      0.099536
      -0.025263
      -0.018778
      0.023790
      0.000318
      0.103288
    
    
      feat_28
      0.128876
      -0.030257
      0.072494
      0.551398
      4.864171e-01
      0.047688
      0.017132
      0.000998
      -0.036668
      -0.055545
      ...
      -0.039784
      -0.080806
      -0.002941
      -0.074442
      0.169794
      -0.021330
      -0.015242
      0.122653
      0.005275
      0.136553
    
    
      feat_29
      0.117175
      0.069266
      0.025689
      -0.004141
      1.427066e-02
      0.065957
      0.002389
      0.046231
      0.104985
      -0.021328
      ...
      0.013104
      -0.011960
      0.038800
      -0.032585
      0.055398
      -0.000185
      0.040526
      0.084445
      0.008301
      0.122533
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      feat_65
      0.064755
      0.110041
      0.078801
      0.065492
      6.228472e-02
      0.228349
      0.066867
      0.202346
      0.025544
      -0.038163
      ...
      0.066922
      0.030362
      0.050138
      -0.011600
      0.125884
      0.029076
      0.001188
      0.044286
      0.015500
      0.066557
    
    
      feat_66
      0.174734
      0.053010
      0.175620
      0.088017
      1.296545e-01
      0.048364
      0.033285
      0.122660
      0.115175
      -0.001778
      ...
      0.031291
      0.088686
      0.206406
      -0.000679
      0.205289
      0.094925
      0.098063
      0.123694
      0.067957
      0.183488
    
    
      feat_67
      0.186622
      0.154301
      0.068667
      -0.110081
      -8.045694e-02
      0.061964
      0.038289
      0.148598
      0.320949
      0.176921
      ...
      -0.020048
      0.081445
      0.295803
      -0.058706
      0.005220
      0.089262
      0.112052
      -0.011247
      0.129018
      0.199869
    
    
      feat_68
      0.182707
      0.014674
      -0.012802
      -0.030992
      -2.009191e-02
      0.107405
      0.021619
      0.040309
      0.075384
      -0.012192
      ...
      -0.024620
      -0.038904
      0.001672
      -0.059843
      0.125150
      -0.023839
      0.022515
      0.095970
      -0.004602
      0.190042
    
    
      feat_69
      0.210949
      0.007544
      0.307406
      -0.032748
      -1.446082e-02
      -0.003294
      0.074836
      0.131430
      0.046258
      -0.029335
      ...
      -0.006954
      -0.025538
      0.027690
      -0.022918
      0.011806
      0.549489
      0.041206
      -0.037961
      0.032052
      0.217790
    
    
      feat_70
      0.043618
      0.165442
      0.112968
      -0.018774
      2.079779e-02
      0.118510
      0.052401
      0.237907
      0.023089
      -0.056205
      ...
      0.361941
      0.225792
      0.212133
      0.140850
      0.163631
      0.074178
      0.030560
      0.007310
      0.093488
      0.033185
    
    
      feat_71
      0.129365
      0.013712
      -0.002336
      -0.053020
      -4.241268e-02
      0.056428
      0.011901
      0.115813
      0.081664
      0.043286
      ...
      0.013894
      -0.015410
      0.060004
      -0.048676
      0.076348
      -0.019694
      0.050622
      0.000368
      0.002001
      0.135428
    
    
      feat_72
      -0.199876
      -0.029983
      -0.023267
      -0.045339
      -2.979578e-02
      0.005177
      -0.011090
      -0.014921
      -0.029868
      -0.058147
      ...
      0.294384
      0.008897
      0.013536
      0.004066
      0.057040
      -0.030673
      -0.008936
      0.005300
      -0.008233
      -0.208646
    
    
      feat_73
      0.075773
      0.140815
      0.039192
      -0.013972
      -1.128547e-02
      0.001609
      0.025023
      0.022819
      0.028999
      0.022679
      ...
      -0.010675
      -0.000841
      -0.004759
      -0.026363
      -0.006704
      0.070001
      0.007193
      -0.024017
      -0.000163
      0.078975
    
    
      feat_74
      0.062870
      0.051365
      0.070724
      0.041559
      4.909735e-02
      0.017265
      0.043160
      0.053059
      -0.000431
      0.007594
      ...
      -0.000453
      -0.015945
      0.003992
      -0.025207
      0.042104
      0.055372
      0.016941
      0.004497
      0.021967
      0.065925
    
    
      feat_75
      0.199319
      0.011596
      0.093689
      -0.044724
      -3.145389e-02
      0.015279
      0.006951
      0.039865
      0.031466
      -0.027313
      ...
      -0.026329
      -0.031401
      0.001201
      -0.058630
      -0.014925
      0.160418
      -0.002625
      -0.037710
      0.006208
      0.206344
    
    
      feat_76
      0.189897
      0.153808
      0.259360
      -0.028670
      -1.379188e-02
      0.035570
      0.073867
      0.375114
      0.081682
      -0.027424
      ...
      0.000682
      0.010324
      0.063411
      -0.050417
      0.023242
      0.291884
      0.175163
      -0.050887
      0.029426
      0.199383
    
    
      feat_77
      0.053024
      0.123752
      0.014911
      -0.001584
      1.531773e-02
      0.030462
      0.006501
      0.005769
      0.027486
      -0.020185
      ...
      0.005602
      0.020294
      0.019275
      -0.007396
      0.021591
      -0.004988
      0.026376
      0.076551
      0.001715
      0.058228
    
    
      feat_78
      0.104017
      0.279202
      0.094256
      -0.021979
      -1.449856e-02
      0.070709
      0.061250
      0.567084
      0.079623
      -0.015922
      ...
      0.004071
      -0.018797
      0.063539
      -0.030010
      0.014639
      0.043339
      0.068450
      -0.028596
      0.016047
      0.109507
    
    
      feat_79
      0.162963
      0.228912
      0.033668
      -0.020566
      -1.083473e-02
      0.055115
      0.009942
      0.066753
      0.083714
      -0.036116
      ...
      0.004663
      0.095254
      0.099579
      -0.018615
      0.073207
      0.031099
      0.021616
      0.162033
      0.029082
      0.170379
    
    
      feat_80
      0.141716
      -0.013303
      0.155768
      0.442036
      4.057725e-01
      0.026223
      0.017648
      0.028860
      -0.038382
      -0.046721
      ...
      -0.035876
      -0.081888
      -0.004588
      -0.076250
      0.350787
      0.012623
      -0.017815
      0.063401
      0.012651
      0.149499
    
    
      feat_81
      0.078358
      0.032427
      0.052101
      0.013089
      2.828377e-02
      0.129333
      0.044136
      0.144308
      0.035102
      -0.005847
      ...
      0.054972
      0.013808
      0.084096
      -0.017469
      0.166234
      0.009379
      0.017243
      0.018565
      0.019378
      0.082662
    
    
      feat_82
      0.113915
      -0.026085
      0.119109
      0.438458
      4.365413e-01
      0.057400
      0.014907
      0.022059
      -0.034409
      -0.039806
      ...
      -0.034368
      -0.065189
      -0.012153
      -0.059553
      0.266249
      -0.001795
      -0.014641
      0.049661
      0.005497
      0.120733
    
    
      feat_83
      0.161417
      0.059165
      0.371691
      -0.019914
      -1.051874e-03
      0.008006
      0.035145
      0.282069
      0.033479
      -0.032875
      ...
      -0.009157
      -0.029711
      0.072006
      -0.052930
      0.035181
      0.243942
      0.095801
      -0.018325
      0.054188
      0.165828
    
    
      feat_84
      -0.008192
      0.049634
      0.009845
      0.011159
      5.684499e-03
      0.467329
      0.177777
      0.062634
      0.005064
      -0.013569
      ...
      -0.010210
      -0.003459
      0.013631
      -0.017903
      0.103643
      -0.006013
      -0.003444
      0.048431
      0.003723
      -0.003308
    
    
      feat_85
      -0.084507
      -0.008739
      -0.006764
      -0.048626
      -3.315343e-02
      0.034062
      0.004290
      0.037874
      -0.003416
      -0.031462
      ...
      1.000000
      0.109643
      0.049250
      0.027886
      0.053582
      -0.003931
      -0.023091
      -0.043484
      0.023390
      -0.102359
    
    
      feat_86
      -0.096484
      0.107947
      -0.039090
      -0.096093
      -7.102916e-02
      0.013879
      0.010455
      -0.009169
      -0.029395
      -0.019144
      ...
      0.109643
      1.000000
      0.073685
      0.426972
      -0.011822
      -0.019803
      -0.024005
      -0.049393
      0.029035
      -0.115179
    
    
      feat_87
      0.097087
      0.089374
      0.047451
      -0.009838
      5.054728e-03
      0.013999
      0.015256
      0.089574
      0.059929
      -0.016925
      ...
      0.049250
      0.073685
      1.000000
      0.023053
      0.066008
      0.014696
      0.028850
      0.001424
      0.499990
      0.101345
    
    
      feat_88
      -0.215878
      0.020830
      -0.047035
      -0.082336
      -6.748367e-02
      -0.019201
      -0.015437
      -0.033646
      -0.050931
      0.001160
      ...
      0.027886
      0.426972
      0.023053
      1.000000
      -0.022552
      -0.031679
      -0.033653
      -0.070120
      -0.008631
      -0.240481
    
    
      feat_89
      0.111519
      0.096851
      0.105527
      0.174781
      1.837145e-01
      0.119951
      0.035042
      0.063511
      0.007974
      -0.019147
      ...
      0.053582
      -0.011822
      0.066008
      -0.022552
      1.000000
      0.027764
      0.015917
      0.129622
      0.030650
      0.113492
    
    
      feat_90
      0.188895
      0.010310
      0.515022
      -0.015068
      9.454061e-03
      0.004842
      0.054034
      0.129578
      0.026807
      -0.020698
      ...
      -0.003931
      -0.019803
      0.014696
      -0.031679
      0.027764
      1.000000
      0.014812
      -0.035311
      0.039864
      0.195439
    
    
      feat_91
      0.139078
      0.037264
      0.026383
      -0.012417
      -1.031241e-02
      0.012012
      0.012465
      0.068506
      0.095990
      -0.014742
      ...
      -0.023091
      -0.024005
      0.028850
      -0.033653
      0.015917
      0.014812
      1.000000
      0.104226
      -0.000045
      0.146567
    
    
      feat_92
      0.131737
      0.054777
      -0.008219
      0.066921
      8.763105e-02
      0.065331
      0.015479
      -0.032261
      0.013608
      -0.069707
      ...
      -0.043484
      -0.049393
      0.001424
      -0.070120
      0.129622
      -0.035311
      0.104226
      1.000000
      -0.003653
      0.145157
    
    
      feat_93
      0.047944
      0.081783
      0.054593
      0.006814
      1.574563e-02
      0.002038
      0.008521
      0.034912
      0.005131
      -0.006038
      ...
      0.023390
      0.029035
      0.499990
      -0.008631
      0.030650
      0.039864
      -0.000045
      -0.003653
      1.000000
      0.049821
    
    
      target
      0.977144
      0.072984
      0.191739
      0.148729
      1.534582e-01
      0.036034
      0.037116
      0.142172
      0.209725
      -0.175350
      ...
      -0.102359
      -0.115179
      0.101345
      -0.240481
      0.113492
      0.195439
      0.146567
      0.145157
      0.049821
      1.000000
    
  

95 rows × 95 columns



In [13]:

    
df['feat_11'].corr(df['feat_90'])









    Out[13]:





0.045073721280135606

Podemos seleccionar solo algunas de las columnas de nuestro DataFrame...



In [14]:

    
df[['feat_1', 'target']]









    Out[14]:






  
    
      
      feat_1
      target
    
  
  
    
      0
      1
      0
    
    
      1
      0
      0
    
    
      2
      0
      0
    
    
      3
      1
      0
    
    
      4
      0
      0
    
    
      5
      2
      0
    
    
      6
      2
      0
    
    
      7
      0
      0
    
    
      8
      0
      0
    
    
      9
      0
      0
    
    
      10
      0
      0
    
    
      11
      0
      0
    
    
      12
      1
      0
    
    
      13
      0
      0
    
    
      14
      0
      0
    
    
      15
      0
      0
    
    
      16
      0
      0
    
    
      17
      0
      0
    
    
      18
      0
      0
    
    
      19
      0
      0
    
    
      20
      0
      0
    
    
      21
      0
      0
    
    
      22
      0
      0
    
    
      23
      0
      0
    
    
      24
      0
      0
    
    
      25
      0
      0
    
    
      26
      2
      0
    
    
      27
      0
      0
    
    
      28
      0
      0
    
    
      29
      2
      0
    
    
      ...
      ...
      ...
    
    
      61848
      0
      8
    
    
      61849
      5
      8
    
    
      61850
      0
      8
    
    
      61851
      3
      8
    
    
      61852
      0
      8
    
    
      61853
      0
      8
    
    
      61854
      0
      8
    
    
      61855
      0
      8
    
    
      61856
      0
      8
    
    
      61857
      0
      8
    
    
      61858
      4
      8
    
    
      61859
      0
      8
    
    
      61860
      0
      8
    
    
      61861
      0
      8
    
    
      61862
      2
      8
    
    
      61863
      0
      8
    
    
      61864
      0
      8
    
    
      61865
      0
      8
    
    
      61866
      0
      8
    
    
      61867
      0
      8
    
    
      61868
      0
      8
    
    
      61869
      0
      8
    
    
      61870
      1
      8
    
    
      61871
      0
      8
    
    
      61872
      0
      8
    
    
      61873
      1
      8
    
    
      61874
      4
      8
    
    
      61875
      0
      8
    
    
      61876
      1
      8
    
    
      61877
      0
      8
    
  

61878 rows × 2 columns



In [15]:

    
df.loc[2:5, ['id', 'target']]

Qué hacer cuando tenemos datos incompletos??



In [16]:

    
df.isnull().any() #df.isnull().isnull() full matrix









    Out[16]:





id         False
feat_1     False
feat_2     False
feat_3     False
feat_4     False
feat_5     False
feat_6     False
feat_7     False
feat_8     False
feat_9     False
feat_10    False
feat_11    False
feat_12    False
feat_13    False
feat_14    False
feat_15    False
feat_16    False
feat_17    False
feat_18    False
feat_19    False
feat_20    False
feat_21    False
feat_22    False
feat_23    False
feat_24    False
feat_25    False
feat_26    False
feat_27    False
feat_28    False
feat_29    False
           ...  
feat_65    False
feat_66    False
feat_67    False
feat_68    False
feat_69    False
feat_70    False
feat_71    False
feat_72    False
feat_73    False
feat_74    False
feat_75    False
feat_76    False
feat_77    False
feat_78    False
feat_79    False
feat_80    False
feat_81    False
feat_82    False
feat_83    False
feat_84    False
feat_85    False
feat_86    False
feat_87    False
feat_88    False
feat_89    False
feat_90    False
feat_91    False
feat_92    False
feat_93    False
target     False
dtype: bool

Y duplicados ??



In [17]:

    
unique_df = df.drop_duplicates()
unique_df.shape, df.shape









    Out[17]:





((61878, 95), (61878, 95))

Este dataset no cuenta con NaN values ... pero para futuros machetes ..



In [14]:

    
df_with_interpolated_values = df.interpolate(method='linear', axis=0)
# Se pueden utilizar diferentes métodos de interpolacion de datos y sobre cualquiera de los dos ejes
df_with_fill_values = df.fillna(df.mean())

**También mirar:
- fillna se tienen varias opciones como bfill y ffill. - dropna saca de la tabla todos los valores incompletos.
- interpolate cuenta con varios métodos: spline, pchip, polynomial, etc.**



In [15]:

    
df_with_fill_values.info(), df.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 61878 entries, 0 to 61877
Data columns (total 95 columns):
id         61878 non-null int64
feat_1     61878 non-null int64
feat_2     61878 non-null int64
feat_3     61878 non-null int64
feat_4     61878 non-null int64
feat_5     61878 non-null int64
feat_6     61878 non-null int64
feat_7     61878 non-null int64
feat_8     61878 non-null int64
feat_9     61878 non-null int64
feat_10    61878 non-null int64
feat_11    61878 non-null int64
feat_12    61878 non-null int64
feat_13    61878 non-null int64
feat_14    61878 non-null int64
feat_15    61878 non-null int64
feat_16    61878 non-null int64
feat_17    61878 non-null int64
feat_18    61878 non-null int64
feat_19    61878 non-null int64
feat_20    61878 non-null int64
feat_21    61878 non-null int64
feat_22    61878 non-null int64
feat_23    61878 non-null int64
feat_24    61878 non-null int64
feat_25    61878 non-null int64
feat_26    61878 non-null int64
feat_27    61878 non-null int64
feat_28    61878 non-null int64
feat_29    61878 non-null int64
feat_30    61878 non-null int64
feat_31    61878 non-null int64
feat_32    61878 non-null int64
feat_33    61878 non-null int64
feat_34    61878 non-null int64
feat_35    61878 non-null int64
feat_36    61878 non-null int64
feat_37    61878 non-null int64
feat_38    61878 non-null int64
feat_39    61878 non-null int64
feat_40    61878 non-null int64
feat_41    61878 non-null int64
feat_42    61878 non-null int64
feat_43    61878 non-null int64
feat_44    61878 non-null int64
feat_45    61878 non-null int64
feat_46    61878 non-null int64
feat_47    61878 non-null int64
feat_48    61878 non-null int64
feat_49    61878 non-null int64
feat_50    61878 non-null int64
feat_51    61878 non-null int64
feat_52    61878 non-null int64
feat_53    61878 non-null int64
feat_54    61878 non-null int64
feat_55    61878 non-null int64
feat_56    61878 non-null int64
feat_57    61878 non-null int64
feat_58    61878 non-null int64
feat_59    61878 non-null int64
feat_60    61878 non-null int64
feat_61    61878 non-null int64
feat_62    61878 non-null int64
feat_63    61878 non-null int64
feat_64    61878 non-null int64
feat_65    61878 non-null int64
feat_66    61878 non-null int64
feat_67    61878 non-null int64
feat_68    61878 non-null int64
feat_69    61878 non-null int64
feat_70    61878 non-null int64
feat_71    61878 non-null int64
feat_72    61878 non-null int64
feat_73    61878 non-null int64
feat_74    61878 non-null int64
feat_75    61878 non-null int64
feat_76    61878 non-null int64
feat_77    61878 non-null int64
feat_78    61878 non-null int64
feat_79    61878 non-null int64
feat_80    61878 non-null int64
feat_81    61878 non-null int64
feat_82    61878 non-null int64
feat_83    61878 non-null int64
feat_84    61878 non-null int64
feat_85    61878 non-null int64
feat_86    61878 non-null int64
feat_87    61878 non-null int64
feat_88    61878 non-null int64
feat_89    61878 non-null int64
feat_90    61878 non-null int64
feat_91    61878 non-null int64
feat_92    61878 non-null int64
feat_93    61878 non-null int64
target     61878 non-null int32
dtypes: int32(1), int64(94)
memory usage: 45.1 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 61878 entries, 0 to 61877
Data columns (total 95 columns):
id         61878 non-null int64
feat_1     61878 non-null int64
feat_2     61878 non-null int64
feat_3     61878 non-null int64
feat_4     61878 non-null int64
feat_5     61878 non-null int64
feat_6     61878 non-null int64
feat_7     61878 non-null int64
feat_8     61878 non-null int64
feat_9     61878 non-null int64
feat_10    61878 non-null int64
feat_11    61878 non-null int64
feat_12    61878 non-null int64
feat_13    61878 non-null int64
feat_14    61878 non-null int64
feat_15    61878 non-null int64
feat_16    61878 non-null int64
feat_17    61878 non-null int64
feat_18    61878 non-null int64
feat_19    61878 non-null int64
feat_20    61878 non-null int64
feat_21    61878 non-null int64
feat_22    61878 non-null int64
feat_23    61878 non-null int64
feat_24    61878 non-null int64
feat_25    61878 non-null int64
feat_26    61878 non-null int64
feat_27    61878 non-null int64
feat_28    61878 non-null int64
feat_29    61878 non-null int64
feat_30    61878 non-null int64
feat_31    61878 non-null int64
feat_32    61878 non-null int64
feat_33    61878 non-null int64
feat_34    61878 non-null int64
feat_35    61878 non-null int64
feat_36    61878 non-null int64
feat_37    61878 non-null int64
feat_38    61878 non-null int64
feat_39    61878 non-null int64
feat_40    61878 non-null int64
feat_41    61878 non-null int64
feat_42    61878 non-null int64
feat_43    61878 non-null int64
feat_44    61878 non-null int64
feat_45    61878 non-null int64
feat_46    61878 non-null int64
feat_47    61878 non-null int64
feat_48    61878 non-null int64
feat_49    61878 non-null int64
feat_50    61878 non-null int64
feat_51    61878 non-null int64
feat_52    61878 non-null int64
feat_53    61878 non-null int64
feat_54    61878 non-null int64
feat_55    61878 non-null int64
feat_56    61878 non-null int64
feat_57    61878 non-null int64
feat_58    61878 non-null int64
feat_59    61878 non-null int64
feat_60    61878 non-null int64
feat_61    61878 non-null int64
feat_62    61878 non-null int64
feat_63    61878 non-null int64
feat_64    61878 non-null int64
feat_65    61878 non-null int64
feat_66    61878 non-null int64
feat_67    61878 non-null int64
feat_68    61878 non-null int64
feat_69    61878 non-null int64
feat_70    61878 non-null int64
feat_71    61878 non-null int64
feat_72    61878 non-null int64
feat_73    61878 non-null int64
feat_74    61878 non-null int64
feat_75    61878 non-null int64
feat_76    61878 non-null int64
feat_77    61878 non-null int64
feat_78    61878 non-null int64
feat_79    61878 non-null int64
feat_80    61878 non-null int64
feat_81    61878 non-null int64
feat_82    61878 non-null int64
feat_83    61878 non-null int64
feat_84    61878 non-null int64
feat_85    61878 non-null int64
feat_86    61878 non-null int64
feat_87    61878 non-null int64
feat_88    61878 non-null int64
feat_89    61878 non-null int64
feat_90    61878 non-null int64
feat_91    61878 non-null int64
feat_92    61878 non-null int64
feat_93    61878 non-null int64
target     61878 non-null int32
dtypes: int32(1), int64(94)
memory usage: 45.1 MB






    Out[15]:





(None, None)

**Si los datos con los que se trabajan vienen de diferentes fuentes mirar las funciones:
- join, concat, merge, combine, etc. para unificar datos de distintos origenes con diferentes patrones ..**

Ahora queremos separar los datos que usaremos como entrada para crear nuestro clasificador...



In [20]:

    
y = df['target'].values.astype(np.int32)
xs = df[df.columns[1:-1]].values.astype(np.float32)

Y por las dudas guardemos nuestro archivo con la nueva codificación para la variable target ..



In [18]:

    
df.to_csv('/tmp/this_is_my_out.txt', sep='\t', header=None, index=False)



In [18]:

    
df.loc[:5, :].to_clipboard()

**También mirar:
- to_excel
- to_json
- to_html
- to_pickle ... Y sus parámetros**

Paseando por scikit-learn



In [3]:

    
from IPython.display import IFrame
IFrame('http://scikit-learn.org/stable/', width=900, height=350)









    Out[3]:



In [20]:

    
from IPython.display import Image
Image(url='http://1.bp.blogspot.com/-ME24ePzpzIM/UQLWTwurfXI/AAAAAAAAANw/W3EETIroA80/s1600/drop_shadows_background.png',
      width=1000, height=1000)









    Out[20]:

Nuestros datos cuentan con varias dimensiones por lo cual no es tan fácil visualizarlo ... por lo que si queremos ver cómo se comportan podemos usar PCA para reducir su dimensionalidad.



In [21]:

    
# Project the data to a 2D space for visualization
from sklearn.decomposition import RandomizedPCA # using randomized Singular Value Decomposition 
Xp = RandomizedPCA(n_components=2, random_state=1).fit_transform(xs)
Xp









    Out[21]:





array([[ 0.68992602, -1.4586081 ],
       [-2.64613954, -1.89425161],
       [-1.8834618 , -3.11100498],
       ..., 
       [ 4.22165023, -7.45878803],
       [-0.3476059 , -2.22537112],
       [ 1.50033683, -0.69861494]])



In [22]:

    
% matplotlib inline
import matplotlib.pyplot as plt

# get the product class 
product_class = np.unique(y)

colors = plt.get_cmap("hsv")

plt.figure(figsize=(10, 4))
for i, p in enumerate(product_class):
    mask = (y == p)
    plt.scatter(Xp[mask, 0], Xp[mask, 1], 
                c=colors(1. * i / 11), label=p, alpha=0.2)
    
plt.legend(loc="best")









    Out[22]:





<matplotlib.legend.Legend at 0x7f08de09c250>

Antes de usar cualquier algoritmo de predicción vamos a querer dividir nuestro dataset, entre los valores que usaremos para entrenar al clasificador y los que usaremos para evaluar que tan bien clasifica ...



In [23]:

    
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from sklearn import preprocessing

#X = preprocessing.scale(xs, axis=1)
X_train, X_test, y_train, y_test = train_test_split(xs, y,random_state=1)

print """X_train shape : {}, y_train shape : {}
X_test shape : {}, y_test shape : {}""".format(X_train.shape, y_train.shape, X_test.shape, y_test.shape)









    



X_train shape : (46408, 93), y_train shape : (46408,)
X_test shape : (15470, 93), y_test shape : (15470,)

Vamos a implementar una pequeña función que nos deje ver de forma grafica que tan bien trabaja nuestro clasificador



In [24]:

    
def plot_matrix(clf, X_test, y_test):
    plt.clf()
    plt.imshow(confusion_matrix(clf.predict(X_test), y_test),
               interpolation='nearest', cmap=plt.cm.binary)
    plt.colorbar()
    plt.xlabel("true label")
    plt.ylabel("predicted label")
    plt.show()

Ahora creamos nuestros clasificadores ...



In [17]:

    
from sklearn.svm import SVC

sv = SVC(kernel='rbf', cache_size=1000)
sv.fit(X_train, y_train)

print classification_report(sv.predict(X_test), y_test)
print sv.score(X_test, y_test)
plot_matrix(sv, X_test, y_test)









    



             precision    recall  f1-score   support

          0       0.44      0.74      0.56       315
          1       0.88      0.70      0.78      5136
          2       0.44      0.63      0.52      1411
          3       0.32      0.82      0.46       249
          4       0.96      0.98      0.97       710
          5       0.91      0.94      0.93      3327
          6       0.54      0.72      0.62       535
          7       0.94      0.74      0.83      2576
          8       0.82      0.85      0.83      1211

avg / total       0.82      0.78      0.79     15470

0.781124757595



In [25]:

    
from sklearn.ensemble import ExtraTreesClassifier

clf = ExtraTreesClassifier(n_estimators=200,
                           max_features=0.2, 
                           n_jobs=2,
                           max_depth=None,
                           min_samples_split=1,
                           random_state=1).fit(X_train, y_train)
print classification_report(clf.predict(X_test), y_test)
print clf.score(X_test, y_test)
plot_matrix(clf, X_test, y_test)









    



             precision    recall  f1-score   support

          0       0.42      0.84      0.56       262
          1       0.88      0.72      0.79      5037
          2       0.50      0.65      0.57      1568
          3       0.44      0.83      0.58       346
          4       0.96      0.98      0.97       712
          5       0.95      0.93      0.94      3515
          6       0.60      0.81      0.69       526
          7       0.94      0.87      0.91      2190
          8       0.88      0.84      0.86      1314

avg / total       0.84      0.81      0.82     15470

0.811700064641

Podemos observar cuales son las variables que mas peso tienen sobre la decision de ubicar un elemento en X clase?



In [26]:

    
importances = clf.feature_importances_

text = map(lambda i: df.columns[1:-1][i], range(93))
plt.figure(figsize=(20, 6))
print importances[::-1].shape
plt.bar(range(93),height=importances,  width=1.)
plt.xticks(np.arange(0.5, 93, 1.), text, rotation=90)
plt.xlim((0, 93))
plt.show()
# Top 10
indices = np.argsort(importances)[::-1]
for i in range(10):
    print importances[indices[i]], df.columns[1:-1][indices[i]]









    



(93,)






    












    



0.0490019250225 feat_11
0.0447674033442 feat_34
0.0423471301928 feat_60
0.0338275409266 feat_14
0.0264123595345 feat_40
0.0259991016933 feat_25
0.0246679553862 feat_15
0.0226700475102 feat_42
0.0225282480303 feat_26
0.020135275408 feat_75

Cómo buscar los mejores parámetros para nuestro modelo?



In [27]:

    
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier

parameter_grid = {
    'n_estimators': [100, 200],
    'max_features': [0.2, 0.5],
    #'max_depth': [5., None]
}

grid_search = GridSearchCV(ExtraTreesClassifier(n_jobs=4), parameter_grid,
                            cv=5, verbose=3)
grid_search.fit(X_train, y_train)









    



Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] max_features=0.2, n_estimators=100 ..............................
[CV] ..... max_features=0.2, n_estimators=100, score=0.806785 -  18.4s
[CV] max_features=0.2, n_estimators=100 ..............................
[CV] ..... max_features=0.2, n_estimators=100, score=0.813227 -  18.5s
[CV] max_features=0.2, n_estimators=100 ..............................
[CV] ..... max_features=0.2, n_estimators=100, score=0.802715 -  18.1s
[CV] max_features=0.2, n_estimators=100 ..............................
[CV] ..... max_features=0.2, n_estimators=100, score=0.810560 -  17.7s
[CV] max_features=0.2, n_estimators=100 ..............................
[CV] ..... max_features=0.2, n_estimators=100, score=0.812567 -  18.2s
[CV] max_features=0.2, n_estimators=200 ..............................
[CV] ..... max_features=0.2, n_estimators=200, score=0.807539 -  35.8s
[CV] max_features=0.2, n_estimators=200 ..............................
[CV] ..... max_features=0.2, n_estimators=200, score=0.817428 -  36.3s
[CV] max_features=0.2, n_estimators=200 ..............................
[CV] ..... max_features=0.2, n_estimators=200, score=0.805732 -  36.3s
[CV] max_features=0.2, n_estimators=200 ..............................
[CV] ..... max_features=0.2, n_estimators=200, score=0.813147 -  36.0s
[CV] max_features=0.2, n_estimators=200 ..............................
[CV] ..... max_features=0.2, n_estimators=200, score=0.811813 -  36.0s
[CV] max_features=0.5, n_estimators=100 ..............................
[CV] ..... max_features=0.5, n_estimators=100, score=0.811524 -  39.1s
[CV] max_features=0.5, n_estimators=100 ..............................
[CV] ..... max_features=0.5, n_estimators=100, score=0.818397 -  39.8s
[CV] max_features=0.5, n_estimators=100 ..............................
[CV] ..... max_features=0.5, n_estimators=100, score=0.805301 -  39.5s
[CV] max_features=0.5, n_estimators=100 ..............................
[CV] ..... max_features=0.5, n_estimators=100, score=0.813470 -  39.6s
[CV] max_features=0.5, n_estimators=100 ..............................
[CV] ..... max_features=0.5, n_estimators=100, score=0.814615 -  39.6s
[CV] max_features=0.5, n_estimators=200 ..............................
[CV] ..... max_features=0.5, n_estimators=200, score=0.811632 - 1.3min
[CV] max_features=0.5, n_estimators=200 ..............................
[CV] ..... max_features=0.5, n_estimators=200, score=0.818074 - 1.4min
[CV] max_features=0.5, n_estimators=200 ..............................
[CV] ..... max_features=0.5, n_estimators=200, score=0.806379 - 1.3min
[CV] max_features=0.5, n_estimators=200 ..............................
[CV] ..... max_features=0.5, n_estimators=200, score=0.816487 - 1.4min
[CV] max_features=0.5, n_estimators=200 ..............................
[CV] ..... max_features=0.5, n_estimators=200, score=0.816986 - 1.3min





    



[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:   18.4s
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 14.5min finished






    









    Out[27]:





GridSearchCV(cv=5, error_score='raise',
       estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'n_estimators': [100, 200], 'max_features': [0.2, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=3)



In [77]:

    
grid_search.grid_scores_









    Out[77]:





[mean: 0.80796, std: 0.00395, params: {'max_features': 0.2, 'n_estimators': 100},
 mean: 0.81074, std: 0.00412, params: {'max_features': 0.2, 'n_estimators': 200},
 mean: 0.81339, std: 0.00300, params: {'max_features': 0.5, 'n_estimators': 100},
 mean: 0.81413, std: 0.00419, params: {'max_features': 0.5, 'n_estimators': 200}]

Cómo saber si nuestro clasificador es realmente bueno ?? deberíamos tener alguna especie de baseline. Pensemos en cómo le iría al peor clasificador del mundo con nuestros datos ...



In [27]:

    
from sklearn.dummy import DummyClassifier
clf = DummyClassifier(strategy='most_frequent',random_state=0).fit(X_train, y_train)
print clf.score(X_test, y_test)
plot_matrix(clf, X_test, y_test)









    



0.2650937298

**También mirar: Pipelines en scikit-learn.**

Una miradita por lasagne ...



In [8]:

    
from sklearn import preprocessing



In [9]:

    
df = pd.read_csv('data/train.csv')
X = df.values

np.random.shuffle(X)

X, labels = X[:, 1:-1].astype(np.float32), X[:, -1]
encoder = preprocessing.LabelEncoder()
y = encoder.fit_transform(labels).astype(np.int32)
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)



In [10]:

    
# add to kfkd.py
from lasagne import layers 
from lasagne.nonlinearities import softmax
from lasagne.updates import nesterov_momentum
from nolearn.lasagne import NeuralNet
import theano

net1 = NeuralNet(
    layers= [
            ('input', layers.InputLayer),
            ('hidden1', layers.DenseLayer), 
            ('dropout0', layers.DropoutLayer),
            ('hidden2', layers.DenseLayer),
            ('output', layers.DenseLayer),
            ],
    # layer parameters:
    input_shape = (None, 93),
    hidden1_num_units = 500,
    dropout0_p = 0.3,
    hidden2_num_units = 250,
    output_nonlinearity= softmax,  
    output_num_units= encoder.classes_.shape[0],

    # optimization method:
    update=nesterov_momentum,
    update_learning_rate=theano.shared(np.float32(0.03)),
    update_momentum=theano.shared(np.float32(0.9)),

    #on_epoch_finished=[
    #    AdjustVariable('update_learning_rate', start=0.03, stop=0.0001),
    #    AdjustVariable('update_momentum', start=0.9, stop=0.999),
        #EarlyStopping(patience=300)
    #    ],
    regression=False,  
    max_epochs=100,
    verbose=1,
    )

net1.fit(X, y)









    



Using gpu device 0: GeForce GT 635M
/home/celia/Downloads/src/lasagne/lasagne/init.py:86: UserWarning: The uniform initializer no longer uses Glorot et al.'s approach to determine the bounds, but defaults to the range (-0.01, 0.01) instead. Please use the new GlorotUniform initializer to get the old behavior. GlorotUniform is now the default for all layers.
  warnings.warn("The uniform initializer no longer uses Glorot et al.'s "






    



  input             	(None, 93)          	produces      93 outputs
  hidden1           	(None, 500)         	produces     500 outputs
  dropout0          	(None, 500)         	produces     500 outputs
  hidden2           	(None, 250)         	produces     250 outputs
  output            	(None, 9)           	produces       9 outputs
  epoch    train loss    valid loss    train/val    valid acc  dur
-------  ------------  ------------  -----------  -----------  -----
      1       0.72416       0.58657      1.23457      0.78109  1.37s
      2       0.58723       0.55946      1.04964      0.78482  1.27s
      3       0.55576       0.54064      1.02797      0.79241  1.27s
      4       0.53638       0.53297      1.00639      0.79409  1.27s
      5       0.51871       0.52702      0.98422      0.79441  1.27s
      6       0.50350       0.52198      0.96459      0.79828  1.27s
      7       0.49222       0.51321      0.95910      0.80213  1.28s
      8       0.48407       0.50851      0.95195      0.80418  1.27s
      9       0.47342       0.50727      0.93326      0.80180  1.27s
     10       0.46529       0.51027      0.91184      0.80318  1.45s
     11       0.45798       0.50284      0.91078      0.80465  1.60s
     12       0.44821       0.50661      0.88471      0.80671  1.61s
     13       0.44031       0.50404      0.87356      0.80471  1.62s
     14       0.43518       0.50391      0.86361      0.80887  1.60s
     15       0.42760       0.50075      0.85391      0.80780  1.60s
     16       0.42405       0.50587      0.83826      0.80774  1.60s
     17       0.41875       0.50503      0.82917      0.80670  1.67s
     18       0.41142       0.50574      0.81350      0.80998  1.62s
     19       0.40730       0.50283      0.81000      0.80806  1.61s
     20       0.39819       0.50154      0.79392      0.80820  1.62s
     21       0.39277       0.50259      0.78150      0.80906  1.61s
     22       0.39038       0.49828      0.78345      0.81167  1.62s
     23       0.38516       0.50340      0.76513      0.81145  1.66s
     24       0.38043       0.50618      0.75157      0.81154  1.69s
     25       0.37515       0.51446      0.72922      0.80763  1.61s
     26       0.37170       0.50897      0.73031      0.80611  1.65s
     27       0.37230       0.50921      0.73113      0.80513  1.62s
     28       0.36380       0.51565      0.70551      0.80933  1.63s
     29       0.35971       0.51057      0.70453      0.80823  1.62s
     30       0.35359       0.51519      0.68633      0.80885  1.63s
     31       0.35116       0.51461      0.68238      0.81038  1.64s
     32       0.34524       0.51990      0.66405      0.80478  1.69s
     33       0.34138       0.51768      0.65944      0.81000  1.65s
     34       0.34024       0.52704      0.64558      0.81041  1.62s
     35       0.33518       0.52718      0.63580      0.81124  1.65s
     36       0.33490       0.52074      0.64313      0.80719  1.61s
     37       0.33246       0.52708      0.63077      0.80817  1.65s
     38       0.32704       0.53166      0.61513      0.80703  1.61s
     39       0.32303       0.53418      0.60472      0.80632  1.60s
     40       0.32297       0.53073      0.60853      0.80773  1.65s
     41       0.31560       0.53619      0.58860      0.80933  1.62s
     42       0.31336       0.54226      0.57788      0.80781  1.62s
     43       0.31462       0.54176      0.58073      0.81026  1.61s
     44       0.31093       0.54612      0.56935      0.80459  1.62s
     45       0.30729       0.54595      0.56285      0.81155  1.62s
     46       0.30461       0.54560      0.55830      0.80934  1.62s
     47       0.29643       0.56000      0.52935      0.80812  1.62s
     48       0.29935       0.54724      0.54702      0.81031  1.62s
     49       0.29544       0.55324      0.53402      0.80976  1.62s
     50       0.29027       0.56169      0.51678      0.80701  1.61s
     51       0.28878       0.56202      0.51381      0.80612  1.62s
     52       0.28596       0.57100      0.50079      0.81045  1.62s
     53       0.28800       0.56849      0.50659      0.81069  1.62s
     54       0.28165       0.58102      0.48476      0.80430  1.62s
     55       0.28904       0.57535      0.50237      0.80350  1.62s
     56       0.28109       0.56791      0.49495      0.80806  1.62s
     57       0.27560       0.57797      0.47684      0.80757  1.62s
     58       0.27667       0.57735      0.47921      0.80892  1.61s
     59       0.27468       0.58504      0.46951      0.80918  1.61s
     60       0.27415       0.58381      0.46959      0.80798  1.62s
     61       0.26945       0.57916      0.46523      0.81207  1.62s
     62       0.26667       0.58842      0.45320      0.81020  1.64s
     63       0.26286       0.59291      0.44334      0.80967  1.66s
     64       0.26570       0.59542      0.44624      0.80886  1.64s
     65       0.26102       0.61152      0.42683      0.80596  1.62s
     66       0.26331       0.59913      0.43949      0.80883  1.63s
     67       0.26110       0.61007      0.42799      0.80914  1.61s
     68       0.25593       0.60861      0.42052      0.80246  1.67s
     69       0.25494       0.62655      0.40690      0.80991  1.62s
     70       0.25240       0.60906      0.41442      0.80512  1.62s
     71       0.24819       0.61691      0.40231      0.81028  1.64s
     72       0.24716       0.62290      0.39679      0.80806  1.62s
     73       0.24432       0.64377      0.37951      0.80510  1.62s
     74       0.24092       0.62105      0.38793      0.81135  1.62s
     75       0.24499       0.62283      0.39335      0.81271  1.62s
     76       0.24568       0.63266      0.38833      0.80285  1.61s
     77       0.24323       0.65336      0.37228      0.80975  1.61s
     78       0.23773       0.65139      0.36496      0.80677  1.62s
     79       0.23940       0.64610      0.37052      0.80908  1.61s
     80       0.23646       0.65991      0.35832      0.80636  1.61s
     81       0.23870       0.65253      0.36580      0.80838  1.62s
     82       0.23721       0.65243      0.36358      0.80857  1.67s
     83       0.23944       0.65476      0.36570      0.80669  1.61s
     84       0.23271       0.65678      0.35431      0.80878  1.65s
     85       0.22861       0.65566      0.34867      0.80749  1.90s
     86       0.22473       0.66446      0.33822      0.80920  1.75s
     87       0.22383       0.66661      0.33578      0.80793  1.74s
     88       0.22553       0.67273      0.33524      0.80340  1.61s
     89       0.22319       0.68162      0.32744      0.80724  1.62s
     90       0.22194       0.68806      0.32256      0.80456  1.62s
     91       0.22270       0.67390      0.33046      0.80725  1.61s
     92       0.22090       0.68044      0.32464      0.81015  1.61s
     93       0.21299       0.68284      0.31192      0.80318  1.62s
     94       0.21626       0.68805      0.31431      0.80957  1.61s
     95       0.21637       0.69689      0.31048      0.80723  1.60s
     96       0.21908       0.68275      0.32088      0.80604  1.61s
     97       0.21675       0.69357      0.31250      0.80770  1.63s
     98       0.21397       0.68933      0.31041      0.80780  1.61s
     99       0.21456       0.68251      0.31436      0.80937  1.61s
    100       0.20991       0.69972      0.29999      0.80891  1.61s






    Out[10]:





NeuralNet(X_tensor_type=<function matrix at 0x7fd7f4ebd9b0>,
     batch_iterator_test=<nolearn.lasagne.base.BatchIterator object at 0x7fd7e3e1fd50>,
     batch_iterator_train=<nolearn.lasagne.base.BatchIterator object at 0x7fd7e81c2b10>,
     custom_score=None, dropout0_p=0.3, eval_size=0.2,
     hidden1_num_units=500, hidden2_num_units=250, input_shape=(None, 93),
     layers=[('input', <class 'lasagne.layers.input.InputLayer'>), ('hidden1', <class 'lasagne.layers.dense.DenseLayer'>), ('dropout0', <class 'lasagne.layers.noise.DropoutLayer'>), ('hidden2', <class 'lasagne.layers.dense.DenseLayer'>), ('output', <class 'lasagne.layers.dense.DenseLayer'>)],
     loss=None, max_epochs=100, more_params={},
     objective=<class 'lasagne.objectives.Objective'>,
     objective_loss_function=<function categorical_crossentropy at 0x7fd7f4b22410>,
     on_epoch_finished=[<nolearn.lasagne.util.PrintLog instance at 0x7fd7e3e32440>],
     on_training_finished=[],
     output_nonlinearity=<function softmax at 0x7fd7e3e18b90>,
     output_num_units=9, regression=False,
     update=<function nesterov_momentum at 0x7fd7e3995320>,
     update_learning_rate=<TensorType(float32, scalar)>,
     update_momentum=<TensorType(float32, scalar)>,
     use_label_encoder=False, verbose=1,
     y_tensor_type=TensorType(int32, vector))

Último viaje del dia .. por matplotlib



In [11]:

    
from IPython.display import Image
Image(filename='images/convincing.png')









    Out[11]:



In [29]:

    
from IPython.display import IFrame
IFrame('http://matplotlib.org/', width=900, height=350)









    Out[29]:



In [30]:

    
% matplotlib inline
import matplotlib.pyplot as plt



In [31]:

    
print plt.style.available
plt.style.use(plt.style.available[1])









    



[u'dark_background', u'bmh', u'grayscale', u'ggplot', u'fivethirtyeight']



In [32]:

    
df = pd.read_csv('data/train.csv')



In [33]:

    
plt.plot(df['feat_34'].cumsum(), 'b-') # con la letra indicamos color y con el siguiente la forma del marcador.
plt.ylabel('Este es el label para y') # colocando labels sobre los ejes.
plt.xlabel('Este label es para x')
#plt.xscale('log')









    Out[33]:





<matplotlib.text.Text at 0x7f08da946890>



In [34]:

    
plt.plot(df['feat_34'].diff(), 'y-') # con la letra indicamos color y con el siguiente la forma del marcador.
plt.ylabel('Este es el label para y') # colocando labels sobre los ejes.
plt.xlabel('Este label es para x')
#plt.axis([0, 65000, -60, 60])









    Out[34]:





<matplotlib.text.Text at 0x7f08ddf40d90>



In [35]:

    
from mpl_toolkits.mplot3d import Axes3D
for i, group in df.groupby('target'):
    threedee = plt.figure().gca(projection='3d')
    threedee.set_title("scatter plot of class number {}".format(i))
    X = group['feat_11']
    Y = group['feat_14']
    Z = group['feat_34']
    threedee.scatter(X, Y, Z)
    threedee.set_xlabel('feat_11')
    threedee.set_ylabel('feat_14')
    threedee.set_zlabel('feat_34')
plt.show()



In [33]:

    
from mpl_toolkits.mplot3d import Axes3D
from scipy.interpolate import griddata

for i, group in df.groupby('target'):
    threedee = plt.figure().gca(projection='3d')
    threedee.set_title("scatter plot of class number {}".format(i))
    X = group['feat_11']
    Y = group['feat_14']
    Z = group['feat_34']
    xi = np.linspace(X.min(),X.max(),100)
    yi = np.linspace(Y.min(),Y.max(),100)
    zi = griddata((X, Y), Z, (xi[None,:], yi[:,None]), method='cubic')
    
    xig, yig = np.meshgrid(xi, yi)
    surf = threedee.plot_surface(xig, yig, zi, linewidth=0)

    threedee.set_xlabel('feat_11')
    threedee.set_ylabel('feat_14')
    threedee.set_zlabel('feat_34')
plt.show()



In [34]:

    
for i, group in df.groupby('target'):
    threedee = plt.figure().gca(projection='3d')
    threedee.set_title("scatter plot of class number {}".format(i))
    X = group['feat_11']
    Y = group['feat_14']
    Z = group['feat_34']
    xi = np.linspace(X.min(),X.max(),100)
    yi = np.linspace(Y.min(),Y.max(),100)
    zi = griddata((X, Y), Z, (xi[None,:], yi[:,None]), method='cubic')
    
    xig, yig = np.meshgrid(xi, yi)
    contour = plt.contour(xi,yi,zi,15,linewidths=0.5,color='k')

    threedee.set_xlabel('feat_11')
    threedee.set_ylabel('feat_14')
    threedee.set_zlabel('feat_34')
plt.show()

También podemos graficar nuestros datos desde pandas con matplotlib ..



In [36]:

    
interest_feat = ['feat_11', 'feat_34', 'feat_14', 'feat_60', 'target']



In [22]:

    
from pandas.tools.plotting import andrews_curves

andrews_curves(df[interest_feat[:-1]], 'target')









    Out[22]:





<matplotlib.axes._subplots.AxesSubplot at 0x4f88210>



In [26]:

    
from pandas.tools.plotting import parallel_coordinates

parallel_coordinates(df[interest_feat], 'target', alpha=0.5)









    Out[26]:





<matplotlib.axes._subplots.AxesSubplot at 0x10e99cc90>



In [36]:

    
df_plot = df[interest_feat[:-1]].cumsum()
df_plot.plot(kind='area', alpha=0.5)









    Out[36]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fd7f182ca90>



In [37]:

    
df[interest_feat[:-1]].diff().hist(alpha=0.5, bins=10)









    Out[37]:





array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7fd7f2041750>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fd7f168fc10>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7fd7f21c9950>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fd7f1cc3fd0>]], dtype=object)

Podemos ver cuantos elementos de cada clase cuenta la muestra que tenemos



In [38]:

    
df.groupby(['target']).target.count().plot(kind='bar')









    Out[38]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fd7efae9950>



In [39]:

    
df.groupby(['target']).target.count().plot(kind='pie')









    Out[39]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fd7f15d4ed0>



In [40]:

    
df.boxplot(column='feat_34', by='target', grid=False)
for i in range(9):
    y = df.feat_11[df.target==i].dropna()
    # Add some random "jitter" to the x-axis
    x = np.random.normal(i, 0.04, size=len(y))
    plt.plot(x, y, 'r.', alpha=0.02)

Si deseamos visualizar varias variables al mismo tiempo se puede utlizar la función scatter_matrix



In [37]:

    
pd.scatter_matrix(df.loc[:,interest_feat[:-1]], figsize=(12,8), diagonal='kde')









    Out[37]:





array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f08daaf5a90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f08da4b5cd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f08da3471d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f08da2aa6d0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f08da22ca10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f08da2dfc50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f08da113110>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f08da088f10>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f08dacc8b10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f08dade2090>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f08daba8f10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f08da975350>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f08da3f82d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f08dab29550>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f08daaed550>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f08dac019d0>]], dtype=object)

	id	feat_1	feat_4	feat_5	feat_6	feat_7	feat_8	...	feat_85	feat_86	feat_87	feat_90
0	1	1	0	0	0	0	0	...	1	0	0	0
1	2	0	0	0	0	0	1	...	0	0	0	0
2	3	0	0	0	0	0	1	...	0	0	0	0
3	4	1	1	6	1	5	0	...	0	1	2	0
4	5	0	0	0	0	0	0	...	1	0	0	1

	id	feat_1	feat_2	feat_3	feat_4	feat_5	feat_6	feat_7	feat_8	feat_9	...	feat_85	feat_86	feat_87	feat_88	feat_89	feat_90	feat_91	feat_92	feat_93	target
count	61878.000000	61878.00000	61878.000000	61878.000000	61878.000000	61878.000000	61878.000000	61878.000000	61878.000000	61878.000000	...	61878.000000	61878.000000	61878.000000	61878.000000	61878.000000	61878.000000	61878.000000	61878.000000	61878.000000	61878.000000
mean	30939.500000	0.38668	0.263066	0.901467	0.779081	0.071043	0.025696	0.193704	0.662433	1.011296	...	0.532306	1.128576	0.393549	0.874915	0.457772	0.812421	0.264941	0.380119	0.126135	3.842335
std	17862.784315	1.52533	1.252073	2.934818	2.788005	0.438902	0.215333	1.030102	2.255770	3.474822	...	1.900438	2.681554	1.575455	2.115466	1.527385	4.597804	2.045646	0.982385	1.201720	2.510794
min	1.000000	0.00000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	15470.250000	0.00000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000
50%	30939.500000	0.00000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	4.000000
75%	46408.750000	0.00000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	0.000000	...	0.000000	1.000000	0.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	6.000000
max	61878.000000	61.00000	51.000000	64.000000	70.000000	19.000000	10.000000	38.000000	76.000000	43.000000	...	55.000000	65.000000	67.000000	30.000000	61.000000	130.000000	52.000000	19.000000	87.000000	8.000000

	id	feat_1	feat_2	feat_3	feat_4	feat_5	feat_6	feat_7	feat_8	feat_9	...	feat_85	feat_86	feat_87	feat_88	feat_89	feat_90	feat_91	feat_92	feat_93	target
id	1.000000	0.070691	0.185463	0.138980	1.460200e-01	0.030707	0.033410	0.136501	0.197418	-0.156082	...	-0.084507	-0.096484	0.097087	-0.215878	0.111519	0.188895	0.139078	0.131737	0.047944	0.977144
feat_1	0.070691	1.000000	0.031332	-0.027807	-2.752941e-02	0.042973	0.043603	0.298952	0.056321	-0.032285	...	-0.008739	0.107947	0.089374	0.020830	0.096851	0.010310	0.037264	0.054777	0.081783	0.072984
feat_2	0.185463	0.031332	1.000000	0.082573	1.349870e-01	0.020926	0.041343	0.222386	0.019815	-0.025630	...	-0.006764	-0.039090	0.047451	-0.047035	0.105527	0.515022	0.026383	-0.008219	0.054593	0.191739
feat_3	0.138980	-0.027807	0.082573	1.000000	5.835232e-01	0.010880	0.004288	0.001294	-0.053462	-0.063551	...	-0.048626	-0.096093	-0.009838	-0.082336	0.174781	-0.015068	-0.012417	0.066921	0.006814	0.148729
feat_4	0.146020	-0.027529	0.134987	0.583523	1.000000e+00	0.017290	0.014059	0.014490	-0.046184	-0.046250	...	-0.033153	-0.071029	0.005055	-0.067484	0.183715	0.009454	-0.010312	0.087631	0.015746	0.153458
feat_5	0.030707	0.042973	0.020926	0.010880	1.729026e-02	1.000000	0.145355	0.075047	0.035861	-0.024708	...	0.034062	0.013879	0.013999	-0.019201	0.119951	0.004842	0.012012	0.065331	0.002038	0.036034
feat_6	0.033410	0.043603	0.041343	0.004288	1.405895e-02	0.145355	1.000000	0.088014	0.012867	-0.009373	...	0.004290	0.010455	0.015256	-0.015437	0.035042	0.054034	0.012465	0.015479	0.008521	0.037116
feat_7	0.136501	0.298952	0.222386	0.001294	1.448981e-02	0.075047	0.088014	1.000000	0.038121	-0.027146	...	0.037874	-0.009169	0.089574	-0.033646	0.063511	0.129578	0.068506	-0.032261	0.034912	0.142172
feat_8	0.197418	0.056321	0.019815	-0.053462	-4.618407e-02	0.035861	0.012867	0.038121	1.000000	-0.039281	...	-0.003416	-0.029395	0.059929	-0.050931	0.007974	0.026807	0.095990	0.013608	0.005131	0.209725
feat_9	-0.156082	-0.032285	-0.025630	-0.063551	-4.624977e-02	-0.024708	-0.009373	-0.027146	-0.039281	1.000000	...	-0.031462	-0.019144	-0.016925	0.001160	-0.019147	-0.020698	-0.014742	-0.069707	-0.006038	-0.175350
feat_10	0.096127	0.097776	0.051925	0.036944	5.951396e-02	0.091324	0.041940	0.194258	-0.000023	-0.024323	...	0.086758	0.159447	0.077421	0.054635	0.061498	0.049908	0.024025	-0.006869	0.041316	0.095383
feat_11	0.179164	-0.042928	0.118534	0.596243	3.894092e-01	0.004882	0.014504	0.012418	-0.065923	-0.075820	...	-0.074293	-0.123339	-0.032969	-0.114491	0.137374	0.045074	-0.029511	0.013179	0.003326	0.189551
feat_12	0.079170	0.056934	0.090153	0.050037	5.743356e-02	0.036668	0.028588	0.056230	0.091424	-0.021885	...	0.019283	-0.007214	0.016089	-0.024324	0.082220	0.062721	0.063965	0.063922	0.012722	0.082567
feat_13	0.184629	0.139254	0.157467	0.013870	2.897317e-02	0.059081	0.036293	0.199142	0.095365	-0.040164	...	0.002594	0.004850	0.093870	-0.036259	0.062990	0.107722	0.044338	0.071953	0.038989	0.194200
feat_14	-0.346308	0.063517	-0.070057	-0.111105	-9.921490e-02	-0.037607	-0.027350	-0.044671	-0.061799	-0.110188	...	-0.021455	0.145787	-0.020229	0.323089	-0.038881	-0.060240	-0.038444	-0.040133	-0.018127	-0.365092
feat_15	-0.245562	-0.045738	-0.048798	-0.065285	-5.122155e-02	-0.007000	-0.018328	-0.035721	-0.056960	0.009858	...	0.246847	-0.002529	-0.023191	0.010840	0.029547	-0.046616	-0.034402	-0.018206	-0.020369	-0.259047
feat_16	-0.004622	0.027086	0.108046	0.221426	2.110780e-01	0.062877	0.021934	0.043957	-0.004659	-0.082664	...	0.110850	0.003610	0.077770	-0.007257	0.248364	0.016863	0.048494	0.210499	0.031467	-0.004812
feat_17	0.122884	0.053004	0.074902	-0.023093	-7.553867e-03	0.062197	0.015488	0.127245	0.173912	-0.028709	...	0.015559	0.049102	0.214221	-0.034139	0.035390	0.045218	0.088508	-0.006538	0.056695	0.132741
feat_18	0.189280	0.084856	0.242716	0.115655	2.148952e-01	0.052186	0.048710	0.098972	0.087777	-0.043642	...	-0.001555	-0.029295	0.126886	-0.035981	0.247462	0.094336	0.037275	0.126640	0.058100	0.200805
feat_19	0.127893	0.002302	0.176655	-0.012228	-3.519107e-07	-0.008556	0.038493	0.058071	0.019387	-0.000167	...	-0.008292	-0.014560	0.000412	-0.018485	0.011116	0.450925	0.004085	-0.027662	0.014243	0.131549
feat_20	0.213630	0.070511	0.449160	-0.011069	4.465657e-02	0.046200	0.057813	0.364972	0.062595	-0.023397	...	0.084570	0.016850	0.220475	0.004081	0.111231	0.370282	0.079181	-0.018715	0.110054	0.219909
feat_21	0.012886	-0.027026	0.014113	0.354925	2.329227e-01	0.003288	0.008046	-0.022908	-0.041095	-0.028409	...	-0.006180	-0.045562	-0.016862	-0.030401	0.105392	-0.033193	-0.019779	0.058008	-0.007677	0.013486
feat_22	0.190144	0.063283	0.215106	0.251082	2.477378e-01	0.075161	0.038939	0.162620	0.029032	-0.062348	...	0.044396	-0.018347	0.219974	-0.045439	0.244779	0.098595	0.104921	0.200593	0.113276	0.200545
feat_23	0.102980	0.048686	0.162065	-0.002427	3.062225e-02	0.017281	0.043651	0.186462	0.012774	0.006940	...	0.056994	0.121170	0.111837	-0.014039	0.059743	0.141869	0.010438	-0.031837	0.084945	0.104259
feat_24	0.215890	0.067255	0.253684	-0.031596	3.727726e-03	0.075222	0.082124	0.244813	0.161848	0.073618	...	-0.018990	0.015444	0.123298	-0.043479	0.023581	0.357270	0.090833	-0.024375	0.089200	0.226936
feat_25	-0.276763	0.187237	-0.096366	-0.157459	-1.342306e-01	-0.003610	-0.023319	-0.048820	-0.036939	-0.025279	...	0.021119	0.263924	-0.011294	0.207974	-0.012866	-0.088187	-0.045759	0.030135	-0.015708	-0.294079
feat_26	0.133917	-0.022813	0.064856	0.268112	3.657567e-01	0.025116	0.004680	-0.008782	-0.041599	-0.066414	...	-0.048889	-0.072464	0.015937	-0.078470	0.094521	-0.021565	-0.018447	0.199974	0.016709	0.141617
feat_27	0.097531	-0.038826	0.037841	0.508370	3.086287e-01	0.002098	0.001943	-0.015429	-0.050272	-0.042531	...	-0.046053	-0.082510	-0.028097	-0.070194	0.099536	-0.025263	-0.018778	0.023790	0.000318	0.103288
feat_28	0.128876	-0.030257	0.072494	0.551398	4.864171e-01	0.047688	0.017132	0.000998	-0.036668	-0.055545	...	-0.039784	-0.080806	-0.002941	-0.074442	0.169794	-0.021330	-0.015242	0.122653	0.005275	0.136553
feat_29	0.117175	0.069266	0.025689	-0.004141	1.427066e-02	0.065957	0.002389	0.046231	0.104985	-0.021328	...	0.013104	-0.011960	0.038800	-0.032585	0.055398	-0.000185	0.040526	0.084445	0.008301	0.122533
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
feat_65	0.064755	0.110041	0.078801	0.065492	6.228472e-02	0.228349	0.066867	0.202346	0.025544	-0.038163	...	0.066922	0.030362	0.050138	-0.011600	0.125884	0.029076	0.001188	0.044286	0.015500	0.066557
feat_66	0.174734	0.053010	0.175620	0.088017	1.296545e-01	0.048364	0.033285	0.122660	0.115175	-0.001778	...	0.031291	0.088686	0.206406	-0.000679	0.205289	0.094925	0.098063	0.123694	0.067957	0.183488
feat_67	0.186622	0.154301	0.068667	-0.110081	-8.045694e-02	0.061964	0.038289	0.148598	0.320949	0.176921	...	-0.020048	0.081445	0.295803	-0.058706	0.005220	0.089262	0.112052	-0.011247	0.129018	0.199869
feat_68	0.182707	0.014674	-0.012802	-0.030992	-2.009191e-02	0.107405	0.021619	0.040309	0.075384	-0.012192	...	-0.024620	-0.038904	0.001672	-0.059843	0.125150	-0.023839	0.022515	0.095970	-0.004602	0.190042
feat_69	0.210949	0.007544	0.307406	-0.032748	-1.446082e-02	-0.003294	0.074836	0.131430	0.046258	-0.029335	...	-0.006954	-0.025538	0.027690	-0.022918	0.011806	0.549489	0.041206	-0.037961	0.032052	0.217790
feat_70	0.043618	0.165442	0.112968	-0.018774	2.079779e-02	0.118510	0.052401	0.237907	0.023089	-0.056205	...	0.361941	0.225792	0.212133	0.140850	0.163631	0.074178	0.030560	0.007310	0.093488	0.033185
feat_71	0.129365	0.013712	-0.002336	-0.053020	-4.241268e-02	0.056428	0.011901	0.115813	0.081664	0.043286	...	0.013894	-0.015410	0.060004	-0.048676	0.076348	-0.019694	0.050622	0.000368	0.002001	0.135428
feat_72	-0.199876	-0.029983	-0.023267	-0.045339	-2.979578e-02	0.005177	-0.011090	-0.014921	-0.029868	-0.058147	...	0.294384	0.008897	0.013536	0.004066	0.057040	-0.030673	-0.008936	0.005300	-0.008233	-0.208646
feat_73	0.075773	0.140815	0.039192	-0.013972	-1.128547e-02	0.001609	0.025023	0.022819	0.028999	0.022679	...	-0.010675	-0.000841	-0.004759	-0.026363	-0.006704	0.070001	0.007193	-0.024017	-0.000163	0.078975
feat_74	0.062870	0.051365	0.070724	0.041559	4.909735e-02	0.017265	0.043160	0.053059	-0.000431	0.007594	...	-0.000453	-0.015945	0.003992	-0.025207	0.042104	0.055372	0.016941	0.004497	0.021967	0.065925
feat_75	0.199319	0.011596	0.093689	-0.044724	-3.145389e-02	0.015279	0.006951	0.039865	0.031466	-0.027313	...	-0.026329	-0.031401	0.001201	-0.058630	-0.014925	0.160418	-0.002625	-0.037710	0.006208	0.206344
feat_76	0.189897	0.153808	0.259360	-0.028670	-1.379188e-02	0.035570	0.073867	0.375114	0.081682	-0.027424	...	0.000682	0.010324	0.063411	-0.050417	0.023242	0.291884	0.175163	-0.050887	0.029426	0.199383
feat_77	0.053024	0.123752	0.014911	-0.001584	1.531773e-02	0.030462	0.006501	0.005769	0.027486	-0.020185	...	0.005602	0.020294	0.019275	-0.007396	0.021591	-0.004988	0.026376	0.076551	0.001715	0.058228
feat_78	0.104017	0.279202	0.094256	-0.021979	-1.449856e-02	0.070709	0.061250	0.567084	0.079623	-0.015922	...	0.004071	-0.018797	0.063539	-0.030010	0.014639	0.043339	0.068450	-0.028596	0.016047	0.109507
feat_79	0.162963	0.228912	0.033668	-0.020566	-1.083473e-02	0.055115	0.009942	0.066753	0.083714	-0.036116	...	0.004663	0.095254	0.099579	-0.018615	0.073207	0.031099	0.021616	0.162033	0.029082	0.170379
feat_80	0.141716	-0.013303	0.155768	0.442036	4.057725e-01	0.026223	0.017648	0.028860	-0.038382	-0.046721	...	-0.035876	-0.081888	-0.004588	-0.076250	0.350787	0.012623	-0.017815	0.063401	0.012651	0.149499
feat_81	0.078358	0.032427	0.052101	0.013089	2.828377e-02	0.129333	0.044136	0.144308	0.035102	-0.005847	...	0.054972	0.013808	0.084096	-0.017469	0.166234	0.009379	0.017243	0.018565	0.019378	0.082662
feat_82	0.113915	-0.026085	0.119109	0.438458	4.365413e-01	0.057400	0.014907	0.022059	-0.034409	-0.039806	...	-0.034368	-0.065189	-0.012153	-0.059553	0.266249	-0.001795	-0.014641	0.049661	0.005497	0.120733
feat_83	0.161417	0.059165	0.371691	-0.019914	-1.051874e-03	0.008006	0.035145	0.282069	0.033479	-0.032875	...	-0.009157	-0.029711	0.072006	-0.052930	0.035181	0.243942	0.095801	-0.018325	0.054188	0.165828
feat_84	-0.008192	0.049634	0.009845	0.011159	5.684499e-03	0.467329	0.177777	0.062634	0.005064	-0.013569	...	-0.010210	-0.003459	0.013631	-0.017903	0.103643	-0.006013	-0.003444	0.048431	0.003723	-0.003308
feat_85	-0.084507	-0.008739	-0.006764	-0.048626	-3.315343e-02	0.034062	0.004290	0.037874	-0.003416	-0.031462	...	1.000000	0.109643	0.049250	0.027886	0.053582	-0.003931	-0.023091	-0.043484	0.023390	-0.102359
feat_86	-0.096484	0.107947	-0.039090	-0.096093	-7.102916e-02	0.013879	0.010455	-0.009169	-0.029395	-0.019144	...	0.109643	1.000000	0.073685	0.426972	-0.011822	-0.019803	-0.024005	-0.049393	0.029035	-0.115179
feat_87	0.097087	0.089374	0.047451	-0.009838	5.054728e-03	0.013999	0.015256	0.089574	0.059929	-0.016925	...	0.049250	0.073685	1.000000	0.023053	0.066008	0.014696	0.028850	0.001424	0.499990	0.101345
feat_88	-0.215878	0.020830	-0.047035	-0.082336	-6.748367e-02	-0.019201	-0.015437	-0.033646	-0.050931	0.001160	...	0.027886	0.426972	0.023053	1.000000	-0.022552	-0.031679	-0.033653	-0.070120	-0.008631	-0.240481
feat_89	0.111519	0.096851	0.105527	0.174781	1.837145e-01	0.119951	0.035042	0.063511	0.007974	-0.019147	...	0.053582	-0.011822	0.066008	-0.022552	1.000000	0.027764	0.015917	0.129622	0.030650	0.113492
feat_90	0.188895	0.010310	0.515022	-0.015068	9.454061e-03	0.004842	0.054034	0.129578	0.026807	-0.020698	...	-0.003931	-0.019803	0.014696	-0.031679	0.027764	1.000000	0.014812	-0.035311	0.039864	0.195439
feat_91	0.139078	0.037264	0.026383	-0.012417	-1.031241e-02	0.012012	0.012465	0.068506	0.095990	-0.014742	...	-0.023091	-0.024005	0.028850	-0.033653	0.015917	0.014812	1.000000	0.104226	-0.000045	0.146567
feat_92	0.131737	0.054777	-0.008219	0.066921	8.763105e-02	0.065331	0.015479	-0.032261	0.013608	-0.069707	...	-0.043484	-0.049393	0.001424	-0.070120	0.129622	-0.035311	0.104226	1.000000	-0.003653	0.145157
feat_93	0.047944	0.081783	0.054593	0.006814	1.574563e-02	0.002038	0.008521	0.034912	0.005131	-0.006038	...	0.023390	0.029035	0.499990	-0.008631	0.030650	0.039864	-0.000045	-0.003653	1.000000	0.049821
target	0.977144	0.072984	0.191739	0.148729	1.534582e-01	0.036034	0.037116	0.142172	0.209725	-0.175350	...	-0.102359	-0.115179	0.101345	-0.240481	0.113492	0.195439	0.146567	0.145157	0.049821	1.000000

	feat_1	target
0	1	0
1	0	0
2	0	0
3	1	0
4	0	0
5	2	0
6	2	0
7	0	0
8	0	0
9	0	0
10	0	0
11	0	0
12	1	0
13	0	0
14	0	0
15	0	0
16	0	0
17	0	0
18	0	0
19	0	0
20	0	0
21	0	0
22	0	0
23	0	0
24	0	0
25	0	0
26	2	0
27	0	0
28	0	0
29	2	0
...	...	...
61848	0	8
61849	5	8
61850	0	8
61851	3	8
61852	0	8
61853	0	8
61854	0	8
61855	0	8
61856	0	8
61857	0	8
61858	4	8
61859	0	8
61860	0	8
61861	0	8
61862	2	8
61863	0	8
61864	0	8
61865	0	8
61866	0	8
61867	0	8
61868	0	8
61869	0	8
61870	1	8
61871	0	8
61872	0	8
61873	1	8
61874	4	8
61875	0	8
61876	1	8
61877	0	8

	id	feat_1	feat_4	feat_5	feat_6	feat_7	feat_8	...	feat_85	feat_86	feat_87	feat_90
0	1	1	0	0	0	0	0	...	1	0	0	0
1	2	0	0	0	0	0	1	...	0	0	0	0
2	3	0	0	0	0	0	1	...	0	0	0	0
3	4	1	1	6	1	5	0	...	0	1	2	0
4	5	0	0	0	0	0	0	...	1	0	0	1

	id	feat_1	feat_4	feat_5	feat_6	feat_7	feat_8	...	feat_85	feat_86	feat_87	feat_90
0	1	1	0	0	0	0	0	...	1	0	0	0
1	2	0	0	0	0	0	1	...	0	0	0	0
2	3	0	0	0	0	0	1	...	0	0	0	0
3	4	1	1	6	1	5	0	...	0	1	2	0
4	5	0	0	0	0	0	0	...	1	0	0	1