notebook.community

Edit and run



In [1]:

    
from __future__ import division
import random
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from scipy import sparse as ssp
from sklearn.utils import resample,shuffle
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import lightgbm as lgb
import config
from sklearn.model_selection import KFold, StratifiedKFold

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn import preprocessing









    



/usr/local/lib/python2.7/dist-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)



In [71]:

    
RAW_PATH=config.RAW_PATH
FEAT_PATH =config.FEAT_PATH

train = pd.read_csv(RAW_PATH+'train.csv')
train.drop(train.index[[config.ab_dup_test]], inplace=True)
train.reset_index(drop=True, inplace=True)
train_y = train['is_duplicate'].values



In [4]:

    
feat_df = ['feat_ab.csv','feature_base_lemmer.csv','feature_vect_lemmer.csv','feat_158_stpf.csv']
# feat_df = ['feat_ab.csv','feat_158_stpf.csv']

df = pd.read_csv(FEAT_PATH+'magic_feature.csv')
del df['question1'], df['question2'], df['id']
print 'feat_mag {}'.format(df.shape)

def remove_col(train):
    list1=['question1','question2','id','is_duplicate']
    for i in list1:
        if i in list(train.columns):
            del train[i]
    return train

for f in feat_df:
    df1 = pd.read_csv(FEAT_PATH+f)
    df1 = remove_col(df1)
    df = pd.concat([df, df1],axis=1)
    del df1
    gc.collect()
    print f, df.shape

# feat_ab = pd.read_csv(FEAT_PATH+'feat_ab.csv')
# del feat_ab['question1'], feat_ab['question2']
# print 'feat_ab {}'.format(feat_ab.shape)

# feature_base_close_porter = pd.read_csv(FEAT_PATH+'feature_base_close_porter.csv')
# del feature_base_close_porter['question1'], feature_base_close_porter['question2'], feature_base_close_porter['is_duplicate']
# print 'feature_base_close_porter {}'.format(feature_base_close_porter.shape)

# df = pd.concat([feat_mag, feat_ab, feature_base_close_porter], axis=1)
# print 'df {}'.format(df.shape)

# del feat_mag, feat_ab, feature_base_close_porter
gc.collect()









    



feat_mag (2750086, 26)
feat_ab.csv (2750086, 55)
feature_base_lemmer.csv (2750086, 267)
feature_vect_lemmer.csv (2750086, 388)
feat_158_stpf.csv (2750086, 404)






    Out[4]:





0



In [5]:

    
del_feat = ['q1_hash','q2_hash','q_hash_pos','q_hash_pos_1','q1_change','q2_change']
del_feat.extend(['q_change_pair','q1_q2_change_max'])
del_feat.extend(['euclidean_distance', 'jaccard_distance','RMSE_distance'])
# del_feat.extend(['freq_diff', 'q1_q2_intersect_ratio'])
del_feat.extend(list(tr_corr[abs(tr_corr['is_duplicate'])<0.01].index))

print df.shape
for i in list(df.columns):
    if i in del_feat:
        del df[i]
# df = df[use_feat]
print df.shape









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-4fb5182de654> in <module>()
      4 # del_feat.extend(['freq_diff', 'q1_q2_intersect_ratio'])
      5 
----> 6 del_feat.extend(list(tr_corr[abs(tr_corr['is_duplicate'])<0.01].index))
      7 
      8 print df.shape

NameError: name 'tr_corr' is not defined



In [5]:

    
tr_corr = df[df['is_duplicate']!=-1].corr()
# tr_corr.sort_values(by='is_duplicate',ascending=0)
abs(tr_corr).sort_values(by='is_duplicate',ascending=0)
# tr_corr[abs(tr_corr['is_duplicate'])<0.1]
# tr_corr[tr_corr.index.str.contains('pos')].sort_values(by='is_duplicate',ascending=0)









    Out[5]:






  
    
      
      is_duplicate
      q1_freq
      q2_freq
      freq_diff
      q1_q2_intersect
      q1_q2_intersect_ratio
      q1_q2_wm_ratio
      q1_pr
      q2_pr
      z_place_match
      ...
      wc_diff_unique
      wc_ratio_unique
      wc_diff_unq_stop
      wc_ratio_unique_stop
      same_start
      char_diff
      char_diff_unq_stop
      total_unique_words
      total_unq_words_stop
      char_ratio
    
  
  
    
      is_duplicate
      1.000000
      0.296621
      0.198609
      0.337501
      0.412979
      0.609256
      0.641026
      0.018429
      0.016661
      0.063175
      ...
      0.207048
      0.065786
      0.212901
      0.066541
      0.197812
      0.211750
      0.221244
      0.288730
      0.301078
      0.044247
    
    
      q1_q2_wm_ratio
      0.641026
      0.487466
      0.355246
      0.444353
      0.684574
      0.976114
      1.000000
      0.083618
      0.034415
      0.033779
      ...
      0.151437
      0.044734
      0.152581
      0.045589
      0.120138
      0.154682
      0.157205
      0.188936
      0.202187
      0.029748
    
    
      q1_q2_intersect_ratio
      0.609256
      0.490514
      0.358951
      0.428979
      0.722593
      1.000000
      0.976114
      0.065419
      0.024576
      0.033451
      ...
      0.147422
      0.043581
      0.148945
      0.045619
      0.114459
      0.150171
      0.152772
      0.178367
      0.191013
      0.029460
    
    
      word_match
      0.456173
      0.108091
      0.068038
      0.131844
      0.164976
      0.292098
      0.289983
      0.000107
      0.010960
      0.074797
      ...
      0.416946
      0.130007
      0.423744
      0.140251
      0.365136
      0.401727
      0.421170
      0.504688
      0.514084
      0.097808
    
    
      tfidf_wm
      0.426661
      0.045062
      0.013690
      0.119178
      0.097573
      0.241715
      0.240230
      0.018761
      0.029882
      0.057995
      ...
      0.389450
      0.119110
      0.390666
      0.124922
      0.310045
      0.377316
      0.394885
      0.481146
      0.492038
      0.090539
    
    
      tfidf_wm_stops
      0.424446
      0.046444
      0.014802
      0.119944
      0.099261
      0.241695
      0.240182
      0.018047
      0.029574
      0.057249
      ...
      0.380831
      0.116539
      0.384502
      0.123480
      0.293581
      0.370424
      0.389616
      0.475493
      0.490034
      0.088959
    
    
      dicedistence_n1
      0.420369
      0.085192
      0.042429
      0.084832
      0.120920
      0.237792
      0.245715
      0.006209
      0.018697
      0.072514
      ...
      0.449278
      0.141261
      0.460238
      0.148549
      0.370355
      0.441570
      0.463231
      0.561132
      0.564348
      0.110338
    
    
      q1_q2_intersect
      0.412979
      0.789134
      0.591368
      0.298225
      1.000000
      0.722593
      0.684574
      0.185699
      0.095871
      0.002180
      ...
      0.086745
      0.025830
      0.086129
      0.026942
      0.069998
      0.089146
      0.088645
      0.088004
      0.094159
      0.016798
    
    
      jaccard_n1
      0.403398
      0.053339
      0.023333
      0.093114
      0.093163
      0.207972
      0.212186
      0.012013
      0.020005
      0.062210
      ...
      0.431276
      0.129653
      0.444258
      0.137975
      0.357477
      0.425794
      0.449788
      0.544192
      0.546700
      0.100936
    
    
      compression_dist
      0.401608
      0.070227
      0.044802
      0.097175
      0.109371
      0.225148
      0.231770
      0.004483
      0.003890
      0.044859
      ...
      0.591821
      0.173829
      0.629824
      0.190559
      0.377760
      0.614931
      0.658859
      0.672904
      0.682621
      0.149767
    
    
      fuzz_token_set_ratio
      0.396250
      0.073047
      0.035621
      0.065156
      0.101685
      0.215703
      0.224983
      0.003706
      0.015883
      0.072311
      ...
      0.378203
      0.119159
      0.376923
      0.112717
      0.520594
      0.357568
      0.373778
      0.529653
      0.502431
      0.090681
    
    
      intersect_close_ratio_n1
      0.386142
      0.076081
      0.033939
      0.070294
      0.106269
      0.218804
      0.225561
      0.008511
      0.021284
      0.079721
      ...
      0.403637
      0.131768
      0.413781
      0.138550
      0.344281
      0.384914
      0.413873
      0.485864
      0.487991
      0.101267
    
    
      cosine_distance
      0.383353
      0.094392
      0.052775
      0.050551
      0.118282
      0.226984
      0.236324
      0.023955
      0.009529
      0.099250
      ...
      0.347785
      0.123468
      0.353087
      0.129088
      0.331542
      0.332364
      0.357008
      0.390766
      0.392690
      0.110689
    
    
      braycurtis_distance
      0.373533
      0.074025
      0.041368
      0.058423
      0.100352
      0.208712
      0.214944
      0.011251
      0.009287
      0.092192
      ...
      0.352645
      0.126017
      0.360732
      0.133364
      0.328306
      0.338199
      0.365095
      0.394012
      0.395781
      0.100361
    
    
      fuzz_token_sort_ratio
      0.372549
      0.053984
      0.024276
      0.082596
      0.088935
      0.198681
      0.202809
      0.012577
      0.020599
      0.069679
      ...
      0.573175
      0.193992
      0.544256
      0.187017
      0.509895
      0.561328
      0.548579
      0.530075
      0.497493
      0.163729
    
    
      fuzz_qratio
      0.370515
      0.064127
      0.042052
      0.099188
      0.106037
      0.217565
      0.219972
      0.014650
      0.006547
      0.050904
      ...
      0.574604
      0.187528
      0.544941
      0.180481
      0.545101
      0.558042
      0.546280
      0.585915
      0.555160
      0.155578
    
    
      char_distribution_ratio_std
      0.369171
      0.040780
      0.015734
      0.084054
      0.074781
      0.176830
      0.183112
      0.015559
      0.020534
      0.055955
      ...
      0.393185
      0.109461
      0.407003
      0.115226
      0.318902
      0.404992
      0.431820
      0.478934
      0.483277
      0.092460
    
    
      minkowski_distance
      0.367988
      0.058778
      0.030754
      0.069073
      0.087539
      0.197664
      0.201169
      0.001259
      0.011970
      0.088154
      ...
      0.363508
      0.117265
      0.372212
      0.125123
      0.333252
      0.350491
      0.377956
      0.423810
      0.423766
      0.088071
    
    
      cityblock_distance
      0.367678
      0.058715
      0.030465
      0.068994
      0.087468
      0.197396
      0.200907
      0.001256
      0.012152
      0.088119
      ...
      0.363716
      0.117373
      0.372440
      0.125044
      0.333383
      0.350662
      0.378038
      0.423797
      0.423710
      0.088191
    
    
      char_distribution_kl
      0.367569
      0.076128
      0.042658
      0.071124
      0.104468
      0.205766
      0.216190
      0.005751
      0.010056
      0.064252
      ...
      0.427578
      0.129746
      0.440548
      0.134910
      0.329878
      0.443522
      0.466270
      0.500613
      0.505342
      0.115182
    
    
      edit_dist
      0.363314
      0.052797
      0.031065
      0.107013
      0.098689
      0.208114
      0.208664
      0.017945
      0.011778
      0.043613
      ...
      0.455591
      0.136686
      0.463785
      0.147319
      0.394476
      0.455909
      0.475519
      0.523411
      0.520470
      0.114905
    
    
      fuzz_partial_token_sort_ratio
      0.360776
      0.052253
      0.026430
      0.071715
      0.083777
      0.187334
      0.193005
      0.009621
      0.012978
      0.066607
      ...
      0.428750
      0.124707
      0.422835
      0.124150
      0.470208
      0.416147
      0.425894
      0.532344
      0.505971
      0.097755
    
    
      fuzz_partial_ratio
      0.358234
      0.063455
      0.041912
      0.082073
      0.099348
      0.203688
      0.209226
      0.010340
      0.002210
      0.046488
      ...
      0.370820
      0.104515
      0.372347
      0.105105
      0.488419
      0.351008
      0.370044
      0.552829
      0.530615
      0.067812
    
    
      wmd
      0.357621
      0.080754
      0.051812
      0.095494
      0.123375
      0.233257
      0.230458
      0.000490
      0.005023
      0.039572
      ...
      0.411257
      0.129093
      0.433550
      0.142258
      0.387992
      0.393620
      0.421469
      0.450407
      0.450937
      0.117364
    
    
      char_distribution_cosine
      0.350929
      0.066776
      0.030565
      0.066950
      0.094656
      0.190514
      0.199702
      0.002218
      0.020482
      0.072473
      ...
      0.361426
      0.115177
      0.369945
      0.120829
      0.307911
      0.366026
      0.388538
      0.412910
      0.415369
      0.100264
    
    
      norm_wmd
      0.350185
      0.071848
      0.041669
      0.091403
      0.113215
      0.223563
      0.220018
      0.004530
      0.011780
      0.049612
      ...
      0.429874
      0.134419
      0.452854
      0.149156
      0.400584
      0.408873
      0.435077
      0.481996
      0.476311
      0.124535
    
    
      jaccard
      0.341490
      0.057484
      0.039217
      0.093256
      0.097949
      0.202614
      0.197817
      0.015144
      0.005916
      0.061770
      ...
      0.429713
      0.130990
      0.409725
      0.127537
      0.566882
      0.393118
      0.395894
      0.482611
      0.444249
      0.094577
    
    
      freq_diff
      0.337501
      0.169080
      0.100777
      1.000000
      0.298225
      0.428979
      0.444353
      0.063712
      0.056260
      0.003862
      ...
      0.067860
      0.018200
      0.067786
      0.021691
      0.040075
      0.073070
      0.071185
      0.076522
      0.078187
      0.018074
    
    
      edit_dist_agg_n3_mean_mean
      0.326339
      0.024466
      0.008742
      0.076193
      0.062855
      0.161905
      0.169505
      0.018515
      0.013620
      0.010555
      ...
      0.369987
      0.100574
      0.389232
      0.103124
      0.290440
      0.373141
      0.398077
      0.596105
      0.623957
      0.084170
    
    
      edit_dist_agg_n3_min_min
      0.326339
      0.024466
      0.008742
      0.076193
      0.062855
      0.161905
      0.169505
      0.018515
      0.013620
      0.010555
      ...
      0.369987
      0.100574
      0.389232
      0.103124
      0.290440
      0.373141
      0.398077
      0.596105
      0.623957
      0.084170
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      lsa_pn10_1
      0.012576
      0.001198
      0.011997
      0.005710
      0.022041
      0.000318
      0.003768
      0.012058
      0.000979
      0.006294
      ...
      0.011730
      0.005014
      0.014417
      0.006008
      0.018911
      0.009201
      0.007050
      0.021093
      0.027982
      0.006002
    
    
      pos_of_question1_n1_in_question2_median
      0.012323
      0.040680
      0.019722
      0.052882
      0.023614
      0.013920
      0.014394
      0.033490
      0.009789
      0.083415
      ...
      0.121917
      0.175631
      0.149237
      0.202135
      0.036631
      0.162574
      0.162411
      0.378543
      0.407656
      0.143045
    
    
      pos_of_question1_n1_in_question2_mean
      0.011189
      0.042929
      0.020687
      0.054457
      0.025476
      0.014044
      0.014350
      0.034695
      0.010248
      0.085515
      ...
      0.129234
      0.182698
      0.157260
      0.209693
      0.037232
      0.171015
      0.170485
      0.393125
      0.422616
      0.148715
    
    
      diff_len
      0.011137
      0.018938
      0.025338
      0.013381
      0.002910
      0.006019
      0.006060
      0.039278
      0.028082
      0.002180
      ...
      0.156516
      0.776571
      0.144614
      0.746947
      0.012580
      0.150253
      0.109326
      0.129764
      0.127882
      0.651576
    
    
      svd_cooc_on2_tn2_9
      0.009420
      0.013534
      0.005391
      0.001095
      0.001769
      0.001943
      0.001611
      0.014768
      0.002735
      0.003942
      ...
      0.012894
      0.003522
      0.011192
      0.003479
      0.019686
      0.010378
      0.009962
      0.005077
      0.004264
      0.002928
    
    
      lsa_cn10_q1_1
      0.008334
      0.107563
      0.065040
      0.013471
      0.138602
      0.037324
      0.032981
      0.033257
      0.000336
      0.007604
      ...
      0.013855
      0.001805
      0.014250
      0.003909
      0.008747
      0.011963
      0.009390
      0.016959
      0.021816
      0.004651
    
    
      lsa_cn10_q2_1
      0.008334
      0.107563
      0.065040
      0.013471
      0.138602
      0.037324
      0.032981
      0.033257
      0.000336
      0.007604
      ...
      0.013855
      0.001805
      0.014250
      0.003909
      0.008747
      0.011963
      0.009390
      0.016959
      0.021816
      0.004651
    
    
      lsa_wn3_q2_1
      0.008334
      0.107563
      0.065040
      0.013471
      0.138602
      0.037324
      0.032981
      0.033257
      0.000336
      0.007604
      ...
      0.013855
      0.001805
      0.014250
      0.003909
      0.008747
      0.011963
      0.009390
      0.016959
      0.021816
      0.004651
    
    
      nmf_c_question2_n5_1
      0.008195
      0.083173
      0.083755
      0.014720
      0.076449
      0.035411
      0.033630
      0.035111
      0.040087
      0.102338
      ...
      0.228009
      0.292831
      0.232863
      0.313614
      0.061320
      0.244838
      0.243706
      0.485460
      0.493997
      0.256925
    
    
      svd_cooc_on2_tn1_5
      0.007119
      0.037085
      0.026726
      0.006325
      0.027375
      0.009104
      0.010497
      0.004027
      0.006271
      0.001474
      ...
      0.002217
      0.000463
      0.001060
      0.003516
      0.001949
      0.001334
      0.001964
      0.001781
      0.000243
      0.000294
    
    
      nmf_p_n2_0
      0.006894
      0.079055
      0.099540
      0.000397
      0.074983
      0.019041
      0.017734
      0.054480
      0.083563
      0.061907
      ...
      0.048702
      0.011127
      0.040406
      0.007840
      0.017834
      0.048133
      0.046750
      0.061933
      0.051369
      0.008193
    
    
      lsa_pn10_2
      0.006817
      0.185262
      0.134210
      0.020450
      0.220081
      0.045107
      0.038408
      0.028231
      0.006104
      0.035418
      ...
      0.000562
      0.002055
      0.001679
      0.005174
      0.009103
      0.004656
      0.006761
      0.029028
      0.020454
      0.002444
    
    
      svd_cooc_on1_tn2_8
      0.006333
      0.005427
      0.016213
      0.007188
      0.004666
      0.001477
      0.005593
      0.013858
      0.017694
      0.000675
      ...
      0.002742
      0.001815
      0.006556
      0.000166
      0.000590
      0.004466
      0.006302
      0.012136
      0.019301
      0.001542
    
    
      lsa_cn10_q1_5
      0.006272
      0.145682
      0.294280
      0.008552
      0.118232
      0.032678
      0.033686
      0.090832
      0.252095
      0.107857
      ...
      0.005736
      0.007977
      0.007649
      0.010489
      0.001152
      0.000503
      0.002738
      0.017380
      0.009489
      0.008456
    
    
      lsa_cn10_q2_5
      0.006272
      0.145682
      0.294280
      0.008552
      0.118232
      0.032678
      0.033686
      0.090832
      0.252095
      0.107857
      ...
      0.005736
      0.007977
      0.007649
      0.010489
      0.001152
      0.000503
      0.002738
      0.017380
      0.009489
      0.008456
    
    
      lsa_wn3_q2_5
      0.006272
      0.145682
      0.294280
      0.008552
      0.118232
      0.032678
      0.033686
      0.090832
      0.252095
      0.107857
      ...
      0.005736
      0.007977
      0.007649
      0.010489
      0.001152
      0.000503
      0.002738
      0.017380
      0.009489
      0.008456
    
    
      pos_of_question2_n1_in_question1_mean
      0.006033
      0.046832
      0.002510
      0.063798
      0.018479
      0.002969
      0.002360
      0.055148
      0.027873
      0.077163
      ...
      0.220370
      0.211369
      0.243474
      0.236622
      0.049420
      0.254441
      0.231778
      0.456183
      0.483100
      0.181531
    
    
      edit_dist_agg_n2_max_std
      0.005927
      0.002377
      0.025246
      0.012332
      0.011621
      0.008176
      0.015845
      0.015403
      0.024549
      0.020470
      ...
      0.015716
      0.197529
      0.005400
      0.246259
      0.014387
      0.015493
      0.008581
      0.087150
      0.095353
      0.163037
    
    
      pos_of_question2_n1_in_question1_median
      0.005047
      0.043768
      0.001196
      0.062415
      0.016045
      0.002672
      0.002142
      0.053597
      0.027518
      0.075142
      ...
      0.211907
      0.202691
      0.234442
      0.227316
      0.048960
      0.245076
      0.223403
      0.440685
      0.467177
      0.174194
    
    
      lsa_wn3_q1_5
      0.004689
      0.186252
      0.152928
      0.007093
      0.108101
      0.023675
      0.028104
      0.144274
      0.120457
      0.126869
      ...
      0.007369
      0.005389
      0.004293
      0.005723
      0.008033
      0.009288
      0.010776
      0.010182
      0.006587
      0.002025
    
    
      cooc_tfidf_question1_n1_count_std
      0.004534
      0.041032
      0.064001
      0.073066
      0.047108
      0.015999
      0.015170
      0.000161
      0.052752
      0.024478
      ...
      0.019993
      0.028305
      0.018761
      0.050279
      0.021049
      0.010642
      0.006420
      0.031825
      0.039982
      0.018581
    
    
      z_q2_place_num
      0.004522
      0.005548
      0.008613
      0.005287
      0.002585
      0.005915
      0.003055
      0.008097
      0.011376
      0.738607
      ...
      0.032781
      0.043078
      0.037139
      0.043092
      0.016649
      0.042655
      0.045656
      0.084232
      0.098806
      0.041025
    
    
      nmf_p_n2_1
      0.003621
      0.026742
      0.012484
      0.011828
      0.052396
      0.014314
      0.011199
      0.036264
      0.066747
      0.012954
      ...
      0.062976
      0.018539
      0.052242
      0.015479
      0.019251
      0.061720
      0.056959
      0.080989
      0.067650
      0.014542
    
    
      nmf_w_question1_n3_0
      0.003426
      0.144577
      0.096858
      0.016395
      0.113097
      0.013826
      0.013711
      0.076627
      0.025876
      0.038849
      ...
      0.039278
      0.007077
      0.041755
      0.007526
      0.039450
      0.034794
      0.035216
      0.033604
      0.032700
      0.010026
    
    
      nmf_c_question1_n5_1
      0.002647
      0.073505
      0.064601
      0.026593
      0.072965
      0.036413
      0.034726
      0.018382
      0.018876
      0.109749
      ...
      0.175084
      0.214403
      0.187532
      0.249333
      0.057093
      0.207034
      0.219829
      0.443276
      0.453683
      0.202050
    
    
      z_q1_place_num
      0.002183
      0.009255
      0.008271
      0.003545
      0.005260
      0.006741
      0.004060
      0.011865
      0.007571
      0.749328
      ...
      0.023213
      0.031389
      0.027874
      0.037104
      0.013278
      0.033444
      0.037178
      0.075146
      0.089350
      0.025675
    
    
      edit_dist_agg_n1_max_std
      0.001549
      0.029105
      0.001893
      0.015360
      0.017830
      0.007274
      0.003805
      0.029608
      0.022281
      0.015480
      ...
      0.070055
      0.184489
      0.067459
      0.221408
      0.052199
      0.062229
      0.054836
      0.199959
      0.212620
      0.156063
    
    
      cooc_tfidf_question2_n1_count_std
      0.001358
      0.027159
      0.069295
      0.068700
      0.043920
      0.014229
      0.013165
      0.023579
      0.066267
      0.025357
      ...
      0.032098
      0.049792
      0.029457
      0.031464
      0.015366
      0.004882
      0.006462
      0.024904
      0.032381
      0.049008
    
    
      z_q2_has_place
      0.000900
      0.001412
      0.002652
      0.001977
      0.004445
      0.010539
      0.006853
      0.007198
      0.010221
      0.762324
      ...
      0.019203
      0.030487
      0.022519
      0.029478
      0.011847
      0.027166
      0.031130
      0.070461
      0.084916
      0.028749
    
    
      z_q1_has_place
      0.000180
      0.004121
      0.003831
      0.001484
      0.000121
      0.009932
      0.006590
      0.010791
      0.006724
      0.764193
      ...
      0.013340
      0.027691
      0.017229
      0.033519
      0.010078
      0.021595
      0.026155
      0.063369
      0.077690
      0.022233
    
  

392 rows × 392 columns



In [7]:

    
####################################################
########### remove high related features ###########
####################################################

ab_corr = abs(tr_corr).sort_values(by='is_duplicate',ascending=0)
corr_list = list(ab_corr.index)
for i in list(ab_corr.index):
    if i in corr_list:
        for j in corr_list:
            if ab_corr.ix[i][j]>0.9 and ab_corr.ix[i][j]<1:
                corr_list.remove(j)
                print (i,j,ab_corr.ix[i][j])
    else: pass
    
# df = df[corr_list]
# print df.shape









    



('q1_q2_wm_ratio', 'q1_q2_intersect_ratio', 0.98339464560072498)
('word_match', 'tfidf_wm', 0.93322210026153851)
('tfidf_wm_stops', 'word_match', 0.92995158765309249)
('cosine_distance', 'braycurtis_distance', 0.97916062541974491)
('cosine_distance', 'RMSE_distance', 0.94831239762790553)
('cosine_distance', 'minkowski_distance', 0.94797559168818923)
('fuzz_token_sort_ratio', 'fuzz_qratio', 0.92026621176353296)
('fuzz_token_sort_ratio', 'fuzz_partial_token_sort_ratio', 0.91308422436086456)
('euclidean_distance', 'cosine_distance', 0.94831843891180545)
('euclidean_distance', 'cityblock_distance', 0.99935660679125382)
('euclidean_distance', 'canberra_distance', 0.9563974628318419)
('wmd', 'norm_wmd', 0.97716355153362222)
('total_unq_words_stop', 'total_unique_words', 0.95390582381940558)
('char_diff_unq_stop', 'wc_diff_unq_stop', 0.90138626802904287)
('char_diff', 'char_diff_unq_stop', 0.91877313594035004)
('char_diff', 'wc_diff_unique', 0.90408376051139294)
('wc_diff', 'char_diff', 0.9231118250276108)
('len_q1', 'len_word_q1', 0.95802024672788144)
('len_q2', 'len_word_q2', 0.9675617534614811)
('wc_ratio', 'wc_ratio_unique', 0.9855730058919987)



In [5]:

    
#######################################
######### feature selection ###########
#######################################


train = df[df['is_duplicate']!=-1].copy()
train =train.replace([np.inf, -np.inf], np.nan).dropna()

# X_indices = np.arange(train.shape[-1])
# selector = SelectPercentile(f_classif, percentile=10)
# selector.fit(train, train['is_duplicate'])
# scores = -np.log10(selector.pvalues_)
# scores /= scores.max()
# plt.bar(X_indices - .45, scores, width=.2,
#         label=r'Univariate score ($-Log(p_{value})$)', color='darkorange')

full_feat = list(train.columns)
full_feat.remove('is_duplicate')
min_max_scaler = preprocessing.MinMaxScaler()
train[full_feat] = min_max_scaler.fit_transform(train[full_feat])

selector = SelectKBest(chi2, k=350)
selector.fit(train[full_feat], train['is_duplicate'])
idxs_selected = selector.get_support(indices=True)
columns_selected = train[full_feat].columns[idxs_selected]
print columns_selected
del train
gc.collect()

# df = df[list(columns_selected)+['is_duplicate']]









    



Index([u'q1_freq', u'q2_freq', u'freq_diff', u'q1_q2_intersect',
       u'q1_q2_intersect_ratio', u'q1_q2_wm_ratio', u'q1_pr', u'q2_pr',
       u'z_place_match', u'z_place_match_num',
       ...
       u'wc_diff_unique', u'wc_ratio_unique', u'wc_diff_unq_stop',
       u'wc_ratio_unique_stop', u'same_start', u'char_diff',
       u'char_diff_unq_stop', u'total_unique_words', u'total_unq_words_stop',
       u'char_ratio'],
      dtype='object', length=350)



In [6]:

    
######### 0.8 oversample ###########
test = df[df['is_duplicate']==-1].copy()
del test['is_duplicate']
train = df[df['is_duplicate']!=-1].copy()
del train['is_duplicate']
print train.shape, test.shape

X_train, X_test, y_train, y_test = train_test_split(train, train_y, test_size=0.1, random_state=4242)

#UPDownSampling
pos_train = X_train[y_train == 1]
neg_train = X_train[y_train == 0]
X_train = pd.concat((neg_train, pos_train.iloc[:int(0.8*len(pos_train))], neg_train))
y_train = np.array([0] * neg_train.shape[0] + [1] * pos_train.iloc[:int(0.8*len(pos_train))].shape[0] + [0] * neg_train.shape[0])
print(np.mean(y_train))
del pos_train, neg_train

pos_test = X_test[y_test == 1]
neg_test = X_test[y_test == 0]
X_test = pd.concat((neg_test, pos_test.iloc[:int(0.8 * len(pos_test))], neg_test))
y_test = np.array([0] * neg_test.shape[0] + [1] * pos_test.iloc[:int(0.8 * len(pos_test))].shape[0] + [0] * neg_test.shape[0])
print(np.mean(y_test))
del pos_test, neg_test









    



(404290, 46) (2345796, 46)
0.189752932122
0.189234677675



In [22]:

    
def oversample(X_ot,y,p=0.173):
    raw_num = X_ot.shape[0]
    print "RAW shape: {} | Mean rate: {}".format(X_ot.shape[0], y.mean())
    pos_ot = X_ot[y==1]
    neg_ot = X_ot[y==0]
    #p = 0.165
    scale = ((pos_ot.shape[0]*1.0 / (pos_ot.shape[0] + neg_ot.shape[0])) / p) - 1
    while scale > 1:
        neg_ot = np.vstack([neg_ot, neg_ot])
        scale -=1
    neg_ot = np.vstack([neg_ot, neg_ot[:int(scale * neg_ot.shape[0])]])
    ot = np.vstack([pos_ot, neg_ot])
    y=np.zeros(ot.shape[0])
    y[:pos_ot.shape[0]]=1.0
    print "Oversample: {} | Mean rate: {}".format(ot.shape[0],y.mean())
    return ot,y


test = df[df['is_duplicate']==-1].copy()
del test['is_duplicate']
train = df[df['is_duplicate']!=-1].copy()
del train['is_duplicate']
del df
print train.shape, test.shape

############### drop absolute duplicate rows #################
train.drop(train.index[[config.ab_dup_test]], inplace=True)
train.reset_index(drop=True, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(train, train_y, test_size=0.1, random_state=1048)

X_train,y_train = oversample(X_train,y_train,p=0.1742)
X_test,y_test = oversample(X_test,y_test,p=0.1742)
X_train,y_train = shuffle(X_train,y_train,random_state=421)  
print X_train.shape, y_train.shape
gc.collect()



In [10]:

    
def xgb_train(n=2500,params=False):
    if not params:
        params = {}
        params['base_score'] = 0.2
        params['objective'] = 'binary:logistic'
        params['eval_metric'] = 'logloss'
        params['eta'] = 0.02
        params['max_depth'] = 7
        params['subsample'] = 0.6
        params['nthread'] = 16
        # params['colsample_bytree'] = 0.6 
        # params['gamma'] = 0.1 
        # params['min_child_weight'] = 5
        # params['scale_pos_weight'] = 0.2
    print params

    d_train = xgb.DMatrix(X_train, label=y_train)
    d_valid = xgb.DMatrix(X_test, label=y_test)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    bst = xgb.train(params, d_train, n, watchlist, early_stopping_rounds=50, verbose_eval=100)
    train_loss_str = log_loss(y_train, bst.predict(d_train))
    valid_loss_str = log_loss(y_test, bst.predict(d_valid))
    sub_stamp = 'xgb_t%s_v%s'%(str(train_loss_str)[2:6], str(valid_loss_str)[2:6])
    
    print 'train logloss: %s' %(train_loss_str)
    print 'valid logloss: %s' %(valid_loss_str)
    print 'best_ntree_limit %s' %(bst.best_ntree_limit)
    
    return bst, bst.best_ntree_limit, sub_stamp, params



In [11]:

    
params_opt = {'colsample_bytree': 0.7852290495822306, 'eval_metric': 'logloss', 'max_delta_step': 2.0, 'nthread': 16, 
          'min_child_weight': 3.0, 'subsample': 0.8071297738930207, 'base_score': 0.2, 
          'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 11, 'gamma': 0.1, 'lambda': 100}
params_opt['eta'] = 0.02

bst, BEST_NTREE, sub_stamp, PARAMS = xgb_train(n=5000,params=params_opt)

bst.save_model(config.SUB_PATH + sub_stamp+'.mdl')
bst = xgb.Booster(PARAMS)
bst.load_model(config.SUB_PATH + sub_stamp+'.mdl')

d_test = xgb.DMatrix(test)
p_test = bst.predict(d_test, ntree_limit=BEST_NTREE)

sub = pd.DataFrame()
df_test = pd.read_csv(config.RAW_PATH+'test.csv')
sub['test_id'] = df_test['test_id']
sub['is_duplicate'] = p_test
sub.to_csv(config.SUB_PATH + sub_stamp+'.csv', index=False)

del df_test
gc.collect()









    



{'eval_metric': 'logloss', 'max_delta_step': 2.0, 'base_score': 0.2, 'alpha': 10, 'colsample_bytree': 0.7852290495822306, 'nthread': 16, 'min_child_weight': 3.0, 'subsample': 0.8071297738930207, 'eta': 0.02, 'objective': 'binary:logistic', 'max_depth': 11, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.50341	valid-logloss:0.50313
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[100]	train-logloss:0.219269	valid-logloss:0.227403
[200]	train-logloss:0.18408	valid-logloss:0.196613
[300]	train-logloss:0.172434	valid-logloss:0.188292
[400]	train-logloss:0.164398	valid-logloss:0.184019
[500]	train-logloss:0.157704	valid-logloss:0.181411
[600]	train-logloss:0.151349	valid-logloss:0.17949
[700]	train-logloss:0.145556	valid-logloss:0.178007
[800]	train-logloss:0.140086	valid-logloss:0.176809
[900]	train-logloss:0.134905	valid-logloss:0.175897
[1000]	train-logloss:0.130165	valid-logloss:0.175175
[1100]	train-logloss:0.125414	valid-logloss:0.174495
[1200]	train-logloss:0.120968	valid-logloss:0.174038
[1300]	train-logloss:0.116844	valid-logloss:0.173565
[1400]	train-logloss:0.11285	valid-logloss:0.173223
[1500]	train-logloss:0.109094	valid-logloss:0.172909
[1600]	train-logloss:0.105561	valid-logloss:0.172716
[1700]	train-logloss:0.102334	valid-logloss:0.172533
[1800]	train-logloss:0.099147	valid-logloss:0.172337
[1900]	train-logloss:0.096148	valid-logloss:0.172187
[2000]	train-logloss:0.093203	valid-logloss:0.172082
[2100]	train-logloss:0.090359	valid-logloss:0.171944
[2200]	train-logloss:0.087646	valid-logloss:0.171907
Stopping. Best iteration:
[2209]	train-logloss:0.087436	valid-logloss:0.171888

train logloss: 0.0860946948267
valid logloss: 0.171893311453
best_ntree_limit 2210






    Out[11]:





82



In [11]:

    
####### lightgbm ######

def lgb_train(params, n_inter=200):
    d_train = lgb.Dataset(X_train, label=y_train)
    d_valid = lgb.Dataset(X_test, label=y_test)
    watchlist = [d_train, d_valid]
    bst = lgb.train(params, d_train, n_inter, watchlist, early_stopping_rounds=30, verbose_eval=100)
    train_loss_str = log_loss(y_train, bst.predict(X_train))
    valid_loss_str = log_loss(y_test, bst.predict(X_test))
    sub_stamp = 'lgb_t%s_v%s'%(str(train_loss_str)[2:6], str(valid_loss_str)[2:6])
    num_iteration=bst.best_iteration

    print 'train logloss: %s' %(train_loss_str)
    print 'valid logloss: %s' %(valid_loss_str)
    return bst, num_iteration, sub_stamp

params_lgb = {}
params_lgb['learning_rate'] = 0.05
params_lgb['boosting_type'] = 'dart'
params_lgb['objective'] = 'binary'
params_lgb['metric'] = 'binary_logloss'
params_lgb['feature_fraction'] = 0.7
params_lgb['bagging_fraction'] = 0.7
params_lgb['num_leaves'] = 256
params_lgb['max_depth'] = 8
params_lgb['min_data_in_leaf'] = 1
params_lgb['min_data'] = 50
params_lgb['min_hessian'] = 1
params_lgb['is_unbalance'] = True

bst, num_iteration, sub_stamp = lgb_train(params_lgb, n_inter=2000)
p_test = bst.predict(test, num_iteration=num_iteration)

sub = pd.DataFrame()
df_test = pd.read_csv(config.RAW_PATH+'test.csv')
sub['test_id'] = df_test['test_id']
sub['is_duplicate'] = p_test
sub.to_csv(config.SUB_PATH + sub_stamp+'.csv', index=False)

del df_test
gc.collect()

# lgb.train(params, train_set, num_boost_round=100, valid_sets=None, valid_names=None, fobj=None, feval=None, 
#           init_model=None, feature_name='auto', categorical_feature='auto', early_stopping_rounds=None, 
#           evals_result=None, verbose_eval=True, learning_rates=None, callbacks=None)









    



Train until valid scores didn't improve in 30 rounds.
[100]	training's binary_logloss: 0.258224	valid_1's binary_logloss: 0.283031
[200]	training's binary_logloss: 0.21013	valid_1's binary_logloss: 0.247885
[300]	training's binary_logloss: 0.180039	valid_1's binary_logloss: 0.231944
[400]	training's binary_logloss: 0.158238	valid_1's binary_logloss: 0.223963
[500]	training's binary_logloss: 0.14591	valid_1's binary_logloss: 0.220252
Early stopping, best iteration is:
[526]	training's binary_logloss: 0.140657	valid_1's binary_logloss: 0.218957
train logloss: 0.146522590924
valid logloss: 0.220318533714






    Out[11]:





7



In [7]:

    
############# oof ############
def xgb_train(n=2500,params=False):
    if not params:
        params = {}
        params['base_score'] = 0.2
        params['objective'] = 'binary:logistic'
        params['eval_metric'] = 'logloss'
        params['eta'] = 0.02
        params['max_depth'] = 7
        params['subsample'] = 0.75
        params['nthread'] = 16
        params['colsample_bytree'] = 0.7 
        params['gamma'] = 0.1 
        params['min_child_weight'] = 3
        # params['scale_pos_weight'] = 0.2
        params['max_delta_step'] = 2

    print params

    d_train = xgb.DMatrix(X_train, label=y_train)
    d_valid = xgb.DMatrix(X_test, label=y_test)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    bst = xgb.train(params, d_train, n, watchlist, early_stopping_rounds=50, verbose_eval=200)
    train_loss_str = log_loss(y_train, bst.predict(d_train))
    valid_loss_str = log_loss(y_test, bst.predict(d_valid))
    sub_stamp = 'xgb_t%s_v%s'%(str(train_loss_str)[2:6], str(valid_loss_str)[2:6])
    print sub_stamp
    print 'train logloss: %s' %(train_loss_str)
    print 'valid logloss: %s' %(valid_loss_str)
    print 'best_ntree_limit: %s' %(bst.best_ntree_limit)
    oof_valid = bst.predict(d_valid, ntree_limit = bst.best_ntree_limit)
    return bst, bst.best_ntree_limit, sub_stamp, params, oof_valid


# random_seed = config.oof_random
random_seed = 1988


test_array = np.zeros(test.shape[0],dtype='float32')
oof_array = []

kf = StratifiedKFold(n_splits=5, random_state=random_seed, shuffle=True)
for train_idx, valid_idx in kf.split(train, y=train_y):
    X_train, X_test, y_train, y_test = train.loc[train_idx,:], train.loc[valid_idx,:], train_y[train_idx],train_y[valid_idx]
    print X_train.shape, y_train.shape
    X_train,y_train = oversample(X_train,y_train,p=0.1742)
    X_test,y_test = oversample(X_test,y_test,p=0.1742)
    X_train,y_train = shuffle(X_train,y_train,random_state=42)  
    print X_train.shape, y_train.shape
    bst, best_ntree_limit, sub_stamp, params, oof_valid = xgb_train(n=3000)
    oof_array.extend(list(oof_valid))
    
    bst.save_model(config.SUB_PATH + sub_stamp+'_oof'+'.mdl')
    bst = xgb.Booster(params)
    bst.load_model(config.SUB_PATH + sub_stamp+'_oof'+'.mdl')
    d_test = xgb.DMatrix(test)
    p_test = bst.predict(d_test, ntree_limit = best_ntree_limit)
    test_array = test_array + p_test
    print '='*20

test_array = test_array / 5.0
sub = pd.DataFrame()
df_test = pd.read_csv(config.RAW_PATH+'test.csv')
sub['test_id'] = df_test['test_id']
sub['is_duplicate'] = test_array
sub.to_csv(config.SUB_PATH + 'oof_xgb_test.csv',index=False)

sub = pd.DataFrame()
sub['xgb'] = oof_array
sub.to_csv(config.SUB_PATH + 'oof_xgb_valid.csv',index=False)


del df_test
gc.collect()









    



(323402, 403) (323402,)
RAW shape: 323402 | Mean rate: 0.369230864373
Oversample: 576180 | Mean rate: 0.207244263945
RAW shape: 80852 | Mean rate: 0.369230198387
Oversample: 144047 | Mean rate: 0.207244857581
(576180, 403) (576180,)
{'colsample_bytree': 0.7, 'eval_metric': 'logloss', 'max_delta_step': 2, 'nthread': 16, 'base_score': 0.2, 'subsample': 0.75, 'eta': 0.02, 'min_child_weight': 3, 'objective': 'binary:logistic', 'max_depth': 7, 'gamma': 0.1}
[0]	train-logloss:0.503596	valid-logloss:0.50362
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[200]	train-logloss:0.189948	valid-logloss:0.196425
[400]	train-logloss:0.170875	valid-logloss:0.183451
[600]	train-logloss:0.159765	valid-logloss:0.178592
[800]	train-logloss:0.151321	valid-logloss:0.176105
[1000]	train-logloss:0.144417	valid-logloss:0.174563
[1200]	train-logloss:0.138119	valid-logloss:0.173543
[1400]	train-logloss:0.132505	valid-logloss:0.17275
[1600]	train-logloss:0.127264	valid-logloss:0.172054
[1800]	train-logloss:0.122337	valid-logloss:0.171545
[2000]	train-logloss:0.117792	valid-logloss:0.171192
[2200]	train-logloss:0.113222	valid-logloss:0.170801
[2400]	train-logloss:0.109032	valid-logloss:0.170515
[2600]	train-logloss:0.104877	valid-logloss:0.170257
[2800]	train-logloss:0.101069	valid-logloss:0.170069
Stopping. Best iteration:
[2929]	train-logloss:0.098683	valid-logloss:0.170014

xgb_t0978_v1700
train logloss: 0.0978111479097
valid logloss: 0.170015492358
best_ntree_limit: 2930
====================
(323403, 403) (323403,)
RAW shape: 323403 | Mean rate: 0.369229722668
Oversample: 576180 | Mean rate: 0.207244263945
RAW shape: 80851 | Mean rate: 0.369234765185
Oversample: 144047 | Mean rate: 0.207244857581
(576180, 403) (576180,)
{'colsample_bytree': 0.7, 'eval_metric': 'logloss', 'max_delta_step': 2, 'nthread': 16, 'base_score': 0.2, 'subsample': 0.75, 'eta': 0.02, 'min_child_weight': 3, 'objective': 'binary:logistic', 'max_depth': 7, 'gamma': 0.1}
[0]	train-logloss:0.503585	valid-logloss:0.503638
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[200]	train-logloss:0.190597	valid-logloss:0.195747
[400]	train-logloss:0.171292	valid-logloss:0.181869
[600]	train-logloss:0.160194	valid-logloss:0.17668
[800]	train-logloss:0.15158	valid-logloss:0.174097
[1000]	train-logloss:0.144635	valid-logloss:0.172601
[1200]	train-logloss:0.138357	valid-logloss:0.171387
[1400]	train-logloss:0.132654	valid-logloss:0.170562
[1600]	train-logloss:0.1274	valid-logloss:0.169941
[1800]	train-logloss:0.122559	valid-logloss:0.169412
[2000]	train-logloss:0.118132	valid-logloss:0.169058
[2200]	train-logloss:0.113696	valid-logloss:0.168664
[2400]	train-logloss:0.109539	valid-logloss:0.168402
[2600]	train-logloss:0.105416	valid-logloss:0.168179
[2800]	train-logloss:0.101578	valid-logloss:0.16807
xgb_t0977_v1679
train logloss: 0.097758615212
valid logloss: 0.167935983843
best_ntree_limit: 2999
====================
(323403, 403) (323403,)
RAW shape: 323403 | Mean rate: 0.369229722668
Oversample: 576180 | Mean rate: 0.207244263945
RAW shape: 80851 | Mean rate: 0.369234765185
Oversample: 144047 | Mean rate: 0.207244857581
(576180, 403) (576180,)
{'colsample_bytree': 0.7, 'eval_metric': 'logloss', 'max_delta_step': 2, 'nthread': 16, 'base_score': 0.2, 'subsample': 0.75, 'eta': 0.02, 'min_child_weight': 3, 'objective': 'binary:logistic', 'max_depth': 7, 'gamma': 0.1}
[0]	train-logloss:0.503603	valid-logloss:0.503628
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[200]	train-logloss:0.190238	valid-logloss:0.196067
[400]	train-logloss:0.171149	valid-logloss:0.182294
[600]	train-logloss:0.159929	valid-logloss:0.177168
[800]	train-logloss:0.151603	valid-logloss:0.174466
[1000]	train-logloss:0.144687	valid-logloss:0.17276
[1200]	train-logloss:0.138722	valid-logloss:0.171589
[1400]	train-logloss:0.133166	valid-logloss:0.170716
[1600]	train-logloss:0.128086	valid-logloss:0.170038
[1800]	train-logloss:0.123255	valid-logloss:0.169475
[2000]	train-logloss:0.11831	valid-logloss:0.168978
[2200]	train-logloss:0.113997	valid-logloss:0.168551
[2400]	train-logloss:0.109733	valid-logloss:0.168237
[2600]	train-logloss:0.105741	valid-logloss:0.167985
[2800]	train-logloss:0.101963	valid-logloss:0.167726
xgb_t0982_v1675
train logloss: 0.098234591892
valid logloss: 0.167548654817
best_ntree_limit: 3000
====================
(323404, 403) (323404,)
RAW shape: 323404 | Mean rate: 0.369231673078
Oversample: 576185 | Mean rate: 0.207244201081
RAW shape: 80850 | Mean rate: 0.369226963513
Oversample: 144042 | Mean rate: 0.207245109065
(576185, 403) (576185,)
{'colsample_bytree': 0.7, 'eval_metric': 'logloss', 'max_delta_step': 2, 'nthread': 16, 'base_score': 0.2, 'subsample': 0.75, 'eta': 0.02, 'min_child_weight': 3, 'objective': 'binary:logistic', 'max_depth': 7, 'gamma': 0.1}
[0]	train-logloss:0.503513	valid-logloss:0.503563
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[200]	train-logloss:0.189973	valid-logloss:0.197182
[400]	train-logloss:0.170949	valid-logloss:0.183589
[600]	train-logloss:0.159742	valid-logloss:0.178529
[800]	train-logloss:0.151519	valid-logloss:0.175952
[1000]	train-logloss:0.144369	valid-logloss:0.174411
[1200]	train-logloss:0.13786	valid-logloss:0.173227
[1400]	train-logloss:0.132026	valid-logloss:0.172338
[1600]	train-logloss:0.126929	valid-logloss:0.171714
[1800]	train-logloss:0.122152	valid-logloss:0.171227
[2000]	train-logloss:0.117607	valid-logloss:0.170758
[2200]	train-logloss:0.113114	valid-logloss:0.170377
[2400]	train-logloss:0.10904	valid-logloss:0.170126
[2600]	train-logloss:0.105079	valid-logloss:0.169899
[2800]	train-logloss:0.101117	valid-logloss:0.169726
xgb_t0974_v1695
train logloss: 0.0974404333264
valid logloss: 0.169514171229
best_ntree_limit: 3000
====================
(323404, 403) (323404,)
RAW shape: 323404 | Mean rate: 0.369231673078
Oversample: 576185 | Mean rate: 0.207244201081
RAW shape: 80850 | Mean rate: 0.369226963513
Oversample: 144042 | Mean rate: 0.207245109065
(576185, 403) (576185,)
{'colsample_bytree': 0.7, 'eval_metric': 'logloss', 'max_delta_step': 2, 'nthread': 16, 'base_score': 0.2, 'subsample': 0.75, 'eta': 0.02, 'min_child_weight': 3, 'objective': 'binary:logistic', 'max_depth': 7, 'gamma': 0.1}
[0]	train-logloss:0.503519	valid-logloss:0.503537
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[200]	train-logloss:0.190184	valid-logloss:0.195659
[400]	train-logloss:0.171194	valid-logloss:0.182356
[600]	train-logloss:0.160483	valid-logloss:0.177624
[800]	train-logloss:0.151943	valid-logloss:0.175074
[1000]	train-logloss:0.144739	valid-logloss:0.173407
[1200]	train-logloss:0.138599	valid-logloss:0.17234
[1400]	train-logloss:0.13311	valid-logloss:0.171536
[1600]	train-logloss:0.127733	valid-logloss:0.170858
[1800]	train-logloss:0.122723	valid-logloss:0.170369
[2000]	train-logloss:0.117977	valid-logloss:0.169907
[2200]	train-logloss:0.113505	valid-logloss:0.169588
[2400]	train-logloss:0.109178	valid-logloss:0.16927
[2600]	train-logloss:0.105341	valid-logloss:0.169039
[2800]	train-logloss:0.10157	valid-logloss:0.168889
xgb_t0979_v1687
train logloss: 0.0979464055983
valid logloss: 0.16870307632
best_ntree_limit: 2993
====================






    Out[7]:





154



In [40]:

    
###################################################
############### train on full dataset #############
###################################################

X_train, X_test, y_train, y_test = train_test_split(train, train_y, test_size=0.2, random_state=1024)

X_train,y_train = oversample(X_train,y_train,p=0.173)
X_test,y_test = oversample(X_test,y_test,p=0.173)

X_train = np.concatenate((X_train, X_test), axis=0)#.shape
y_train = np.concatenate((y_train, y_test), axis=0)#.shape

X_train,y_train = shuffle(X_train,y_train,random_state=42)  
print X_train.shape, y_train.shape

d_train = xgb.DMatrix(X_train, label=y_train)
bst = xgb.train(params, d_train, 500, verbose_eval=30)
print(log_loss(y_test, bst.predict(d_valid)))









    



RAW shape: 323432 | Mean rate: 0.368924534369
Oversample: 581635 | Mean rate: 0.205149277468
RAW shape: 80858 | Mean rate: 0.370291127656
Oversample: 146073 | Mean rate: 0.204972856038
(727708, 207) (727708,)
0.185475337464



In [9]:

    
features = list(train.columns.values)
print("Features: {}".format(len(features)))
print("Features importances...")
mapFeat = dict(zip(["f"+str(i) for i in range(len(features))],features))
# ft.index = ft.reset_index()['index'].map(mapFeat)

ft = pd.Series(bst.get_fscore())

ft = pd.DataFrame(ft).reset_index().rename(columns={'index': 'feature',0:"fscore"}).sort_values(by='fscore',ascending=1)
# ft['feature'] = ft['feature'].map(mapFeat)
ft.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(10, 25))









    



Features: 393
Features importances...






    Out[9]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fce2ad260d0>



In [99]:

    
def create_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    i = 0
    for feat in features:
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
        i = i + 1
    outfile.close()

feature_names = list(train.columns.values)
create_feature_map(feature_names)
print("Features: {}".format(len(feature_names)))
print("Features importances...")
import operator
importance = bst.get_fscore(fmap='xgb.fmap')
importance = sorted(importance.items(), key=operator.itemgetter(1))
ft = pd.DataFrame(importance, columns=['feature', 'fscore'])

ft.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(10, 25))









    



Features: 33
Features importances...






    Out[99]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f3d2a9d2750>



In [24]:

    
ft_corr = tr_corr.reset_index()
ft_corr = ft_corr.rename(columns={'index': 'feature'})[['feature','is_duplicate']]
ft_corr = ft_corr.merge(ft,how='left')
ft_corr.sort_values(by='fscore',ascending=0)









    Out[24]:






  
    
      
      feature
      is_duplicate
      fscore
    
  
  
    
      22
      common_words
      0.232293
      1537.0
    
    
      31
      norm_wmd
      -0.350185
      1444.0
    
    
      14
      q1_q2_wm_ratio
      0.645999
      1148.0
    
    
      13
      q1_q2_intersect_ratio
      0.610168
      1106.0
    
    
      30
      wmd
      -0.357621
      1073.0
    
    
      51
      edit_dist
      -0.335208
      938.0
    
    
      5
      freq_diff
      -0.332427
      816.0
    
    
      28
      fuzz_token_set_ratio
      0.396250
      806.0
    
    
      25
      fuzz_partial_ratio
      0.358234
      802.0
    
    
      12
      q1_q2_intersect
      0.432860
      706.0
    
    
      23
      fuzz_qratio
      0.370515
      669.0
    
    
      4
      q2_freq
      0.265540
      649.0
    
    
      32
      cosine_distance
      -0.383353
      588.0
    
    
      3
      q1_freq
      0.343747
      560.0
    
    
      29
      fuzz_token_sort_ratio
      0.372549
      557.0
    
    
      50
      compression_dist
      -0.365613
      554.0
    
    
      178
      norm_pos_of_question1_n1_in_question2_mean
      0.146704
      531.0
    
    
      168
      norm_pos_of_question2_n1_in_question1_mean
      0.138984
      513.0
    
    
      130
      cooc_tfidf_question1_n2_count_max
      0.215381
      500.0
    
    
      125
      cooc_tfidf_question2_n2_count_max
      0.216392
      487.0
    
    
      42
      kur_q2vec
      0.015286
      453.0
    
    
      111
      cooc_tfidf_question1_n1_count_std
      0.105391
      444.0
    
    
      143
      bm25_question2_n1_mean
      0.287934
      440.0
    
    
      181
      norm_pos_of_question1_n1_in_question2_std
      0.221486
      421.0
    
    
      180
      norm_pos_of_question1_n1_in_question2_max
      0.252094
      416.0
    
    
      148
      bm25_question1_n1_mean
      0.289771
      412.0
    
    
      103
      cooc_tfidf_question2_n1_count_mean
      0.242006
      410.0
    
    
      171
      norm_pos_of_question2_n1_in_question1_std
      0.220216
      406.0
    
    
      44
      jaccard_n1
      0.336347
      402.0
    
    
      15
      len_q1
      -0.171079
      400.0
    
    
      ...
      ...
      ...
      ...
    
    
      43
      RMSE_distance
      -0.368021
      49.0
    
    
      185
      pos_of_question2_n2_in_question1_max
      0.116917
      49.0
    
    
      162
      pos_of_question2_n1_in_question1_min
      -0.128058
      49.0
    
    
      208
      cooccurrence_close_count_n1
      0.101425
      48.0
    
    
      66
      edit_dist_agg_n2_min_mean
      -0.264122
      45.0
    
    
      152
      bm25_question2_n2_min
      0.063232
      44.0
    
    
      195
      pos_of_question1_n2_in_question2_max
      0.143764
      42.0
    
    
      165
      pos_of_question2_n1_in_question1_max
      -0.052749
      39.0
    
    
      132
      cooc_tfidf_question2_n2_ratio_min
      0.051844
      34.0
    
    
      210
      cooccurrence_close_count_n2
      0.166876
      30.0
    
    
      58
      edit_dist_agg_n2_max_max
      -0.028882
      26.0
    
    
      137
      cooc_tfidf_question1_n2_ratio_min
      0.054437
      25.0
    
    
      212
      cooccurrence_close_count_n3
      0.143607
      18.0
    
    
      157
      bm25_question1_n2_min
      0.068174
      17.0
    
    
      34
      jaccard_distance
      -0.131714
      16.0
    
    
      99
      edit_dist_agg_n3_mean_min
      -0.251771
      15.0
    
    
      127
      cooc_tfidf_question1_n2_count_min
      0.073408
      11.0
    
    
      74
      edit_dist_agg_n2_mean_min
      -0.264122
      9.0
    
    
      101
      edit_dist_agg_n3_mean_mean
      -0.251771
      9.0
    
    
      76
      edit_dist_agg_n2_mean_mean
      -0.264122
      4.0
    
    
      0
      is_duplicate
      1.000000
      NaN
    
    
      1
      q1_hash
      -0.207682
      NaN
    
    
      2
      q2_hash
      -0.356072
      NaN
    
    
      6
      q_hash_pos
      0.123509
      NaN
    
    
      7
      q_hash_pos_1
      0.207493
      NaN
    
    
      8
      q2_change
      -0.361354
      NaN
    
    
      9
      q1_change
      -0.259241
      NaN
    
    
      10
      q1_q2_change_max
      -0.369983
      NaN
    
    
      11
      q_change_pair
      0.422098
      NaN
    
    
      26
      fuzz_partial_token_set_ratio
      0.156094
      NaN
    
  

216 rows × 3 columns



In [ ]:

    
{'colsample_bytree': 0.7852290495822306, 'eval_metric': 'logloss', 'max_delta_step': 2.0, 'nthread': 16, 'min_child_weight': 3.0, 'subsample': 0.8071297738930207, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 11, 'gamma': 0.1, 'lambda': 100}



In [8]:

    
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_test, label=y_test)

def objective(space):
    params = {}    
    params['max_depth'] = int(space['max_depth'])
    params['subsample'] = space['subsample']
    params['colsample_bytree'] = space['colsample_bytree'] 
    params['gamma'] = space['gamma']
    params['lambda'] = space['lambda']
    params['alpha'] = space['alpha']
    params['min_child_weight'] = space['min_child_weight']
    params['max_delta_step'] = space['max_delta_step']
    params['objective'] = 'binary:logistic'
    params['eval_metric'] = 'logloss'
    params['eta'] = 0.1
    params['nthread'] = 16
    params['base_score'] = 0.2
    print params
#     params['scale_pos_weight'] = 0.2
    
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    bst = xgb.train(params, d_train, 2000, watchlist, early_stopping_rounds=30, verbose_eval=500)
    logloss = log_loss(y_test, bst.predict(d_valid))

    return{'loss':logloss, 'status': STATUS_OK }


space ={
        'max_depth': hp.quniform("max_depth", 4, 15, 1),
        'min_child_weight': hp.quniform ('min_child_weight', 1, 10, 1),
        'subsample': hp.uniform ('subsample', 0.1, 0.9),
        'colsample_bytree':hp.uniform('colsample_bytree',0.1,0.9),
         'lambda':hp.choice('lambda',[1e-5, 1e-2, 0.05, 0.1, 1, 10, 100]),
         'alpha':hp.choice('alpha',[0, 0.001, 0.005, 0.01, 0.05,0.1,1,10]),
        'gamma':hp.choice('gamma',[0, 0.1,0.2,0.3,0.4,0.5]),
        'max_delta_step':hp.quniform('max_delta_step', 1, 10, 1)
        }
    

trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)

print best









    



{'colsample_bytree': 0.6215866306140633, 'eval_metric': 'logloss', 'max_delta_step': 9.0, 'nthread': 16, 'min_child_weight': 5.0, 'subsample': 0.2046354209730657, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.001, 'max_depth': 4, 'gamma': 0.4, 'lambda': 10}
[0]	train-logloss:0.457755	valid-logloss:0.458087
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.169888	valid-logloss:0.183592
Stopping. Best iteration:
[839]	train-logloss:0.160773	valid-logloss:0.181412

{'colsample_bytree': 0.4942310527676672, 'eval_metric': 'logloss', 'max_delta_step': 8.0, 'nthread': 16, 'min_child_weight': 8.0, 'subsample': 0.6688748954090619, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.005, 'max_depth': 7, 'gamma': 0, 'lambda': 0.1}
[0]	train-logloss:0.452926	valid-logloss:0.4535
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.119825	valid-logloss:0.175848
Stopping. Best iteration:
[487]	train-logloss:0.121232	valid-logloss:0.175664

{'colsample_bytree': 0.2262418681771136, 'eval_metric': 'logloss', 'max_delta_step': 10.0, 'nthread': 16, 'min_child_weight': 5.0, 'subsample': 0.6400081544667823, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.1, 'max_depth': 14, 'gamma': 0.1, 'lambda': 0.1}
[0]	train-logloss:0.449829	valid-logloss:0.452885
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[126]	train-logloss:0.07523	valid-logloss:0.181878

{'colsample_bytree': 0.6524112525169689, 'eval_metric': 'logloss', 'max_delta_step': 8.0, 'nthread': 16, 'min_child_weight': 8.0, 'subsample': 0.3723787100850201, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0, 'max_depth': 14, 'gamma': 0.4, 'lambda': 1e-05}
[0]	train-logloss:0.448619	valid-logloss:0.450279
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[108]	train-logloss:0.109754	valid-logloss:0.18066

{'colsample_bytree': 0.16650529293613403, 'eval_metric': 'logloss', 'max_delta_step': 8.0, 'nthread': 16, 'min_child_weight': 8.0, 'subsample': 0.6162541363164695, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.001, 'max_depth': 10, 'gamma': 0.4, 'lambda': 1}
[0]	train-logloss:0.483575	valid-logloss:0.483906
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[335]	train-logloss:0.093127	valid-logloss:0.176843

{'colsample_bytree': 0.2641676058323803, 'eval_metric': 'logloss', 'max_delta_step': 5.0, 'nthread': 16, 'min_child_weight': 7.0, 'subsample': 0.8773616833314521, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0, 'max_depth': 12, 'gamma': 0, 'lambda': 10}
[0]	train-logloss:0.453449	valid-logloss:0.454672
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[195]	train-logloss:0.094001	valid-logloss:0.176301

{'colsample_bytree': 0.8722642189983841, 'eval_metric': 'logloss', 'max_delta_step': 2.0, 'nthread': 16, 'min_child_weight': 5.0, 'subsample': 0.7678583799309385, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 9, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.476353	valid-logloss:0.476386
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.106879	valid-logloss:0.173537
Stopping. Best iteration:
[540]	train-logloss:0.102536	valid-logloss:0.173374

{'colsample_bytree': 0.5681441209744663, 'eval_metric': 'logloss', 'max_delta_step': 4.0, 'nthread': 16, 'min_child_weight': 8.0, 'subsample': 0.3793264896708848, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.001, 'max_depth': 11, 'gamma': 0.2, 'lambda': 0.05}
[0]	train-logloss:0.456586	valid-logloss:0.457352
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[172]	train-logloss:0.114973	valid-logloss:0.18001

{'colsample_bytree': 0.6585666437547426, 'eval_metric': 'logloss', 'max_delta_step': 7.0, 'nthread': 16, 'min_child_weight': 6.0, 'subsample': 0.12249883737053519, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.1, 'max_depth': 10, 'gamma': 0.3, 'lambda': 0.05}
[0]	train-logloss:0.45133	valid-logloss:0.452029
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[120]	train-logloss:0.154732	valid-logloss:0.18615

{'colsample_bytree': 0.42964353040548475, 'eval_metric': 'logloss', 'max_delta_step': 6.0, 'nthread': 16, 'min_child_weight': 9.0, 'subsample': 0.4616646995522957, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.005, 'max_depth': 9, 'gamma': 0.2, 'lambda': 0.01}
[0]	train-logloss:0.452371	valid-logloss:0.453306
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[256]	train-logloss:0.121103	valid-logloss:0.176798

{'colsample_bytree': 0.8650546946731305, 'eval_metric': 'logloss', 'max_delta_step': 6.0, 'nthread': 16, 'min_child_weight': 8.0, 'subsample': 0.3618221703220311, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 1, 'max_depth': 5, 'gamma': 0.1, 'lambda': 0.05}
[0]	train-logloss:0.455349	valid-logloss:0.455703
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.154015	valid-logloss:0.179303
Stopping. Best iteration:
[581]	train-logloss:0.149983	valid-logloss:0.17868

{'colsample_bytree': 0.7030797180234829, 'eval_metric': 'logloss', 'max_delta_step': 9.0, 'nthread': 16, 'min_child_weight': 7.0, 'subsample': 0.6995774779301367, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.05, 'max_depth': 4, 'gamma': 0.4, 'lambda': 1e-05}
[0]	train-logloss:0.457347	valid-logloss:0.457678
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.166804	valid-logloss:0.181617
[1000]	train-logloss:0.152616	valid-logloss:0.177618
[1500]	train-logloss:0.140951	valid-logloss:0.175593
Stopping. Best iteration:
[1494]	train-logloss:0.141075	valid-logloss:0.175577

{'colsample_bytree': 0.2353829374379225, 'eval_metric': 'logloss', 'max_delta_step': 2.0, 'nthread': 16, 'min_child_weight': 6.0, 'subsample': 0.26935957632771823, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.01, 'max_depth': 8, 'gamma': 0.4, 'lambda': 0.1}
[0]	train-logloss:0.47747	valid-logloss:0.477631
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[270]	train-logloss:0.140681	valid-logloss:0.180478

{'colsample_bytree': 0.6672717162125326, 'eval_metric': 'logloss', 'max_delta_step': 9.0, 'nthread': 16, 'min_child_weight': 6.0, 'subsample': 0.30740585129681897, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0, 'max_depth': 6, 'gamma': 0.5, 'lambda': 1e-05}
[0]	train-logloss:0.453928	valid-logloss:0.454313
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.141136	valid-logloss:0.178575
Stopping. Best iteration:
[582]	train-logloss:0.13594	valid-logloss:0.178281

{'colsample_bytree': 0.15880447956457236, 'eval_metric': 'logloss', 'max_delta_step': 8.0, 'nthread': 16, 'min_child_weight': 8.0, 'subsample': 0.1829600910862202, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 1, 'max_depth': 10, 'gamma': 0.5, 'lambda': 0.01}
[0]	train-logloss:0.484776	valid-logloss:0.484909
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[217]	train-logloss:0.136896	valid-logloss:0.184117

{'colsample_bytree': 0.4328545889768428, 'eval_metric': 'logloss', 'max_delta_step': 8.0, 'nthread': 16, 'min_child_weight': 6.0, 'subsample': 0.24037830373984043, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 6, 'gamma': 0.2, 'lambda': 100}
[0]	train-logloss:0.459882	valid-logloss:0.460301
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.159745	valid-logloss:0.181588
Stopping. Best iteration:
[775]	train-logloss:0.148607	valid-logloss:0.179812

{'colsample_bytree': 0.45627781942866086, 'eval_metric': 'logloss', 'max_delta_step': 7.0, 'nthread': 16, 'min_child_weight': 8.0, 'subsample': 0.22173515414320227, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.1, 'max_depth': 7, 'gamma': 0.3, 'lambda': 10}
[0]	train-logloss:0.455826	valid-logloss:0.456411
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.133365	valid-logloss:0.179545
Stopping. Best iteration:
[485]	train-logloss:0.134569	valid-logloss:0.179385

{'colsample_bytree': 0.1931001251223502, 'eval_metric': 'logloss', 'max_delta_step': 6.0, 'nthread': 16, 'min_child_weight': 3.0, 'subsample': 0.45187782375223107, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.01, 'max_depth': 9, 'gamma': 0.2, 'lambda': 100}
[0]	train-logloss:0.488414	valid-logloss:0.488188
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.119792	valid-logloss:0.176875
Stopping. Best iteration:
[588]	train-logloss:0.111819	valid-logloss:0.176281

{'colsample_bytree': 0.5287440649793352, 'eval_metric': 'logloss', 'max_delta_step': 6.0, 'nthread': 16, 'min_child_weight': 4.0, 'subsample': 0.6080693746281164, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.001, 'max_depth': 12, 'gamma': 0.3, 'lambda': 1}
[0]	train-logloss:0.449223	valid-logloss:0.450825
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[145]	train-logloss:0.090304	valid-logloss:0.178292

{'colsample_bytree': 0.7266881793590831, 'eval_metric': 'logloss', 'max_delta_step': 3.0, 'nthread': 16, 'min_child_weight': 8.0, 'subsample': 0.39309052689513946, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.01, 'max_depth': 9, 'gamma': 0.5, 'lambda': 0.1}
[0]	train-logloss:0.466118	valid-logloss:0.466507
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[210]	train-logloss:0.128233	valid-logloss:0.178441

{'colsample_bytree': 0.8967588020888194, 'eval_metric': 'logloss', 'max_delta_step': 2.0, 'nthread': 16, 'min_child_weight': 2.0, 'subsample': 0.8555879726482751, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.05, 'max_depth': 4, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.478228	valid-logloss:0.478229
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.170527	valid-logloss:0.182535
[1000]	train-logloss:0.157658	valid-logloss:0.178249
[1500]	train-logloss:0.147655	valid-logloss:0.176107
{'colsample_bytree': 0.885243232976707, 'eval_metric': 'logloss', 'max_delta_step': 1.0, 'nthread': 16, 'min_child_weight': 2.0, 'subsample': 0.8780683008706107, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 13, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.488214	valid-logloss:0.488283
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[343]	train-logloss:0.081298	valid-logloss:0.173949

{'colsample_bytree': 0.809427376934251, 'eval_metric': 'logloss', 'max_delta_step': 1.0, 'nthread': 16, 'min_child_weight': 1.0, 'subsample': 0.7835642044093545, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 13, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.488249	valid-logloss:0.488313
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[361]	train-logloss:0.079592	valid-logloss:0.174906

{'colsample_bytree': 0.8081900342913997, 'eval_metric': 'logloss', 'max_delta_step': 1.0, 'nthread': 16, 'min_child_weight': 1.0, 'subsample': 0.7693709637442891, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 12, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.488346	valid-logloss:0.48836
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[369]	train-logloss:0.087433	valid-logloss:0.174648

{'colsample_bytree': 0.7852290495822306, 'eval_metric': 'logloss', 'max_delta_step': 2.0, 'nthread': 16, 'min_child_weight': 3.0, 'subsample': 0.8071297738930207, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 11, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.476087	valid-logloss:0.47621
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.079612	valid-logloss:0.173976
Stopping. Best iteration:
[474]	train-logloss:0.082591	valid-logloss:0.173839

{'colsample_bytree': 0.7699761681203763, 'eval_metric': 'logloss', 'max_delta_step': 3.0, 'nthread': 16, 'min_child_weight': 4.0, 'subsample': 0.5542116809010466, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 11, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.466799	valid-logloss:0.467161
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[361]	train-logloss:0.107015	valid-logloss:0.176052

{'colsample_bytree': 0.3453868164923646, 'eval_metric': 'logloss', 'max_delta_step': 4.0, 'nthread': 16, 'min_child_weight': 3.0, 'subsample': 0.8067425284677572, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 8, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.46158	valid-logloss:0.462032
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.126788	valid-logloss:0.175164
Stopping. Best iteration:
[795]	train-logloss:0.104325	valid-logloss:0.173645

{'colsample_bytree': 0.29786813041005694, 'eval_metric': 'logloss', 'max_delta_step': 4.0, 'nthread': 16, 'min_child_weight': 4.0, 'subsample': 0.7224425527900319, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 8, 'gamma': 0, 'lambda': 100}
[0]	train-logloss:0.461759	valid-logloss:0.462159
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.128305	valid-logloss:0.175792
Stopping. Best iteration:
[743]	train-logloss:0.109823	valid-logloss:0.174752

{'colsample_bytree': 0.35255406299458536, 'eval_metric': 'logloss', 'max_delta_step': 4.0, 'nthread': 16, 'min_child_weight': 3.0, 'subsample': 0.5684920092729825, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 8, 'gamma': 0.1, 'lambda': 1}
[0]	train-logloss:0.460017	valid-logloss:0.460579
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[468]	train-logloss:0.107501	valid-logloss:0.174734

{'colsample_bytree': 0.36121034843499156, 'eval_metric': 'logloss', 'max_delta_step': 3.0, 'nthread': 16, 'min_child_weight': 5.0, 'subsample': 0.818627908766779, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.005, 'max_depth': 7, 'gamma': 0.1, 'lambda': 0.01}
[0]	train-logloss:0.468212	valid-logloss:0.468661
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[424]	train-logloss:0.125365	valid-logloss:0.175948

{'colsample_bytree': 0.11454188140127014, 'eval_metric': 'logloss', 'max_delta_step': 5.0, 'nthread': 16, 'min_child_weight': 2.0, 'subsample': 0.7317275135482078, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.05, 'max_depth': 6, 'gamma': 0, 'lambda': 10}
[0]	train-logloss:0.491215	valid-logloss:0.491119
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.146742	valid-logloss:0.178359
Stopping. Best iteration:
[745]	train-logloss:0.133101	valid-logloss:0.176483

{'colsample_bytree': 0.35873934232776394, 'eval_metric': 'logloss', 'max_delta_step': 3.0, 'nthread': 16, 'min_child_weight': 5.0, 'subsample': 0.693189987161331, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 1, 'max_depth': 5, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.470408	valid-logloss:0.470681
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.162439	valid-logloss:0.180049
[1000]	train-logloss:0.145488	valid-logloss:0.176277
Stopping. Best iteration:
[1304]	train-logloss:0.136927	valid-logloss:0.175178

{'colsample_bytree': 0.58149140413932, 'eval_metric': 'logloss', 'max_delta_step': 2.0, 'nthread': 16, 'min_child_weight': 4.0, 'subsample': 0.5370865827609025, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 8, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.476855	valid-logloss:0.476901
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.126188	valid-logloss:0.17585
Stopping. Best iteration:
[518]	train-logloss:0.124603	valid-logloss:0.175707

{'colsample_bytree': 0.4913478920459614, 'eval_metric': 'logloss', 'max_delta_step': 5.0, 'nthread': 16, 'min_child_weight': 3.0, 'subsample': 0.6611519161703867, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.005, 'max_depth': 7, 'gamma': 0.3, 'lambda': 1e-05}
[0]	train-logloss:0.452886	valid-logloss:0.453421
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.114178	valid-logloss:0.175691
Stopping. Best iteration:
[514]	train-logloss:0.113077	valid-logloss:0.175634

{'colsample_bytree': 0.3002022906789539, 'eval_metric': 'logloss', 'max_delta_step': 4.0, 'nthread': 16, 'min_child_weight': 5.0, 'subsample': 0.7566280826568816, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 9, 'gamma': 0.5, 'lambda': 0.1}
[0]	train-logloss:0.45933	valid-logloss:0.460163
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[398]	train-logloss:0.09748	valid-logloss:0.173972

{'colsample_bytree': 0.5631979426523986, 'eval_metric': 'logloss', 'max_delta_step': 3.0, 'nthread': 16, 'min_child_weight': 7.0, 'subsample': 0.8936894272191092, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0, 'max_depth': 5, 'gamma': 0, 'lambda': 1}
[0]	train-logloss:0.468534	valid-logloss:0.468784
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.154218	valid-logloss:0.178734
[1000]	train-logloss:0.13376	valid-logloss:0.175318
Stopping. Best iteration:
[1183]	train-logloss:0.127649	valid-logloss:0.174781

{'colsample_bytree': 0.3872379603142309, 'eval_metric': 'logloss', 'max_delta_step': 2.0, 'nthread': 16, 'min_child_weight': 10.0, 'subsample': 0.8473402869211502, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.1, 'max_depth': 10, 'gamma': 0.4, 'lambda': 0.05}
[0]	train-logloss:0.476375	valid-logloss:0.476886
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[287]	train-logloss:0.095166	valid-logloss:0.175476

{'colsample_bytree': 0.125240726887441, 'eval_metric': 'logloss', 'max_delta_step': 5.0, 'nthread': 16, 'min_child_weight': 2.0, 'subsample': 0.6466219377538647, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.001, 'max_depth': 15, 'gamma': 0.1, 'lambda': 10}
[0]	train-logloss:0.481828	valid-logloss:0.483569
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[195]	train-logloss:0.055078	valid-logloss:0.182065

{'colsample_bytree': 0.6179601149157483, 'eval_metric': 'logloss', 'max_delta_step': 4.0, 'nthread': 16, 'min_child_weight': 4.0, 'subsample': 0.492464726698365, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 7, 'gamma': 0.1, 'lambda': 0.01}
[0]	train-logloss:0.459017	valid-logloss:0.459552
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.120765	valid-logloss:0.176093
Stopping. Best iteration:
[481]	train-logloss:0.122537	valid-logloss:0.175998

{'colsample_bytree': 0.3011372038566066, 'eval_metric': 'logloss', 'max_delta_step': 7.0, 'nthread': 16, 'min_child_weight': 5.0, 'subsample': 0.5902261465225475, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0, 'max_depth': 11, 'gamma': 0.2, 'lambda': 100}
[0]	train-logloss:0.457606	valid-logloss:0.458251
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[407]	train-logloss:0.099604	valid-logloss:0.175287

{'colsample_bytree': 0.4815534771783838, 'eval_metric': 'logloss', 'max_delta_step': 1.0, 'nthread': 16, 'min_child_weight': 7.0, 'subsample': 0.8986936374687722, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 1, 'max_depth': 10, 'gamma': 0.4, 'lambda': 0.05}
[0]	train-logloss:0.488294	valid-logloss:0.488282
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[271]	train-logloss:0.096622	valid-logloss:0.175078

{'colsample_bytree': 0.24755694526312677, 'eval_metric': 'logloss', 'max_delta_step': 10.0, 'nthread': 16, 'min_child_weight': 3.0, 'subsample': 0.8128000416673974, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.005, 'max_depth': 8, 'gamma': 0, 'lambda': 1e-05}
[0]	train-logloss:0.454051	valid-logloss:0.454859
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[463]	train-logloss:0.09956	valid-logloss:0.175002

{'colsample_bytree': 0.41748483925514357, 'eval_metric': 'logloss', 'max_delta_step': 4.0, 'nthread': 16, 'min_child_weight': 6.0, 'subsample': 0.6286231325099906, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.1, 'max_depth': 9, 'gamma': 0.3, 'lambda': 0.1}
[0]	train-logloss:0.458699	valid-logloss:0.459592
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[259]	train-logloss:0.111371	valid-logloss:0.176456

{'colsample_bytree': 0.5342407748737998, 'eval_metric': 'logloss', 'max_delta_step': 3.0, 'nthread': 16, 'min_child_weight': 9.0, 'subsample': 0.7307268600851822, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.001, 'max_depth': 6, 'gamma': 0.1, 'lambda': 1}
[0]	train-logloss:0.46769	valid-logloss:0.467919
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.138411	valid-logloss:0.176471
Stopping. Best iteration:
[746]	train-logloss:0.124029	valid-logloss:0.175004

{'colsample_bytree': 0.19458993822937742, 'eval_metric': 'logloss', 'max_delta_step': 5.0, 'nthread': 16, 'min_child_weight': 4.0, 'subsample': 0.5091554094240125, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.05, 'max_depth': 14, 'gamma': 0.5, 'lambda': 10}
[0]	train-logloss:0.482066	valid-logloss:0.483247
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[172]	train-logloss:0.083291	valid-logloss:0.180125

{'colsample_bytree': 0.6886365852423127, 'eval_metric': 'logloss', 'max_delta_step': 2.0, 'nthread': 16, 'min_child_weight': 1.0, 'subsample': 0.6804680742133138, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.01, 'max_depth': 9, 'gamma': 0.2, 'lambda': 0.01}
[0]	train-logloss:0.475678	valid-logloss:0.47591
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[220]	train-logloss:0.107397	valid-logloss:0.177099

{'colsample_bytree': 0.6335685054642126, 'eval_metric': 'logloss', 'max_delta_step': 1.0, 'nthread': 16, 'min_child_weight': 7.0, 'subsample': 0.4125109664512634, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 5, 'gamma': 0.4, 'lambda': 100}
[0]	train-logloss:0.48997	valid-logloss:0.48981
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.163834	valid-logloss:0.180581
[1000]	train-logloss:0.147307	valid-logloss:0.17702
Stopping. Best iteration:
[1106]	train-logloss:0.144376	valid-logloss:0.17673

{'colsample_bytree': 0.272055771843968, 'eval_metric': 'logloss', 'max_delta_step': 7.0, 'nthread': 16, 'min_child_weight': 5.0, 'subsample': 0.3440531015873607, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0, 'max_depth': 12, 'gamma': 0.1, 'lambda': 1e-05}
[0]	train-logloss:0.451824	valid-logloss:0.453606
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[166]	train-logloss:0.101752	valid-logloss:0.181765

{'colsample_bytree': 0.8548416605656896, 'eval_metric': 'logloss', 'max_delta_step': 6.0, 'nthread': 16, 'min_child_weight': 9.0, 'subsample': 0.8323108705212866, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 1, 'max_depth': 10, 'gamma': 0.3, 'lambda': 0.05}
[0]	train-logloss:0.450034	valid-logloss:0.451077
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[245]	train-logloss:0.096381	valid-logloss:0.174968

{'colsample_bytree': 0.7461585425861366, 'eval_metric': 'logloss', 'max_delta_step': 2.0, 'nthread': 16, 'min_child_weight': 6.0, 'subsample': 0.16357518862955528, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.01, 'max_depth': 7, 'gamma': 0.5, 'lambda': 100}
[0]	train-logloss:0.477417	valid-logloss:0.477424
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[424]	train-logloss:0.154338	valid-logloss:0.182012

{'colsample_bytree': 0.45593481360326926, 'eval_metric': 'logloss', 'max_delta_step': 4.0, 'nthread': 16, 'min_child_weight': 2.0, 'subsample': 0.7900904488081063, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 4, 'gamma': 0.2, 'lambda': 0.1}
[0]	train-logloss:0.463917	valid-logloss:0.464227
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.168392	valid-logloss:0.181765
[1000]	train-logloss:0.154509	valid-logloss:0.177592
Stopping. Best iteration:
[1426]	train-logloss:0.145318	valid-logloss:0.175894

{'colsample_bytree': 0.20854333330953745, 'eval_metric': 'logloss', 'max_delta_step': 5.0, 'nthread': 16, 'min_child_weight': 3.0, 'subsample': 0.8680613469000104, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.1, 'max_depth': 13, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.483757	valid-logloss:0.484099
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[333]	train-logloss:0.086233	valid-logloss:0.175335

{'colsample_bytree': 0.16017168242118035, 'eval_metric': 'logloss', 'max_delta_step': 3.0, 'nthread': 16, 'min_child_weight': 6.0, 'subsample': 0.7471647674921095, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.001, 'max_depth': 6, 'gamma': 0.4, 'lambda': 10}
[0]	train-logloss:0.489256	valid-logloss:0.48897
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.145898	valid-logloss:0.177459
[1000]	train-logloss:0.121003	valid-logloss:0.175287
Stopping. Best iteration:
[1013]	train-logloss:0.120507	valid-logloss:0.175206

{'colsample_bytree': 0.32907158928619007, 'eval_metric': 'logloss', 'max_delta_step': 1.0, 'nthread': 16, 'min_child_weight': 4.0, 'subsample': 0.6104725323063749, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.05, 'max_depth': 15, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.488978	valid-logloss:0.489122
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[344]	train-logloss:0.076086	valid-logloss:0.175998

{'colsample_bytree': 0.41291172790077446, 'eval_metric': 'logloss', 'max_delta_step': 9.0, 'nthread': 16, 'min_child_weight': 1.0, 'subsample': 0.42698697481392084, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 11, 'gamma': 0, 'lambda': 1}
[0]	train-logloss:0.45337	valid-logloss:0.454473
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[243]	train-logloss:0.097876	valid-logloss:0.178026

{'colsample_bytree': 0.5867094456305084, 'eval_metric': 'logloss', 'max_delta_step': 6.0, 'nthread': 16, 'min_child_weight': 2.0, 'subsample': 0.12001748525343353, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.005, 'max_depth': 8, 'gamma': 0.1, 'lambda': 0.01}
[0]	train-logloss:0.4522	valid-logloss:0.452742
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[134]	train-logloss:0.161293	valid-logloss:0.186812

{'colsample_bytree': 0.4002347810637314, 'eval_metric': 'logloss', 'max_delta_step': 10.0, 'nthread': 16, 'min_child_weight': 5.0, 'subsample': 0.7055265328868927, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 9, 'gamma': 0.3, 'lambda': 0.05}
[0]	train-logloss:0.453729	valid-logloss:0.454522
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[407]	train-logloss:0.093631	valid-logloss:0.175224

{'colsample_bytree': 0.839445229143727, 'eval_metric': 'logloss', 'max_delta_step': 2.0, 'nthread': 16, 'min_child_weight': 3.0, 'subsample': 0.28157601426944423, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.01, 'max_depth': 12, 'gamma': 0.5, 'lambda': 100}
[0]	train-logloss:0.476458	valid-logloss:0.476524
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[339]	train-logloss:0.112297	valid-logloss:0.178553

{'colsample_bytree': 0.5222127042770424, 'eval_metric': 'logloss', 'max_delta_step': 7.0, 'nthread': 16, 'min_child_weight': 4.0, 'subsample': 0.4746548994750394, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 10, 'gamma': 0.1, 'lambda': 1e-05}
[0]	train-logloss:0.451785	valid-logloss:0.452561
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[309]	train-logloss:0.094752	valid-logloss:0.176082

{'colsample_bytree': 0.4464093794343689, 'eval_metric': 'logloss', 'max_delta_step': 8.0, 'nthread': 16, 'min_child_weight': 10.0, 'subsample': 0.5349401848000775, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 1, 'max_depth': 7, 'gamma': 0.2, 'lambda': 100}
[0]	train-logloss:0.457837	valid-logloss:0.458271
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.138412	valid-logloss:0.176231
Stopping. Best iteration:
[610]	train-logloss:0.131238	valid-logloss:0.175559

{'colsample_bytree': 0.7056575192744945, 'eval_metric': 'logloss', 'max_delta_step': 3.0, 'nthread': 16, 'min_child_weight': 7.0, 'subsample': 0.5797789060181122, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0, 'max_depth': 11, 'gamma': 0.1, 'lambda': 0.1}
[0]	train-logloss:0.46501	valid-logloss:0.465849
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.






    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-8-b88d419a1f67> in <module>()
     47             algo=tpe.suggest,
     48             max_evals=100,
---> 49             trials=trials)
     50 
     51 print best

/usr/local/lib/python2.7/dist-packages/hyperopt/fmin.pyc in fmin(fn, space, algo, max_evals, trials, rstate, allow_trials_fmin, pass_expr_memo_ctrl, catch_eval_exceptions, verbose, return_argmin)
    305             verbose=verbose,
    306             catch_eval_exceptions=catch_eval_exceptions,
--> 307             return_argmin=return_argmin,
    308         )
    309 

/usr/local/lib/python2.7/dist-packages/hyperopt/base.pyc in fmin(self, fn, space, algo, max_evals, rstate, verbose, pass_expr_memo_ctrl, catch_eval_exceptions, return_argmin)
    633             pass_expr_memo_ctrl=pass_expr_memo_ctrl,
    634             catch_eval_exceptions=catch_eval_exceptions,
--> 635             return_argmin=return_argmin)
    636 
    637 

/usr/local/lib/python2.7/dist-packages/hyperopt/fmin.pyc in fmin(fn, space, algo, max_evals, trials, rstate, allow_trials_fmin, pass_expr_memo_ctrl, catch_eval_exceptions, verbose, return_argmin)
    318                     verbose=verbose)
    319     rval.catch_eval_exceptions = catch_eval_exceptions
--> 320     rval.exhaust()
    321     if return_argmin:
    322         return trials.argmin

/usr/local/lib/python2.7/dist-packages/hyperopt/fmin.pyc in exhaust(self)
    197     def exhaust(self):
    198         n_done = len(self.trials)
--> 199         self.run(self.max_evals - n_done, block_until_done=self.async)
    200         self.trials.refresh()
    201         return self

/usr/local/lib/python2.7/dist-packages/hyperopt/fmin.pyc in run(self, N, block_until_done)
    171             else:
    172                 # -- loop over trials and do the jobs directly
--> 173                 self.serial_evaluate()
    174 
    175             if stopped:

/usr/local/lib/python2.7/dist-packages/hyperopt/fmin.pyc in serial_evaluate(self, N)
     90                 ctrl = base.Ctrl(self.trials, current_trial=trial)
     91                 try:
---> 92                     result = self.domain.evaluate(spec, ctrl)
     93                 except Exception as e:
     94                     logger.info('job exception: %s' % str(e))

/usr/local/lib/python2.7/dist-packages/hyperopt/base.pyc in evaluate(self, config, ctrl, attach_attachments)
    838                 memo=memo,
    839                 print_node_on_error=self.rec_eval_print_node_on_error)
--> 840             rval = self.fn(pyll_rval)
    841 
    842         if isinstance(rval, (float, int, np.number)):

<ipython-input-8-b88d419a1f67> in objective(space)
     24 
     25     watchlist = [(d_train, 'train'), (d_valid, 'valid')]
---> 26     bst = xgb.train(params, d_train, 2000, watchlist, early_stopping_rounds=30, verbose_eval=500)
     27     logloss = log_loss(y_test, bst.predict(d_valid))
     28 

/usr/local/lib/python2.7/dist-packages/xgboost/training.pyc in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, learning_rates, xgb_model, callbacks)
    203                            evals=evals,
    204                            obj=obj, feval=feval,
--> 205                            xgb_model=xgb_model, callbacks=callbacks)
    206 
    207 

/usr/local/lib/python2.7/dist-packages/xgboost/training.pyc in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)
     74         # Skip the first update if it is a recovery step.
     75         if version % 2 == 0:
---> 76             bst.update(dtrain, i, obj)
     77             bst.save_rabit_checkpoint()
     78             version += 1

/usr/local/lib/python2.7/dist-packages/xgboost/core.pyc in update(self, dtrain, iteration, fobj)
    804 
    805         if fobj is None:
--> 806             _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, iteration, dtrain.handle))
    807         else:
    808             pred = self.predict(dtrain)

KeyboardInterrupt:

	is_duplicate	q1_freq	q2_freq	freq_diff	q1_q2_intersect	q1_q2_intersect_ratio	q1_q2_wm_ratio	q1_pr	q2_pr	z_place_match	...	wc_diff_unique	wc_ratio_unique	wc_diff_unq_stop	wc_ratio_unique_stop	same_start	char_diff	char_diff_unq_stop	total_unique_words	total_unq_words_stop	char_ratio
is_duplicate	1.000000	0.296621	0.198609	0.337501	0.412979	0.609256	0.641026	0.018429	0.016661	0.063175	...	0.207048	0.065786	0.212901	0.066541	0.197812	0.211750	0.221244	0.288730	0.301078	0.044247
q1_q2_wm_ratio	0.641026	0.487466	0.355246	0.444353	0.684574	0.976114	1.000000	0.083618	0.034415	0.033779	...	0.151437	0.044734	0.152581	0.045589	0.120138	0.154682	0.157205	0.188936	0.202187	0.029748
q1_q2_intersect_ratio	0.609256	0.490514	0.358951	0.428979	0.722593	1.000000	0.976114	0.065419	0.024576	0.033451	...	0.147422	0.043581	0.148945	0.045619	0.114459	0.150171	0.152772	0.178367	0.191013	0.029460
word_match	0.456173	0.108091	0.068038	0.131844	0.164976	0.292098	0.289983	0.000107	0.010960	0.074797	...	0.416946	0.130007	0.423744	0.140251	0.365136	0.401727	0.421170	0.504688	0.514084	0.097808
tfidf_wm	0.426661	0.045062	0.013690	0.119178	0.097573	0.241715	0.240230	0.018761	0.029882	0.057995	...	0.389450	0.119110	0.390666	0.124922	0.310045	0.377316	0.394885	0.481146	0.492038	0.090539
tfidf_wm_stops	0.424446	0.046444	0.014802	0.119944	0.099261	0.241695	0.240182	0.018047	0.029574	0.057249	...	0.380831	0.116539	0.384502	0.123480	0.293581	0.370424	0.389616	0.475493	0.490034	0.088959
dicedistence_n1	0.420369	0.085192	0.042429	0.084832	0.120920	0.237792	0.245715	0.006209	0.018697	0.072514	...	0.449278	0.141261	0.460238	0.148549	0.370355	0.441570	0.463231	0.561132	0.564348	0.110338
q1_q2_intersect	0.412979	0.789134	0.591368	0.298225	1.000000	0.722593	0.684574	0.185699	0.095871	0.002180	...	0.086745	0.025830	0.086129	0.026942	0.069998	0.089146	0.088645	0.088004	0.094159	0.016798
jaccard_n1	0.403398	0.053339	0.023333	0.093114	0.093163	0.207972	0.212186	0.012013	0.020005	0.062210	...	0.431276	0.129653	0.444258	0.137975	0.357477	0.425794	0.449788	0.544192	0.546700	0.100936
compression_dist	0.401608	0.070227	0.044802	0.097175	0.109371	0.225148	0.231770	0.004483	0.003890	0.044859	...	0.591821	0.173829	0.629824	0.190559	0.377760	0.614931	0.658859	0.672904	0.682621	0.149767
fuzz_token_set_ratio	0.396250	0.073047	0.035621	0.065156	0.101685	0.215703	0.224983	0.003706	0.015883	0.072311	...	0.378203	0.119159	0.376923	0.112717	0.520594	0.357568	0.373778	0.529653	0.502431	0.090681
intersect_close_ratio_n1	0.386142	0.076081	0.033939	0.070294	0.106269	0.218804	0.225561	0.008511	0.021284	0.079721	...	0.403637	0.131768	0.413781	0.138550	0.344281	0.384914	0.413873	0.485864	0.487991	0.101267
cosine_distance	0.383353	0.094392	0.052775	0.050551	0.118282	0.226984	0.236324	0.023955	0.009529	0.099250	...	0.347785	0.123468	0.353087	0.129088	0.331542	0.332364	0.357008	0.390766	0.392690	0.110689
braycurtis_distance	0.373533	0.074025	0.041368	0.058423	0.100352	0.208712	0.214944	0.011251	0.009287	0.092192	...	0.352645	0.126017	0.360732	0.133364	0.328306	0.338199	0.365095	0.394012	0.395781	0.100361
fuzz_token_sort_ratio	0.372549	0.053984	0.024276	0.082596	0.088935	0.198681	0.202809	0.012577	0.020599	0.069679	...	0.573175	0.193992	0.544256	0.187017	0.509895	0.561328	0.548579	0.530075	0.497493	0.163729
fuzz_qratio	0.370515	0.064127	0.042052	0.099188	0.106037	0.217565	0.219972	0.014650	0.006547	0.050904	...	0.574604	0.187528	0.544941	0.180481	0.545101	0.558042	0.546280	0.585915	0.555160	0.155578
char_distribution_ratio_std	0.369171	0.040780	0.015734	0.084054	0.074781	0.176830	0.183112	0.015559	0.020534	0.055955	...	0.393185	0.109461	0.407003	0.115226	0.318902	0.404992	0.431820	0.478934	0.483277	0.092460
minkowski_distance	0.367988	0.058778	0.030754	0.069073	0.087539	0.197664	0.201169	0.001259	0.011970	0.088154	...	0.363508	0.117265	0.372212	0.125123	0.333252	0.350491	0.377956	0.423810	0.423766	0.088071
cityblock_distance	0.367678	0.058715	0.030465	0.068994	0.087468	0.197396	0.200907	0.001256	0.012152	0.088119	...	0.363716	0.117373	0.372440	0.125044	0.333383	0.350662	0.378038	0.423797	0.423710	0.088191
char_distribution_kl	0.367569	0.076128	0.042658	0.071124	0.104468	0.205766	0.216190	0.005751	0.010056	0.064252	...	0.427578	0.129746	0.440548	0.134910	0.329878	0.443522	0.466270	0.500613	0.505342	0.115182
edit_dist	0.363314	0.052797	0.031065	0.107013	0.098689	0.208114	0.208664	0.017945	0.011778	0.043613	...	0.455591	0.136686	0.463785	0.147319	0.394476	0.455909	0.475519	0.523411	0.520470	0.114905
fuzz_partial_token_sort_ratio	0.360776	0.052253	0.026430	0.071715	0.083777	0.187334	0.193005	0.009621	0.012978	0.066607	...	0.428750	0.124707	0.422835	0.124150	0.470208	0.416147	0.425894	0.532344	0.505971	0.097755
fuzz_partial_ratio	0.358234	0.063455	0.041912	0.082073	0.099348	0.203688	0.209226	0.010340	0.002210	0.046488	...	0.370820	0.104515	0.372347	0.105105	0.488419	0.351008	0.370044	0.552829	0.530615	0.067812
wmd	0.357621	0.080754	0.051812	0.095494	0.123375	0.233257	0.230458	0.000490	0.005023	0.039572	...	0.411257	0.129093	0.433550	0.142258	0.387992	0.393620	0.421469	0.450407	0.450937	0.117364
char_distribution_cosine	0.350929	0.066776	0.030565	0.066950	0.094656	0.190514	0.199702	0.002218	0.020482	0.072473	...	0.361426	0.115177	0.369945	0.120829	0.307911	0.366026	0.388538	0.412910	0.415369	0.100264
norm_wmd	0.350185	0.071848	0.041669	0.091403	0.113215	0.223563	0.220018	0.004530	0.011780	0.049612	...	0.429874	0.134419	0.452854	0.149156	0.400584	0.408873	0.435077	0.481996	0.476311	0.124535
jaccard	0.341490	0.057484	0.039217	0.093256	0.097949	0.202614	0.197817	0.015144	0.005916	0.061770	...	0.429713	0.130990	0.409725	0.127537	0.566882	0.393118	0.395894	0.482611	0.444249	0.094577
freq_diff	0.337501	0.169080	0.100777	1.000000	0.298225	0.428979	0.444353	0.063712	0.056260	0.003862	...	0.067860	0.018200	0.067786	0.021691	0.040075	0.073070	0.071185	0.076522	0.078187	0.018074
edit_dist_agg_n3_mean_mean	0.326339	0.024466	0.008742	0.076193	0.062855	0.161905	0.169505	0.018515	0.013620	0.010555	...	0.369987	0.100574	0.389232	0.103124	0.290440	0.373141	0.398077	0.596105	0.623957	0.084170
edit_dist_agg_n3_min_min	0.326339	0.024466	0.008742	0.076193	0.062855	0.161905	0.169505	0.018515	0.013620	0.010555	...	0.369987	0.100574	0.389232	0.103124	0.290440	0.373141	0.398077	0.596105	0.623957	0.084170
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
lsa_pn10_1	0.012576	0.001198	0.011997	0.005710	0.022041	0.000318	0.003768	0.012058	0.000979	0.006294	...	0.011730	0.005014	0.014417	0.006008	0.018911	0.009201	0.007050	0.021093	0.027982	0.006002
pos_of_question1_n1_in_question2_median	0.012323	0.040680	0.019722	0.052882	0.023614	0.013920	0.014394	0.033490	0.009789	0.083415	...	0.121917	0.175631	0.149237	0.202135	0.036631	0.162574	0.162411	0.378543	0.407656	0.143045
pos_of_question1_n1_in_question2_mean	0.011189	0.042929	0.020687	0.054457	0.025476	0.014044	0.014350	0.034695	0.010248	0.085515	...	0.129234	0.182698	0.157260	0.209693	0.037232	0.171015	0.170485	0.393125	0.422616	0.148715
diff_len	0.011137	0.018938	0.025338	0.013381	0.002910	0.006019	0.006060	0.039278	0.028082	0.002180	...	0.156516	0.776571	0.144614	0.746947	0.012580	0.150253	0.109326	0.129764	0.127882	0.651576
svd_cooc_on2_tn2_9	0.009420	0.013534	0.005391	0.001095	0.001769	0.001943	0.001611	0.014768	0.002735	0.003942	...	0.012894	0.003522	0.011192	0.003479	0.019686	0.010378	0.009962	0.005077	0.004264	0.002928
lsa_cn10_q1_1	0.008334	0.107563	0.065040	0.013471	0.138602	0.037324	0.032981	0.033257	0.000336	0.007604	...	0.013855	0.001805	0.014250	0.003909	0.008747	0.011963	0.009390	0.016959	0.021816	0.004651
lsa_cn10_q2_1	0.008334	0.107563	0.065040	0.013471	0.138602	0.037324	0.032981	0.033257	0.000336	0.007604	...	0.013855	0.001805	0.014250	0.003909	0.008747	0.011963	0.009390	0.016959	0.021816	0.004651
lsa_wn3_q2_1	0.008334	0.107563	0.065040	0.013471	0.138602	0.037324	0.032981	0.033257	0.000336	0.007604	...	0.013855	0.001805	0.014250	0.003909	0.008747	0.011963	0.009390	0.016959	0.021816	0.004651
nmf_c_question2_n5_1	0.008195	0.083173	0.083755	0.014720	0.076449	0.035411	0.033630	0.035111	0.040087	0.102338	...	0.228009	0.292831	0.232863	0.313614	0.061320	0.244838	0.243706	0.485460	0.493997	0.256925
svd_cooc_on2_tn1_5	0.007119	0.037085	0.026726	0.006325	0.027375	0.009104	0.010497	0.004027	0.006271	0.001474	...	0.002217	0.000463	0.001060	0.003516	0.001949	0.001334	0.001964	0.001781	0.000243	0.000294
nmf_p_n2_0	0.006894	0.079055	0.099540	0.000397	0.074983	0.019041	0.017734	0.054480	0.083563	0.061907	...	0.048702	0.011127	0.040406	0.007840	0.017834	0.048133	0.046750	0.061933	0.051369	0.008193
lsa_pn10_2	0.006817	0.185262	0.134210	0.020450	0.220081	0.045107	0.038408	0.028231	0.006104	0.035418	...	0.000562	0.002055	0.001679	0.005174	0.009103	0.004656	0.006761	0.029028	0.020454	0.002444
svd_cooc_on1_tn2_8	0.006333	0.005427	0.016213	0.007188	0.004666	0.001477	0.005593	0.013858	0.017694	0.000675	...	0.002742	0.001815	0.006556	0.000166	0.000590	0.004466	0.006302	0.012136	0.019301	0.001542
lsa_cn10_q1_5	0.006272	0.145682	0.294280	0.008552	0.118232	0.032678	0.033686	0.090832	0.252095	0.107857	...	0.005736	0.007977	0.007649	0.010489	0.001152	0.000503	0.002738	0.017380	0.009489	0.008456
lsa_cn10_q2_5	0.006272	0.145682	0.294280	0.008552	0.118232	0.032678	0.033686	0.090832	0.252095	0.107857	...	0.005736	0.007977	0.007649	0.010489	0.001152	0.000503	0.002738	0.017380	0.009489	0.008456
lsa_wn3_q2_5	0.006272	0.145682	0.294280	0.008552	0.118232	0.032678	0.033686	0.090832	0.252095	0.107857	...	0.005736	0.007977	0.007649	0.010489	0.001152	0.000503	0.002738	0.017380	0.009489	0.008456
pos_of_question2_n1_in_question1_mean	0.006033	0.046832	0.002510	0.063798	0.018479	0.002969	0.002360	0.055148	0.027873	0.077163	...	0.220370	0.211369	0.243474	0.236622	0.049420	0.254441	0.231778	0.456183	0.483100	0.181531
edit_dist_agg_n2_max_std	0.005927	0.002377	0.025246	0.012332	0.011621	0.008176	0.015845	0.015403	0.024549	0.020470	...	0.015716	0.197529	0.005400	0.246259	0.014387	0.015493	0.008581	0.087150	0.095353	0.163037
pos_of_question2_n1_in_question1_median	0.005047	0.043768	0.001196	0.062415	0.016045	0.002672	0.002142	0.053597	0.027518	0.075142	...	0.211907	0.202691	0.234442	0.227316	0.048960	0.245076	0.223403	0.440685	0.467177	0.174194
lsa_wn3_q1_5	0.004689	0.186252	0.152928	0.007093	0.108101	0.023675	0.028104	0.144274	0.120457	0.126869	...	0.007369	0.005389	0.004293	0.005723	0.008033	0.009288	0.010776	0.010182	0.006587	0.002025
cooc_tfidf_question1_n1_count_std	0.004534	0.041032	0.064001	0.073066	0.047108	0.015999	0.015170	0.000161	0.052752	0.024478	...	0.019993	0.028305	0.018761	0.050279	0.021049	0.010642	0.006420	0.031825	0.039982	0.018581
z_q2_place_num	0.004522	0.005548	0.008613	0.005287	0.002585	0.005915	0.003055	0.008097	0.011376	0.738607	...	0.032781	0.043078	0.037139	0.043092	0.016649	0.042655	0.045656	0.084232	0.098806	0.041025
nmf_p_n2_1	0.003621	0.026742	0.012484	0.011828	0.052396	0.014314	0.011199	0.036264	0.066747	0.012954	...	0.062976	0.018539	0.052242	0.015479	0.019251	0.061720	0.056959	0.080989	0.067650	0.014542
nmf_w_question1_n3_0	0.003426	0.144577	0.096858	0.016395	0.113097	0.013826	0.013711	0.076627	0.025876	0.038849	...	0.039278	0.007077	0.041755	0.007526	0.039450	0.034794	0.035216	0.033604	0.032700	0.010026
nmf_c_question1_n5_1	0.002647	0.073505	0.064601	0.026593	0.072965	0.036413	0.034726	0.018382	0.018876	0.109749	...	0.175084	0.214403	0.187532	0.249333	0.057093	0.207034	0.219829	0.443276	0.453683	0.202050
z_q1_place_num	0.002183	0.009255	0.008271	0.003545	0.005260	0.006741	0.004060	0.011865	0.007571	0.749328	...	0.023213	0.031389	0.027874	0.037104	0.013278	0.033444	0.037178	0.075146	0.089350	0.025675
edit_dist_agg_n1_max_std	0.001549	0.029105	0.001893	0.015360	0.017830	0.007274	0.003805	0.029608	0.022281	0.015480	...	0.070055	0.184489	0.067459	0.221408	0.052199	0.062229	0.054836	0.199959	0.212620	0.156063
cooc_tfidf_question2_n1_count_std	0.001358	0.027159	0.069295	0.068700	0.043920	0.014229	0.013165	0.023579	0.066267	0.025357	...	0.032098	0.049792	0.029457	0.031464	0.015366	0.004882	0.006462	0.024904	0.032381	0.049008
z_q2_has_place	0.000900	0.001412	0.002652	0.001977	0.004445	0.010539	0.006853	0.007198	0.010221	0.762324	...	0.019203	0.030487	0.022519	0.029478	0.011847	0.027166	0.031130	0.070461	0.084916	0.028749
z_q1_has_place	0.000180	0.004121	0.003831	0.001484	0.000121	0.009932	0.006590	0.010791	0.006724	0.764193	...	0.013340	0.027691	0.017229	0.033519	0.010078	0.021595	0.026155	0.063369	0.077690	0.022233

	feature	is_duplicate	fscore
22	common_words	0.232293	1537.0
31	norm_wmd	-0.350185	1444.0
14	q1_q2_wm_ratio	0.645999	1148.0
13	q1_q2_intersect_ratio	0.610168	1106.0
30	wmd	-0.357621	1073.0
51	edit_dist	-0.335208	938.0
5	freq_diff	-0.332427	816.0
28	fuzz_token_set_ratio	0.396250	806.0
25	fuzz_partial_ratio	0.358234	802.0
12	q1_q2_intersect	0.432860	706.0
23	fuzz_qratio	0.370515	669.0
4	q2_freq	0.265540	649.0
32	cosine_distance	-0.383353	588.0
3	q1_freq	0.343747	560.0
29	fuzz_token_sort_ratio	0.372549	557.0
50	compression_dist	-0.365613	554.0
178	norm_pos_of_question1_n1_in_question2_mean	0.146704	531.0
168	norm_pos_of_question2_n1_in_question1_mean	0.138984	513.0
130	cooc_tfidf_question1_n2_count_max	0.215381	500.0
125	cooc_tfidf_question2_n2_count_max	0.216392	487.0
42	kur_q2vec	0.015286	453.0
111	cooc_tfidf_question1_n1_count_std	0.105391	444.0
143	bm25_question2_n1_mean	0.287934	440.0
181	norm_pos_of_question1_n1_in_question2_std	0.221486	421.0
180	norm_pos_of_question1_n1_in_question2_max	0.252094	416.0
148	bm25_question1_n1_mean	0.289771	412.0
103	cooc_tfidf_question2_n1_count_mean	0.242006	410.0
171	norm_pos_of_question2_n1_in_question1_std	0.220216	406.0
44	jaccard_n1	0.336347	402.0
15	len_q1	-0.171079	400.0
...	...	...	...
43	RMSE_distance	-0.368021	49.0
185	pos_of_question2_n2_in_question1_max	0.116917	49.0
162	pos_of_question2_n1_in_question1_min	-0.128058	49.0
208	cooccurrence_close_count_n1	0.101425	48.0
66	edit_dist_agg_n2_min_mean	-0.264122	45.0
152	bm25_question2_n2_min	0.063232	44.0
195	pos_of_question1_n2_in_question2_max	0.143764	42.0
165	pos_of_question2_n1_in_question1_max	-0.052749	39.0
132	cooc_tfidf_question2_n2_ratio_min	0.051844	34.0
210	cooccurrence_close_count_n2	0.166876	30.0
58	edit_dist_agg_n2_max_max	-0.028882	26.0
137	cooc_tfidf_question1_n2_ratio_min	0.054437	25.0
212	cooccurrence_close_count_n3	0.143607	18.0
157	bm25_question1_n2_min	0.068174	17.0
34	jaccard_distance	-0.131714	16.0
99	edit_dist_agg_n3_mean_min	-0.251771	15.0
127	cooc_tfidf_question1_n2_count_min	0.073408	11.0
74	edit_dist_agg_n2_mean_min	-0.264122	9.0
101	edit_dist_agg_n3_mean_mean	-0.251771	9.0
76	edit_dist_agg_n2_mean_mean	-0.264122	4.0
0	is_duplicate	1.000000	NaN
1	q1_hash	-0.207682	NaN
2	q2_hash	-0.356072	NaN
6	q_hash_pos	0.123509	NaN
7	q_hash_pos_1	0.207493	NaN
8	q2_change	-0.361354	NaN
9	q1_change	-0.259241	NaN
10	q1_q2_change_max	-0.369983	NaN
11	q_change_pair	0.422098	NaN
26	fuzz_partial_token_set_ratio	0.156094	NaN