In [1]:
from __future__ import division
import random
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from scipy import sparse as ssp
from sklearn.utils import resample,shuffle
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import lightgbm as lgb
import config
from sklearn.model_selection import KFold, StratifiedKFold

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn import preprocessing


/usr/local/lib/python2.7/dist-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [71]:
RAW_PATH=config.RAW_PATH
FEAT_PATH =config.FEAT_PATH

train = pd.read_csv(RAW_PATH+'train.csv')
train.drop(train.index[[config.ab_dup_test]], inplace=True)
train.reset_index(drop=True, inplace=True)
train_y = train['is_duplicate'].values

In [4]:
feat_df = ['feat_ab.csv','feature_base_lemmer.csv','feature_vect_lemmer.csv','feat_158_stpf.csv']
# feat_df = ['feat_ab.csv','feat_158_stpf.csv']

df = pd.read_csv(FEAT_PATH+'magic_feature.csv')
del df['question1'], df['question2'], df['id']
print 'feat_mag {}'.format(df.shape)

def remove_col(train):
    list1=['question1','question2','id','is_duplicate']
    for i in list1:
        if i in list(train.columns):
            del train[i]
    return train

for f in feat_df:
    df1 = pd.read_csv(FEAT_PATH+f)
    df1 = remove_col(df1)
    df = pd.concat([df, df1],axis=1)
    del df1
    gc.collect()
    print f, df.shape

# feat_ab = pd.read_csv(FEAT_PATH+'feat_ab.csv')
# del feat_ab['question1'], feat_ab['question2']
# print 'feat_ab {}'.format(feat_ab.shape)

# feature_base_close_porter = pd.read_csv(FEAT_PATH+'feature_base_close_porter.csv')
# del feature_base_close_porter['question1'], feature_base_close_porter['question2'], feature_base_close_porter['is_duplicate']
# print 'feature_base_close_porter {}'.format(feature_base_close_porter.shape)

# df = pd.concat([feat_mag, feat_ab, feature_base_close_porter], axis=1)
# print 'df {}'.format(df.shape)

# del feat_mag, feat_ab, feature_base_close_porter
gc.collect()


feat_mag (2750086, 26)
feat_ab.csv (2750086, 55)
feature_base_lemmer.csv (2750086, 267)
feature_vect_lemmer.csv (2750086, 388)
feat_158_stpf.csv (2750086, 404)
Out[4]:
0

In [5]:
del_feat = ['q1_hash','q2_hash','q_hash_pos','q_hash_pos_1','q1_change','q2_change']
del_feat.extend(['q_change_pair','q1_q2_change_max'])
del_feat.extend(['euclidean_distance', 'jaccard_distance','RMSE_distance'])
# del_feat.extend(['freq_diff', 'q1_q2_intersect_ratio'])
del_feat.extend(list(tr_corr[abs(tr_corr['is_duplicate'])<0.01].index))

print df.shape
for i in list(df.columns):
    if i in del_feat:
        del df[i]
# df = df[use_feat]
print df.shape


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-4fb5182de654> in <module>()
      4 # del_feat.extend(['freq_diff', 'q1_q2_intersect_ratio'])
      5 
----> 6 del_feat.extend(list(tr_corr[abs(tr_corr['is_duplicate'])<0.01].index))
      7 
      8 print df.shape

NameError: name 'tr_corr' is not defined

In [5]:
tr_corr = df[df['is_duplicate']!=-1].corr()
# tr_corr.sort_values(by='is_duplicate',ascending=0)
abs(tr_corr).sort_values(by='is_duplicate',ascending=0)
# tr_corr[abs(tr_corr['is_duplicate'])<0.1]
# tr_corr[tr_corr.index.str.contains('pos')].sort_values(by='is_duplicate',ascending=0)


Out[5]:
is_duplicate q1_freq q2_freq freq_diff q1_q2_intersect q1_q2_intersect_ratio q1_q2_wm_ratio q1_pr q2_pr z_place_match ... wc_diff_unique wc_ratio_unique wc_diff_unq_stop wc_ratio_unique_stop same_start char_diff char_diff_unq_stop total_unique_words total_unq_words_stop char_ratio
is_duplicate 1.000000 0.296621 0.198609 0.337501 0.412979 0.609256 0.641026 0.018429 0.016661 0.063175 ... 0.207048 0.065786 0.212901 0.066541 0.197812 0.211750 0.221244 0.288730 0.301078 0.044247
q1_q2_wm_ratio 0.641026 0.487466 0.355246 0.444353 0.684574 0.976114 1.000000 0.083618 0.034415 0.033779 ... 0.151437 0.044734 0.152581 0.045589 0.120138 0.154682 0.157205 0.188936 0.202187 0.029748
q1_q2_intersect_ratio 0.609256 0.490514 0.358951 0.428979 0.722593 1.000000 0.976114 0.065419 0.024576 0.033451 ... 0.147422 0.043581 0.148945 0.045619 0.114459 0.150171 0.152772 0.178367 0.191013 0.029460
word_match 0.456173 0.108091 0.068038 0.131844 0.164976 0.292098 0.289983 0.000107 0.010960 0.074797 ... 0.416946 0.130007 0.423744 0.140251 0.365136 0.401727 0.421170 0.504688 0.514084 0.097808
tfidf_wm 0.426661 0.045062 0.013690 0.119178 0.097573 0.241715 0.240230 0.018761 0.029882 0.057995 ... 0.389450 0.119110 0.390666 0.124922 0.310045 0.377316 0.394885 0.481146 0.492038 0.090539
tfidf_wm_stops 0.424446 0.046444 0.014802 0.119944 0.099261 0.241695 0.240182 0.018047 0.029574 0.057249 ... 0.380831 0.116539 0.384502 0.123480 0.293581 0.370424 0.389616 0.475493 0.490034 0.088959
dicedistence_n1 0.420369 0.085192 0.042429 0.084832 0.120920 0.237792 0.245715 0.006209 0.018697 0.072514 ... 0.449278 0.141261 0.460238 0.148549 0.370355 0.441570 0.463231 0.561132 0.564348 0.110338
q1_q2_intersect 0.412979 0.789134 0.591368 0.298225 1.000000 0.722593 0.684574 0.185699 0.095871 0.002180 ... 0.086745 0.025830 0.086129 0.026942 0.069998 0.089146 0.088645 0.088004 0.094159 0.016798
jaccard_n1 0.403398 0.053339 0.023333 0.093114 0.093163 0.207972 0.212186 0.012013 0.020005 0.062210 ... 0.431276 0.129653 0.444258 0.137975 0.357477 0.425794 0.449788 0.544192 0.546700 0.100936
compression_dist 0.401608 0.070227 0.044802 0.097175 0.109371 0.225148 0.231770 0.004483 0.003890 0.044859 ... 0.591821 0.173829 0.629824 0.190559 0.377760 0.614931 0.658859 0.672904 0.682621 0.149767
fuzz_token_set_ratio 0.396250 0.073047 0.035621 0.065156 0.101685 0.215703 0.224983 0.003706 0.015883 0.072311 ... 0.378203 0.119159 0.376923 0.112717 0.520594 0.357568 0.373778 0.529653 0.502431 0.090681
intersect_close_ratio_n1 0.386142 0.076081 0.033939 0.070294 0.106269 0.218804 0.225561 0.008511 0.021284 0.079721 ... 0.403637 0.131768 0.413781 0.138550 0.344281 0.384914 0.413873 0.485864 0.487991 0.101267
cosine_distance 0.383353 0.094392 0.052775 0.050551 0.118282 0.226984 0.236324 0.023955 0.009529 0.099250 ... 0.347785 0.123468 0.353087 0.129088 0.331542 0.332364 0.357008 0.390766 0.392690 0.110689
braycurtis_distance 0.373533 0.074025 0.041368 0.058423 0.100352 0.208712 0.214944 0.011251 0.009287 0.092192 ... 0.352645 0.126017 0.360732 0.133364 0.328306 0.338199 0.365095 0.394012 0.395781 0.100361
fuzz_token_sort_ratio 0.372549 0.053984 0.024276 0.082596 0.088935 0.198681 0.202809 0.012577 0.020599 0.069679 ... 0.573175 0.193992 0.544256 0.187017 0.509895 0.561328 0.548579 0.530075 0.497493 0.163729
fuzz_qratio 0.370515 0.064127 0.042052 0.099188 0.106037 0.217565 0.219972 0.014650 0.006547 0.050904 ... 0.574604 0.187528 0.544941 0.180481 0.545101 0.558042 0.546280 0.585915 0.555160 0.155578
char_distribution_ratio_std 0.369171 0.040780 0.015734 0.084054 0.074781 0.176830 0.183112 0.015559 0.020534 0.055955 ... 0.393185 0.109461 0.407003 0.115226 0.318902 0.404992 0.431820 0.478934 0.483277 0.092460
minkowski_distance 0.367988 0.058778 0.030754 0.069073 0.087539 0.197664 0.201169 0.001259 0.011970 0.088154 ... 0.363508 0.117265 0.372212 0.125123 0.333252 0.350491 0.377956 0.423810 0.423766 0.088071
cityblock_distance 0.367678 0.058715 0.030465 0.068994 0.087468 0.197396 0.200907 0.001256 0.012152 0.088119 ... 0.363716 0.117373 0.372440 0.125044 0.333383 0.350662 0.378038 0.423797 0.423710 0.088191
char_distribution_kl 0.367569 0.076128 0.042658 0.071124 0.104468 0.205766 0.216190 0.005751 0.010056 0.064252 ... 0.427578 0.129746 0.440548 0.134910 0.329878 0.443522 0.466270 0.500613 0.505342 0.115182
edit_dist 0.363314 0.052797 0.031065 0.107013 0.098689 0.208114 0.208664 0.017945 0.011778 0.043613 ... 0.455591 0.136686 0.463785 0.147319 0.394476 0.455909 0.475519 0.523411 0.520470 0.114905
fuzz_partial_token_sort_ratio 0.360776 0.052253 0.026430 0.071715 0.083777 0.187334 0.193005 0.009621 0.012978 0.066607 ... 0.428750 0.124707 0.422835 0.124150 0.470208 0.416147 0.425894 0.532344 0.505971 0.097755
fuzz_partial_ratio 0.358234 0.063455 0.041912 0.082073 0.099348 0.203688 0.209226 0.010340 0.002210 0.046488 ... 0.370820 0.104515 0.372347 0.105105 0.488419 0.351008 0.370044 0.552829 0.530615 0.067812
wmd 0.357621 0.080754 0.051812 0.095494 0.123375 0.233257 0.230458 0.000490 0.005023 0.039572 ... 0.411257 0.129093 0.433550 0.142258 0.387992 0.393620 0.421469 0.450407 0.450937 0.117364
char_distribution_cosine 0.350929 0.066776 0.030565 0.066950 0.094656 0.190514 0.199702 0.002218 0.020482 0.072473 ... 0.361426 0.115177 0.369945 0.120829 0.307911 0.366026 0.388538 0.412910 0.415369 0.100264
norm_wmd 0.350185 0.071848 0.041669 0.091403 0.113215 0.223563 0.220018 0.004530 0.011780 0.049612 ... 0.429874 0.134419 0.452854 0.149156 0.400584 0.408873 0.435077 0.481996 0.476311 0.124535
jaccard 0.341490 0.057484 0.039217 0.093256 0.097949 0.202614 0.197817 0.015144 0.005916 0.061770 ... 0.429713 0.130990 0.409725 0.127537 0.566882 0.393118 0.395894 0.482611 0.444249 0.094577
freq_diff 0.337501 0.169080 0.100777 1.000000 0.298225 0.428979 0.444353 0.063712 0.056260 0.003862 ... 0.067860 0.018200 0.067786 0.021691 0.040075 0.073070 0.071185 0.076522 0.078187 0.018074
edit_dist_agg_n3_mean_mean 0.326339 0.024466 0.008742 0.076193 0.062855 0.161905 0.169505 0.018515 0.013620 0.010555 ... 0.369987 0.100574 0.389232 0.103124 0.290440 0.373141 0.398077 0.596105 0.623957 0.084170
edit_dist_agg_n3_min_min 0.326339 0.024466 0.008742 0.076193 0.062855 0.161905 0.169505 0.018515 0.013620 0.010555 ... 0.369987 0.100574 0.389232 0.103124 0.290440 0.373141 0.398077 0.596105 0.623957 0.084170
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
lsa_pn10_1 0.012576 0.001198 0.011997 0.005710 0.022041 0.000318 0.003768 0.012058 0.000979 0.006294 ... 0.011730 0.005014 0.014417 0.006008 0.018911 0.009201 0.007050 0.021093 0.027982 0.006002
pos_of_question1_n1_in_question2_median 0.012323 0.040680 0.019722 0.052882 0.023614 0.013920 0.014394 0.033490 0.009789 0.083415 ... 0.121917 0.175631 0.149237 0.202135 0.036631 0.162574 0.162411 0.378543 0.407656 0.143045
pos_of_question1_n1_in_question2_mean 0.011189 0.042929 0.020687 0.054457 0.025476 0.014044 0.014350 0.034695 0.010248 0.085515 ... 0.129234 0.182698 0.157260 0.209693 0.037232 0.171015 0.170485 0.393125 0.422616 0.148715
diff_len 0.011137 0.018938 0.025338 0.013381 0.002910 0.006019 0.006060 0.039278 0.028082 0.002180 ... 0.156516 0.776571 0.144614 0.746947 0.012580 0.150253 0.109326 0.129764 0.127882 0.651576
svd_cooc_on2_tn2_9 0.009420 0.013534 0.005391 0.001095 0.001769 0.001943 0.001611 0.014768 0.002735 0.003942 ... 0.012894 0.003522 0.011192 0.003479 0.019686 0.010378 0.009962 0.005077 0.004264 0.002928
lsa_cn10_q1_1 0.008334 0.107563 0.065040 0.013471 0.138602 0.037324 0.032981 0.033257 0.000336 0.007604 ... 0.013855 0.001805 0.014250 0.003909 0.008747 0.011963 0.009390 0.016959 0.021816 0.004651
lsa_cn10_q2_1 0.008334 0.107563 0.065040 0.013471 0.138602 0.037324 0.032981 0.033257 0.000336 0.007604 ... 0.013855 0.001805 0.014250 0.003909 0.008747 0.011963 0.009390 0.016959 0.021816 0.004651
lsa_wn3_q2_1 0.008334 0.107563 0.065040 0.013471 0.138602 0.037324 0.032981 0.033257 0.000336 0.007604 ... 0.013855 0.001805 0.014250 0.003909 0.008747 0.011963 0.009390 0.016959 0.021816 0.004651
nmf_c_question2_n5_1 0.008195 0.083173 0.083755 0.014720 0.076449 0.035411 0.033630 0.035111 0.040087 0.102338 ... 0.228009 0.292831 0.232863 0.313614 0.061320 0.244838 0.243706 0.485460 0.493997 0.256925
svd_cooc_on2_tn1_5 0.007119 0.037085 0.026726 0.006325 0.027375 0.009104 0.010497 0.004027 0.006271 0.001474 ... 0.002217 0.000463 0.001060 0.003516 0.001949 0.001334 0.001964 0.001781 0.000243 0.000294
nmf_p_n2_0 0.006894 0.079055 0.099540 0.000397 0.074983 0.019041 0.017734 0.054480 0.083563 0.061907 ... 0.048702 0.011127 0.040406 0.007840 0.017834 0.048133 0.046750 0.061933 0.051369 0.008193
lsa_pn10_2 0.006817 0.185262 0.134210 0.020450 0.220081 0.045107 0.038408 0.028231 0.006104 0.035418 ... 0.000562 0.002055 0.001679 0.005174 0.009103 0.004656 0.006761 0.029028 0.020454 0.002444
svd_cooc_on1_tn2_8 0.006333 0.005427 0.016213 0.007188 0.004666 0.001477 0.005593 0.013858 0.017694 0.000675 ... 0.002742 0.001815 0.006556 0.000166 0.000590 0.004466 0.006302 0.012136 0.019301 0.001542
lsa_cn10_q1_5 0.006272 0.145682 0.294280 0.008552 0.118232 0.032678 0.033686 0.090832 0.252095 0.107857 ... 0.005736 0.007977 0.007649 0.010489 0.001152 0.000503 0.002738 0.017380 0.009489 0.008456
lsa_cn10_q2_5 0.006272 0.145682 0.294280 0.008552 0.118232 0.032678 0.033686 0.090832 0.252095 0.107857 ... 0.005736 0.007977 0.007649 0.010489 0.001152 0.000503 0.002738 0.017380 0.009489 0.008456
lsa_wn3_q2_5 0.006272 0.145682 0.294280 0.008552 0.118232 0.032678 0.033686 0.090832 0.252095 0.107857 ... 0.005736 0.007977 0.007649 0.010489 0.001152 0.000503 0.002738 0.017380 0.009489 0.008456
pos_of_question2_n1_in_question1_mean 0.006033 0.046832 0.002510 0.063798 0.018479 0.002969 0.002360 0.055148 0.027873 0.077163 ... 0.220370 0.211369 0.243474 0.236622 0.049420 0.254441 0.231778 0.456183 0.483100 0.181531
edit_dist_agg_n2_max_std 0.005927 0.002377 0.025246 0.012332 0.011621 0.008176 0.015845 0.015403 0.024549 0.020470 ... 0.015716 0.197529 0.005400 0.246259 0.014387 0.015493 0.008581 0.087150 0.095353 0.163037
pos_of_question2_n1_in_question1_median 0.005047 0.043768 0.001196 0.062415 0.016045 0.002672 0.002142 0.053597 0.027518 0.075142 ... 0.211907 0.202691 0.234442 0.227316 0.048960 0.245076 0.223403 0.440685 0.467177 0.174194
lsa_wn3_q1_5 0.004689 0.186252 0.152928 0.007093 0.108101 0.023675 0.028104 0.144274 0.120457 0.126869 ... 0.007369 0.005389 0.004293 0.005723 0.008033 0.009288 0.010776 0.010182 0.006587 0.002025
cooc_tfidf_question1_n1_count_std 0.004534 0.041032 0.064001 0.073066 0.047108 0.015999 0.015170 0.000161 0.052752 0.024478 ... 0.019993 0.028305 0.018761 0.050279 0.021049 0.010642 0.006420 0.031825 0.039982 0.018581
z_q2_place_num 0.004522 0.005548 0.008613 0.005287 0.002585 0.005915 0.003055 0.008097 0.011376 0.738607 ... 0.032781 0.043078 0.037139 0.043092 0.016649 0.042655 0.045656 0.084232 0.098806 0.041025
nmf_p_n2_1 0.003621 0.026742 0.012484 0.011828 0.052396 0.014314 0.011199 0.036264 0.066747 0.012954 ... 0.062976 0.018539 0.052242 0.015479 0.019251 0.061720 0.056959 0.080989 0.067650 0.014542
nmf_w_question1_n3_0 0.003426 0.144577 0.096858 0.016395 0.113097 0.013826 0.013711 0.076627 0.025876 0.038849 ... 0.039278 0.007077 0.041755 0.007526 0.039450 0.034794 0.035216 0.033604 0.032700 0.010026
nmf_c_question1_n5_1 0.002647 0.073505 0.064601 0.026593 0.072965 0.036413 0.034726 0.018382 0.018876 0.109749 ... 0.175084 0.214403 0.187532 0.249333 0.057093 0.207034 0.219829 0.443276 0.453683 0.202050
z_q1_place_num 0.002183 0.009255 0.008271 0.003545 0.005260 0.006741 0.004060 0.011865 0.007571 0.749328 ... 0.023213 0.031389 0.027874 0.037104 0.013278 0.033444 0.037178 0.075146 0.089350 0.025675
edit_dist_agg_n1_max_std 0.001549 0.029105 0.001893 0.015360 0.017830 0.007274 0.003805 0.029608 0.022281 0.015480 ... 0.070055 0.184489 0.067459 0.221408 0.052199 0.062229 0.054836 0.199959 0.212620 0.156063
cooc_tfidf_question2_n1_count_std 0.001358 0.027159 0.069295 0.068700 0.043920 0.014229 0.013165 0.023579 0.066267 0.025357 ... 0.032098 0.049792 0.029457 0.031464 0.015366 0.004882 0.006462 0.024904 0.032381 0.049008
z_q2_has_place 0.000900 0.001412 0.002652 0.001977 0.004445 0.010539 0.006853 0.007198 0.010221 0.762324 ... 0.019203 0.030487 0.022519 0.029478 0.011847 0.027166 0.031130 0.070461 0.084916 0.028749
z_q1_has_place 0.000180 0.004121 0.003831 0.001484 0.000121 0.009932 0.006590 0.010791 0.006724 0.764193 ... 0.013340 0.027691 0.017229 0.033519 0.010078 0.021595 0.026155 0.063369 0.077690 0.022233

392 rows × 392 columns


In [7]:
####################################################
########### remove high related features ###########
####################################################

ab_corr = abs(tr_corr).sort_values(by='is_duplicate',ascending=0)
corr_list = list(ab_corr.index)
for i in list(ab_corr.index):
    if i in corr_list:
        for j in corr_list:
            if ab_corr.ix[i][j]>0.9 and ab_corr.ix[i][j]<1:
                corr_list.remove(j)
                print (i,j,ab_corr.ix[i][j])
    else: pass
    
# df = df[corr_list]
# print df.shape


('q1_q2_wm_ratio', 'q1_q2_intersect_ratio', 0.98339464560072498)
('word_match', 'tfidf_wm', 0.93322210026153851)
('tfidf_wm_stops', 'word_match', 0.92995158765309249)
('cosine_distance', 'braycurtis_distance', 0.97916062541974491)
('cosine_distance', 'RMSE_distance', 0.94831239762790553)
('cosine_distance', 'minkowski_distance', 0.94797559168818923)
('fuzz_token_sort_ratio', 'fuzz_qratio', 0.92026621176353296)
('fuzz_token_sort_ratio', 'fuzz_partial_token_sort_ratio', 0.91308422436086456)
('euclidean_distance', 'cosine_distance', 0.94831843891180545)
('euclidean_distance', 'cityblock_distance', 0.99935660679125382)
('euclidean_distance', 'canberra_distance', 0.9563974628318419)
('wmd', 'norm_wmd', 0.97716355153362222)
('total_unq_words_stop', 'total_unique_words', 0.95390582381940558)
('char_diff_unq_stop', 'wc_diff_unq_stop', 0.90138626802904287)
('char_diff', 'char_diff_unq_stop', 0.91877313594035004)
('char_diff', 'wc_diff_unique', 0.90408376051139294)
('wc_diff', 'char_diff', 0.9231118250276108)
('len_q1', 'len_word_q1', 0.95802024672788144)
('len_q2', 'len_word_q2', 0.9675617534614811)
('wc_ratio', 'wc_ratio_unique', 0.9855730058919987)

In [5]:
#######################################
######### feature selection ###########
#######################################


train = df[df['is_duplicate']!=-1].copy()
train =train.replace([np.inf, -np.inf], np.nan).dropna()

# X_indices = np.arange(train.shape[-1])
# selector = SelectPercentile(f_classif, percentile=10)
# selector.fit(train, train['is_duplicate'])
# scores = -np.log10(selector.pvalues_)
# scores /= scores.max()
# plt.bar(X_indices - .45, scores, width=.2,
#         label=r'Univariate score ($-Log(p_{value})$)', color='darkorange')

full_feat = list(train.columns)
full_feat.remove('is_duplicate')
min_max_scaler = preprocessing.MinMaxScaler()
train[full_feat] = min_max_scaler.fit_transform(train[full_feat])

selector = SelectKBest(chi2, k=350)
selector.fit(train[full_feat], train['is_duplicate'])
idxs_selected = selector.get_support(indices=True)
columns_selected = train[full_feat].columns[idxs_selected]
print columns_selected
del train
gc.collect()

# df = df[list(columns_selected)+['is_duplicate']]


Index([u'q1_freq', u'q2_freq', u'freq_diff', u'q1_q2_intersect',
       u'q1_q2_intersect_ratio', u'q1_q2_wm_ratio', u'q1_pr', u'q2_pr',
       u'z_place_match', u'z_place_match_num',
       ...
       u'wc_diff_unique', u'wc_ratio_unique', u'wc_diff_unq_stop',
       u'wc_ratio_unique_stop', u'same_start', u'char_diff',
       u'char_diff_unq_stop', u'total_unique_words', u'total_unq_words_stop',
       u'char_ratio'],
      dtype='object', length=350)

In [6]:
######### 0.8 oversample ###########
test = df[df['is_duplicate']==-1].copy()
del test['is_duplicate']
train = df[df['is_duplicate']!=-1].copy()
del train['is_duplicate']
print train.shape, test.shape

X_train, X_test, y_train, y_test = train_test_split(train, train_y, test_size=0.1, random_state=4242)

#UPDownSampling
pos_train = X_train[y_train == 1]
neg_train = X_train[y_train == 0]
X_train = pd.concat((neg_train, pos_train.iloc[:int(0.8*len(pos_train))], neg_train))
y_train = np.array([0] * neg_train.shape[0] + [1] * pos_train.iloc[:int(0.8*len(pos_train))].shape[0] + [0] * neg_train.shape[0])
print(np.mean(y_train))
del pos_train, neg_train

pos_test = X_test[y_test == 1]
neg_test = X_test[y_test == 0]
X_test = pd.concat((neg_test, pos_test.iloc[:int(0.8 * len(pos_test))], neg_test))
y_test = np.array([0] * neg_test.shape[0] + [1] * pos_test.iloc[:int(0.8 * len(pos_test))].shape[0] + [0] * neg_test.shape[0])
print(np.mean(y_test))
del pos_test, neg_test


(404290, 46) (2345796, 46)
0.189752932122
0.189234677675

In [22]:
def oversample(X_ot,y,p=0.173):
    raw_num = X_ot.shape[0]
    print "RAW shape: {} | Mean rate: {}".format(X_ot.shape[0], y.mean())
    pos_ot = X_ot[y==1]
    neg_ot = X_ot[y==0]
    #p = 0.165
    scale = ((pos_ot.shape[0]*1.0 / (pos_ot.shape[0] + neg_ot.shape[0])) / p) - 1
    while scale > 1:
        neg_ot = np.vstack([neg_ot, neg_ot])
        scale -=1
    neg_ot = np.vstack([neg_ot, neg_ot[:int(scale * neg_ot.shape[0])]])
    ot = np.vstack([pos_ot, neg_ot])
    y=np.zeros(ot.shape[0])
    y[:pos_ot.shape[0]]=1.0
    print "Oversample: {} | Mean rate: {}".format(ot.shape[0],y.mean())
    return ot,y


test = df[df['is_duplicate']==-1].copy()
del test['is_duplicate']
train = df[df['is_duplicate']!=-1].copy()
del train['is_duplicate']
del df
print train.shape, test.shape

############### drop absolute duplicate rows #################
train.drop(train.index[[config.ab_dup_test]], inplace=True)
train.reset_index(drop=True, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(train, train_y, test_size=0.1, random_state=1048)

X_train,y_train = oversample(X_train,y_train,p=0.1742)
X_test,y_test = oversample(X_test,y_test,p=0.1742)
X_train,y_train = shuffle(X_train,y_train,random_state=421)  
print X_train.shape, y_train.shape
gc.collect()

In [10]:
def xgb_train(n=2500,params=False):
    if not params:
        params = {}
        params['base_score'] = 0.2
        params['objective'] = 'binary:logistic'
        params['eval_metric'] = 'logloss'
        params['eta'] = 0.02
        params['max_depth'] = 7
        params['subsample'] = 0.6
        params['nthread'] = 16
        # params['colsample_bytree'] = 0.6 
        # params['gamma'] = 0.1 
        # params['min_child_weight'] = 5
        # params['scale_pos_weight'] = 0.2
    print params

    d_train = xgb.DMatrix(X_train, label=y_train)
    d_valid = xgb.DMatrix(X_test, label=y_test)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    bst = xgb.train(params, d_train, n, watchlist, early_stopping_rounds=50, verbose_eval=100)
    train_loss_str = log_loss(y_train, bst.predict(d_train))
    valid_loss_str = log_loss(y_test, bst.predict(d_valid))
    sub_stamp = 'xgb_t%s_v%s'%(str(train_loss_str)[2:6], str(valid_loss_str)[2:6])
    
    print 'train logloss: %s' %(train_loss_str)
    print 'valid logloss: %s' %(valid_loss_str)
    print 'best_ntree_limit %s' %(bst.best_ntree_limit)
    
    return bst, bst.best_ntree_limit, sub_stamp, params

In [11]:
params_opt = {'colsample_bytree': 0.7852290495822306, 'eval_metric': 'logloss', 'max_delta_step': 2.0, 'nthread': 16, 
          'min_child_weight': 3.0, 'subsample': 0.8071297738930207, 'base_score': 0.2, 
          'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 11, 'gamma': 0.1, 'lambda': 100}
params_opt['eta'] = 0.02

bst, BEST_NTREE, sub_stamp, PARAMS = xgb_train(n=5000,params=params_opt)

bst.save_model(config.SUB_PATH + sub_stamp+'.mdl')
bst = xgb.Booster(PARAMS)
bst.load_model(config.SUB_PATH + sub_stamp+'.mdl')

d_test = xgb.DMatrix(test)
p_test = bst.predict(d_test, ntree_limit=BEST_NTREE)

sub = pd.DataFrame()
df_test = pd.read_csv(config.RAW_PATH+'test.csv')
sub['test_id'] = df_test['test_id']
sub['is_duplicate'] = p_test
sub.to_csv(config.SUB_PATH + sub_stamp+'.csv', index=False)

del df_test
gc.collect()


{'eval_metric': 'logloss', 'max_delta_step': 2.0, 'base_score': 0.2, 'alpha': 10, 'colsample_bytree': 0.7852290495822306, 'nthread': 16, 'min_child_weight': 3.0, 'subsample': 0.8071297738930207, 'eta': 0.02, 'objective': 'binary:logistic', 'max_depth': 11, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.50341	valid-logloss:0.50313
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[100]	train-logloss:0.219269	valid-logloss:0.227403
[200]	train-logloss:0.18408	valid-logloss:0.196613
[300]	train-logloss:0.172434	valid-logloss:0.188292
[400]	train-logloss:0.164398	valid-logloss:0.184019
[500]	train-logloss:0.157704	valid-logloss:0.181411
[600]	train-logloss:0.151349	valid-logloss:0.17949
[700]	train-logloss:0.145556	valid-logloss:0.178007
[800]	train-logloss:0.140086	valid-logloss:0.176809
[900]	train-logloss:0.134905	valid-logloss:0.175897
[1000]	train-logloss:0.130165	valid-logloss:0.175175
[1100]	train-logloss:0.125414	valid-logloss:0.174495
[1200]	train-logloss:0.120968	valid-logloss:0.174038
[1300]	train-logloss:0.116844	valid-logloss:0.173565
[1400]	train-logloss:0.11285	valid-logloss:0.173223
[1500]	train-logloss:0.109094	valid-logloss:0.172909
[1600]	train-logloss:0.105561	valid-logloss:0.172716
[1700]	train-logloss:0.102334	valid-logloss:0.172533
[1800]	train-logloss:0.099147	valid-logloss:0.172337
[1900]	train-logloss:0.096148	valid-logloss:0.172187
[2000]	train-logloss:0.093203	valid-logloss:0.172082
[2100]	train-logloss:0.090359	valid-logloss:0.171944
[2200]	train-logloss:0.087646	valid-logloss:0.171907
Stopping. Best iteration:
[2209]	train-logloss:0.087436	valid-logloss:0.171888

train logloss: 0.0860946948267
valid logloss: 0.171893311453
best_ntree_limit 2210
Out[11]:
82

In [11]:
####### lightgbm ######

def lgb_train(params, n_inter=200):
    d_train = lgb.Dataset(X_train, label=y_train)
    d_valid = lgb.Dataset(X_test, label=y_test)
    watchlist = [d_train, d_valid]
    bst = lgb.train(params, d_train, n_inter, watchlist, early_stopping_rounds=30, verbose_eval=100)
    train_loss_str = log_loss(y_train, bst.predict(X_train))
    valid_loss_str = log_loss(y_test, bst.predict(X_test))
    sub_stamp = 'lgb_t%s_v%s'%(str(train_loss_str)[2:6], str(valid_loss_str)[2:6])
    num_iteration=bst.best_iteration

    print 'train logloss: %s' %(train_loss_str)
    print 'valid logloss: %s' %(valid_loss_str)
    return bst, num_iteration, sub_stamp

params_lgb = {}
params_lgb['learning_rate'] = 0.05
params_lgb['boosting_type'] = 'dart'
params_lgb['objective'] = 'binary'
params_lgb['metric'] = 'binary_logloss'
params_lgb['feature_fraction'] = 0.7
params_lgb['bagging_fraction'] = 0.7
params_lgb['num_leaves'] = 256
params_lgb['max_depth'] = 8
params_lgb['min_data_in_leaf'] = 1
params_lgb['min_data'] = 50
params_lgb['min_hessian'] = 1
params_lgb['is_unbalance'] = True

bst, num_iteration, sub_stamp = lgb_train(params_lgb, n_inter=2000)
p_test = bst.predict(test, num_iteration=num_iteration)

sub = pd.DataFrame()
df_test = pd.read_csv(config.RAW_PATH+'test.csv')
sub['test_id'] = df_test['test_id']
sub['is_duplicate'] = p_test
sub.to_csv(config.SUB_PATH + sub_stamp+'.csv', index=False)

del df_test
gc.collect()

# lgb.train(params, train_set, num_boost_round=100, valid_sets=None, valid_names=None, fobj=None, feval=None, 
#           init_model=None, feature_name='auto', categorical_feature='auto', early_stopping_rounds=None, 
#           evals_result=None, verbose_eval=True, learning_rates=None, callbacks=None)


Train until valid scores didn't improve in 30 rounds.
[100]	training's binary_logloss: 0.258224	valid_1's binary_logloss: 0.283031
[200]	training's binary_logloss: 0.21013	valid_1's binary_logloss: 0.247885
[300]	training's binary_logloss: 0.180039	valid_1's binary_logloss: 0.231944
[400]	training's binary_logloss: 0.158238	valid_1's binary_logloss: 0.223963
[500]	training's binary_logloss: 0.14591	valid_1's binary_logloss: 0.220252
Early stopping, best iteration is:
[526]	training's binary_logloss: 0.140657	valid_1's binary_logloss: 0.218957
train logloss: 0.146522590924
valid logloss: 0.220318533714
Out[11]:
7

In [7]:
############# oof ############
def xgb_train(n=2500,params=False):
    if not params:
        params = {}
        params['base_score'] = 0.2
        params['objective'] = 'binary:logistic'
        params['eval_metric'] = 'logloss'
        params['eta'] = 0.02
        params['max_depth'] = 7
        params['subsample'] = 0.75
        params['nthread'] = 16
        params['colsample_bytree'] = 0.7 
        params['gamma'] = 0.1 
        params['min_child_weight'] = 3
        # params['scale_pos_weight'] = 0.2
        params['max_delta_step'] = 2

    print params

    d_train = xgb.DMatrix(X_train, label=y_train)
    d_valid = xgb.DMatrix(X_test, label=y_test)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    bst = xgb.train(params, d_train, n, watchlist, early_stopping_rounds=50, verbose_eval=200)
    train_loss_str = log_loss(y_train, bst.predict(d_train))
    valid_loss_str = log_loss(y_test, bst.predict(d_valid))
    sub_stamp = 'xgb_t%s_v%s'%(str(train_loss_str)[2:6], str(valid_loss_str)[2:6])
    print sub_stamp
    print 'train logloss: %s' %(train_loss_str)
    print 'valid logloss: %s' %(valid_loss_str)
    print 'best_ntree_limit: %s' %(bst.best_ntree_limit)
    oof_valid = bst.predict(d_valid, ntree_limit = bst.best_ntree_limit)
    return bst, bst.best_ntree_limit, sub_stamp, params, oof_valid


# random_seed = config.oof_random
random_seed = 1988


test_array = np.zeros(test.shape[0],dtype='float32')
oof_array = []

kf = StratifiedKFold(n_splits=5, random_state=random_seed, shuffle=True)
for train_idx, valid_idx in kf.split(train, y=train_y):
    X_train, X_test, y_train, y_test = train.loc[train_idx,:], train.loc[valid_idx,:], train_y[train_idx],train_y[valid_idx]
    print X_train.shape, y_train.shape
    X_train,y_train = oversample(X_train,y_train,p=0.1742)
    X_test,y_test = oversample(X_test,y_test,p=0.1742)
    X_train,y_train = shuffle(X_train,y_train,random_state=42)  
    print X_train.shape, y_train.shape
    bst, best_ntree_limit, sub_stamp, params, oof_valid = xgb_train(n=3000)
    oof_array.extend(list(oof_valid))
    
    bst.save_model(config.SUB_PATH + sub_stamp+'_oof'+'.mdl')
    bst = xgb.Booster(params)
    bst.load_model(config.SUB_PATH + sub_stamp+'_oof'+'.mdl')
    d_test = xgb.DMatrix(test)
    p_test = bst.predict(d_test, ntree_limit = best_ntree_limit)
    test_array = test_array + p_test
    print '='*20

test_array = test_array / 5.0
sub = pd.DataFrame()
df_test = pd.read_csv(config.RAW_PATH+'test.csv')
sub['test_id'] = df_test['test_id']
sub['is_duplicate'] = test_array
sub.to_csv(config.SUB_PATH + 'oof_xgb_test.csv',index=False)

sub = pd.DataFrame()
sub['xgb'] = oof_array
sub.to_csv(config.SUB_PATH + 'oof_xgb_valid.csv',index=False)


del df_test
gc.collect()


(323402, 403) (323402,)
RAW shape: 323402 | Mean rate: 0.369230864373
Oversample: 576180 | Mean rate: 0.207244263945
RAW shape: 80852 | Mean rate: 0.369230198387
Oversample: 144047 | Mean rate: 0.207244857581
(576180, 403) (576180,)
{'colsample_bytree': 0.7, 'eval_metric': 'logloss', 'max_delta_step': 2, 'nthread': 16, 'base_score': 0.2, 'subsample': 0.75, 'eta': 0.02, 'min_child_weight': 3, 'objective': 'binary:logistic', 'max_depth': 7, 'gamma': 0.1}
[0]	train-logloss:0.503596	valid-logloss:0.50362
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[200]	train-logloss:0.189948	valid-logloss:0.196425
[400]	train-logloss:0.170875	valid-logloss:0.183451
[600]	train-logloss:0.159765	valid-logloss:0.178592
[800]	train-logloss:0.151321	valid-logloss:0.176105
[1000]	train-logloss:0.144417	valid-logloss:0.174563
[1200]	train-logloss:0.138119	valid-logloss:0.173543
[1400]	train-logloss:0.132505	valid-logloss:0.17275
[1600]	train-logloss:0.127264	valid-logloss:0.172054
[1800]	train-logloss:0.122337	valid-logloss:0.171545
[2000]	train-logloss:0.117792	valid-logloss:0.171192
[2200]	train-logloss:0.113222	valid-logloss:0.170801
[2400]	train-logloss:0.109032	valid-logloss:0.170515
[2600]	train-logloss:0.104877	valid-logloss:0.170257
[2800]	train-logloss:0.101069	valid-logloss:0.170069
Stopping. Best iteration:
[2929]	train-logloss:0.098683	valid-logloss:0.170014

xgb_t0978_v1700
train logloss: 0.0978111479097
valid logloss: 0.170015492358
best_ntree_limit: 2930
====================
(323403, 403) (323403,)
RAW shape: 323403 | Mean rate: 0.369229722668
Oversample: 576180 | Mean rate: 0.207244263945
RAW shape: 80851 | Mean rate: 0.369234765185
Oversample: 144047 | Mean rate: 0.207244857581
(576180, 403) (576180,)
{'colsample_bytree': 0.7, 'eval_metric': 'logloss', 'max_delta_step': 2, 'nthread': 16, 'base_score': 0.2, 'subsample': 0.75, 'eta': 0.02, 'min_child_weight': 3, 'objective': 'binary:logistic', 'max_depth': 7, 'gamma': 0.1}
[0]	train-logloss:0.503585	valid-logloss:0.503638
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[200]	train-logloss:0.190597	valid-logloss:0.195747
[400]	train-logloss:0.171292	valid-logloss:0.181869
[600]	train-logloss:0.160194	valid-logloss:0.17668
[800]	train-logloss:0.15158	valid-logloss:0.174097
[1000]	train-logloss:0.144635	valid-logloss:0.172601
[1200]	train-logloss:0.138357	valid-logloss:0.171387
[1400]	train-logloss:0.132654	valid-logloss:0.170562
[1600]	train-logloss:0.1274	valid-logloss:0.169941
[1800]	train-logloss:0.122559	valid-logloss:0.169412
[2000]	train-logloss:0.118132	valid-logloss:0.169058
[2200]	train-logloss:0.113696	valid-logloss:0.168664
[2400]	train-logloss:0.109539	valid-logloss:0.168402
[2600]	train-logloss:0.105416	valid-logloss:0.168179
[2800]	train-logloss:0.101578	valid-logloss:0.16807
xgb_t0977_v1679
train logloss: 0.097758615212
valid logloss: 0.167935983843
best_ntree_limit: 2999
====================
(323403, 403) (323403,)
RAW shape: 323403 | Mean rate: 0.369229722668
Oversample: 576180 | Mean rate: 0.207244263945
RAW shape: 80851 | Mean rate: 0.369234765185
Oversample: 144047 | Mean rate: 0.207244857581
(576180, 403) (576180,)
{'colsample_bytree': 0.7, 'eval_metric': 'logloss', 'max_delta_step': 2, 'nthread': 16, 'base_score': 0.2, 'subsample': 0.75, 'eta': 0.02, 'min_child_weight': 3, 'objective': 'binary:logistic', 'max_depth': 7, 'gamma': 0.1}
[0]	train-logloss:0.503603	valid-logloss:0.503628
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[200]	train-logloss:0.190238	valid-logloss:0.196067
[400]	train-logloss:0.171149	valid-logloss:0.182294
[600]	train-logloss:0.159929	valid-logloss:0.177168
[800]	train-logloss:0.151603	valid-logloss:0.174466
[1000]	train-logloss:0.144687	valid-logloss:0.17276
[1200]	train-logloss:0.138722	valid-logloss:0.171589
[1400]	train-logloss:0.133166	valid-logloss:0.170716
[1600]	train-logloss:0.128086	valid-logloss:0.170038
[1800]	train-logloss:0.123255	valid-logloss:0.169475
[2000]	train-logloss:0.11831	valid-logloss:0.168978
[2200]	train-logloss:0.113997	valid-logloss:0.168551
[2400]	train-logloss:0.109733	valid-logloss:0.168237
[2600]	train-logloss:0.105741	valid-logloss:0.167985
[2800]	train-logloss:0.101963	valid-logloss:0.167726
xgb_t0982_v1675
train logloss: 0.098234591892
valid logloss: 0.167548654817
best_ntree_limit: 3000
====================
(323404, 403) (323404,)
RAW shape: 323404 | Mean rate: 0.369231673078
Oversample: 576185 | Mean rate: 0.207244201081
RAW shape: 80850 | Mean rate: 0.369226963513
Oversample: 144042 | Mean rate: 0.207245109065
(576185, 403) (576185,)
{'colsample_bytree': 0.7, 'eval_metric': 'logloss', 'max_delta_step': 2, 'nthread': 16, 'base_score': 0.2, 'subsample': 0.75, 'eta': 0.02, 'min_child_weight': 3, 'objective': 'binary:logistic', 'max_depth': 7, 'gamma': 0.1}
[0]	train-logloss:0.503513	valid-logloss:0.503563
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[200]	train-logloss:0.189973	valid-logloss:0.197182
[400]	train-logloss:0.170949	valid-logloss:0.183589
[600]	train-logloss:0.159742	valid-logloss:0.178529
[800]	train-logloss:0.151519	valid-logloss:0.175952
[1000]	train-logloss:0.144369	valid-logloss:0.174411
[1200]	train-logloss:0.13786	valid-logloss:0.173227
[1400]	train-logloss:0.132026	valid-logloss:0.172338
[1600]	train-logloss:0.126929	valid-logloss:0.171714
[1800]	train-logloss:0.122152	valid-logloss:0.171227
[2000]	train-logloss:0.117607	valid-logloss:0.170758
[2200]	train-logloss:0.113114	valid-logloss:0.170377
[2400]	train-logloss:0.10904	valid-logloss:0.170126
[2600]	train-logloss:0.105079	valid-logloss:0.169899
[2800]	train-logloss:0.101117	valid-logloss:0.169726
xgb_t0974_v1695
train logloss: 0.0974404333264
valid logloss: 0.169514171229
best_ntree_limit: 3000
====================
(323404, 403) (323404,)
RAW shape: 323404 | Mean rate: 0.369231673078
Oversample: 576185 | Mean rate: 0.207244201081
RAW shape: 80850 | Mean rate: 0.369226963513
Oversample: 144042 | Mean rate: 0.207245109065
(576185, 403) (576185,)
{'colsample_bytree': 0.7, 'eval_metric': 'logloss', 'max_delta_step': 2, 'nthread': 16, 'base_score': 0.2, 'subsample': 0.75, 'eta': 0.02, 'min_child_weight': 3, 'objective': 'binary:logistic', 'max_depth': 7, 'gamma': 0.1}
[0]	train-logloss:0.503519	valid-logloss:0.503537
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[200]	train-logloss:0.190184	valid-logloss:0.195659
[400]	train-logloss:0.171194	valid-logloss:0.182356
[600]	train-logloss:0.160483	valid-logloss:0.177624
[800]	train-logloss:0.151943	valid-logloss:0.175074
[1000]	train-logloss:0.144739	valid-logloss:0.173407
[1200]	train-logloss:0.138599	valid-logloss:0.17234
[1400]	train-logloss:0.13311	valid-logloss:0.171536
[1600]	train-logloss:0.127733	valid-logloss:0.170858
[1800]	train-logloss:0.122723	valid-logloss:0.170369
[2000]	train-logloss:0.117977	valid-logloss:0.169907
[2200]	train-logloss:0.113505	valid-logloss:0.169588
[2400]	train-logloss:0.109178	valid-logloss:0.16927
[2600]	train-logloss:0.105341	valid-logloss:0.169039
[2800]	train-logloss:0.10157	valid-logloss:0.168889
xgb_t0979_v1687
train logloss: 0.0979464055983
valid logloss: 0.16870307632
best_ntree_limit: 2993
====================
Out[7]:
154

In [40]:
###################################################
############### train on full dataset #############
###################################################

X_train, X_test, y_train, y_test = train_test_split(train, train_y, test_size=0.2, random_state=1024)

X_train,y_train = oversample(X_train,y_train,p=0.173)
X_test,y_test = oversample(X_test,y_test,p=0.173)

X_train = np.concatenate((X_train, X_test), axis=0)#.shape
y_train = np.concatenate((y_train, y_test), axis=0)#.shape

X_train,y_train = shuffle(X_train,y_train,random_state=42)  
print X_train.shape, y_train.shape

d_train = xgb.DMatrix(X_train, label=y_train)
bst = xgb.train(params, d_train, 500, verbose_eval=30)
print(log_loss(y_test, bst.predict(d_valid)))


RAW shape: 323432 | Mean rate: 0.368924534369
Oversample: 581635 | Mean rate: 0.205149277468
RAW shape: 80858 | Mean rate: 0.370291127656
Oversample: 146073 | Mean rate: 0.204972856038
(727708, 207) (727708,)
0.185475337464

In [9]:
features = list(train.columns.values)
print("Features: {}".format(len(features)))
print("Features importances...")
mapFeat = dict(zip(["f"+str(i) for i in range(len(features))],features))
# ft.index = ft.reset_index()['index'].map(mapFeat)

ft = pd.Series(bst.get_fscore())

ft = pd.DataFrame(ft).reset_index().rename(columns={'index': 'feature',0:"fscore"}).sort_values(by='fscore',ascending=1)
# ft['feature'] = ft['feature'].map(mapFeat)
ft.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(10, 25))


Features: 393
Features importances...
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fce2ad260d0>

In [99]:
def create_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    i = 0
    for feat in features:
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
        i = i + 1
    outfile.close()

feature_names = list(train.columns.values)
create_feature_map(feature_names)
print("Features: {}".format(len(feature_names)))
print("Features importances...")
import operator
importance = bst.get_fscore(fmap='xgb.fmap')
importance = sorted(importance.items(), key=operator.itemgetter(1))
ft = pd.DataFrame(importance, columns=['feature', 'fscore'])

ft.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(10, 25))


Features: 33
Features importances...
Out[99]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3d2a9d2750>

In [24]:
ft_corr = tr_corr.reset_index()
ft_corr = ft_corr.rename(columns={'index': 'feature'})[['feature','is_duplicate']]
ft_corr = ft_corr.merge(ft,how='left')
ft_corr.sort_values(by='fscore',ascending=0)


Out[24]:
feature is_duplicate fscore
22 common_words 0.232293 1537.0
31 norm_wmd -0.350185 1444.0
14 q1_q2_wm_ratio 0.645999 1148.0
13 q1_q2_intersect_ratio 0.610168 1106.0
30 wmd -0.357621 1073.0
51 edit_dist -0.335208 938.0
5 freq_diff -0.332427 816.0
28 fuzz_token_set_ratio 0.396250 806.0
25 fuzz_partial_ratio 0.358234 802.0
12 q1_q2_intersect 0.432860 706.0
23 fuzz_qratio 0.370515 669.0
4 q2_freq 0.265540 649.0
32 cosine_distance -0.383353 588.0
3 q1_freq 0.343747 560.0
29 fuzz_token_sort_ratio 0.372549 557.0
50 compression_dist -0.365613 554.0
178 norm_pos_of_question1_n1_in_question2_mean 0.146704 531.0
168 norm_pos_of_question2_n1_in_question1_mean 0.138984 513.0
130 cooc_tfidf_question1_n2_count_max 0.215381 500.0
125 cooc_tfidf_question2_n2_count_max 0.216392 487.0
42 kur_q2vec 0.015286 453.0
111 cooc_tfidf_question1_n1_count_std 0.105391 444.0
143 bm25_question2_n1_mean 0.287934 440.0
181 norm_pos_of_question1_n1_in_question2_std 0.221486 421.0
180 norm_pos_of_question1_n1_in_question2_max 0.252094 416.0
148 bm25_question1_n1_mean 0.289771 412.0
103 cooc_tfidf_question2_n1_count_mean 0.242006 410.0
171 norm_pos_of_question2_n1_in_question1_std 0.220216 406.0
44 jaccard_n1 0.336347 402.0
15 len_q1 -0.171079 400.0
... ... ... ...
43 RMSE_distance -0.368021 49.0
185 pos_of_question2_n2_in_question1_max 0.116917 49.0
162 pos_of_question2_n1_in_question1_min -0.128058 49.0
208 cooccurrence_close_count_n1 0.101425 48.0
66 edit_dist_agg_n2_min_mean -0.264122 45.0
152 bm25_question2_n2_min 0.063232 44.0
195 pos_of_question1_n2_in_question2_max 0.143764 42.0
165 pos_of_question2_n1_in_question1_max -0.052749 39.0
132 cooc_tfidf_question2_n2_ratio_min 0.051844 34.0
210 cooccurrence_close_count_n2 0.166876 30.0
58 edit_dist_agg_n2_max_max -0.028882 26.0
137 cooc_tfidf_question1_n2_ratio_min 0.054437 25.0
212 cooccurrence_close_count_n3 0.143607 18.0
157 bm25_question1_n2_min 0.068174 17.0
34 jaccard_distance -0.131714 16.0
99 edit_dist_agg_n3_mean_min -0.251771 15.0
127 cooc_tfidf_question1_n2_count_min 0.073408 11.0
74 edit_dist_agg_n2_mean_min -0.264122 9.0
101 edit_dist_agg_n3_mean_mean -0.251771 9.0
76 edit_dist_agg_n2_mean_mean -0.264122 4.0
0 is_duplicate 1.000000 NaN
1 q1_hash -0.207682 NaN
2 q2_hash -0.356072 NaN
6 q_hash_pos 0.123509 NaN
7 q_hash_pos_1 0.207493 NaN
8 q2_change -0.361354 NaN
9 q1_change -0.259241 NaN
10 q1_q2_change_max -0.369983 NaN
11 q_change_pair 0.422098 NaN
26 fuzz_partial_token_set_ratio 0.156094 NaN

216 rows × 3 columns


In [ ]:
{'colsample_bytree': 0.7852290495822306, 'eval_metric': 'logloss', 'max_delta_step': 2.0, 'nthread': 16, 'min_child_weight': 3.0, 'subsample': 0.8071297738930207, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 11, 'gamma': 0.1, 'lambda': 100}

In [8]:
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_test, label=y_test)

def objective(space):
    params = {}    
    params['max_depth'] = int(space['max_depth'])
    params['subsample'] = space['subsample']
    params['colsample_bytree'] = space['colsample_bytree'] 
    params['gamma'] = space['gamma']
    params['lambda'] = space['lambda']
    params['alpha'] = space['alpha']
    params['min_child_weight'] = space['min_child_weight']
    params['max_delta_step'] = space['max_delta_step']
    params['objective'] = 'binary:logistic'
    params['eval_metric'] = 'logloss'
    params['eta'] = 0.1
    params['nthread'] = 16
    params['base_score'] = 0.2
    print params
#     params['scale_pos_weight'] = 0.2
    
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    bst = xgb.train(params, d_train, 2000, watchlist, early_stopping_rounds=30, verbose_eval=500)
    logloss = log_loss(y_test, bst.predict(d_valid))

    return{'loss':logloss, 'status': STATUS_OK }


space ={
        'max_depth': hp.quniform("max_depth", 4, 15, 1),
        'min_child_weight': hp.quniform ('min_child_weight', 1, 10, 1),
        'subsample': hp.uniform ('subsample', 0.1, 0.9),
        'colsample_bytree':hp.uniform('colsample_bytree',0.1,0.9),
         'lambda':hp.choice('lambda',[1e-5, 1e-2, 0.05, 0.1, 1, 10, 100]),
         'alpha':hp.choice('alpha',[0, 0.001, 0.005, 0.01, 0.05,0.1,1,10]),
        'gamma':hp.choice('gamma',[0, 0.1,0.2,0.3,0.4,0.5]),
        'max_delta_step':hp.quniform('max_delta_step', 1, 10, 1)
        }
    

trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)

print best


{'colsample_bytree': 0.6215866306140633, 'eval_metric': 'logloss', 'max_delta_step': 9.0, 'nthread': 16, 'min_child_weight': 5.0, 'subsample': 0.2046354209730657, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.001, 'max_depth': 4, 'gamma': 0.4, 'lambda': 10}
[0]	train-logloss:0.457755	valid-logloss:0.458087
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.169888	valid-logloss:0.183592
Stopping. Best iteration:
[839]	train-logloss:0.160773	valid-logloss:0.181412

{'colsample_bytree': 0.4942310527676672, 'eval_metric': 'logloss', 'max_delta_step': 8.0, 'nthread': 16, 'min_child_weight': 8.0, 'subsample': 0.6688748954090619, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.005, 'max_depth': 7, 'gamma': 0, 'lambda': 0.1}
[0]	train-logloss:0.452926	valid-logloss:0.4535
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.119825	valid-logloss:0.175848
Stopping. Best iteration:
[487]	train-logloss:0.121232	valid-logloss:0.175664

{'colsample_bytree': 0.2262418681771136, 'eval_metric': 'logloss', 'max_delta_step': 10.0, 'nthread': 16, 'min_child_weight': 5.0, 'subsample': 0.6400081544667823, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.1, 'max_depth': 14, 'gamma': 0.1, 'lambda': 0.1}
[0]	train-logloss:0.449829	valid-logloss:0.452885
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[126]	train-logloss:0.07523	valid-logloss:0.181878

{'colsample_bytree': 0.6524112525169689, 'eval_metric': 'logloss', 'max_delta_step': 8.0, 'nthread': 16, 'min_child_weight': 8.0, 'subsample': 0.3723787100850201, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0, 'max_depth': 14, 'gamma': 0.4, 'lambda': 1e-05}
[0]	train-logloss:0.448619	valid-logloss:0.450279
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[108]	train-logloss:0.109754	valid-logloss:0.18066

{'colsample_bytree': 0.16650529293613403, 'eval_metric': 'logloss', 'max_delta_step': 8.0, 'nthread': 16, 'min_child_weight': 8.0, 'subsample': 0.6162541363164695, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.001, 'max_depth': 10, 'gamma': 0.4, 'lambda': 1}
[0]	train-logloss:0.483575	valid-logloss:0.483906
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[335]	train-logloss:0.093127	valid-logloss:0.176843

{'colsample_bytree': 0.2641676058323803, 'eval_metric': 'logloss', 'max_delta_step': 5.0, 'nthread': 16, 'min_child_weight': 7.0, 'subsample': 0.8773616833314521, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0, 'max_depth': 12, 'gamma': 0, 'lambda': 10}
[0]	train-logloss:0.453449	valid-logloss:0.454672
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[195]	train-logloss:0.094001	valid-logloss:0.176301

{'colsample_bytree': 0.8722642189983841, 'eval_metric': 'logloss', 'max_delta_step': 2.0, 'nthread': 16, 'min_child_weight': 5.0, 'subsample': 0.7678583799309385, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 9, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.476353	valid-logloss:0.476386
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.106879	valid-logloss:0.173537
Stopping. Best iteration:
[540]	train-logloss:0.102536	valid-logloss:0.173374

{'colsample_bytree': 0.5681441209744663, 'eval_metric': 'logloss', 'max_delta_step': 4.0, 'nthread': 16, 'min_child_weight': 8.0, 'subsample': 0.3793264896708848, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.001, 'max_depth': 11, 'gamma': 0.2, 'lambda': 0.05}
[0]	train-logloss:0.456586	valid-logloss:0.457352
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[172]	train-logloss:0.114973	valid-logloss:0.18001

{'colsample_bytree': 0.6585666437547426, 'eval_metric': 'logloss', 'max_delta_step': 7.0, 'nthread': 16, 'min_child_weight': 6.0, 'subsample': 0.12249883737053519, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.1, 'max_depth': 10, 'gamma': 0.3, 'lambda': 0.05}
[0]	train-logloss:0.45133	valid-logloss:0.452029
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[120]	train-logloss:0.154732	valid-logloss:0.18615

{'colsample_bytree': 0.42964353040548475, 'eval_metric': 'logloss', 'max_delta_step': 6.0, 'nthread': 16, 'min_child_weight': 9.0, 'subsample': 0.4616646995522957, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.005, 'max_depth': 9, 'gamma': 0.2, 'lambda': 0.01}
[0]	train-logloss:0.452371	valid-logloss:0.453306
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[256]	train-logloss:0.121103	valid-logloss:0.176798

{'colsample_bytree': 0.8650546946731305, 'eval_metric': 'logloss', 'max_delta_step': 6.0, 'nthread': 16, 'min_child_weight': 8.0, 'subsample': 0.3618221703220311, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 1, 'max_depth': 5, 'gamma': 0.1, 'lambda': 0.05}
[0]	train-logloss:0.455349	valid-logloss:0.455703
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.154015	valid-logloss:0.179303
Stopping. Best iteration:
[581]	train-logloss:0.149983	valid-logloss:0.17868

{'colsample_bytree': 0.7030797180234829, 'eval_metric': 'logloss', 'max_delta_step': 9.0, 'nthread': 16, 'min_child_weight': 7.0, 'subsample': 0.6995774779301367, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.05, 'max_depth': 4, 'gamma': 0.4, 'lambda': 1e-05}
[0]	train-logloss:0.457347	valid-logloss:0.457678
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.166804	valid-logloss:0.181617
[1000]	train-logloss:0.152616	valid-logloss:0.177618
[1500]	train-logloss:0.140951	valid-logloss:0.175593
Stopping. Best iteration:
[1494]	train-logloss:0.141075	valid-logloss:0.175577

{'colsample_bytree': 0.2353829374379225, 'eval_metric': 'logloss', 'max_delta_step': 2.0, 'nthread': 16, 'min_child_weight': 6.0, 'subsample': 0.26935957632771823, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.01, 'max_depth': 8, 'gamma': 0.4, 'lambda': 0.1}
[0]	train-logloss:0.47747	valid-logloss:0.477631
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[270]	train-logloss:0.140681	valid-logloss:0.180478

{'colsample_bytree': 0.6672717162125326, 'eval_metric': 'logloss', 'max_delta_step': 9.0, 'nthread': 16, 'min_child_weight': 6.0, 'subsample': 0.30740585129681897, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0, 'max_depth': 6, 'gamma': 0.5, 'lambda': 1e-05}
[0]	train-logloss:0.453928	valid-logloss:0.454313
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.141136	valid-logloss:0.178575
Stopping. Best iteration:
[582]	train-logloss:0.13594	valid-logloss:0.178281

{'colsample_bytree': 0.15880447956457236, 'eval_metric': 'logloss', 'max_delta_step': 8.0, 'nthread': 16, 'min_child_weight': 8.0, 'subsample': 0.1829600910862202, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 1, 'max_depth': 10, 'gamma': 0.5, 'lambda': 0.01}
[0]	train-logloss:0.484776	valid-logloss:0.484909
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[217]	train-logloss:0.136896	valid-logloss:0.184117

{'colsample_bytree': 0.4328545889768428, 'eval_metric': 'logloss', 'max_delta_step': 8.0, 'nthread': 16, 'min_child_weight': 6.0, 'subsample': 0.24037830373984043, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 6, 'gamma': 0.2, 'lambda': 100}
[0]	train-logloss:0.459882	valid-logloss:0.460301
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.159745	valid-logloss:0.181588
Stopping. Best iteration:
[775]	train-logloss:0.148607	valid-logloss:0.179812

{'colsample_bytree': 0.45627781942866086, 'eval_metric': 'logloss', 'max_delta_step': 7.0, 'nthread': 16, 'min_child_weight': 8.0, 'subsample': 0.22173515414320227, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.1, 'max_depth': 7, 'gamma': 0.3, 'lambda': 10}
[0]	train-logloss:0.455826	valid-logloss:0.456411
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.133365	valid-logloss:0.179545
Stopping. Best iteration:
[485]	train-logloss:0.134569	valid-logloss:0.179385

{'colsample_bytree': 0.1931001251223502, 'eval_metric': 'logloss', 'max_delta_step': 6.0, 'nthread': 16, 'min_child_weight': 3.0, 'subsample': 0.45187782375223107, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.01, 'max_depth': 9, 'gamma': 0.2, 'lambda': 100}
[0]	train-logloss:0.488414	valid-logloss:0.488188
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.119792	valid-logloss:0.176875
Stopping. Best iteration:
[588]	train-logloss:0.111819	valid-logloss:0.176281

{'colsample_bytree': 0.5287440649793352, 'eval_metric': 'logloss', 'max_delta_step': 6.0, 'nthread': 16, 'min_child_weight': 4.0, 'subsample': 0.6080693746281164, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.001, 'max_depth': 12, 'gamma': 0.3, 'lambda': 1}
[0]	train-logloss:0.449223	valid-logloss:0.450825
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[145]	train-logloss:0.090304	valid-logloss:0.178292

{'colsample_bytree': 0.7266881793590831, 'eval_metric': 'logloss', 'max_delta_step': 3.0, 'nthread': 16, 'min_child_weight': 8.0, 'subsample': 0.39309052689513946, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.01, 'max_depth': 9, 'gamma': 0.5, 'lambda': 0.1}
[0]	train-logloss:0.466118	valid-logloss:0.466507
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[210]	train-logloss:0.128233	valid-logloss:0.178441

{'colsample_bytree': 0.8967588020888194, 'eval_metric': 'logloss', 'max_delta_step': 2.0, 'nthread': 16, 'min_child_weight': 2.0, 'subsample': 0.8555879726482751, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.05, 'max_depth': 4, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.478228	valid-logloss:0.478229
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.170527	valid-logloss:0.182535
[1000]	train-logloss:0.157658	valid-logloss:0.178249
[1500]	train-logloss:0.147655	valid-logloss:0.176107
{'colsample_bytree': 0.885243232976707, 'eval_metric': 'logloss', 'max_delta_step': 1.0, 'nthread': 16, 'min_child_weight': 2.0, 'subsample': 0.8780683008706107, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 13, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.488214	valid-logloss:0.488283
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[343]	train-logloss:0.081298	valid-logloss:0.173949

{'colsample_bytree': 0.809427376934251, 'eval_metric': 'logloss', 'max_delta_step': 1.0, 'nthread': 16, 'min_child_weight': 1.0, 'subsample': 0.7835642044093545, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 13, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.488249	valid-logloss:0.488313
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[361]	train-logloss:0.079592	valid-logloss:0.174906

{'colsample_bytree': 0.8081900342913997, 'eval_metric': 'logloss', 'max_delta_step': 1.0, 'nthread': 16, 'min_child_weight': 1.0, 'subsample': 0.7693709637442891, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 12, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.488346	valid-logloss:0.48836
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[369]	train-logloss:0.087433	valid-logloss:0.174648

{'colsample_bytree': 0.7852290495822306, 'eval_metric': 'logloss', 'max_delta_step': 2.0, 'nthread': 16, 'min_child_weight': 3.0, 'subsample': 0.8071297738930207, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 11, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.476087	valid-logloss:0.47621
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.079612	valid-logloss:0.173976
Stopping. Best iteration:
[474]	train-logloss:0.082591	valid-logloss:0.173839

{'colsample_bytree': 0.7699761681203763, 'eval_metric': 'logloss', 'max_delta_step': 3.0, 'nthread': 16, 'min_child_weight': 4.0, 'subsample': 0.5542116809010466, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 11, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.466799	valid-logloss:0.467161
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[361]	train-logloss:0.107015	valid-logloss:0.176052

{'colsample_bytree': 0.3453868164923646, 'eval_metric': 'logloss', 'max_delta_step': 4.0, 'nthread': 16, 'min_child_weight': 3.0, 'subsample': 0.8067425284677572, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 8, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.46158	valid-logloss:0.462032
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.126788	valid-logloss:0.175164
Stopping. Best iteration:
[795]	train-logloss:0.104325	valid-logloss:0.173645

{'colsample_bytree': 0.29786813041005694, 'eval_metric': 'logloss', 'max_delta_step': 4.0, 'nthread': 16, 'min_child_weight': 4.0, 'subsample': 0.7224425527900319, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 8, 'gamma': 0, 'lambda': 100}
[0]	train-logloss:0.461759	valid-logloss:0.462159
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.128305	valid-logloss:0.175792
Stopping. Best iteration:
[743]	train-logloss:0.109823	valid-logloss:0.174752

{'colsample_bytree': 0.35255406299458536, 'eval_metric': 'logloss', 'max_delta_step': 4.0, 'nthread': 16, 'min_child_weight': 3.0, 'subsample': 0.5684920092729825, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 8, 'gamma': 0.1, 'lambda': 1}
[0]	train-logloss:0.460017	valid-logloss:0.460579
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[468]	train-logloss:0.107501	valid-logloss:0.174734

{'colsample_bytree': 0.36121034843499156, 'eval_metric': 'logloss', 'max_delta_step': 3.0, 'nthread': 16, 'min_child_weight': 5.0, 'subsample': 0.818627908766779, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.005, 'max_depth': 7, 'gamma': 0.1, 'lambda': 0.01}
[0]	train-logloss:0.468212	valid-logloss:0.468661
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[424]	train-logloss:0.125365	valid-logloss:0.175948

{'colsample_bytree': 0.11454188140127014, 'eval_metric': 'logloss', 'max_delta_step': 5.0, 'nthread': 16, 'min_child_weight': 2.0, 'subsample': 0.7317275135482078, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.05, 'max_depth': 6, 'gamma': 0, 'lambda': 10}
[0]	train-logloss:0.491215	valid-logloss:0.491119
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.146742	valid-logloss:0.178359
Stopping. Best iteration:
[745]	train-logloss:0.133101	valid-logloss:0.176483

{'colsample_bytree': 0.35873934232776394, 'eval_metric': 'logloss', 'max_delta_step': 3.0, 'nthread': 16, 'min_child_weight': 5.0, 'subsample': 0.693189987161331, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 1, 'max_depth': 5, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.470408	valid-logloss:0.470681
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.162439	valid-logloss:0.180049
[1000]	train-logloss:0.145488	valid-logloss:0.176277
Stopping. Best iteration:
[1304]	train-logloss:0.136927	valid-logloss:0.175178

{'colsample_bytree': 0.58149140413932, 'eval_metric': 'logloss', 'max_delta_step': 2.0, 'nthread': 16, 'min_child_weight': 4.0, 'subsample': 0.5370865827609025, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 8, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.476855	valid-logloss:0.476901
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.126188	valid-logloss:0.17585
Stopping. Best iteration:
[518]	train-logloss:0.124603	valid-logloss:0.175707

{'colsample_bytree': 0.4913478920459614, 'eval_metric': 'logloss', 'max_delta_step': 5.0, 'nthread': 16, 'min_child_weight': 3.0, 'subsample': 0.6611519161703867, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.005, 'max_depth': 7, 'gamma': 0.3, 'lambda': 1e-05}
[0]	train-logloss:0.452886	valid-logloss:0.453421
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.114178	valid-logloss:0.175691
Stopping. Best iteration:
[514]	train-logloss:0.113077	valid-logloss:0.175634

{'colsample_bytree': 0.3002022906789539, 'eval_metric': 'logloss', 'max_delta_step': 4.0, 'nthread': 16, 'min_child_weight': 5.0, 'subsample': 0.7566280826568816, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 9, 'gamma': 0.5, 'lambda': 0.1}
[0]	train-logloss:0.45933	valid-logloss:0.460163
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[398]	train-logloss:0.09748	valid-logloss:0.173972

{'colsample_bytree': 0.5631979426523986, 'eval_metric': 'logloss', 'max_delta_step': 3.0, 'nthread': 16, 'min_child_weight': 7.0, 'subsample': 0.8936894272191092, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0, 'max_depth': 5, 'gamma': 0, 'lambda': 1}
[0]	train-logloss:0.468534	valid-logloss:0.468784
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.154218	valid-logloss:0.178734
[1000]	train-logloss:0.13376	valid-logloss:0.175318
Stopping. Best iteration:
[1183]	train-logloss:0.127649	valid-logloss:0.174781

{'colsample_bytree': 0.3872379603142309, 'eval_metric': 'logloss', 'max_delta_step': 2.0, 'nthread': 16, 'min_child_weight': 10.0, 'subsample': 0.8473402869211502, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.1, 'max_depth': 10, 'gamma': 0.4, 'lambda': 0.05}
[0]	train-logloss:0.476375	valid-logloss:0.476886
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[287]	train-logloss:0.095166	valid-logloss:0.175476

{'colsample_bytree': 0.125240726887441, 'eval_metric': 'logloss', 'max_delta_step': 5.0, 'nthread': 16, 'min_child_weight': 2.0, 'subsample': 0.6466219377538647, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.001, 'max_depth': 15, 'gamma': 0.1, 'lambda': 10}
[0]	train-logloss:0.481828	valid-logloss:0.483569
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[195]	train-logloss:0.055078	valid-logloss:0.182065

{'colsample_bytree': 0.6179601149157483, 'eval_metric': 'logloss', 'max_delta_step': 4.0, 'nthread': 16, 'min_child_weight': 4.0, 'subsample': 0.492464726698365, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 7, 'gamma': 0.1, 'lambda': 0.01}
[0]	train-logloss:0.459017	valid-logloss:0.459552
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.120765	valid-logloss:0.176093
Stopping. Best iteration:
[481]	train-logloss:0.122537	valid-logloss:0.175998

{'colsample_bytree': 0.3011372038566066, 'eval_metric': 'logloss', 'max_delta_step': 7.0, 'nthread': 16, 'min_child_weight': 5.0, 'subsample': 0.5902261465225475, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0, 'max_depth': 11, 'gamma': 0.2, 'lambda': 100}
[0]	train-logloss:0.457606	valid-logloss:0.458251
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[407]	train-logloss:0.099604	valid-logloss:0.175287

{'colsample_bytree': 0.4815534771783838, 'eval_metric': 'logloss', 'max_delta_step': 1.0, 'nthread': 16, 'min_child_weight': 7.0, 'subsample': 0.8986936374687722, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 1, 'max_depth': 10, 'gamma': 0.4, 'lambda': 0.05}
[0]	train-logloss:0.488294	valid-logloss:0.488282
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[271]	train-logloss:0.096622	valid-logloss:0.175078

{'colsample_bytree': 0.24755694526312677, 'eval_metric': 'logloss', 'max_delta_step': 10.0, 'nthread': 16, 'min_child_weight': 3.0, 'subsample': 0.8128000416673974, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.005, 'max_depth': 8, 'gamma': 0, 'lambda': 1e-05}
[0]	train-logloss:0.454051	valid-logloss:0.454859
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[463]	train-logloss:0.09956	valid-logloss:0.175002

{'colsample_bytree': 0.41748483925514357, 'eval_metric': 'logloss', 'max_delta_step': 4.0, 'nthread': 16, 'min_child_weight': 6.0, 'subsample': 0.6286231325099906, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.1, 'max_depth': 9, 'gamma': 0.3, 'lambda': 0.1}
[0]	train-logloss:0.458699	valid-logloss:0.459592
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[259]	train-logloss:0.111371	valid-logloss:0.176456

{'colsample_bytree': 0.5342407748737998, 'eval_metric': 'logloss', 'max_delta_step': 3.0, 'nthread': 16, 'min_child_weight': 9.0, 'subsample': 0.7307268600851822, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.001, 'max_depth': 6, 'gamma': 0.1, 'lambda': 1}
[0]	train-logloss:0.46769	valid-logloss:0.467919
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.138411	valid-logloss:0.176471
Stopping. Best iteration:
[746]	train-logloss:0.124029	valid-logloss:0.175004

{'colsample_bytree': 0.19458993822937742, 'eval_metric': 'logloss', 'max_delta_step': 5.0, 'nthread': 16, 'min_child_weight': 4.0, 'subsample': 0.5091554094240125, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.05, 'max_depth': 14, 'gamma': 0.5, 'lambda': 10}
[0]	train-logloss:0.482066	valid-logloss:0.483247
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[172]	train-logloss:0.083291	valid-logloss:0.180125

{'colsample_bytree': 0.6886365852423127, 'eval_metric': 'logloss', 'max_delta_step': 2.0, 'nthread': 16, 'min_child_weight': 1.0, 'subsample': 0.6804680742133138, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.01, 'max_depth': 9, 'gamma': 0.2, 'lambda': 0.01}
[0]	train-logloss:0.475678	valid-logloss:0.47591
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[220]	train-logloss:0.107397	valid-logloss:0.177099

{'colsample_bytree': 0.6335685054642126, 'eval_metric': 'logloss', 'max_delta_step': 1.0, 'nthread': 16, 'min_child_weight': 7.0, 'subsample': 0.4125109664512634, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 5, 'gamma': 0.4, 'lambda': 100}
[0]	train-logloss:0.48997	valid-logloss:0.48981
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.163834	valid-logloss:0.180581
[1000]	train-logloss:0.147307	valid-logloss:0.17702
Stopping. Best iteration:
[1106]	train-logloss:0.144376	valid-logloss:0.17673

{'colsample_bytree': 0.272055771843968, 'eval_metric': 'logloss', 'max_delta_step': 7.0, 'nthread': 16, 'min_child_weight': 5.0, 'subsample': 0.3440531015873607, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0, 'max_depth': 12, 'gamma': 0.1, 'lambda': 1e-05}
[0]	train-logloss:0.451824	valid-logloss:0.453606
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[166]	train-logloss:0.101752	valid-logloss:0.181765

{'colsample_bytree': 0.8548416605656896, 'eval_metric': 'logloss', 'max_delta_step': 6.0, 'nthread': 16, 'min_child_weight': 9.0, 'subsample': 0.8323108705212866, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 1, 'max_depth': 10, 'gamma': 0.3, 'lambda': 0.05}
[0]	train-logloss:0.450034	valid-logloss:0.451077
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[245]	train-logloss:0.096381	valid-logloss:0.174968

{'colsample_bytree': 0.7461585425861366, 'eval_metric': 'logloss', 'max_delta_step': 2.0, 'nthread': 16, 'min_child_weight': 6.0, 'subsample': 0.16357518862955528, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.01, 'max_depth': 7, 'gamma': 0.5, 'lambda': 100}
[0]	train-logloss:0.477417	valid-logloss:0.477424
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[424]	train-logloss:0.154338	valid-logloss:0.182012

{'colsample_bytree': 0.45593481360326926, 'eval_metric': 'logloss', 'max_delta_step': 4.0, 'nthread': 16, 'min_child_weight': 2.0, 'subsample': 0.7900904488081063, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 4, 'gamma': 0.2, 'lambda': 0.1}
[0]	train-logloss:0.463917	valid-logloss:0.464227
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.168392	valid-logloss:0.181765
[1000]	train-logloss:0.154509	valid-logloss:0.177592
Stopping. Best iteration:
[1426]	train-logloss:0.145318	valid-logloss:0.175894

{'colsample_bytree': 0.20854333330953745, 'eval_metric': 'logloss', 'max_delta_step': 5.0, 'nthread': 16, 'min_child_weight': 3.0, 'subsample': 0.8680613469000104, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.1, 'max_depth': 13, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.483757	valid-logloss:0.484099
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[333]	train-logloss:0.086233	valid-logloss:0.175335

{'colsample_bytree': 0.16017168242118035, 'eval_metric': 'logloss', 'max_delta_step': 3.0, 'nthread': 16, 'min_child_weight': 6.0, 'subsample': 0.7471647674921095, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.001, 'max_depth': 6, 'gamma': 0.4, 'lambda': 10}
[0]	train-logloss:0.489256	valid-logloss:0.48897
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.145898	valid-logloss:0.177459
[1000]	train-logloss:0.121003	valid-logloss:0.175287
Stopping. Best iteration:
[1013]	train-logloss:0.120507	valid-logloss:0.175206

{'colsample_bytree': 0.32907158928619007, 'eval_metric': 'logloss', 'max_delta_step': 1.0, 'nthread': 16, 'min_child_weight': 4.0, 'subsample': 0.6104725323063749, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.05, 'max_depth': 15, 'gamma': 0.1, 'lambda': 100}
[0]	train-logloss:0.488978	valid-logloss:0.489122
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[344]	train-logloss:0.076086	valid-logloss:0.175998

{'colsample_bytree': 0.41291172790077446, 'eval_metric': 'logloss', 'max_delta_step': 9.0, 'nthread': 16, 'min_child_weight': 1.0, 'subsample': 0.42698697481392084, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 11, 'gamma': 0, 'lambda': 1}
[0]	train-logloss:0.45337	valid-logloss:0.454473
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[243]	train-logloss:0.097876	valid-logloss:0.178026

{'colsample_bytree': 0.5867094456305084, 'eval_metric': 'logloss', 'max_delta_step': 6.0, 'nthread': 16, 'min_child_weight': 2.0, 'subsample': 0.12001748525343353, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.005, 'max_depth': 8, 'gamma': 0.1, 'lambda': 0.01}
[0]	train-logloss:0.4522	valid-logloss:0.452742
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[134]	train-logloss:0.161293	valid-logloss:0.186812

{'colsample_bytree': 0.4002347810637314, 'eval_metric': 'logloss', 'max_delta_step': 10.0, 'nthread': 16, 'min_child_weight': 5.0, 'subsample': 0.7055265328868927, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 9, 'gamma': 0.3, 'lambda': 0.05}
[0]	train-logloss:0.453729	valid-logloss:0.454522
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[407]	train-logloss:0.093631	valid-logloss:0.175224

{'colsample_bytree': 0.839445229143727, 'eval_metric': 'logloss', 'max_delta_step': 2.0, 'nthread': 16, 'min_child_weight': 3.0, 'subsample': 0.28157601426944423, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0.01, 'max_depth': 12, 'gamma': 0.5, 'lambda': 100}
[0]	train-logloss:0.476458	valid-logloss:0.476524
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[339]	train-logloss:0.112297	valid-logloss:0.178553

{'colsample_bytree': 0.5222127042770424, 'eval_metric': 'logloss', 'max_delta_step': 7.0, 'nthread': 16, 'min_child_weight': 4.0, 'subsample': 0.4746548994750394, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 10, 'max_depth': 10, 'gamma': 0.1, 'lambda': 1e-05}
[0]	train-logloss:0.451785	valid-logloss:0.452561
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
Stopping. Best iteration:
[309]	train-logloss:0.094752	valid-logloss:0.176082

{'colsample_bytree': 0.4464093794343689, 'eval_metric': 'logloss', 'max_delta_step': 8.0, 'nthread': 16, 'min_child_weight': 10.0, 'subsample': 0.5349401848000775, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 1, 'max_depth': 7, 'gamma': 0.2, 'lambda': 100}
[0]	train-logloss:0.457837	valid-logloss:0.458271
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
[500]	train-logloss:0.138412	valid-logloss:0.176231
Stopping. Best iteration:
[610]	train-logloss:0.131238	valid-logloss:0.175559

{'colsample_bytree': 0.7056575192744945, 'eval_metric': 'logloss', 'max_delta_step': 3.0, 'nthread': 16, 'min_child_weight': 7.0, 'subsample': 0.5797789060181122, 'eta': 0.1, 'base_score': 0.2, 'objective': 'binary:logistic', 'alpha': 0, 'max_depth': 11, 'gamma': 0.1, 'lambda': 0.1}
[0]	train-logloss:0.46501	valid-logloss:0.465849
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 30 rounds.
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-8-b88d419a1f67> in <module>()
     47             algo=tpe.suggest,
     48             max_evals=100,
---> 49             trials=trials)
     50 
     51 print best

/usr/local/lib/python2.7/dist-packages/hyperopt/fmin.pyc in fmin(fn, space, algo, max_evals, trials, rstate, allow_trials_fmin, pass_expr_memo_ctrl, catch_eval_exceptions, verbose, return_argmin)
    305             verbose=verbose,
    306             catch_eval_exceptions=catch_eval_exceptions,
--> 307             return_argmin=return_argmin,
    308         )
    309 

/usr/local/lib/python2.7/dist-packages/hyperopt/base.pyc in fmin(self, fn, space, algo, max_evals, rstate, verbose, pass_expr_memo_ctrl, catch_eval_exceptions, return_argmin)
    633             pass_expr_memo_ctrl=pass_expr_memo_ctrl,
    634             catch_eval_exceptions=catch_eval_exceptions,
--> 635             return_argmin=return_argmin)
    636 
    637 

/usr/local/lib/python2.7/dist-packages/hyperopt/fmin.pyc in fmin(fn, space, algo, max_evals, trials, rstate, allow_trials_fmin, pass_expr_memo_ctrl, catch_eval_exceptions, verbose, return_argmin)
    318                     verbose=verbose)
    319     rval.catch_eval_exceptions = catch_eval_exceptions
--> 320     rval.exhaust()
    321     if return_argmin:
    322         return trials.argmin

/usr/local/lib/python2.7/dist-packages/hyperopt/fmin.pyc in exhaust(self)
    197     def exhaust(self):
    198         n_done = len(self.trials)
--> 199         self.run(self.max_evals - n_done, block_until_done=self.async)
    200         self.trials.refresh()
    201         return self

/usr/local/lib/python2.7/dist-packages/hyperopt/fmin.pyc in run(self, N, block_until_done)
    171             else:
    172                 # -- loop over trials and do the jobs directly
--> 173                 self.serial_evaluate()
    174 
    175             if stopped:

/usr/local/lib/python2.7/dist-packages/hyperopt/fmin.pyc in serial_evaluate(self, N)
     90                 ctrl = base.Ctrl(self.trials, current_trial=trial)
     91                 try:
---> 92                     result = self.domain.evaluate(spec, ctrl)
     93                 except Exception as e:
     94                     logger.info('job exception: %s' % str(e))

/usr/local/lib/python2.7/dist-packages/hyperopt/base.pyc in evaluate(self, config, ctrl, attach_attachments)
    838                 memo=memo,
    839                 print_node_on_error=self.rec_eval_print_node_on_error)
--> 840             rval = self.fn(pyll_rval)
    841 
    842         if isinstance(rval, (float, int, np.number)):

<ipython-input-8-b88d419a1f67> in objective(space)
     24 
     25     watchlist = [(d_train, 'train'), (d_valid, 'valid')]
---> 26     bst = xgb.train(params, d_train, 2000, watchlist, early_stopping_rounds=30, verbose_eval=500)
     27     logloss = log_loss(y_test, bst.predict(d_valid))
     28 

/usr/local/lib/python2.7/dist-packages/xgboost/training.pyc in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, learning_rates, xgb_model, callbacks)
    203                            evals=evals,
    204                            obj=obj, feval=feval,
--> 205                            xgb_model=xgb_model, callbacks=callbacks)
    206 
    207 

/usr/local/lib/python2.7/dist-packages/xgboost/training.pyc in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)
     74         # Skip the first update if it is a recovery step.
     75         if version % 2 == 0:
---> 76             bst.update(dtrain, i, obj)
     77             bst.save_rabit_checkpoint()
     78             version += 1

/usr/local/lib/python2.7/dist-packages/xgboost/core.pyc in update(self, dtrain, iteration, fobj)
    804 
    805         if fobj is None:
--> 806             _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, iteration, dtrain.handle))
    807         else:
    808             pred = self.predict(dtrain)

KeyboardInterrupt: