In [1]:
import os
import pandas as pd
from collections import Counter
import tqdm
import re
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import functools
import sys
from __future__ import division # for python2 compatability


/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
data_dir = "/home/sabr/PycharmProjects/kaggle_competitions/sberbank/taskA/data"
train_csv = os.path.join(data_dir, "train_task1_latest.csv")
test_csv = os.path.join(data_dir, "sdsj_A_test.csv")

df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)
df_train.head()


Out[2]:
paragraph_id question_id paragraph question target
0 1094 46273 В отличие от рыб, земноводные (амфибии) и прес... С какого года Русское Царство перешло на летои... 0.0
1 7414 19164 В 1049 году Балдуину V удалось отнять у Герман... Кто упомянул о его первых разногласиях со Штей... 0.0
2 6744 39767 Стремление достичь предельных значений ёмкости... Как называется имеющая мировое значение эпоха ... 0.0
3 7300 36318 Первый практически пригодный двухтактный газов... Что усугублялось из-за международного давления... 0.0
4 7077 41534 Требуя от художника углубленного изучения изоб... Какой характер носят пророчества Леонардо да В... 0.0

In [3]:
def uniq_words(text):
    return set(re.findall("\w+", text, re.UNICODE))

def calculate_idfs(data):
    counter_paragraph = Counter()
    uniq_paragraphs = data['paragraph'].unique()
    for paragraph in tqdm.tqdm(uniq_paragraphs, desc="calc idf"):
        set_words = uniq_words(paragraph)
        counter_paragraph.update(set_words)
        
    num_docs = uniq_paragraphs.shape[0]
    idfs = {}
    for word in counter_paragraph:
        idfs[word] = np.log(num_docs / counter_paragraph[word])
    return idfs

In [4]:
idfs = calculate_idfs(df_train)


calc idf: 100%|██████████| 9078/9078 [00:00<00:00, 16538.93it/s]

In [5]:
for name, df in [('train', df_train), ('test', df_test)]:
    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0], desc="build features for " + name):
        question = uniq_words(row.question)
        paragraph = uniq_words(row.paragraph)
        df.loc[index, 'len_paragraph'] = len(paragraph)
        df.loc[index, 'len_question'] = len(question)
        df.loc[index, 'len_intersection'] = len(paragraph & question)
#         df.loc[index, 'idf_question'] = np.sum([idfs.get(word, 0.0) for word in question])
#         df.loc[index, 'idf_paragraph'] = np.sum([idfs.get(word, 0.0) for word in paragraph])
#         df.loc[index, 'idf_intersection'] = np.sum([idfs.get(word, 0.0) for word in paragraph & question])


build features for train: 100%|██████████| 119398/119398 [07:09<00:00, 277.97it/s]
build features for test: 100%|██████████| 74286/74286 [03:35<00:00, 344.94it/s]

In [7]:
from __future__ import division  # for python2 compatability

import sys
import os
import tqdm
import re
import numpy as np
import pandas as pd
import xgboost as xgb
from functools import lru_cache
from collections import Counter
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

In [10]:
columns = ['len_paragraph', 'len_question', 'len_intersection', 'idf_question', 'idf_paragraph', 'idf_intersection']

cv = KFold(n_splits=3, random_state=1, shuffle=True)
X = df_train[columns]
y = df_train["target"]
scores = []

for t, v in cv.split(X):
    X_train, y_train = X.values[t], y.values[t]
    X_val, y_val = X.values[v], y.values[v]
    
    model = GradientBoostingClassifier().fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    score = roc_auc_score(y_val, y_pred)
    print(score)
    scores.append(score)
print("Mean GBclassifier score is %.4f" % np.mean(scores))


0.995944895203
0.995920754405
0.995899907422
Mean GBclassifier score is 0.9959

In [ ]:


In [30]:
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold

xgb_params = {
#     'eta': 0.05,
#     'gamma': 0.1,
#     'lambda': 1,
#     'alpha': 0,
    'max_depth': 5,
#     'min_child_weight': 1,
#     'max_delta_step': 0,
#     'subsample': 0.86,
#     'colsample_bytree': 0.6,
#     'colsample_bylevel': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
#     'nthread': 12,
#     'seed': 42,
#     'silent': 1
}

cv = KFold(n_splits=5, random_state=42, shuffle=True)
X = df_train[columns]
y = df_train["target"]
groups = 

cv_scores = []
n_estimators = 100
models = {"xgb": {"score": 0}}

for t, v in cv.split(X):
    X_train, y_train = X.values[t], y.values[t]
    X_val, y_val = X.values[v], y.values[v]

    xgb_train = xgb.DMatrix(X_train, label=y_train, feature_names=list(X.columns), missing=np.nan)
    xgb_val = xgb.DMatrix(X_val, label=y_val, feature_names=list(X.columns), missing=np.nan)
    watchlist = [(xgb_train, 'train'), (xgb_val, 'validation')]

    model = xgb.train(xgb_params, xgb_train, evals=watchlist, num_boost_round=n_estimators, verbose_eval=1, early_stopping_rounds=10)
    y_pred = model.predict(xgb_val)
    roc_auc = roc_auc_score(y_val, y_pred)
    cv_scores.append(roc_auc)
    # best cv
    if models["xgb"]["score"] < roc_auc:
        models["xgb"].update({"model": model, "score": roc_auc})

print("Mean CV score is %s" % np.mean(cv_scores))


[0]	train-auc:0.992544	validation-auc:0.992123
Multiple eval metrics have been passed: 'validation-auc' will be used for early stopping.

Will train until validation-auc hasn't improved in 10 rounds.
[1]	train-auc:0.994213	validation-auc:0.99408
[2]	train-auc:0.99482	validation-auc:0.994561
[3]	train-auc:0.995315	validation-auc:0.994966
[4]	train-auc:0.995526	validation-auc:0.995211
[5]	train-auc:0.995604	validation-auc:0.995227
[6]	train-auc:0.995893	validation-auc:0.995383
[7]	train-auc:0.996057	validation-auc:0.995584
[8]	train-auc:0.996107	validation-auc:0.99563
[9]	train-auc:0.996172	validation-auc:0.995691
[10]	train-auc:0.996215	validation-auc:0.995718
[11]	train-auc:0.996258	validation-auc:0.995798
[12]	train-auc:0.996301	validation-auc:0.995817
[13]	train-auc:0.996416	validation-auc:0.995901
[14]	train-auc:0.996439	validation-auc:0.995907
[15]	train-auc:0.996507	validation-auc:0.995915
[16]	train-auc:0.996563	validation-auc:0.995929
[17]	train-auc:0.996576	validation-auc:0.995927
[18]	train-auc:0.996599	validation-auc:0.995925
[19]	train-auc:0.996616	validation-auc:0.995924
[20]	train-auc:0.996664	validation-auc:0.995936
[21]	train-auc:0.996674	validation-auc:0.995975
[22]	train-auc:0.996702	validation-auc:0.99598
[23]	train-auc:0.996728	validation-auc:0.995973
[24]	train-auc:0.996736	validation-auc:0.99598
[25]	train-auc:0.99675	validation-auc:0.996004
[26]	train-auc:0.996796	validation-auc:0.996019
[27]	train-auc:0.996833	validation-auc:0.996019
[28]	train-auc:0.996853	validation-auc:0.996025
[29]	train-auc:0.996861	validation-auc:0.996032
[30]	train-auc:0.996918	validation-auc:0.996071
[31]	train-auc:0.996936	validation-auc:0.996068
[32]	train-auc:0.996983	validation-auc:0.996092
[33]	train-auc:0.996989	validation-auc:0.996101
[34]	train-auc:0.997026	validation-auc:0.996114
[35]	train-auc:0.997038	validation-auc:0.996115
[36]	train-auc:0.997069	validation-auc:0.996134
[37]	train-auc:0.99707	validation-auc:0.996148
[38]	train-auc:0.997078	validation-auc:0.996152
[39]	train-auc:0.997092	validation-auc:0.996155
[40]	train-auc:0.997142	validation-auc:0.996175
[41]	train-auc:0.997173	validation-auc:0.996186
[42]	train-auc:0.997185	validation-auc:0.996189
[43]	train-auc:0.997225	validation-auc:0.996226
[44]	train-auc:0.997227	validation-auc:0.996228
[45]	train-auc:0.997248	validation-auc:0.996233
[46]	train-auc:0.997302	validation-auc:0.996275
[47]	train-auc:0.997316	validation-auc:0.996283
[48]	train-auc:0.99736	validation-auc:0.996316
[49]	train-auc:0.997364	validation-auc:0.99631
[50]	train-auc:0.997372	validation-auc:0.996311
[51]	train-auc:0.997391	validation-auc:0.996314
[52]	train-auc:0.997392	validation-auc:0.996314
[53]	train-auc:0.99744	validation-auc:0.996358
[54]	train-auc:0.99747	validation-auc:0.996376
[55]	train-auc:0.997474	validation-auc:0.996379
[56]	train-auc:0.997479	validation-auc:0.996382
[57]	train-auc:0.997501	validation-auc:0.996396
[58]	train-auc:0.997511	validation-auc:0.996394
[59]	train-auc:0.997533	validation-auc:0.99641
[60]	train-auc:0.997552	validation-auc:0.996419
[61]	train-auc:0.997599	validation-auc:0.996428
[62]	train-auc:0.997608	validation-auc:0.996426
[63]	train-auc:0.997629	validation-auc:0.996426
[64]	train-auc:0.99765	validation-auc:0.996436
[65]	train-auc:0.997652	validation-auc:0.996438
[66]	train-auc:0.997659	validation-auc:0.996441
[67]	train-auc:0.997664	validation-auc:0.996441
[68]	train-auc:0.997666	validation-auc:0.996444
[69]	train-auc:0.997693	validation-auc:0.996466
[70]	train-auc:0.997709	validation-auc:0.996478
[71]	train-auc:0.997715	validation-auc:0.996478
[72]	train-auc:0.997736	validation-auc:0.996495
[73]	train-auc:0.997778	validation-auc:0.99653
[74]	train-auc:0.997794	validation-auc:0.996539
[75]	train-auc:0.997816	validation-auc:0.996548
[76]	train-auc:0.997835	validation-auc:0.996557
[77]	train-auc:0.997841	validation-auc:0.996549
[78]	train-auc:0.99785	validation-auc:0.996548
[79]	train-auc:0.99787	validation-auc:0.996551
[80]	train-auc:0.997877	validation-auc:0.99655
[81]	train-auc:0.997891	validation-auc:0.996559
[82]	train-auc:0.997913	validation-auc:0.996576
[83]	train-auc:0.99793	validation-auc:0.99658
[84]	train-auc:0.997931	validation-auc:0.996579
[85]	train-auc:0.99794	validation-auc:0.996581
[86]	train-auc:0.997959	validation-auc:0.996593
[87]	train-auc:0.997972	validation-auc:0.996605
[88]	train-auc:0.997986	validation-auc:0.99661
[89]	train-auc:0.998004	validation-auc:0.996616
[90]	train-auc:0.998017	validation-auc:0.996632
[91]	train-auc:0.998026	validation-auc:0.996641
[92]	train-auc:0.998028	validation-auc:0.996642
[93]	train-auc:0.998034	validation-auc:0.996646
[94]	train-auc:0.998059	validation-auc:0.996659
[95]	train-auc:0.998069	validation-auc:0.996651
[96]	train-auc:0.998077	validation-auc:0.99665
[97]	train-auc:0.998089	validation-auc:0.99665
[98]	train-auc:0.998102	validation-auc:0.996653
[99]	train-auc:0.998104	validation-auc:0.996653
[0]	train-auc:0.992097	validation-auc:0.99246
Multiple eval metrics have been passed: 'validation-auc' will be used for early stopping.

Will train until validation-auc hasn't improved in 10 rounds.
[1]	train-auc:0.994527	validation-auc:0.994443
[2]	train-auc:0.995049	validation-auc:0.995224
[3]	train-auc:0.995268	validation-auc:0.995413
[4]	train-auc:0.995563	validation-auc:0.995664
[5]	train-auc:0.995647	validation-auc:0.995732
[6]	train-auc:0.99571	validation-auc:0.995836
[7]	train-auc:0.995998	validation-auc:0.995903
[8]	train-auc:0.996116	validation-auc:0.996036
[9]	train-auc:0.996188	validation-auc:0.996062
[10]	train-auc:0.996245	validation-auc:0.996115
[11]	train-auc:0.996274	validation-auc:0.996129
[12]	train-auc:0.996335	validation-auc:0.996164
[13]	train-auc:0.99638	validation-auc:0.996185
[14]	train-auc:0.996409	validation-auc:0.996195
[15]	train-auc:0.996432	validation-auc:0.996201
[16]	train-auc:0.996463	validation-auc:0.996243
[17]	train-auc:0.996485	validation-auc:0.996256
[18]	train-auc:0.996505	validation-auc:0.996273
[19]	train-auc:0.996545	validation-auc:0.99631
[20]	train-auc:0.996568	validation-auc:0.996312
[21]	train-auc:0.996607	validation-auc:0.996335
[22]	train-auc:0.996642	validation-auc:0.996375
[23]	train-auc:0.996648	validation-auc:0.996378
[24]	train-auc:0.996691	validation-auc:0.996404
[25]	train-auc:0.996712	validation-auc:0.996403
[26]	train-auc:0.996727	validation-auc:0.996403
[27]	train-auc:0.996769	validation-auc:0.996425
[28]	train-auc:0.996788	validation-auc:0.996435
[29]	train-auc:0.996826	validation-auc:0.996449
[30]	train-auc:0.996839	validation-auc:0.996461
[31]	train-auc:0.99686	validation-auc:0.996459
[32]	train-auc:0.996881	validation-auc:0.996462
[33]	train-auc:0.996892	validation-auc:0.99646
[34]	train-auc:0.99694	validation-auc:0.996491
[35]	train-auc:0.996981	validation-auc:0.996513
[36]	train-auc:0.996994	validation-auc:0.996516
[37]	train-auc:0.996999	validation-auc:0.996523
[38]	train-auc:0.997016	validation-auc:0.996527
[39]	train-auc:0.997038	validation-auc:0.996526
[40]	train-auc:0.997042	validation-auc:0.996525
[41]	train-auc:0.997063	validation-auc:0.996534
[42]	train-auc:0.997072	validation-auc:0.996531
[43]	train-auc:0.997099	validation-auc:0.996533
[44]	train-auc:0.997154	validation-auc:0.996575
[45]	train-auc:0.99717	validation-auc:0.996583
[46]	train-auc:0.997191	validation-auc:0.996576
[47]	train-auc:0.997197	validation-auc:0.996572
[48]	train-auc:0.997225	validation-auc:0.996586
[49]	train-auc:0.99726	validation-auc:0.996606
[50]	train-auc:0.997312	validation-auc:0.996636
[51]	train-auc:0.997317	validation-auc:0.996636
[52]	train-auc:0.997332	validation-auc:0.99664
[53]	train-auc:0.997372	validation-auc:0.996669
[54]	train-auc:0.997379	validation-auc:0.996673
[55]	train-auc:0.997392	validation-auc:0.996675
[56]	train-auc:0.997423	validation-auc:0.996692
[57]	train-auc:0.997429	validation-auc:0.996692
[58]	train-auc:0.997437	validation-auc:0.996698
[59]	train-auc:0.99746	validation-auc:0.996715
[60]	train-auc:0.997471	validation-auc:0.996722
[61]	train-auc:0.997499	validation-auc:0.99675
[62]	train-auc:0.997529	validation-auc:0.996766
[63]	train-auc:0.997542	validation-auc:0.996764
[64]	train-auc:0.997556	validation-auc:0.996766
[65]	train-auc:0.997593	validation-auc:0.996799
[66]	train-auc:0.997595	validation-auc:0.996799
[67]	train-auc:0.997607	validation-auc:0.996801
[68]	train-auc:0.99761	validation-auc:0.9968
[69]	train-auc:0.997612	validation-auc:0.996804
[70]	train-auc:0.997637	validation-auc:0.996818
[71]	train-auc:0.997668	validation-auc:0.996826
[72]	train-auc:0.99769	validation-auc:0.99683
[73]	train-auc:0.997693	validation-auc:0.99683
[74]	train-auc:0.997707	validation-auc:0.996844
[75]	train-auc:0.997723	validation-auc:0.996854
[76]	train-auc:0.997728	validation-auc:0.996856
[77]	train-auc:0.997744	validation-auc:0.996857
[78]	train-auc:0.997761	validation-auc:0.99687
[79]	train-auc:0.997791	validation-auc:0.996876
[80]	train-auc:0.997802	validation-auc:0.996884
[81]	train-auc:0.997825	validation-auc:0.996894
[82]	train-auc:0.997832	validation-auc:0.9969
[83]	train-auc:0.997862	validation-auc:0.996906
[84]	train-auc:0.997879	validation-auc:0.996893
[85]	train-auc:0.997882	validation-auc:0.996889
[86]	train-auc:0.997886	validation-auc:0.996887
[87]	train-auc:0.997895	validation-auc:0.996882
[88]	train-auc:0.997897	validation-auc:0.996877
[89]	train-auc:0.997928	validation-auc:0.99691
[90]	train-auc:0.997931	validation-auc:0.996909
[91]	train-auc:0.99795	validation-auc:0.996933
[92]	train-auc:0.997959	validation-auc:0.996942
[93]	train-auc:0.99797	validation-auc:0.996942
[94]	train-auc:0.997991	validation-auc:0.996941
[95]	train-auc:0.998013	validation-auc:0.99694
[96]	train-auc:0.998023	validation-auc:0.996955
[97]	train-auc:0.998037	validation-auc:0.99695
[98]	train-auc:0.998048	validation-auc:0.996941
[99]	train-auc:0.998065	validation-auc:0.996944
[0]	train-auc:0.992317	validation-auc:0.991697
Multiple eval metrics have been passed: 'validation-auc' will be used for early stopping.

Will train until validation-auc hasn't improved in 10 rounds.
[1]	train-auc:0.994323	validation-auc:0.993738
[2]	train-auc:0.994805	validation-auc:0.994305
[3]	train-auc:0.9952	validation-auc:0.994809
[4]	train-auc:0.995412	validation-auc:0.994965
[5]	train-auc:0.995533	validation-auc:0.99509
[6]	train-auc:0.995852	validation-auc:0.995333
[7]	train-auc:0.996034	validation-auc:0.995529
[8]	train-auc:0.99612	validation-auc:0.995573
[9]	train-auc:0.996191	validation-auc:0.995704
[10]	train-auc:0.996246	validation-auc:0.995715
[11]	train-auc:0.996274	validation-auc:0.995743
[12]	train-auc:0.996321	validation-auc:0.99581
[13]	train-auc:0.996368	validation-auc:0.995833
[14]	train-auc:0.996416	validation-auc:0.995916
[15]	train-auc:0.996453	validation-auc:0.995909
[16]	train-auc:0.996516	validation-auc:0.996067
[17]	train-auc:0.996544	validation-auc:0.996164
[18]	train-auc:0.996561	validation-auc:0.99617
[19]	train-auc:0.996591	validation-auc:0.996171
[20]	train-auc:0.996607	validation-auc:0.996161
[21]	train-auc:0.996628	validation-auc:0.99619
[22]	train-auc:0.996647	validation-auc:0.996198
[23]	train-auc:0.996673	validation-auc:0.996224
[24]	train-auc:0.996677	validation-auc:0.996228
[25]	train-auc:0.996709	validation-auc:0.996257
[26]	train-auc:0.996723	validation-auc:0.996261
[27]	train-auc:0.996732	validation-auc:0.99626
[28]	train-auc:0.996741	validation-auc:0.996264
[29]	train-auc:0.996746	validation-auc:0.996266
[30]	train-auc:0.996768	validation-auc:0.996273
[31]	train-auc:0.996783	validation-auc:0.996272
[32]	train-auc:0.996856	validation-auc:0.996298
[33]	train-auc:0.996912	validation-auc:0.996342
[34]	train-auc:0.996941	validation-auc:0.996348
[35]	train-auc:0.996983	validation-auc:0.996376
[36]	train-auc:0.996993	validation-auc:0.996384
[37]	train-auc:0.997043	validation-auc:0.996423
[38]	train-auc:0.997057	validation-auc:0.996419
[39]	train-auc:0.997073	validation-auc:0.996422
[40]	train-auc:0.997089	validation-auc:0.996422
[41]	train-auc:0.997095	validation-auc:0.996424
[42]	train-auc:0.997113	validation-auc:0.996419
[43]	train-auc:0.997121	validation-auc:0.996423
[44]	train-auc:0.997139	validation-auc:0.99644
[45]	train-auc:0.997186	validation-auc:0.996459
[46]	train-auc:0.997201	validation-auc:0.996458
[47]	train-auc:0.997237	validation-auc:0.996461
[48]	train-auc:0.997277	validation-auc:0.996494
[49]	train-auc:0.997295	validation-auc:0.996511
[50]	train-auc:0.997299	validation-auc:0.996511
[51]	train-auc:0.997302	validation-auc:0.996512
[52]	train-auc:0.997341	validation-auc:0.996537
[53]	train-auc:0.997348	validation-auc:0.996541
[54]	train-auc:0.997359	validation-auc:0.996542
[55]	train-auc:0.997363	validation-auc:0.996542
[56]	train-auc:0.997393	validation-auc:0.996542
[57]	train-auc:0.997428	validation-auc:0.996557
[58]	train-auc:0.997462	validation-auc:0.996574
[59]	train-auc:0.997485	validation-auc:0.996579
[60]	train-auc:0.997512	validation-auc:0.996587
[61]	train-auc:0.997519	validation-auc:0.996588
[62]	train-auc:0.997548	validation-auc:0.996595
[63]	train-auc:0.997552	validation-auc:0.996593
[64]	train-auc:0.997579	validation-auc:0.996603
[65]	train-auc:0.997607	validation-auc:0.99661
[66]	train-auc:0.997636	validation-auc:0.996617
[67]	train-auc:0.99764	validation-auc:0.99662
[68]	train-auc:0.997658	validation-auc:0.99663
[69]	train-auc:0.997664	validation-auc:0.996634
[70]	train-auc:0.997666	validation-auc:0.996635
[71]	train-auc:0.997689	validation-auc:0.996652
[72]	train-auc:0.997737	validation-auc:0.996683
[73]	train-auc:0.99776	validation-auc:0.996697
[74]	train-auc:0.997766	validation-auc:0.996695
[75]	train-auc:0.997771	validation-auc:0.996701
[76]	train-auc:0.997788	validation-auc:0.9967
[77]	train-auc:0.997799	validation-auc:0.996693
[78]	train-auc:0.997823	validation-auc:0.996705
[79]	train-auc:0.997828	validation-auc:0.996701
[80]	train-auc:0.997831	validation-auc:0.996702
[81]	train-auc:0.99784	validation-auc:0.9967
[82]	train-auc:0.997857	validation-auc:0.996716
[83]	train-auc:0.997868	validation-auc:0.996718
[84]	train-auc:0.997871	validation-auc:0.996719
[85]	train-auc:0.997872	validation-auc:0.996719
[86]	train-auc:0.997886	validation-auc:0.99671
[87]	train-auc:0.997888	validation-auc:0.996714
[88]	train-auc:0.997907	validation-auc:0.996725
[89]	train-auc:0.997922	validation-auc:0.996733
[90]	train-auc:0.997939	validation-auc:0.99673
[91]	train-auc:0.997955	validation-auc:0.996734
[92]	train-auc:0.997958	validation-auc:0.99673
[93]	train-auc:0.997969	validation-auc:0.996728
[94]	train-auc:0.997971	validation-auc:0.996726
[95]	train-auc:0.997983	validation-auc:0.996741
[96]	train-auc:0.997984	validation-auc:0.996742
[97]	train-auc:0.997987	validation-auc:0.99674
[98]	train-auc:0.997991	validation-auc:0.996736
[99]	train-auc:0.997999	validation-auc:0.996741
[0]	train-auc:0.992729	validation-auc:0.992107
Multiple eval metrics have been passed: 'validation-auc' will be used for early stopping.

Will train until validation-auc hasn't improved in 10 rounds.
[1]	train-auc:0.993852	validation-auc:0.993212
[2]	train-auc:0.994364	validation-auc:0.99401
[3]	train-auc:0.995068	validation-auc:0.994811
[4]	train-auc:0.99522	validation-auc:0.994957
[5]	train-auc:0.995606	validation-auc:0.995319
[6]	train-auc:0.995696	validation-auc:0.995386
[7]	train-auc:0.995984	validation-auc:0.995666
[8]	train-auc:0.996101	validation-auc:0.99583
[9]	train-auc:0.996162	validation-auc:0.995925
[10]	train-auc:0.99622	validation-auc:0.996015
[11]	train-auc:0.996287	validation-auc:0.996044
[12]	train-auc:0.996329	validation-auc:0.996053
[13]	train-auc:0.996354	validation-auc:0.996067
[14]	train-auc:0.996398	validation-auc:0.996113
[15]	train-auc:0.996438	validation-auc:0.996117
[16]	train-auc:0.996474	validation-auc:0.996119
[17]	train-auc:0.996508	validation-auc:0.996154
[18]	train-auc:0.996519	validation-auc:0.996187
[19]	train-auc:0.996553	validation-auc:0.996199
[20]	train-auc:0.996589	validation-auc:0.996219
[21]	train-auc:0.996612	validation-auc:0.996228
[22]	train-auc:0.996627	validation-auc:0.996228
[23]	train-auc:0.996648	validation-auc:0.996247
[24]	train-auc:0.996654	validation-auc:0.996246
[25]	train-auc:0.996676	validation-auc:0.996251
[26]	train-auc:0.996686	validation-auc:0.996255
[27]	train-auc:0.996715	validation-auc:0.996279
[28]	train-auc:0.996723	validation-auc:0.996291
[29]	train-auc:0.996759	validation-auc:0.996313
[30]	train-auc:0.996782	validation-auc:0.996328
[31]	train-auc:0.996818	validation-auc:0.996352
[32]	train-auc:0.996864	validation-auc:0.99638
[33]	train-auc:0.996875	validation-auc:0.996393
[34]	train-auc:0.996888	validation-auc:0.996407
[35]	train-auc:0.996893	validation-auc:0.996409
[36]	train-auc:0.996929	validation-auc:0.996419
[37]	train-auc:0.996943	validation-auc:0.996415
[38]	train-auc:0.996951	validation-auc:0.996419
[39]	train-auc:0.996974	validation-auc:0.996424
[40]	train-auc:0.997009	validation-auc:0.996459
[41]	train-auc:0.997018	validation-auc:0.996467
[42]	train-auc:0.997078	validation-auc:0.9965
[43]	train-auc:0.997086	validation-auc:0.996502
[44]	train-auc:0.997091	validation-auc:0.996502
[45]	train-auc:0.997133	validation-auc:0.996551
[46]	train-auc:0.997143	validation-auc:0.996561
[47]	train-auc:0.997148	validation-auc:0.996565
[48]	train-auc:0.997152	validation-auc:0.996563
[49]	train-auc:0.997195	validation-auc:0.996604
[50]	train-auc:0.997223	validation-auc:0.996619
[51]	train-auc:0.997259	validation-auc:0.996657
[52]	train-auc:0.997292	validation-auc:0.996667
[53]	train-auc:0.997294	validation-auc:0.996671
[54]	train-auc:0.997299	validation-auc:0.996678
[55]	train-auc:0.997306	validation-auc:0.996678
[56]	train-auc:0.997309	validation-auc:0.996682
[57]	train-auc:0.997312	validation-auc:0.996681
[58]	train-auc:0.997333	validation-auc:0.996676
[59]	train-auc:0.997346	validation-auc:0.996673
[60]	train-auc:0.997382	validation-auc:0.996695
[61]	train-auc:0.997386	validation-auc:0.996696
[62]	train-auc:0.997392	validation-auc:0.996688
[63]	train-auc:0.997411	validation-auc:0.996701
[64]	train-auc:0.997425	validation-auc:0.996702
[65]	train-auc:0.99743	validation-auc:0.996699
[66]	train-auc:0.997434	validation-auc:0.996703
[67]	train-auc:0.997444	validation-auc:0.996701
[68]	train-auc:0.997473	validation-auc:0.996706
[69]	train-auc:0.997497	validation-auc:0.996723
[70]	train-auc:0.997526	validation-auc:0.996731
[71]	train-auc:0.997557	validation-auc:0.996743
[72]	train-auc:0.997559	validation-auc:0.996741
[73]	train-auc:0.997573	validation-auc:0.996745
[74]	train-auc:0.997589	validation-auc:0.996753
[75]	train-auc:0.997616	validation-auc:0.996773
[76]	train-auc:0.997633	validation-auc:0.996783
[77]	train-auc:0.997656	validation-auc:0.996787
[78]	train-auc:0.997675	validation-auc:0.996803
[79]	train-auc:0.997688	validation-auc:0.996793
[80]	train-auc:0.997712	validation-auc:0.996792
[81]	train-auc:0.997715	validation-auc:0.996795
[82]	train-auc:0.997727	validation-auc:0.996799
[83]	train-auc:0.99776	validation-auc:0.996839
[84]	train-auc:0.997771	validation-auc:0.996835
[85]	train-auc:0.997772	validation-auc:0.996835
[86]	train-auc:0.997775	validation-auc:0.996833
[87]	train-auc:0.997797	validation-auc:0.99684
[88]	train-auc:0.997818	validation-auc:0.996847
[89]	train-auc:0.997832	validation-auc:0.996854
[90]	train-auc:0.997856	validation-auc:0.996849
[91]	train-auc:0.997873	validation-auc:0.996854
[92]	train-auc:0.997884	validation-auc:0.996861
[93]	train-auc:0.997917	validation-auc:0.996873
[94]	train-auc:0.997922	validation-auc:0.996876
[95]	train-auc:0.997928	validation-auc:0.996876
[96]	train-auc:0.99793	validation-auc:0.996875
[97]	train-auc:0.997947	validation-auc:0.99689
[98]	train-auc:0.997972	validation-auc:0.996905
[99]	train-auc:0.997995	validation-auc:0.996929
[0]	train-auc:0.992678	validation-auc:0.991312
Multiple eval metrics have been passed: 'validation-auc' will be used for early stopping.

Will train until validation-auc hasn't improved in 10 rounds.
[1]	train-auc:0.994631	validation-auc:0.993731
[2]	train-auc:0.994728	validation-auc:0.993867
[3]	train-auc:0.995246	validation-auc:0.994413
[4]	train-auc:0.995427	validation-auc:0.994604
[5]	train-auc:0.995817	validation-auc:0.995146
[6]	train-auc:0.995985	validation-auc:0.995363
[7]	train-auc:0.996116	validation-auc:0.995476
[8]	train-auc:0.996173	validation-auc:0.995555
[9]	train-auc:0.996221	validation-auc:0.995593
[10]	train-auc:0.996276	validation-auc:0.995716
[11]	train-auc:0.99631	validation-auc:0.995736
[12]	train-auc:0.996388	validation-auc:0.995784
[13]	train-auc:0.996441	validation-auc:0.995834
[14]	train-auc:0.99648	validation-auc:0.995843
[15]	train-auc:0.99654	validation-auc:0.995868
[16]	train-auc:0.996552	validation-auc:0.995868
[17]	train-auc:0.996587	validation-auc:0.995869
[18]	train-auc:0.996606	validation-auc:0.995882
[19]	train-auc:0.996638	validation-auc:0.99587
[20]	train-auc:0.996661	validation-auc:0.995914
[21]	train-auc:0.99669	validation-auc:0.995918
[22]	train-auc:0.996699	validation-auc:0.995919
[23]	train-auc:0.996705	validation-auc:0.995911
[24]	train-auc:0.996733	validation-auc:0.995954
[25]	train-auc:0.996754	validation-auc:0.995951
[26]	train-auc:0.996763	validation-auc:0.995962
[27]	train-auc:0.996799	validation-auc:0.995952
[28]	train-auc:0.996826	validation-auc:0.995973
[29]	train-auc:0.99684	validation-auc:0.99599
[30]	train-auc:0.996855	validation-auc:0.995995
[31]	train-auc:0.996863	validation-auc:0.995998
[32]	train-auc:0.996884	validation-auc:0.996
[33]	train-auc:0.996897	validation-auc:0.995993
[34]	train-auc:0.996929	validation-auc:0.995989
[35]	train-auc:0.996942	validation-auc:0.995984
[36]	train-auc:0.997025	validation-auc:0.996057
[37]	train-auc:0.997042	validation-auc:0.996076
[38]	train-auc:0.997066	validation-auc:0.99609
[39]	train-auc:0.997127	validation-auc:0.996149
[40]	train-auc:0.997139	validation-auc:0.996159
[41]	train-auc:0.997143	validation-auc:0.996166
[42]	train-auc:0.997195	validation-auc:0.996178
[43]	train-auc:0.997223	validation-auc:0.996183
[44]	train-auc:0.997227	validation-auc:0.996186
[45]	train-auc:0.997232	validation-auc:0.996194
[46]	train-auc:0.997248	validation-auc:0.996197
[47]	train-auc:0.997259	validation-auc:0.996197
[48]	train-auc:0.997266	validation-auc:0.9962
[49]	train-auc:0.997295	validation-auc:0.996216
[50]	train-auc:0.99732	validation-auc:0.996227
[51]	train-auc:0.997358	validation-auc:0.996277
[52]	train-auc:0.997391	validation-auc:0.996252
[53]	train-auc:0.997419	validation-auc:0.996292
[54]	train-auc:0.997425	validation-auc:0.996294
[55]	train-auc:0.997461	validation-auc:0.996315
[56]	train-auc:0.997477	validation-auc:0.996336
[57]	train-auc:0.9975	validation-auc:0.996353
[58]	train-auc:0.997509	validation-auc:0.996361
[59]	train-auc:0.997513	validation-auc:0.996363
[60]	train-auc:0.997522	validation-auc:0.996372
[61]	train-auc:0.997523	validation-auc:0.996374
[62]	train-auc:0.997535	validation-auc:0.996373
[63]	train-auc:0.997546	validation-auc:0.996374
[64]	train-auc:0.997556	validation-auc:0.996364
[65]	train-auc:0.997558	validation-auc:0.996365
[66]	train-auc:0.997577	validation-auc:0.99638
[67]	train-auc:0.997595	validation-auc:0.996396
[68]	train-auc:0.997605	validation-auc:0.996401
[69]	train-auc:0.997626	validation-auc:0.996404
[70]	train-auc:0.997653	validation-auc:0.996413
[71]	train-auc:0.997669	validation-auc:0.996419
[72]	train-auc:0.997687	validation-auc:0.996412
[73]	train-auc:0.997697	validation-auc:0.996414
[74]	train-auc:0.997722	validation-auc:0.996429
[75]	train-auc:0.997744	validation-auc:0.996434
[76]	train-auc:0.99777	validation-auc:0.996436
[77]	train-auc:0.997792	validation-auc:0.996442
[78]	train-auc:0.99781	validation-auc:0.996454
[79]	train-auc:0.997848	validation-auc:0.996492
[80]	train-auc:0.997862	validation-auc:0.99649
[81]	train-auc:0.99787	validation-auc:0.996492
[82]	train-auc:0.997877	validation-auc:0.996499
[83]	train-auc:0.997882	validation-auc:0.9965
[84]	train-auc:0.997899	validation-auc:0.996497
[85]	train-auc:0.997921	validation-auc:0.996497
[86]	train-auc:0.997927	validation-auc:0.996502
[87]	train-auc:0.997928	validation-auc:0.996506
[88]	train-auc:0.99793	validation-auc:0.996511
[89]	train-auc:0.997953	validation-auc:0.996531
[90]	train-auc:0.997966	validation-auc:0.996539
[91]	train-auc:0.997985	validation-auc:0.996552
[92]	train-auc:0.99801	validation-auc:0.996578
[93]	train-auc:0.998028	validation-auc:0.996581
[94]	train-auc:0.998034	validation-auc:0.996584
[95]	train-auc:0.998046	validation-auc:0.996591
[96]	train-auc:0.998062	validation-auc:0.996591
[97]	train-auc:0.998063	validation-auc:0.996592
[98]	train-auc:0.99807	validation-auc:0.996593
[99]	train-auc:0.998073	validation-auc:0.996592
Mean CV score is 0.996771751784

In [14]:
model = models["xgb"]["model"]
xgb_test = xgb.DMatrix(df_test[columns])
df_test['prediction'] = model.predict(xgb_test)
df_test[['paragraph_id', 'question_id', 'prediction']].to_csv("prediction.csv", index=False)

In [ ]:
Mean CV score is 0.9965
0.8150940555316126

In [ ]: