In [ ]:
## to be moved into files
# tag_max_freq
# tag_counter

In [1]:
import quoradefs as qd
from time import time
import gc
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re

from itertools import chain, combinations
from collections import Counter

from gensim.models import word2vec
import gensim

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pickle
import gc
import xgboost as xgb

from sklearn.decomposition import PCA
from sklearn.manifold import Isomap

In [36]:
# Load the raw train/test CSVs. Missing values are replaced with an empty
# string so that later string handling never has to deal with NaN.
train_df = pd.read_csv("./train.csv").fillna("")
test_df = pd.read_csv("./test.csv").fillna("")

(404290, 6)
(2345796, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 6 columns):
id              404290 non-null int64
qid1            404290 non-null int64
qid2            404290 non-null int64
question1       404290 non-null object
question2       404290 non-null object
is_duplicate    404290 non-null int64
dtypes: int64(4), object(2)
memory usage: 18.5+ MB

In [6]:
# Class balance of the target column: number of rows per 'is_duplicate' value.
is_dup = train_df['is_duplicate'].value_counts()

# x and y are passed by keyword: recent seaborn releases no longer accept
# them as positional arguments.
sns.barplot(x=is_dup.index, y=is_dup.values, alpha=0.8)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Is Duplicate', fontsize=12)

Create basic features

  • word shared
  • different characters/words length
  • matching first/second/third/last word
  • punctuation
  • etc.

In [50]:
# Column names of the raw question text.
question1, question2 = 'question1', 'question2'

# Column names after the first cleaning pass, and after the second one.
# NOTE(review): this assumes each qd.add_clear_* pass writes a new column named
# <input column> + '_clear_N' -- confirm against quoradefs.
question1_tagg = question1 + '_clear_1'
question2_tagg = question2 + '_clear_1'
q1 = 'question1_clear_1_clear_2'
q2 = 'question2_clear_1_clear_2'

# Punctuation-difference features, computed on the raw questions.
for frame in (train_df, test_df):
    qd.create_diff_punteggiatura(frame, question1, question2)

# First cleaning pass on both questions of both splits.
for frame in (train_df, test_df):
    qd.add_clear_first(frame, question1)
    qd.add_clear_first(frame, question2)

# Second cleaning pass. The original cell ran it on train_df only, although
# test_df[q1] / test_df[q2] are used just below and in later cells.
for frame in (train_df, test_df):
    qd.add_clear_second(frame, question1_tagg)
    qd.add_clear_second(frame, question2_tagg)

# Pairwise features on the fully cleaned questions. Both splits now use the
# same call order, and calc_match_capital compares q1 with q2 (the original
# passed q1 for both arguments, i.e. compared a question with itself).
for frame in (train_df, test_df):
    qd.create_add_vars(frame, q1=q1, q2=q2)
    qd.calc_match_capital(frame, question1=q1, question2=q2)

In [48]:
# Every distinct cleaned question found in either question column of either split.
all_unique_quests = (
    set(train_df[q1]) | set(train_df[q2]) | set(test_df[q1]) | set(test_df[q2])
)

# Apply qd.get_words to each question through qd.flatmap and count the results.
all_words = qd.flatmap(qd.get_words, all_unique_quests)
cnt_words = Counter(all_words)

In [342]:
# NOTE(review): this cell is truncated in the export -- the parentheses below do
# not balance and the pd.Series(...) expression is cut off. Restore the missing
# lines from the original notebook before running.
# Visible part: concatenates the distinct cleaned questions of both columns of
# both splits.
all_unique_quests = list(
        list(set(train_df[q1])) + list(set(train_df[q2])) +
        list(set(test_df[q1])) + list(set(test_df[q2]))))
# Visible part: maps each string to its lower-cased first whitespace-separated token.
qcat = pd.Series(
    list(map(lambda x: (x.split()[0].lower()), list(
# Replaces every value of qcat by its reciprocal and converts the result to a dict.
qcat = (1 / (qcat)).to_dict()


Create features after tagger (see tagger notebook)

  • matching verbs, nouns, etc.
  • rarity: different class of rarity

  • sequences: matching structure of the phrase

In [24]:
# Tagged vocabulary produced by the tagger notebook. Columns used below:
# 'tag_recod' (recoded tag), 'name' (the word) and 'count' (its frequency).
work = pd.read_csv('tagged_list_counted_solved.csv', encoding='latin1', sep=';')

# One sub-frame per recoded tag.
sym = work[work['tag_recod'] == 'SYM']
fw = work[work['tag_recod'] == 'FW']
uh = work[work['tag_recod'] == 'UH']
mo = work[work['tag_recod'] == 'MO']
noun = work[work['tag_recod'] == 'NOUN']
verb = work[work['tag_recod'] == 'VERB']
jj = work[work['tag_recod'] == 'JJ']


def _name_counter(rows):
    """Return a Counter built from the 'name' column of ``rows``."""
    return Counter(list(rows['name'].values))


listverb = _name_counter(verb)
listsym = _name_counter(sym)
listuh = _name_counter(uh)
listfw = _name_counter(fw)
listnoun = _name_counter(noun)
listjj = _name_counter(jj)
listmo = _name_counter(mo)

# Column-name suffix -> vocabulary of that tag. (The closing brace was missing
# from the exported cell.)
tag_list_vars = {
    '_VERB': listverb,
    '_NOUN': listnoun,
    '_FW': listfw,
    '_SYM': listsym,
    '_MO': listmo,
    '_JJ': listjj,
    '_UH': listuh,
}

# Frequency bucket '<low>_<high>' -> suffix -> vocabulary of the words whose
# 'count' lies in the half-open interval (low, high].
tag_rare_vars = {}
for cut in [(0, 10), (10, 50), (50, 100), (100, 1000)]:
    low, high = cut
    tag_rare_vars[str(low) + '_' + str(high)] = {
        suffix: _name_counter(rows[(rows['count'] > low) & (rows['count'] <= high)])
        for suffix, rows in (('_VERB', verb), ('_NOUN', noun), ('_FW', fw), ('_JJ', jj))
    }

In [12]:
def _abs_count_diff(frame, vocab):
    """Return, per row, |qd.countword(q1 text, vocab) - qd.countword(q2 text, vocab)|.

    ``frame`` must contain the columns named by the notebook-level ``q1`` and
    ``q2``; ``vocab`` is passed unchanged to ``qd.countword``.
    """
    return frame.apply(
        axis=1,
        func=lambda row: abs(qd.countword(row[q1], vocab) - qd.countword(row[q2], vocab)))


# 'count<suffix>': difference between the two questions for each tag vocabulary.
for frame in (train_df, test_df):
    for tag_var, vocab in tag_list_vars.items():
        frame['count' + tag_var] = _abs_count_diff(frame, vocab)

# 'rare<bucket><suffix>': the same difference restricted to each frequency bucket.
# The original cell recomputed the '100_1000' bucket a second time for both
# frames after these loops; that pass produced identical columns and is dropped.
for frame in (train_df, test_df):
    for cut, vocab_by_tag in tag_rare_vars.items():
        for tag_var, vocab in vocab_by_tag.items():
            frame['rare' + cut + tag_var] = _abs_count_diff(frame, vocab)

In [ ]:
# Tag labels for which a per-question count difference is computed.
# (The closing bracket was missing from the exported cell.)
listona_tag = [
    'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN',
    'NNS', 'NP', 'NPS', 'PDT', 'POS', 'PP', 'PP$', 'RB', 'RBR', 'RBS', 'RP',
    'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP',
    'WP$', 'WRB'
]

# Loaded here so the cell also runs on a fresh kernel: the notebook otherwise
# only loads `dizionario` in a later cell.
# NOTE(review): assumes the values of this mapping are Counter-like objects
# indexed by tag label -- confirm against the tagger notebook.
dizionario = qd.load_obj('tag_counter')

# Fallback for questions missing from `dizionario`. The original passed the
# string 'ciccia' as the .get() default, so a missing question ended up
# indexing a str with a tag label and raised TypeError.
_no_tags = Counter()


def _tag_count(question, tag):
    """Return the count stored for ``tag`` under ``question`` (0 if the question is unknown)."""
    return dizionario.get(question, _no_tags)[tag]


for frame in (train_df, test_df):
    for tag in listona_tag:
        frame[tag + '_diff'] = frame.apply(
            axis=1, func=lambda x: abs(_tag_count(x[q1], tag) - _tag_count(x[q2], tag)))

# Object stored under 'tag_max_freq', handed to calc_match_capital2 as `diz`.
dizionario_max = qd.load_obj('tag_max_freq')

# calc_match_capital2 once per tag type: first for every type on train_df,
# then for every type on test_df.
for frame in (train_df, test_df):
    for tipo in ['JJ', 'FW', 'VERB', 'NOUN', 'SYM', 'MD', 'WRB', 'IN']:
        qd.calc_match_capital2(frame, q1, q2, diz=dizionario_max, tipo=tipo)

# Sequence similarity restricted to a single tag, then over the whole phrase.
for frame in (train_df, test_df):
    for t in ['VERB', 'NOUN', 'JJ', 'WRB']:
        frame['seq_sim_' + t] = frame.apply(
            axis=1, func=lambda x: qd.get_sequenceones_similarity(x[q1], x[q2], [t]))
    frame['seq_sim_tot'] = frame.apply(
        axis=1, func=lambda x: qd.get_sequenceones_similarity(x[q1], x[q2]))


def _seq_sim2_row(x):
    """Row function: qd.get_sequenceones_similarity2 of the pair, passed through qd.getback_function."""
    return qd.getback_function(qd.get_sequenceones_similarity2(x[q1], x[q2]))


# Two-column variant, appended to each frame as 'seq_sim1' / 'seq_sim2'.
tempvar = train_df.apply(axis=1, func=_seq_sim2_row)
tempvar.columns = ['seq_sim1', 'seq_sim2']
train_df = pd.concat([train_df, tempvar], axis=1)

tempvar = test_df.apply(axis=1, func=_seq_sim2_row)
tempvar.columns = ['seq_sim1', 'seq_sim2']
test_df = pd.concat([test_df, tempvar], axis=1)

Create "external" features

  • maths
  • SPOILER alert
  • unknown words shared
  • states shared
  • citizen shared
  • tvseries shared

In [ ]:
qd.get_diff_spoiler_math(train_df, q1, q2)
qd.get_diff_spoiler_math(test_df, q1, q2)

# Object stored under 'tag_counter'; the key 'ciccia' is mapped to an empty Counter.
dizionario = qd.load_obj('tag_counter')
dizionario['ciccia'] = Counter()


def _load_lowercase_names(path):
    """Read the 'name' column of the CSV at ``path`` as a list of lower-cased strings."""
    return [n.lower() for n in pd.read_csv(path)['name'].values]


# Forward slashes replace the original '.\\files\\...' literals, which only
# resolve on Windows; this form works on every platform.
serie = _load_lowercase_names('./files/tvseries.txt')
states = _load_lowercase_names('./files/states.txt')
citizen = _load_lowercase_names('./files/citizen.txt')

# NOTE: requires the NLTK 'words' corpus to be installed locally.
english_vocab = set(w.lower() for w in nltk.corpus.words.words())


def _append_pair_features(frame, pair_func, names):
    """Return ``frame`` with the columns ``names`` appended.

    ``pair_func`` is called with the q1 and q2 text of each row; its result is
    passed through ``qd.getback_function`` so that ``apply`` yields one column
    per returned value.
    """
    # axis=1 is required: without it DataFrame.apply hands columns, not rows,
    # to the function and x[q1] fails. The original calls omitted it.
    feats = frame.apply(
        axis=1, func=lambda x: qd.getback_function(pair_func(x[q1], x[q2])))
    feats.columns = names
    return pd.concat([frame, feats], axis=1)


# (column prefix, function of the two question texts). english_vocab is now
# passed for both splits; the original omitted it for test_df.
_external_features = [
    ('unk', lambda a, b: qd.get_unknown_variables(a, b, english_vocab)),
    ('serie', lambda a, b: qd.get_variables_from_lists(a, b, serie)),
    ('states', lambda a, b: qd.get_variables_from_lists(a, b, states)),
    ('citizen', lambda a, b: qd.get_variables_from_lists(a, b, citizen)),
]

for prefix, pair_func in _external_features:
    names = [prefix + str(k) for k in (1, 2, 3)]
    train_df = _append_pair_features(train_df, pair_func, names)
    test_df = _append_pair_features(test_df, pair_func, names)

Create bit features

In [63]:
# Words for which presence flags are computed.
loc_time = [
    'as', 'of', 'for', 'up', 'down', 'out', 'on', 'off', 'over', 'under',
    'again', 'so', 'too', 'very', 'few', 'more', 'most',
]

# For every word: '<word>_or' combines the two per-question results of
# qd.get_bit_single_word with `or`, '<word>_and' combines them with `and`.
for frame in (train_df, test_df):
    for word in loc_time:
        frame[word + '_or'] = frame.apply(
            axis=1,
            func=lambda x: qd.get_bit_single_word(x[q1], word) or qd.get_bit_single_word(x[q2], word))
        frame[word + '_and'] = frame.apply(
            axis=1,
            func=lambda x: qd.get_bit_single_word(x[q1], word) and qd.get_bit_single_word(x[q2], word))

Some features with word2vec

  • feature extraction using pca on words vector
  • isomap after pca on difference between phrases
  • W2V distances between phrases by removing a fraction of words

In [ ]:
# Re-read only the selected columns (by position) of the cleaned CSVs and
# replace missing values with the placeholder string "ciccia".
train_df = pd.read_csv('train_clear.csv', usecols=(1,6,9,10)).fillna("ciccia")
test_df = pd.read_csv('test_clear.csv', usecols=(1,6,7)).fillna("ciccia")

In [ ]:
# Columns holding the final cleaned text of each question.
q1 = 'question1_final'
q2 = 'question2_final'

# Corpus built from the train questions, then extended with the test questions.
corpus = qd.build_corpus(train_df, q1, q2)
corpus.extend(qd.build_corpus(test_df, q1, q2))

In [ ]:
# NOTE(review): both constructor calls are truncated in this export (arguments
# and closing parenthesis are missing); restore them from the original notebook.
model = word2vec.Word2Vec(

# NOTE(review): presumably a second Word2Vec model, judging by the name -- confirm.
model_50 = word2vec.Word2Vec(

In [ ]:
# Reload the full cleaned training frame. A forward-slash path replaces the
# original '.\\train_clear.csv', which only resolves on Windows.
train_df = pd.read_csv('./train_clear.csv', encoding='latin1').fillna('ciccia')

q1 = 'question1_final'
q2 = 'question2_final'

# Word counts over every distinct training question.
all_unique_quests = set(train_df[q1]) | set(train_df[q2])
all_words = qd.flatmap(qd.get_words, all_unique_quests)
cnt_words = Counter(all_words)

# The (up to) 200 most frequent words. Iterating over most_common() directly
# avoids the IndexError the original range(0, 200) indexing raised whenever
# fewer than 200 distinct words exist.
cnt_most = [word for word, _ in cnt_words.most_common(200)]

model = gensim.models.KeyedVectors.load('./model_best_50.bin')

# Rows of (id, q1 text, q2 text), each passed through qd.clear_phrase.
all_train = train_df[['id', q1, q2]].values
b = list(map(qd.clear_phrase, all_train))

results = []
step = 5000

# Process the phrases in chunks of `step` rows and report how long each chunk took.
for i in range(0, len(b), step):  #len(b)
    # `time` is the function imported by `from time import time`, so the
    # original `time.clock()` raised AttributeError (and time.clock no longer
    # exists since Python 3.8). Wall-clock timing is enough for a progress log.
    start = time()
    results.extend(list(map(qd.get_phrase_distances2, b[i:i + step])))
    elapsed = time() - start
    print(str(i) + '/404290 e ci ho impiegato ' + str(elapsed / 60) + ' minuti')

myres = np.array(results)
# Attach the distances to each row, looked up by the row's 'id'.
# NOTE(review): this assumes 'id' equals the row position in `b` -- confirm.
temp = train_df.apply(axis=1, func=lambda x: qd.getback_function(myres[x['id']]))
temp.columns = ['dist_' + str(i) for i in range(0, 10)]
train_df = pd.concat([train_df, temp], axis=1)

# Infinite values are turned into NaN.
train_df.replace(np.inf, np.nan, inplace=True)

# Feature columns: all columns of train_df except the identifier/text/target
# columns listed below.
# NOTE(review): the exclusion list is truncated in this export (closing
# brackets and possibly further names are missing).
ftrs = list(
    set(train_df.columns) - set([
        'Unnamed: 0', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate',
        'question1_clear_1', 'question2_clear_1', 'question1_final',


# Plot every distance column, using only the rows whose 'dist_0' is finite.
for dist_col in temp.columns:
    finite_rows = train_df.loc[np.isfinite(train_df['dist_0'])]
    qd.plot_variable(finite_rows, dist_col)

# Same distance computation for the test split.
all_test = test_df[['test_id', q1, q2]].values
b = list(map(qd.clear_phrase, all_test))

results = []
step = 5000

for i in range(0, len(b), step):  #len(b)
    # `time` is the function imported by `from time import time`; the original
    # `time.clock()` raised AttributeError.
    start = time()
    results.extend(list(map(qd.get_phrase_distances2, b[i:i + step])))
    elapsed = time() - start
    print(str(i) + '/2345796 e ci ho impiegato ' + str(elapsed / 60) + ' minuti')

myres = np.array(results)
# The test frame is keyed by 'test_id' (see the column selection above); the
# original looked rows up by 'id', a column the test frame does not have.
# NOTE(review): this assumes 'test_id' equals the row position in `b` -- confirm.
temp = test_df.apply(axis=1, func=lambda x: qd.getback_function(myres[x['test_id']]))
temp.columns = ['dist_' + str(i) for i in range(0, 10)]
test_df = pd.concat([test_df, temp], axis=1)
test_df.replace(np.inf, np.nan, inplace=True)

# Feature columns: all columns of test_df except the names listed below.
# NOTE(review): the exclusion list is truncated in this export (closing
# brackets and possibly further names are missing).
ftrs = list(
    set(test_df.columns) - set([
        'Unnamed: 0', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate',
        'question1_clear_1', 'question2_clear_1', 'question1_final',


In [ ]:
pca_names = ['diff_eigenv_pca', 'cos_pca', 'diff_ratio_ecc_pca']
w2v_names = ['norm_mean_wv', 'norm_sum_wv', 'cos_mean_wv', 'cos_sum_wv']


def _pca_row(x):
    """Row function: qd.pca_vars of the question pair, passed through qd.getback_function."""
    return qd.getback_function(qd.pca_vars(x[q1], x[q2]))


def _w2v_row(x):
    """Row function: qd.get_W2V_variables of the pair (with `model`), passed through qd.getback_function."""
    return qd.getback_function(qd.get_W2V_variables(x[q1], x[q2], model))


# PCA-based variables; missing values become 0 afterwards.
tempvar = train_df.apply(axis=1, func=_pca_row)
tempvar.columns = pca_names
train_df = pd.concat([train_df, tempvar], axis=1).fillna(0)

tempvar = test_df.apply(axis=1, func=_pca_row)
tempvar.columns = pca_names
test_df = pd.concat([test_df, tempvar], axis=1).fillna(0)

# Word2vec-based variables; infinities and missing values become 0 afterwards.
tempvar = train_df.apply(axis=1, func=_w2v_row)
tempvar.columns = w2v_names
train_df = pd.concat([train_df, tempvar], axis=1).replace(np.inf, np.nan).fillna(0)

tempvar = test_df.apply(axis=1, func=_w2v_row)
tempvar.columns = w2v_names
test_df = pd.concat([test_df, tempvar], axis=1).replace(np.inf, np.nan).fillna(0)

In [ ]:
# NOTE(review): `train` and `test` are not defined in any visible cell of this
# notebook; this cell relies on kernel state -- confirm where they come from.
# v1/v2 (train) and v3/v4 (test): per-row result of qd.get_W2V_sum_sentence for
# question 1 and question 2 respectively, expanded through qd.getback_function.
v1 = train.apply(axis=1,func=lambda x: qd.getback_function([qd.get_W2V_sum_sentence(x.question1_final,model)]))
v2 = train.apply(axis=1,func=lambda x: qd.getback_function([qd.get_W2V_sum_sentence(x.question2_final,model)]))

v3 = test.apply(axis=1,func=lambda x: qd.getback_function([qd.get_W2V_sum_sentence(x.question1_final,model)]))
v4 = test.apply(axis=1,func=lambda x: qd.getback_function([qd.get_W2V_sum_sentence(x.question2_final,model)]))

# Column names diff_0 .. diff_49 for the per-dimension differences.
name = ['diff_' + str(i) for i in range(0, 50)]

# The train difference is signed here, while the test difference below is
# already absolute; np.abs is applied to the whole `train` frame further down.
train_diff = pd.DataFrame(v1 - v2)
train_diff.columns = name
train = pd.concat([train, train_diff], axis=1)

test_diff = pd.DataFrame(np.abs(v3 - v4))
test_diff.columns = name
test = pd.concat([test, test_diff], axis=1)

# Element-wise absolute value of every column of both frames.
train = np.abs(train)
test = np.abs(test)

# Train rows stacked on top of test rows, without the 'Unnamed: 0' column.
traintestpca = pd.concat([train, test], axis=0).drop('Unnamed: 0', axis=1)

pca = PCA(n_components=50)
pca_diff = pca.fit_transform(traintestpca)

# The first 404290 rows are treated as train, the remainder as test.
# NOTE(review): component columns 1, 2 and 3 are used and column 0 is skipped --
# confirm this is intended and not an off-by-one.
pca1_train = pca_diff[:404290, 1]
pca2_train = pca_diff[:404290, 2]
pca3_train = pca_diff[:404290, 3]
pca1_test = pca_diff[404290:, 1]
pca2_test = pca_diff[404290:, 2]
pca3_test = pca_diff[404290:, 3]

train_df['pca1'] = pca1_train
train_df['pca2'] = pca2_train
train_df['pca3'] = pca3_train

# Isomap is fitted on a random sample of 1000 rows; no random_state is passed
# to sample(), so the fit differs between runs.
X_iso_fit = Isomap(
    n_neighbors=30, n_components=3).fit(traintestpca.sample(n=1000))

# NOTE(review): several statements in this cell are truncated in the export
# (the two chunked transform loops and the first three plot calls); restore
# them from the original notebook before running.
step = 50000
iso_all = []
# Loop over the train rows in chunks of `step`; the start of the loop body is missing.
for i in range(0, 404290, step):
            traintestpca[:404290].iloc[0 + i:i + step]))  # 2345796:

# One train_df column per Isomap component: iso_map_pca_0 .. iso_map_pca_2.
name = 'iso_map_pca'
for i in range(0, 3):
    j = name + '_' + str(i)
    train_df[j] = np.array(iso_all)[:404290, i]

# Plots of each new variable, restricted to rows inside the given value range.
# The first three calls are missing their second argument and closing parenthesis.
qd.plot_variable(train_df[(train_df['pca1'] > -4) & (train_df['pca1'] < 4)],
qd.plot_variable(train_df[(train_df['pca2'] > -4) & (train_df['pca2'] < 4)],
qd.plot_variable(train_df[(train_df['pca3'] > -4) & (train_df['pca3'] < 4)],
qd.plot_variable(train_df[(train_df['iso_map_pca_0'] > -10) &
                         (train_df['iso_map_pca_0'] < 10)], 'iso_map_pca_0')
qd.plot_variable(train_df[(train_df['iso_map_pca_1'] > -5) &
                         (train_df['iso_map_pca_1'] < 5)], 'iso_map_pca_1')
qd.plot_variable(train_df[(train_df['iso_map_pca_2'] > -5) &
                         (train_df['iso_map_pca_2'] < 5)], 'iso_map_pca_2')

# NOTE(review): the test components are written to `test`, whereas the train
# components were written to `train_df` -- confirm which frame is intended.
test['pca1'] = pca1_test
test['pca2'] = pca2_test
test['pca3'] = pca3_test

# Keep only the rows after the first 404290 (the test part).
traintestpca = traintestpca.iloc[404290:]

step = 100000
iso_all_test = []

# Loop over the test rows in chunks of `step`; the start of the loop body is missing.
for i in range(0, 2345796, step):
        X_iso_fit.transform(traintestpca.iloc[0 + i:i + step]))  # 2345796:

name = 'iso_map_pca'
for i in range(0, 3):
    j = name + '_' + str(i)
    test[j] = np.array(iso_all_test)[:, i]

Train XGBoost model

In [3]:
# `df` is a second name for the same object as train_df (no copy is made).
df = train_df

In [7]:
# Decaying learning-rate schedule with a floor: starts at e_start and is
# never allowed to go below 0.05.
n = 1500
e_start = 0.3
x = np.linspace(0, n, n)
eta_adaptive = np.maximum(e_start * (1 - (x / (5 + x))), 0.05)

# NOTE(review): both dict literals below are truncated in this export (the
# first entry and the closing brace are missing); restore them from the
# original notebook before running.
# Options passed separately from the booster parameters.
other_par = {
    , 'early_stopping_rounds':50
    , 'verbose_eval':20

# Booster parameters.
# NOTE(review): 'maximize': True together with 'eval_metric': 'logloss' looks
# contradictory (a loss is normally minimised) -- confirm how the training
# helper uses this flag.
params = {
    , 'silent': 0
    , 'nthread':10
    , 'eta':0.05
    , 'eval_metric': 'logloss'
    , 'maximize':True
    , 'max_depth':15
    , 'min_child_weight': 15
    , 'colsample_bytree': 0.75
    , 'subsample': 0.5
    , 'gamma': 70
    , 'alpha': 5

In [8]:
# Text columns that may be present in df.
# NOTE(review): the list is truncated in this export (no closing bracket), and
# 'question1_clear_1_clear_2' appears twice.
eventuali = [
    'question1_clear_1', 'question2_clear_1', 'question1_clear_1_clear_2',
    'question1_clear_1_clear_2', 'question2_clear_1_clear_2', 'question1_final', 'question2_final'
# Model features: every column of df except the identifier/text/target columns.
# NOTE(review): the expression is truncated after the '+'; presumably
# `eventuali` is appended to the exclusion list -- confirm.
features = list(
    set(df.columns) -
    set(['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate'] +

In [14]:

In [17]:
# Training matrix: the feature columns followed by the target column.
x_train = df[[*features, 'is_duplicate']]

In [18]:
# Train the first model and unpack the four values the helper returns.
# NOTE(review): train_xgboost_w is called without the `qd.` prefix and is not
# defined in any visible cell -- confirm where it comes from.
model_xgb, roc_auc_test, roc_auc_train, var_imp = train_xgboost_w(
    x_train, features, 'is_duplicate', params, other_par)

Bayesian Optimization

In [12]:
# NOTE(review): the three dict literals below are truncated in this export
# (missing first entry and/or closing brace); restore them before running.
other_par = {
    , 'early_stopping_rounds':50
    , 'verbose_eval':False

# Fixed booster parameters used during the search.
params = {
    , 'silent': 0
    , 'nthread':10
    , 'eta':0.02
    , 'eval_metric': 'logloss'
    , 'maximize':True

# (lower, upper) bounds of each hyper-parameter explored by the search.
myranges = {
    'min_child_weight': (1, 250),
    'colsample_bytree': (0.5, 1),
    'max_depth': (14, 25),
    'subsample': (0.5, 1),
    'gamma': (0, 300),
    'alpha': (0, 100)

In [10]:
target = 'is_duplicate'

# Shuffle the rows before splitting. The shuffle is now seeded (with the seed
# already used for the split) so that re-running the cell gives the same split.
df = df.sample(frac=1, random_state=786)
X_train, X_test, y_train, y_test = qd.train_test_split(
    df[features], df[target], test_size=0.33, random_state=786)

# Weight given to rows with a non-zero label. The original cell first computed
# the observed class ratio from value_counts() and immediately overwrote it
# with this constant; only the constant was ever used, so the dead computation
# is removed.
r = 1./0.165


def _row_weights(labels, positive_weight):
    """Return ``positive_weight`` for rows whose label is non-zero and 1 for label 0."""
    return (labels * positive_weight).where(labels != 0, 1)


w_train = _row_weights(y_train, r)
w_test = _row_weights(y_test, r)
xg_train = xgb.DMatrix(X_train, label=y_train, weight=w_train)
xg_test = xgb.DMatrix(X_test, label=y_test, weight=w_test)
watchlist = [(xg_train, 'train'), (xg_test, 'test')]

In [11]:
# NOTE(review): the call is truncated in this export (arguments and closing
# parenthesis are missing); restore it from the original notebook.
xgbBO = qd.go_with_BayesianOptimization(

   33 | 42m55s |   -0.31887 |   93.1281 |             0.5413 |    0.4951 |     14.9974 |           245.7469 |      0.6142 | 
   34 | 46m52s |   -0.31877 |   13.1527 |             0.9981 |   38.0240 |     14.7632 |           214.8538 |      0.8860 | 
   35 | 45m24s |   -0.30844 |    5.4838 |             0.7620 |    3.3574 |     14.9620 |           175.8215 |      0.9743 | 
   36 | 46m26s |   -0.30919 |   47.9023 |             0.7800 |    2.8984 |     14.9278 |           151.2222 |      0.9774 | 
   37 | 45m57s |   -0.32931 |   59.7161 |             0.6971 |   33.0900 |     14.9995 |           142.7598 |      0.5602 | 
   38 | 54m39s |   -0.34550 |    2.5252 |             0.7673 |  169.8156 |     14.4513 |             1.1195 |      0.7219 | 
   39 | 49m58s |   -0.30771 |   28.2608 |             0.5847 |    6.4338 |     14.8763 |            45.7781 |      0.9069 | 
   40 | 45m31s |   -0.31155 |   29.8622 |             0.6408 |    1.2015 |     14.3428 |           186.4499 |      0.6210 | 
In [13]:
# NOTE(review): the call is truncated in this export (arguments and closing
# parenthesis are missing); restore it from the original notebook.
xgbBO = qd.go_with_BayesianOptimization(

In [16]:
n = 1500

# NOTE(review): both dict literals below are truncated in this export (the
# first entry and the closing brace are missing); restore them before running.
other_par = {
    , 'early_stopping_rounds':50
    , 'verbose_eval':20

# The tuned values below (alpha, colsample_bytree, gamma, min_child_weight,
# subsample) match row 39 of the optimisation log shown above; that row's
# max_depth of 14.8763 appears here as 15.
params = {
    , 'silent': 0
    , 'nthread':10
    , 'eta':0.02
    , 'eval_metric': 'logloss'
    , 'maximize':True
    , 'max_depth':15
    , 'min_child_weight': 45.7781
    , 'colsample_bytree': 0.5847
    , 'subsample': 0.9069
    , 'gamma': 6.4338 
    , 'alpha': 28.2608

# NOTE(review): train_xgboost is called without the `qd.` prefix and is not
# defined in any visible cell -- confirm where it comes from.
retrain = train_xgboost(df, features, 'is_duplicate', params, other_par)

[0]	train-logloss:0.682476	test-logloss:0.682646
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
[20]	train-logloss:0.533695	test-logloss:0.536705
[40]	train-logloss:0.453194	test-logloss:0.458528
[60]	train-logloss:0.406491	test-logloss:0.413945
[80]	train-logloss:0.377739	test-logloss:0.38719
[100]	train-logloss:0.358584	test-logloss:0.370053
[120]	train-logloss:0.345586	test-logloss:0.358891
[140]	train-logloss:0.33546	test-logloss:0.350698
[160]	train-logloss:0.327844	test-logloss:0.344904
[180]	train-logloss:0.321875	test-logloss:0.340596
[200]	train-logloss:0.317022	test-logloss:0.337165
[220]	train-logloss:0.312479	test-logloss:0.334028
In [ ]:

Make Submission

In [39]:
# Columns present in train_df but absent from test_df.
list(set(train_df.columns).difference(test_df.columns))

['is_duplicate', 'id']

In [41]:
# Build the submission from the test features and the retrained model (the
# cell output reports "submission.csv created").
qd.create_submission_xgboost(test_df, features, retrain)

submission.csv created

In [13]:
# Score the training rows with the retrained model so that misclassified
# pairs can be inspected below. The module is imported as `xgb`; the original
# `xgboost.DMatrix` raised NameError on a fresh kernel.
train_df['predict_proba'] = retrain.predict(xgb.DMatrix(train_df[features]))

In [376]:
# Duplicate pairs that received a predicted probability below 0.2.
train_df.loc[(train_df['is_duplicate'] == 1) & (train_df['predict_proba'] < 0.2)].head()

id is_duplicate question1_final question2_final diff_and_star_in diff_or_dollar diff_and_end_point diff_or_under_in diff_and_hat_in diff_and_tilde_in ... 1_word_match 2_word_match 3_word_match 1_2_word_match 1_3_word_match 2_3_word_match 1_2_3_word_match sum_first_log word_share_tfidf predict_proba
62 62 1 How be new Harry Potter book Harry Potter curs... how bad be new book J K Rowling False False False False False False ... False False False False False False False -13.126410 0.008324 0.005416
66 66 1 what be good book ever make what be important book you have ever read False False False False False False ... True True False True False False False -13.522170 0.438651 0.112567
92 92 1 what be some well romantic movie English what be well romantic movie you have ever see False False False False False False ... True True False True False False False -13.522170 0.896681 0.131273
93 93 1 what cause nightmare what cause nightmare seem real False False False False False False ... True True True True True True True -13.522170 0.942560 0.181026
135 135 1 who be Rohingya Muslims who be Rohingya people False False False False False False ... True True True True True True True -10.672345 0.995288 0.125229

5 rows × 66 columns

In [377]:
# Non-duplicate pairs that received a predicted probability above 0.9.
train_df.loc[(train_df['is_duplicate'] == 0) & (train_df['predict_proba'] > 0.9)].head()

id is_duplicate question1_final question2_final diff_and_star_in diff_or_dollar diff_and_end_point diff_or_under_in diff_and_hat_in diff_and_tilde_in ... 1_word_match 2_word_match 3_word_match 1_2_word_match 1_3_word_match 2_3_word_match 1_2_3_word_match sum_first_log word_share_tfidf predict_proba
19249 19249 0 what be like many poor people America what work be like many poor people America False False False False False False ... True False False False False False False -1.352217e+01 0.983210 0.963356
25157 25157 0 how do you say I miss you my love farsi Farsi how do you say I miss you False False False False False False ... False False False False False False False 9.959609e-07 0.999049 0.937159
26052 26052 0 how do I lose 20 kgs year how can I lose weight quickly False False False False False False ... True False True False True False False -1.312641e+01 0.042834 0.908189
28540 28540 0 how do I invest my money wisely how can I invest my money wisely False False False False False False ... True False True False True False False -1.312641e+01 0.999856 0.934331
39986 39986 0 what be difference between freemason Illuminati what be difference between Freemason Illuminati False False False False False False ... True True True True True True True -1.352217e+01 1.000000 0.920126

5 rows × 66 columns

In [ ]: