(Based on the published notebook by Pavel Nesterov; converted to Python 3)
In [ ]:
from IPython.core.display import HTML
HTML("""
<style>
div.cell { /* Tunes the space between cells */
margin-top:1em;
margin-bottom:1em;
}
div.text_cell_render h1 { /* Main titles bigger, centered */
font-size: 2.2em;
line-height:1.4em;
text-align:center;
}
div.text_cell_render h2 { /* Bring part headings closer to the text */
margin-bottom: -0.4em;
}
div.text_cell_render { /* Customize text cells */
font-family: 'Times New Roman';
font-size:1.5em;
line-height:1.4em;
padding-left:3em;
padding-right:3em;
}
</style>
""")
In [ ]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.max_columns', 500)
import seaborn as sns
sns.set_style("dark")
plt.rcParams['figure.figsize'] = 16, 12
from tqdm import tqdm, tqdm_notebook
import itertools as it
import pickle
import glob
import os
import string
from scipy import sparse
import nltk
import spacy
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, make_scorer
from sklearn.decomposition import TruncatedSVD
from scipy.optimize import minimize
import eli5
from IPython.display import display
import xgboost as xgb
In [ ]:
df_train = pd.read_csv('./../data/train.csv',
dtype={
    'question1': str,
    'question2': str
})
df_train['test_id'] = -1
df_test = pd.read_csv('./../data/test.csv',
dtype={
    'question1': str,
    'question2': str
})
df_test['id'] = -1
df_test['qid1'] = -1
df_test['qid2'] = -1
df_test['is_duplicate'] = -1
df = pd.concat([df_train, df_test])
df['question1'] = df['question1'].fillna('')
df['question2'] = df['question2'].fillna('')
df['uid'] = np.arange(df.shape[0])
df = df.set_index(['uid'])
print(df.dtypes)
del(df_train, df_test)
In [ ]:
ix_train = np.where(df['id'] >= 0)[0]
ix_test = np.where(df['id'] == -1)[0]
ix_is_dup = np.where(df['is_duplicate'] == 1)[0]
ix_not_dup = np.where(df['is_duplicate'] == 0)[0]
In [ ]:
print(df[df['is_duplicate'] >= 0]['is_duplicate'].value_counts(normalize=True))
In [ ]:
if len(glob.glob('./../data/tmp/0_*.csv')) < 5:
df_submit = df.loc[ix_test][['test_id', 'is_duplicate']].copy()
df_submit['is_duplicate'] = 0.0
df_submit.to_csv('./../data/tmp/0_zeros.csv', index=False)
df_submit['is_duplicate'] = 1.0
df_submit.to_csv('./../data/tmp/0_ones.csv', index=False)
df_submit['is_duplicate'] = 0.5
df_submit.to_csv('./../data/tmp/0_half.csv', index=False)
df_submit['is_duplicate'] = 0.25
df_submit.to_csv('./../data/tmp/0_quarter.csv', index=False)
df_submit['is_duplicate'] = 0.75
df_submit.to_csv('./../data/tmp/0_threequarters.csv', index=False)
In [ ]:
score0 = 6.01888
score1 = 28.52056
score05 = 0.69315
score025 = 0.47913
score075 = 1.19485
Recall the formula for the risk of a hypothesis: $$\large Q\left(h\right) = \text{E}_{x, y \sim p\left(x, y\right)}\left[L\left(h\left(x\right), y\right)\right]$$
The empirical risk then has the form: $$\large Q_{\text{emp}}\left(h\right) = \frac{1}{n}\sum_{i = 1}^n L\left(h\left(x_i\right), y_i\right)$$
If we know that the label distribution is not uniform, we can take this into account by adding per-example weights: $$\large Q_{\text{emp}}\left(h\right) = \sum_{i = 1}^n w_i L\left(h\left(x_i\right), y_i\right)$$
In the Quora Question Pairs competition, $L$ is the logloss metric (recall logistic regression), and a hypothesis outputs the probability of belonging to the duplicate class, $h\left(x_i\right) = p_i$: $$\large\begin{array}{rcl} Q_{\text{emp}} &=& \frac{1}{n}\sum_{i = 1}^n L\left(h\left(x_i\right), y_i\right) \\ &=& -\frac{1}{n}\sum_{i = 1}^n \left[y_i \log p_i + \left(1 - y_i\right)\log\left(1 - p_i\right)\right] \\ &=& -\frac{1}{n}\sum_{i \in I_0} \log\left(1 - p_i\right) - \frac{1}{n}\sum_{i \in I_1} \log p_i \end{array}$$
Suppose we submit $p_i = 0$ for all $i$; the scorer clips probabilities to a small $\epsilon$, so with $n_i = |I_i|$ and $r_i = n_i/n$ we get equation (1): $$\large\begin{array}{rcl} q_0 &\approx& -\frac{1}{n}\sum_{i \in I_0} \log\left(1 - \epsilon\right) - \frac{1}{n}\sum_{i \in I_1} \log \epsilon \\ &=& -\frac{n_0}{n}\log\left(1 - \epsilon\right) - \frac{n_1}{n} \log \epsilon \\ &=& -r_0\log\left(1 - \epsilon\right) - r_1 \log \epsilon \end{array}$$
For $p_i = 1$ for all $i$, equation (2): $$\large\begin{array}{rcl} q_1 &\approx& -r_0\log \epsilon - r_1 \log \left(1 - \epsilon\right) \end{array}$$
For $p_i = \frac{1}{2}$ for all $i$, equation (3): $$\large\begin{array}{rcl} q_{1/2} &=& -r_0\log \frac{1}{2} - r_1 \log \frac{1}{2} \\ &=& \left(r_0 + r_1\right) \log 2 \end{array}$$
(1) + (2): $$\large\begin{array}{rcl} -\left(q_0+ q_1\right) &=& r_0 \log \left(1 - \epsilon\right) + r_1 \log \epsilon + r_0 \log \epsilon + r_1 \log \left(1 - \epsilon\right) \\ &=& \left(r_0 + r_1\right) \log \epsilon \left(1 - \epsilon\right) \end{array}$$
Using equation (3), we solve for $\epsilon$: $$\large\begin{array}{rcl} -\left(q_0+ q_1\right) &=& \frac{q_{1/2}}{\log 2} \log \epsilon \left(1 - \epsilon\right) \Leftrightarrow \\ \log \epsilon \left(1 - \epsilon\right) &=& \frac{q_0 + q_1}{q_{1/2}} \log \frac{1}{2} \Leftrightarrow \\ \log \epsilon \left(1 - \epsilon\right) &=& A \Leftrightarrow \\ \epsilon^2 - \epsilon + e^A &=& 0 \Rightarrow \\ \epsilon &=& \frac{1}{2}\left(1 \pm \sqrt{1 - 4e^A}\right) \end{array}$$
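Before plugging in the real leaderboard scores, here is a quick round-trip check of the derivation with made-up values (a sketch; eps_true, r0_true, r1_true below are hypothetical, not the competition's):
In [ ]:
# Hypothetical r0, r1, eps: forward-compute the three probe scores,
# then recover eps via the quadratic derived above.
eps_true, r0_true, r1_true = 1e-15, 0.82, 0.18
q0 = -r0_true*np.log(1 - eps_true) - r1_true*np.log(eps_true)
q1 = -r0_true*np.log(eps_true) - r1_true*np.log(1 - eps_true)
q_half = (r0_true + r1_true)*np.log(2)
A_check = (q0 + q1)*np.log(0.5)/q_half
print('recovered eps ~', (1 - np.sqrt(1 - 4*np.exp(A_check)))/2)  # the small root, ~1e-15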
In [ ]:
A = (score0 + score1)*np.log(0.5)/score05
print('A =', A)
print('eps_0 =', (1 + np.sqrt(1 - 4*np.exp(A)))/2)
print('eps_1 =', (1 - np.sqrt(1 - 4*np.exp(A)))/2)
eps = 10e-16  # = 1e-15, matching eps_1 computed above
print('eps =', eps)
For notational convenience, introduce two more constants: $$\large\begin{array}{rcl} B &=& \log \left(1 - \epsilon\right) \\ C &=& \log \epsilon \end{array}$$
In [ ]:
B = np.log(1 - eps)
print('B =', B)
C = np.log(eps)
print('C =', C)
Solving the system of equations (1) and (2), we obtain: $$\large\begin{array}{rcl} r_1 &=& \frac{B\left(q_1 - \frac{C}{B}q_0\right)}{C^2 - B^2} \\ r_0 &=& \frac{1}{C}\left(-q_1 - r_1 B\right) \end{array}$$
In [ ]:
r1 = (score1 - (C/B)*score0) / ((C*C/B) - B)
print('r1 =', r1)
r0 = (-score1 - r1*B)/C
print('r0 =', r0)
print('r0 + r1 =', r0 + r1)
In [ ]:
if not os.path.isfile('./../data/tmp/0_174.csv'):
df_submit = df.loc[ix_test][['test_id', 'is_duplicate']].copy()
df_submit['is_duplicate'] = 0.17426442474
df_submit.to_csv('./../data/tmp/0_174.csv', index=False)
It turns out that the label distributions on the train and test sets differ; we would like to train on the train set while effectively optimizing the loss on the test set. Let us make a reasonable assumption:
Denote by $p$ the best model on the training set; then, writing $a = P\left(x \mid y = 1\right)$, $b = P\left(x \mid y = 0\right)$ and $p_i = P\left(y = i\right)$:
$$\large\begin{array}{rcl} p = P\left(y = 1 \mid x\right) &=& \frac{P\left(x \mid y = 1\right)P\left(y = 1\right)}{P\left(x \mid y = 1\right)P\left(y = 1\right) + P\left(x \mid y = 0\right)P\left(y = 0\right)} \\ &=& \frac{a p_1}{a p_1 + b p_0} \Rightarrow \\ b p_0 &=& \frac{a p_1}{p} - a p_1 = \frac{a p_1 - a p_1 p}{p} \end{array}$$By assumption, the label distribution on the test set is simply skewed, i.e. $P\left(y' = i\right) = \gamma_i P\left(y = i\right)$; then:
$$\large\begin{array}{rcl} P\left(y' = 1 \mid x\right) &=& \frac{P\left(x \mid y' = 1\right)P\left(y' = 1\right)}{P\left(x \mid y' = 1\right)P\left(y' = 1\right) + P\left(x \mid y' = 0\right)P\left(y' = 0\right)} \\ &=& \frac{a\gamma_1p_1}{a\gamma_1p_1 + b\gamma_0p_0} \\ &=& \frac{a\gamma_1p_1}{a\gamma_1p_1 + \gamma_0\frac{a p_1 - a p_1 p}{p}} \\ &=& \frac{a\gamma_1p_1p}{a\gamma_1p_1p + \gamma_0a p_1 - \gamma_0a p_1 p} \\ &=& \frac{\gamma_1 p}{\gamma_1 p + \gamma_0 - \gamma_0 p} \\ &=& \frac{\gamma_1 p}{\gamma_1 p + \gamma_0 \left(1 - p\right)} \\ \end{array}$$It follows that, before submission, the predictions of a model trained on the training set must be transformed with the function: $$\large f\left(x\right) = \frac{\gamma_1 x}{\gamma_1 x + \gamma_0 \left(1 - x\right)}$$
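One handy property of this transformation (a sketch, not from the original notebook): applying it with $\gamma_0$ and $\gamma_1$ swapped yields its inverse, which is easy to verify numerically:
In [ ]:
# Hypothetical gammas for illustration; f with swapped gammas inverts f.
g0, g1 = 1.6, 0.5
f = lambda x: g1*x/(g1*x + g0*(1 - x))
f_inv = lambda y: g0*y/(g0*y + g1*(1 - y))
xs = np.linspace(0.01, 0.99, 9)
print(np.allclose(f_inv(f(xs)), xs))  # True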
In [ ]:
d = df[df['is_duplicate'] >= 0]['is_duplicate'].value_counts(normalize=True).to_dict()
print('P(y = 0) =', d[0])
print('P(y = 1) =', d[1])
print('P(y\' = 0) =', r0)
print('P(y\' = 1) =', r1)
gamma_0 = r0/d[0]
gamma_1 = r1/d[1]
print('gamma_0 =', gamma_0)
print('gamma_1 =', gamma_1)
def link_function(x):
return gamma_1*x/(gamma_1*x + gamma_0*(1 - x))
support = np.linspace(0, 1, 1000)
values = link_function(support)
fig, ax = plt.subplots()
ax.plot(support, values)
ax.set_title('Link transformation', fontsize=20)
ax.set_xlabel('x')
ax.set_ylabel('f(x)')
plt.show()
In [ ]:
df['len1'] = df['question1'].str.len().astype(np.float32)
df['len2'] = df['question2'].str.len().astype(np.float32)
df['abs_diff_len1_len2'] = np.abs(df['len1'] - df['len2'])
In [ ]:
def plot_real_feature(fname):
fig = plt.figure()
ax1 = plt.subplot2grid((3, 2), (0, 0), colspan=2)
ax2 = plt.subplot2grid((3, 2), (1, 0), colspan=2)
ax3 = plt.subplot2grid((3, 2), (2, 0))
ax4 = plt.subplot2grid((3, 2), (2, 1))
ax1.set_title('Distribution of %s' % fname, fontsize=20)
sns.distplot(df.loc[ix_train][fname],
bins=50,
ax=ax1)
sns.distplot(df.loc[ix_is_dup][fname],
bins=50,
ax=ax2,
label='is dup')
sns.distplot(df.loc[ix_not_dup][fname],
bins=50,
ax=ax2,
label='not dup')
ax2.legend(loc='upper right', prop={'size': 18})
sns.boxplot(y=fname,
x='is_duplicate',
data=df.loc[ix_train],
ax=ax3)
sns.violinplot(y=fname,
x='is_duplicate',
data=df.loc[ix_train],
ax=ax4)
plt.show()
plot_real_feature('abs_diff_len1_len2')
In [ ]:
print(df.loc[ix_train]['abs_diff_len1_len2'].describe())
In [ ]:
max_in_dup = df.loc[ix_is_dup]['abs_diff_len1_len2'].max()
print('Max among duplicates: ', max_in_dup)
max_in_not_dups = df.loc[ix_not_dup]['abs_diff_len1_len2'].max()
print('Max among non-duplicates: ', max_in_not_dups)
print('Rows above the threshold: ', (df.loc[ix_train]['abs_diff_len1_len2'] > max_in_dup).sum())
std_in_dups = df.loc[ix_is_dup]['abs_diff_len1_len2'].std()
print('Std dev among duplicates:', std_in_dups)
replace_value = max_in_dup + 2*std_in_dups
print('Replacement value: ', replace_value)
In [ ]:
df['abs_diff_len1_len2'] = df['abs_diff_len1_len2'].clip(upper=replace_value)
plot_real_feature('abs_diff_len1_len2')
In [ ]:
df['log_abs_diff_len1_len2'] = np.log(df['abs_diff_len1_len2'] + 1)
plot_real_feature('log_abs_diff_len1_len2')
In [ ]:
df['ratio_len1_len2'] = df['len1'].apply(lambda x: x if x > 0.0 else 1.0)/\
df['len2'].apply(lambda x: x if x > 0.0 else 1.0)
plot_real_feature('ratio_len1_len2')
In [ ]:
print (df.loc[ix_train]['ratio_len1_len2'].describe())
In [ ]:
max_in_dup = df.loc[ix_is_dup]['ratio_len1_len2'].max()
print('Max among duplicates: ', max_in_dup)
max_in_not_dups = df.loc[ix_not_dup]['ratio_len1_len2'].max()
print('Max among non-duplicates: ', max_in_not_dups)
print('Rows above the threshold: ', (df.loc[ix_train]['ratio_len1_len2'] > max_in_dup).sum())
std_in_dups = df.loc[ix_is_dup]['ratio_len1_len2'].std()
print('Std dev among duplicates: ', std_in_dups)
replace_value = max_in_dup + 2*std_in_dups
print('Replacement value: ', replace_value)
In [ ]:
df['ratio_len1_len2'] = df['ratio_len1_len2'].clip(upper=replace_value)
plot_real_feature('ratio_len1_len2')
In [ ]:
df['log_ratio_len1_len2'] = np.log(df['ratio_len1_len2'] + 1)
plot_real_feature('log_ratio_len1_len2')
In [ ]:
sns.pairplot(df.loc[ix_train][df.columns[7:].tolist() + ['is_duplicate']], hue="is_duplicate")
In [ ]:
sns.jointplot('log_ratio_len1_len2', 'abs_diff_len1_len2', df.loc[ix_train], alpha=0.01)
In [ ]:
def plot_corr(predictors):
predictors = predictors[:]
predictors += ['is_duplicate']
mcorr = df.loc[ix_train][predictors].corr()
mask = np.zeros_like(mcorr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
cmap = sns.diverging_palette(220, 10, as_cmap=True)
g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')
g.set_xticklabels(predictors, rotation=90)
g.set_yticklabels(reversed(predictors))
plt.show()
plot_corr(df.columns[7:].tolist())
In [ ]:
predictors = df.columns[7:].tolist()
print (predictors)
def check_model(predictors):
classifier = lambda: SGDClassifier(
loss='log',
penalty='elasticnet',
fit_intercept=True,
n_iter=100,
shuffle=True,
n_jobs=-1,
class_weight=None)
model = Pipeline(steps=[
('ss', StandardScaler()),
('en', classifier())
])
parameters = {
'en__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.02, 0.1, 0.5, 0.9, 1],
'en__l1_ratio': [0, 0.0001, 0.001, 0.01, 0.1, 0.3, 0.5, 0.75, 0.9, 1]
}
folder = StratifiedKFold(n_splits=5, shuffle=True)
grid_search = GridSearchCV(
model,
parameters,
cv=folder,
n_jobs=-1,
verbose=1)
grid_search = grid_search.fit(df.loc[ix_train][predictors],
df.loc[ix_train]['is_duplicate'])
return grid_search
if not os.path.isfile('./../data/tmp/1_model.pkl'):
model = check_model(predictors)
print (model.best_score_)
print (model.best_params_)
with open('./../data/tmp/1_model.pkl', 'wb') as f:
pickle.dump(model, f)
else:
with open('./../data/tmp/1_model.pkl', 'rb') as f:
model = pickle.load(f)
Fitting 5 folds for each of 90 candidates, totalling 450 fits
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 21.6s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 2.1min
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 5.2min
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed: 5.5min finished
0.633560068268
{'en__l1_ratio': 0, 'en__alpha': 0.02}
In [ ]:
y_test_pred = model.predict_proba(df.loc[ix_test][predictors])[:, 1]
y_test_pred_fixed = link_function(y_test_pred)
fig = plt.figure()
ax1 = plt.subplot2grid((1, 2), (0, 0))
ax2 = plt.subplot2grid((1, 2), (0, 1))
sns.distplot(y_test_pred, ax=ax1)
ax1.set_title('Model prediction')
sns.distplot(y_test_pred_fixed, ax=ax2)
ax2.set_title('Model prediction fixed')
plt.show()
In [ ]:
if not os.path.isfile('./../data/tmp/1_pred.csv'):
pd.DataFrame.from_records(
zip(df.loc[ix_test]['test_id'].values,
y_test_pred),
columns=['test_id', 'is_duplicate']).to_csv('./../data/tmp/1_pred.csv', index=False)
In [ ]:
if not os.path.isfile('./../data/tmp/1_pred_fixed.csv'):
pd.DataFrame.from_records(
zip(df.loc[ix_test]['test_id'].values,
y_test_pred_fixed),
columns=['test_id', 'is_duplicate']).to_csv('./../data/tmp/1_pred_fixed.csv', index=False)
In [ ]:
display(
eli5.sklearn.explain_linear_classifier_weights(
model.best_estimator_.steps[1][1],
feature_names=predictors))
In [ ]:
%%time
if os.path.isfile('./../data/tmp/cv_char.pkl') and os.path.isfile('./../data/tmp/ch_freq.pkl'):
with open('./../data/tmp/cv_char.pkl', 'rb') as f:
cv_char = pickle.load(f)
with open('./../data/tmp/ch_freq.pkl', 'rb') as f:
ch_freq = pickle.load(f)
else:
cv_char = CountVectorizer(ngram_range=(1, 3), analyzer='char')
ch_freq = np.array(cv_char.fit_transform(df['question1'].tolist() + df['question2'].tolist()).sum(axis=0))[0, :]
with open('./../data/tmp/cv_char.pkl', 'wb') as f:
pickle.dump(cv_char, f)
with open('./../data/tmp/ch_freq.pkl', 'wb') as f:
pickle.dump(ch_freq, f)
In [ ]:
unigrams = dict([(k, v) for (k, v) in cv_char.vocabulary_.items() if len(k) == 1])
ix_unigrams = np.sort(list(unigrams.values()))
print ('Unigrams:', len(unigrams))
bigrams = dict([(k, v) for (k, v) in cv_char.vocabulary_.items() if len(k) == 2])
ix_bigrams = np.sort(list(bigrams.values()))
print ('Bigrams: ', len(bigrams))
trigrams = dict([(k, v) for (k, v) in cv_char.vocabulary_.items() if len(k) == 3])
ix_trigrams = np.sort(list(trigrams.values()))
print ('Trigrams:', len(trigrams))
In [ ]:
%%time
def save_sparse_csr(fname, sm):
np.savez(fname,
data=sm.data,
indices=sm.indices,
indptr=sm.indptr,
shape=sm.shape)
def load_sparse_csr(fname):
loader = np.load(fname)
return sparse.csr_matrix((
loader['data'],
loader['indices'],
loader['indptr']),
shape=loader['shape'])
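# Quick round-trip sanity check of the two helpers above on a tiny matrix
# (a sketch; writes a throwaway file alongside the cached matrices):
_m = sparse.csr_matrix(np.eye(3))
save_sparse_csr('./../data/tmp/_roundtrip_check.npz', _m)
assert (load_sparse_csr('./../data/tmp/_roundtrip_check.npz') != _m).nnz == 0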
if os.path.isfile('./../data/tmp/m_q1.npz') and os.path.isfile('./../data/tmp/m_q2.npz'):
m_q1 = load_sparse_csr('./../data/tmp/m_q1.npz')
m_q2 = load_sparse_csr('./../data/tmp/m_q2.npz')
else:
m_q1 = cv_char.transform(df['question1'].values)
m_q2 = cv_char.transform(df['question2'].values)
save_sparse_csr('./../data/tmp/m_q1.npz', m_q1)
save_sparse_csr('./../data/tmp/m_q2.npz', m_q2)
In [ ]:
v_num = (m_q1[:, ix_unigrams] > 0).minimum((m_q2[:, ix_unigrams] > 0)).sum(axis=1)
v_den = (m_q1[:, ix_unigrams] > 0).maximum((m_q2[:, ix_unigrams] > 0)).sum(axis=1)
v_score = np.array(v_num.flatten()).astype(np.float32)[0, :]/np.array(v_den.flatten())[0, :]
df['unigram_jaccard'] = v_score
plot_real_feature('unigram_jaccard')
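To make the feature concrete, here is the same presence-based character-unigram Jaccard on two hypothetical questions, computed with plain Python sets (a sketch of what the sparse-matrix code does in bulk; note that CountVectorizer lowercases by default, so the bulk version can differ slightly):
In [ ]:
# |chars(q1) & chars(q2)| / |chars(q1) | chars(q2)| on made-up strings
s1, s2 = 'How do I learn Python?', 'How can I learn Python fast?'
print(len(set(s1) & set(s2)) / len(set(s1) | set(s2)))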
In [ ]:
# count every character occurrence, not just its presence
v_num = m_q1[:, ix_unigrams].minimum(m_q2[:, ix_unigrams]).sum(axis=1)
v_den = m_q1[:, ix_unigrams].sum(axis=1) + m_q2[:, ix_unigrams].sum(axis=1)
v_score = np.array(v_num.flatten()).astype(np.float32)[0, :]/np.array(v_den.flatten())[0, :]
df['unigram_all_jaccard'] = v_score
plot_real_feature('unigram_all_jaccard')
In [ ]:
# count every character occurrence, not just its presence
# normalize by the elementwise maximum rather than by the sum
v_num = m_q1[:, ix_unigrams].minimum(m_q2[:, ix_unigrams]).sum(axis=1)
v_den = m_q1[:, ix_unigrams].maximum(m_q2[:, ix_unigrams]).sum(axis=1)
v_score = np.array(v_num.flatten()).astype(np.float32)[0, :]/np.array(v_den.flatten())[0, :]
df['unigram_all_jaccard_max'] = v_score
plot_real_feature('unigram_all_jaccard_max')
In [ ]:
v_num = (m_q1[:, ix_bigrams] > 0).minimum((m_q2[:, ix_bigrams] > 0)).sum(axis=1)
v_den = (m_q1[:, ix_bigrams] > 0).maximum((m_q2[:, ix_bigrams] > 0)).sum(axis=1)
v_score = np.array(v_num.flatten()).astype(np.float32)[0, :]/np.array(v_den.flatten())[0, :]
df['bigram_jaccard'] = v_score
plot_real_feature('bigram_jaccard')
In [ ]:
df.loc[ix_train].groupby(['is_duplicate'])['bigram_jaccard'].agg(
    {
        'min': np.min,
        'max': np.max,
        'p0.01': lambda x: np.percentile(x, q=0.01),
        'p99.99': lambda x: np.percentile(x, q=99.99)
    }
)
In [ ]:
print('Outliers on the right:', (df['bigram_jaccard'] > 1).sum())
print('Outliers on the left: ', (df['bigram_jaccard'] < -1.47).sum())
In [ ]:
df.loc[df['bigram_jaccard'] < -1.478751, 'bigram_jaccard'] = -1.478751
df.loc[df['bigram_jaccard'] > 1.0, 'bigram_jaccard'] = 1.0
plot_real_feature('bigram_jaccard')
In [ ]:
# count every character occurrence, not just its presence
v_num = m_q1[:, ix_bigrams].minimum(m_q2[:, ix_bigrams]).sum(axis=1)
v_den = m_q1[:, ix_bigrams].sum(axis=1) + m_q2[:, ix_bigrams].sum(axis=1)
v_score = np.array(v_num.flatten()).astype(np.float32)[0, :]/np.array(v_den.flatten())[0, :]
df['bigram_all_jaccard'] = v_score
plot_real_feature('bigram_all_jaccard')
In [ ]:
# count every character occurrence, not just its presence
# normalize by the elementwise maximum rather than by the sum
v_num = m_q1[:, ix_bigrams].minimum(m_q2[:, ix_bigrams]).sum(axis=1)
v_den = m_q1[:, ix_bigrams].maximum(m_q2[:, ix_bigrams]).sum(axis=1)
v_score = np.array(v_num.flatten()).astype(np.float32)[0, :]/np.array(v_den.flatten())[0, :]
df['bigram_all_jaccard_max'] = v_score
plot_real_feature('bigram_all_jaccard_max')
In [ ]:
m_q1 = m_q1[:, ix_trigrams]
m_q2 = m_q2[:, ix_trigrams]
In [ ]:
v_num = (m_q1 > 0).minimum((m_q2 > 0)).sum(axis=1)
v_den = (m_q1 > 0).maximum((m_q2 > 0)).sum(axis=1)
v_den[np.where(v_den == 0)] = 1
v_score = np.array(v_num.flatten()).astype(np.float32)[0, :]/np.array(v_den.flatten())[0, :]
df['trigram_jaccard'] = v_score
plot_real_feature('trigram_jaccard')
In [ ]:
# count every character occurrence, not just its presence
v_num = m_q1.minimum(m_q2).sum(axis=1)
v_den = m_q1.sum(axis=1) + m_q2.sum(axis=1)
v_den[np.where(v_den == 0)] = 1
v_score = np.array(v_num.flatten()).astype(np.float32)[0, :]/np.array(v_den.flatten())[0, :]
df['trigram_all_jaccard'] = v_score
plot_real_feature('trigram_all_jaccard')
In [ ]:
# count every character occurrence, not just its presence
# normalize by the elementwise maximum rather than by the sum
v_num = m_q1.minimum(m_q2).sum(axis=1)
v_den = m_q1.maximum(m_q2).sum(axis=1)
v_den[np.where(v_den == 0)] = 1
v_score = np.array(v_num.flatten()).astype(np.float32)[0, :]/np.array(v_den.flatten())[0, :]
df['trigram_all_jaccard_max'] = v_score
plot_real_feature('trigram_all_jaccard_max')
In [ ]:
tft = TfidfTransformer(
norm='l2',
use_idf=True,
smooth_idf=True,
sublinear_tf=False)
tft = tft.fit(sparse.vstack((m_q1, m_q2)))
m_q1_tf = tft.transform(m_q1)
m_q2_tf = tft.transform(m_q2)
v_num = np.array(m_q1_tf.multiply(m_q2_tf).sum(axis=1))[:, 0]
v_den = np.array(np.sqrt(m_q1_tf.multiply(m_q1_tf).sum(axis=1)))[:, 0] * \
np.array(np.sqrt(m_q2_tf.multiply(m_q2_tf).sum(axis=1)))[:, 0]
v_num[np.where(v_den == 0)] = 1
v_den[np.where(v_den == 0)] = 1
v_score = 1 - v_num/v_den
df['trigram_tfidf_cosine'] = v_score
plot_real_feature('trigram_tfidf_cosine')
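As a cross-check (a sketch; assumes m_q1_tf and m_q2_tf are still in memory), sklearn's paired_cosine_distances computes the same row-wise 1 − cosine similarity, so it should agree with the feature everywhere except the zero-denominator rows, which the code above maps to distance 0:
In [ ]:
from sklearn.metrics.pairwise import paired_cosine_distances
v_check = paired_cosine_distances(m_q1_tf, m_q2_tf)
print(np.abs(v_check - df['trigram_tfidf_cosine'].values).max())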
In [ ]:
tft = TfidfTransformer(
norm='l2',
use_idf=True,
smooth_idf=True,
sublinear_tf=False)
tft = tft.fit(sparse.vstack((m_q1, m_q2)))
m_q1_tf = tft.transform(m_q1)
m_q2_tf = tft.transform(m_q2)
v_score = (m_q1_tf - m_q2_tf)
v_score = np.sqrt(np.array(v_score.multiply(v_score).sum(axis=1))[:, 0])
df['trigram_tfidf_l2_euclidean'] = v_score
plot_real_feature('trigram_tfidf_l2_euclidean')
In [ ]:
tft = TfidfTransformer(
norm='l1',
use_idf=True,
smooth_idf=True,
sublinear_tf=False)
tft = tft.fit(sparse.vstack((m_q1, m_q2)))
m_q1_tf = tft.transform(m_q1)
m_q2_tf = tft.transform(m_q2)
v_score = (m_q1_tf - m_q2_tf)
v_score = np.sqrt(np.array(v_score.multiply(v_score).sum(axis=1))[:, 0])
df['trigram_tfidf_l1_euclidean'] = v_score
plot_real_feature('trigram_tfidf_l1_euclidean')
In [ ]:
tft = TfidfTransformer(
norm='l2',
use_idf=False,
smooth_idf=True,
sublinear_tf=False)
tft = tft.fit(sparse.vstack((m_q1, m_q2)))
m_q1_tf = tft.transform(m_q1)
m_q2_tf = tft.transform(m_q2)
v_score = (m_q1_tf - m_q2_tf)
v_score = np.sqrt(np.array(v_score.multiply(v_score).sum(axis=1))[:, 0])
df['trigram_tf_l2_euclidean'] = v_score
plot_real_feature('trigram_tf_l2_euclidean')
In [ ]:
predictors = df.columns[7:].tolist()
print (predictors)
In [ ]:
plot_corr(predictors)
In [ ]:
sns.pairplot(df.loc[ix_train][[
'abs_diff_len1_len2',
'trigram_jaccard',
'trigram_tfidf_cosine',
'trigram_tfidf_l2_euclidean',
'trigram_tfidf_l1_euclidean',
'is_duplicate']], hue="is_duplicate")
In [ ]:
if not os.path.isfile('./../data/tmp/2_model.pkl'):
model = check_model(predictors)
print (model.best_score_)
print (model.best_params_)
with open('./../data/tmp/2_model.pkl', 'wb') as f:
pickle.dump(model, f)
else:
with open('./../data/tmp/2_model.pkl', 'rb') as f:
model = pickle.load(f)
Fitting 5 folds for each of 90 candidates, totalling 450 fits
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 34.0s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 3.2min
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 7.9min
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed: 8.4min finished
0.716752825942
{'en__l1_ratio': 1, 'en__alpha': 1e-05}
In [ ]:
y_test_pred = model.predict_proba(df.loc[ix_test][predictors])[:, 1]
y_test_pred_fixed = link_function(y_test_pred)
if not os.path.isfile('./../data/tmp/2_pred_fixed.csv'):
pd.DataFrame.from_records(
zip(df.loc[ix_test]['test_id'].values,
y_test_pred_fixed),
columns=['test_id', 'is_duplicate']).to_csv('./../data/tmp/2_pred_fixed.csv', index=False)
In [ ]:
fig = plt.figure()
ax1 = plt.subplot2grid((1, 2), (0, 0))
ax2 = plt.subplot2grid((1, 2), (0, 1))
sns.distplot(y_test_pred, ax=ax1)
ax1.set_title('Model prediction')
sns.distplot(y_test_pred_fixed, ax=ax2)
ax2.set_title('Model prediction fixed')
plt.show()
In [ ]:
display(
eli5.sklearn.explain_linear_classifier_weights(
model.best_estimator_.steps[1][1],
feature_names=predictors))
In [ ]:
def check_model(predictors, data=None, do_scaling=True):
classifier = lambda: SGDClassifier(
loss='log',
penalty='elasticnet',
fit_intercept=True,
n_iter=100,
shuffle=True,
n_jobs=-1,
class_weight=None)
steps = []
if do_scaling:
steps.append(('ss', StandardScaler()))
steps.append(('en', classifier()))
model = Pipeline(steps=steps)
parameters = {
'en__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.02, 0.1, 0.5, 0.9, 1],
'en__l1_ratio': [0, 0.0001, 0.001, 0.01, 0.1, 0.3, 0.5, 0.75, 0.9, 1]
}
folder = StratifiedKFold(n_splits=5, shuffle=True)
grid_search = GridSearchCV(
model,
parameters,
cv=folder,
n_jobs=-1,
verbose=1)
if data is None:
grid_search = grid_search.fit(df.loc[ix_train][predictors],
df.loc[ix_train]['is_duplicate'])
else:
grid_search = grid_search.fit(data['X'],
data['y'])
return grid_search
In [ ]:
if not os.path.isfile('./../data/tmp/3_model.pkl'):
model = check_model(None, data={
'X': sparse.csc_matrix(sparse.hstack((m_q1_tf, m_q2_tf)))[ix_train, :],
'y': df.loc[ix_train]['is_duplicate']
}, do_scaling=False)
print (model.best_score_)
print (model.best_params_)
with open('./../data/tmp/3_model.pkl', 'wb') as f:
pickle.dump(model, f)
else:
with open('./../data/tmp/3_model.pkl', 'rb') as f:
model = pickle.load(f)
Fitting 5 folds for each of 90 candidates, totalling 450 fits
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 3.3min
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 18.8min
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 43.8min
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed: 46.3min finished
0.741522174677
{'en__l1_ratio': 0.0001, 'en__alpha': 1e-05}
In [ ]:
y_test_pred = model.predict_proba(
sparse.csc_matrix(sparse.hstack((m_q1_tf, m_q2_tf)))[ix_test, :])[:, 1]
y_test_pred_fixed = link_function(y_test_pred)
if not os.path.isfile('./../data/tmp/3_pred_fixed.csv'):
pd.DataFrame.from_records(
zip(df.loc[ix_test]['test_id'].values,
y_test_pred_fixed),
columns=['test_id', 'is_duplicate']).to_csv('./../data/tmp/3_pred_fixed.csv', index=False)
In [ ]:
fig = plt.figure()
ax1 = plt.subplot2grid((1, 2), (0, 0))
ax2 = plt.subplot2grid((1, 2), (0, 1))
sns.distplot(y_test_pred, ax=ax1)
ax1.set_title('Model prediction')
sns.distplot(y_test_pred_fixed, ax=ax2)
ax2.set_title('Model prediction fixed')
plt.show()
In [ ]:
tmp = set(ix_trigrams.tolist())
tmp = [k for (k, v) in sorted(cv_char.vocabulary_.items(), key=lambda t: t[1]) if v in tmp]
tmp = tmp + tmp
display(
eli5.sklearn.explain_linear_classifier_weights(
model.best_estimator_.steps[0][1],
feature_names=tmp))
In [ ]:
df[:1000][df['question1'][:1000].apply(lambda s: 'o v' in s)]['question1'].tolist()
In [ ]:
df[:1000][df['question1'][:1000].apply(lambda s: 'o v' in s)]['question2'].tolist()
In [ ]:
df[:1000][df['question1'][:1000].apply(lambda s: '.co' in s)]['question1'].tolist()
In [ ]:
df[:1000][df['question1'][:1000].apply(lambda s: '.co' in s)]['question2'].tolist()
In [ ]:
data = {
'X_train': sparse.csc_matrix(sparse.hstack((m_q1_tf, m_q2_tf)))[ix_train, :],
'y_train': df.loc[ix_train]['is_duplicate'],
'X_test': sparse.csc_matrix(sparse.hstack((m_q1_tf, m_q2_tf)))[ix_test, :],
'y_train_pred': np.zeros(ix_train.shape[0]),
'y_test_pred': []
}
In [ ]:
if not os.path.isfile('./../data/tmp/4_model_pred.pkl'):
n_splits = 10
folder = StratifiedKFold(n_splits=n_splits, shuffle=True)
for ix_first, ix_second in tqdm_notebook(folder.split(np.zeros(data['y_train'].shape[0]), data['y_train']),
total=n_splits):
# {'en__l1_ratio': 0.0001, 'en__alpha': 1e-05}
model = SGDClassifier(
loss='log',
penalty='elasticnet',
fit_intercept=True,
n_iter=100,
shuffle=True,
n_jobs=-1,
l1_ratio=0.0001,
alpha=1e-05,
class_weight=None)
model = model.fit(data['X_train'][ix_first, :], data['y_train'][ix_first])
data['y_train_pred'][ix_second] = model.predict_proba(data['X_train'][ix_second, :])[:, 1]
data['y_test_pred'].append(model.predict_proba(data['X_test'])[:, 1])
data['y_test_pred'] = np.array(data['y_test_pred']).T.mean(axis=1)
with open('./../data/tmp/4_model_pred.pkl', 'wb') as f:
pickle.dump({
'model': model,
'y_train_pred': data['y_train_pred'],
'y_test_pred': data['y_test_pred']
}, f)
else:
with open('./../data/tmp/4_model_pred.pkl', 'rb') as f:
tmp = pickle.load(f)
model = tmp['model']
data['y_train_pred'] = tmp['y_train_pred']
data['y_test_pred'] = tmp['y_test_pred']
del(tmp)
In [ ]:
fig = plt.figure()
ax1 = plt.subplot2grid((1, 2), (0, 0))
ax2 = plt.subplot2grid((1, 2), (0, 1))
sns.distplot(data['y_train_pred'], label='train', ax=ax1)
sns.distplot(data['y_test_pred'], label='test', ax=ax1)
ax1.axvline(np.mean(data['y_train_pred']), color='b', alpha=1, linestyle='--', label='mean(train)')
ax1.axvline(np.mean(data['y_test_pred']), color='g', alpha=1, linestyle='--', label='mean(test)')
ax1.legend(loc='upper right', prop={'size': 18})
ax1.set_title('Train/test OOF predictions')
sns.distplot(data['y_train_pred'][:ix_train.shape[0]//2], label='train first part', ax=ax2)
sns.distplot(data['y_train_pred'][ix_train.shape[0]//2:], label='train second part', ax=ax2)
ax2.legend(loc='upper right', prop={'size': 18})
ax2.set_title('Train/train OOF predictions')
plt.show()
We need to fix the test prediction distribution, at least its mean value; let us use the already familiar transformation (with $w_1 = w$, $w_0 = 1 - w$):
$$\large g\left(x\right) = \frac{w_1 x}{w_1 x + w_0 \left(1 - x\right)} = \frac{w x}{w x + (1 - w) \left(1 - x\right)}$$The condition:
$$\large\begin{array}{rcl} \mathbb{E}\left[p\right] = \mathbb{E}\left[g\left(q\right)\right] \end{array}$$This gives the equation:
$$\large\begin{array}{rcl} \frac{1}{n} \sum_{i=1}^n p_i &=& \frac{1}{m} \sum_{i=1}^m g\left(q_i\right) \Rightarrow \\ \mu_p &=& \frac{1}{m} \sum_{i=1}^m \frac{wq_i}{wq_i + \left(1 - w\right)\left(1 - q_i\right)} \end{array}$$This leads to the following optimization problem: $$\large \hat{w} = \arg\min_w \left(\mu_p - \frac{1}{m} \sum_{i=1}^m \frac{wq_i}{wq_i + \left(1 - w\right)\left(1 - q_i\right)}\right)^2 $$
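Since $g$ is monotonically increasing in $w$ for every fixed $q \in (0, 1)$, the mean-matching weight can also be found with a scalar root solve instead of minimizing the squared gap (a sketch, assuming the target mean is attainable for $w \in (0, 1)$):
In [ ]:
# Alternative to the squared-error minimization below: bisection on
# mean(g_w(q)) - mu_p, which is monotone in w.
from scipy.optimize import brentq
def mean_gap(w, q, mu_p):
    return np.mean(w*q/(w*q + (1 - w)*(1 - q))) - mu_p
# e.g.: w_hat = brentq(mean_gap, 1e-6, 1 - 1e-6,
#                      args=(data['y_test_pred'], np.mean(data['y_train_pred'])))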
In [ ]:
mp = np.mean(data['y_train_pred'])
print (mp)
def func(w):
return (mp*data['y_test_pred'].shape[0] -
np.sum(w[0]*data['y_test_pred']/(w[0]*data['y_test_pred'] +
(1 - w[0]) * (1 - data['y_test_pred']))))**2
print (func(np.array([1])))
res = minimize(func, np.array([1]), method='L-BFGS-B', bounds=[(0, 1)])
print (res)
In [ ]:
w = res['x'][0]
def fix_function(x):
return w*x/(w*x + (1 - w)*(1 - x))
support = np.linspace(0, 1, 1000)
values = fix_function(support)
fig, ax = plt.subplots()
ax.plot(support, values)
ax.set_title('Fix transformation', fontsize=20)
ax.set_xlabel('x')
ax.set_ylabel('f(x)')
plt.show()
data['y_test_pred_fixed'] = fix_function(data['y_test_pred'])
In [ ]:
fig = plt.figure()
ax1 = plt.subplot2grid((1, 2), (0, 0))
ax2 = plt.subplot2grid((1, 2), (0, 1))
sns.distplot(data['y_train_pred'], label='train', ax=ax1)
sns.distplot(data['y_test_pred_fixed'], label='test', ax=ax1)
ax1.axvline(np.mean(data['y_train_pred']), color='b', alpha=1, linestyle='--', label='mean(train)')
ax1.axvline(np.mean(data['y_test_pred_fixed']), color='g', alpha=1, linestyle='--', label='mean(test)')
ax1.legend(loc='upper right', prop={'size': 18})
ax1.set_title('Train/test OOF predictions')
sns.distplot(data['y_train_pred'][:ix_train.shape[0]//2], label='train first part', ax=ax2)
sns.distplot(data['y_train_pred'][ix_train.shape[0]//2:], label='train second part', ax=ax2)
ax2.legend(loc='upper right', prop={'size': 18})
ax2.set_title('Train/train OOF predictions')
plt.show()
In [ ]:
df['m_q1_q2_tf_oof'] = np.zeros(df.shape[0])
df.loc[ix_train, 'm_q1_q2_tf_oof'] = data['y_train_pred']
df.loc[ix_test, 'm_q1_q2_tf_oof'] = data['y_test_pred_fixed']
del(data)
In [ ]:
predictors = df.columns[7:].tolist()
print (predictors)
if not os.path.isfile('./../data/tmp/5_model.pkl'):
model = check_model(predictors)
print (model.best_score_)
print (model.best_params_)
with open('./../data/tmp/5_model.pkl', 'wb') as f:
pickle.dump(model, f)
else:
with open('./../data/tmp/5_model.pkl', 'rb') as f:
model = pickle.load(f)
Fitting 5 folds for each of 90 candidates, totalling 450 fits
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 35.5s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 3.5min
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 8.6min
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed: 9.1min finished
0.784444334512
{'en__l1_ratio': 0.5, 'en__alpha': 1e-05}
In [ ]:
y_test_pred = model.predict_proba(df.loc[ix_test][predictors])[:, 1]
y_test_pred_fixed = link_function(y_test_pred)
if not os.path.isfile('./../data/tmp/5_pred_fixed.csv'):
pd.DataFrame.from_records(
zip(df.loc[ix_test]['test_id'].values,
y_test_pred_fixed),
columns=['test_id', 'is_duplicate']).to_csv('./../data/tmp/5_pred_fixed.csv', index=False)
In [ ]:
fig = plt.figure()
ax1 = plt.subplot2grid((1, 2), (0, 0))
ax2 = plt.subplot2grid((1, 2), (0, 1))
sns.distplot(y_test_pred, ax=ax1)
ax1.set_title('Model prediction')
sns.distplot(y_test_pred_fixed, ax=ax2)
ax2.set_title('Model prediction fixed')
plt.show()
In [ ]:
display(
eli5.sklearn.explain_linear_classifier_weights(
model.best_estimator_.steps[1][1],
feature_names=predictors))
In [ ]:
plot_real_feature('m_q1_q2_tf_oof')
In [ ]:
del(unigrams, bigrams, trigrams)
In [ ]:
if not os.path.isfile('./../data/tmp/1_svd.pkl'):
svd = TruncatedSVD(n_components=100)
m_svd = svd.fit_transform(sparse.csc_matrix(sparse.hstack((m_q1_tf, m_q2_tf))))
with open('./../data/tmp/1_svd.pkl', 'wb') as f:
pickle.dump(svd, f)
with open('./../data/tmp/1_m_svd.npz', 'wb') as f:
np.savez(f, m_svd)
else:
with open('./../data/tmp/1_svd.pkl', 'rb') as f:
svd = pickle.load(f)
with open('./../data/tmp/1_m_svd.npz', 'rb') as f:
m_svd = np.load(f)['arr_0']
In [ ]:
plt.plot(np.cumsum(svd.explained_variance_ratio_))
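For reference, one can read off the coverage directly from the cumulative ratios (a minimal sketch):
In [ ]:
cum = np.cumsum(svd.explained_variance_ratio_)
print('variance explained by all 100 components:', cum[-1])
print('components for 90%:',
      np.searchsorted(cum, 0.9) + 1 if cum[-1] >= 0.9 else 'not reached')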
In [ ]:
fig = plt.figure()
ax1 = plt.subplot2grid((2, 2), (0, 0))
ax2 = plt.subplot2grid((2, 2), (0, 1))
ax3 = plt.subplot2grid((2, 2), (1, 0))
ax4 = plt.subplot2grid((2, 2), (1, 1))
ax1.scatter(m_svd[np.where(df['is_duplicate'] == 0), 0],
m_svd[np.where(df['is_duplicate'] == 0), 1],
color='g', alpha=0.1, label='not dub')
ax1.scatter(m_svd[np.where(df['is_duplicate'] == 1), 0],
m_svd[np.where(df['is_duplicate'] == 1), 1],
color='r', alpha=0.1, label='dub')
ax1.set_title('rSVD: 0 vs 1')
ax1.legend(loc='upper right', prop={'size': 18})
ax2.scatter(m_svd[np.where(df['is_duplicate'] == 0), 0],
m_svd[np.where(df['is_duplicate'] == 0), 2],
color='g', alpha=0.1)
ax2.scatter(m_svd[np.where(df['is_duplicate'] == 1), 0],
m_svd[np.where(df['is_duplicate'] == 1), 2],
color='r', alpha=0.1)
ax2.set_title('rSVD: 0 vs 2')
ax3.scatter(m_svd[np.where(df['is_duplicate'] == 0), 1],
m_svd[np.where(df['is_duplicate'] == 0), 2],
color='g', alpha=0.1)
ax3.scatter(m_svd[np.where(df['is_duplicate'] == 1), 1],
m_svd[np.where(df['is_duplicate'] == 1), 2],
color='r', alpha=0.1)
ax3.set_title('rSVD: 1 vs 2')
ax4.scatter(m_svd[np.where(df['is_duplicate'] == 0), 0],
m_svd[np.where(df['is_duplicate'] == 0), 3],
color='g', alpha=0.1)
ax4.scatter(m_svd[np.where(df['is_duplicate'] == 1), 0],
m_svd[np.where(df['is_duplicate'] == 1), 3],
color='r', alpha=0.1)
ax4.set_title('rSVD: 0 vs 3')
plt.show()
In [ ]:
df['m_q1_q2_tf_svd0'] = m_svd[:, 0]
plot_real_feature('m_q1_q2_tf_svd0')
In [ ]:
df['m_q1_q2_tf_svd1'] = m_svd[:, 1]
plot_real_feature('m_q1_q2_tf_svd1')
In [ ]:
df['tmp'] = m_svd[:, 4]
plot_real_feature('tmp')
df = df.drop('tmp', axis=1)
In [ ]:
if not os.path.isfile('./../data/tmp/6_model.pkl'):
model = check_model(None, data={
'X': m_svd[ix_train, :],
'y': df.loc[ix_train]['is_duplicate']
}, do_scaling=False)
print (model.best_score_)
print (model.best_params_)
with open('./../data/tmp/6_model.pkl', 'wb') as f:
pickle.dump(model, f)
else:
with open('./../data/tmp/6_model.pkl', 'rb') as f:
model = pickle.load(f)
Fitting 5 folds for each of 90 candidates, totalling 450 fits
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 1.8min
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 29.6min
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed: 31.9min finished
0.68117193104
{'en__l1_ratio': 0.5, 'en__alpha': 1e-05}
In [ ]:
y_test_pred = model.predict_proba(m_svd[ix_test, :])[:, 1]
y_test_pred_fixed = link_function(y_test_pred)
if not os.path.isfile('./../data/tmp/6_pred_fixed.csv'):
pd.DataFrame.from_records(
zip(df.loc[ix_test]['test_id'].values,
y_test_pred_fixed),
columns=['test_id', 'is_duplicate']).to_csv('./../data/tmp/6_pred_fixed.csv', index=False)
In [ ]:
fig = plt.figure()
ax1 = plt.subplot2grid((1, 2), (0, 0))
ax2 = plt.subplot2grid((1, 2), (0, 1))
sns.distplot(y_test_pred, ax=ax1)
ax1.set_title('Model prediction')
sns.distplot(y_test_pred_fixed, ax=ax2)
ax2.set_title('Model prediction fixed')
plt.show()
In [ ]:
data={
'X_train': m_svd[ix_train, :],
'y_train': df.loc[ix_train]['is_duplicate'],
'X_test': m_svd[ix_test, :],
'y_train_pred': np.zeros(ix_train.shape[0]),
'y_test_pred': []
}
In [ ]:
if not os.path.isfile('./../data/tmp/7_model_pred.pkl'):
n_splits = 10
folder = StratifiedKFold(n_splits=n_splits, shuffle=True)
for ix_first, ix_second in tqdm_notebook(folder.split(np.zeros(data['y_train'].shape[0]), data['y_train']),
total=n_splits):
# {'en__l1_ratio': 0.5, 'en__alpha': 1e-05}
model = SGDClassifier(
loss='log',
penalty='elasticnet',
fit_intercept=True,
n_iter=100,
shuffle=True,
n_jobs=-1,
l1_ratio=0.5,
alpha=1e-05,
class_weight=None)
model = model.fit(data['X_train'][ix_first, :], data['y_train'][ix_first])
data['y_train_pred'][ix_second] = model.predict_proba(data['X_train'][ix_second, :])[:, 1]
data['y_test_pred'].append(model.predict_proba(data['X_test'])[:, 1])
data['y_test_pred'] = np.array(data['y_test_pred']).T.mean(axis=1)
with open('./../data/tmp/7_model_pred.pkl', 'wb') as f:
pickle.dump({
'model': model,
'y_train_pred': data['y_train_pred'],
'y_test_pred': data['y_test_pred']
}, f)
else:
with open('./../data/tmp/7_model_pred.pkl', 'rb') as f:
tmp = pickle.load(f)
model = tmp['model']
data['y_train_pred'] = tmp['y_train_pred']
data['y_test_pred'] = tmp['y_test_pred']
del(tmp)
In [ ]:
fig = plt.figure()
ax1 = plt.subplot2grid((1, 2), (0, 0))
ax2 = plt.subplot2grid((1, 2), (0, 1))
sns.distplot(data['y_train_pred'], label='train', ax=ax1)
sns.distplot(data['y_test_pred'], label='test', ax=ax1)
ax1.axvline(np.mean(data['y_train_pred']), color='b', alpha=1, linestyle='--', label='mean(train)')
ax1.axvline(np.mean(data['y_test_pred']), color='g', alpha=1, linestyle='--', label='mean(test)')
ax1.legend(loc='upper right', prop={'size': 18})
ax1.set_title('Train/test OOF predictions')
sns.distplot(data['y_train_pred'][:ix_train.shape[0]//2], label='train first part', ax=ax2)
sns.distplot(data['y_train_pred'][ix_train.shape[0]//2:], label='train second part', ax=ax2)
ax2.legend(loc='upper right', prop={'size': 18})
ax2.set_title('Train/train OOF predictions')
plt.show()
In [ ]:
mp = np.mean(data['y_train_pred'])
print (mp)
def func(w):
return (mp*data['y_test_pred'].shape[0] -
np.sum(w[0]*data['y_test_pred']/(w[0]*data['y_test_pred'] +
(1 - w[0]) * (1 - data['y_test_pred']))))**2
print (func(np.array([1])))
res = minimize(func, np.array([1]), method='L-BFGS-B', bounds=[(0, 1)])
print (res)
In [ ]:
w = res['x'][0]
def fix_function(x):
return w*x/(w*x + (1 - w)*(1 - x))
support = np.linspace(0, 1, 1000)
values = fix_function(support)
fig, ax = plt.subplots()
ax.plot(support, values)
ax.set_title('Fix transformation', fontsize=20)
ax.set_xlabel('x')
ax.set_ylabel('f(x)')
plt.show()
data['y_test_pred_fixed'] = fix_function(data['y_test_pred'])
In [ ]:
fig = plt.figure()
ax1 = plt.subplot2grid((1, 2), (0, 0))
ax2 = plt.subplot2grid((1, 2), (0, 1))
sns.distplot(data['y_train_pred'], label='train', ax=ax1)
sns.distplot(data['y_test_pred_fixed'], label='test', ax=ax1)
ax1.axvline(np.mean(data['y_train_pred']), color='b', alpha=1, linestyle='--', label='mean(train)')
ax1.axvline(np.mean(data['y_test_pred_fixed']), color='g', alpha=1, linestyle='--', label='mean(test)')
ax1.legend(loc='upper right', prop={'size': 18})
ax1.set_title('Train/test OOF predictions')
sns.distplot(data['y_train_pred'][:ix_train.shape[0]//2], label='train first part', ax=ax2)
sns.distplot(data['y_train_pred'][ix_train.shape[0]//2:], label='train second part', ax=ax2)
ax2.legend(loc='upper right', prop={'size': 18})
ax2.set_title('Train/train OOF predictions')
plt.show()
In [ ]:
df['m_q1_q2_tf_svd100_oof'] = np.zeros(df.shape[0])
df.loc[ix_train, 'm_q1_q2_tf_svd100_oof'] = data['y_train_pred']
df.loc[ix_test, 'm_q1_q2_tf_svd100_oof'] = data['y_test_pred_fixed']
del(data)
In [ ]:
predictors = df.columns[7:].tolist()
print (predictors)
if not os.path.isfile('./../data/tmp/8_model.pkl'):
model = check_model(predictors)
print (model.best_score_)
print (model.best_params_)
with open('./../data/tmp/8_model.pkl', 'wb') as f:
pickle.dump(model, f)
else:
with open('./../data/tmp/8_model.pkl', 'rb') as f:
model = pickle.load(f)
Fitting 5 folds for each of 90 candidates, totalling 450 fits
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 38.8s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 3.8min
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 9.4min
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed: 10.1min finished
0.785762695095
{'en__l1_ratio': 0.001, 'en__alpha': 1e-05}
In [ ]:
y_test_pred = model.predict_proba(df.loc[ix_test][predictors])[:, 1]
y_test_pred_fixed = link_function(y_test_pred)
if not os.path.isfile('./../data/tmp/8_pred_fixed.csv'):
pd.DataFrame.from_records(
zip(df.loc[ix_test]['test_id'].values,
y_test_pred_fixed),
columns=['test_id', 'is_duplicate']).to_csv('./../data/tmp/8_pred_fixed.csv', index=False)
In [ ]:
fig = plt.figure()
ax1 = plt.subplot2grid((1, 2), (0, 0))
ax2 = plt.subplot2grid((1, 2), (0, 1))
sns.distplot(y_test_pred, ax=ax1)
ax1.set_title('Model prediction')
sns.distplot(y_test_pred_fixed, ax=ax2)
ax2.set_title('Model prediction fixed')
plt.show()
In [ ]:
plot_real_feature('m_q1_q2_tf_svd100_oof')
In [ ]:
display(
eli5.sklearn.explain_linear_classifier_weights(
model.best_estimator_.steps[1][1],
feature_names=predictors,
top=30))
In [ ]:
sns.jointplot('m_q1_q2_tf_oof', 'm_q1_q2_tf_svd100_oof', df.loc[ix_train], kind='reg')
In [ ]:
del(m_q1, m_q2, m_svd)
In [ ]:
m_diff_q1_q2 = m_q1_tf - m_q2_tf
In [ ]:
if not os.path.isfile('./../data/tmp/9_model.pkl'):
model = check_model(None, data={
'X': m_diff_q1_q2[ix_train, :],
'y': df.loc[ix_train]['is_duplicate']
}, do_scaling=False)
print (model.best_score_)
print (model.best_params_)
with open('./../data/tmp/9_model.pkl', 'wb') as f:
pickle.dump(model, f)
else:
with open('./../data/tmp/9_model.pkl', 'rb') as f:
model = pickle.load(f)
Fitting 5 folds for each of 90 candidates, totalling 450 fits
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 2.5min
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 13.9min
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 34.0min
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed: 36.1min finished
0.630802146974
{'en__l1_ratio': 0.01, 'en__alpha': 0.001}
In [ ]:
y_test_pred = model.predict_proba(m_diff_q1_q2[ix_test, :])[:, 1]
y_test_pred_fixed = link_function(y_test_pred)
if not os.path.isfile('./../data/tmp/9_pred_fixed.csv'):
pd.DataFrame.from_records(
zip(df.loc[ix_test]['test_id'].values,
y_test_pred_fixed),
columns=['test_id', 'is_duplicate']).to_csv('./../data/tmp/9_pred_fixed.csv', index=False)
In [ ]:
fig = plt.figure()
ax1 = plt.subplot2grid((1, 2), (0, 0))
ax2 = plt.subplot2grid((1, 2), (0, 1))
sns.distplot(y_test_pred, ax=ax1)
ax1.set_title('Model prediction')
sns.distplot(y_test_pred_fixed, ax=ax2)
ax2.set_title('Model prediction fixed')
plt.show()
In [ ]:
data={
'X_train': m_diff_q1_q2[ix_train, :],
'y_train': df.loc[ix_train]['is_duplicate'],
'X_test': m_diff_q1_q2[ix_test, :],
'y_train_pred': np.zeros(ix_train.shape[0]),
'y_test_pred': []
}
In [ ]:
if not os.path.isfile('./../data/tmp/10_model_pred.pkl'):
n_splits = 10
folder = StratifiedKFold(n_splits=n_splits, shuffle=True)
for ix_first, ix_second in tqdm_notebook(folder.split(np.zeros(data['y_train'].shape[0]), data['y_train']),
total=n_splits):
# {'en__l1_ratio': 0.01, 'en__alpha': 0.001}
model = SGDClassifier(
loss='log',
penalty='elasticnet',
fit_intercept=True,
n_iter=100,
shuffle=True,
n_jobs=-1,
l1_ratio=0.01,
alpha=0.001,
class_weight=None)
model = model.fit(data['X_train'][ix_first, :], data['y_train'][ix_first])
data['y_train_pred'][ix_second] = model.predict_proba(data['X_train'][ix_second, :])[:, 1]
data['y_test_pred'].append(model.predict_proba(data['X_test'])[:, 1])
data['y_test_pred'] = np.array(data['y_test_pred']).T.mean(axis=1)
with open('./../data/tmp/10_model_pred.pkl', 'wb') as f:
pickle.dump({
'model': model,
'y_train_pred': data['y_train_pred'],
'y_test_pred': data['y_test_pred']
}, f)
else:
with open('./../data/tmp/10_model_pred.pkl', 'rb') as f:
tmp = pickle.load(f)
model = tmp['model']
data['y_train_pred'] = tmp['y_train_pred']
data['y_test_pred'] = tmp['y_test_pred']
del(tmp)
In [ ]:
fig = plt.figure()
ax1 = plt.subplot2grid((1, 2), (0, 0))
ax2 = plt.subplot2grid((1, 2), (0, 1))
sns.distplot(data['y_train_pred'], label='train', ax=ax1)
sns.distplot(data['y_test_pred'], label='test', ax=ax1)
ax1.axvline(np.mean(data['y_train_pred']), color='b', alpha=1, linestyle='--', label='mean(train)')
ax1.axvline(np.mean(data['y_test_pred']), color='g', alpha=1, linestyle='--', label='mean(test)')
ax1.legend(loc='upper right', prop={'size': 18})
ax1.set_title('Train/test OOF predictions')
sns.distplot(data['y_train_pred'][:ix_train.shape[0]//2], label='train first part', ax=ax2)
sns.distplot(data['y_train_pred'][ix_train.shape[0]//2:], label='train second part', ax=ax2)
ax2.legend(loc='upper right', prop={'size': 18})
ax2.set_title('Train/train OOF predictions')
plt.show()
In [ ]:
df['m_diff_q1_q2_tf_oof'] = np.zeros(df.shape[0])
df.loc[ix_train, 'm_diff_q1_q2_tf_oof'] = data['y_train_pred']
df.loc[ix_test, 'm_diff_q1_q2_tf_oof'] = data['y_test_pred']
del(data)
In [ ]:
plot_real_feature('m_diff_q1_q2_tf_oof')
In [ ]:
del(m_diff_q1_q2)
In [ ]:
predictors = df.columns[7:].tolist()
print (predictors)
if not os.path.isfile('./../data/tmp/11_model.pkl'):
model = check_model(predictors)
print (model.best_score_)
print (model.best_params_)
with open('./../data/tmp/11_model.pkl', 'wb') as f:
pickle.dump(model, f)
else:
with open('./../data/tmp/11_model.pkl', 'rb') as f:
model = pickle.load(f)
Fitting 5 folds for each of 90 candidates, totalling 450 fits
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 40.9s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 3.9min
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 9.8min
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed: 10.4min finished
0.786160924089
{'en__l1_ratio': 0.75, 'en__alpha': 1e-05}
In [ ]:
y_test_pred = model.predict_proba(df.loc[ix_test][predictors])[:, 1]
y_test_pred_fixed = link_function(y_test_pred)
if not os.path.isfile('./../data/tmp/11_pred_fixed.csv'):
pd.DataFrame.from_records(
zip(df.loc[ix_test]['test_id'].values,
y_test_pred_fixed),
columns=['test_id', 'is_duplicate']).to_csv('./../data/tmp/11_pred_fixed.csv', index=False)
In [ ]:
fig = plt.figure()
ax1 = plt.subplot2grid((1, 2), (0, 0))
ax2 = plt.subplot2grid((1, 2), (0, 1))
sns.distplot(y_test_pred, ax=ax1)
ax1.set_title('Model prediction')
sns.distplot(y_test_pred_fixed, ax=ax2)
ax2.set_title('Model prediction fixed')
plt.show()
In [ ]:
display(
eli5.sklearn.explain_linear_classifier_weights(
model.best_estimator_.steps[1][1],
feature_names=predictors, top=30))
In [ ]:
class QuoraKFold():
def __init__(self, n_splits=10):
self.n_splits = n_splits
def get_n_splits(self, X=None, y=None, groups=None):
return self.n_splits
def split(self, X, y, groups=None):
r_split = 1 - 1.0/self.n_splits
k = r1*d[0]/(r0*d[1])  # ratio of test to train odds (computed here but not used below)
ix_ones = np.where(y == 1)[0]
ix_zeros = np.where(y == 0)[0]
for i in range(self.n_splits):
ix_first_zeros = np.random.choice(
ix_zeros,
size=int(r_split * ix_zeros.shape[0]),
replace=False)
ix_first_ones = np.random.choice(
ix_ones,
size=int(ix_first_zeros.shape[0] * d[1]/d[0]),
replace=False)
ix_second_zeros = np.setdiff1d(ix_zeros, ix_first_zeros)
ix_second_ones = np.random.choice(
np.setdiff1d(ix_ones, ix_first_ones),
size=int(ix_second_zeros.shape[0] * r1/r0),
replace=False)
ix_first = np.hstack((ix_first_zeros, ix_first_ones))
ix_first = ix_first[np.random.choice(
range(ix_first.shape[0]), size=ix_first.shape[0], replace=False)]
ix_second = np.hstack((ix_second_zeros, ix_second_ones))
ix_second = ix_second[np.random.choice(
range(ix_second.shape[0]), size=ix_second.shape[0], replace=False)]
yield ix_first, ix_second
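A quick demonstration of what this splitter does on synthetic labels (a sketch; it relies on the d, r0, r1 computed earlier): the first part keeps the train positive rate $d_1$, while the second part is rebalanced toward the test rate, $r_1/(r_0 + r_1) \approx r_1$:
In [ ]:
y_demo = np.random.binomial(1, d[1], size=100000)
ix_first, ix_second = next(QuoraKFold(n_splits=10).split(np.zeros_like(y_demo), y_demo))
print('first-part positive rate: ', y_demo[ix_first].mean())   # ~ d[1]
print('second-part positive rate:', y_demo[ix_second].mean())  # ~ r1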
In [ ]:
def log_loss_lf(y_true, y_pred):
return log_loss(y_true, link_function(y_pred[:, 1]), eps=eps)
In [ ]:
def check_model(predictors, data=None, do_scaling=True, folder=None):
classifier = lambda: SGDClassifier(
loss='log',
penalty='elasticnet',
fit_intercept=True,
n_iter=100,
shuffle=True,
n_jobs=-1,
class_weight=None)
steps = []
if do_scaling:
steps.append(('ss', StandardScaler()))
steps.append(('en', classifier()))
model = Pipeline(steps=steps)
parameters = {
'en__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.02, 0.1, 0.5, 0.9, 1],
'en__l1_ratio': [0, 0.0001, 0.001, 0.01, 0.1, 0.3, 0.5, 0.75, 0.9, 1]
}
if folder is None:
folder = QuoraKFold(n_splits=10)
grid_search = GridSearchCV(
model,
parameters,
cv=folder,
n_jobs=-1,
scoring=make_scorer(log_loss_lf, greater_is_better=False, needs_proba=True),
verbose=1)
if data is None:
grid_search = grid_search.fit(df.loc[ix_train][predictors],
df.loc[ix_train]['is_duplicate'])
else:
grid_search = grid_search.fit(data['X'],
data['y'])
return grid_search
In [ ]:
if not os.path.isfile('./../data/tmp/12_model.pkl'):
model = check_model(predictors, folder=StratifiedKFold(n_splits=5, shuffle=True))
print (-model.best_score_)
print (model.best_params_)
with open('./../data/tmp/12_model.pkl', 'wb') as f:
pickle.dump(model, f)
else:
with open('./../data/tmp/12_model.pkl', 'rb') as f:
model = pickle.load(f)
Fitting 5 folds for each of 90 candidates, totalling 450 fits
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 40.5s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 4.0min
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 9.8min
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed: 10.5min finished
0.50136321542
{'en__l1_ratio': 0.01, 'en__alpha': 1e-05}
In [ ]:
if not os.path.isfile('./../data/tmp/13_model.pkl'):
model = check_model(predictors)
print (-model.best_score_)
print (model.best_params_)
with open('./../data/tmp/13_model.pkl', 'wb') as f:
pickle.dump(model, f)
else:
with open('./../data/tmp/13_model.pkl', 'rb') as f:
model = pickle.load(f)
Fitting 10 folds for each of 90 candidates, totalling 900 fits
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 46.7s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 4.5min
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 19.9min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 23.4min finished
0.323176002744
{'en__l1_ratio': 1, 'en__alpha': 0.0001}
In [ ]:
y_test_pred = model.predict_proba(df.loc[ix_test][predictors])[:, 1]
y_test_pred_fixed = link_function(y_test_pred)
if not os.path.isfile('./../data/tmp/13_pred_fixed.csv'):
pd.DataFrame.from_records(
zip(df.loc[ix_test]['test_id'].values,
y_test_pred_fixed),
columns=['test_id', 'is_duplicate']).to_csv('./../data/tmp/13_pred_fixed.csv', index=False)
In [ ]:
fig = plt.figure()
ax1 = plt.subplot2grid((1, 2), (0, 0))
ax2 = plt.subplot2grid((1, 2), (0, 1))
sns.distplot(y_test_pred, ax=ax1)
ax1.set_title('Model prediction')
sns.distplot(y_test_pred_fixed, ax=ax2)
ax2.set_title('Model prediction fixed')
plt.show()
In [ ]:
if not os.path.isfile('./../data/tmp/2_svd.pkl'):
svd = TruncatedSVD(n_components=100)
m_svd = svd.fit_transform(sparse.csc_matrix(sparse.vstack((m_q1_tf, m_q2_tf))))
with open('./../data/tmp/2_svd.pkl', 'wb') as f:
pickle.dump(svd, f)
with open('./../data/tmp/2_m_svd.npz', 'wb') as f:
np.savez(f, m_svd)
else:
with open('./../data/tmp/2_svd.pkl', 'rb') as f:
svd = pickle.load(f)
with open('./../data/tmp/2_m_svd.npz', 'rb') as f:
m_svd = np.load(f)['arr_0']
In [ ]:
del(m_q1_tf, m_q2_tf)
In [ ]:
m_svd_q1 = m_svd[:m_svd.shape[0]//2, :]
m_svd_q2 = m_svd[m_svd.shape[0]//2:, :]
del(m_svd)
In [ ]:
df['m_vstack_svd_q1_q1_euclidean'] = ((m_svd_q1 - m_svd_q2)**2).mean(axis=1)  # mean squared difference between the q1 and q2 SVD embeddings
plot_real_feature('m_vstack_svd_q1_q1_euclidean')
In [ ]:
num = (m_svd_q1*m_svd_q2).sum(axis=1)
den = np.sqrt((m_svd_q1**2).sum(axis=1))*np.sqrt((m_svd_q2**2).sum(axis=1))
num[np.where(den == 0)] = 0
den[np.where(den == 0)] = 1
df['m_vstack_svd_q1_q1_cosine'] = 1 - num/den
plot_real_feature('m_vstack_svd_q1_q1_cosine')
In [ ]:
m_svd = m_svd_q1*m_svd_q2
if not os.path.isfile('./../data/tmp/14_model.pkl'):
model = check_model(None, data={
'X': m_svd[ix_train, :],
'y': df.loc[ix_train]['is_duplicate']
}, do_scaling=False)
print (model.best_score_)
print (model.best_params_)
with open('./../data/tmp/14_model.pkl', 'wb') as f:
pickle.dump(model, f)
else:
with open('./../data/tmp/14_model.pkl', 'rb') as f:
model = pickle.load(f)
Fitting 10 folds for each of 90 candidates, totalling 900 fits
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 2.1min
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 31.6min
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 60.1min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 70.9min finished
-0.409563471096
{'en__l1_ratio': 1, 'en__alpha': 1e-05}
In [ ]:
data={
'X_train': m_svd[ix_train, :],
'y_train': df.loc[ix_train]['is_duplicate'],
'X_test': m_svd[ix_test, :],
'y_train_pred': np.zeros(ix_train.shape[0]),
'y_test_pred': []
}
del(m_svd)
In [ ]:
if not os.path.isfile('./../data/tmp/15_model_pred.pkl'):
n_splits = 10
folder = StratifiedKFold(n_splits=n_splits, shuffle=True)
for ix_first, ix_second in tqdm_notebook(folder.split(np.zeros(data['y_train'].shape[0]), data['y_train']),
total=n_splits):
# {'en__l1_ratio': 1, 'en__alpha': 1e-05}
model = SGDClassifier(
loss='log',
penalty='elasticnet',
fit_intercept=True,
n_iter=100,
shuffle=True,
n_jobs=-1,
l1_ratio=1.0,
alpha=1e-05,
class_weight=None)
model = model.fit(data['X_train'][ix_first, :], data['y_train'][ix_first])
data['y_train_pred'][ix_second] = model.predict_proba(data['X_train'][ix_second, :])[:, 1]
data['y_test_pred'].append(model.predict_proba(data['X_test'])[:, 1])
data['y_test_pred'] = np.array(data['y_test_pred']).T.mean(axis=1)
with open('./../data/tmp/15_model_pred.pkl', 'wb') as f:
pickle.dump({
'model': model,
'y_train_pred': data['y_train_pred'],
'y_test_pred': data['y_test_pred']
}, f)
else:
with open('./../data/tmp/15_model_pred.pkl', 'rb') as f:
tmp = pickle.load(f)
model = tmp['model']
data['y_train_pred'] = tmp['y_train_pred']
data['y_test_pred'] = tmp['y_test_pred']
del(tmp)
In [ ]:
fig = plt.figure()
ax1 = plt.subplot2grid((1, 2), (0, 0))
ax2 = plt.subplot2grid((1, 2), (0, 1))
sns.distplot(data['y_train_pred'], label='train', ax=ax1)
sns.distplot(data['y_test_pred'], label='test', ax=ax1)
ax1.axvline(np.mean(data['y_train_pred']), color='b', alpha=1, linestyle='--', label='mean(train)')
ax1.axvline(np.mean(data['y_test_pred']), color='g', alpha=1, linestyle='--', label='mean(test)')
ax1.legend(loc='upper right', prop={'size': 18})
ax1.set_title('Train/test OOF predictions')
sns.distplot(data['y_train_pred'][:ix_train.shape[0]//2], label='train first part', ax=ax2)
sns.distplot(data['y_train_pred'][ix_train.shape[0]//2:], label='train second part', ax=ax2)
ax2.legend(loc='upper right', prop={'size': 18})
ax2.set_title('Train/train OOF predictions')
plt.show()
In [ ]:
mp = np.mean(data['y_train_pred'])
print (mp)
def func(w):
return (mp*data['y_test_pred'].shape[0] -
np.sum(w[0]*data['y_test_pred']/(w[0]*data['y_test_pred'] +
(1 - w[0]) * (1 - data['y_test_pred']))))**2
print (func(np.array([1])))
res = minimize(func, np.array([1]), method='L-BFGS-B', bounds=[(0, 1)])
print(res)
In [ ]:
w = res['x'][0]
def fix_function(x):
return w*x/(w*x + (1 - w)*(1 - x))
support = np.linspace(0, 1, 1000)
values = fix_function(support)
fig, ax = plt.subplots()
ax.plot(support, values)
ax.set_title('Fix transformation', fontsize=20)
ax.set_xlabel('x')
ax.set_ylabel('f(x)')
plt.show()
data['y_test_pred_fixed'] = fix_function(data['y_test_pred'])
In [ ]:
fig = plt.figure()
ax1 = plt.subplot2grid((1, 2), (0, 0))
ax2 = plt.subplot2grid((1, 2), (0, 1))
sns.distplot(data['y_train_pred'], label='train', ax=ax1)
sns.distplot(data['y_test_pred_fixed'], label='test', ax=ax1)
ax1.axvline(np.mean(data['y_train_pred']), color='b', alpha=1, linestyle='--', label='mean(train)')
ax1.axvline(np.mean(data['y_test_pred_fixed']), color='g', alpha=1, linestyle='--', label='mean(test)')
ax1.legend(loc='upper right', prop={'size': 18})
ax1.set_title('Train/test OOF predictions')
sns.distplot(data['y_train_pred'][:ix_train.shape[0]//2], label='train first part', ax=ax2)
sns.distplot(data['y_train_pred'][ix_train.shape[0]//2:], label='train second part', ax=ax2)
ax2.legend(loc='upper right', prop={'size': 18})
ax2.set_title('Train/train OOF predictions')
plt.show()
In [ ]:
df['m_vstack_svd_mult_q1_q2_oof'] = np.zeros(df.shape[0])
df.loc[ix_train, 'm_vstack_svd_mult_q1_q2_oof'] = data['y_train_pred']
df.loc[ix_test, 'm_vstack_svd_mult_q1_q2_oof'] = data['y_test_pred_fixed']
del(data)
plot_real_feature('m_vstack_svd_mult_q1_q2_oof')
In [ ]:
m_svd = np.abs(m_svd_q1 - m_svd_q2)
if not os.path.isfile('./../data/tmp/16_model.pkl'):
model = check_model(None, data={
'X': m_svd[ix_train, :],
'y': df.loc[ix_train]['is_duplicate']
}, do_scaling=False)
print (model.best_score_)
print (model.best_params_)
with open('./../data/tmp/16_model.pkl', 'wb') as f:
pickle.dump(model, f)
else:
with open('./../data/tmp/16_model.pkl', 'rb') as f:
model = pickle.load(f)
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 2.1min
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 30.0min
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 54.6min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 63.4min finished
-0.439305223632
{'en__l1_ratio': 0.01, 'en__alpha': 1e-05}
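For intuition: the element-wise absolute difference of the two SVD embeddings is a symmetric pair representation, so swapping question1 and question2 leaves the feature unchanged. A toy illustration with hypothetical vectors (not competition data):
In [ ]:
import numpy as np

# hypothetical 3-dimensional SVD embeddings of a question pair
u = np.array([0.2, -0.5, 1.0])   # question 1
v = np.array([0.1,  0.4, 1.0])   # question 2

print(np.abs(u - v))             # [0.1 0.9 0. ] -- identical if u and v swap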
In [ ]:
data = {
'X_train': m_svd[ix_train, :],
'y_train': df.loc[ix_train]['is_duplicate'],
'X_test': m_svd[ix_test, :],
'y_train_pred': np.zeros(ix_train.shape[0]),
'y_test_pred': []
}
del(m_svd, m_svd_q1, m_svd_q2)
In [ ]:
if not os.path.isfile('./../data/tmp/17_model_pred.pkl'):
n_splits = 10
folder = StratifiedKFold(n_splits=n_splits, shuffle=True)
for ix_first, ix_second in tqdm_notebook(folder.split(np.zeros(data['y_train'].shape[0]), data['y_train']),
total=n_splits):
# {'en__l1_ratio': 0.01, 'en__alpha': 1e-05}
model = SGDClassifier(
loss='log',
penalty='elasticnet',
fit_intercept=True,
n_iter=100,
shuffle=True,
n_jobs=-1,
l1_ratio=0.01,
alpha=1e-05,
class_weight=None)
model = model.fit(data['X_train'][ix_first, :], data['y_train'][ix_first])
data['y_train_pred'][ix_second] = model.predict_proba(data['X_train'][ix_second, :])[:, 1]
data['y_test_pred'].append(model.predict_proba(data['X_test'])[:, 1])
data['y_test_pred'] = np.array(data['y_test_pred']).T.mean(axis=1)
with open('./../data/tmp/17_model_pred.pkl', 'wb') as f:
pickle.dump({
'model': model,
'y_train_pred': data['y_train_pred'],
'y_test_pred': data['y_test_pred']
}, f)
else:
with open('./../data/tmp/17_model_pred.pkl', 'rb') as f:
tmp = pickle.load(f)
model = tmp['model']
data['y_train_pred'] = tmp['y_train_pred']
data['y_test_pred'] = tmp['y_test_pred']
del(tmp)
In [ ]:
fig = plt.figure()
ax1 = plt.subplot2grid((1, 2), (0, 0))
ax2 = plt.subplot2grid((1, 2), (0, 1))
sns.distplot(data['y_train_pred'], label='train', ax=ax1)
sns.distplot(data['y_test_pred'], label='test', ax=ax1)
ax1.axvline(np.mean(data['y_train_pred']), color='b', alpha=1, linestyle='--', label='mean(train)')
ax1.axvline(np.mean(data['y_test_pred']), color='g', alpha=1, linestyle='--', label='mean(test)')
ax1.legend(loc='upper right', prop={'size': 18})
ax1.set_title('Train/test OOF predictions')
sns.distplot(data['y_train_pred'][:ix_train.shape[0]//2], label='train first part', ax=ax2)
sns.distplot(data['y_train_pred'][ix_train.shape[0]//2:], label='train second part', ax=ax2)
ax2.legend(loc='upper right', prop={'size': 18})
ax2.set_title('Train/train OOF predictions')
plt.show()
In [ ]:
mp = np.mean(data['y_train_pred'])
print(mp)
def func(w):
    return (mp*data['y_test_pred'].shape[0] -
            np.sum(w[0]*data['y_test_pred']/(w[0]*data['y_test_pred'] +
                   (1 - w[0]) * (1 - data['y_test_pred']))))**2
print(func(np.array([1])))
res = minimize(func, np.array([1]), method='L-BFGS-B', bounds=[(0, 1)])
print(res)
In [ ]:
w = res['x'][0]
def fix_function(x):
return w*x/(w*x + (1 - w)*(1 - x))
support = np.linspace(0, 1, 1000)
values = fix_function(support)
fig, ax = plt.subplots()
ax.plot(support, values)
ax.set_title('Fix transformation', fontsize=20)
ax.set_xlabel('x')
ax.set_ylabel('f(x)')
plt.show()
data['y_test_pred_fixed'] = fix_function(data['y_test_pred'])
In [ ]:
fig = plt.figure()
ax1 = plt.subplot2grid((1, 2), (0, 0))
ax2 = plt.subplot2grid((1, 2), (0, 1))
sns.distplot(data['y_train_pred'], label='train', ax=ax1)
sns.distplot(data['y_test_pred_fixed'], label='test', ax=ax1)
ax1.axvline(np.mean(data['y_train_pred']), color='b', alpha=1, linestyle='--', label='mean(train)')
ax1.axvline(np.mean(data['y_test_pred_fixed']), color='g', alpha=1, linestyle='--', label='mean(test)')
ax1.legend(loc='upper right', prop={'size': 18})
ax1.set_title('Train/test OOF predictions')
sns.distplot(data['y_train_pred'][:ix_train.shape[0]//2], label='train first part', ax=ax2)
sns.distplot(data['y_train_pred'][ix_train.shape[0]//2:], label='train second part', ax=ax2)
ax2.legend(loc='upper right', prop={'size': 18})
ax2.set_title('Train/train OOF predictions')
plt.show()
In [ ]:
df['m_vstack_svd_absdiff_q1_q2_oof'] = np.zeros(df.shape[0])
df.loc[ix_train, 'm_vstack_svd_absdiff_q1_q2_oof'] = data['y_train_pred']
df.loc[ix_test, 'm_vstack_svd_absdiff_q1_q2_oof'] = data['y_test_pred_fixed']
del(data)
plot_real_feature('m_vstack_svd_absdiff_q1_q2_oof')
In [ ]:
nlp = spacy.load('en')
df.head()['question1'].apply(lambda s: ' '.join([c.lemma_ for c in nlp(s) if c.lemma_ != '?']))
In [ ]:
SYMBOLS = set(' '.join(string.punctuation).split(' ') + ['...', '“', '”', '\'ve'])
if not os.path.isfile('./../data/tmp/bow_lemma.pkl'):
q1 = []
for doc in nlp.pipe(df['question1'].values.tolist(), n_threads=16, batch_size=10000):
q1.append([c.lemma_ for c in doc if c.lemma_ not in SYMBOLS])
q2 = []
for doc in nlp.pipe(df['question2'].values.tolist(), n_threads=16, batch_size=10000):
q2.append([c.lemma_ for c in doc if c.lemma_ not in SYMBOLS])
with open('./../data/tmp/bow_lemma.pkl', 'wb') as f:
pickle.dump({
'q1': q1,
'q2': q2
}, f)
else:
with open('./../data/tmp/bow_lemma.pkl', 'rb') as f:
tmp = pickle.load(f)
q1 = tmp['q1']
q2 = tmp['q2']
del(tmp)
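A quick sanity check of the lemmatize-and-filter step on a made-up sentence; the exact lemmas depend on the spaCy model version, so the output shown is only indicative:
In [ ]:
# relies on the nlp and SYMBOLS objects defined above
doc = nlp('What are the best ways to learn Python?')
print([t.lemma_ for t in doc if t.lemma_ not in SYMBOLS])
# e.g. ['what', 'be', 'the', 'good', 'way', 'to', 'learn', 'python']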
In [ ]:
%%time
if os.path.isfile('./../data/tmp/cv_word_lemma.pkl') and os.path.isfile('./../data/tmp/wl_freq.pkl'):
with open('./../data/tmp/cv_word_lemma.pkl', 'rb') as f:
cv_words = pickle.load(f)
with open('./../data/tmp/wl_freq.pkl', 'rb') as f:
w_freq = pickle.load(f)
else:
cv_words = CountVectorizer(ngram_range=(1, 1), analyzer='word')
w_freq = np.array(cv_words.fit_transform(
[' '.join(s) for s in q1] + [' '.join(s) for s in q2]).sum(axis=0))[0, :]
with open('./../data/tmp/cv_word_lemma.pkl', 'wb') as f:
pickle.dump(cv_words, f)
with open('./../data/tmp/wl_freq.pkl', 'wb') as f:
pickle.dump(w_freq, f)
In [ ]:
if os.path.isfile('./../data/tmp/m_q1_wl.npz') and os.path.isfile('./../data/tmp/m_q2_wl.npz'):
m_q1 = load_sparse_csr('./../data/tmp/m_q1_wl.npz')
m_q2 = load_sparse_csr('./../data/tmp/m_q2_wl.npz')
else:
m_q1 = cv_words.transform([' '.join(s) for s in q1])
m_q2 = cv_words.transform([' '.join(s) for s in q2])
save_sparse_csr('./../data/tmp/m_q1_wl.npz', m_q1)
save_sparse_csr('./../data/tmp/m_q2_wl.npz', m_q2)
In [ ]:
tft = TfidfTransformer(
norm='l2',
use_idf=True,
smooth_idf=True,
sublinear_tf=False)
tft = tft.fit(sparse.vstack((m_q1, m_q2)))
m_q1_tf = tft.transform(m_q1)
m_q2_tf = tft.transform(m_q2)
v_num = np.array(m_q1_tf.multiply(m_q2_tf).sum(axis=1))[:, 0]
v_den = np.array(np.sqrt(m_q1_tf.multiply(m_q1_tf).sum(axis=1)))[:, 0] * \
np.array(np.sqrt(m_q2_tf.multiply(m_q2_tf).sum(axis=1)))[:, 0]
# zero-norm (empty) questions: set numerator and denominator to 1,
# i.e. assign them cosine distance 0
v_num[np.where(v_den == 0)] = 1
v_den[np.where(v_den == 0)] = 1
v_score = 1 - v_num/v_den
df['1wl_tfidf_cosine'] = v_score
plot_real_feature('1wl_tfidf_cosine')
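Written out, this feature is the cosine distance between the tf-idf vectors: $$\large d_{\cos}\left(q_1, q_2\right) = 1 - \frac{\langle q_1, q_2\rangle}{\lVert q_1\rVert \, \lVert q_2\rVert}$$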
In [ ]:
tft = TfidfTransformer(
norm='l2',
use_idf=True,
smooth_idf=True,
sublinear_tf=False)
tft = tft.fit(sparse.vstack((m_q1, m_q2)))
m_q1_tf = tft.transform(m_q1)
m_q2_tf = tft.transform(m_q2)
v_score = (m_q1_tf - m_q2_tf)
v_score = np.sqrt(np.array(v_score.multiply(v_score).sum(axis=1))[:, 0])
df['1wl_tfidf_l2_euclidean'] = v_score
plot_real_feature('1wl_tfidf_l2_euclidean')
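Since the tf-idf vectors are L2-normalized ($\lVert q\rVert = 1$ for non-empty questions), this Euclidean distance is a monotone transformation of the cosine distance above: $$\large \lVert q_1 - q_2\rVert^2 = \lVert q_1\rVert^2 + \lVert q_2\rVert^2 - 2\langle q_1, q_2\rangle = 2\left(1 - \cos\left(q_1, q_2\right)\right)$$ Tree-based models are insensitive to such transformations, but a linear model sees the two features on different scales.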
In [ ]:
tft = TfidfTransformer(
norm='l2',
use_idf=False,
smooth_idf=True,
sublinear_tf=False)
tft = tft.fit(sparse.vstack((m_q1, m_q2)))
m_q1_tf = tft.transform(m_q1)
m_q2_tf = tft.transform(m_q2)
v_score = (m_q1_tf - m_q2_tf)
v_score = np.sqrt(np.array(v_score.multiply(v_score).sum(axis=1))[:, 0])
df['1wl_tf_l2_euclidean'] = v_score
plot_real_feature('1wl_tf_l2_euclidean')
In [ ]:
tft = TfidfTransformer(
norm='l2',
use_idf=True,
smooth_idf=True,
sublinear_tf=False)
tft = tft.fit(sparse.vstack((m_q1, m_q2)))
m_q1_tf = tft.transform(m_q1)
m_q2_tf = tft.transform(m_q2)
In [ ]:
if not os.path.isfile('./../data/tmp/20_model.pkl'):
model = check_model(None, data={
'X': sparse.csc_matrix(sparse.hstack((m_q1_tf, m_q2_tf)))[ix_train, :],
'y': df.loc[ix_train]['is_duplicate']
}, do_scaling=False)
    print(model.best_score_)
    print(model.best_params_)
with open('./../data/tmp/20_model.pkl', 'wb') as f:
pickle.dump(model, f)
else:
with open('./../data/tmp/20_model.pkl', 'rb') as f:
model = pickle.load(f)
Fitting 10 folds for each of 90 candidates, totalling 900 fits
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 1.4min
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 8.2min
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 19.2min
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 34.4min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 39.9min finished
-0.365063639733
{'en__l1_ratio': 0.0001, 'en__alpha': 1e-05}
In [ ]:
y_test_pred = model.predict_proba(sparse.csc_matrix(sparse.hstack((m_q1_tf, m_q2_tf)))[ix_test, :])[:, 1]
y_test_pred_fixed = link_function(y_test_pred)
if not os.path.isfile('./../data/tmp/20_pred_fixed.csv'):
pd.DataFrame.from_records(
zip(df.loc[ix_test]['test_id'].values,
y_test_pred_fixed),
columns=['test_id', 'is_duplicate']).to_csv('./../data/tmp/20_pred_fixed.csv', index=False)
In [ ]:
data = {
'X_train': sparse.csc_matrix(sparse.hstack((m_q1_tf, m_q2_tf)))[ix_train, :],
'y_train': df.loc[ix_train]['is_duplicate'],
'X_test': sparse.csc_matrix(sparse.hstack((m_q1_tf, m_q2_tf)))[ix_test, :],
'y_train_pred': np.zeros(ix_train.shape[0]),
'y_test_pred': []
}
del(m_q1_tf, m_q2_tf)
In [ ]:
if not os.path.isfile('./../data/tmp/21_model_pred.pkl'):
n_splits = 10
folder = StratifiedKFold(n_splits=n_splits, shuffle=True)
for ix_first, ix_second in tqdm_notebook(folder.split(np.zeros(data['y_train'].shape[0]), data['y_train']),
total=n_splits):
# {'en__l1_ratio': 0.0001, 'en__alpha': 1e-05}
model = SGDClassifier(
loss='log',
penalty='elasticnet',
fit_intercept=True,
n_iter=100,
shuffle=True,
n_jobs=-1,
l1_ratio=0.0001,
alpha=1e-05,
class_weight=None)
model = model.fit(data['X_train'][ix_first, :], data['y_train'][ix_first])
data['y_train_pred'][ix_second] = model.predict_proba(data['X_train'][ix_second, :])[:, 1]
data['y_test_pred'].append(model.predict_proba(data['X_test'])[:, 1])
data['y_test_pred'] = np.array(data['y_test_pred']).T.mean(axis=1)
with open('./../data/tmp/21_model_pred.pkl', 'wb') as f:
pickle.dump({
'model': model,
'y_train_pred': data['y_train_pred'],
'y_test_pred': data['y_test_pred']
}, f)
else:
with open('./../data/tmp/21_model_pred.pkl', 'rb') as f:
tmp = pickle.load(f)
model = tmp['model']
data['y_train_pred'] = tmp['y_train_pred']
data['y_test_pred'] = tmp['y_test_pred']
del(tmp)
In [ ]:
fig = plt.figure()
ax1 = plt.subplot2grid((1, 2), (0, 0))
ax2 = plt.subplot2grid((1, 2), (0, 1))
sns.distplot(data['y_train_pred'], label='train', ax=ax1)
sns.distplot(data['y_test_pred'], label='test', ax=ax1)
ax1.axvline(np.mean(data['y_train_pred']), color='b', alpha=1, linestyle='--', label='mean(train)')
ax1.axvline(np.mean(data['y_test_pred']), color='g', alpha=1, linestyle='--', label='mean(test)')
ax1.legend(loc='upper right', prop={'size': 18})
ax1.set_title('Train/test OOF predictions')
sns.distplot(data['y_train_pred'][:ix_train.shape[0]//2], label='train first part', ax=ax2)
sns.distplot(data['y_train_pred'][ix_train.shape[0]//2:], label='train second part', ax=ax2)
ax2.legend(loc='upper right', prop={'size': 18})
ax2.set_title('Train/train OOF predictions')
plt.show()
In [ ]:
mp = np.mean(data['y_train_pred'])
print(mp)
def func(w):
    return (mp*data['y_test_pred'].shape[0] -
            np.sum(w[0]*data['y_test_pred']/(w[0]*data['y_test_pred'] +
                   (1 - w[0]) * (1 - data['y_test_pred']))))**2
print(func(np.array([1])))
res = minimize(func, np.array([1]), method='L-BFGS-B', bounds=[(0, 1)])
print(res)
In [ ]:
w = res['x'][0]
def fix_function(x):
return w*x/(w*x + (1 - w)*(1 - x))
support = np.linspace(0, 1, 1000)
values = fix_function(support)
fig, ax = plt.subplots()
ax.plot(support, values)
ax.set_title('Fix transformation', fontsize=20)
ax.set_xlabel('x')
ax.set_ylabel('f(x)')
plt.show()
data['y_test_pred_fixed'] = fix_function(data['y_test_pred'])
In [ ]:
fig = plt.figure()
ax1 = plt.subplot2grid((1, 2), (0, 0))
ax2 = plt.subplot2grid((1, 2), (0, 1))
sns.distplot(data['y_train_pred'], label='train', ax=ax1)
sns.distplot(data['y_test_pred_fixed'], label='test', ax=ax1)
ax1.axvline(np.mean(data['y_train_pred']), color='b', alpha=1, linestyle='--', label='mean(train)')
ax1.axvline(np.mean(data['y_test_pred_fixed']), color='g', alpha=1, linestyle='--', label='mean(test)')
ax1.legend(loc='upper right', prop={'size': 18})
ax1.set_title('Train/test OOF predictions')
sns.distplot(data['y_train_pred'][:ix_train.shape[0]//2], label='train first part', ax=ax2)
sns.distplot(data['y_train_pred'][ix_train.shape[0]//2:], label='train second part', ax=ax2)
ax2.legend(loc='upper right', prop={'size': 18})
ax2.set_title('Train/train OOF predictions')
plt.show()
In [ ]:
df['m_w1l_tfidf_oof'] = np.zeros(df.shape[0])
df.loc[ix_train, 'm_w1l_tfidf_oof'] = data['y_train_pred']
df.loc[ix_test, 'm_w1l_tfidf_oof'] = data['y_test_pred_fixed']
del(data)
plot_real_feature('m_w1l_tfidf_oof')
In [ ]:
predictors = df.columns[7:]
print(predictors)
In [ ]:
class QuoraKFold:
    """CV splitter whose validation part is resampled to a test-like
    class balance (duplicate share assumed below)."""
    def __init__(self, n_splits=10, r_split=0.7):
        self.n_splits = n_splits
        self.r_split = r_split
    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits
    def split(self, X, y, groups=None):
        if self.r_split is None:
            self.r_split = 1 - 1.0/self.n_splits
        y = np.asarray(y)
        d = np.bincount(y.astype(int))  # class counts: d[0] zeros, d[1] ones
        r1 = 0.175                      # assumed test-set share of duplicates
        r0 = 1 - r1
        ix_ones = np.where(y == 1)[0]
        ix_zeros = np.where(y == 0)[0]
        for i in range(self.n_splits):
            ix_first_zeros = np.random.choice(
                ix_zeros,
                size=int(self.r_split * ix_zeros.shape[0]),
                replace=False)
            # the first (train) part keeps the train class ratio d[1]/d[0]
            ix_first_ones = np.random.choice(
                ix_ones,
                size=int(ix_first_zeros.shape[0] * d[1]/d[0]),
                replace=False)
            ix_second_zeros = np.setdiff1d(ix_zeros, ix_first_zeros)
            # the second (validation) part is thinned to the test-like ratio r1/r0
            ix_second_ones = np.random.choice(
                np.setdiff1d(ix_ones, ix_first_ones),
                size=int(ix_second_zeros.shape[0] * r1/r0),
                replace=False)
            ix_first = np.random.permutation(
                np.hstack((ix_first_zeros, ix_first_ones)))
            ix_second = np.random.permutation(
                np.hstack((ix_second_zeros, ix_second_ones)))
            yield ix_first, ix_second
def check_model(predictors, data=None, do_scaling=True, folder=None, r_split=0.7):
classifier = lambda: SGDClassifier(
loss='log',
penalty='elasticnet',
fit_intercept=True,
n_iter=100,
shuffle=True,
n_jobs=-1,
class_weight=None)
steps = []
if do_scaling:
steps.append(('ss', StandardScaler()))
steps.append(('en', classifier()))
model = Pipeline(steps=steps)
parameters = {
'en__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.02, 0.1, 0.5, 0.9, 1],
'en__l1_ratio': [0, 0.0001, 0.001, 0.01, 0.1, 0.3, 0.5, 0.75, 0.9, 1]
}
if folder is None:
folder = QuoraKFold(n_splits=10, r_split=r_split)
grid_search = GridSearchCV(
model,
parameters,
cv=folder,
n_jobs=-1,
scoring=make_scorer(log_loss_lf, greater_is_better=False, needs_proba=True),
verbose=1)
if data is None:
grid_search = grid_search.fit(df.loc[ix_train][predictors],
df.loc[ix_train]['is_duplicate'])
else:
grid_search = grid_search.fit(data['X'],
data['y'])
return grid_search
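A toy numeric check of the resampling arithmetic in `QuoraKFold.split`; the 0.175 duplicate share is the same assumption flagged in the class above:
In [ ]:
# positives in the validation part are drawn as r1/r0 of its negatives,
# so the positive share of that part comes out near r1
r1 = 0.175
r0 = 1 - r1
n_second_zeros = 10000
n_second_ones = int(n_second_zeros * r1 / r0)
print(n_second_ones / (n_second_ones + n_second_zeros))  # ~0.175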
In [ ]:
if not os.path.isfile('./../data/tmp/22_model.pkl'):
model = check_model(None, data={
'X': df.loc[ix_train][predictors],
'y': df.loc[ix_train]['is_duplicate']
}, do_scaling=True, r_split=0.7)
    print(model.best_score_)
    print(model.best_params_)
with open('./../data/tmp/22_model.pkl', 'wb') as f:
pickle.dump(model, f)
else:
with open('./../data/tmp/22_model.pkl', 'rb') as f:
model = pickle.load(f)
Fitting 10 folds for each of 90 candidates, totalling 900 fits
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 43.1s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 4.6min
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 20.4min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 23.9min finished
-0.307676722634
{'en__l1_ratio': 1, 'en__alpha': 0.0001}
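For reference, `en__alpha` and `en__l1_ratio` tune scikit-learn's elastic net regularizer $$\large \alpha\left(\rho \lVert w\rVert_1 + \frac{1 - \rho}{2}\lVert w\rVert_2^2\right),$$ where $\rho$ is `l1_ratio`; the selected `l1_ratio=1` leaves the combined model with a pure L1 (sparsity-inducing) penalty.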
In [ ]:
display(
eli5.sklearn.explain_linear_classifier_weights(
model.best_estimator_.steps[1][1],
feature_names=predictors.tolist(),
top=len(predictors)))
In [ ]:
y_test_pred = model.predict_proba(df.loc[ix_test, predictors])[:, 1]
y_test_pred_fixed = link_function(y_test_pred)
if not os.path.isfile('./../data/tmp/22_pred_fixed.csv'):
pd.DataFrame.from_records(
zip(df.loc[ix_test]['test_id'].values,
y_test_pred_fixed),
columns=['test_id', 'is_duplicate']).to_csv('./../data/tmp/22_pred_fixed.csv', index=False)
In [ ]:
fig = plt.figure()
ax1 = plt.subplot2grid((1, 2), (0, 0))
ax2 = plt.subplot2grid((1, 2), (0, 1))
sns.distplot(y_test_pred, ax=ax1)
ax1.set_title('Model prediction')
sns.distplot(y_test_pred_fixed, ax=ax2)
ax2.set_title('Model prediction fixed')
plt.show()
In [ ]:
del(m_q1, m_q2)
In [ ]:
predictors = df.columns[7:]
print(predictors)
In [ ]:
xgb_models = []
if not os.path.isfile('./../data/tmp/25_pred_fixed.csv'):
def log_loss_lf_xgb(y_pred, y_true):
return 'llf', log_loss(y_true.get_label(), link_function(y_pred), eps=eps)
xgb_params = {
'max_depth': 9,
'learning_rate': 0.1,
'n_estimators': 2500,
'objective': 'binary:logistic',
'nthread': 16,
'gamma': 0,
'subsample': 0.75,
'colsample_bytree': 0.75,
'colsample_bylevel': 1,
'reg_alpha': 0,
'reg_lambda': 1,
'scale_pos_weight': 1
}
y_test = []
folder = QuoraKFold(n_splits=10, r_split=0.7)
splits = folder.split(np.zeros(ix_train.shape[0]), df.loc[ix_train, 'is_duplicate'])
for ix_first, ix_second in tqdm_notebook(splits, total=10):
model = xgb.XGBClassifier(silent=True).set_params(**xgb_params)
model = model.fit(df.loc[ix_first, predictors], df.loc[ix_first, 'is_duplicate'],
eval_set=[(df.loc[ix_second, predictors], df.loc[ix_second, 'is_duplicate'])],
eval_metric=log_loss_lf_xgb,
early_stopping_rounds=100,
verbose=False)
y_test.append(model.predict_proba(df.loc[ix_test, predictors])[:, 1])
xgb_models.append(model)
y_test_pred = np.array(y_test).T.mean(axis=1)
y_test_pred_fixed = link_function(y_test_pred)
pd.DataFrame.from_records(
zip(df.loc[ix_test]['test_id'].values,
y_test_pred_fixed),
columns=['test_id', 'is_duplicate']).to_csv('./../data/tmp/25_pred_fixed.csv', index=False)
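One hedged aside: the xgboost sklearn wrapper of that era did not always truncate predictions to the early-stopping optimum automatically; with the pre-1.0 API the best iteration could be enforced explicitly (a sketch, assuming `best_ntree_limit` is available after early stopping):
In [ ]:
# sketch: score the test set with a fold model truncated to its
# best iteration (ntree_limit existed in pre-1.0 xgboost wrappers)
p = model.predict_proba(df.loc[ix_test, predictors],
                        ntree_limit=model.best_ntree_limit)[:, 1]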
In [ ]:
if not os.path.isfile('./../data/tmp/bm1.pkl'):
with open('./../data/tmp/bm1.pkl', 'wb') as f:
pickle.dump(xgb_models, f)
else:
with open('./../data/tmp/bm1.pkl', 'rb') as f:
xgb_models = pickle.load(f)
In [ ]:
model = xgb_models[0]
In [ ]:
xgb.plot_importance(model)
plt.show()
In [ ]:
from pygoose import *
In [ ]:
project = kg.project.discover()
In [ ]:
feature_list_id = '3rdparty_mephistopheies'
In [ ]:
X_train = df.loc[ix_train][predictors].values
X_test = df.loc[ix_test][predictors].values
In [ ]:
X_train.shape, X_test.shape
In [ ]:
feature_names = [
'meph_' + feature_name
for feature_name in predictors.tolist()
]
In [ ]:
project.save_features(X_train, X_test, feature_names, feature_list_id)