In [2]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy import sparse
from tqdm import tqdm
from sklearn.svm import LinearSVC
from pymystem3 import Mystem
from functools import lru_cache

from nltk.corpus import stopwords
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE

tqdm.pandas()

%matplotlib inline

In [4]:
data_dir = "/home/sabr/PycharmProjects/kaggle_competitions/sberbank/taskA/data"
train_csv = os.path.join(data_dir, "train_task1_latest.csv")
test_csv = os.path.join(data_dir, "sdsj_A_test.csv")

df = pd.read_csv(train_csv)
df.head()


Out[4]:
paragraph_id question_id paragraph question target
0 1094 46273 В отличие от рыб, земноводные (амфибии) и прес... С какого года Русское Царство перешло на летои... 0.0
1 7414 19164 В 1049 году Балдуину V удалось отнять у Герман... Кто упомянул о его первых разногласиях со Штей... 0.0
2 6744 39767 Стремление достичь предельных значений ёмкости... Как называется имеющая мировое значение эпоха ... 0.0
3 7300 36318 Первый практически пригодный двухтактный газов... Что усугублялось из-за международного давления... 0.0
4 7077 41534 Требуя от художника углубленного изучения изоб... Какой характер носят пророчества Леонардо да В... 0.0

In [5]:
df.shape


Out[5]:
(119398, 5)

In [9]:
STOP_WORDS_RU = set(stopwords.words("russian"))

mystem = Mystem()

# TOKEN_LEN_THRESHOLD = 2  # word's length should be minimum 1
rgx_remove_sym = re.compile(r'[^\w]')

# remove symbols -> split by word -> lowercase -> lemmatize -> remove stop words -> join by space
# @lru_cache()
def normalize(sentence):
    tokens = []
    for word in rgx_remove_sym.sub(" ", sentence).split():
        lemma = "".join(mystem.lemmatize(word.lower())).strip().replace('ё', 'е')
        tokens.append(lemma)
    filtered = list(set(tokens) - STOP_WORDS_RU)
    return " ".join(filtered)


Installing mystem to /home/sabr/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.0-linux3.1-64bit.tar.gz

In [10]:
%%time

df.loc[:, "paragraph"] = df.paragraph.progress_apply(lambda x: normalize(x))
df.loc[:, "question"] = df.question.progress_apply(lambda x: normalize(x))


100%|██████████| 119398/119398 [11:20<00:00, 175.50it/s]
100%|██████████| 119398/119398 [01:41<00:00, 1182.03it/s]
CPU times: user 5min 10s, sys: 46 s, total: 5min 56s
Wall time: 13min 1s


In [11]:
df.to_csv(os.path.join(data_dir, "train_norm.csv"))

In [12]:
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE

paragraph_vector = Word2Vec(workers=4, size=100, min_count=50, window=10, sample=1e-3)
question_vector = Word2Vec(workers=4, size=100, min_count=50, window=10, sample=1e-3)

In [13]:
%%time
paragraph_vector.build_vocab([x for x in tqdm(df.paragraph)])


100%|██████████| 119398/119398 [00:00<00:00, 2199009.85it/s]
CPU times: user 6.46 s, sys: 0 ns, total: 6.46 s
Wall time: 6.46 s

In [14]:
question_vector.build_vocab([x for x in tqdm(df.question)])


100%|██████████| 119398/119398 [00:00<00:00, 1680790.43it/s]

In [15]:
length = len(df.paragraph)
assert length == len(df.question)

In [16]:
paragraph_vector.train(df.paragraph, total_examples=length, epochs=10)


Out[16]:
146288018

In [17]:
question_vector.train(df.question, total_examples=length, epochs=10)


Out[17]:
18833367

In [ ]:


In [19]:
X_1 = paragraph_vector[paragraph_vector.wv.vocab]

tsne = TSNE(n_components=2)
X_1_tsne = tsne.fit_transform(X_1)

plt.scatter(X_1_tsne[:, 0], X_1_tsne[:, 1])
plt.show()



In [20]:
X_2 = question_vector[question_vector.wv.vocab]

tsne = TSNE(n_components=2)
X_2_tsne = tsne.fit_transform(X_2)

plt.scatter(X_2_tsne[:, 0], X_2_tsne[:, 1])
plt.show()



In [53]:
print(X_1.shape)
print(X_2.shape)
print(X_1_tsne.shape)
print(X_2_tsne.shape)


(114, 100)
(78, 100)
(114, 2)
(78, 2)

In [60]:
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
cosines = cosine_distances(X_1_tsne, X_2_tsne)
cosines.shape


Out[60]:
(114, 78)

In [63]:
df["target"].shape


Out[63]:
(119398,)

In [ ]:


In [65]:
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format("/home/sabr/Downloads/ruscorpora_1_600_2.bin", encoding="utf8", unicode_errors="ignore", binary=True)

In [76]:
p = df.head(1)["paragraph"][0]
q = df.head(1)["question"][0]

In [82]:
c = 0
try:
    model.wv.n_similarity(p.split(), q.split())
except KeyError as ex:
    c += 1
print(c)


1

In [ ]: