In [2]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse
from tqdm import tqdm
from sklearn.svm import LinearSVC
from pymystem3 import Mystem
from functools import lru_cache
from nltk.corpus import stopwords
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
tqdm.pandas()
%matplotlib inline
In [4]:
data_dir = "/home/sabr/PycharmProjects/kaggle_competitions/sberbank/taskA/data"
train_csv = os.path.join(data_dir, "train_task1_latest.csv")
test_csv = os.path.join(data_dir, "sdsj_A_test.csv")
df = pd.read_csv(train_csv)
df.head()
Out[4]:
In [5]:
df.shape
Out[5]:
In [9]:
STOP_WORDS_RU = set(stopwords.words("russian"))
mystem = Mystem()
# TOKEN_LEN_THRESHOLD = 2  # minimum word length to keep (currently unused)
rgx_remove_sym = re.compile(r'[^\w]')
# remove symbols -> split by word -> lowercase -> lemmatize -> remove stop words -> join by space
# @lru_cache()
def normalize(sentence):
    tokens = []
    for word in rgx_remove_sym.sub(" ", sentence).split():
        lemma = "".join(mystem.lemmatize(word.lower())).strip().replace('ё', 'е')
        # filter stop words as we go, preserving token order (it matters for word2vec windows)
        if lemma and lemma not in STOP_WORDS_RU:
            tokens.append(lemma)
    return " ".join(tokens)
In [10]:
%%time
df.loc[:, "paragraph"] = df.paragraph.progress_apply(lambda x: normalize(x))
df.loc[:, "question"] = df.question.progress_apply(lambda x: normalize(x))
In [11]:
df.to_csv(os.path.join(data_dir, "train_norm.csv"), index=False)  # index=False avoids an extra "Unnamed: 0" column on reload
In [12]:
paragraph_vector = Word2Vec(workers=4, size=100, min_count=50, window=10, sample=1e-3)
question_vector = Word2Vec(workers=4, size=100, min_count=50, window=10, sample=1e-3)
In [13]:
%%time
# gensim expects an iterable of token lists; a raw string would be iterated character by character
paragraph_vector.build_vocab([x.split() for x in tqdm(df.paragraph)])
In [14]:
question_vector.build_vocab([x.split() for x in tqdm(df.question)])
In [15]:
length = len(df.paragraph)
assert length == len(df.question)
In [16]:
paragraph_vector.train([x.split() for x in df.paragraph], total_examples=length, epochs=10)
Out[16]:
In [17]:
question_vector.train([x.split() for x in df.question], total_examples=length, epochs=10)
Out[17]:
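In [ ]:
# Sketch (not in the original run): spot-check the trained vectors via nearest
# neighbours. The query lemma "год" ("year") is an assumption -- substitute any
# lemma frequent enough to survive min_count=50.
paragraph_vector.wv.most_similar("год", topn=5)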
In [19]:
X_1 = paragraph_vector.wv[paragraph_vector.wv.vocab]  # index via .wv (direct model indexing is deprecated)
tsne = TSNE(n_components=2)
X_1_tsne = tsne.fit_transform(X_1)
plt.scatter(X_1_tsne[:, 0], X_1_tsne[:, 1])
plt.show()
In [20]:
X_2 = question_vector.wv[question_vector.wv.vocab]
tsne = TSNE(n_components=2)
X_2_tsne = tsne.fit_transform(X_2)
plt.scatter(X_2_tsne[:, 0], X_2_tsne[:, 1])
plt.show()
In [53]:
print(X_1.shape)
print(X_2.shape)
print(X_1_tsne.shape)
print(X_2_tsne.shape)
In [60]:
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
cosines = cosine_distances(X_1_tsne, X_2_tsne)
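# NB: X_1_tsne and X_2_tsne come from two independently fitted t-SNE runs, so the two
# 2-D spaces are not aligned; these pairwise distances are exploratory only.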
cosines.shape
Out[60]:
In [63]:
df["target"].shape
Out[63]:
In [65]:
import gensim
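# NB (assumption): RusVectores/ruscorpora models are typically trained on POS-tagged
# lemmas (e.g. "дом_S"), so plain Mystem lemmas may raise KeyError below.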
model = gensim.models.KeyedVectors.load_word2vec_format("/home/sabr/Downloads/ruscorpora_1_600_2.bin", encoding="utf8", unicode_errors="ignore", binary=True)
In [76]:
p = df["paragraph"].iloc[0]
q = df["question"].iloc[0]
In [82]:
c = 0
try:
    # a loaded KeyedVectors object exposes n_similarity directly; it has no .wv attribute
    model.n_similarity(p.split(), q.split())
except KeyError:
    c += 1
print(c)
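In [ ]:
# Sketch (assumption, not the original author's code): drop out-of-vocabulary tokens
# before calling n_similarity, so no KeyError is raised; returns 0.0 if nothing is left.
def safe_similarity(kv, left, right):
    left = [w for w in left.split() if w in kv.vocab]
    right = [w for w in right.split() if w in kv.vocab]
    if left and right:
        return kv.n_similarity(left, right)
    return 0.0

safe_similarity(model, p, q)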