Based on the pre-trained word embeddings, we'll compute the Word Mover's Distance between each tokenized question pair.
This utility package imports numpy, pandas, matplotlib, and a helper `kg` module into the root namespace.
In [1]:
from pygoose import *
In [2]:
from gensim.models.wrappers.fasttext import FastText
Automatically discover the paths to various data folders and compose the project structure.
In [3]:
# Auto-detect the project's directory layout (data/aux/feature folders)
# so later cells can reference paths like `project.preprocessed_data_dir`.
project = kg.Project.discover()
Identifier for storing these features on disk and referring to them later.
In [4]:
# Identifier under which this feature set is saved to disk (used by
# `project.save_features` in the final cell).
feature_list_id = 'wmd'
Preprocessed and tokenized questions.
In [5]:
# The two pickles share a common filename pattern; load one per split.
_tokens_path_template = (
    project.preprocessed_data_dir
    + 'tokens_lowercase_spellcheck_no_stopwords_{}.pickle'
)
tokens_train = kg.io.load(_tokens_path_template.format('train'))
tokens_test = kg.io.load(_tokens_path_template.format('test'))
In [6]:
# Concatenate train + test so WMD is computed in a single pass.
# Order matters: the first len(tokens_train) results belong to the
# training set (see the split further down).
tokens = tokens_train + tokens_test
Pretrained word vector database.
In [7]:
# Load the pretrained FastText vectors stored in word2vec text format.
# NOTE(review): `gensim.models.wrappers.fasttext` is deprecated/removed in
# newer gensim releases — confirm the pinned gensim version before upgrading.
embedding_model = FastText.load_word2vec_format(project.aux_dir + 'fasttext_vocab.vec')
In [8]:
def wmd(pair):
    """Return the Word Mover's Distance between the two token lists in `pair`.

    `pair[0]` and `pair[1]` are the tokenized questions; the distance is
    computed against the pretrained `embedding_model` loaded above.
    """
    first_question, second_question = pair[0], pair[1]
    return embedding_model.wmdistance(first_question, second_question)
In [9]:
# Compute the WMD for every question pair in parallel, in batches of 1000;
# results come back in the same order as `tokens`.
wmds = kg.jobs.map_batch_parallel(
    tokens,
    item_mapper=wmd,
    batch_size=1000,
)
In [10]:
# Shape the distance scores into an (N, 1) column vector so they can be
# used directly as a single-column feature matrix.
wmds = np.reshape(wmds, (-1, 1))
In [11]:
# Split the stacked scores back into train/test along the same boundary
# that was used when building `tokens`.
split_point = len(tokens_train)
X_train = wmds[:split_point]
X_test = wmds[split_point:]
In [12]:
# Sanity-check the shapes of the resulting feature matrices.
for label, matrix in (('X_train:', X_train), ('X_test: ', X_test)):
    print(label, matrix.shape)
In [13]:
# Column names for the (single) feature produced by this notebook.
feature_names = [
    'wmd',
]
In [14]:
# Persist the train/test feature matrices to disk under the 'wmd' identifier.
project.save_features(X_train, X_test, feature_names, feature_list_id)