Based on the pre-trained word embeddings, we'll calculate the mean embedding vector of each question, as well as the unit-length-normalized sum of its word embeddings, and compute vector distances between these aggregate vectors.
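Concretely, if a question's in-vocabulary tokens have embedding vectors $v_1, \dots, v_n$, the two aggregate representations computed below are

$$\bar{v} = \frac{1}{n} \sum_{i=1}^{n} v_i, \qquad \hat{v} = \frac{\sum_{i=1}^{n} v_i}{\left\lVert \sum_{i=1}^{n} v_i \right\rVert_2},$$

and for each question pair we record the cosine distance, the $\log(1 + \text{cityblock})$ distance, and the Euclidean distance between the corresponding $\bar{v}$ vectors and between the corresponding $\hat{v}$ vectors, giving six features in total.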
This utility package imports numpy, pandas, matplotlib, and a helper kg module into the root namespace.
In [1]:
from pygoose import *
In [2]:
from gensim.models.wrappers.fasttext import FastText
In [3]:
from scipy.spatial.distance import cosine, euclidean, cityblock
Automatically discover the paths to various data folders and compose the project structure.
In [4]:
project = kg.Project.discover()
Identifier for storing these features on disk and referring to them later.
In [5]:
feature_list_id = 'phrase_embedding'
Preprocessed and tokenized questions.
In [6]:
tokens_train = kg.io.load(project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_no_stopwords_train.pickle')
tokens_test = kg.io.load(project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_no_stopwords_test.pickle')
In [7]:
tokens = tokens_train + tokens_test
Pre-trained word vector model.
In [8]:
embedding_model = FastText.load_word2vec_format(project.aux_dir + 'fasttext_vocab.vec')

# Dimensionality of the word vectors, used below to build zero-vector fallbacks.
# Assumes the loaded model exposes a vector_size attribute, as gensim keyed-vector models do.
word_vector_dim = embedding_model.vector_size
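As a quick sanity check, the loaded model supports the membership test and vector lookup used in the feature extractor below. A minimal sketch; 'apple' is just an assumed in-vocabulary probe word.

# Illustrative only: 'apple' is an arbitrary word assumed to be in the fastText vocabulary.
if 'apple' in embedding_model:
    print(embedding_model['apple'].shape)  # e.g. (300,) for 300-dimensional vectors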
In [9]:
def get_phrase_embedding_distances(pair):
    # Look up embedding vectors for every token that is present in the fastText vocabulary.
    q1_vectors = [embedding_model[token] for token in pair[0] if token in embedding_model]
    q2_vectors = [embedding_model[token] for token in pair[1] if token in embedding_model]

    # Fall back to a zero vector if none of a question's tokens are in the vocabulary.
    if len(q1_vectors) == 0:
        q1_vectors.append(np.zeros(word_vector_dim))
    if len(q2_vectors) == 0:
        q2_vectors.append(np.zeros(word_vector_dim))

    # Aggregate representation 1: mean of the word vectors.
    q1_mean = np.mean(q1_vectors, axis=0)
    q2_mean = np.mean(q2_vectors, axis=0)

    # Aggregate representation 2: sum of the word vectors, normalized to unit length.
    q1_sum = np.sum(q1_vectors, axis=0)
    q2_sum = np.sum(q2_vectors, axis=0)
    q1_norm = q1_sum / np.sqrt((q1_sum ** 2).sum())
    q2_norm = q2_sum / np.sqrt((q2_sum ** 2).sum())

    # Distances between the aggregate vectors (cityblock is log-scaled to reduce skew).
    return [
        cosine(q1_mean, q2_mean),
        np.log(cityblock(q1_mean, q2_mean) + 1),
        euclidean(q1_mean, q2_mean),
        cosine(q1_norm, q2_norm),
        np.log(cityblock(q1_norm, q2_norm) + 1),
        euclidean(q1_norm, q2_norm),
    ]
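As a quick illustration, the function can be called on a single pair of token lists. This is just a hedged sketch: the example tokens are made up and assumed to be present in the fastText vocabulary.

# Illustrative only; not taken from the dataset.
example_pair = (['learn', 'python', 'quickly'], ['best', 'way', 'learn', 'python'])
print(get_phrase_embedding_distances(example_pair))
# Expected output: a list of 6 numbers, in the order
# [mean_cosine, mean_cityblock_log, mean_euclidean,
#  normsum_cosine, normsum_cityblock_log, normsum_euclidean]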
In [10]:
distances = kg.jobs.map_batch_parallel(
    tokens,
    item_mapper=get_phrase_embedding_distances,
    batch_size=1000,
)
In [11]:
distances = np.array(distances)
In [12]:
X_train = distances[:len(tokens_train)]
X_test = distances[len(tokens_train):]
In [13]:
print('X_train:', X_train.shape)
print('X_test: ', X_test.shape)
In [14]:
feature_names = [
    'phrase_emb_mean_cosine',
    'phrase_emb_mean_cityblock_log',
    'phrase_emb_mean_euclidean',
    'phrase_emb_normsum_cosine',
    'phrase_emb_normsum_cityblock_log',
    'phrase_emb_normsum_euclidean',
]
In [15]:
project.save_features(X_train, X_test, feature_names, feature_list_id)
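Downstream model notebooks can later reload this feature list by its identifier. A minimal sketch, assuming the pygoose Project object exposes a load_feature_lists method that mirrors save_features (this call is an assumption, not shown in this notebook):

# Assumed pygoose API: load previously saved feature lists by their identifiers.
features_train, features_test, loaded_names = project.load_feature_lists(['phrase_embedding'])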