Create TF-IDF vectors from question texts and compute vector distances between them.
This utility package imports numpy, pandas, matplotlib, and a helper kg module into the root namespace.
In [1]:
from pygoose import *
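For readers unfamiliar with pygoose, the star import is roughly equivalent to the explicit imports below (a sketch; the exact set of names the package exposes may differ):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pygoose import kg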
In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
Automatically discover the paths to various data folders and compose the project structure.
In [3]:
project = kg.Project.discover()
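As a quick sanity check, the discovered project object exposes the directory paths used throughout this notebook (the attribute names below are the ones used later; the values are machine-specific):

# Paths resolved by kg.Project.discover(); exact values depend on the local setup.
print(project.preprocessed_data_dir)
print(project.trained_model_dir)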
Identifier for storing these features on disk and referring to them later.
In [4]:
feature_list_id = 'tfidf'
Load the preprocessed and tokenized questions.
In [5]:
tokens_train = kg.io.load(project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_no_stopwords_train.pickle')
tokens_test = kg.io.load(project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_no_stopwords_test.pickle')
In [6]:
tokens = tokens_train + tokens_test
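Each element of tokens is expected to be a pair of token lists, one per question. A hypothetical example of what a single record might look like:

# Purely illustrative; actual tokens come from the pickled files above.
example_pair = [
    ['best', 'way', 'learn', 'python'],     # question 1 tokens
    ['how', 'learn', 'python', 'quickly'],  # question 2 tokens
]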
Extract a set of unique question texts (document corpus).
In [7]:
# Flatten the list of question pairs into one flat array of token lists.
all_questions_flat = np.array(tokens).ravel()
In [8]:
documents = list(set(' '.join(question) for question in all_questions_flat))
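Fitting on unique texts means each distinct question contributes to the document frequencies exactly once, so heavily duplicated questions do not deflate the IDF of their terms. A toy illustration of the deduplication step:

# Duplicate questions collapse into a single corpus document.
corpus = list(set(' '.join(q) for q in [['what', 'is', 'ai'], ['what', 'is', 'ai']]))
# corpus == ['what is ai']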
In [9]:
del all_questions_flat
Create a bag-of-token-unigrams vectorizer.
In [10]:
vectorizer = TfidfVectorizer(
    encoding='utf-8',
    analyzer='word',
    strip_accents='unicode',
    ngram_range=(1, 1),
    lowercase=True,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
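With this configuration, scikit-learn weights each term as tf' = 1 + log(tf) (sublinear_tf=True), multiplies by idf = ln((1 + n) / (1 + df)) + 1 (smooth_idf=True), and scales every document vector to unit L2 norm. A toy check of the same scheme:

# Minimal sketch of the weighting scheme on a two-document corpus.
toy = TfidfVectorizer(sublinear_tf=True, smooth_idf=True, norm='l2')
print(toy.fit_transform(['cat sat', 'cat sat sat']).toarray())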
In [11]:
vectorizer.fit(documents)
In [12]:
model_filename = 'tfidf_vectorizer_{}_ngrams_{}_{}_penalty_{}.pickle'.format(
    vectorizer.analyzer,
    vectorizer.ngram_range[0],
    vectorizer.ngram_range[1],
    vectorizer.norm,
)
In [13]:
kg.io.save(vectorizer, project.trained_model_dir + model_filename)
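The saved vectorizer can be restored later in the same way the token files were loaded above (assuming kg.io.load mirrors kg.io.save):

# Reload the fitted vectorizer, e.g. in a downstream notebook.
vectorizer = kg.io.load(project.trained_model_dir + model_filename)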
In [14]:
def compute_pair_distances(pair):
    # Join each question's token list back into a single document string.
    q1_doc = ' '.join(pair[0])
    q2_doc = ' '.join(pair[1])

    # Vectorize both questions in one call; each row is a TF-IDF vector.
    pair_dtm = vectorizer.transform([q1_doc, q2_doc])
    q1_doc_vec = pair_dtm[0]
    q2_doc_vec = pair_dtm[1]

    return [
        cosine_distances(q1_doc_vec, q2_doc_vec)[0][0],
        euclidean_distances(q1_doc_vec, q2_doc_vec)[0][0],
    ]
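Note that because the vectors are L2-normalized (norm='l2'), the two distances are tied by ||u - v||^2 = 2 * (1 - cos(u, v)); for any pair of nonzero vectors the Euclidean distance equals sqrt(2 * cosine distance), so the second feature is a monotone transform of the first. A quick check using the hypothetical example_pair from earlier:

# For unit-norm, nonzero vectors: euc == sqrt(2 * cos_d).
cos_d, euc_d = compute_pair_distances(example_pair)
print(cos_d, euc_d, np.sqrt(2 * cos_d))  # the last two values should match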
In [15]:
features = kg.jobs.map_batch_parallel(
    tokens,
    item_mapper=compute_pair_distances,
    batch_size=1000,
)
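Conceptually, this is just a batched, parallel version of a plain list comprehension; a sequential equivalent (slower, but simpler to debug) would be:

# features = [compute_pair_distances(pair) for pair in tokens]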
In [16]:
X_train = np.array(features[:len(tokens_train)], dtype='float64')
X_test = np.array(features[len(tokens_train):], dtype='float64')
In [17]:
print('X_train:', X_train.shape)
print('X_test: ', X_test.shape)
In [18]:
feature_names = [
    'tfidf_cosine',
    'tfidf_euclidean',
]
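A small sanity check that the feature matrix columns stay aligned with their names before saving:

# Each distance column must have exactly one corresponding name.
assert X_train.shape[1] == len(feature_names)
assert X_test.shape[1] == len(feature_names)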
In [19]:
project.save_features(X_train, X_test, feature_names, feature_list_id)