Train a Latent Dirichlet Allocation model with 300 topics on the question corpus, then compute the distances between the topic distributions of the two questions in each pair.
This utility package imports numpy, pandas, matplotlib, and a helper kg module into the root namespace.
In [ ]:
from pygoose import *
In [ ]:
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore
In [ ]:
from nltk.stem import SnowballStemmer
In [ ]:
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
Automatically discover the paths to various data folders and compose the project structure.
In [ ]:
project = kg.Project.discover()
Identifier for storing these features on disk and referring to them later.
In [ ]:
feature_list_id = 'lda'
Number of LDA topics to train.
In [ ]:
NUM_TOPICS = 300
Make subsequent runs reproducible.
In [ ]:
RANDOM_SEED = 42
Preprocessed and tokenized questions.
In [ ]:
tokens_train = kg.io.load(project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_no_stopwords_train.pickle')
tokens_test = kg.io.load(project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_no_stopwords_test.pickle')
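As a quick sanity check, each loaded entry is expected to be a pair of token lists, one per question (an assumption about the preprocessing output, worth verifying):
In [ ]:
# Verify the expected structure: one pair of token lists per question pair.
print(len(tokens_train), len(tokens_test))
print(tokens_train[0])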
Build a corpus of stemmed documents.
In [ ]:
stemmer = SnowballStemmer('english')
In [ ]:
def stem_pair(pair):
    return [
        [stemmer.stem(token) for token in pair[0]],
        [stemmer.stem(token) for token in pair[1]],
    ]
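As a toy illustration of what the stemmer does (not part of the pipeline), inflected forms collapse to a common stem:
In [ ]:
# Example only; expected output roughly [['run', 'run'], ['easili', 'question']].
stem_pair([['running', 'runs'], ['easily', 'questions']])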
In [ ]:
tokens = kg.jobs.map_batch_parallel(
    tokens_train + tokens_test,
    item_mapper=stem_pair,
    batch_size=1000,
)
In [ ]:
# Flatten the (question1, question2) pairs into a single list of documents.
documents = [document for pair in tokens for document in pair]
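The flattened list should contain two documents per question pair; a cheap consistency check (a sketch, assuming the loaders above):
In [ ]:
assert len(documents) == 2 * (len(tokens_train) + len(tokens_test))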
Based on the corpus, build the bag-of-words dictionary and train the topic model.
In [ ]:
dictionary = Dictionary(documents)
In [ ]:
corpus = [dictionary.doc2bow(document) for document in documents]
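Each corpus entry is a sparse list of (token_id, count) tuples; to peek at the first document:
In [ ]:
# Sparse BoW representation of the first document: (token_id, count) pairs.
print(corpus[0][:10])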
In [ ]:
model = LdaMulticore(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    random_state=RANDOM_SEED,
)
In [ ]:
model.save(project.trained_model_dir + f'lda_{NUM_TOPICS}.pickle')
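To eyeball what the model learned, gensim can print the most probable words per topic (a quick inspection only; the exact topics depend on the training run):
In [ ]:
# Show the top words for a few of the learned topics.
for topic_id, terms in model.print_topics(num_topics=5, num_words=8):
    print(topic_id, terms)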
In [ ]:
def compute_topic_distances(pair):
    q1_bow = dictionary.doc2bow(pair[0])
    q2_bow = dictionary.doc2bow(pair[1])

    # minimum_probability=0 asks gensim for the full topic distribution,
    # so both vectors have one entry per topic.
    q1_topic_vec = np.array(model.get_document_topics(q1_bow, minimum_probability=0))[:, 1].reshape(1, -1)
    q2_topic_vec = np.array(model.get_document_topics(q2_bow, minimum_probability=0))[:, 1].reshape(1, -1)

    return [
        cosine_distances(q1_topic_vec, q2_topic_vec)[0][0],
        euclidean_distances(q1_topic_vec, q2_topic_vec)[0][0],
    ]
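A quick single-pair sanity check before launching the parallel job (the exact values depend on the trained model):
In [ ]:
# Distances for the first stemmed pair: [cosine, euclidean].
print(compute_topic_distances(tokens[0]))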
In [ ]:
distances = kg.jobs.map_batch_parallel(
    tokens,
    item_mapper=compute_topic_distances,
    batch_size=1000,
)
In [ ]:
X_train = np.array(distances[:len(tokens_train)], dtype='float64')
X_test = np.array(distances[len(tokens_train):], dtype='float64')
In [ ]:
print('X_train:', X_train.shape)
print('X_test: ', X_test.shape)
In [ ]:
feature_names = [
    'lda_cosine',
    'lda_euclidean',
]
In [ ]:
project.save_features(X_train, X_test, feature_names, feature_list_id)
In [ ]:
pd.DataFrame(X_train).describe()
In [ ]:
pd.DataFrame(X_test).describe()
In [ ]:
pd.DataFrame(X_train).plot.hist()