In [2]:
%matplotlib notebook
import itertools
import logging
from functools import partial
import gensim
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pnd
from sklearn.cluster import *
from sklearn.decomposition import PCA, RandomizedPCA
from sklearn.manifold import TSNE
from knub.thesis.util import *
# Apply the 'ggplot' style sheet to every figure created in this notebook.
plt.style.use('ggplot')
In [10]:
# Let displayed cells show up to 100 characters so the long model file names
# in the result tables are not truncated.
pnd.options.display.max_colwidth = 100
Evaluated using the Palmetto tool from the paper *Exploring the Space of Topic Coherence Measures*. The values still seem low compared to the example values given in the paper.
In [4]:
# Topic-coherence results, one row per topic model:
# (model file name, TC mean, TC variance, CC purity).
# The CC_purity column is discarded right after construction, so the resulting
# frame only holds the model name and the two TC columns.
df_tc_results = pnd.DataFrame(
    data=[
        ("topic.full.alpha-1-100.256-400.model", 0.469500859375, 0.00617111859067, 0.6463414634146342),
        ("topic.16-400.model", 0.43805875, 0.00390183951094, 0.5975609756097561),
        ("topic.256-1000.model", 0.473455351563, 0.00635883046394, 0.5853658536585366),
        ("topic.64-400.model", 0.45327734375, 0.00385141007263, 0.6341463414634146),
        ("topic.256-400.model", 0.46836359375, 0.00599032492068, 0.5731707317073171),
        ("topic.full.fixed-vocabulary.alpha-1-100.256-400.model", 0.468437070312, 0.00562772603243, 0.5975609756097561),
        ("topic.full.256-400.model", 0.472498945313, 0.00624853749772, 0.5975609756097561),
        ("topic.256-600.model", 0.478640273437, 0.00685787139094, 0.5609756097560975),
    ],
    columns=["Topic model parameters", "TC_mean", "TC_var", "CC_purity"],
).drop("CC_purity", axis=1)
In [5]:
# Display the topic models ordered from highest to lowest TC_mean.
df_tc_results.sort_values("TC_mean", ascending=False)
Out[5]:
In [7]:
# Display the topic models ordered from highest to lowest TC_var.
df_tc_results.sort_values("TC_var", ascending=False)
Out[7]:
In [12]:
# Load further topic-coherence results from a tab-separated file; no column is
# used as the index, so rows keep the default integer index.
df_tc_results_2 = pnd.read_csv(
    "../models/topic_models_coherence_2.tsv",
    sep="\t",
    index_col=None,
)
# Display the loaded models ordered from highest to lowest TC_mean.
df_tc_results_2.sort_values("TC_mean", ascending=False)
Out[12]:
Evaluated on the question-words data set (~19k questions) from the paper *Efficient Estimation of Word Representations in Vector Space* (word2vec).
In [8]:
# Analogy-reasoning results, one row per word-embedding model. The two lists
# are aligned by position: the i-th value belongs to the i-th embedding.
df_ar_results = pnd.DataFrame(
    {
        "Word Embeddings": [
            "embedding.skip-gram.size-200.window-5.negative-5.model",
            "embedding.cbow.size-200.window-5.model",
            "embedding.google.size-300",
        ],
        "Analogy_Reasoning": [
            0.481221858371,
            0.416547277937,
            0.735878018829,
        ],
    },
    columns=["Word Embeddings", "Analogy_Reasoning"],
)
# Display the embeddings ordered from highest to lowest Analogy_Reasoning value.
df_ar_results.sort_values("Analogy_Reasoning", ascending=False)
Out[8]:
Testing only the skip-gram architecture.
In [12]:
# Load the analogy-reasoning results written by the spearmint run; the "model"
# column of the CSV becomes the row index.
df_ar_spearmint_results = pnd.read_csv(
    "../code/python/knub/thesis/spearmint_analogy_reasoning/results.csv",
    index_col="model",
)
# Display the runs ordered from highest to lowest Analogy_Reasoning value.
df_ar_spearmint_results.sort_values("Analogy_Reasoning", ascending=False)
Out[12]: