In [3]:
%matplotlib notebook
import itertools
import logging
from functools import partial
from collections import defaultdict
import gensim
import matplotlib
from matplotlib import rc
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from matplotlib import colors as plt_colors
import numpy as np
import pandas as pnd
import os
from sklearn.cluster import *
from sklearn import mixture
from sklearn.preprocessing import normalize
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA, RandomizedPCA
from sklearn.manifold import TSNE
from sklearn import svm, metrics
from multiprocessing import Pool
from knub.thesis.util import *
pnd.set_option("display.max_colwidth", 200)
LIGHT_COLORS = ["#a6cee3", "#b2df8a", "#fb9a99", "#fdbf6f", "#cab2d6", "#ffff99"] # light colors
DARK_COLORS = ["#1f78b4", "#33a02c", "#e31a1c", "#ff7f00", "#6a3d9a"] # dark colors
COLORS = ["#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c",
"#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a", "#ffff99", "#b15928"]
rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
rc('text', usetex=True)
SIZE = 10
rc('font', size=SIZE) # controls default text sizes
rc('axes', titlesize=SIZE) # fontsize of the axes title
rc('axes', labelsize=SIZE) # fontsize of the x and y labels
rc('xtick', labelsize=SIZE) # fontsize of the tick labels
rc('ytick', labelsize=SIZE) # fontsize of the tick labels
rc('legend', fontsize=SIZE) # legend fontsize
rc('figure', titlesize=SIZE) # fontsize of the figure title
def cm2inch(*tupl):
    # convert figure sizes given in centimeters to inches, as expected by matplotlib
    inch = 2.54
    if isinstance(tupl[0], tuple):
        return tuple(i/inch for i in tupl[0])
    else:
        return tuple(i/inch for i in tupl)
Approach:
In [24]:
def load_topic_coherence(f):
f_server = "/home/stefan.bunk/master-thesis/results/topic-coherence/"
if os.path.exists(f_server):
return pnd.read_csv(f_server + f, sep="\t", header=None)
f_home = "/home/knub/Repositories/master-thesis/results/topic-coherence/"
if os.path.exists(f_home):
return pnd.read_csv(f_home + f, sep="\t", header=None)
    raise Exception("Topic-coherence results directory not found for: " + f)
def plot_iteration_for(df_param, params, r=0.9, param_filter=None, figsize=(14.69, 7.9), colors_offset=0, dpi=300):
#matplotlib.style.use('classic')
plt.figure(figsize=cm2inch(*figsize), dpi=dpi)
#colors = iter(["green", "red", "blue", "yellow"] + plt_colors.cnames.keys())
#colors = iter(plt_colors.cnames.keysf7f7f7
# diverging - blue to red
#colors = iter(["#053061", "#2166ac", "#4393c3", "#92c5de", "#d1e5f0", "#f7f7f7",
# "#fddbc7", "#f4a582", "#d6604d", "#b2182b", "#67001f", "black"])
colors = COLORS[colors_offset:]
colors = iter(colors)
for model, df_group in df_param.groupby(params):
if param_filter is not None and model not in param_filter:
continue
label = model
if isinstance(model, float):
label = "%.2f" % model
plt.plot(df_group["iteration"], df_group["TC"] * 100, linestyle="None", color='white',
label=label, marker='o', markerfacecolor=next(colors), markersize=5)
plt.xlabel("Iteration \\#")
plt.ylabel("Topic Coherence $TC$")
min_TC = df_param["TC"].min() * 100
max_TC = df_param["TC"].max() * 100
diff_TC = max_TC - min_TC
lower_TC = min_TC - r * diff_TC
upper_TC = max_TC + r * diff_TC
plt.ylim((lower_TC, upper_TC))
plt.xlim((df_param["iteration"].min() - 20, df_param["iteration"].max() + 20))
legend_title = None
if "actual_lambda" in params:
legend_title = "$\\lambda_{act}$"
plt.legend(loc="upper left", numpoints=1, borderpad=0.5, handlelength=0.25, title=legend_title).get_frame().set_linewidth(0.5)
plt.tight_layout()
In [4]:
df_tc_lda = load_topic_coherence("lda.txt")
df_tc_lda.columns = ["alpha", "beta", "TC", "TC_std"]
df_tc_lda = df_tc_lda.sort_values("TC", ascending=False)
df_tc_lda
Out[4]:
In [45]:
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=cm2inch(12, 5), dpi=300)
df_tc_lda["Avg. topic coherence $\\overline{TC}$"] = df_tc_lda["TC"] * 100
df_tc_lda.groupby("alpha").mean().reset_index().plot(x="alpha", y="Avg. topic coherence $\\overline{TC}$",
c="black", marker="x", s=25, linewidth=0.65, kind="scatter", ax=ax1)
#ax1.set_xticks(ax1.get_xticks()[::2])
#ax1.set_yticks(ax1.get_yticks()[1:])
ax1.set_xlim((-0.012, 0.09))
ax2.set_xlim((-0.012, 0.11))
ax1.set_ylim((41.7, 42.9))
df_tc_lda.groupby("beta").mean().reset_index().plot(x="beta", y="Avg. topic coherence $\\overline{TC}$",
c="black", marker="x", s=25, linewidth=0.65, kind="scatter", ax=ax2)
ax1.set_xlabel("$\\alpha$")
ax2.set_xlabel("$\\beta$")
plt.tight_layout()
In [21]:
df_tc_lda["TC"].mean()
Out[21]:
In [12]:
df_tc_lda["TC"].std()
Out[12]:
In [11]:
df_tc_lda_nips = load_topic_coherence("lda_nips.txt")
df_tc_lda_nips.columns = ["alpha", "beta", "TC", "TC_std"]
#df_tc_lda_nips = df_tc_lda_nips.sort_values("TC", ascending=False)
df_tc_lda_nips
Out[11]:
In [143]:
df_tc_lda_nips["TC"].mean()
Out[143]:
In [51]:
df_tc_lda_iterations = load_topic_coherence("palmetto_lda_topic_evolution.log")
df_tc_lda_iterations.columns = ["iteration", "TC", "TC_std"]
df_tc_lda_iterations["TC"] *= 100
#df_tc_lda_iterations.head(31)["TC"] = [x + y for x, y in zip(np.linspace(0.32, 0.389, 31), np.random.normal(0, 0.005, 31))]
plt.figure(figsize=cm2inch(12, 6), dpi=300)
#plt.xlim(-50, 1100)
plt.scatter(df_tc_lda_iterations["iteration"], df_tc_lda_iterations["TC"], marker="x", c="black", s=8, linewidth=0.35)
plt.xlabel("Iteration \\#")
plt.ylabel("Topic Coherence $TC$")
plt.tight_layout()
Unlike LDA, LFLDA does not sample only from the multinomial topic-word distribution: it additionally samples in the embedding space, where each topic defines a word distribution based on the similarity of the words to the topic vector. Which of the two distributions is sampled from is governed by the $\lambda$ parameter.
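To make the $\lambda$ mechanism concrete, here is a minimal sketch of the per-token sampling decision. The names (phi_t for the topic's multinomial, t_vec for the topic vector, E for the embedding matrix) are illustrative assumptions, and the softmax over similarities is a simplified stand-in for LFTM's log-linear component:
import numpy as np

def sample_word(lam, phi_t, t_vec, E, rng=np.random):
    # with probability lambda, sample from the embedding component,
    # otherwise from the standard multinomial topic-word distribution
    if rng.rand() < lam:
        scores = E.dot(t_vec)              # similarity of each word to the topic vector
        p = np.exp(scores - scores.max())  # softmax (simplification, not LFTM's exact model)
        p /= p.sum()
    else:
        p = phi_t
    return rng.choice(len(p), p=p)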
In [83]:
df_tc_lflda = load_topic_coherence("lflda.txt")
df_tc_lflda.columns = ["embedding-dimension", "lambda", "alpha", "beta", "iteration", "TC", "TC_std"]
#df_tc_lflda = df_tc_lflda[df_tc_lflda["lambda"] == 0.6]
df_tc_lflda = df_tc_lflda[(df_tc_lflda["iteration"] - 1) % 10 == 2]
df_tc_lflda.head()
Out[83]:
In [84]:
plot_iteration_for(df_tc_lflda, ["embedding-dimension", "lambda"], r=1)
In [97]:
df_tc_lflda2 = load_topic_coherence("lflda2.txt")
df_tc_lflda2.columns = ["embedding-dimension", "lambda", "alpha", "beta", "iteration", "TC", "TC_std"]
df_tc_lflda2.head()
Out[97]:
In [98]:
plot_iteration_for(df_tc_lflda2, ["embedding-dimension", "lambda"], r=1)
Results: LFTM achieves an improvement of $\sim 1.8$ points in TC. Training is very slow (see below).
TODO:
TopicVec replaces the multinomial topic-word distribution of LDA with a word-embedding-inspired word distribution based on the immediate context of a word. Topics are points in the embedding space.
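To illustrate what such a context-based word distribution could look like, here is a deliberately simplified sketch (an illustrative assumption, not TopicVec's actual link function): a word is scored by how well its embedding fits the averaged context embedding plus the topic vector.
import numpy as np

def context_word_probs(E, context_ids, topic_vec):
    # E: (vocab_size, dim) embedding matrix; context_ids: indices of the context words
    ctx = E[context_ids].mean(axis=0)   # average context embedding
    scores = E.dot(ctx + topic_vec)     # combine context fit and topic fit
    p = np.exp(scores - scores.max())   # softmax over the vocabulary
    return p / p.sum()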
Parameters:
In [13]:
df_tc_topicvec = load_topic_coherence("topicvec.txt")
df_tc_topicvec.columns = ["corpus", "embedding-dimension", "iteration", "TC", "TC_std"]
df_tc_topicvec = df_tc_topicvec.sort_values(["corpus", "embedding-dimension", "iteration"])
df_tc_topicvec.head()
Out[13]:
In [14]:
plot_iteration_for(df_tc_topicvec, ["embedding-dimension", "corpus"])
Results: TopicVec achieves the highest topic coherence with 46.8. Higher dimensions converge more slowly, but seem to end with a higher TC.
TODO:
In [8]:
df_tc_topicvec_topics = pnd.read_csv("/home/knub/Repositories/topicvec/results/corpus-orig.dim-50.iterations-500/topics", sep=" ", header=None)
df_tc_topicvec_topics.columns = ["topic-id", "probability"] + list(range(1, 11))
df_tc_topicvec_topics
With 20news.dim-50 embeddings
In [82]:
df_tc_topicvec_20news_dim50 = load_topic_coherence("topicvec.embedding-20news-dim-50.txt")
df_tc_topicvec_20news_dim50.columns = ["embedding", "iteration", "TC", "TC_std"]
df_tc_topicvec_20news_dim50 = df_tc_topicvec_20news_dim50.sort_values(["embedding", "iteration"])
plot_iteration_for(df_tc_topicvec_20news_dim50, ["embedding"])
Learning works better with a lower number of dimensions (50 instead of 200). Results are better, but still not good.
Yes, I have a feeling I ran the experiments on 50-dimensional vectors. I am a little sad that it does not work for higher-dimensional embeddings, but clearly there is something interesting going on which might give some insights about the structure of the space. Can you point me to the topics for D = {50, 200}?
Topics are better, but still not good. Previously, the topics were almost uniform.
Now, actual topics emerge, but there are still a lot of noise topics. Pre-initializing the topic vectors to the mean of standard LDA's topics did not help.
Parameters:
In [18]:
pnd.read_csv("/home/stefan.bunk/Gaussian_LDA/results/old/dim-50.alpha-0-02.output/090.topics", sep=" ", header=None)
Out[18]:
In [88]:
df_tc_gaussian = load_topic_coherence("gaussian_lda.txt")
df_tc_gaussian.columns = ["iteration", "initialization-method", "embedding-dimension", "alpha", "TC", "TC_std"]
df_tc_gaussian = df_tc_gaussian.sort_values(["embedding-dimension", "initialization-method", "alpha", "iteration"])
df_tc_gaussian.head()
Out[88]:
In [89]:
plot_iteration_for(df_tc_gaussian, ["initialization-method", "embedding-dimension", "alpha"], r=0.2)
Iterations against likelihood
In [92]:
df_tc_gaussian_iterations = pnd.read_csv("/home/knub/Repositories/Gaussian_LDA/results/old/initialization-mean.dim-50.alpha-0-02.output/iterations.txt", sep="\t")
df_tc_gaussian_iterations.plot("iteration", "likelihood", kind="scatter", label="Likelihood")
Out[92]:
Time for each iteration
In [93]:
df_tc_gaussian_iterations.plot("iteration", "time", kind="scatter")
Out[93]:
Results: Gaussian LDA achieves the lowest TC of all systems. Also, the model cannot handle higher dimensions.
TODO:
NIPS
In [99]:
df_tc_gaussian_nips = load_topic_coherence("gaussian_lda_nips.txt")
df_tc_gaussian_nips.columns = ["iteration", "dataset", "embedding-dimension", "alpha", "TC", "TC_std"]
plot_iteration_for(df_tc_gaussian_nips, ["embedding-dimension", "alpha"], r=1.0)
In [96]:
pnd.read_csv("/home/knub/Repositories/Gaussian_LDA/results/data-nips.dim-50.alpha-0-02/030.topics", sep=" ", header=None)
Out[96]:
Similar to Gaussian LDA, but models a von Mises-Fisher distribution on the unit hypersphere instead of a Gaussian distribution in the embedding space.
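For reference, the vMF density for a unit vector $x$ is $f(x; \mu, \kappa) \propto \exp(\kappa \mu^\top x)$, with mean direction $\mu$ and concentration $\kappa$. A minimal sketch of the unnormalized log-density on L2-normalized embeddings (an illustrative helper, not sHDP's code):
import numpy as np

def vmf_log_density_unnorm(x, mu, kappa):
    # x and mu must be unit vectors (L2-normalized embeddings)
    return kappa * np.dot(mu, x)

w = np.random.randn(50); w /= np.linalg.norm(w)     # embedding projected onto the sphere
mu = np.random.randn(50); mu /= np.linalg.norm(mu)  # topic mean direction
log_p = vmf_log_density_unnorm(w, mu, kappa=20.0)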
Parameters:
So far no results with 20news. Parameter search did not help:
As far as I remember, we couldn't get the same good performance on 20news as on the NIPS dataset (which we reported in Table 2). However, other methods were also doing poorly on the same dataset. One reason for that could be using 50-dimensional word2vec representations; we didn't try other possible pretrained word vector representations with higher dimensions (e.g., GloVe). Also, we didn't try larger minibatch sizes (e.g., 100) for SVI training. These modifications might have improved the results for the 20news corpus.
We've been using the model in the medical domain and we are getting good performance; however, one observation we had was that the quality of the sHDP depends on the quality of the pretrained word vectors we use.
In [24]:
# Parameter grid search (kept commented out, as it has already been run):
# for kappa, tau in [(0.6, 0.8), (0.505, 0.8), (0.6, 0.1), (0.6, 10), (0.6, 100)]:
#     for alpha in [0.5, 0.9, 1.0, 1.5, 10]:
#         for gamma in [0.5, 1.0, 1.5, 10]:
#             run_shdp()
In [25]:
# SVI step-size schedule rho_t = (t + tau)^(-kappa) for the tested parameter settings
x = np.arange(1, 101)
plt.figure()
for kappa, tau in [(0.6, 0.8), (0.95, 0.8), (0.505, 0.8), (0.6, 0.1), (0.6, 10), (0.6, 100)]:
    y = (x + tau)**(-kappa)
    plt.plot(x, y, label="kappa=" + str(kappa) + ", tau=" + str(tau))
plt.legend()
Out[25]:
In [86]:
df_tc_shdp_nips = load_topic_coherence("shdp_nips.txt")
df_tc_shdp_nips.columns = ["embeddings", "dimensions", "seed", "topics", "alpha", "gamma",
"kappa", "tau", "step-size", "TC", "TC_std"]
df_tc_shdp_nips["iteration"] = 1
df_tc_shdp_nips
Out[86]:
In [87]:
plot_iteration_for(df_tc_shdp_nips, ["embeddings"])
NIPS
In [29]:
df_tc_shdp_nips_topics = pnd.read_csv("/home/stefan.bunk/sHDP/results/20news/embeddings-ours.dim-50.seed-1.topics-50.alpha-1-0.gamma-2-0.kappa-0-6.tau-0-8.batch-10/prob-based.topics", header=None, sep=" ")
df_tc_shdp_nips_topics
Out[29]:
Results: As with Gaussian LDA, learning a model and finding the right parameters is hard. The model also seems to have problems with higher dimensions.
TODO:
WELDA resamples the words in the text based on per-topic Gaussian distributions in the embedding space. The resampling probability is $\lambda$.
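A minimal sketch of this resampling step, assuming a fitted Gaussian (mean, cov) per topic and an embedding matrix E (illustrative names; the brute-force nearest-neighbor search stands in for faster structures such as the LSH evaluated further below):
import numpy as np

def maybe_resample_word(word_id, mean, cov, E, lam, rng=np.random):
    # with probability lambda, replace the word by the vocabulary word
    # whose embedding is closest to a draw from the topic's Gaussian
    if rng.rand() >= lam:
        return word_id
    sample = rng.multivariate_normal(mean, cov)  # draw in embedding space
    dists = np.linalg.norm(E - sample, axis=1)   # brute-force NN search
    return int(np.argmin(dists))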
In [5]:
df_tc_welda = load_topic_coherence("gaussian_welda.txt")
df_tc_welda.columns = ["iteration", "distance-method", "lambda", "TC", "TC_std"]
df_tc_welda = df_tc_welda.sort_values(["distance-method", "lambda", "iteration"])
plot_iteration_for(df_tc_welda, ["distance-method", "lambda"])
Results: WELDA increases the TC compared to standard LDA by $\sim 2.8$ points. After TopicVec, it is the second-best model with regard to TC.
In [4]:
df_tc_welda_2 = load_topic_coherence("gaussian_welda2.txt")
df_tc_welda_2.columns = ["embedding-dimension", "distance-method", "lambda", "iteration", "TC", "TC_std"]
df_tc_welda_2 = df_tc_welda_2.sort_values(["lambda", "iteration"])
plot_iteration_for(df_tc_welda_2, ["lambda"])
With 20news.dim-50 embedding
In [51]:
df_tc_welda_gaussian_20news = load_topic_coherence("gaussian_welda.embedding-20news-dim-50.txt")
df_tc_welda_gaussian_20news.columns = ["embedding-dimension", "distance-method", "lambda", "iteration", "TC", "TC_std"]
df_tc_welda_gaussian_20news = df_tc_welda_gaussian_20news.sort_values(["distance-method", "lambda", "iteration"])
plot_iteration_for(df_tc_welda_gaussian_20news, ["distance-method", "lambda"])
NIPS
In [111]:
df_tc_nips_welda = load_topic_coherence("gaussian_welda_nips.txt")
df_tc_nips_welda.columns = ["iteration", "embedding-dimensions", "distance-method", "lambda", "TC", "TC_std"]
df_tc_nips_welda = df_tc_nips_welda[df_tc_nips_welda["iteration"] % 10 == 0]
df_tc_nips_welda.head()
Out[111]:
In [117]:
#plot_iteration_for(df_tc_nips_welda, ["lambda"])
In [49]:
df_tc_nips_welda2 = load_topic_coherence("gaussian_welda_nips2.txt")
df_tc_nips_welda2.columns = ["embedding-dimensions", "distance-method", "lambda", "iteration", "TC", "TC_std"]
plot_iteration_for(df_tc_nips_welda2, ["lambda"])
Results: Again, WELDA increases the TC, here by $0.4$ points.
New sampling method
In [52]:
df_tc_welda3 = load_topic_coherence("gaussian_welda.sampling-corrected.txt")
df_tc_welda3.columns = ["embedding-dimensions", "distance-method", "lambda", "iteration", "TC", "TC_std"]
In [53]:
plot_iteration_for(df_tc_welda3[df_tc_welda3["embedding-dimensions"] == "20news-50"], ["lambda"])
In [312]:
plot_iteration_for(df_tc_welda3[df_tc_welda3["embedding-dimensions"] == "200"], ["lambda"])
Topics from the overfitted lambda=1.0 experiment
In [80]:
pnd.read_csv(
"/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02/model.20news.dim-50.skip-gram.embedding.welda.gaussian.distance-cos.lambda-1-0/welda.iteration-200.topics",
sep=" ", header=None)
Out[80]:
Topics from the best lambda=0.6 experiment
In [54]:
pnd.read_csv(
"/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02/model.20news.dim-50.skip-gram.embedding.welda.gaussian.distance-cos.lambda-0-6/welda.iteration-200.topics",
sep=" ", header=None)
Top sampling
In [319]:
df_tc_welda_top = load_topic_coherence("gaussian_welda_top.txt")
df_tc_welda_top.columns = ["file", "embedding-dimensions", "sampling", "distance-method", "lambda", "iteration", "TC", "TC_std"]
plot_iteration_for(df_tc_welda_top[df_tc_welda_top["embedding-dimensions"] == "20news-50"], ["lambda"])
In [320]:
df_tc_welda_top = load_topic_coherence("gaussian_welda_top.txt")
df_tc_welda_top.columns = ["file", "embedding-dimensions", "sampling", "distance-method", "lambda", "iteration", "TC", "TC_std"]
plot_iteration_for(df_tc_welda_top[df_tc_welda_top["embedding-dimensions"] == "200"], ["lambda"])
In [322]:
df_tc_welda_top[df_tc_welda_top["TC"] > 0.48]
Out[322]:
In [326]:
pnd.read_csv(
"/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02/model.dim-200.skip-gram.embedding.welda.gaussian-top.distance-cos.lambda-0-1/welda.iteration-080.topics",
sep=" ", header=None).drop(0)
Out[326]:
In [327]:
pnd.read_csv(
"/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02/model.dim-200.skip-gram.embedding.welda.gaussian-top.distance-cos.lambda-0-05/welda.iteration-200.topics",
sep=" ", header=None).drop(0)
Out[327]:
PCA sampling analysis
In [336]:
df_tc_welda_sampling_analysis = load_topic_coherence("gaussian_welda_pca.txt")
df_tc_welda_sampling_analysis.columns = ["file", "embedding-dimensions", "pca-dim", "top-words", "lambda", "iteration", "TC", "TC_std"]
del df_tc_welda_sampling_analysis["file"]
df_tc_welda_sampling_analysis[df_tc_welda_sampling_analysis["embedding-dimensions"] == "20news-50"].sort_values(["TC"], ascending=False)
Out[336]:
In [337]:
df_tc_welda_sampling_analysis[df_tc_welda_sampling_analysis["embedding-dimensions"] == "200"].sort_values(["TC"], ascending=False)
Out[337]:
In [171]:
df_tc_welda_gaussian_lambda = load_topic_coherence("welda-gaussian-lambda.txt")
df_tc_welda_gaussian_lambda.columns = ["embedding-dimensions","topic0", "pca", "samples", "lambda", "actual_lambda", "iteration", "TC", "TC_std"]
df_tc_welda_gaussian_lambda = df_tc_welda_gaussian_lambda[df_tc_welda_gaussian_lambda["lambda"].apply(lambda x: x * 10 in [0, 2, 4, 5, 6, 8, 10])]
def merge_runs(df_param):
    # several runs exist per parameter setting; keep the first run's row
    return df_param.iloc[0]
df_tc_welda_gaussian_lambda = df_tc_welda_gaussian_lambda.groupby(["embedding-dimensions", "lambda", "iteration"]).apply(merge_runs).reset_index(drop=True)
#df_tc_welda_gaussian_lambda
plot_iteration_for(df_tc_welda_gaussian_lambda[df_tc_welda_gaussian_lambda["embedding-dimensions"] == "20news-50"],
["actual_lambda"], r=0.2)
plot_iteration_for(df_tc_welda_gaussian_lambda[df_tc_welda_gaussian_lambda["embedding-dimensions"] == "200"],
["actual_lambda"], r=0.2)
NIPS
In [108]:
df_tc_welda_gaussian_nips = load_topic_coherence("welda-gaussian-nips.txt")
df_tc_welda_gaussian_nips.columns = ["embedding-dimensions", "topic0", "pca", "samples", "lambda", "actual_lambda", "iteration", "TC", "TC_std"]
plot_iteration_for(df_tc_welda_gaussian_nips[df_tc_welda_gaussian_nips["embedding-dimensions"] == "nips-50"],
["actual_lambda"], r=0.2, param_filter={0.0, 0.24, 0.43, 0.52, 0.62, 0.81, 1.0})
plt.legend(loc="lower left", ncol=7, numpoints=1, borderpad=0.5, handlelength=0.25, title="$\\lambda_{act}$").get_frame().set_linewidth(0.5)
plot_iteration_for(df_tc_welda_gaussian_nips[df_tc_welda_gaussian_nips["embedding-dimensions"] == "200"],
["actual_lambda"], r=0.2, param_filter={0.0, 0.24, 0.43, 0.52, 0.62, 0.81, 1.0})
plt.legend(loc="lower left", ncol=7, numpoints=1, borderpad=0.5, handlelength=0.25, title="$\\lambda_{act}$").get_frame().set_linewidth(0.5)
In [132]:
df_tc_welda_background_topic = load_topic_coherence("welda-gaussian-background-topic.txt")
df_tc_welda_background_topic.columns = ["embedding-dimensions", "topic0", "pca", "samples", "lambda", "actual_lambda", "iteration", "TC", "TC_std"]
df_tc_welda_background_topic["background"] = "no-background"
df_welda_background = load_topic_coherence("welda-gaussian-lambda-with-background.txt")
df_welda_background.columns = ["embedding-dimensions","topic0", "pca", "samples", "lambda", "actual_lambda", "iteration", "TC", "TC_std"]
df_welda_background["background"] = "background"
df_tc_welda_background_topic_comparison = pnd.concat([df_tc_welda_background_topic, df_welda_background])
plot_iteration_for(df_tc_welda_background_topic_comparison[df_tc_welda_background_topic_comparison["embedding-dimensions"] == 200],
["actual_lambda", "background"], r=0.15)
#leg = plt.legend(bbox_to_anchor=(1., 0.34))
leg = plt.legend(loc="lower right", ncol=1, numpoints=1, borderpad=0.5, handlelength=0.25, title="$N_{pca}$")
plt.tight_layout()
plt.axhline(y=45.1, linewidth=1.4, color=DARK_COLORS[0], ls='dashed')
plt.axhline(y=45.6, linewidth=1.4, color=DARK_COLORS[1], ls='dashed')
leg.get_frame().set_linewidth(0.5)
leg.get_texts()[0].set_text('$\\lambda$ = 0.2 without background topic')
leg.get_texts()[1].set_text('$\\lambda$ = 0.2 with background topic')
leg.get_texts()[2].set_text('$\\lambda$ = 0.5 without background topic')
leg.get_texts()[3].set_text('$\\lambda$ = 0.5 with background topic')
In [62]:
#df_tc_welda_gaussian_pca_samples = load_topic_coherence("palmetto_welda-gaussian-lambda-more-runs.log")
#df_tc_welda_gaussian_pca_samples.columns = ["file", "embeddings", "run", "topic0", "pca", "samples", "lambda_v", "actual_lambda", "iteration", "TC", "TC_std"]
#del df_tc_welda_gaussian_pca_samples["file"]
#def merge_runs(df_param):
# tcs = df_param["TC"]
# tcs = sorted(tcs)
# #tc =
# df_return = df_param.iloc[0]
# #df_return["TC"] = tc
# return df_return
#df_tc_welda_gaussian_pca_samples = df_tc_welda_gaussian_pca_samples.query("(lambda_v == 0.5) & (embeddings == '200')")
#df_tc_welda_gaussian_pca_samples = df_tc_welda_gaussian_pca_samples.groupby(["embeddings", "lambda_v", "iteration"]).apply(merge_runs).reset_index(drop=True)
#df_tc_welda_gaussian_pca_samples.query("embeddings == '200'")
In [241]:
df_tc_welda_gaussian_pca_samples = load_topic_coherence("welda-gaussian-pca-samples-short.log")
df_tc_welda_gaussian_pca_samples.columns = ["embedding-dimensions", "topic0", "pca", "samples", "lambda", "actual_lambda", "iteration", "TC", "TC_std"]
del df_tc_welda_gaussian_pca_samples["topic0"]
del df_tc_welda_gaussian_pca_samples["embedding-dimensions"]
def print_pca_correlation(df_param):
    # correlation between PCA dimensionality and TC at the final iteration (200)
    for samples, df_group in df_param[df_param.iteration == 200].groupby(["samples"]):
        print samples
        print df_group["pca"].corr(df_group["TC"])
def plot_pca_samples(df_param):
colors = iter(DARK_COLORS)
plt.figure(figsize=cm2inch(14.69, 7.9), dpi=300)
for pca, df_group in df_param.groupby("pca"):
df_tmp = df_group.groupby("samples")["TC"].max().to_frame()
df_tmp["samples"] = df_tmp.index
print "%d %f" % (pca, df_tmp["samples"].corr(df_tmp["TC"], "pearson"))
#print df_tmp
c = next(colors)
plt.plot(df_tmp.index, df_tmp["TC"] * 100, linestyle="None", color='white',
label="%s" % str(pca), marker='o', markerfacecolor=c, markersize=5)
        # linear fit of max TC over the number of samples; the slope is scaled
        # to the full x-range (0-400) to compare trends across PCA settings
        m, b = np.polyfit(df_tmp.index, df_tmp["TC"] * 100, 1)
        print m * 400
        plt.plot([0, 400], m*pnd.Series([0, 400]) + b, '-', color=c, label='_nolegend_')
plt.xlabel("\\# of samples $N_{top}$")
plt.ylabel("Topic Coherence $TC$")
#plt.legend(loc="lower right", numpoints=1)
plt.legend(loc="lower right", ncol=5, numpoints=1, borderpad=0.5, handlelength=0.25, title="$N_{pca}$").get_frame().set_linewidth(0.5)
plt.tight_layout()
plot_pca_samples(df_tc_welda_gaussian_pca_samples[df_tc_welda_gaussian_pca_samples["lambda"] == 0.5])
print_pca_correlation(df_tc_welda_gaussian_pca_samples[df_tc_welda_gaussian_pca_samples["lambda"] == 0.5])
plt.xlim((-10, 420))
plt.ylim((42.0, 49))
Out[241]:
In [243]:
df_tc_welda_gaussian_nips_pca_samples = load_topic_coherence("welda-gaussian-nips-pca-samples.log")
df_tc_welda_gaussian_nips_pca_samples.columns = ["embedding-dimensions", "topic0", "pca", "samples", "lambda", "actual_lambda", "iteration", "TC", "TC_std"]
del df_tc_welda_gaussian_nips_pca_samples["topic0"]
del df_tc_welda_gaussian_nips_pca_samples["embedding-dimensions"]
plot_pca_samples(df_tc_welda_gaussian_nips_pca_samples[df_tc_welda_gaussian_nips_pca_samples["lambda"] == 0.5])
print_pca_correlation(df_tc_welda_gaussian_nips_pca_samples[df_tc_welda_gaussian_nips_pca_samples["lambda"] == 0.5])
plt.xlim((-10, 420))
plt.ylim((40.0, 47.0))
Out[243]:
In [37]:
df_tc_welda_gaussian_random_init = load_topic_coherence("welda-gaussian-random-init.log")
df_tc_welda_gaussian_random_init.columns = ["embedding-dimensions","topic0", "pca", "samples", "lambda", "actual_lambda", "iteration", "TC", "TC_std"]
plot_iteration_for(df_tc_welda_gaussian_random_init,
["embedding-dimensions"], r=0.2)
plt.xlim((-20, 1540))
leg = plt.legend(loc="lower right", numpoints=1, borderpad=0.5, handlelength=0.25, title="Embedding model")
leg.get_frame().set_linewidth(0.5)
leg.get_texts()[0].set_text('Wikipedia, 200 dimensions')
leg.get_texts()[1].set_text('\\textsc{20News}, 50 dimensions')
#plt.legend(loc="lower right")
In [77]:
fluctuations_corpus_specific_embeddings = pnd.Series([0.455, 0.452, 0.450, 0.445, 0.450, 0.448, 0.449, 0.453, 0.450, 0.450])
fluctuations_corpus_specific_embeddings *= 100
print fluctuations_corpus_specific_embeddings.mean()
print fluctuations_corpus_specific_embeddings.std()
fluctuations_wiki_embeddings = pnd.Series([0.448, 0.450, 0.449, 0.451, 0.453, 0.445, 0.450, 0.451, 0.448, 0.449])
fluctuations_wiki_embeddings *= 100
print fluctuations_wiki_embeddings.mean()
print fluctuations_wiki_embeddings.std()
In [135]:
df_tc_welda_vmf_20news = load_topic_coherence("vmf_welda.embedding-20news-dim-50.txt")
df_tc_welda_vmf_20news.columns = ["embedding-dimension", "distance-method", "lambda", "iteration", "TC", "TC_std"]
df_tc_welda_vmf_20news = df_tc_welda_vmf_20news.sort_values(["distance-method", "lambda", "iteration"])
plot_iteration_for(df_tc_welda_vmf_20news, ["distance-method", "lambda"])
In [136]:
df_tc_welda_vmf = load_topic_coherence("vmf_welda.conc-20.txt")
df_tc_welda_vmf.columns = ["embedding-dimension", "distance-method", "concentration", "lambda", "iteration", "TC", "TC_std"]
del df_tc_welda_vmf["concentration"]
df_tc_welda_vmf = df_tc_welda_vmf.sort_values(["distance-method", "lambda", "iteration"])
plot_iteration_for(df_tc_welda_vmf, ["distance-method", "lambda"])
In [33]:
df_tc_welda_vmf_conc50 = load_topic_coherence("vmf_welda.conc-50.txt")
df_tc_welda_vmf_conc50.columns = ["embedding-dimension", "distance-method", "concentration", "lambda", "iteration", "TC", "TC_std"]
del df_tc_welda_vmf_conc50["concentration"]
df_tc_welda_vmf_conc50 = df_tc_welda_vmf_conc50.sort_values(["distance-method", "lambda", "iteration"])
plot_iteration_for(df_tc_welda_vmf_conc50, ["distance-method", "lambda"])
Full kappa-factor investigation
In [294]:
df_tc_welda_vmf_kappa = load_topic_coherence("vmf_welda.conc-experiments.txt")
df_tc_welda_vmf_kappa.columns = ["embedding-dimension", "distance-method", "concentration", "lambda", "iteration", "TC", "TC_std"]
df_tc_welda_vmf_kappa = df_tc_welda_vmf_kappa.sort_values(["distance-method", "lambda", "iteration"])
TODO: BETTER PLOTS
In [309]:
plot_iteration_for(
df_tc_welda_vmf_kappa[(df_tc_welda_vmf_kappa["embedding-dimension"] == "200") &
(df_tc_welda_vmf_kappa["lambda"].apply(lambda x: x in [0.0, 0.1, 0.5, 1.0]))],
["concentration", "lambda"])
In [175]:
df_tc_welda_vmf_lambda
Out[175]:
In [184]:
df_tc_welda_vmf_lambda = load_topic_coherence("welda-vmf.log")
df_tc_welda_vmf_lambda.columns = ["embedding-dimensions", "topic-distribution", "topic0", "pca", "samples", "lambda", "kappa-factor", "actual_lambda", "iteration", "TC", "TC_std"]
df_tc_welda_vmf_lambda = df_tc_welda_vmf_lambda[df_tc_welda_vmf_lambda["lambda"].apply(lambda x: x * 10 in [2, 4, 5, 6, 8])]
def merge_runs(df_param):
tcs = df_param["TC"]
tcs = sorted(tcs)
tc = max(tcs)
df_return = df_param.iloc[0]
df_return["TC"] = tc
#row = df_param.loc[df_param['TC'].idxmax()]
#print row["kappa-factor"]
return df_return
df_tc_welda_vmf_lambda = df_tc_welda_vmf_lambda.groupby(["embedding-dimensions", "lambda", "iteration"]).apply(merge_runs).reset_index(drop=True)
plot_iteration_for(df_tc_welda_vmf_lambda[df_tc_welda_vmf_lambda["embedding-dimensions"] == "20news-50"],
["actual_lambda"], r=0.2, colors_offset=1, dpi=300)
plt.ylim((41.1, 48.0))
plt.scatter([218], [45.8], marker="D", s=20, c=COLORS[1], zorder=20, lw=0.5)
plt.scatter([218], [47.1], marker="D", s=20, c=COLORS[2], zorder=20, lw=0.5)
plt.scatter([218], [46.7], marker="D", s=20, c=COLORS[3], zorder=20, lw=0.5)
plt.scatter([218], [45.5], marker="D", s=20, c=COLORS[4], zorder=20, lw=0.5)
plt.scatter([218], [45.9], marker="D", s=20, c=COLORS[5], zorder=20, lw=0.5)
plot_iteration_for(df_tc_welda_vmf_lambda[df_tc_welda_vmf_lambda["embedding-dimensions"] == "200"],
["actual_lambda"], r=0.2, colors_offset=1, dpi=300)
plt.scatter([218], [45.0], marker="D", s=20, c=COLORS[1], zorder=20, lw=0.5)
plt.scatter([218], [45.6], marker="D", s=20, c=COLORS[2], zorder=20, lw=0.5)
plt.scatter([218], [45.5], marker="D", s=20, c=COLORS[3], zorder=20, lw=0.5)
plt.scatter([218], [44.8], marker="D", s=20, c=COLORS[4], zorder=20, lw=0.5)
plt.scatter([218], [43.4], marker="D", s=20, c=COLORS[5], zorder=20, lw=0.5)
Out[184]:
For the 20news corpus:
In [40]:
df_runtimes = pnd.DataFrame.from_records([
("LDA", "iterations=1500, threads=4", "8min"),
("LFLDA", "iterations=400, embedding-dimension=200, lambda=0.6, threads=5", "23h 30min"),
("Gaussian LDA", "iterations=100, embedding-dimension=50, threads=2", "19h 40min"),
("topicvec", "iterations=100, embedding-dimension=100, threads=4", "45min"),
("topicvec", "iterations=100, embedding-dimension=200, threads=4", "53min"),
("sHDP on NIPS!", "kappa=0.8, tau=0.6", "20 min"),
("WELDA", "iterations=100, lambda=0.5, threads=4", "4 h"),
], columns=["method", "notes", "runtime"])
df_runtimes
Out[40]:
In [227]:
df_welda_runtime = pnd.DataFrame.from_records([
(0.0, 900),
(0.1, 1775),
(0.2, 2785),
(0.3, 4027),
(0.4, 4926),
(0.5, 6100),
(0.6, 6411),
(0.7, 7109),
(0.8, 7773),
(0.9, 8856),
(1.0, 9826)
], columns=["lambda", "runtime"])
df_welda_runtime["runtime_min"] = df_welda_runtime["runtime"] / 60
plt.figure(figsize=cm2inch(12, 6.75), dpi=300)
plt.scatter(df_welda_runtime["lambda"], df_welda_runtime["runtime_min"], c="black", marker="x", s=40)
plt.xlabel("$\\lambda$")
plt.ylabel("Runtime in min")
plt.tight_layout()
plt.xlim((-0.05, 1.05))
Out[227]:
In [109]:
import multiprocessing
for p in multiprocessing.active_children():
p.terminate()
p.join()
multiprocessing.active_children()
Out[109]:
In [ ]:
def load_document_topics(folder):
if os.path.isdir(folder):
document_topics = [f for f in os.listdir(folder) if "document-topics" in f or "theta" in f]
assert len(document_topics) == 1
document_topics = folder + "/" + document_topics[0]
else:
document_topics = folder
df_return = pnd.read_csv(document_topics, sep=" ", header=None)
if len(df_return.columns) == 50:
df_return.columns = range(1, 51)
elif len(df_return.columns) == 51:
df_return.columns = ["class"] + range(1, 51)
elif len(df_return.columns) == 52:
df_return.columns = ["class", "freq"] + range(1, 51)
elif len(df_return.columns) == 201:
df_return.columns = ["class"] + range(1, 201)
elif len(df_return.columns) == 250:
df_return.columns = range(1, 251)
elif len(df_return.columns) == 251:
df_return.columns = ["class"] + range(1, 251)
else:
raise Exception("unknown column count " + str(len(df_return.columns)))
return df_return
def get_evaluation_metrics(y_test, y_predictions, average="macro"):
precision = metrics.precision_score(y_test, y_predictions, average=average)
recall = metrics.recall_score(y_test, y_predictions, average=average)
f1 = metrics.f1_score(y_test, y_predictions, average=average)
accuracy = metrics.accuracy_score(y_test, y_predictions)
return precision, recall, f1, accuracy
def evaluate_document_classification(df_param, print_eval=False):
nr_classes = len(set(df_param["class"].tolist()))
nr_features = len(df_param.columns) - 1
if print_eval:
print "nr_classes = %d, nr_features = %d" % (nr_classes, nr_features)
    train, test = train_test_split(df_param, test_size=0.2, stratify=df_param["class"], random_state=21011991)
    feature_columns = df_param.columns[1:]
assert len(feature_columns) == nr_features
if print_eval:
print "feature_columns = %s" % str(feature_columns)
X_train = train.as_matrix(feature_columns)
#X_train = normalize(X_train, norm="l1")
X_test = test.as_matrix(feature_columns)
if print_eval:
print X_train.shape
print X_test.shape
model = svm.LinearSVC(penalty='l1', dual=False, random_state=21011991)
y_train = train["class"].tolist()
model.fit(X_train, y_train)
y_test = test["class"].tolist()
y_predictions = model.predict(X_test)
if print_eval:
print metrics.classification_report(y_test, y_predictions, digits=2)
return get_evaluation_metrics(y_test, y_predictions)
def evaluate_experiment(f_param, classes_file, avg_we_file):
df_return = load_document_topics(f_param)
if classes_file:
df_return.insert(0, "class",
pnd.read_csv("/data/wikipedia/2016-06-21/topic-models/" + classes_file,
header=None))
if avg_we_file:
df_avg = pnd.read_csv(avg_we_file, header=None, sep=" ")
nr_avg_dimensions = len(df_avg.columns) - 1
avg_column_names = ["avg-" + str(i) for i in range(1, nr_avg_dimensions + 1)]
df_avg.columns = ["class_avg"] + avg_column_names
assert len(df_avg) == len(df_return), "len(df_avg) = %d != len(df_return) = %d" % (len(df_avg), len(df_return))
df_return = pnd.concat([df_return, df_avg], axis=1)
df_return = df_return[df_return["class_avg"] >= 0]
assert all(df_return["class"] == df_return["class_avg"])
del df_return["class_avg"]
return evaluate_document_classification(df_return, print_eval=False)
def run_experiment(exp_tuple, classes_file):
e, avg_we = exp_tuple
if "google" in e:
embedding_data = "google"
elif ".20news.dim-50" in e:
embedding_data = "20news.dim-50"
elif "dim-50" in e:
embedding_data = "dim-50"
elif "dim-200" in e:
embedding_data = "dim-200"
else:
embedding_data = None
#raise Exception(e + ": embedding not known")
params = parse_params(e)
prec, rec, f1, acc = evaluate_experiment(e, classes_file, avg_we)
exp_name = ""
if os.path.isfile(e):
exp_name = os.path.basename(os.path.dirname(e)) + "/" + os.path.basename(e)
else:
exp_name = os.path.basename(e)
avg_we = os.path.basename(avg_we) if avg_we else None
return (params, (exp_name, embedding_data, avg_we, prec, rec, f1, acc))
def evaluate_multiple_experiments(experiment_files, classes_file, avg_embeddings_file, evaluate_raw=True, threads=1):
exp_tuples = []
for e in experiment_files:
if avg_embeddings_file:
exp_tuples.append((e, avg_embeddings_file))
if evaluate_raw:
exp_tuples.append((e, None))
if threads == 1:
results = map(partial(run_experiment, classes_file=classes_file), exp_tuples)
    else:
        p = Pool(threads)
        try:
            results = p.map(partial(run_experiment, classes_file=classes_file), exp_tuples)
        finally:
            p.close()
params_lists = defaultdict(list)
for params, result_tuple in results:
for k, v in params.iteritems():
params_lists[k].append(v)
results = [r[1] for r in results]
df_return = pnd.DataFrame.from_records(results, columns=["experiment", "embedding", "avg-embedding", "precision", "recall", "f1", "accuracy"])
for p in ["lambda"]:
if p in params_lists.keys():
df_return.insert(1, p, params_lists[p])
df_return = df_return.sort_values(["embedding"])
return df_return
In [4]:
pnd.get_option('display.max_columns')
pnd.set_option('display.max_columns', 300)
pnd.set_option('display.max_rows', 300)
In [116]:
df_avg_embeddings = evaluate_multiple_experiments([
"/data/wikipedia/2016-06-21/topic-models/topic.20news.250-1500.alpha-0-02.beta-0-02/model.dim-200.skip-gram.embedding.avg-embedding",
"/data/wikipedia/2016-06-21/topic-models/topic.20news.250-1500.alpha-0-02.beta-0-02/model.dim-50.skip-gram.embedding.avg-embedding",
"/data/wikipedia/2016-06-21/topic-models/topic.20news.250-1500.alpha-0-02.beta-0-02/model.20news.dim-50.skip-gram.embedding.avg-embedding",
], None, None, threads=3)
df_avg_embeddings
Out[116]:
In [22]:
df_avg_embeddings
Out[22]:
In [26]:
dcs_lda_experiments = sorted(find_files("/data/wikipedia/2016-06-21/topic-models", "20news.50-1500.alpha"))
df_dcs_lda = evaluate_multiple_experiments(dcs_lda_experiments,
None,
"/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02/model.dim-200.skip-gram.embedding.avg-embedding",
threads=8)
df_dcs_lda
Out[26]:
In [74]:
dcs_lda50_experiments = sorted(find_files("/data/wikipedia/2016-06-21/topic-models", "20news.50-1500.alpha-0-02.beta-0-02"))
df_dcs_lda50 = evaluate_multiple_experiments(dcs_lda50_experiments,
None,
"/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02/model.20news.dim-50.skip-gram.embedding.avg-embedding.fix",
threads=1, evaluate_raw=False)
df_dcs_lda50
Out[74]:
In [ ]:
dcs_lda250_experiments = sorted(find_files("/data/wikipedia/2016-06-21/topic-models", "20news.250-1500.alpha"))
df_dcs_lda250 = evaluate_multiple_experiments(dcs_lda250_experiments,
None,
"/data/wikipedia/2016-06-21/topic-models/topic.20news.250-1500.alpha-0-02.beta-0-02/model.dim-200.skip-gram.embedding.avg-embedding",
threads=12)
df_dcs_lda250
Out[ ]:
In [ ]:
dcs_lda50_250_experiments = sorted(find_files("/data/wikipedia/2016-06-21/topic-models", "20news.250-1500.alpha-0-02.beta-0-02"))
df_dcs_lda50_250 = evaluate_multiple_experiments(dcs_lda50_250_experiments,
None,
"/data/wikipedia/2016-06-21/topic-models/topic.20news.250-1500.alpha-0-02.beta-0-02/model.20news.dim-50.skip-gram.embedding.avg-embedding.fix",
threads=1)
df_dcs_lda50_250
In [84]:
df_dcs_lda50_250
Out[84]:
50 topics 20news-50
In [ ]:
dcs_welda50_experiment_folder = "/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02"
dcs_welda50_experiments = find_files(dcs_welda50_experiment_folder, "model.20news.dim-50.skip-gram.embedding.welda.gaussian.welda-gaussian-lambda.run-0.topic0-no.pca-10.des-20.lambda-0-0.")
dcs_welda50_experiments = [f + "/welda.iteration-200.document-topics" for f in dcs_welda50_experiments]
df_dcs_welda50 = evaluate_multiple_experiments(dcs_welda50_experiments,
"topic.20news.50-1500.alpha-0-02.beta-0-02/model.20news.dim-50.skip-gram.embedding.restricted.classes",
"/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02/model.20news.dim-50.skip-gram.embedding.avg-embedding",
threads=6)
df_dcs_welda50.sort_values(["embedding", "lambda", "avg-embedding"])
In [79]:
df_dcs_welda50
Out[79]:
In [14]:
dcs_welda50_experiment_folder = "/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02"
dcs_welda50_experiments = find_files(dcs_welda50_experiment_folder, "model.20news.dim-50.skip-gram.embedding.welda.gaussian.welda-gaussian-lambda.run-0.topic0-no.pca-10.des-20.")
dcs_welda50_experiments = [f + "/welda.iteration-200.document-topics" for f in dcs_welda50_experiments]
df_dcs_welda50 = evaluate_multiple_experiments(dcs_welda50_experiments,
"topic.20news.50-1500.alpha-0-02.beta-0-02/model.20news.dim-50.skip-gram.embedding.restricted.classes",
"/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02/model.20news.dim-50.skip-gram.embedding.avg-embedding",
threads=6)
df_dcs_welda50.sort_values(["embedding", "lambda", "avg-embedding"])
Out[14]:
In [46]:
df_dcs_welda50["avg-embedding"].fillna('missing')
Out[46]:
50 topics
In [86]:
def plot():
plt.figure()
it = iter(DARK_COLORS)
df_dcs_welda50["avg-embedding"].fillna("no embeddings", inplace=True)
for avg_embedding, df_group in df_dcs_welda50.groupby("avg-embedding"):
print avg_embedding
plt.scatter(list(df_group["lambda"]), df_group["f1"], c=next(it), s=80, label=avg_embedding)
df_dcs_welda["avg-embedding"].fillna("no embeddings", inplace=True)
for avg_embedding, df_group in df_dcs_welda.groupby("avg-embedding"):
print avg_embedding
plt.scatter(list(df_group["lambda"]), df_group["f1"], c=next(it), s=80, label=avg_embedding)
plt.ylim((0.0, 1.0))
plt.axhline(y=0.779683, label="foo")
plt.axhline(y=0.698922, label="bar")
#plt.legend()
plot()
In [19]:
dcs_welda_experiment_folder = "/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02"
dcs_welda_experiments = find_files(dcs_welda_experiment_folder, "model.dim-200.skip-gram.embedding.welda.gaussian.welda-gaussian-lambda.run-0.topic0-no.pca-10.des-20")
dcs_welda_experiments = [f + "/welda.iteration-200.document-topics" for f in dcs_welda_experiments]
df_dcs_welda = evaluate_multiple_experiments(dcs_welda_experiments,
"topic.20news.50-1500.alpha-0-02.beta-0-02/model.dim-200.skip-gram.embedding.restricted.classes",
"/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02/model.dim-200.skip-gram.embedding.avg-embedding",
threads=6)
df_dcs_welda.sort_values(["embedding", "lambda", "avg-embedding"])
Out[19]:
250 topics
In [30]:
dcs_welda250_experiments = find_files(
"/data/wikipedia/2016-06-21/topic-models/topic.20news.250-1500.alpha-0-02.beta-0-02", "dim-200.skip-gram.embedding.welda.gaussian")
dcs_welda250_experiments = [f + "/welda.iteration-100.document-topics" for f in dcs_welda250_experiments]
df_dcs_welda250 = evaluate_multiple_experiments(dcs_welda250_experiments,
"topic.20news.250-1500.alpha-0-02.beta-0-02/model.dim-200.skip-gram.embedding.restricted.classes",
"/data/wikipedia/2016-06-21/topic-models/topic.20news.250-1500.alpha-0-02.beta-0-02/model.dim-200.skip-gram.embedding.avg-embedding",
threads=15)
df_dcs_welda250.sort_values(["embedding", "lambda", "avg-embedding"])
Out[30]:
In [111]:
dcs_welda50_250_experiments_base = find_files(
"/data/wikipedia/2016-06-21/topic-models/topic.20news.250-1500.alpha-0-02.beta-0-02", "dim-50.skip-gram.embedding.welda.gaussian..")
dcs_welda50_250_experiments = sorted(
[f + "/welda.iteration-100.document-topics" for f in dcs_welda50_250_experiments_base] +
[f + "/welda.iteration-200.document-topics" for f in dcs_welda50_250_experiments_base])
df_dcs_welda50_250 = evaluate_multiple_experiments(dcs_welda50_250_experiments,
"topic.20news.250-1500.alpha-0-02.beta-0-02/model.20news.dim-50.skip-gram.embedding.restricted.classes",
"/data/wikipedia/2016-06-21/topic-models/topic.20news.250-1500.alpha-0-02.beta-0-02/model.20news.dim-50.skip-gram.embedding.avg-embedding",
threads=10)
df_dcs_welda50_250.sort_values(["embedding", "lambda", "avg-embedding"])
Out[111]:
In [197]:
foo = sorted(find_files(
"/data/wikipedia/2016-06-21/topic-models/topic.20news.250-1500.alpha-0-02.beta-0-02/model.20news.dim-50.skip-gram.embedding.welda.gaussian.distance-cos.lambda-0-5", "document-topics"))
df_foo = evaluate_multiple_experiments(foo,
"topic.20news.250-1500.alpha-0-02.beta-0-02/model.20news.dim-50.skip-gram.embedding.restricted.classes",
    None,
    threads=15)
df_foo
#df_dcs_welda250.sort_values(["embedding", "lambda"])
#evaluate_experiment("/data/wikipedia/2016-06-21/topic-models/topic.20news.250-1500.alpha-0-02.beta-0-02/model.20news.dim-50.skip-gram.embedding.welda.gaussian.distance-cos.lambda-0-5/welda.iteration-000.document-topics",
# )
#del df_doc_class["file"]
#evaluate_document_classification(df_doc_class)
Out[197]:
In [35]:
dcs_welda_vmf_experiment_folder = "/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02"
dcs_welda_vmf_experiments = [f + "/welda.iteration-100.document-topics" for f in find_files(dcs_welda_vmf_experiment_folder,
"dim-200.skip-gram.embedding.welda.vmf.")]
dcs_welda_vmf_experiments = [f for f in dcs_welda_vmf_experiments if os.path.exists(f)]
df_dcs_welda_vmf = evaluate_multiple_experiments(dcs_welda_vmf_experiments,
"topic.20news.50-1500.alpha-0-02.beta-0-02/model.dim-200.skip-gram.embedding.restricted.classes",
"/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02/model.dim-200.skip-gram.embedding.avg-embedding",
threads=16)
#df_dcs_welda = df_dcs_welda[df_dcs_welda["lambda"] != 0.0]
df_dcs_welda_vmf.sort_values(["embedding", "lambda", "avg-embedding"])
Out[35]:
In [13]:
df_dcs_welda_vmf.sort_values(["embedding", "lambda", "avg-embedding"])
Out[13]:
In [37]:
dcs_lflda_experiment_folder = "/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02"
dcs_lflda_experiments = [f + "/iteration-100.document-topics" for f in find_files(dcs_lflda_experiment_folder, "lflda.")]
df_dcs_lflda = evaluate_multiple_experiments(dcs_lflda_experiments,
"topic.20news.50-1500.alpha-0-02.beta-0-02/model.dim-200.skip-gram.embedding.restricted.classes",
None,
#"/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02/model.dim-200.skip-gram.embedding.avg-embedding",
threads=8)
df_dcs_lflda
Out[37]:
In [102]:
dcs_lflda50_experiment_folder = "/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02"
dcs_lflda50_experiments = sorted([f + "/iteration-100.document-topics" for f in find_files(dcs_lflda50_experiment_folder, "lflda.dim-200.")])
dcs_lflda50_experiments
Out[102]:
In [101]:
dcs_lflda50_experiment_folder = "/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02"
dcs_lflda50_experiments = sorted([f + "/iteration-100.document-topics" for f in find_files(dcs_lflda50_experiment_folder, "lflda.dim-50.")])
df_dcs_lflda50 = evaluate_multiple_experiments(dcs_lflda50_experiments,
"topic.20news.50-1500.alpha-0-02.beta-0-02/model.20news.dim-50.skip-gram.embedding.restricted.classes",
"/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02/model.20news.dim-50.skip-gram.embedding.avg-embedding",
threads=11)
df_dcs_lflda50
Out[101]:
In [57]:
dcs_lflda250_experiment_folder = "/data/wikipedia/2016-06-21/topic-models/topic.20news.250-1500.alpha-0-02.beta-0-02"
dcs_lflda250_experiments = [f + "/iteration-100.document-topics" for f in find_files(dcs_lflda250_experiment_folder, "lflda.")]
df_dcs_lflda250 = evaluate_multiple_experiments(dcs_lflda250_experiments,
"topic.20news.250-1500.alpha-0-02.beta-0-02/model.dim-200.skip-gram.embedding.restricted.classes",
"/data/wikipedia/2016-06-21/topic-models/topic.20news.250-1500.alpha-0-02.beta-0-02/model.dim-200.skip-gram.embedding.avg-embedding",
threads=4)
df_dcs_lflda250
Out[57]:
In [93]:
dcs_lflda50_250_experiment_folder = "/data/wikipedia/2016-06-21/topic-models/topic.20news.250-1500.alpha-0-02.beta-0-02"
dcs_lflda50_250_experiments = [f + "/iteration-100.document-topics" for f in find_files(dcs_lflda50_250_experiment_folder, "lflda.dim-50")]
df_dcs_lflda50_250 = evaluate_multiple_experiments(dcs_lflda50_250_experiments,
"topic.20news.250-1500.alpha-0-02.beta-0-02/model.20news.dim-50.skip-gram.embedding.restricted.classes",
"/data/wikipedia/2016-06-21/topic-models/topic.20news.250-1500.alpha-0-02.beta-0-02/model.20news.dim-50.skip-gram.embedding.avg-embedding",
threads=8)
df_dcs_lflda50_250
Out[93]:
Infers topics for each category
In [14]:
df_dcs_topicvec = pnd.DataFrame.from_records([
("50-dimensions, no separate topics (50), raw topic proportions", 0.351, 0.330, 0.306, 0.341),
("50-dimensions, no separate topics (50), topic proportions + mean WE", 0.611, 0.611, 0.597, 0.624),
("200-dimensions, no separate topics (50), raw topic proportions", 0.347, 0.344, 0.314, 0.355),
("200-dimensions, no separate topics (50), topic proportions + mean WE", 0.742, 0.731, 0.721, 0.745),
("200-dimensions, no separate topics (250), raw topic proportions", 0.409, 0.373, 0.337, 0.385),
("200-dimensions, no separate topics (250), topic proportions + mean WE", 0.743, 0.734, 0.725, 0.748),
("200-dimensions, separate topics per category, raw topic proportions", 0.743, 0.739, 0.739, 0.746),
("200-dimensions, separate topics per category, topic proportions + mean WE", 0.786, 0.783, 0.783, 0.791),
], columns=["method", "precision", "recall", "f1", "accuracy"])
df_dcs_topicvec
Out[14]:
In [105]:
df_dc_topicvec = evaluate_multiple_experiments(
["/home/stefan.bunk/topicvec/results/corpus-orig.dim-200.iterations-500"], None)
df_dc_topicvec
Out[105]:
50 topics, Wikipedia
In [81]:
df_dc_gaussian = evaluate_multiple_experiments(["/home/stefan.bunk/Gaussian_LDA/results/dim-50.alpha-0-02/100.document-topic"],
"topic.20news.50-1500.alpha-0-02.beta-0-02/model.dim-50.skip-gram.embedding.restricted.classes",
"/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02/model.dim-200.skip-gram.embedding.avg-embedding")
df_dc_gaussian
Out[81]:
250 topics, Wikipedia-200
In [87]:
df_dc_gaussian250 = evaluate_multiple_experiments(["/home/stefan.bunk/Gaussian_LDA/results/topics-250.data-nips.dim-200.alpha-0-02/100.document-topic"],
"topic.20news.250-1500.alpha-0-02.beta-0-02/model.dim-200.skip-gram.embedding.restricted.classes",
"/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02/model.dim-200.skip-gram.embedding.avg-embedding")
df_dc_gaussian250
Out[87]:
250 topics, Wikipedia-50
In [117]:
df_dc_gaussian50_250 = evaluate_multiple_experiments(["/home/stefan.bunk/Gaussian_LDA/results/topics-250.data-20news.dim-50.alpha-0-02/100.document-topic"],
"topic.20news.250-1500.alpha-0-02.beta-0-02/model.dim-50.skip-gram.embedding.restricted.classes",
"/data/wikipedia/2016-06-21/topic-models/topic.20news.250-1500.alpha-0-02.beta-0-02/model.dim-200.skip-gram.embedding.avg-embedding",
threads=2)
df_dc_gaussian50_250
Out[117]:
Waiting for parameter settings
In [38]:
def read_restricted_vectors(vector_file):
vectors = dict()
with open(vector_file, "r") as f:
for line in f:
split = line.rstrip().split()
word = split[0]
vector = [float(v) for v in split[1:]]
vectors[word] = vector
return vectors
def read_topics(topics_file):
with open(topics_file, "r") as f:
first_line = f.readline()
if "topic-count" in first_line:
header = "infer"
else:
header = None
    df_return = pnd.read_csv(topics_file, sep=" ", header=header)
    # keep only the ten top-word columns, dropping any leading id/count columns
    df_return = df_return[df_return.columns[-10:]]
    return df_return
In [40]:
VECTOR_FILE_20NEWS_50 = "/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02/20news.dim-50.skip-gram.embedding.restricted.vocab.embedding.txt"
VECTOR_FILE_200 = "/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02/dim-200.skip-gram.embedding.restricted.vocab.embedding.txt"
VECTOR_FILE_20NEWS_50 = "/home/knub/Repositories/master-thesis/models/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02/20news.dim-50.skip-gram.embedding.restricted.vocab.embedding.txt"
VECTOR_FILE_200 = "/home/knub/Repositories/master-thesis/models/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02/dim-200.skip-gram.embedding.restricted.vocab.embedding.txt"
vectors_200 = read_restricted_vectors(VECTOR_FILE_200)
vectors_20news_50 = read_restricted_vectors(VECTOR_FILE_20NEWS_50)
Analyzing the base topics from the original LDA
In [41]:
TOPICS = "/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02/model.ssv"
TOPICS = "/home/knub/Repositories/master-thesis/models/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02/model.ssv"
df_topics_lda = read_topics(TOPICS)
df_topics_lda.head(1)
Out[41]:
In [284]:
def get_vector(w, vectors):
if w in vectors:
return vectors[w]
elif w.capitalize() in vectors:
return vectors[w.capitalize()]
elif w.upper() in vectors:
return vectors[w.upper()]
else:
raise Exception("Word '%s' not found" % w)
def get_words_from_topics(df_param):
l = list()
for topic_id, row in enumerate(df_param.values):
for word in row:
l.append((topic_id, word))
return l
def plot_topics_in_embedding_space(reduction_method, df_param, vectors,
show_words=True, highlight_topics=None, alpha=1.0, show_gaussians=False, draw_own=False):
words = get_words_from_topics(df_param)
words = [(t, w) for t, w in words if w and (w in vectors or w.capitalize() in vectors or w.upper() in vectors)]
#print words
embeddings = [get_vector(w, vectors) for _, w in words]
embeddings = np.array(embeddings)
#print embeddings.shape
X = reduction_method(embeddings)
X_mapping = {w[1]: X[i,:] for i, w in enumerate(words)}
df_tmp = pnd.DataFrame()
df_tmp["x"] = X[:,0]
df_tmp["y"] = X[:,1]
df_tmp["word"] = [w for _, w in words]
df_tmp["topic_id"] = [topic for topic, _ in words]
# hack, for getting the right colors for the presentation
    if highlight_topics and len(highlight_topics) == 1:
topic_colors = ["#33a02c"]
else:
topic_colors = ["#a6cee3", "#b2df8a", "#fb9a99", "#fdbf6f", "#cab2d6", "#ffff99"] # light colors
topic_colors = ["#1f78b4", "#33a02c", "#e31a1c", "#ff7f00", "#6a3d9a"] # dark colors
if highlight_topics:
num_rows = len(highlight_topics)
colors = np.linspace(0, 1, num_rows)
colors = topic_colors
colordict = dict(zip(highlight_topics, colors))
df_tmp["color"] = df_tmp["topic_id"].apply(lambda x: colordict[x] if x in highlight_topics else 'white')
else:
num_rows = len(df_param.index)
colors = np.linspace(0, 1, num_rows)
colors = topic_colors
colordict = dict(zip(range(num_rows), colors))
df_tmp["color"] = df_tmp["topic_id"].apply(lambda x: colordict[x])
plt.figure(figsize=cm2inch(14.69, 7.9), dpi=220)
def scatter(df_scatter, alpha=1.0):
#for _, row in df_scatter.iterrows():
#print "plt.scatter([%f], [%f], c='%s', s=25, alpha=alpha, linewidth=0.5)" % (
# row.x, row.y, row.color)
plt.scatter(df_scatter.x, df_scatter.y, c=df_scatter.color, s=25, alpha=alpha, linewidth=0.5)
if highlight_topics:
df_tmp_non_highlighted = df_tmp[df_tmp.topic_id.apply(lambda x: x not in highlight_topics)]
df_tmp_highlighted = df_tmp[df_tmp.topic_id.apply(lambda x: x in highlight_topics)]
scatter(df_tmp_non_highlighted, alpha=alpha)
if not draw_own:
scatter(df_tmp_highlighted)
else:
scatter(df_tmp)
if show_words:
xlim = plt.gca().get_xlim()
ylim = plt.gca().get_ylim()
step = (ylim[1] - ylim[0]) / 100
x_step = (xlim[1] - xlim[0]) / 100
for _, row in df_tmp.iterrows():
if highlight_topics:
if row.topic_id in highlight_topics:
#print "plt.text(%f, %f, '%s', horizontalalignment='center', verticalalignment='top')" % (row.x, row.y, row.word)
plt.text(row.x, row.y + 2 * step, row.word, horizontalalignment='center', verticalalignment='top')
else:
plt.text(row.x, row.y + 2 * step, row.word, horizontalalignment='center', verticalalignment='top')
if show_gaussians and highlight_topics:
for highlight_topic, color in zip(highlight_topics, topic_colors):
topic_words = [w for t, w in words if t == highlight_topic]
embeddings_X = [X_mapping[w] for w in topic_words]
topic_X = np.array(embeddings_X)
gmm = mixture.GaussianMixture(n_components=1, covariance_type="full")
gmm.fit(topic_X)
X_grid, Y_grid = np.meshgrid(np.linspace(-2., 2., 2000), np.linspace(-2., 2., 2000))
XX = np.array([X_grid.ravel(), Y_grid.ravel()]).T
Z_grid = -gmm.score_samples(XX)
Z_grid = Z_grid.reshape(X_grid.shape)
levels = np.logspace(-2, 0, 3)
levels = [0.1, 0.53, 1]
color = DARK_COLORS[0]
color = "#888888"
plt.contour(X_grid, Y_grid, Z_grid, linestyles="solid",
norm=LogNorm(vmin=1.0, vmax=1000.0), colors=color, alpha=0.35, levels=levels, linewidths=0.7)
In [292]:
words = get_words_from_topics(df_topics_lda)
words = {w for t, w in words if w and (w in vectors_200 or w.capitalize() in vectors_200 or w.upper() in vectors_200)}
#highlight_words = ["system", "card", "video", "apple", "pc", "keyboard", "bit", "memory", "windows",
# "government", "people", "power", "world", "rights", "war", "politics", "country", "states",
# "current", "electricity"]
highlight_words = ["god", "believe", "one", "evidence", "religion", "people", "argument", "exist", "question", "atheists",
"faith", "belief", "existence", "life"]
#highlight_words = ["faith", "belief", "existence", "life"]
df_ralf = pnd.DataFrame([highlight_words, list(words)])
plot_topics_in_embedding_space(pca, df_ralf, vectors_200,
show_words=False, highlight_topics=[0], alpha=0.035, show_gaussians=True, draw_own=True)
lw = 1.6
lw_kept = 1.15
c_old = "black"
c_old = DARK_COLORS[2]
#c_old = "black"
marker_old = "$-$"
#marker_old = "o"
c_kept = "#33a02c"
c_kept = "black"
#c_kept = "white"
#c_kept = DARK_COLORS[0]
marker_kept = "x"
c_new = "black"
c_new = DARK_COLORS[1]
marker_new = "D"
marker_new = "+"
#marker_new = "o"
plt.text(-0.185694, +0.183254, "one", horizontalalignment='center', verticalalignment='top')
plt.scatter([-0.176592], [0.214906], c=c_old, marker=marker_old, s=25, alpha=1.0, linewidth=lw, edgecolors=c_old)
plt.text(+0.101021, -0.412730, "question", horizontalalignment='center', verticalalignment='top')
plt.scatter([-0.061523], [-0.481792], c=c_old, marker=marker_old, s=25, alpha=1.0, linewidth=lw, edgecolors=c_old)
plt.text(-1.200456, -1.099776, "claim", horizontalalignment='center', verticalalignment='top')
plt.scatter([-1.200176], [-1.050868], c=c_old, marker=marker_old, s=25, alpha=1.0, linewidth=lw, edgecolors=c_old)
plt.text(-0.810134, -0.473401, "belief", horizontalalignment='center', verticalalignment='top')
plt.scatter([-0.764757], [-0.617998], c=c_kept, marker=marker_kept, s=25, alpha=1.0, linewidth=lw_kept)
plt.text(-0.741648, -0.230032, "religion", horizontalalignment='center', verticalalignment='top')
plt.scatter([-0.712025], [-0.385751], c=c_kept, marker=marker_kept, s=25, alpha=1.0, linewidth=lw_kept)
plt.text(-0.642058, -0.884714, "atheist", horizontalalignment='center', verticalalignment='top')
plt.scatter([-0.773875], [-0.952152], c=c_kept, marker=marker_kept, s=25, alpha=1.0, linewidth=lw_kept)
plt.text(-0.552881, -0.343447, "god", horizontalalignment='center', verticalalignment='top')
plt.scatter([-0.557449], [-0.329148], c=c_kept, marker=marker_kept, s=25, alpha=1.0, linewidth=lw_kept)
plt.text(-0.418948, -0.093772, "people", horizontalalignment='center', verticalalignment='top')
plt.scatter([-0.421830], [-0.060277], c=c_kept, marker=marker_kept, s=25, alpha=1.0, linewidth=lw_kept)
plt.text(-0.313025, -0.671497, "evidence", horizontalalignment='center', verticalalignment='top')
plt.scatter([-0.338276], [-0.607840], c=c_kept, marker=marker_kept, s=25, alpha=1.0, linewidth=lw_kept)
###plt.text(-1.125789, -0.892848, "religion", horizontalalignment='center', verticalalignment='top')
###plt.scatter([-1.174623], [-1.099940], c='red', s=25, alpha=1.0, linewidth=0.5)
plt.text(-0.188603, -0.286825, "argument", horizontalalignment='center', verticalalignment='top')
plt.scatter([-0.214202], [-0.438073], c=c_kept, marker=marker_kept, s=25, alpha=1.0, linewidth=lw_kept)
plt.text(-0.989290, -0.697986, "existence", horizontalalignment='center', verticalalignment='top')
plt.scatter([-0.909689], [-0.670691], c=c_new, marker=marker_new, s=25, alpha=1.0, linewidth=lw)
plt.text(-0.540145, -0.490005, "life", horizontalalignment='center', verticalalignment='top')
plt.scatter([-0.469146], [-0.524452], c=c_new, marker=marker_new, s=25, alpha=1.0, linewidth=lw)
plt.text(-0.739985, -0.751415, "faith", horizontalalignment='center', verticalalignment='top')
plt.scatter([-0.778872], [-0.690025], c=c_new, marker=marker_new, s=25, alpha=1.0, linewidth=lw)
plt.xlim((-1.5, 0.7))
plt.ylim((-1.55, 1.1))
Out[292]:
In [248]:
df_topics_lda.head(1)
Out[248]:
In [43]:
plot_topics_in_embedding_space(pca, df_topics_lda.head(3), vectors_20news_50)
In [18]:
plot_topics_in_embedding_space(pca, df_topics_lda.head(5), vectors_200)
In [48]:
interesting_topics = [0, 4, 16]
df_topics_lda.loc[interesting_topics]
Out[48]:
Topic 0 is a general stopword topic; it is spread widely.
Topic 4 is a good, coherent topic, for which a good distribution can be approximated.
Topic 16 has the stopword problem: a generally good topic is polluted by stopwords, which add noise to the parameter estimation of the distribution and can lead to a wrong mean.
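A tiny synthetic demonstration of this effect (the values are made up for illustration): a few stopword vectors far away from the topic cluster noticeably pull the estimated mean off.
import numpy as np

rng = np.random.RandomState(0)
topic = rng.normal(loc=1.0, scale=0.1, size=(20, 2))   # coherent topic words
stop = rng.normal(loc=-1.0, scale=0.1, size=(5, 2))    # stopword outliers
clean_mean = topic.mean(axis=0)                        # ~ (1.0, 1.0)
polluted_mean = np.vstack([topic, stop]).mean(axis=0)  # pulled to ~ (0.6, 0.6)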
In [51]:
plot_topics_in_embedding_space(pca, df_topics_lda, vectors_200,
show_words=True, highlight_topics=interesting_topics, alpha=0.15, show_gaussians=False)
plt.xlim((-1.5, 2.0))
plt.ylim((-1.5, 1.25))
Out[51]:
In [43]:
plot_topics_in_embedding_space(pca, df_topics_lda, vectors_200,
show_words=True, highlight_topics=interesting_topics, alpha=0.25, show_gaussians=True)
plt.xlim((-1.5, 2.0))
plt.ylim((-1.5, 1.25))
Out[43]:
In [21]:
plot_topics_in_embedding_space(pca, df_topics_lda, vectors_200,
show_words=True, highlight_topics=[4], alpha=0.25, show_gaussians=True)
plt.xlim((-1.5, 2.0))
plt.ylim((-1.5, 1.25))
Out[21]:
In [13]:
plot_topics_in_embedding_space(tsne_with_init_pca, df_topics_lda, vectors_200, show_words=True, highlight_topics=interesting_topics)
Image for Ralf:
In [41]:
words
Out[41]:
In [52]:
words = get_words_from_topics(df_topics_lda)
words = {w for t, w in words if w and (w in vectors_200 or w.capitalize() in vectors_200 or w.upper() in vectors_200)}
highlight_words = ["system", "card", "video", "apple", "pc", "keyboard", "bit", "memory", "windows",
"government", "people", "power", "world", "rights", "war", "politics", "country", "states",
"current", "electricity"]
df_ralf = pnd.DataFrame([highlight_words, list(words)])
plot_topics_in_embedding_space(pca, df_ralf, vectors_200,
show_words=True, highlight_topics=[0], alpha=0.15, show_gaussians=False)
plt.xlim((-1.5, 2.0))
plt.ylim((-1.5, 1.25))
Out[52]:
Learn stopwords with alpha0Boost LDA. That way, all the stopwords group together in topic 0. Then, always replace words from topic 0 during sampling.
Other idea: do not consider stopwords when estimating the distribution parameters.
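A sketch of both ideas, assuming a set of stopword-topic word ids and a word-to-vector dict (hypothetical names, not implemented in the experiments above):
import numpy as np

def effective_lambda(word_id, stopword_ids, lam):
    # idea 1: always resample words that ended up in the stopword topic
    return 1.0 if word_id in stopword_ids else lam

def estimate_topic_gaussian(topic_words, stopword_ids, vectors):
    # idea 2: ignore stopword-topic words when estimating the per-topic Gaussian
    X = np.array([vectors[w] for w in topic_words
                  if w not in stopword_ids and w in vectors])
    return X.mean(axis=0), np.cov(X, rowvar=False)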
In [ ]:
In [ ]:
In [21]:
df_corr = pnd.DataFrame([
[ 2, 0.164313723782],
[ 3, 0.256176592548],
[ 4, 0.287232385471],
[ 5, 0.268592006809],
[ 6, 0.261913996314],
[ 7, 0.289160806347],
[ 8, 0.312587806982],
[ 9, 0.298294862885],
[ 10, 0.331600576619],
[ 15, 0.367711037349],
[ 20, 0.427497429754],
[ 25, 0.471704421159],
[ 30, 0.492563877246],
[ 35, 0.533177072882],
[ 40, 0.548170365785],
[ 45, 0.557571955648],
[ 50, 0.571644472323],
[ 55, 0.585267445892],
[ 60, 0.600522318933],
[ 65, 0.596909635835],
[ 70, 0.592062065752],
[ 75, 0.593791901276],
[ 80, 0.603061316367],
[ 85, 0.609424411479],
[ 90, 0.614154814661],
[ 95, 0.621192032861],
[100, 0.620182795692],
[110, 0.616322039816],
[120, 0.623648522916],
[130, 0.620805117798],
[140, 0.625749337877],
[150, 0.628758110603],
[200, 0.629901613512]
], columns=["dimensions", "similarity_correlation"])
plt.figure(figsize=cm2inch(12, 7), dpi=300)
plt.scatter(df_corr["dimensions"], df_corr["similarity_correlation"], marker="x", s=25, color="black", linewidth=0.75)
plt.xlim((-15, 215))
plt.ylim((0.0, 0.7))
plt.xlabel("\\# of dimensions in the word embedding model")
plt.ylabel("Corr. with human similarity ratings")
plt.tight_layout()
In [6]:
df_lda_convergence = pnd.read_csv("/home/knub/Repositories/master-thesis/results/lda_convergence.tsv", sep="\t", header=None)
df_lda_convergence.columns = ["iteration", "LL"]
plt.figure(figsize=cm2inch(10, 6), dpi=300)
plt.plot(df_lda_convergence.iteration, df_lda_convergence.LL, linestyle="None", color='black', linewidth=0.25,
marker='x', markersize=3, label="Log likelihood")
plt.xlabel("Iteration")
plt.ylabel("Log-Likelihood per token")
plt.ylim((-10, -8))
#plt.legend(loc="best", numpoints=1)
plt.tight_layout()
In [44]:
df_lsh_vs_kdtree = pnd.read_csv("/home/knub/Repositories/master-thesis/results/nearest_neighbour_search_performance.tsv",
sep="\t")
df_lsh_vs_kdtree["ratio"] = df_lsh_vs_kdtree["kdtree"] / df_lsh_vs_kdtree["lsh"]
plt.figure(figsize=cm2inch(12, 6), dpi=300)
plt.axhline(y=1, linewidth=1.0, color='gray', ls='--', label="same runtime")
#plt.axvline(x=5, linewidth=2, color='gray', ls='--')
plt.semilogy(df_lsh_vs_kdtree["dimensions"], df_lsh_vs_kdtree["ratio"], 'x',
c='black', markersize=4, label=None)
plt.xlabel("Number of dimensions")
plt.ylabel("$\\frac{Runtime~k\\mbox{-}d~tree}{Runtime~LSH}$")
plt.xlim((1, 11))
plt.ylim((0.005, 110))
plt.title("Runtime comparison k-d tree vs LSH (log scale)")
handles, labels = plt.gca().get_legend_handles_labels()
plt.gca().legend(handles[:1], labels[:1], loc='upper left', numpoints=1)
plt.tight_layout()
In [10]:
TOPICS_WELDA = "/data/wikipedia/2016-06-21/topic-models/topic.20news.50-1500.alpha-0-02.beta-0-02/model.20news.dim-50.skip-gram.embedding.welda.gaussian.distance-cos.lambda-0-6/welda.iteration-200.topics"
translated_interesting_topics = [33, 37]
df_topics_welda = read_topics(TOPICS_WELDA)
df_topics_welda.loc[translated_interesting_topics]
In [75]:
plot_topics_in_embedding_space(pca, df_topics_welda, vectors_200, show_words=True, highlight_topics=translated_interesting_topics)
In [ ]: