In [1]:
%matplotlib notebook
import itertools
import logging
from functools import partial
import gensim
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pnd
from sklearn.cluster import *
from sklearn.decomposition import PCA, RandomizedPCA
from sklearn.manifold import TSNE
from codecs import open
import gc
from knub.thesis.util import *
matplotlib.style.use('ggplot')
In [2]:
from IPython.core.display import HTML
HTML("""
<style>
div.text_cell_render p, div.text_cell_render ul, table.dataframe {
font-size:1.3em;
line-height:1.1em;
}
</style>
""")
Out[2]:
In [3]:
MODEL = "../models/topic-models/topic.full.alpha-1-100.256-400.model"
#MODEL = "../models/topic-models/topic.256-400.first-2000.alpha-001.beta-001.model"
In [4]:
print "Load vectors"
vectors = load_skip_gram()
model = TopicModelLoader(MODEL, vectors)
print "Load topic probs"
df_topic_probs_full = model.load_topic_probs()
print "Load topics"
df_topics = model.load_topics()
print "Load topic similars"
df_topic_similars = model.load_all_topic_similars()
In [5]:
word_prob_lower_threshold = df_topic_probs_full["word-prob"].quantile(0.4)
word_prob_upper_threshold = df_topic_probs_full["word-prob"].quantile(0.99)
In [6]:
df_topic_probs = df_topic_probs_full[df_topic_probs_full["word"].apply(lambda w: w in model.topic_words)].copy()
word-prob does not sum to one because we only write out the frequent words.
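If a proper distribution is needed downstream, the retained probabilities could be renormalized; a minimal sketch (the column name word-prob-normalized is hypothetical and not part of the pipeline):
df_topic_probs_full["word-prob-normalized"] = (
    df_topic_probs_full["word-prob"] / df_topic_probs_full["word-prob"].sum())
print df_topic_probs_full["word-prob-normalized"].sum()  # 1.0 by construction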
In [7]:
df_topic_probs_full["word-prob"].sum()
Out[7]:
In [8]:
def topic_prob_difference_from_first_to(row, n):
s = sorted(row, reverse=True)
return s[0] - s[n - 1]
for diff in [2, 5, 50]:
column_name = "diff-" + str(diff)
df_topic_probs_full[column_name] = df_topic_probs_full[model.prob_columns].apply(
partial(topic_prob_difference_from_first_to, n=diff), axis=1)
In [9]:
plt.figure()
df_topic_probs_full["diff-2"].hist(bins=20)
Out[9]:
In [10]:
plt.figure()
df_topic_probs_full["diff-5"].hist(bins=20)
Out[10]:
In [11]:
plt.figure()
df_topic_probs_full["diff-50"].hist(bins=20)
Out[11]:
In [12]:
df_topic_probs_full.sort_values(by="word-prob", ascending=False).head(10)[["word", "word-prob"]]
Out[12]:
In [13]:
df_topic_probs["stddev"] = df_topic_probs[model.prob_columns].std(axis=1)
df_topic_probs.sort_values(by="stddev", ascending=False).head(10)[["word", "stddev"]]
Out[13]:
In [14]:
df_topic_probs["stddev"] = df_topic_probs[model.prob_columns].std(axis=1)
df_topic_probs.sort_values(by="stddev", ascending=True).head(10)[["word", "stddev"]]
Out[14]:
Topic-model similarity evaluated with different probability-distribution similarity measures, computed on the normalized word-topic distributions:
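For reference, a minimal numpy sketch of the three distribution measures referenced below (Bhattacharyya coefficient, Hellinger distance, Jensen-Shannon divergence); the function names are mine, and the actual tm_sim values in df_topic_similars come from the thesis pipeline, which may rescale these measures into similarities:
import numpy as np

def bhattacharyya_coefficient(p, q):
    # 1.0 for identical distributions, 0.0 for disjoint support
    return np.sum(np.sqrt(p * q))

def hellinger_distance(p, q):
    # in [0, 1], derived from the Bhattacharyya coefficient
    return np.sqrt(max(0.0, 1.0 - bhattacharyya_coefficient(p, q)))

def jensen_shannon_divergence(p, q):
    # symmetric and bounded by 1 when using the base-2 logarithm
    def kl(a, b):
        mask = a > 0
        return np.sum(a[mask] * np.log2(a[mask] / b[mask]))
    m = 0.5 * (p + q)
    return 0.5 * kl(p, m) + 0.5 * kl(q, m)

p = np.array([0.7, 0.2, 0.1])
q = np.array([0.1, 0.2, 0.7])
print bhattacharyya_coefficient(p, q), hellinger_distance(p, q), jensen_shannon_divergence(p, q)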
In [15]:
df_topic_similars["jensen-shannon"].head()
Out[15]:
In [16]:
model.sim_functions = ["max", "sum", "bhattacharyya", "hellinger", "jensen-shannon"]
sim_corrs_spearman = []
sim_corrs_pearson = []
for sim_function in model.sim_functions:
corr_spearman = df_topic_similars[sim_function][["tm_sim", "we_sim"]].corr("spearman").ix[0,1]
corr_pearson = df_topic_similars[sim_function][["tm_sim", "we_sim"]].corr("pearson").ix[0,1]
sim_corrs_spearman.append(corr_spearman)
sim_corrs_pearson.append(corr_pearson)
df_tmp = pnd.DataFrame(model.sim_functions, columns=["sim_function"])
df_tmp["sim_corr_spearman"] = sim_corrs_spearman
df_tmp["sim_corr_pearson"] = sim_corrs_pearson
df_tmp
Out[16]:
In [17]:
def correlation_in_group(corr_function):
def correlation(df_group):
return df_group.ix[:,-2:].corr(corr_function).ix[0,1]
return correlation
sim_corrs_spearman = []
sim_corrs_pearson = []
for sim_function in model.sim_functions:
df_tmp = df_topic_similars[sim_function]
df_group = df_tmp.groupby(np.arange(len(df_tmp)) // 10)
corr_spearman = df_group.apply(correlation_in_group("spearman")).mean()
corr_pearson = df_group.apply(correlation_in_group("pearson")).mean()
sim_corrs_spearman.append(corr_spearman)
sim_corrs_pearson.append(corr_pearson)
df_tmp = pnd.DataFrame(model.sim_functions, columns=["sim_function"])
df_tmp["sim_corr_spearman"] = sim_corrs_spearman
df_tmp["sim_corr_pearson"] = sim_corrs_pearson
df_tmp
Out[17]:
Note: similar results with the Google vectors.
In [18]:
plt.figure()
df_topic_similars["jensen-shannon"]["tm_sim"].hist(bins=100)
Out[18]:
In [19]:
plt.figure()
df_topic_similars["jensen-shannon"]["we_sim"].hist(bins=50)
Out[19]:
In [20]:
plt.figure()
df_topic_similars["jensen-shannon"]["we_sim"].hist(bins=50, cumulative=True, normed=True)
Out[20]:
In [21]:
def join_to_get_word_prob(df_param):
df_result = df_param.merge(df_topic_probs_full[["word", "word-prob"]],
left_on="similar_word", right_on="word",
suffixes=('', '_y'))
del df_result["word_y"]
return df_result
In [22]:
df_sim = join_to_get_word_prob(df_topic_similars["bhattacharyya"])
df_sim = df_sim[(df_sim["word-prob"] >= word_prob_lower_threshold) &
(df_sim["word-prob"] <= word_prob_upper_threshold)]
In [23]:
df_high_tm_low_we = df_sim[(df_sim["we_sim"] < 0.4)]
df_high_tm_low_we.iloc[np.random.permutation(len(df_high_tm_low_we))]
Out[23]:
In [24]:
df_high_tm_high_we = df_sim[(df_sim["we_sim"] > 0.8)]
df_high_tm_high_we.iloc[np.random.permutation(len(df_high_tm_high_we))]
Out[24]:
In [25]:
df_embedding_similars = pnd.read_csv("../models/word-embeddings/embedding.model.skip-gram.similars.with-tm",
sep="\t", header=None)
df_embedding_similars.columns = ["word", "similar_word", "we_sim", "tm_sim"]
df_embedding_similars.head()
Out[25]:
In [26]:
plt.figure()
df_embedding_similars["we_sim"].hist(bins=20)
Out[26]:
In [27]:
plt.figure()
df_embedding_similars["tm_sim"].hist(bins=20)
Out[27]:
In [28]:
plt.figure()
df_embedding_similars["tm_sim"].hist(bins=20, cumulative=True, normed=True)
Out[28]:
In [29]:
df_sim2 = join_to_get_word_prob(df_embedding_similars)
df_sim2 = df_sim2[(df_sim2["word-prob"] >= word_prob_lower_threshold) &
(df_sim2["word-prob"] <= word_prob_upper_threshold)]
In [30]:
df_embedding_similars[df_embedding_similars["word"] == "france-based"]
Out[30]:
In [31]:
df_low_tm_high_we = df_sim2[(df_sim2["tm_sim"] > 0.0) &
(df_sim2["tm_sim"] < 0.4)]
df_low_tm_high_we
Out[31]:
Syntactic variations play a bigger role in WE models, for example:
(development, developed): TM-sim 0.960519, WE-sim 0.360895
(composed, composers): TM-sim 0.973376, WE-sim 0.329483
(works, working): TM-sim 0.969470, WE-sim 0.274090
Topic models are better at capturing loose relationships, such as:
(war, commander): TM-sim 0.922352, WE-sim 0.187498
(living, households): TM-sim 0.983162, WE-sim 0.207906
(county, rural): TM-sim 0.882099, WE-sim 0.257984
Roughly the same results after using the same algorithm for both systems; see the sketch below.
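A minimal sketch of what applying the same similarity function to both systems could look like, assuming cosine similarity over a word's raw topic-probability vector and over its embedding; these helpers are illustrative and not part of the original pipeline:
from sklearn.metrics.pairwise import cosine_similarity

def cosine_sim_topics(word1, word2):
    # cosine over the raw topic-probability vectors, mirroring the embedding measure
    v1 = df_topic_probs_full.loc[df_topic_probs_full["word"] == word1, model.prob_columns].values
    v2 = df_topic_probs_full.loc[df_topic_probs_full["word"] == word2, model.prob_columns].values
    return cosine_similarity(v1, v2)[0, 0]

def cosine_sim_embeddings(word1, word2):
    return cosine_similarity(vectors[word1].reshape(1, -1), vectors[word2].reshape(1, -1))[0, 0]

print cosine_sim_topics("war", "commander"), cosine_sim_embeddings("war", "commander")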
In [32]:
def get_embedding_from_word_embedding(word):
    try:
        return vectors[word]
    except KeyError:
        # fall back to a frequent word's vector for out-of-vocabulary words
        return vectors["this"]
columns = [str(i) for i in range(256)]
def get_embedding_from_topics(word):
df_row = df_topic_probs_full[df_topic_probs_full["word"] == word]
    assert len(df_row) == 1, "not exactly one row found: %s (%d rows)" % (word, len(df_row))
return df_row[columns].iloc[0,:].tolist()
def get_df_concept(embedding_function):
df_concept = pnd.read_csv(
"/home/knub/Repositories/master-thesis/data/concept-categorization/battig_concept-categorization.tsv",
sep="\t",
header=None)
df_concept.columns = ["word", "concept"]
df_concept["embeddings"] = df_concept["word"].apply(embedding_function)
return df_concept
df_we_concept = get_df_concept(get_embedding_from_word_embedding)
df_tm_concept = get_df_concept(get_embedding_from_topics)
df_tm_concept.head(2)
Out[32]:
In [33]:
len(df_tm_concept.ix[0,"embeddings"])
Out[33]:
In [34]:
from sklearn import metrics
# http://stats.stackexchange.com/questions/95731/how-to-calculate-purity
def single_cluster_purity(df_param):
return df_param["concept"].value_counts().max()
def calculate_purity(df_param):
purity = float(sum([single_cluster_purity(df_cluster_group)
for _, df_cluster_group
in df_param.groupby("cluster_id")])) / len(df_param)
return purity
def evaluate_clustering_algorithm(df_param, clustering):
X = np.array(df_param["embeddings"].tolist())
X_sim = metrics.pairwise.pairwise_distances(X, metric="cosine")
# sim or not sim? PCA or not PCA?
clusters = clustering.fit_predict(pca(X_sim, 20))
df_param["cluster_id"] = clusters
return calculate_purity(df_param)
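To answer the "sim or not sim? PCA or not PCA?" question empirically, a variant that clusters the raw vectors directly could be compared against the version above; evaluate_clustering_algorithm_raw is a hypothetical helper and was not part of the original run:
def evaluate_clustering_algorithm_raw(df_param, clustering):
    # variant: cluster the raw vectors, skipping the cosine-distance matrix and PCA
    X = np.array(df_param["embeddings"].tolist())
    df_param["cluster_id"] = clustering.fit_predict(X)
    return calculate_purity(df_param)

print evaluate_clustering_algorithm_raw(df_we_concept, KMeans(n_clusters=10, init="k-means++", n_jobs=1))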
In [41]:
for df_concept in [df_we_concept, df_tm_concept]:
print "-" * 100
for clustering in [KMeans(n_clusters=10, init="k-means++", n_jobs=1)]:
print clustering.__class__.__name__
print evaluate_clustering_algorithm(df_concept, clustering)
In [35]:
for df_concept in [df_we_concept, df_tm_concept]:
print "-" * 100
for clustering in [KMeans(n_clusters=10, init="k-means++", n_jobs=1),
AgglomerativeClustering(n_clusters=10, linkage="ward"),
AgglomerativeClustering(n_clusters=10, linkage="complete"),
AgglomerativeClustering(n_clusters=10, linkage="average"),
AffinityPropagation(damping=0.5),
AffinityPropagation(damping=0.6),
AffinityPropagation(damping=0.7),
AffinityPropagation(damping=0.8),
AffinityPropagation(damping=0.9),
SpectralClustering(n_clusters=3)]:
print clustering.__class__.__name__
print evaluate_clustering_algorithm(df_concept, clustering)
In [36]:
def word_similarity(f):
try:
df_sim = pnd.read_csv(MODEL + f, sep="\t")
df_sim["embedding-sim"] = df_sim[["word1", "word2"]].apply(
lambda x: model.get_similarity(x["word1"], x["word2"], vectors), axis=1)
topic_sim_column = df_sim.columns[3]
topic_corr = df_sim[["human-sim", topic_sim_column]].corr("spearman").ix[0,1]
embedding_corr = df_sim[["human-sim", "embedding-sim"]].corr("spearman").ix[0, 1]
return pnd.DataFrame([[topic_corr, embedding_corr]],
columns=["topic_corr", "embedding_corr"],
index=[f])
except Exception as e:
return None
df_tmp = pnd.concat([word_similarity(".wordsim353-all-bhattacharyya"),
word_similarity(".wordsim353-all-hellinger"),
word_similarity(".wordsim353-all-jensen-shannon"),
word_similarity(".wordsim353-all-sum"),
word_similarity(".wordsim353-rel-bhattacharyya"),
word_similarity(".wordsim353-rel-hellinger"),
word_similarity(".wordsim353-rel-jensen-shannon"),
word_similarity(".wordsim353-rel-sum"),
word_similarity(".wordsim353-sim-bhattacharyya"),
word_similarity(".wordsim353-sim-hellinger"),
word_similarity(".wordsim353-sim-jensen-shannon"),
word_similarity(".wordsim353-sim-sum")])
df_tmp.sort_values(by="topic_corr", ascending=False)
Out[36]:
In [3]:
orig_vectors = load_skip_gram()
In [15]:
#orig_vectors.save_word2vec_format("/home/knub/Repositories/master-thesis/data/word-similarity/wordsim353_sim_rel/dim-200.embedding", binary=False)
In [11]:
with open("/home/knub/Repositories/master-thesis/data/word-similarity/wordsim353_sim_rel/dim-200.embedding", "r", encoding="utf-8") as f:
lines = [line.rstrip() for line in f]
count = int(lines[0].split(" ")[0])
lines = lines[1:]
words = []
vectors = []
for line in lines:
split = line.split(" ")
word = split[0]
words.append(word)
vector = [float(s) for s in split[1:]]
vectors.append(vector)
del lines
X = np.array(vectors)
print "Read embeddings"
In [13]:
print X.shape
print len(words)
In [68]:
def project_down(n):
with open("/home/knub/Repositories/master-thesis/data/word-similarity/wordsim353_sim_rel/dim-%d.embedding" % n, "w", encoding="utf-8") as f:
f.write("%d %d\n" % (count, n))
pca_X = pca(X, n)
for i in range(count):
vector = pca_X[i,:]
output_vector = " ".join([str(v) for v in vector])
f.write("%s %s\n" % (words[i], output_vector))
DIMENSIONS = [110, 120, 130, 140]
for n in [d for d in DIMENSIONS if d != 200]:  # do not recreate the original 200-dimensional embedding
print n
project_down(n)
gc.collect()
In [69]:
df_wordsim353 = pnd.read_csv("/home/knub/Repositories/master-thesis/data/word-similarity/wordsim353_sim_rel/wordsim_all_goldstandard.txt",
sep="\t", header=None, names=["word1", "word2", "similarity"])
def get_similarity(word1, word2, v):
# ugly but works for now
if word1 not in v:
if word1.lower() in v:
word1 = word1.lower()
if word1.upper() in v:
word1 = word1.upper()
if word1.title() in v:
word1 = word1.title()
if word2 not in v:
if word2.lower() in v:
word2 = word2.lower()
if word2.upper() in v:
word2 = word2.upper()
if word2.title() in v:
word2 = word2.title()
try:
return v.similarity(word1, word2)
except KeyError:
print word1, word2
if word1 not in v:
print word1
if word2 not in v:
print word2
def evaluate():
for dim in DIMENSIONS:
gc.collect()
print dim
vectors = gensim.models.word2vec.Word2Vec.load_word2vec_format(
"/home/knub/Repositories/master-thesis/data/word-similarity/wordsim353_sim_rel/dim-%d.embedding" % dim,
binary=False)
df_wordsim353["dim-%d" % dim] = df_wordsim353[["word1", "word2"]].apply(
lambda x: get_similarity(x["word1"], x["word2"], vectors), axis=1)
evaluate()
gc.collect()
Out[69]:
In [70]:
for dim in DIMENSIONS:
print dim
print df_wordsim353["similarity"].corr(df_wordsim353["dim-%d" % dim])