Concept categorization (data: how should it be defined on topic models?)
In [1]:
    
%matplotlib notebook
import itertools
import logging
from functools import partial
import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pnd
from sklearn.cluster import *
from sklearn.decomposition import PCA, RandomizedPCA
from sklearn.manifold import TSNE
from knub.thesis.util import *
    
In [8]:
    
# Quick sanity check of the pca() helper on a toy 3x3 matrix,
# reducing it to 2 dimensions.
d = np.array([
    [1.0, 2.0, 3.1],
    [0.5, 1.2, 4.0],
    [-1.0, 2.1, 1.0]
])
# NOTE(review): pca() comes from knub.thesis.util (star import above);
# presumably a thin wrapper around sklearn's PCA — confirm in that module.
pca(d, 2)
    
    Out[8]:
In [2]:
    
# INFO-level logging so gensim reports progress while loading the large embedding files below.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    
In [4]:
    
# Cosmetic only: enlarge the font of markdown cells and DataFrame tables
# so the notebook is readable when presented.
from IPython.core.display import HTML
HTML("""
<style>
div.text_cell_render p, div.text_cell_render ul, table.dataframe {
font-size:1.3em;
line-height:1.1em;
}
</style>
""")
    
    Out[4]:
In [13]:
    
# Prepare data in long form: one row per (topic, position-in-topic, word).
df_topics = pnd.read_csv("../models/topic-models/topic.full.fixed-vocabulary.alpha-1-100.256-400.model.ssv",
                         sep=" ")
# .ix is deprecated (and removed in pandas >= 1.0); .iloc is the explicit
# positional equivalent for "keep the last 10 columns" (the topic's top-10 words).
df_topics = df_topics.iloc[:, -10:]
df_topics.columns = list(range(10))
df_topics["topic"] = df_topics.index        # topic id = row index of the model file
df_topics["topic_name"] = df_topics[0]      # name a topic after its most probable word
# Melt to long form: one (topic, word) row per word position 0..9.
df = pnd.melt(df_topics, id_vars=["topic", "topic_name"], var_name="position", value_name="word")
df = df[["word", "topic", "topic_name", "position"]]
df = df.sort_values(by=["topic", "position"]).reset_index(drop=True)
df[df.topic == 0]
    
    Out[13]:
In [40]:
    
# Paths to the pre-trained word-embedding models.
# NOTE(review): hardcoded absolute local paths — consider a configurable base directory.
WORD2VEC_VECTOR_FILE = "/home/knub/Repositories/master-thesis/models/word-embeddings/GoogleNews-vectors-negative300.bin"
GLOVE_VECTOR_FILE = "/home/knub/Repositories/master-thesis/models/word-embeddings/glove.6B.50d.txt"
CBOW_VECTOR_FILE = "/home/knub/Repositories/master-thesis/models/word-embeddings/embedding.model.cbow"
SKIP_GRAM_VECTOR_FILE = "/home/knub/Repositories/master-thesis/models/word-embeddings/embedding.model.skip-gram"
# Only the Google News word2vec vectors are loaded below; the others are kept for experiments.
#vectors_glove = gensim.models.Word2Vec.load_word2vec_format(GLOVE_VECTOR_FILE, binary=False)
#vectors_skip = gensim.models.Word2Vec.load_word2vec_format(SKIP_GRAM_VECTOR_FILE, binary=True)
#vectors_cbow = gensim.models.Word2Vec.load_word2vec_format(CBOW_VECTOR_FILE, binary=True)
# NOTE(review): Word2Vec.load_word2vec_format was deprecated in gensim 1.x in favor of
# gensim.models.KeyedVectors.load_word2vec_format — update if gensim is upgraded.
vectors_word2vec = gensim.models.Word2Vec.load_word2vec_format(WORD2VEC_VECTOR_FILE, binary=True)
vectors_default = vectors_word2vec  # embedding model used throughout the rest of the notebook
    
In [42]:
    
def get_data_frame_from_word_vectors(df_param, vectors):
    """Keep only rows whose word is in the embedding vocabulary and attach its vector.

    Parameters
    ----------
    df_param : DataFrame with a "word" column.
    vectors : mapping-like embedding model supporting `word in vectors` and `vectors[word]`.

    Returns a new DataFrame (the input is not modified) with an added
    "embeddings" column holding the vector of each word.
    """
    in_vocabulary = df_param["word"].apply(lambda word: word in vectors)
    # .copy() after filtering: assigning a column directly on the boolean slice
    # is chained assignment and raises SettingWithCopyWarning (and may not stick).
    result = df_param[in_vocabulary].copy()
    result["embeddings"] = result["word"].apply(lambda word: vectors[word])
    return result
df = get_data_frame_from_word_vectors(df.copy(), vectors_default)
df[df.topic == 0]
    
    
    Out[42]:
In [43]:
    
# Candidate selections; only the last assignment was ever used, so the first
# list (financial, muslim, teams in sport, atom physics, math) is kept commented out.
# nice_topics = [5, 117, 158, 164, 171]
nice_topics = [0, 7, 236]
# .isin is the vectorized, idiomatic form of .apply(lambda topic: topic in nice_topics)
df_part = df[df.topic.isin(nice_topics)].copy()
# Show topics of interest: one row per topic, columns are the topic's top words
# (groupby sorts by topic id, matching the ascending order of nice_topics).
df_tmp = pnd.DataFrame(df_part.groupby("topic")["word"].apply(lambda l: l.tolist()).tolist())
df_tmp.index = nice_topics
df_tmp
    
    Out[43]:
In [45]:
    
def plot_topics_in_embedding_space(reduction_method, df_param):
    """Project the embeddings in df_param to 2-D via reduction_method and
    scatter-plot the words of the topics in the global `nice_topics`,
    one color per topic, labelling each point with its word."""
    embedding_matrix = np.array(df_param["embeddings"].tolist())
    reduced = reduction_method(embedding_matrix)

    plot_df = df_param.copy()
    plot_df["x"] = reduced[:, 0]
    plot_df["y"] = reduced[:, 1]
    plot_df = plot_df[plot_df.topic.apply(lambda topic: topic in nice_topics)]

    # Fixed topic -> color assignment so repeated plots stay comparable.
    colors = {0: "red", 7: "blue", 236: "green", 164: "yellow", 171: "black"}
    plt.figure(figsize=(12, 8))
    point_colors = plot_df.topic.apply(lambda topic: colors[topic])
    plt.scatter(plot_df.x, plot_df.y, c=point_colors, s=80)

    # Offset each label slightly below its point (1% of the y-range).
    y_low, y_high = plt.gca().get_ylim()
    label_offset = (y_high - y_low) / 100

    for _, row in plot_df.iterrows():
        plt.text(row.x, row.y - label_offset, row.word,
                 horizontalalignment='center', verticalalignment='top')
    
In [46]:
    
#plot_topics_in_embedding_space(pca, df)
    
In [47]:
    
plot_topics_in_embedding_space(pca, df_part) # third dimensions
    
    
    
In [ ]:
    
#plot_topics_in_embedding_space(tsne, df)
    
In [22]:
    
plot_topics_in_embedding_space(tsne_with_init_pca, df)
    
    
    
In general, topics from the topic model do not appear to occupy similar positions in the word-embedding vector space.
In [48]:
    
def average_pairwise_similarity(words, vectors):
    """Mean similarity over all unordered pairs of distinct words.

    Uses itertools.combinations instead of permutations plus a lexicographic
    filter: cosine similarity is symmetric, so each unordered pair needs to
    be scored exactly once.  Returns NaN when there are fewer than two
    distinct words (same value as before, but without the numpy
    mean-of-empty-slice warning).
    """
    similarities = [vectors.similarity(word1, word2)
                    for word1, word2 in itertools.combinations(words, 2)
                    if word1 != word2]
    if not similarities:
        return float("nan")
    return np.mean(similarities)
def average_top_similarity(words, vectors):
    """Mean, over all words, of each word's highest similarity to any other word.

    The previous implementation used itertools.groupby over the output of
    itertools.permutations; groupby merges only *consecutive* equal keys, so
    a word duplicated at non-adjacent positions would be counted twice.  An
    explicit per-word maximum removes that ordering dependence while giving
    the same result for the normal case of distinct words.
    """
    best = {}
    for word1, word2 in itertools.permutations(words, 2):
        sim = vectors.similarity(word1, word2)
        if word1 not in best or sim > best[word1]:
            best[word1] = sim
    if not best:
        return float("nan")
    return np.mean(list(best.values()))
    
In [49]:
    
# Evaluate each topic on its first 2..10 words.
topic_lengths = list(range(2, 11))
def calculate_similarities_for_topic(df_topic, sim_function, vectors):
    """Apply sim_function to every prefix (lengths 2..10) of a topic's word list.

    Returns a pandas Series with one similarity value per prefix length.
    """
    topic_words = df_topic["word"].tolist()

    prefix_similarities = []
    for prefix_length in topic_lengths:
        prefix = topic_words[:prefix_length]
        prefix_similarities.append(sim_function(prefix, vectors))

    return pnd.Series(prefix_similarities)
def calculate_similarity_matrix(sim_function, vectors):
    """Build a topics x prefix-lengths matrix of similarity scores.

    Groups the global long-form frame `df` by topic and scores each topic's
    word prefixes with sim_function; columns are labelled "2-words".."10-words".
    """
    score_topic = partial(calculate_similarities_for_topic,
                          sim_function=sim_function, vectors=vectors)
    df_similarities = df.groupby("topic").apply(score_topic)
    df_similarities.columns = ["%s-words" % i for i in topic_lengths]
    return df_similarities
    
In [50]:
    
# Score every topic with the mean pairwise similarity measure.
df_similarities = calculate_similarity_matrix(average_pairwise_similarity, vectors_default)
df_similarities.mean()  # column means: average similarity per prefix length, across all topics
    
    Out[50]:
In [51]:
    
# Plot mean topic similarity as a function of topic prefix length (2..10 words).
means = df_similarities.mean().tolist()  # one mean per prefix length
plt.figure(figsize=(12, 8))
plt.scatter(topic_lengths, means, s=80)
plt.title("Avg. word similarity (cosine similarity in WE space) of topics up to the nth word")
plt.xlim(0, 11)
plt.xticks(list(range(1, 12)))
#plt.ylim((0, 0.35))
plt.xlabel("topic length")
plt.ylabel("average similarity")
    
    
    
    Out[51]:
For comparison, here are a few similarities between standard word pairs:

- king–prince: {{vectors_default.similarity("king", "prince")}}
- king–queen: {{vectors_default.similarity("king", "queen")}}
- topic–topics: {{vectors_default.similarity("topic", "topics")}}
- buy–purchase: {{vectors_default.similarity("buy", "purchase")}}
In [52]:
    
def show_highest_similar_topics(topic_length, nr_topics=3):
    """Show the nr_topics topics with the highest average similarity at the
    given prefix length, together with their first topic_length words."""
    column = "%s-words" % topic_length
    ranked = df_similarities.sort_values(by=column, ascending=False)
    df_top = ranked.head(nr_topics)
    word_columns = list(range(topic_length))
    return df_top.join(df_topics)[[column] + word_columns]
    
In [53]:
    
show_highest_similar_topics(3)
    
    Out[53]:
In [54]:
    
show_highest_similar_topics(6)
    
    Out[54]:
In [55]:
    
show_highest_similar_topics(10)
    
    Out[55]: