• Spearmint for analogy reasoning
  • Gaussian LDA
  • Evaluate word analogy reasoning
  • evaluate topic models
  • find background noise
  • find word pairs

Setup


In [1]:
%matplotlib notebook

import itertools
import logging
from functools import partial

import gensim
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pnd
from sklearn.cluster import *
from sklearn.decomposition import PCA, RandomizedPCA
from sklearn.manifold import TSNE
from codecs import open
import gc

from knub.thesis.util import *
matplotlib.style.use('ggplot')

In [2]:
from IPython.core.display import HTML
HTML("""
<style>
div.text_cell_render p, div.text_cell_render ul, table.dataframe {
font-size:1.3em;
line-height:1.1em;
}
</style>
""")


Out[2]:

Preprocessing


In [3]:
# Path of the trained topic model used throughout this notebook.
# The commented alternative is a smaller model trained on the first 2000 documents.
MODEL = "../models/topic-models/topic.full.alpha-1-100.256-400.model"
#MODEL = "../models/topic-models/topic.256-400.first-2000.alpha-001.beta-001.model"

In [4]:
print "Load vectors"
vectors = load_skip_gram()
model = TopicModelLoader(MODEL, vectors)
print "Load topic probs"
df_topic_probs_full = model.load_topic_probs()
print "Load topics"
df_topics = model.load_topics()
print "Load topic similars"
df_topic_similars = model.load_all_topic_similars()


Load vectors
Load topic probs
Load topics
Load topic similars

In [5]:
# Mid-frequency band used later to filter similarity tables: drop the
# rarest 40% and the most frequent 1% of words by unigram probability.
word_prob_lower_threshold = df_topic_probs_full["word-prob"].quantile(0.4)
word_prob_upper_threshold = df_topic_probs_full["word-prob"].quantile(0.99)

Topic Probs Analysis


In [6]:
df_topic_probs = df_topic_probs_full[df_topic_probs_full["word"].apply(lambda w: w in model.topic_words)].copy()

word-prob does not sum to one, because we only write out frequent words


In [7]:
df_topic_probs_full["word-prob"].sum()


Out[7]:
0.9603721877610281

In [8]:
def topic_prob_difference_from_first_to(row, n):
    """Return the gap between the largest and the n-th largest value in row."""
    ranked = sorted(row)  # ascending order
    return ranked[-1] - ranked[-n]
    

# For each word, how much more probable its best topic is than its
# 2nd / 5th / 50th best topic (a larger gap = a more topic-specific word).
for diff in [2, 5, 50]:
    column_name = "diff-" + str(diff)
    df_topic_probs_full[column_name] = df_topic_probs_full[model.prob_columns].apply(
        partial(topic_prob_difference_from_first_to, n=diff), axis=1)

Strength of topic prevalence

Against second best topic


In [9]:
# Histogram of the gap to the 2nd best topic.
plt.figure()
df_topic_probs_full["diff-2"].hist(bins=20)


Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fee9c2fa350>

Against fifth best topic


In [10]:
# Histogram of the gap to the 5th best topic.
plt.figure()
df_topic_probs_full["diff-5"].hist(bins=20)


Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fee446986d0>

Against fiftieth best topic


In [11]:
# Histogram of the gap to the 50th best topic.
plt.figure()
df_topic_probs_full["diff-50"].hist(bins=20)


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fee4460b3d0>

Most common words


In [12]:
df_topic_probs_full.sort_values(by="word-prob", ascending=False).head(10)[["word", "word-prob"]]


Out[12]:
word word-prob
1490 also 0.004686
3245 first 0.004077
1355 one 0.003606
5907 new 0.003460
2339 two 0.002904
3616 time 0.002163
2156 school 0.001988
7001 years 0.001947
351 may 0.001916
3116 would 0.001780

Highest std. dev.


In [13]:
df_topic_probs["stddev"] = df_topic_probs[model.prob_columns].std(axis=1)
df_topic_probs.sort_values(by="stddev", ascending=False).head(10)[["word", "stddev"]]


Out[13]:
word stddev
89573 gmina 0.062500
154769 cerambycidae 0.062500
43498 forewings 0.062341
120376 rabbi 0.061799
43347 moth 0.061736
277953 nascar 0.061622
119287 nigerian 0.061617
89560 voivodeship 0.061585
301489 kuala 0.061550
307513 nhl 0.061477

Lowest std. dev.


In [14]:
df_topic_probs["stddev"] = df_topic_probs[model.prob_columns].std(axis=1)
df_topic_probs.sort_values(by="stddev", ascending=True).head(10)[["word", "stddev"]]


Out[14]:
word stddev
1355 one 0.002358
1490 also 0.002516
2339 two 0.002998
3245 first 0.003318
2708 known 0.003707
3616 time 0.004129
5030 part 0.004133
3119 later 0.004259
21313 however 0.004622
15990 early 0.004699

Correlation TM similarity and WE similarity

Topic model similarity evaluated using different probability distribution similarity measures (evaluated on the normalized word-topic distributions):

Ten most similar words for each top-10-topic word


In [15]:
df_topic_similars["jensen-shannon"].head()


Out[15]:
word similar_word tm_sim we_sim
0 military stationed 0.722825 0.391147
1 military army’s 0.730260 0.566314
2 military war 0.736751 0.287915
3 military non-military 0.741515 0.643097
4 military commanders 0.754676 0.564292

Correlation between TM and WE similarity


In [16]:
# Rank (Spearman) and linear (Pearson) correlation between topic-model
# similarity and word-embedding similarity, one row per similarity measure.
model.sim_functions = ["max", "sum", "bhattacharyya", "hellinger", "jensen-shannon"]

sim_corrs_spearman = []
sim_corrs_pearson = []
for sim_function in model.sim_functions:
    # .ix was deprecated and later removed from pandas; .iloc is the
    # positional equivalent for picking the off-diagonal correlation entry.
    corr_spearman = df_topic_similars[sim_function][["tm_sim", "we_sim"]].corr("spearman").iloc[0, 1]
    corr_pearson = df_topic_similars[sim_function][["tm_sim", "we_sim"]].corr("pearson").iloc[0, 1]
    sim_corrs_spearman.append(corr_spearman)
    sim_corrs_pearson.append(corr_pearson)

df_tmp = pnd.DataFrame(model.sim_functions, columns=["sim_function"])
df_tmp["sim_corr_spearman"] = sim_corrs_spearman
df_tmp["sim_corr_pearson"] = sim_corrs_pearson
df_tmp


Out[16]:
sim_function sim_corr_spearman sim_corr_pearson
0 max -0.076134 -0.043531
1 sum 0.367308 0.349043
2 bhattacharyya 0.316480 0.312920
3 hellinger 0.316480 0.297119
4 jensen-shannon 0.331737 0.329467

In [17]:
def correlation_in_group(corr_function):
    """Return a function that computes, for one group DataFrame, the
    correlation between its last two columns (tm_sim and we_sim).

    corr_function: correlation method name passed to pandas, e.g.
    "spearman" or "pearson".

    Fix: .ix was deprecated and removed from pandas; .iloc is the
    positional equivalent used here.
    """
    def correlation(df_group):
        return df_group.iloc[:, -2:].corr(corr_function).iloc[0, 1]
    return correlation

# Same correlations as above, but now computed within each block of 10
# rows (the 10 most similar words of one query word) and averaged.
sim_corrs_spearman = []
sim_corrs_pearson = []
for sim_function in model.sim_functions:
    df_tmp = df_topic_similars[sim_function]
    # every 10 consecutive rows belong to the same query word
    df_group = df_tmp.groupby(np.arange(len(df_tmp)) // 10)
    corr_spearman = df_group.apply(correlation_in_group("spearman")).mean()
    corr_pearson = df_group.apply(correlation_in_group("pearson")).mean()

    sim_corrs_spearman.append(corr_spearman)
    sim_corrs_pearson.append(corr_pearson)

df_tmp = pnd.DataFrame(model.sim_functions, columns=["sim_function"])
df_tmp["sim_corr_spearman"] = sim_corrs_spearman
df_tmp["sim_corr_pearson"] = sim_corrs_pearson
df_tmp


Out[17]:
sim_function sim_corr_spearman sim_corr_pearson
0 max 0.184915 0.220690
1 sum 0.282030 0.341541
2 bhattacharyya 0.311571 0.388735
3 hellinger 0.311571 0.390448
4 jensen-shannon 0.304427 0.375609

Note: similar results were obtained with the Google vectors.

Distribution of TM similarity


In [18]:
# Distribution of topic-model similarity scores (Jensen-Shannon).
plt.figure()
df_topic_similars["jensen-shannon"]["tm_sim"].hist(bins=100)


Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fee44515850>

Distribution of WE similarity


In [19]:
# Distribution of word-embedding similarity scores for the same pairs.
plt.figure()
df_topic_similars["jensen-shannon"]["we_sim"].hist(bins=50)


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fee441fbcd0>

In [20]:
# Cumulative, normalized view of the embedding-similarity distribution.
plt.figure()
df_topic_similars["jensen-shannon"]["we_sim"].hist(bins=50, cumulative=True, normed=True)


Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fee4408d450>

In [21]:
def join_to_get_word_prob(df_param):
    """Attach each similar_word's unigram probability ("word-prob") by
    joining against the global df_topic_probs_full table."""
    merged = df_param.merge(df_topic_probs_full[["word", "word-prob"]],
                            left_on="similar_word", right_on="word",
                            suffixes=('', '_y'))
    # the join duplicates the word column; keep only the original one
    return merged.drop("word_y", axis=1)

In [22]:
# Bhattacharyya similarity pairs, restricted to the mid-frequency band
# defined by the quantile thresholds above.
df_sim = join_to_get_word_prob(df_topic_similars["bhattacharyya"])
df_sim = df_sim[(df_sim["word-prob"] >= word_prob_lower_threshold) &
                (df_sim["word-prob"] <= word_prob_upper_threshold)]


/opt/anaconda3/envs/py27/lib/python2.7/site-packages/pandas/tools/merge.py:714: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
  rlab = rizer.factorize(rk)

High TM similarity, low WE similarity


In [23]:
# Pairs the topic model finds similar but the embedding does not.
# Rows are shuffled (unseeded) to display a random sample of examples.
df_high_tm_low_we = df_sim[(df_sim["we_sim"] < 0.4)]
df_high_tm_low_we.iloc[np.random.permutation(len(df_high_tm_low_we))]


Out[23]:
word similar_word tm_sim we_sim word-prob
18573 monument flagstones 0.772262 0.270321 2.351414e-07
23278 cards djmax 0.836673 0.303376 2.618171e-07
14808 american jay 0.664744 0.174906 4.067452e-05
9720 cuisine momofuku 0.958693 0.263991 1.679581e-07
18129 delhi lathi 0.964216 0.357408 1.956218e-07
17111 madrid argentine 0.923548 0.341278 3.785183e-05
16802 law attorneys 0.765067 0.386007 1.168495e-05
9771 greece dhekelia 0.933184 0.306799 1.926578e-07
5799 city neighboring 0.648637 0.248059 3.765621e-05
6674 album pop-oriented 0.960294 0.397142 5.947693e-07
379 war armistice 0.751718 0.364160 1.030374e-05
3128 version adds 0.709375 0.222375 2.246489e-05
24346 refer nawrahta 0.957810 0.028592 1.728981e-07
11677 played stints 0.743628 0.334111 5.208678e-06
23316 folk mandolin 0.783935 0.352664 4.553641e-06
16297 wingspan suffusions 0.976056 0.228607 2.242735e-07
8107 production co-production 0.636724 0.335409 4.246376e-06
22771 shell operculum 0.738934 0.360926 2.550987e-06
4274 canadian neepawa 0.926113 0.287028 1.926578e-07
13130 union organizing 0.583596 0.254639 2.373841e-05
17545 disney boop 0.952422 0.334652 6.797364e-07
5901 state state-wide 0.588537 0.325053 3.744478e-07
15984 capital populace 0.633115 0.136286 9.986592e-06
9463 football football-related 0.821115 0.241274 1.956218e-07
14760 u.s veterans 0.652536 0.261016 3.930516e-05
21265 library librarians 0.884875 0.389618 3.652595e-06
16428 holland ghent 0.862597 0.391970 5.976345e-06
1405 games teammates 0.780641 0.248470 1.520811e-05
14713 centre centred 0.669714 0.296191 1.178868e-05
23691 ukrainian volost 0.971107 0.364184 4.702827e-07
... ... ... ... ... ...
13167 construction accommodate 0.768493 0.235632 3.472090e-05
23720 korean donghak 0.954994 0.339093 2.183456e-07
18593 century flourishing 0.738145 0.327776 5.107903e-06
17509 directed directorial 0.811355 0.363225 7.945407e-06
19305 term necessarily 0.696363 0.100200 2.791464e-05
248 training assisting 0.660984 0.338259 1.551143e-05
12463 league player-coach 0.838715 0.383079 2.045137e-06
24339 list kokang 0.813293 -0.018073 1.748740e-07
18127 bengal sambhaji 0.958907 0.296152 3.714838e-07
8695 males non-families 0.812880 0.345067 4.535758e-05
16953 locomotive railfans 0.980472 0.273334 2.331654e-07
6711 recorded rock'n'roll 0.649686 0.349751 1.117415e-06
356 war pre-war 0.739857 0.237450 7.566019e-06
6891 air hangars 0.800036 0.318578 3.426346e-06
21557 founded co-founders 0.654880 0.272643 3.293955e-06
21798 example differ 0.863670 0.145109 2.495561e-05
8951 born hometown 0.724388 0.193922 2.926522e-05
3283 school extracurricular 0.820486 0.214389 6.894187e-06
1620 team all-time 0.811156 0.285777 2.722404e-05
4988 earthquake hawaiians 0.823516 0.053774 1.597578e-06
16404 belgian clercq 0.924262 0.368734 2.272375e-07
1255 game three-game 0.727667 0.362397 1.753680e-06
10132 season back-to-back 0.794158 0.336944 8.306023e-06
19349 basketball cavaliers 0.837376 0.390892 5.827159e-06
15976 capital north-east 0.615869 0.295450 3.341280e-05
20620 rio pardos 0.896315 0.263085 2.232855e-07
23648 federal hearings 0.622535 0.366522 1.020790e-05
21617 ethiopian mengistu 0.971849 0.372536 5.038744e-07
561 prague henryk 0.928931 0.352832 2.339558e-06
16394 belgian rtbf 0.909984 0.361009 3.872917e-07

3463 rows × 5 columns

High TM similarity, high WE similarity


In [24]:
# Pairs where both models agree on high similarity.
# Fix: the variable was misnamed df_high_tm_low_we, although this cell
# selects HIGH embedding similarity (> 0.8) — see the section heading.
df_high_tm_high_we = df_sim[(df_sim["we_sim"] > 0.8)]
df_high_tm_high_we.iloc[np.random.permutation(len(df_high_tm_high_we))]


Out[24]:
word similar_word tm_sim we_sim word-prob
10529 shah akbar 0.933788 0.859334 6.987058e-06
9179 president vice-president 0.842706 0.820344 2.448434e-05
24271 miss pageant 0.854207 0.819474 2.145912e-05
18922 miller dunn 0.855836 0.821616 1.231923e-05
12011 daughter step-daughter 0.874835 0.822123 4.100154e-07
7112 cells epithelial 0.890961 0.852985 3.481673e-06
24493 rifle rifles 0.913138 0.826603 2.494870e-05
19569 label labels 0.908865 0.805011 2.640400e-05
23810 helsinki turku 0.975045 0.810423 3.786962e-06
15477 disease sepsis 0.874862 0.806278 1.333785e-06
9744 wine wines 0.905587 0.817430 1.608446e-05
10489 pradesh rajasthan 0.989044 0.849344 1.198035e-05
17181 juan prieto 0.937059 0.802128 1.440488e-06
11197 malaysia brunei 0.951622 0.823318 8.141029e-06
12677 infantry regiments 0.953145 0.843952 2.663618e-05
24668 bacteria microbes 0.967804 0.862982 2.538144e-06
23117 jacques alain 0.851055 0.810180 8.150909e-06
17615 lens lenses 0.932138 0.808962 1.187859e-05
18916 bob larry 0.889984 0.818060 4.072293e-05
21843 committee sub-committee 0.734741 0.858180 1.413812e-06
14908 wang feng 0.959282 0.915305 6.745000e-06
24169 acid succinic 0.928491 0.833541 2.084657e-07
23745 cambodia laos 0.952723 0.908524 1.272925e-05
17821 piano violin 0.925077 0.945782 2.883940e-05
21947 server server-based 0.944265 0.816643 2.055017e-07
7101 cells tissues 0.878793 0.815024 1.364512e-05
18160 pradesh maharashtra 0.987723 0.824605 1.948215e-05
19820 directed co-directed 0.781438 0.825753 3.066718e-06
7411 thomas edmund 0.833110 0.816595 2.472245e-05
14402 fruit fruits 0.906045 0.871048 2.295493e-05
... ... ... ... ... ...
10386 car truck 0.802171 0.808960 4.456917e-05
9176 president vice-president 0.842706 0.820344 2.448434e-05
7122 cells phagocytic 0.899362 0.830606 2.361294e-07
23230 bosnia herzegovina 0.990890 0.843310 2.153421e-05
17120 argentine uruguayan 0.979802 0.862899 7.172800e-06
21208 munich leipzig 0.865363 0.818143 1.724139e-05
1673 released rereleased 0.805355 0.836386 1.313037e-06
23101 jacques michel 0.868368 0.849798 2.085842e-05
19129 zimbabwe namibia 0.974864 0.880560 1.229552e-05
18707 temperature humidity 0.791894 0.802060 8.998603e-06
12040 william edwin 0.812141 0.819160 2.142948e-05
15461 ship warship 0.891284 0.806428 7.543296e-06
18134 singh balbir 0.972971 0.812354 2.450213e-07
14075 opened reopened 0.855194 0.821982 2.032194e-05
14930 liu ouyang 0.986282 0.819061 5.671056e-07
23095 les oiseaux 0.841693 0.876154 2.509492e-07
14963 yang wang 0.913551 0.885794 3.716617e-05
14928 liu zhu 0.985223 0.885810 1.101410e-05
20529 cars trucks 0.783873 0.812309 2.547628e-05
10548 ali hassan 0.909054 0.875668 1.329833e-05
21182 von friedrich 0.887480 0.885945 2.820214e-05
6966 aircraft twin-engine 0.973642 0.808022 1.444440e-06
9724 cuisine cuisines 0.974887 0.825062 2.403777e-06
23964 gothenburg stockholm 0.981954 0.905866 3.115030e-05
20138 online on-line 0.857398 0.858626 5.626597e-06
17182 juan prieto 0.937059 0.802128 1.440488e-06
23192 serbia montenegro 0.958477 0.858856 1.682150e-05
9906 radio broadcasts 0.860078 0.805244 3.912239e-05
21185 von wilhelm 0.900363 0.867713 2.838492e-05
22242 maria antonia 0.688678 0.825313 3.707922e-06

599 rows × 5 columns

Low TM similarity, high WE similarity


In [25]:
# Nearest neighbours taken from the skip-gram embedding, annotated with
# the topic-model similarity of each pair (precomputed file).
df_embedding_similars = pnd.read_csv("../models/word-embeddings/embedding.model.skip-gram.similars.with-tm",
                                 sep="\t", header=None)
df_embedding_similars.columns = ["word", "similar_word", "we_sim", "tm_sim"]
df_embedding_similars.head()


Out[25]:
word similar_word we_sim tm_sim
0 announcer sportscaster 0.741271 0.856420
1 announcer announcers 0.739855 0.929492
2 announcer play-by-play 0.721995 0.824415
3 shipbuilding drydock 0.705620 0.756655
4 shipbuilding shipyards 0.691392 0.853786

In [26]:
# Distribution of embedding similarity among the nearest neighbours.
plt.figure()
df_embedding_similars["we_sim"].hist(bins=20)


Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fee41f9ae50>

In [27]:
# Distribution of topic-model similarity for the same pairs.
plt.figure()
df_embedding_similars["tm_sim"].hist(bins=20)


Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fee41ea2610>

In [28]:
# Cumulative, normalized view of the topic-model similarity distribution.
plt.figure()
df_embedding_similars["tm_sim"].hist(bins=20, cumulative=True, normed=True)


Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fee41e0a790>

In [29]:
# Apply the same mid-frequency filter as for the topic-model pairs above.
df_sim2 = join_to_get_word_prob(df_embedding_similars)
df_sim2 = df_sim2[(df_sim2["word-prob"] >= word_prob_lower_threshold) &
                 (df_sim2["word-prob"] <= word_prob_upper_threshold)]

In [30]:
df_embedding_similars[df_embedding_similars["word"] == "france-based"]


Out[30]:
word similar_word we_sim tm_sim
99908 france-based belgium-based 0.752608 0.245589
99909 france-based switzerland-based 0.742562 0.343845
99910 france-based netherlands-based 0.711484 0.297970

In [31]:
# Pairs the embedding finds similar but the topic model does not.
# tm_sim > 0.0 presumably excludes pairs without a topic-model score — confirm.
df_low_tm_high_we = df_sim2[(df_sim2["tm_sim"] > 0.0) &
                            (df_sim2["tm_sim"] < 0.4)]
df_low_tm_high_we


Out[31]:
word similar_word we_sim tm_sim word-prob
18 thrashers braves 0.685548 0.175429 1.572187e-05
34 obscenity sedition 0.715250 0.176227 2.615207e-06
40 downloads subscribers 0.687755 0.375498 1.269961e-05
42 proposition paragraph 0.528047 0.324924 6.761796e-06
43 xvi-c paragraph 0.447301 0.048870 6.761796e-06
269 payne bennett 0.822779 0.198622 2.891547e-05
270 reeves bennett 0.749370 0.188538 2.891547e-05
271 goodwin bennett 0.735547 0.126398 2.891547e-05
272 pollard bennett 0.676034 0.201049 2.891547e-05
274 lockhart bennett 0.713061 0.315204 2.891547e-05
275 ladd bennett 0.714404 0.206487 2.891547e-05
277 cosgrove bennett 0.664828 0.102002 2.891547e-05
280 chown bennett 0.644461 0.274781 2.891547e-05
281 finegan bennett 0.638635 0.201281 2.891547e-05
301 catalan galician 0.805551 0.116282 5.137542e-06
302 majorcan galician 0.593155 0.117201 5.137542e-06
305 prestige wealth 0.560134 0.359994 4.377087e-05
306 inestimable wealth 0.531444 0.213525 4.377087e-05
307 prestige respectability 0.534297 0.208291 1.172743e-06
308 prestige pre-eminence 0.514734 0.357879 5.335140e-07
314 peasant landowning 0.593793 0.339807 8.180548e-07
317 arpaio prosecutors 0.597503 0.208481 9.359219e-06
329 climaxing climaxed 0.687626 0.390494 5.127663e-07
331 exchanged swapped 0.561437 0.391173 4.151529e-06
350 nyugat periodical 0.626758 0.094901 8.342579e-06
358 cane beet 0.809086 0.204410 2.373149e-06
360 cane beets 0.714927 0.249207 1.323905e-06
371 navalar acharya 0.690064 0.052314 3.080549e-06
379 chips crisps 0.645343 0.348511 5.918054e-07
397 emanuel karl 0.663532 0.368562 4.564706e-05
... ... ... ... ... ...
231650 kaipa metal/rock 0.687895 0.150378 1.195467e-07
231669 eschen/mauren lusitanos 0.777070 0.016040 1.155947e-07
231688 pial submucosa 0.643347 0.348595 1.106548e-07
231689 arcada tecnologia 0.525121 0.099907 2.134056e-07
231690 tainy luny 0.687774 0.169446 2.549011e-07
231700 bartles ensley 0.589917 0.140103 2.222975e-07
231701 eagley brimsdown 0.588161 0.091801 1.027508e-07
231711 potshot rock'n 0.504370 0.263245 2.302014e-07
231723 destron machination 0.681315 0.030431 1.037388e-07
231725 osoaviakhim tsagi 0.613134 0.372947 1.570902e-07
231731 b/e aerostructures 0.715328 0.058543 1.017629e-07
231738 rachtman riki 0.569933 0.212883 7.567995e-07
231750 langobardia transdanubia 0.621077 0.021857 2.163696e-07
231751 langobardia illyricum 0.617507 0.161718 6.402168e-07
231756 ilonka iliana 0.602602 0.041255 1.057148e-07
231777 chatkal ghizer 0.673802 0.015847 1.076908e-07
231789 zuz shekel 0.560893 0.279382 2.934327e-07
231794 outclass outmaneuver 0.543353 0.146818 2.045137e-07
231806 evermann dibblee 0.603700 0.022791 1.047268e-07
231807 power-house curtain-raiser 0.533938 0.312562 1.847539e-07
231809 no-ship wundagore 0.597962 0.019265 1.225106e-07
231823 moreno-ocampo garzon 0.630333 0.234288 1.195467e-07
231830 waksal scrushy 0.614403 0.047874 1.284386e-07
231833 eccas unasur 0.624376 0.016497 1.906819e-07
231837 pentominoes polyominoes 0.701090 0.020921 1.333785e-07
231840 berdichevsky roitman 0.634883 0.171965 1.452344e-07
231851 tomochichi kgosi 0.462471 0.017393 1.738861e-07
231854 ingarden jakobson 0.709116 0.095362 3.537000e-07
231856 sugarfree horchata 0.711885 0.019901 1.057148e-07
231867 kullback lefschetz 0.600741 0.295084 2.420573e-07

37867 rows × 5 columns

Findings

  • Syntactic variations play a bigger role in WE models, for example:

    (development, developed) TM-SIM: 0.960519 WE-SIM: 0.360895

    (composed, composers) TM-SIM: 0.973376 WE-SIM: 0.329483

    (works, working) TM-SIM: 0.969470 WE-SIM: 0.274090

  • topic models are better at capturing loose relationships, such as:

    (war, commander) TM-SIM: 0.922352 WE-SIM: 0.187498

    (living, households) TM-SIM: 0.983162 WE-SIM: 0.207906

    (county, rural) TM-SIM: 0.882099 WE-SIM: 0.257984

Concept categorization in TM and WE

Roughly the same results after using the same algorithm for both systems


In [32]:
def get_embedding_from_word_embedding(word):
    """Look up the embedding of word; for out-of-vocabulary words fall
    back to the vector of "this" so the concept dataset stays complete.

    Fix: the bare except also hid unrelated errors; restrict it to the
    KeyError raised for missing words.
    """
    try:
        return vectors[word]
    except KeyError:
        return vectors["this"]

# Column names of the 256 per-topic probability columns.
columns = [str(i) for i in range(256)]

def get_embedding_from_topics(word):
    """Return the 256-dim topic-probability vector of word as a list.

    Fix: the assert message concatenated a str with an int, which raised
    TypeError instead of the intended AssertionError when it fired.
    """
    df_row = df_topic_probs_full[df_topic_probs_full["word"] == word]
    assert len(df_row) == 1, "not exactly one row found: " + word + " " + str(len(df_row))
    return df_row[columns].iloc[0,:].tolist()

def get_df_concept(embedding_function):
    """Load the Battig concept-categorization dataset (word, concept) and
    attach one embedding per word computed by embedding_function.

    NOTE(review): hardcoded absolute path — breaks on other machines.
    """
    df_concept = pnd.read_csv(
        "/home/knub/Repositories/master-thesis/data/concept-categorization/battig_concept-categorization.tsv",
        sep="\t",
        header=None)
    df_concept.columns = ["word", "concept"]
    df_concept["embeddings"] = df_concept["word"].apply(embedding_function)
    return df_concept

# One dataset with word-embedding features, one with topic-probability features.
df_we_concept = get_df_concept(get_embedding_from_word_embedding)
df_tm_concept = get_df_concept(get_embedding_from_topics)
df_tm_concept.head(2)


Out[32]:
word concept embeddings
0 dog land-mammals [0.00352872698013, 2.17180189723e-08, 9.670517...
1 elephant land-mammals [4.18416122346e-08, 7.66164594561e-08, 0.00046...

In [33]:
len(df_tm_concept.ix[0,"embeddings"])


Out[33]:
256

In [34]:
from sklearn import metrics

# http://stats.stackexchange.com/questions/95731/how-to-calculate-purity
def single_cluster_purity(df_param):
    """Size of the dominant concept inside one cluster's rows."""
    return df_param["concept"].value_counts().max()

def calculate_purity(df_param):
    """Cluster purity: the fraction of rows whose concept matches the
    majority concept of their assigned cluster (higher is better)."""
    majority_sizes = [single_cluster_purity(df_cluster)
                      for _, df_cluster in df_param.groupby("cluster_id")]
    return float(sum(majority_sizes)) / len(df_param)


def evaluate_clustering_algorithm(df_param, clustering):
    """Cluster the rows' embeddings and return the resulting purity.

    Builds a cosine-distance matrix over the embeddings, reduces it to
    20 dimensions with pca (presumably from knub.thesis.util — confirm),
    and fits the given sklearn clustering on that.

    Side effect: writes a "cluster_id" column into df_param.
    """
    X = np.array(df_param["embeddings"].tolist())
    X_sim = metrics.pairwise.pairwise_distances(X, metric="cosine")
    # sim or not sim? PCA or not PCA?
    clusters = clustering.fit_predict(pca(X_sim, 20))
    df_param["cluster_id"] = clusters
    return calculate_purity(df_param)

In [41]:
# Purity with k-means only (10 clusters), for both feature sets.
# NOTE(review): execution count In[41] is out of order relative to the
# surrounding cells — re-run top to bottom before publishing.
for df_concept in [df_we_concept, df_tm_concept]:
    print "-" * 100
    for clustering in [KMeans(n_clusters=10, init="k-means++", n_jobs=1)]:
        print clustering.__class__.__name__
        print evaluate_clustering_algorithm(df_concept, clustering)


----------------------------------------------------------------------------------------------------
KMeans
0.780487804878
----------------------------------------------------------------------------------------------------
KMeans
0.792682926829

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [35]:
# Compare several clustering algorithms on both feature sets.
# NOTE(review): KMeans/SpectralClustering are unseeded, so purities can
# vary between runs.
for df_concept in [df_we_concept, df_tm_concept]:
    print "-" * 100
    for clustering in [KMeans(n_clusters=10, init="k-means++", n_jobs=1),
                       AgglomerativeClustering(n_clusters=10, linkage="ward"),
                       AgglomerativeClustering(n_clusters=10, linkage="complete"),
                       AgglomerativeClustering(n_clusters=10, linkage="average"),
                       AffinityPropagation(damping=0.5),
                       AffinityPropagation(damping=0.6),
                       AffinityPropagation(damping=0.7),
                       AffinityPropagation(damping=0.8),
                       AffinityPropagation(damping=0.9),
                   SpectralClustering(n_clusters=3)]:
        print clustering.__class__.__name__
        print evaluate_clustering_algorithm(df_concept, clustering)


----------------------------------------------------------------------------------------------------
KMeans
0.780487804878
AgglomerativeClustering
0.756097560976
AgglomerativeClustering
0.719512195122
AgglomerativeClustering
0.719512195122
AffinityPropagation
0.707317073171
AffinityPropagation
0.707317073171
AffinityPropagation
0.707317073171
AffinityPropagation
0.707317073171
AffinityPropagation
0.707317073171
SpectralClustering
0.353658536585
----------------------------------------------------------------------------------------------------
KMeans
0.792682926829
AgglomerativeClustering
0.792682926829
AgglomerativeClustering
0.792682926829
AgglomerativeClustering
0.792682926829
AffinityPropagation
0.707317073171
AffinityPropagation
0.707317073171
AffinityPropagation
0.707317073171
AffinityPropagation
0.707317073171
AffinityPropagation
0.707317073171
SpectralClustering
0.365853658537

Word Similarity

Similarity


In [36]:
def word_similarity(f):
    """Correlate human word-similarity judgements with topic-model and
    word-embedding similarity for one wordsim353 result file.

    f: file suffix appended to MODEL to locate the result file.
    Returns a one-row DataFrame indexed by f, or None when the file is
    missing or malformed — deliberate best-effort, since not every
    similarity variant was computed for every model; pnd.concat below
    silently skips None entries.
    """
    try:
        df_sim = pnd.read_csv(MODEL + f, sep="\t")
        df_sim["embedding-sim"] = df_sim[["word1", "word2"]].apply(
            lambda x: model.get_similarity(x["word1"], x["word2"], vectors), axis=1)
        topic_sim_column = df_sim.columns[3]  # the topic-model similarity column

        # .ix was deprecated and removed from pandas; .iloc is positional.
        topic_corr     = df_sim[["human-sim", topic_sim_column]].corr("spearman").iloc[0, 1]
        embedding_corr = df_sim[["human-sim", "embedding-sim"]].corr("spearman").iloc[0, 1]

        return pnd.DataFrame([[topic_corr, embedding_corr]],
                             columns=["topic_corr", "embedding_corr"],
                             index=[f])
    except Exception as e:
        return None

df_tmp = pnd.concat([word_similarity(".wordsim353-all-bhattacharyya"),
            word_similarity(".wordsim353-all-hellinger"),
            word_similarity(".wordsim353-all-jensen-shannon"),
            word_similarity(".wordsim353-all-sum"),
            word_similarity(".wordsim353-rel-bhattacharyya"),
            word_similarity(".wordsim353-rel-hellinger"),
            word_similarity(".wordsim353-rel-jensen-shannon"),
            word_similarity(".wordsim353-rel-sum"),
            word_similarity(".wordsim353-sim-bhattacharyya"),
            word_similarity(".wordsim353-sim-hellinger"),
            word_similarity(".wordsim353-sim-jensen-shannon"),
            word_similarity(".wordsim353-sim-sum")])
df_tmp.sort_values(by="topic_corr", ascending=False)


Out[36]:
topic_corr embedding_corr
.wordsim353-sim-bhattacharyya 0.629259 0.743211
.wordsim353-sim-hellinger 0.629104 0.743211
.wordsim353-sim-jensen-shannon 0.626463 0.743211
.wordsim353-sim-sum 0.613472 0.743211
.wordsim353-all-bhattacharyya 0.577273 0.656986
.wordsim353-all-hellinger 0.577256 0.656986
.wordsim353-all-jensen-shannon 0.571601 0.656986
.wordsim353-all-sum 0.554150 0.656986
.wordsim353-rel-hellinger 0.526983 0.572367
.wordsim353-rel-bhattacharyya 0.526891 0.572367
.wordsim353-rel-jensen-shannon 0.521408 0.572367
.wordsim353-rel-sum 0.498535 0.572367

Word Similarity performance with lower embedding dimensions

Create word embeddings with different sizes


In [3]:
orig_vectors = load_skip_gram()

In [15]:
#orig_vectors.save_word2vec_format("/home/knub/Repositories/master-thesis/data/word-similarity/wordsim353_sim_rel/dim-200.embedding", binary=False)

In [11]:
# Re-read the exported 200-dim embedding as plain word2vec text format.
# NOTE(review): this rebinds `vectors` (previously the gensim model) to a
# plain list of lists — earlier cells using `vectors` must not be re-run
# after this point.
with open("/home/knub/Repositories/master-thesis/data/word-similarity/wordsim353_sim_rel/dim-200.embedding", "r", encoding="utf-8") as f:
    lines = [line.rstrip() for line in f]
    count = int(lines[0].split(" ")[0])  # header line: "<vocab size> <dim>"
    lines = lines[1:]
    words = []
    vectors = []
    for line in lines:
        split = line.split(" ")
        word = split[0]
        words.append(word)
        vector = [float(s) for s in split[1:]]
        vectors.append(vector)
    del lines
X = np.array(vectors)
print "Read embeddings"


Read embeddings

In [13]:
# Sanity check: vocabulary size x embedding dimensionality.
print X.shape
print len(words)


(386046, 200)
386046

In [68]:
def project_down(n):
    """PCA-project the 200-dim embeddings down to n dimensions and write
    them out in word2vec text format (header line, then word + values)."""
    with open("/home/knub/Repositories/master-thesis/data/word-similarity/wordsim353_sim_rel/dim-%d.embedding" % n, "w", encoding="utf-8") as f:
        f.write("%d %d\n" % (count, n))
        # pca helper presumably from knub.thesis.util (imported with *) — confirm
        pca_X = pca(X, n)
        for i in range(count):
            vector = pca_X[i,:]
            output_vector = " ".join([str(v) for v in vector])
            f.write("%s %s\n" % (words[i], output_vector))

# 200 is the source dimensionality, so it is skipped if ever listed here.
DIMENSIONS = [110, 120, 130, 140]
for n in [d for d in DIMENSIONS if d != 200]:
    print n
    project_down(n)
    gc.collect()


110
/opt/anaconda3/envs/py27/lib/python2.7/site-packages/sklearn/utils/deprecation.py:52: DeprecationWarning: Class RandomizedPCA is deprecated; RandomizedPCA was deprecated in 0.18 and will be removed in 0.20. Use PCA(svd_solver='randomized') instead. The new implementation DOES NOT store whiten ``components_``. Apply transform to get them.
  warnings.warn(msg, category=DeprecationWarning)
120
/opt/anaconda3/envs/py27/lib/python2.7/site-packages/sklearn/utils/deprecation.py:52: DeprecationWarning: Class RandomizedPCA is deprecated; RandomizedPCA was deprecated in 0.18 and will be removed in 0.20. Use PCA(svd_solver='randomized') instead. The new implementation DOES NOT store whiten ``components_``. Apply transform to get them.
  warnings.warn(msg, category=DeprecationWarning)
130
/opt/anaconda3/envs/py27/lib/python2.7/site-packages/sklearn/utils/deprecation.py:52: DeprecationWarning: Class RandomizedPCA is deprecated; RandomizedPCA was deprecated in 0.18 and will be removed in 0.20. Use PCA(svd_solver='randomized') instead. The new implementation DOES NOT store whiten ``components_``. Apply transform to get them.
  warnings.warn(msg, category=DeprecationWarning)
140
/opt/anaconda3/envs/py27/lib/python2.7/site-packages/sklearn/utils/deprecation.py:52: DeprecationWarning: Class RandomizedPCA is deprecated; RandomizedPCA was deprecated in 0.18 and will be removed in 0.20. Use PCA(svd_solver='randomized') instead. The new implementation DOES NOT store whiten ``components_``. Apply transform to get them.
  warnings.warn(msg, category=DeprecationWarning)

Evaluate performance


In [69]:
# Human gold-standard similarity judgements (wordsim353, all pairs).
df_wordsim353 = pnd.read_csv("/home/knub/Repositories/master-thesis/data/word-similarity/wordsim353_sim_rel/wordsim_all_goldstandard.txt",
                            sep="\t", header=None, names=["word1", "word2", "similarity"])

def get_similarity(word1, word2, v):
    """Similarity of word1 and word2 in vector model v (via v.similarity),
    retrying case variants for out-of-vocabulary words.

    Note: the variant checks are not exclusive, so if several case
    variants exist the LAST matching one wins (title > upper > lower).
    Falls through to an implicit None return when lookup still fails.
    """
    # ugly but works for now
    if word1 not in v:
        if word1.lower() in v:
            word1 = word1.lower()
        if word1.upper() in v:
            word1 = word1.upper()
        if word1.title() in v:
            word1 = word1.title()
    if word2 not in v:
        if word2.lower() in v:
            word2 = word2.lower()
        if word2.upper() in v:
            word2 = word2.upper()
        if word2.title() in v:
            word2 = word2.title()
    try:
        return v.similarity(word1, word2)
    except KeyError:
        # report which side was out of vocabulary, then return None implicitly
        print word1, word2
        if word1 not in v:
            print word1
        if word2 not in v:
            print word2

def evaluate():
    """Score every reduced embedding against wordsim353 by adding one
    similarity column ("dim-<n>") per dimensionality to df_wordsim353."""
    for dim in DIMENSIONS:
        gc.collect()
        print dim
        vectors = gensim.models.word2vec.Word2Vec.load_word2vec_format(
            "/home/knub/Repositories/master-thesis/data/word-similarity/wordsim353_sim_rel/dim-%d.embedding" % dim,
            binary=False)
        df_wordsim353["dim-%d" % dim] = df_wordsim353[["word1", "word2"]].apply(
            lambda x: get_similarity(x["word1"], x["word2"], vectors), axis=1)

evaluate()
gc.collect()


110
120
130
140
Out[69]:
14

In [70]:
# Correlation (pandas default: Pearson) with human judgements per dimensionality.
for dim in DIMENSIONS:
    print dim
    print df_wordsim353["similarity"].corr(df_wordsim353["dim-%d" % dim])


110
0.616322039816
120
0.623648522916
130
0.620805117798
140
0.630749337877

In [ ]: