In [1]:
%matplotlib notebook
import itertools
import logging
from functools import partial
import gensim
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pnd
from sklearn.cluster import *
from sklearn.decomposition import PCA, RandomizedPCA
from sklearn.manifold import TSNE
from codecs import open
import gc
from knub.thesis.util import *
matplotlib.style.use('ggplot')
In [2]:
from IPython.core.display import HTML
HTML("""
<style>
div.text_cell_render p, div.text_cell_render ul, table.dataframe {
font-size:1.3em;
line-height:1.1em;
}
</style>
""")
Out[2]:
In [3]:
MODEL = "../models/topic-models/topic.full.alpha-1-100.256-400.model"
#MODEL = "../models/topic-models/topic.256-400.first-2000.alpha-001.beta-001.model"
In [4]:
print "Load vectors"
vectors = load_skip_gram()
model = TopicModelLoader(MODEL, vectors)
print "Load topic probs"
df_topic_probs_full = model.load_topic_probs()
print "Load topics"
df_topics = model.load_topics()
print "Load topic similars"
df_topic_similars = model.load_all_topic_similars()
In [5]:
word_prob_lower_threshold = df_topic_probs_full["word-prob"].quantile(0.4)
word_prob_upper_threshold = df_topic_probs_full["word-prob"].quantile(0.99)
In [6]:
df_topic_probs = df_topic_probs_full[df_topic_probs_full["word"].apply(lambda w: w in model.topic_words)].copy()
word-prob does not sum to one because we only write out the frequent words.
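If a proper distribution is needed downstream, the retained probabilities could be renormalized; a minimal sketch (the column name word-prob-normalized is hypothetical and not part of the pipeline):
df_topic_probs_full["word-prob-normalized"] = (
    df_topic_probs_full["word-prob"] / df_topic_probs_full["word-prob"].sum())
print df_topic_probs_full["word-prob-normalized"].sum()  # 1.0 by construction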
In [7]:
df_topic_probs_full["word-prob"].sum()
Out[7]:
In [8]:
def topic_prob_difference_from_first_to(row, n):
s = sorted(row, reverse=True)
return s[0] - s[n - 1]
for diff in [2, 5, 50]:
column_name = "diff-" + str(diff)
df_topic_probs_full[column_name] = df_topic_probs_full[model.prob_columns].apply(
partial(topic_prob_difference_from_first_to, n=diff), axis=1)
In [9]:
plt.figure()
df_topic_probs_full["diff-2"].hist(bins=20)
Out[9]:
In [10]:
plt.figure()
df_topic_probs_full["diff-5"].hist(bins=20)
Out[10]:
In [11]:
plt.figure()
df_topic_probs_full["diff-50"].hist(bins=20)
Out[11]:
In [12]:
df_topic_probs_full.sort_values(by="word-prob", ascending=False).head(10)[["word", "word-prob"]]
Out[12]:
In [13]:
df_topic_probs["stddev"] = df_topic_probs[model.prob_columns].std(axis=1)
df_topic_probs.sort_values(by="stddev", ascending=False).head(10)[["word", "stddev"]]
Out[13]:
In [14]:
df_topic_probs["stddev"] = df_topic_probs[model.prob_columns].std(axis=1)
df_topic_probs.sort_values(by="stddev", ascending=True).head(10)[["word", "stddev"]]
Out[14]:
Topic-model similarity evaluated with different probability-distribution similarity measures, computed on the normalized word-topic distributions:
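For reference, a minimal numpy sketch of the three distribution measures referenced below (Bhattacharyya coefficient, Hellinger distance, Jensen-Shannon divergence); the function names are mine, and the actual tm_sim values in df_topic_similars come from the thesis pipeline, which may rescale these measures into similarities:
import numpy as np

def bhattacharyya_coefficient(p, q):
    # 1.0 for identical distributions, 0.0 for disjoint support
    return np.sum(np.sqrt(p * q))

def hellinger_distance(p, q):
    # in [0, 1], derived from the Bhattacharyya coefficient
    return np.sqrt(max(0.0, 1.0 - bhattacharyya_coefficient(p, q)))

def jensen_shannon_divergence(p, q):
    # symmetric and bounded by 1 when using the base-2 logarithm
    def kl(a, b):
        mask = a > 0
        return np.sum(a[mask] * np.log2(a[mask] / b[mask]))
    m = 0.5 * (p + q)
    return 0.5 * kl(p, m) + 0.5 * kl(q, m)

p = np.array([0.7, 0.2, 0.1])
q = np.array([0.1, 0.2, 0.7])
print bhattacharyya_coefficient(p, q), hellinger_distance(p, q), jensen_shannon_divergence(p, q)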
In [15]:
df_topic_similars["jensen-shannon"].head()
Out[15]:
In [16]:
model.sim_functions = ["max", "sum", "bhattacharyya", "hellinger", "jensen-shannon"]
sim_corrs_spearman = []
sim_corrs_pearson = []
for sim_function in model.sim_functions:
corr_spearman = df_topic_similars[sim_function][["tm_sim", "we_sim"]].corr("spearman").ix[0,1]
corr_pearson = df_topic_similars[sim_function][["tm_sim", "we_sim"]].corr("pearson").ix[0,1]
sim_corrs_spearman.append(corr_spearman)
sim_corrs_pearson.append(corr_pearson)
df_tmp = pnd.DataFrame(model.sim_functions, columns=["sim_function"])
df_tmp["sim_corr_spearman"] = sim_corrs_spearman
df_tmp["sim_corr_pearson"] = sim_corrs_pearson
df_tmp
Out[16]:
In [17]:
def correlation_in_group(corr_function):
def correlation(df_group):
return df_group.ix[:,-2:].corr(corr_function).ix[0,1]
return correlation
sim_corrs_spearman = []
sim_corrs_pearson = []
for sim_function in model.sim_functions:
df_tmp = df_topic_similars[sim_function]
df_group = df_tmp.groupby(np.arange(len(df_tmp)) // 10)
corr_spearman = df_group.apply(correlation_in_group("spearman")).mean()
corr_pearson = df_group.apply(correlation_in_group("pearson")).mean()
sim_corrs_spearman.append(corr_spearman)
sim_corrs_pearson.append(corr_pearson)
df_tmp = pnd.DataFrame(model.sim_functions, columns=["sim_function"])
df_tmp["sim_corr_spearman"] = sim_corrs_spearman
df_tmp["sim_corr_pearson"] = sim_corrs_pearson
df_tmp
Out[17]:
Note: similar results with the Google vectors.
In [18]:
plt.figure()
df_topic_similars["jensen-shannon"]["tm_sim"].hist(bins=100)
Out[18]:
In [19]:
plt.figure()
df_topic_similars["jensen-shannon"]["we_sim"].hist(bins=50)
Out[19]:
In [20]:
plt.figure()
df_topic_similars["jensen-shannon"]["we_sim"].hist(bins=50, cumulative=True, normed=True)
Out[20]:
In [21]:
def join_to_get_word_prob(df_param):
df_result = df_param.merge(df_topic_probs_full[["word", "word-prob"]],
left_on="similar_word", right_on="word",
suffixes=('', '_y'))
del df_result["word_y"]
return df_result
In [22]:
df_sim = join_to_get_word_prob(df_topic_similars["bhattacharyya"])
df_sim = df_sim[(df_sim["word-prob"] >= word_prob_lower_threshold) &
(df_sim["word-prob"] <= word_prob_upper_threshold)]
In [23]:
df_high_tm_low_we = df_sim[(df_sim["we_sim"] < 0.4)]
df_high_tm_low_we.iloc[np.random.permutation(len(df_high_tm_low_we))]
Out[23]:
In [24]:
df_high_tm_high_we = df_sim[(df_sim["we_sim"] > 0.8)]
df_high_tm_high_we.iloc[np.random.permutation(len(df_high_tm_high_we))]
Out[24]:
In [25]:
df_embedding_similars = pnd.read_csv("../models/word-embeddings/embedding.model.skip-gram.similars.with-tm",
sep="\t", header=None)
df_embedding_similars.columns = ["word", "similar_word", "we_sim", "tm_sim"]
df_embedding_similars.head()
Out[25]:
In [26]:
plt.figure()
df_embedding_similars["we_sim"].hist(bins=20)
Out[26]:
In [27]:
plt.figure()
df_embedding_similars["tm_sim"].hist(bins=20)
Out[27]:
In [28]:
plt.figure()
df_embedding_similars["tm_sim"].hist(bins=20, cumulative=True, normed=True)
Out[28]:
In [29]:
df_sim2 = join_to_get_word_prob(df_embedding_similars)
df_sim2 = df_sim2[(df_sim2["word-prob"] >= word_prob_lower_threshold) &
(df_sim2["word-prob"] <= word_prob_upper_threshold)]
In [30]:
df_embedding_similars[df_embedding_similars["word"] == "france-based"]
Out[30]:
In [31]:
df_low_tm_high_we = df_sim2[(df_sim2["tm_sim"] > 0.0) &
(df_sim2["tm_sim"] < 0.4)]
df_low_tm_high_we
Out[31]:
Syntactic variations play a bigger role in WE models, for example:
(development, developed): TM-sim 0.960519, WE-sim 0.360895
(composed, composers): TM-sim 0.973376, WE-sim 0.329483
(works, working): TM-sim 0.969470, WE-sim 0.274090
Topic models are better at capturing loose relationships, such as:
(war, commander): TM-sim 0.922352, WE-sim 0.187498
(living, households): TM-sim 0.983162, WE-sim 0.207906
(county, rural): TM-sim 0.882099, WE-sim 0.257984
Roughly the same results after using the same algorithm for both systems; see the sketch below.
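A minimal sketch of what applying the same similarity function to both systems could look like, assuming cosine similarity over a word's raw topic-probability vector and over its embedding; these helpers are illustrative and not part of the original pipeline:
from sklearn.metrics.pairwise import cosine_similarity

def cosine_sim_topics(word1, word2):
    # cosine over the raw topic-probability vectors, mirroring the embedding measure
    v1 = df_topic_probs_full.loc[df_topic_probs_full["word"] == word1, model.prob_columns].values
    v2 = df_topic_probs_full.loc[df_topic_probs_full["word"] == word2, model.prob_columns].values
    return cosine_similarity(v1, v2)[0, 0]

def cosine_sim_embeddings(word1, word2):
    return cosine_similarity(vectors[word1].reshape(1, -1), vectors[word2].reshape(1, -1))[0, 0]

print cosine_sim_topics("war", "commander"), cosine_sim_embeddings("war", "commander")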
In [32]:
def get_embedding_from_word_embedding(word):
    try:
        return vectors[word]
    except KeyError:
        # fall back to a frequent word's vector for out-of-vocabulary words
        return vectors["this"]
columns = [str(i) for i in range(256)]
def get_embedding_from_topics(word):
df_row = df_topic_probs_full[df_topic_probs_full["word"] == word]
    assert len(df_row) == 1, "not exactly one row found: %s (%d rows)" % (word, len(df_row))
return df_row[columns].iloc[0,:].tolist()
def get_df_concept(embedding_function):
df_concept = pnd.read_csv(
"/home/knub/Repositories/master-thesis/data/concept-categorization/battig_concept-categorization.tsv",
sep="\t",
header=None)
df_concept.columns = ["word", "concept"]
df_concept["embeddings"] = df_concept["word"].apply(embedding_function)
return df_concept
df_we_concept = get_df_concept(get_embedding_from_word_embedding)
df_tm_concept = get_df_concept(get_embedding_from_topics)
df_tm_concept.head(2)
Out[32]:
In [33]:
len(df_tm_concept.ix[0,"embeddings"])
Out[33]:
In [34]:
from sklearn import metrics
# http://stats.stackexchange.com/questions/95731/how-to-calculate-purity
def single_cluster_purity(df_param):
return df_param["concept"].value_counts().max()
def calculate_purity(df_param):
purity = float(sum([single_cluster_purity(df_cluster_group)
for _, df_cluster_group
in df_param.groupby("cluster_id")])) / len(df_param)
return purity
def evaluate_clustering_algorithm(df_param, clustering):
X = np.array(df_param["embeddings"].tolist())
X_sim = metrics.pairwise.pairwise_distances(X, metric="cosine")
# sim or not sim? PCA or not PCA?
clusters = clustering.fit_predict(pca(X_sim, 20))
df_param["cluster_id"] = clusters
return calculate_purity(df_param)
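To answer the "sim or not sim? PCA or not PCA?" question empirically, a variant that clusters the raw vectors directly could be compared against the version above; evaluate_clustering_algorithm_raw is a hypothetical helper and was not part of the original run:
def evaluate_clustering_algorithm_raw(df_param, clustering):
    # variant: cluster the raw vectors, skipping the cosine-distance matrix and PCA
    X = np.array(df_param["embeddings"].tolist())
    df_param["cluster_id"] = clustering.fit_predict(X)
    return calculate_purity(df_param)

print evaluate_clustering_algorithm_raw(df_we_concept, KMeans(n_clusters=10, init="k-means++", n_jobs=1))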
In [41]:
for df_concept in [df_we_concept, df_tm_concept]:
print "-" * 100
for clustering in [KMeans(n_clusters=10, init="k-means++", n_jobs=1)]:
print clustering.__class__.__name__
print evaluate_clustering_algorithm(df_concept, clustering)
In [35]:
for df_concept in [df_we_concept, df_tm_concept]:
print "-" * 100
for clustering in [KMeans(n_clusters=10, init="k-means++", n_jobs=1),
AgglomerativeClustering(n_clusters=10, linkage="ward"),
AgglomerativeClustering(n_clusters=10, linkage="complete"),
AgglomerativeClustering(n_clusters=10, linkage="average"),
AffinityPropagation(damping=0.5),
AffinityPropagation(damping=0.6),
AffinityPropagation(damping=0.7),
AffinityPropagation(damping=0.8),
AffinityPropagation(damping=0.9),
SpectralClustering(n_clusters=3)]:
print clustering.__class__.__name__
print evaluate_clustering_algorithm(df_concept, clustering)
In [36]:
def word_similarity(f):
try:
df_sim = pnd.read_csv(MODEL + f, sep="\t")
df_sim["embedding-sim"] = df_sim[["word1", "word2"]].apply(
lambda x: model.get_similarity(x["word1"], x["word2"], vectors), axis=1)
topic_sim_column = df_sim.columns[3]
topic_corr = df_sim[["human-sim", topic_sim_column]].corr("spearman").ix[0,1]
embedding_corr = df_sim[["human-sim", "embedding-sim"]].corr("spearman").ix[0, 1]
return pnd.DataFrame([[topic_corr, embedding_corr]],
columns=["topic_corr", "embedding_corr"],
index=[f])
except Exception as e:
return None
df_tmp = pnd.concat([word_similarity(".wordsim353-all-bhattacharyya"),
word_similarity(".wordsim353-all-hellinger"),
word_similarity(".wordsim353-all-jensen-shannon"),
word_similarity(".wordsim353-all-sum"),
word_similarity(".wordsim353-rel-bhattacharyya"),
word_similarity(".wordsim353-rel-hellinger"),
word_similarity(".wordsim353-rel-jensen-shannon"),
word_similarity(".wordsim353-rel-sum"),
word_similarity(".wordsim353-sim-bhattacharyya"),
word_similarity(".wordsim353-sim-hellinger"),
word_similarity(".wordsim353-sim-jensen-shannon"),
word_similarity(".wordsim353-sim-sum")])
df_tmp.sort_values(by="topic_corr", ascending=False)
Out[36]:
In [3]:
orig_vectors = load_skip_gram()
In [15]:
#orig_vectors.save_word2vec_format("/home/knub/Repositories/master-thesis/data/word-similarity/wordsim353_sim_rel/dim-200.embedding", binary=False)
In [11]:
with open("/home/knub/Repositories/master-thesis/data/word-similarity/wordsim353_sim_rel/dim-200.embedding", "r", encoding="utf-8") as f:
lines = [line.rstrip() for line in f]
count = int(lines[0].split(" ")[0])
lines = lines[1:]
words = []
vectors = []
for line in lines:
split = line.split(" ")
word = split[0]
words.append(word)
vector = [float(s) for s in split[1:]]
vectors.append(vector)
del lines
X = np.array(vectors)
print "Read embeddings"
In [13]:
print X.shape
print len(words)
In [68]:
def project_down(n):
with open("/home/knub/Repositories/master-thesis/data/word-similarity/wordsim353_sim_rel/dim-%d.embedding" % n, "w", encoding="utf-8") as f:
f.write("%d %d\n" % (count, n))
pca_X = pca(X, n)
for i in range(count):
vector = pca_X[i,:]
output_vector = " ".join([str(v) for v in vector])
f.write("%s %s\n" % (words[i], output_vector))
DIMENSIONS = [110, 120, 130, 140]
for n in [d for d in DIMENSIONS if d != 200]:  # do not recreate the original 200-dimensional embedding
print n
project_down(n)
gc.collect()
In [69]:
df_wordsim353 = pnd.read_csv("/home/knub/Repositories/master-thesis/data/word-similarity/wordsim353_sim_rel/wordsim_all_goldstandard.txt",
sep="\t", header=None, names=["word1", "word2", "similarity"])
def get_similarity(word1, word2, v):
# ugly but works for now
if word1 not in v:
if word1.lower() in v:
word1 = word1.lower()
if word1.upper() in v:
word1 = word1.upper()
if word1.title() in v:
word1 = word1.title()
if word2 not in v:
if word2.lower() in v:
word2 = word2.lower()
if word2.upper() in v:
word2 = word2.upper()
if word2.title() in v:
word2 = word2.title()
try:
return v.similarity(word1, word2)
except KeyError:
print word1, word2
if word1 not in v:
print word1
if word2 not in v:
print word2
def evaluate():
for dim in DIMENSIONS:
gc.collect()
print dim
vectors = gensim.models.word2vec.Word2Vec.load_word2vec_format(
"/home/knub/Repositories/master-thesis/data/word-similarity/wordsim353_sim_rel/dim-%d.embedding" % dim,
binary=False)
df_wordsim353["dim-%d" % dim] = df_wordsim353[["word1", "word2"]].apply(
lambda x: get_similarity(x["word1"], x["word2"], vectors), axis=1)
evaluate()
gc.collect()
Out[69]:
In [70]:
for dim in DIMENSIONS:
print dim
print df_wordsim353["similarity"].corr(df_wordsim353["dim-%d" % dim])