In [1]:
%cd ~/NetBeansProjects/ExpLosion/
from notebooks.common_imports import *
import logging
import pandas as pd
logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s\t%(module)s.%(funcName)s """
                               "(line %(lineno)d)\t%(levelname)s : %(""message)s")
from discoutils.thesaurus_loader import Vectors


/Volumes/LocalDataHD/m/mm/mmb28/NetBeansProjects/ExpLosion

In [2]:
path = '/home/m/mm/mmb28/Desktop/down/exp10-with-obs-phrases-SVD100.events.filtered.strings'
v = Vectors.from_tsv(path)
v.init_sims()

In [3]:
path = '/home/m/mm/mmb28/Desktop/down/word2vec-gigaw-100perc.unigr.strings.rep0'
w2v = Vectors.from_tsv(path)
w2v.init_sims()

In [4]:
compare_neighbours([v, w2v], 
        ['wins', 'w2v'], 
        ['attack/V', 'small/J', 'car/N', 'computer/N', 'official/N', 'monday/N', 'arafat/N', 'kill/V'])


Out[4]:
wins w2v
attack/V raid/N, assault/N, firing/N, shell/V target/V, use/V, abandon/V, harkatul/N
small/J large/J, huge/J, place/V, damage/V large/J, eclectic/J, huge/J, cellphone/N
car/N vehicle/N, truck/N, bus/N, driver/N scooter/N, truck/N, vehicle/N, showroom/N
computer/N software/N, technology/N, internet/N, video/N software/N, photocopier/N, electronic/J, high-...
official/N source/N, statement/N, ministry/N, authority/N expert/N, diplomat/N, offical/N, source/N
monday/N tuesday/N, thursday/N, wednesday/N, friday/N thursday/N, wednesday/N, tuesday/N, friday/N
arafat/N yasser/N, netanyahu/N, plo/N, rabin/N israel/N, sharon/N, palestinian/J, yasser/N
kill/V dead/J, die/V, injure/V, wound/V injure/V, dead/J, wound/V, policeman/N

In [5]:
compare_neighbours([v, w2v], ['wins', 'w2v'])


Out[5]:
wins w2v
aberration/N short-sighted/J, perverse/J, illogical/J, unju... over-reaction/N, childish/J, peculiar/J, overr...
planet/N mars/N, mouse/N, layer/N, episode/N earth/N, martian/N, moon/N, footprint/N
ultra-nationalist/N car-maker/N, fall-out/N, hinterland/N, also-ran/N leftwinger/N, pro-taiwan/J, anti-fascist/J, ma...
jesus/N christ/N, diocese/N, ruiz/N, ayacucho/N crucifixion/N, birthplace/N, lourdes/N, macare...
abuse/N torture/V, sexual/J, criminal/J, offence/N harassment/N, sexual/J, sex/N, dishonesty/N
blister/V blistering/J, ruck/N, smother/V, glance/V faultless/J, four-minute/J, somersault/N, righ...
ratio/N shortfall/N, shrink/V, adjust/V, decrease/V borrowing/N, income/N, seven-percent/J, decele...
intercept/V craft/N, board/V, freighter/N, coastguard/N spot/V, boat/N, onboard/V, ship/N
ratwatte/N anuruddha/N, chavalit/N, pawar/N, likulia/N daluwatte/N, kazbek/N, peiris/N, kasrils/N
fatf/N escap/N, nsc/N, unosom/N, igadd/N fsc/N, hipc/N, opc/N, tpa/N

In [6]:
path = '/lustre/scratch/inf/mmb28/FeatureExtractionToolkit/word2vec_vectors/word2vec-wiki-15perc.unigr.strings.rep%d'
vect = [Vectors.from_tsv(path%i)  for i in [0, 1, 2]]
for v in vect:
    v.init_sims()

In [7]:
df = compare_neighbours(vect, [0, 1, 2])

In [8]:
print(df.to_latex())


\begin{tabular}{llll}
\toprule
{} &                                                  0 &                                                  1 &                                                  2 \\
\midrule
strap/V          &              nail/V, handcuff/V, duck/V, bandage/V &          clutch/V, dangle/V, reattach/V, chained/J &            puncture/V, flick/V, clip/V, handcuff/V \\
opener/N         &         mid-season/J, alcs/N, gabba/N, mystics/UNK &  unbeaten/N, winless/J, non-conference/J, rain-... &  4-4/UNK, seasiders/N, redbacks/UNK, full-forwa... \\
starter/N        &  placekicker/N, bench/V, high-scoring/J, belfour/N &   sixers/UNK, huskers/UNK, left-hander/N, mavs/UNK &          keno/N, uteritz/N, powerball/N, redskin/N \\
whiteside/N      &              bibb/N, izard/N, paulding/N, starke/N &       dorrance/N, carrol/N, prentiss/N, tilghman/N &             nichol/N, ryland/N, beamish/N, tighe/N \\
votive/J         &  mortuary/J, alabaster/N, rock-cut/J, sculptured/J &  reliquary/N, funerary/J, lectern/N, sacrificial/J &     reliquary/N, mandalum/N, recumbent/J, krater/N \\
eschatological/J &  redemptive/J, millennial/J, prophetic/J, apoca... &  trinitarian/J, eschatology/N, predestination/N... &  predestination/N, eschatology/N, theodicy/N, m... \\
staple/J         &      canned/J, wine-making/N, starchy/J, masalum/N &             fugu/N, arabica/N, handcraft/N, coir/N &     venison/N, artisanal/J, microfiber/N, canned/J \\
nightly/J        &      weeknight/N, call-in/N, lunchtime/N, cbs-tv/N &  late-night/J, phone-in/N, three-hour/J, two-ho... &  three-hour/J, lunchtime/N, late-night/J, call-... \\
bsn/N            &                      qmjhl/N, uvm/N, tcu/N, sdsu/N &                        dsa/N, bca/N, usu/N, nabc/N &                        ccaa/N, vcu/N, dpe/N, ccl/N \\
capsule/N        &            pupal/J, airlock/N, grate/N, meniscus/N &          ovule/N, receptacle/N, eggshell/N, husk/N &     receptacle/N, mouthpart/N, ascospore/N, pupa/N \\
\bottomrule
\end{tabular}


In [9]:
df


Out[9]:
0 1 2
strap/V nail/V, handcuff/V, duck/V, bandage/V clutch/V, dangle/V, reattach/V, chained/J puncture/V, flick/V, clip/V, handcuff/V
opener/N mid-season/J, alcs/N, gabba/N, mystics/UNK unbeaten/N, winless/J, non-conference/J, rain-... 4-4/UNK, seasiders/N, redbacks/UNK, full-forwa...
starter/N placekicker/N, bench/V, high-scoring/J, belfour/N sixers/UNK, huskers/UNK, left-hander/N, mavs/UNK keno/N, uteritz/N, powerball/N, redskin/N
whiteside/N bibb/N, izard/N, paulding/N, starke/N dorrance/N, carrol/N, prentiss/N, tilghman/N nichol/N, ryland/N, beamish/N, tighe/N
votive/J mortuary/J, alabaster/N, rock-cut/J, sculptured/J reliquary/N, funerary/J, lectern/N, sacrificial/J reliquary/N, mandalum/N, recumbent/J, krater/N
eschatological/J redemptive/J, millennial/J, prophetic/J, apoca... trinitarian/J, eschatology/N, predestination/N... predestination/N, eschatology/N, theodicy/N, m...
staple/J canned/J, wine-making/N, starchy/J, masalum/N fugu/N, arabica/N, handcraft/N, coir/N venison/N, artisanal/J, microfiber/N, canned/J
nightly/J weeknight/N, call-in/N, lunchtime/N, cbs-tv/N late-night/J, phone-in/N, three-hour/J, two-ho... three-hour/J, lunchtime/N, late-night/J, call-...
bsn/N qmjhl/N, uvm/N, tcu/N, sdsu/N dsa/N, bca/N, usu/N, nabc/N ccaa/N, vcu/N, dpe/N, ccl/N
capsule/N pupal/J, airlock/N, grate/N, meniscus/N ovule/N, receptacle/N, eggshell/N, husk/N receptacle/N, mouthpart/N, ascospore/N, pupa/N

In [10]:
df = compare_neighbours(vect, [0, 1, 2], ['stalin/N', 'microsoft/N', 'fugitive/N', 'car/N', 'paris/N', 'smith/N'])

In [11]:
df


Out[11]:
0 1 2
stalin/N lenin/N, kaganovich/N, shevardnadze/N, trotsky/N lenin/N, hitler/N, goebbels/N, eichmann/N lenin/N, goebbels/N, hitler/N, bakunin/N
microsoft/N linux/N, unix/N, ms-do/N, ibm/N sharepoint/N, ibm/N, sdk/N, ios/N smartphone/N, ms-do/N, ibm/N, linux/N
fugitive/N fugitive/J, manumission/N, superhuman/N, count... escapee/N, pretence/N, looter/N, fugitive/J infiltrator/N, convict/N, escapee/N, non-comba...
car/N truck/N, motorcycle/N, vehicle/N, automobile/N truck/N, motorbike/N, automobile/N, vehicle/N truck/N, motorbike/N, driver/N, automobile/N
paris/N brussels/N, saint-cloud/N, marseille/N, amster... brussels/N, aix-en-provence/N, strasbourg/N, d... brussels/N, aix-en-provence/N, dijon/N, strasb...
smith/N thompson/N, miller/N, chapman/N, taylor/N taylor/N, williams/N, lewis/N, miller/N taylor/N, miller/N, thompson/N, allen/N

In [12]:
print(df.to_latex())


\begin{tabular}{llll}
\toprule
{} &                                                  0 &                                                  1 &                                                  2 \\
\midrule
stalin/N    &   lenin/N, kaganovich/N, shevardnadze/N, trotsky/N &          lenin/N, hitler/N, goebbels/N, eichmann/N &           lenin/N, goebbels/N, hitler/N, bakunin/N \\
microsoft/N &                    linux/N, unix/N, ms-do/N, ibm/N &                  sharepoint/N, ibm/N, sdk/N, ios/N &              smartphone/N, ms-do/N, ibm/N, linux/N \\
fugitive/N  &  fugitive/J, manumission/N, superhuman/N, count... &        escapee/N, pretence/N, looter/N, fugitive/J &  infiltrator/N, convict/N, escapee/N, non-comba... \\
car/N       &     truck/N, motorcycle/N, vehicle/N, automobile/N &      truck/N, motorbike/N, automobile/N, vehicle/N &       truck/N, motorbike/N, driver/N, automobile/N \\
paris/N     &  brussels/N, saint-cloud/N, marseille/N, amster... &  brussels/N, aix-en-provence/N, strasbourg/N, d... &  brussels/N, aix-en-provence/N, dijon/N, strasb... \\
smith/N     &          thompson/N, miller/N, chapman/N, taylor/N &            taylor/N, williams/N, lewis/N, miller/N &            taylor/N, miller/N, thompson/N, allen/N \\
\bottomrule
\end{tabular}


In [13]:
print(pd.DataFrame(df.stack()).to_latex())


\begin{tabular}{lll}
\toprule
         &   &                                                  0 \\
\midrule
stalin/N & 0 &   lenin/N, kaganovich/N, shevardnadze/N, trotsky/N \\
         & 1 &          lenin/N, hitler/N, goebbels/N, eichmann/N \\
         & 2 &           lenin/N, goebbels/N, hitler/N, bakunin/N \\
microsoft/N & 0 &                    linux/N, unix/N, ms-do/N, ibm/N \\
         & 1 &                  sharepoint/N, ibm/N, sdk/N, ios/N \\
         & 2 &              smartphone/N, ms-do/N, ibm/N, linux/N \\
fugitive/N & 0 &  fugitive/J, manumission/N, superhuman/N, count... \\
         & 1 &        escapee/N, pretence/N, looter/N, fugitive/J \\
         & 2 &  infiltrator/N, convict/N, escapee/N, non-comba... \\
car/N & 0 &     truck/N, motorcycle/N, vehicle/N, automobile/N \\
         & 1 &      truck/N, motorbike/N, automobile/N, vehicle/N \\
         & 2 &       truck/N, motorbike/N, driver/N, automobile/N \\
paris/N & 0 &  brussels/N, saint-cloud/N, marseille/N, amster... \\
         & 1 &  brussels/N, aix-en-provence/N, strasbourg/N, d... \\
         & 2 &  brussels/N, aix-en-provence/N, dijon/N, strasb... \\
smith/N & 0 &          thompson/N, miller/N, chapman/N, taylor/N \\
         & 1 &            taylor/N, williams/N, lewis/N, miller/N \\
         & 2 &            taylor/N, miller/N, thompson/N, allen/N \\
\bottomrule
\end{tabular}


In [14]:
import sys
sys.path.append('../thesisgenerator/thesisgenerator/')
from thesisgenerator.plugins.multivectors import MultiVectors
mv = MultiVectors(vect)
mv.init_sims(n_neighbors=100)

In [15]:
data = []
maxn=8
word = 'mercedes/N'
reordered = [x[0] for x in mv.get_nearest_neighbours(word)[:maxn]]
# mercedes (removes names, e.g. alesi, sebastien)
# silver/N (removes "event", "caldecott") which belong to a certain sense (competition)
data.append(reordered)
reordered


Out[15]:
['spyder/N',
 'lancia/N',
 'citroen/N',
 'ferrari/N',
 'infiniti/N',
 'integra/N',
 'flavio/N',
 'cadillac/N']

In [16]:
for v in mv.vectors:
    n = v.get_nearest_neighbours(word)
    data.append([x[0] for x in n[:maxn]])
    print([x[0] for x in n[:maxn]])
    print('---------------')


['lancia/N', 'bugatti/N', 'flavio/N', 'hino/N', 'ligier/N', 'ferrari/N', 'phaeton/N', 'ascari/N']
---------------
['ferrari/N', 'lola/N', 'oldsmobile/N', 'gallardo/N', 'risi/N', 'corse/N', 'scuderia/N', 'lancia/N']
---------------
['integra/N', 'lola/N', 'citroen/N', 'acura/N', 'ascari/N', 'prost/N', 'dallara/N', 'risi/N']
---------------

In [17]:
ddf= pd.DataFrame(data[::-1], index='rep0 rep1 rep2 reordered'.split(),
            columns=['Neigh %d'%(i+1) for i in range(maxn)])
ddf


Out[17]:
Neigh 1 Neigh 2 Neigh 3 Neigh 4 Neigh 5 Neigh 6 Neigh 7 Neigh 8
rep0 integra/N lola/N citroen/N acura/N ascari/N prost/N dallara/N risi/N
rep1 ferrari/N lola/N oldsmobile/N gallardo/N risi/N corse/N scuderia/N lancia/N
rep2 lancia/N bugatti/N flavio/N hino/N ligier/N ferrari/N phaeton/N ascari/N
reordered spyder/N lancia/N citroen/N ferrari/N infiniti/N integra/N flavio/N cadillac/N

In [18]:
print(ddf.to_latex())


\begin{tabular}{lllllllll}
\toprule
{} &    Neigh 1 &    Neigh 2 &       Neigh 3 &     Neigh 4 &     Neigh 5 &    Neigh 6 &     Neigh 7 &     Neigh 8 \\
\midrule
rep0      &  integra/N &     lola/N &     citroen/N &     acura/N &    ascari/N &    prost/N &   dallara/N &      risi/N \\
rep1      &  ferrari/N &     lola/N &  oldsmobile/N &  gallardo/N &      risi/N &    corse/N &  scuderia/N &    lancia/N \\
rep2      &   lancia/N &  bugatti/N &      flavio/N &      hino/N &    ligier/N &  ferrari/N &   phaeton/N &    ascari/N \\
reordered &   spyder/N &   lancia/N &     citroen/N &   ferrari/N &  infiniti/N &  integra/N &    flavio/N &  cadillac/N \\
\bottomrule
\end{tabular}


In [ ]: