Variation due to word2vec's random initialisation

  • fairly small at the word analogy task

In [8]:
%cd ~/NetBeansProjects/ExpLosion/
from itertools import chain
from notebooks.common_imports import *
from gui.output_utils import *
from gui.user_code import pretty_names, pairwise_significance_exp_ids

# Replace the bootstrap function that seaborn's timeseries and categorical modules use
# with `my_bootstrap`.
# NOTE(review): `my_bootstrap` is not defined in this notebook; presumably it comes from one
# of the star imports above -- confirm. This patches seaborn internals, so it is tied to the
# seaborn version that exposes `sns.timeseries` and `sns.categorical.bootstrap`.
sns.timeseries.algo.bootstrap = my_bootstrap
sns.categorical.bootstrap = my_bootstrap


/Users/miroslavbatchkarov/NetBeansProjects/ExpLosion

In [9]:
def get(corpus='amazon_grouped-tagged', rep=0, avg=False, reorder=False,
        composers=('Add', 'Mult', 'Left', 'Right'), k=(3,)):
    """Return the ids of all experiments that match a fixed word2vec configuration.

    Every field not exposed as a parameter is pinned in the query below
    (word2vec vectors, dimensionality 100, 15% of the 'wiki' unlabelled corpus, ...).

    :param corpus: value required for the `labelled` field
    :param rep: value required for `expansions__vectors__rep`
    :param avg: value required for `expansions__vectors__avg`
    :param reorder: value required for `expansions__vectors__reorder`
    :param composers: iterable of composer names; an experiment matches if its
        composer is any of them
    :param k: iterable of accepted values for `expansions__k`
    :return: list of ids of the matching `Experiment` objects
    """
    query_dict = {
        # the original literal listed 'expansions__use_similarity' twice; one entry suffices
        'expansions__use_similarity': 0,
        'expansions__neighbour_strategy': 'linear',
        'expansions__vectors__dimensionality': 100,
        'document_features_ev': 'AN+NN',
        'document_features_tr': 'J+N+AN+NN',
        'expansions__allow_overlap': False,
        'expansions__entries_of': None,
        'expansions__vectors__algorithm': 'word2vec',
        # copy into fresh lists so the (now immutable) defaults are never shared or mutated
        'expansions__vectors__composer__in': list(composers),
        'expansions__vectors__unlabelled': 'wiki',
        'expansions__decode_handler': 'SignifiedOnlyFeatureHandler',
        'expansions__noise': 0,
        'expansions__k__in': list(k),
        'expansions__vectors__unlabelled_percentage': 15,
        'expansions__vectors__rep': rep,
        'expansions__vectors__avg': avg,
        'expansions__vectors__reorder': reorder,
        'labelled': corpus,
    }
    return [experiment.id for experiment in Experiment.objects.filter(**query_dict)]

In [13]:
# Ids of the experiments for repeats 0, 1 and 2 (all default composers of `get`).
ids = [exp_id for r in [0, 1, 2] for exp_id in get(rep=r)]
print(ids)
# Data frame for these experiments, exposing the repeat as 'View' and the composer as 'Composer'.
df = dataframe_from_exp_ids(
    ids,
    fields_to_include={
        'View': 'expansions__vectors__rep',
        'Composer': 'expansions__vectors__composer',
    },
)
# Bar chart of accuracy per composer, with one bar per repeat.
with sns.color_palette("cubehelix", 4):
    g = sns.factorplot(
        data=df, x='Composer', y='Accuracy', hue='View',
        hue_order=['0', '1', '2'],
        kind='bar', ci=68, aspect=2,
    )
plt.savefig('plot-w2v_random_init_var.pdf', format='pdf', dpi=300,
            bbox_inches='tight', pad_inches=0.1)


[75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86]
Composer has 6000 values
Accuracy has 6000 values
View has 6000 values
folds has 6000 values

In [4]:
# Ids of the experiments for repeats 0, 1 and 2, restricted to the 'Add' composer.
ids = list(chain.from_iterable(get(rep=r, composers=['Add']) for r in [0, 1, 2]))
ids


Out[4]:
[75, 79, 83]

In [5]:
# Are the accuracy differences between the repeats significant?
# get_demsar_params returns three values; only the first (the pairwise comparison
# table displayed below) is kept.
sign_df, _, _ = get_demsar_params(ids, ['expansions__vectors__rep'])
sign_df


Out[5]:
name1 acc1 name2 acc2 mean_diff pval significant
0 0 0.471414 1 0.469746 0.001668 0.8 False
1 0 0.471414 2 0.450353 0.021061 0.0 True
2 1 0.469746 2 0.450353 0.019393 0.0 True

Repeats on R2 corpus

Does the smaller R2 dataset show differences between repeats on a sample of approximately the same size?

There is a difference up to 4%, but it is not significant


In [6]:
# Same comparison of repeats 0, 1 and 2, but on the 'reuters21578/r8-tagged-grouped' corpus.
ids = list(chain.from_iterable(get(corpus='reuters21578/r8-tagged-grouped', rep=r) for r in [0, 1, 2]))
print(ids)
# Display only the first value returned by get_demsar_params (the pairwise comparison table).
get_demsar_params(ids, ['expansions__vectors__rep'])[0]


[364, 365, 366]
Out[6]:
name1 acc1 name2 acc2 mean_diff pval significant
0 0 0.666391 1 0.658423 0.007968 0.676 False
1 0 0.666391 2 0.622785 0.043607 0.020 False
2 1 0.658423 2 0.622785 0.035639 0.066 False

In [15]:
# Experiments to compare ('Add' composer only, k = 3 and k = 30):
#   * rep=0 with the default settings,
#   * rep=3 with avg=True,
#   * rep=2..5 with reorder=True.
ids = get(rep=0, composers=['Add'], k=[3, 30]) + get(rep=3, avg=True, composers=['Add'], k=[3, 30])
ids = ids + [exp_id
             for r in [2, 3, 4, 5]
             for exp_id in get(rep=r, reorder=True, composers=['Add'], k=[3, 30])]
print(ids)
df = dataframe_from_exp_ids(
    ids,
    fields_to_include={
        'rep': 'expansions__vectors__rep',
        'avg': 'expansions__vectors__avg',
        'dice': 'expansions__vectors__reorder',
        'k': 'expansions__k',
        'Composer': 'expansions__vectors__composer',
    },
).convert_objects(convert_numeric=True)

# Derive a method label per row. Order matters: everything starts as 'avg3', then rows
# with rep == 0 become 'std', then reordered rows become 'dice<rep>'.
df['method'] = 'avg3'
df.loc[df['rep'] == 0, 'method'] = 'std'
for i in [2, 3, 4, 5]:
    df.loc[(df['rep'] == i) & (df['dice'] == 1), 'method'] = 'dice%d' % i
# The helper columns are no longer needed once the label exists.
df = df.drop(['avg', 'dice', 'rep'], axis=1)
df['Method'] = df['method']
with sns.color_palette("cubehelix", 6):
    g = sns.factorplot(data=df, x='k', y='Accuracy', hue='Method',
                       kind='bar', ci=68, aspect=2)
    g.set(ylim=(.3, None))
plt.savefig('plot-w2v_random_init_boost.pdf', format='pdf', dpi=300,
            bbox_inches='tight', pad_inches=0.1)


[75, 390, 71, 391, 296, 367, 297, 368, 298, 369, 299, 370]
Accuracy has 6000 values
k has 6000 values
avg has 6000 values
folds has 6000 values
Composer has 6000 values
rep has 6000 values
dice has 6000 values

In [12]:
# Look up the id of the experiment with k=30, the 'Add' composer and reorder=False.
# NOTE(review): this relies on exactly one experiment matching these three fields;
# the `get(...)` helper cell in this notebook lists ids [75, 390] for k in [3, 30], so
# more than one k=30 experiment may exist now -- confirm the query is still unambiguous.
Experiment.objects.get(expansions__k=30, expansions__vectors__composer='Add',
                       expansions__vectors__reorder=False).id
# 55= k=30, 100% wiki
# 75= k=3, 15%wiki


Out[12]:
55

In [16]:
# Show element 1 of get_ci's result for experiments 55 and 370 side by side (ids are hard-coded).
# NOTE(review): get_ci is defined outside this notebook; element 1 is presumably the mean
# accuracy -- confirm.
get_ci(55)[1], get_ci(370)[1]


Out[16]:
(0.65484279141104296, 0.63607821193323533)

Are the differences significant?


In [13]:
# Keep only the experiments whose composer is 'Add' (one database lookup per id).
ids1 = [i for i in ids if Experiment.objects.get(id=i).expansions.vectors.composer=='Add']
print(ids1)
# NOTE(review): the filtered list `ids1` is printed but never used -- the significance test
# below runs on the unfiltered `ids` (the recorded output contains Mult/Left/Right rows).
# Confirm whether `ids1` was meant to be passed here.
# Only the first value returned by get_demsar_params (the pairwise table) is displayed.
get_demsar_params(ids, ['expansions__vectors__composer', 
                        'expansions__vectors__rep'])[0]


[75, 71, 296, 297, 298, 299]
Out[13]:
name1 acc1 name2 acc2 mean_diff pval significant
0 Add-0 0.471414 Mult-0 0.452228 0.019186 0.000 True
1 Add-0 0.471414 Left-0 0.322392 0.149022 0.000 True
2 Add-0 0.471414 Right-0 0.313756 0.157658 0.000 True
3 Add-0 0.471414 Add-3 0.451337 0.020077 0.004 True
4 Add-0 0.471414 Mult-3 0.419040 0.052374 0.000 True
5 Add-0 0.471414 Left-3 0.307700 0.163714 0.000 True
6 Add-0 0.471414 Right-3 0.294694 0.176720 0.000 True
7 Add-0 0.471414 Add-2 0.487203 0.015789 0.014 False
8 Add-0 0.471414 Mult-2 0.481405 0.009992 0.088 False
9 Add-0 0.471414 Left-2 0.327719 0.143695 0.000 True
10 Add-0 0.471414 Right-2 0.325976 0.145438 0.000 True
11 Add-0 0.471414 Add-3 0.497166 0.025752 0.000 True
12 Add-0 0.471414 Mult-3 0.488409 0.016995 0.002 True
13 Add-0 0.471414 Left-3 0.337498 0.133916 0.000 True
14 Add-0 0.471414 Right-3 0.319933 0.151481 0.000 True
15 Add-0 0.471414 Add-4 0.509483 0.038069 0.000 True
16 Add-0 0.471414 Add-5 0.512566 0.041152 0.000 True
17 Mult-0 0.452228 Left-0 0.322392 0.129836 0.000 True
18 Mult-0 0.452228 Right-0 0.313756 0.138472 0.000 True
19 Mult-0 0.452228 Add-3 0.451337 0.000891 0.890 False
20 Mult-0 0.452228 Mult-3 0.419040 0.033187 0.000 True
21 Mult-0 0.452228 Left-3 0.307700 0.144528 0.000 True
22 Mult-0 0.452228 Right-3 0.294694 0.157534 0.000 True
23 Mult-0 0.452228 Add-2 0.487203 0.034975 0.000 True
24 Mult-0 0.452228 Mult-2 0.481405 0.029178 0.000 True
25 Mult-0 0.452228 Left-2 0.327719 0.124509 0.000 True
26 Mult-0 0.452228 Right-2 0.325976 0.126252 0.000 True
27 Mult-0 0.452228 Add-3 0.497166 0.044938 0.000 True
28 Mult-0 0.452228 Mult-3 0.488409 0.036182 0.000 True
29 Mult-0 0.452228 Left-3 0.337498 0.114730 0.000 True
... ... ... ... ... ... ... ...
123 Mult-2 0.481405 Add-4 0.509483 0.028078 0.000 True
124 Mult-2 0.481405 Add-5 0.512566 0.031161 0.000 True
125 Left-2 0.327719 Right-2 0.325976 0.001743 0.810 False
126 Left-2 0.327719 Add-3 0.497166 0.169447 0.000 True
127 Left-2 0.327719 Mult-3 0.488409 0.160691 0.000 True
128 Left-2 0.327719 Left-3 0.337498 0.009779 0.126 False
129 Left-2 0.327719 Right-3 0.319933 0.007786 0.252 False
130 Left-2 0.327719 Add-4 0.509483 0.181764 0.000 True
131 Left-2 0.327719 Add-5 0.512566 0.184848 0.000 True
132 Right-2 0.325976 Add-3 0.497166 0.171190 0.000 True
133 Right-2 0.325976 Mult-3 0.488409 0.162434 0.000 True
134 Right-2 0.325976 Left-3 0.337498 0.011522 0.114 False
135 Right-2 0.325976 Right-3 0.319933 0.006043 0.400 False
136 Right-2 0.325976 Add-4 0.509483 0.183507 0.000 True
137 Right-2 0.325976 Add-5 0.512566 0.186591 0.000 True
138 Add-3 0.497166 Mult-3 0.488409 0.008756 0.120 False
139 Add-3 0.497166 Left-3 0.337498 0.159668 0.000 True
140 Add-3 0.497166 Right-3 0.319933 0.177233 0.000 True
141 Add-3 0.497166 Add-4 0.509483 0.012317 0.028 False
142 Add-3 0.497166 Add-5 0.512566 0.015400 0.012 False
143 Mult-3 0.488409 Left-3 0.337498 0.150912 0.000 True
144 Mult-3 0.488409 Right-3 0.319933 0.168476 0.000 True
145 Mult-3 0.488409 Add-4 0.509483 0.021074 0.000 True
146 Mult-3 0.488409 Add-5 0.512566 0.024157 0.000 True
147 Left-3 0.337498 Right-3 0.319933 0.017565 0.012 False
148 Left-3 0.337498 Add-4 0.509483 0.171985 0.000 True
149 Left-3 0.337498 Add-5 0.512566 0.175069 0.000 True
150 Right-3 0.319933 Add-4 0.509483 0.189550 0.000 True
151 Right-3 0.319933 Add-5 0.512566 0.192633 0.000 True
152 Add-4 0.509483 Add-5 0.512566 0.003083 0.618 False

153 rows × 7 columns

Compare unigram vectors between multiple runs

How many of the top n neighbours of some entries are the same across multiple runs?


In [16]:
from glob import glob
from discoutils.thesaurus_loader import Vectors as V
from random import sample
from itertools import combinations

# Load every repeated-run vector file matching the pattern (15% wiki, one file per repeat),
# in sorted filename order, so that thes[i] corresponds to the i-th file.
# NOTE(review): absolute cluster path -- the cell only works where this directory is mounted.
pattern = '/lustre/scratch/inf/mmb28/FeatureExtractionToolkit/word2vec_vectors/word2vec-wiki-15perc.unigr.strings.rep*'
files = sorted(glob(pattern))
thes = [V.from_tsv(f) for f in files]
# Prepare each thesaurus for nearest-neighbour queries with 100 neighbours.
for t in thes:
    t.init_sims(n_neighbors=100)

In [17]:
def dice(n1, n2):
    """Dice coefficient between two collections: 2 * |n1 & n2| / (|n1| + |n2|).

    :param n1: sized iterable of hashable items (e.g. a list of neighbours or dict keys)
    :param n2: sized iterable of hashable items
    :return: float in [0, 1]; 0.0 when both collections are empty (the original
        raised ZeroDivisionError in that case)
    """
    total = len(n1) + len(n2)
    if total == 0:
        # nothing to compare: report "no overlap" instead of dividing by zero
        return 0.0
    shared = set(n1) & set(n2)
    return 2 * len(shared) / total

In [7]:
# Dice overlap between the vocabularies (keys) of every pair of thesauri.
for i,j in combinations(range(len(thes)), 2):
    print(i, j, dice(thes[i].keys(), thes[j].keys()))


0 1 0.8159619711046169
0 2 0.8076966254293989
1 2 0.883361665324625

In [20]:
def dice_loop(words, thes, log=False):
    """Compare the neighbours of each word between every pair of thesauri.

    :param words: re-iterable collection of entries (it is traversed once per pair)
    :param thes: sequence of thesauri; each must provide `get_nearest_neighbours(word)`
        returning an iterable of tuples whose first element is the neighbour
    :param log: if True, print a progress line for every pair of thesauri
    :return: list of `[pair_id, word, dice]` lists; words with no neighbours in
        either thesaurus of a pair are skipped for that pair
    """
    dice_data = []
    for i, j in combinations(range(len(thes)), 2):
        # 1-based labels; a second thesaurus beyond the third is labelled 'A'.
        # '%s' (not '%r') so that the label reads "1 & A" rather than "1 & 'A'".
        pair_id = '%d & %s' % (i + 1, j + 1 if j < 3 else 'A')
        if log:
            print('Doing pair', pair_id, flush=True)
        for word in words:
            # keep only the neighbour names, dropping whatever else each tuple carries
            n1 = [x[0] for x in thes[i].get_nearest_neighbours(word)]
            n2 = [x[0] for x in thes[j].get_nearest_neighbours(word)]
            if n1 and n2:
                dice_data.append([pair_id, word, dice(n1, n2)])
    return dice_data

In [22]:
# Draw 5000 entries from the vocabulary of the first thesaurus and compare their
# neighbours between every pair of thesauri.
# NOTE(review): no random seed is set in this notebook, so the sample (and the plot built
# from it) presumably changes on every run -- confirm whether a seed is set elsewhere.
sampled_words = sample(list(thes[0].keys()), 5000)
dice_data = dice_loop(sampled_words, thes, log=True)


Doing pair 1 & 2
Doing pair 1 & 3
Doing pair 2 & 3

In [23]:
# One row per (pair of runs, word) holding the Dice overlap of the word's neighbour lists.
df3 = pd.DataFrame(dice_data, columns=['Views', 'Word', 'Dice'])
with sns.axes_style("white"):
    # One distribution plot of the Dice scores per pair of runs.
    g = sns.FacetGrid(df3, col="Views", col_wrap=3)
    g.map(sns.distplot, 'Dice', kde=True)

# Tidy every panel: fewer tick labels, a common x range and no y tick labels.
for ax in g.axes.flat:
    sparsify_axis_labels(ax)
    ax.set_xlim(0, 1.01)
    ax.set_yticklabels([])

sns.despine(left=True, bottom=True)
plt.savefig('plot-w2v_random_init_neigh_overlap.pdf', format='pdf', dpi=300,
            bbox_inches='tight', pad_inches=0.1)


Observations

Neighbours tend to be quite different over multiple runs, but the overall accuracy of the classification task changes very little

Qualitative analysis

See below. It seems to me that good neighbours (those that seem sensible) tend to be the same across repeated runs.


In [27]:
def multiway_dice(entry, thesauri):
    """Mean Dice overlap of the neighbours of `entry` over all pairs of `thesauri`.

    Pairs in which `entry` has no neighbours in one of the thesauri do not contribute,
    because `dice_loop` omits them.
    """
    rows = dice_loop([entry], thesauri)
    scores = pd.DataFrame(rows, columns=['Pair', 'Word', 'Dice'])
    return scores['Dice'].mean()

In [33]:
# Neighbours of a few hand-picked entries in the thesauri with indices 0, 1, 2, 4 and 5.
# NOTE(review): compare_neighbours is defined outside this notebook, and the loading cell's
# recorded output shows only three thesauri -- `thes` presumably held more when this
# cell was run; confirm.
df = compare_neighbours(thes, [0, 1, 2, 4, 5],
                        words=['balkans/N', 'lesbian/J', 'ottawa/N', 'sneaker/N', 'essay/N', 'falsify/V', 'inborn/J'])
# Add the mean pairwise Dice overlap of each entry, computed over all of `thes`.
df['mw_dice'] = [multiway_dice(feat, thes) for feat in df.index]
df.to_csv('compare_repeated_w2v.csv')
# Display the table ordered by overlap.
# NOTE(review): DataFrame.sort only exists in old pandas releases; newer ones use sort_values.
df.sort('mw_dice')


Out[33]:
0 1 2 4 5 mw_dice
inborn/J demonstrable/J, neuropsychological/J, contrain... posturing/N, physiologically/RB, self-percepti... pernicious/J, nonverbal/J, uncomplicated/J, un... internalization/N, generalised/J, flecainide/N... psychopathy/N, paraphilia/N, internalization/N... 0.115
balkans/N transoxiana/N, dagestan/N, arakan/N, ostsiedlu... cisalpine/N, interbellum/N, south-eastern/N, b... transcaucasia/N, dodecanese/N, carpathians/UNK... transcaucasia/N, rumelia/N, makran/N, ingria/N dodecanese/N, bashkortostan/N, north-eastern/N... 0.176
sneaker/N overalls/N, pinstripe/N, sleeveless/J, necktie/N sweatshirt/N, headband/N, chiffon/N, overalls/N bandana/N, stiletto/N, dreadlock/N, kimono/N sweatshirt/N, necktie/N, tights/N, leggings/N see-through/J, kimono/N, tattered/J, pajamas/N 0.296
falsify/V substantiate/V, cross-examine/V, reexamine/V, ... false/J, adduce/V, malfeasance/N, substantiate/V rebut/V, misrepresent/V, willfully/RB, groundl... falsified/J, adduce/V, mislead/V, retry/V falsification/N, falsified/J, misrepresent/V, ... 0.357
ottawa/N montreal/N, winnipeg/N, catharines/N, toronto/N catharines/N, oshawa/N, winnipeg/N, quebec/N toronto/N, montreal/N, catharines/N, winnipeg/N montreal/N, toronto/N, calgary/N, saskatoon/N mississauga/N, toronto/N, guelph/N, winnipeg/N 0.368
lesbian/J gay/J, lesbian/N, transgender/J, transgendered/J gay/J, transgender/J, bisexual/J, lgbt/N gay/J, bisexual/J, transgender/J, lesbian/N gay/J, transgender/J, bisexual/J, transsexual/J gay/J, bisexual/J, lesbian/N, transgender/J 0.485
essay/N pamphlet/N, monograph/N, poem/N, treatise/N book/N, pamphlet/N, monograph/N, two-volume/J book/N, pamphlet/N, poem/N, monograph/N book/N, monograph/N, pamphlet/N, poem/N monograph/N, pamphlet/N, book/N, poem/N 0.650

In [34]:
# Stack the comparison table into one long column and print it as a LaTeX tabular.
print(pd.DataFrame(df.stack()).to_latex())


\begin{tabular}{lll}
\toprule
          &   &                                                  0 \\
\midrule
balkans/N & 0 &  transoxiana/N, dagestan/N, arakan/N, ostsiedlu... \\
          & 1 &  cisalpine/N, interbellum/N, south-eastern/N, b... \\
          & 2 &  transcaucasia/N, dodecanese/N, carpathians/UNK... \\
          & 4 &     transcaucasia/N, rumelia/N, makran/N, ingria/N \\
          & 5 &  dodecanese/N, bashkortostan/N, north-eastern/N... \\
          & mw\_dice &                                              0.176 \\
lesbian/J & 0 &   gay/J, lesbian/N, transgender/J, transgendered/J \\
          & 1 &           gay/J, transgender/J, bisexual/J, lgbt/N \\
          & 2 &        gay/J, bisexual/J, transgender/J, lesbian/N \\
          & 4 &    gay/J, transgender/J, bisexual/J, transsexual/J \\
          & 5 &        gay/J, bisexual/J, lesbian/N, transgender/J \\
          & mw\_dice &                                              0.485 \\
ottawa/N & 0 &    montreal/N, winnipeg/N, catharines/N, toronto/N \\
          & 1 &       catharines/N, oshawa/N, winnipeg/N, quebec/N \\
          & 2 &    toronto/N, montreal/N, catharines/N, winnipeg/N \\
          & 4 &      montreal/N, toronto/N, calgary/N, saskatoon/N \\
          & 5 &     mississauga/N, toronto/N, guelph/N, winnipeg/N \\
          & mw\_dice &                                              0.368 \\
sneaker/N & 0 &   overalls/N, pinstripe/N, sleeveless/J, necktie/N \\
          & 1 &    sweatshirt/N, headband/N, chiffon/N, overalls/N \\
          & 2 &       bandana/N, stiletto/N, dreadlock/N, kimono/N \\
          & 4 &      sweatshirt/N, necktie/N, tights/N, leggings/N \\
          & 5 &     see-through/J, kimono/N, tattered/J, pajamas/N \\
          & mw\_dice &                                              0.296 \\
essay/N & 0 &        pamphlet/N, monograph/N, poem/N, treatise/N \\
          & 1 &      book/N, pamphlet/N, monograph/N, two-volume/J \\
          & 2 &            book/N, pamphlet/N, poem/N, monograph/N \\
          & 4 &            book/N, monograph/N, pamphlet/N, poem/N \\
          & 5 &            monograph/N, pamphlet/N, book/N, poem/N \\
          & mw\_dice &                                               0.65 \\
falsify/V & 0 &  substantiate/V, cross-examine/V, reexamine/V, ... \\
          & 1 &   false/J, adduce/V, malfeasance/N, substantiate/V \\
          & 2 &  rebut/V, misrepresent/V, willfully/RB, groundl... \\
          & 4 &          falsified/J, adduce/V, mislead/V, retry/V \\
          & 5 &  falsification/N, falsified/J, misrepresent/V, ... \\
          & mw\_dice &                                              0.357 \\
inborn/J & 0 &  demonstrable/J, neuropsychological/J, contrain... \\
          & 1 &  posturing/N, physiologically/RB, self-percepti... \\
          & 2 &  pernicious/J, nonverbal/J, uncomplicated/J, un... \\
          & 4 &  internalization/N, generalised/J, flecainide/N... \\
          & 5 &  psychopathy/N, paraphilia/N, internalization/N... \\
          & mw\_dice &                                              0.115 \\
\bottomrule
\end{tabular}


In [35]:
# Same comparison without an explicit word list; show the first five rows.
# NOTE(review): which entries compare_neighbours picks by default is not visible here.
# This overwrites the `df` built for the hand-picked words.
df = compare_neighbours(thes, [0, 1, 2, 4, 5])
df.head()


Out[35]:
0 1 2 4 5
echolocation/N vocalization/N, vocalisation/N, bioluminescenc... locomotion/N, vocalization/N, lifeform/N, vert... well-designed/J, oscilloscope/N, anatomic/J, t... velociraptor/N, mimic/N, baleen/N, trackball/N multitasking/N, hallucinogen/N, mpt/N, spherom...
outgassing/N vaporization/N, solidification/N, quenching/N,... None None None None
mezzo-soprano/N coloratura/N, contralto/N, countertenor/N, bas... contralto/N, soprano/N, bass-baritone/N, color... contralto/N, flautist/N, soprano/N, coloratura/N contralto/N, bass-baritone/N, harpist/N, count... contralto/N, bass-baritone/N, countertenor/N, ...
carnivora/N eudicot/N, procellariiformes/N, conspecific/N,... proteaceae/N, euphorbiaceae/N, ammonite/J, utr... None None None
squire/N jorma/N, macrae/N, bostic/N, formby/N duncan/N, wilkie/N, tyrell/N, berryman/N donnell/N, monro/N, dashwood/N, underhill/N frohman/N, binns/N, henshall/N, comerford/N strutt/N, o'hagan/N, hardcastle/N, hannon/N

In [47]:
# Union of the vocabularies of the first three thesauri, and its size.
all_feats = set.union(*[set(v.keys()) for v in thes[:3]])
len(all_feats)


Out[47]:
83267

In [48]:
from collections import Counter
# For every entry, count in how many of the first three thesauri it occurs,
# then tally how many entries occur in 1, 2 or 3 of them.
Counter(sum(f in v for v in thes[:3]) for f in all_feats)


Out[48]:
Counter({3: 51579, 1: 18480, 2: 13208})

In [50]:
# Load the vectors trained on 100% of wiki (repeat 0). Reuses the name `pattern`,
# overwriting the glob pattern from the loading cell, although this is a single file path.
# NOTE(review): the name `v_avg` suggests averaged vectors, but the file name only says
# "100perc ... rep0" -- confirm what these vectors are.
pattern = '/lustre/scratch/inf/mmb28/FeatureExtractionToolkit/word2vec_vectors/word2vec-wiki-100perc.unigr.strings.rep0'
v_avg = V.from_tsv(pattern)

In [52]:
# Prepare nearest-neighbour queries with 10 neighbours and show the vocabulary size.
v_avg.init_sims(n_neighbors=10)
len(v_avg)


Out[52]:
226627

In [66]:
# Entries of the 100% wiki vectors that are absent from the first 15% thesaurus ...
new_entries = set(v_avg.keys()) - set(thes[0].keys())
# ... and entries that both have in common.
old_entries = set(v_avg.keys()) & set(thes[0].keys())

In [63]:
# New entries whose tag suffix is '/N'.
new_nouns = [x for x in new_entries if x.endswith('/N')]

In [77]:
# Sizes of: new entries, shared entries, new entries tagged '/N'.
len(new_entries), len(old_entries), len(new_nouns)


Out[77]:
(161314, 65313, 135837)

In [73]:
# Eyeball the entries that only appear in the 100% wiki vectors.
# NOTE(review): this displays the whole set (161314 entries per the count cell); the recorded
# output is truncated by the display but still runs to ~1000 lines -- consider showing a small sample.
new_entries


Out[73]:
{'jungleland/N',
 'us-hus/N',
 'blithely/RB',
 'font-weight/J',
 'pipelining/N',
 'saka/UNK',
 'nearshore/N',
 'limbe/N',
 'stigmellum/N',
 'hibbard/N',
 'kanta/N',
 'gommendy/N',
 'macedo/N',
 'waymark/N',
 'nadc/N',
 'claypole/N',
 'flagiocathlete/N',
 'limber/J',
 'zarif/N',
 'kenda/N',
 'oshiro/N',
 'pedrosa/N',
 'angreal/N',
 'kilworth/N',
 "anoa'i/N",
 'montfaucon/N',
 'autologous/J',
 'dotson/N',
 'freeskiing/N',
 'wae/N',
 'premaxillary/J',
 'bartkowski/N',
 'de-stalinization/N',
 'prince-bishopric/J',
 'm-g-m/N',
 'marj/N',
 'deporte/N',
 'liberal-conservative/J',
 'luweero/N',
 'privatised/J',
 'arnsberg/N',
 'headboard/N',
 'macfarren/N',
 'gladney/N',
 'slaves/UNK',
 'stepping-stone/N',
 'add-in/N',
 'sizer/N',
 'mitsuki/N',
 'deianeira/N',
 'zerelda/N',
 'lookahead/N',
 'unionised/J',
 'rayment/N',
 'multiphoton/J',
 'zebediah/N',
 'covergirl/N',
 'zululand/N',
 'ktu/N',
 'trinamool/N',
 'klimek/N',
 'hot-button/J',
 'simo/N',
 'jassi/N',
 'azariah/N',
 'risker/N',
 'gryce/N',
 'one-carbon/J',
 'urara/N',
 'nuclear-capable/J',
 'lulach/N',
 'convertase/N',
 'videoton/N',
 'majorat/N',
 'madhan/N',
 'upson/N',
 'laramide/N',
 'jiwa/N',
 'watkiss/N',
 'inestimable/J',
 'hanumangarh/N',
 'bridgeland/N',
 'macaronesian/J',
 'ganser/N',
 'l&yr/N',
 'doamna/N',
 'heraclides/N',
 'baudette/N',
 'fleetname/N',
 'bondoc/N',
 'plain-clothes/J',
 'litherland/N',
 'hallaur/N',
 'petras/N',
 'tonythetiger/N',
 'rajpoot/N',
 'parornix/N',
 'abstentionism/N',
 'eystein/N',
 'ishibashi/N',
 'lapillus/N',
 'propitiatory/J',
 'wildeshausen/N',
 'virsliga/N',
 'artistique/N',
 'xizong/N',
 'septet/N',
 'teuvo/N',
 'gatefold/J',
 'anti-treaty/N',
 'sinicized/J',
 'aberlour/N',
 'putrid/J',
 'shir/N',
 'mrb/N',
 'rigsdaler/N',
 'oryctolagus/N',
 'roseburg/N',
 'lovering/N',
 'wickersham/N',
 'giuffria/N',
 'lausd/N',
 'goria/N',
 'anti-indian/J',
 'nodosaurid/N',
 'quemoy/N',
 'seachnasaigh/N',
 'fritzlar/N',
 'souphanouvong/N',
 'bozkurt/N',
 'alloway/N',
 'darland/N',
 'beaux-arts/UNK',
 'wyllys/N',
 'atanasov/N',
 'rhumb/N',
 'barsky/N',
 'cella/N',
 'makossa/N',
 'cantarella/N',
 'adk/N',
 'padwa/N',
 'saraiva/N',
 'misattribution/N',
 'freiman/N',
 'amh/N',
 'nightrage/N',
 'vion/N',
 'indolence/N',
 'cibola/N',
 'playset/N',
 'gambela/N',
 'troponin/N',
 'community-owned/J',
 'silicide/N',
 'rishel/N',
 'hafs/N',
 'unlocking/N',
 'fistandantilus/N',
 'kulukundis/N',
 'hron/N',
 'superspeed/N',
 'single-purpose/J',
 'commentaries/UNK',
 'chavdar/N',
 'tamako/N',
 'campau/N',
 'defore/N',
 'selve/N',
 'ochraceous/N',
 'verses/UNK',
 'parcs/N',
 'stichting/N',
 'ausland/N',
 'greenspun/N',
 'misao/N',
 'xamax/N',
 'hamartium/N',
 'komorowski/N',
 'czarniecki/N',
 'zakai/N',
 'mcadams/N',
 'tanjong/N',
 'injuries/UNK',
 'voltio/N',
 'post-baccalaureate/J',
 'chaudhury/N',
 'thoms/N',
 'laziest/J',
 'eleventh-century/J',
 'nonchalance/N',
 'amby/N',
 'tree-kangaroo/N',
 'tarasov/N',
 'poppies/N',
 'dowell/N',
 'bhaun/N',
 'monist/N',
 'chest/V',
 'phasis/N',
 'treepy/N',
 'ignacia/N',
 'celtiberian/J',
 'cedarvale/N',
 'law-maker/N',
 'wancheng/N',
 'tristeza/N',
 'landskap/N',
 'florentino/N',
 'sarker/N',
 'steeg/N',
 'novisuccinea/N',
 'kinka/N',
 'sieger/N',
 'beste/N',
 'noncredit/J',
 'dismasted/J',
 'airdrome/N',
 'queen-in-council/N',
 'news-sentinel/N',
 'saperstein/N',
 'nine-story/J',
 'shesha/N',
 'skellington/N',
 'rambles/N',
 'ahmadis/UNK',
 'khirbet/N',
 'bengal/J',
 'inhalational/J',
 'giannitsa/N',
 'beams/UNK',
 'openside/N',
 'work-around/N',
 'darcis/N',
 'foxcroft/N',
 'pnau/N',
 'squibb/N',
 'deathrocker/N',
 'a-l/N',
 'co-religionist/N',
 'francisquito/N',
 'motorized/N',
 'eiph/N',
 'lobato/N',
 'moc/N',
 'servites/N',
 'brumel/N',
 'non-athletic/J',
 'talk-page/N',
 'hagiographer/N',
 'pekar/N',
 'irie/N',
 'besmirch/V',
 'taue/N',
 'ituano/N',
 'asato/N',
 'worsthorne/N',
 'nmvhw/N',
 'harpidae/N',
 'frode/N',
 'ziemke/N',
 'penglai/N',
 'dank/J',
 'schildkraut/N',
 'relist/V',
 'eyolf/N',
 'well-aimed/J',
 'johnnies/N',
 'nationalrat/N',
 'kandor/N',
 'hares/N',
 'bagnoli/N',
 'kamille/N',
 'guanches/N',
 'sequeira/N',
 'odissi/N',
 'microsurgery/N',
 'marilena/N',
 'aliza/N',
 'mabon/N',
 'al-megrahi/N',
 'polier/N',
 'guilmant/N',
 'sebum/N',
 'filmmuseum/N',
 'majka/N',
 'artos/N',
 'skytte/N',
 'mdh/N',
 'milot/N',
 'pokerstars/N',
 'smr/N',
 'crashdown/N',
 'evang/N',
 'evildoer/N',
 'tsutsui/N',
 'upstroke/N',
 'gonzague/N',
 'ashvin/N',
 'rave/J',
 'continents/UNK',
 'atar/N',
 'mirisch/N',
 'spheeris/N',
 'yeongjo/N',
 'dpr/N',
 'interlibrary/J',
 'vaisey/N',
 'zrenjanin/N',
 'cherones/N',
 'shudder/N',
 'downtown/J',
 'lichtenberger/N',
 'shivaratri/N',
 'ulp/N',
 'rogue/V',
 'fochabers/N',
 'vinayagar/N',
 'sarma/N',
 'aylestone/N',
 'bullwhip/N',
 'treasurers/UNK',
 'quinet/N',
 'ringpost/N',
 'flyhalf/N',
 'muiredach/N',
 'fat-free/N',
 'olli/N',
 'waechter/N',
 'eberhart/N',
 'tomoe/N',
 'stier/N',
 'cavea/N',
 'demokratische/N',
 'sherritt/N',
 'kall/N',
 'bathgate/N',
 'sterett/N',
 'january-february/N',
 'jeffress/N',
 'exegete/N',
 'free-flight/J',
 'vicars/UNK',
 'rhynie/N',
 'qarqar/N',
 'banteay/N',
 'mirosternus/N',
 'rhapsode/N',
 'muire/N',
 'pinn/N',
 'skiboard/N',
 'post-menopausal/J',
 'self-trained/J',
 'hoxsey/N',
 'khukri/N',
 'bhagavathy/N',
 'cusplet/N',
 'fiv/N',
 'hatting/N',
 'totternhoe/N',
 'harkleroad/N',
 'rajshahi/N',
 'exposed/N',
 'chno/N',
 'hyle/N',
 'balangay/N',
 'opening-day/J',
 'hydro-electric/N',
 'disengaged/J',
 'underconsumption/N',
 'saltfleet/N',
 'genzano/N',
 'spier/N',
 'lurianic/J',
 'loe/N',
 'torment/N',
 'circumfix/N',
 'masafumi/N',
 'had/N',
 'noughty/N',
 'subliminally/RB',
 'headstart/N',
 'vermes/N',
 'actualization/N',
 'fmn/N',
 'melanocephalus/N',
 'mycelial/J',
 'coby/N',
 'panegyrist/N',
 'parapsychological/J',
 'qaitbay/N',
 'beban/N',
 'zehn/N',
 'wakanohana/N',
 'itam/N',
 'sirian/N',
 'caridad/N',
 'zerah/N',
 'miracles/N',
 '#eee/N',
 'hewer/N',
 'mopsus/N',
 'naucratis/N',
 'phoblacht/N',
 'waterland/N',
 'petard/N',
 'koryu/N',
 'one/V',
 'taverna/N',
 'bovet/N',
 'maalaala/N',
 'kmm/N',
 'cauvery/N',
 'wavenumber/N',
 'muzika/UNK',
 'short-wheelbase/N',
 'tuberville/N',
 'westerveld/N',
 'calcot/N',
 'leba/N',
 'taenia/N',
 'eklavya/N',
 'playing/J',
 'brionne/N',
 'phlogopite/N',
 'incae/N',
 'sudeley/N',
 'umber/N',
 'kennebunk/N',
 'ticinese/N',
 'lexi/N',
 'ituri/N',
 'rewa/N',
 'dilated/N',
 'big-band/N',
 'holarctic/J',
 'nacion/N',
 'awwa/N',
 'torat/N',
 'hacksaw/N',
 'zhizn/N',
 'critchfield/N',
 'dinefwr/N',
 'nassif/N',
 'andrology/N',
 'swetnam/N',
 'ortsgemeinde/N',
 'zijlstra/N',
 'medcom/N',
 'orkestar/N',
 'blepharitis/N',
 'eberswalde/N',
 'belford/N',
 'prieur/N',
 'chickenfoot/N',
 'osh/N',
 'kerchak/N',
 'geovanni/N',
 'cradley/N',
 'gannaway/N',
 'bafut/N',
 'dues-paying/J',
 'jianzhi/N',
 'denisova/N',
 'citytrain/N',
 'gimbel/N',
 'trach/N',
 'ichthyosis/N',
 'daniello/N',
 'aul/N',
 'defreitas/N',
 'shinwa/N',
 'helwys/N',
 'ddr-oberliga/N',
 'ranni/N',
 'kishiwada/N',
 'blaha/N',
 'harless/N',
 'kagnew/N',
 'technologically-advanced/J',
 'triplane/N',
 'carlucci/N',
 'madding/J',
 'independant/J',
 'hirota/N',
 'chu/UNK',
 'saiyid/N',
 'moonface/N',
 'integumentary/J',
 'postdoc/N',
 't-top/N',
 'moist/V',
 'nolichucky/N',
 'vimalamitra/N',
 'toohey/N',
 'kieswetter/N',
 'aris/N',
 'corbeil/N',
 'woodyard/N',
 'pacem/N',
 'graun/N',
 'trilateral/J',
 'batibo/N',
 'tier-one/J',
 'krishnanagar/N',
 'mcclatchy/N',
 'myeongjong/N',
 'yaki/N',
 'civ/N',
 'nitrosamine/N',
 'kartuzy/N',
 'tolo/N',
 'cabrero/N',
 'kavango/N',
 'keiichi/N',
 'ansted/N',
 'malott/N',
 'dehra/N',
 'elliot-murray-kynynmound/N',
 'pauperism/N',
 'detsen/N',
 "d'oh/N",
 'ensis/N',
 'tsangpo/N',
 'maharal/N',
 'skandia/N',
 'szolnok/N',
 'userfication/N',
 'vergara/N',
 'surendranagar/N',
 'chitra/N',
 'evagoras/N',
 'nuria/N',
 'juliette/UNK',
 'ostracon/N',
 'merete/N',
 'steam-operated/J',
 'maxi-singles/N',
 'borosilicate/J',
 'stairlift/N',
 'sacajawea/N',
 'kohlschreiber/N',
 'kamala/N',
 'yildiz/N',
 'nstar/N',
 'soccer-specific/J',
 'webworm/N',
 'mte/N',
 'al-shabab/N',
 'wanaque/N',
 'part-time/N',
 'iwasawa/N',
 'hally/N',
 'decoded/N',
 'burhan/N',
 'irrationally/RB',
 'aloys/N',
 'quico/N',
 'rebellions/N',
 'footlocker/N',
 'prekmurian/J',
 'blackground/N',
 'frangieh/N',
 'mahanagar/N',
 'krishnamurthi/N',
 'solnhofen/N',
 'marilou/N',
 'edjohnston/N',
 'tgm/N',
 'hippodamia/N',
 'caparas/N',
 'leintwardine/N',
 'dibny/N',
 'wilen/N',
 'flavour/N',
 'ultratop/N',
 'myriokephalon/N',
 'buechler/N',
 'cablelabs/N',
 'multivariable/J',
 'rheinfelden/N',
 'compost/V',
 'take-out/J',
 'adjusted/N',
 'cussen/N',
 'anani/N',
 'passages/UNK',
 'tomasson/N',
 'streamflow/N',
 'foundling/J',
 'natin/N',
 'santosh/N',
 'hawken/N',
 'fair-skinned/J',
 'newsome/N',
 'lyssa/N',
 'mils/N',
 'wooley/N',
 'sarg/N',
 'laugier/N',
 'sure-fire/J',
 'sergeants/N',
 'crucifer/N',
 'incirlik/N',
 'mandamento/N',
 'corporatisation/N',
 'hmie/N',
 'radzymin/N',
 'non-sworn/J',
 'voy/N',
 'rur/N',
 'charro/N',
 'ceridwen/N',
 'hcus/N',
 'shifrin/N',
 'craterus/N',
 'deshpande/N',
 'shawsheen/N',
 'interpersonal/N',
 'forchheim/N',
 'justo/N',
 'world-wide/N',
 'ambient/N',
 'veale/N',
 'a-international/J',
 'matveyev/N',
 'bureaus/N',
 'pufferfish/N',
 'kaptol/N',
 'awen/N',
 'omnimon/N',
 'antifascist/J',
 'spui/N',
 'platanthera/N',
 'ghormley/N',
 'fuelling/N',
 'reequipp/V',
 'kouro/N',
 'komu/N',
 'tulasus/N',
 'cran/N',
 'kiet/N',
 'oraibi/N',
 'lowood/N',
 'flertydig/N',
 'eventide/N',
 'bagnolet/N',
 'dumbwaiter/N',
 'tschammerpokal/N',
 'yamaki/N',
 'cheboksary/N',
 'bindy/N',
 'manglerud/N',
 'lancasters/UNK',
 'ergon/N',
 'naruki/N',
 'bandem/N',
 'self-assembly/N',
 'mightier/J',
 'petrich/N',
 'syriza/N',
 'gynoecium/N',
 'astar/N',
 'superfamilium/N',
 'multi-drug/J',
 'rodley/N',
 'dapa/N',
 'ziz/N',
 'studiorum/N',
 'lavalle/N',
 'bayda/N',
 'osada/N',
 'yazawa/N',
 'howerdel/N',
 'basiliensis/N',
 'patriota/N',
 'proliferative/J',
 'sidenote/N',
 'mhv/N',
 'iguazu/N',
 'jasin/N',
 'osipovich/N',
 'tonalist/N',
 'free-speech/N',
 'harar/N',
 'cuscatlan/N',
 'fantasio/UNK',
 'hoveida/N',
 'borgwarner/N',
 'vice/RB',
 'oir/N',
 'utters/N',
 'gesso/N',
 'jukun/N',
 'hemanta/N',
 'sycorax/N',
 'glossa/N',
 'mintzberg/N',
 'balseiro/N',
 'flook/N',
 'wach/N',
 'devotional/N',
 'sey/N',
 'bencao/N',
 'beavers/N',
 'nelonen/N',
 'devante/N',
 'mpande/N',
 'misclassification/N',
 'penseroso/N',
 'khudadad/N',
 'bajwa/N',
 'purism/N',
 'jatte/N',
 'pasai/N',
 'ypbpr/N',
 'sepolcro/N',
 'northug/N',
 'vathek/N',
 'baddest/J',
 'twenty-eighth/N',
 'geschwaderkommodore/N',
 'partulidae/N',
 'zarathos/N',
 'hindon/N',
 'asthmatic/N',
 'mazar-e-sharif/N',
 'mosport/N',
 'beastly/J',
 'tammet/N',
 'sennen/N',
 'open-mouthed/J',
 'rosendale/N',
 'routt/N',
 'parvez/N',
 'alloying/J',
 'misdirected/J',
 'kidde/N',
 'izvolsky/N',
 'centrex/N',
 'santis/N',
 'closed-source/J',
 'light-harvesting/J',
 'shapely/J',
 'tulkarem/N',
 'danial/N',
 'deviantart/N',
 'soltis/N',
 'kantakouzene/N',
 'skeggs/N',
 'mandrel/N',
 'vlj/N',
 'rosoft/N',
 'industrialists/UNK',
 'bny/N',
 'decorous/J',
 'mohra/N',
 'yusof/N',
 'almaden/N',
 'waynflete/N',
 'uvarov/N',
 'issuant/N',
 'oguri/N',
 'varkey/N',
 'diwata/N',
 'glavin/N',
 'controversialist/N',
 'renaudot/N',
 'labrie/N',
 'newly-hired/J',
 'coolock/N',
 'lincolnville/N',
 'mladina/N',
 'trunks/UNK',
 'cahaba/N',
 'gsx-r/N',
 'tubulin/N',
 'bikash/N',
 'intercounty/N',
 'denliner/N',
 'activesync/N',
 'etcheverry/N',
 'seymore/N',
 'omo/N',
 'ensamble/N',
 'cardone/N',
 "nisga'a/N",
 'sonics/UNK',
 'ciaran/N',
 'torv/N',
 'winchilsea/N',
 'viimsi/N',
 'mclintock/N',
 'suncook/N',
 'benchrest/N',
 'sinugra/N',
 'alupas/N',
 'computed/N',
 'schoendoerffer/N',
 'caras/N',
 'cave-like/J',
 'tombo/N',
 'volokolamsk/N',
 'eucla/N',
 'reinke/N',
 'sirr/N',
 'negligee/N',
 'ngu/N',
 'yannis/N',
 'cittern/N',
 'mifflinburg/N',
 'loddiges/N',
 'stryn/N',
 'kurobe/N',
 'flatman/N',
 "o'daly/N",
 'zwicker/N',
 'macronutrient/N',
 'wolfie/N',
 'hallux/N',
 'eight-thousander/N',
 'mixed-income/J',
 'phs/N',
 'winxp/N',
 'upto/N',
 'veloz/N',
 'wencai/N',
 'sclerite/N',
 'maronite/J',
 'yhombi-opango/N',
 'elderton/N',
 'varietal/J',
 'varkaus/N',
 'campbellton/N',
 'gung/V',
 'razaf/N',
 'unpunished/J',
 'sub-urban/J',
 'egen/N',
 'gunning/N',
 'metallization/N',
 'salcido/N',
 'brittingham/N',
 'kasady/N',
 'saint-gobain/N',
 'kasauli/N',
 'vasanta/N',
 'gevaert/N',
 'kusti/N',
 'japygidae/N',
 'kronoberg/N',
 'landore/N',
 'aaadddaaammm/N',
 'houthis/N',
 'istres/N',
 'unlikely/N',
 'selk/N',
 'fmla/N',
 'government-approved/J',
 'theydon/N',
 'damsire/N',
 'caerulea/N',
 'lamanites/UNK',
 'jayalalithaa/N',
 'fangire/N',
 'swallows/N',
 'nisa/N',
 'catemaco/N',
 'coprosma/N',
 'shankaracharya/N',
 'triiodothyronine/N',
 'lustron/N',
 'grimaldo/N',
 'top-six/J',
 'soundstream/N',
 'labasa/N',
 'mindbender/N',
 'skipper/V',
 'pathologie/N',
 'gothicus/N',
 'child-rearing/N',
 'red-faced/J',
 'ves/UNK',
 'irredeemably/RB',
 'monssen/N',
 'ables/N',
 'maciste/N',
 'explore/N',
 'empennage/N',
 'traut/N',
 'settignano/N',
 'krakus/N',
 'sylviidae/N',
 "ev'ry/N",
 'neofolk/N',
 'pastis/N',
 'dray/N',
 'bicyclic/J',
 'falkor/N',
 'conard/N',
 'tumu/N',
 'presnell/N',
 'smagorinsky/N',
 'phosphorous/J',
 'gubernium/N',
 'german-americans/UNK',
 'purplish-red/J',
 'ntb/N',
 'unibody/N',
 'kephart/N',
 'lathus/N',
 'arngrim/N',
 'outremont/N',
 'trustkill/N',
 'toiyabe/N',
 'helpmate/N',
 'keyshawn/N',
 'criss-crossed/J',
 'nmmt/N',
 'elizaveta/N',
 'slingerlands/N',
 'celcom/N',
 'black-throated/J',
 'zohn/N',
 'maharajas/UNK',
 'repairer/N',
 'senatorial/N',
 'unexpurgated/J',
 'githyankus/N',
 'xuanwei/N',
 'confer/N',
 'cahit/N',
 'laff/N',
 'nesquehoning/N',
 'thurn/N',
 'ihrer/N',
 'regalado/N',
 'dimera/N',
 'riri/N',
 'splenectomy/N',
 'mujhe/N',
 'libration/N',
 'obituarist/N',
 'summer-long/J',
 'geologie/N',
 'measured/J',
 'gurdwaras/UNK',
 'urmila/N',
 'pandacan/N',
 'amyotrophic/J',
 'jizz/N',
 'cohost/V',
 'fortec/N',
 'whalum/N',
 'mortified/J',
 'maozhen/N',
 'bahrami/N',
 'popjustice/N',
 'warmoth/N',
 'paulownia/N',
 'colorist/N',
 'hyperstimulation/N',
 'hajjar/N',
 'yankel/N',
 'cusick/N',
 'sicker/J',
 'cti/N',
 'cornewall/N',
 'sketch-comedy/J',
 'keston/N',
 'gradualist/J',
 'ditlev/N',
 'manhattan-bound/J',
 'clinometer/N',
 'bisley/N',
 'keep/N',
 'polyrhythmic/J',
 'jianzhou/N',
 'conceptual/N',
 'versicolor/N',
 'adorable/J',
 'keshub/N',
 'carbon-oxygen/J',
 'sidiq/N',
 'trousseau/N',
 'hashshashin/N',
 'collum/N',
 'abjuration/N',
 'lethe/N',
 'halfa/N',
 'brynmawr/N',
 'non-playoff/J',
 'borno/N',
 'rupees/UNK',
 'kmtv/N',
 'carmi/N',
 'azz/N',
 'daubenton/N',
 'well-traveled/J',
 'bagot/N',
 ...}

In [80]:
# Nearest neighbours of one of the new entries; the output pairs each neighbour with a number.
# NOTE(review): the numbers grow down the list, so they look like distances rather than
# similarities -- confirm.
v_avg.get_nearest_neighbours('measured/J')


Out[80]:
[('extravert/N', 1.2206645427572558),
 ('photoinhibition/N', 1.2268042261113046),
 ('inessential/J', 1.2355771198070054),
 ('spillover/J', 1.255496637360789),
 ('strangelet/N', 1.2661434508282658),
 ('retrogression/N', 1.2728655184461561),
 ('photoionization/N', 1.2739789904460517),
 ('self-recognition/N', 1.2808519646775918),
 ('microphysic/N', 1.2869837189422086),
 ('self-affirmation/N', 1.2886332268324958)]

In [ ]: