Variation due to `word2vec`'s random initialisation

fairly small at the word analogy task



In [8]:

    
%cd ~/NetBeansProjects/ExpLosion/
from itertools import chain
from notebooks.common_imports import *
from gui.output_utils import *
from gui.user_code import pretty_names, pairwise_significance_exp_ids

# Replace the bootstrap routines used inside seaborn with `my_bootstrap`.
# NOTE(review): `my_bootstrap` is not defined in this notebook; presumably it is
# brought in by one of the star imports above — confirm where it lives.
sns.timeseries.algo.bootstrap = my_bootstrap
sns.categorical.bootstrap = my_bootstrap









    



/Users/miroslavbatchkarov/NetBeansProjects/ExpLosion



In [9]:

    
def get(corpus='amazon_grouped-tagged', rep=0, avg=False, reorder=False,
       composers=['Add', 'Mult', 'Left', 'Right'], k=[3]):
    """
    Return the ids of all `Experiment` records that match a fixed word2vec
    configuration plus the settings given as arguments.

    :param corpus: value required for the `labelled` field
    :param rep: value required for `expansions__vectors__rep`
    :param avg: value required for `expansions__vectors__avg`
    :param reorder: value required for `expansions__vectors__reorder`
    :param composers: accepted values for `expansions__vectors__composer`
    :param k: accepted values for `expansions__k`
    :return: list of ids of the matching experiments (possibly empty)
    """
    # The list defaults are shared between calls; they are only read here, never mutated.
    query_dict = {
        'expansions__use_similarity': 0,  # was listed twice in the original literal
        'expansions__neighbour_strategy': 'linear',
        'expansions__vectors__dimensionality': 100,
        'document_features_ev': 'AN+NN',
        'document_features_tr': 'J+N+AN+NN',
        'expansions__allow_overlap': False,
        'expansions__entries_of': None,
        'expansions__vectors__algorithm': 'word2vec',
        'expansions__vectors__composer__in': composers,
        'expansions__vectors__unlabelled': 'wiki',
        'expansions__decode_handler': 'SignifiedOnlyFeatureHandler',
        'expansions__noise': 0,
        'expansions__k__in': k,
        'expansions__vectors__unlabelled_percentage': 15,
        'expansions__vectors__rep': rep,
        'expansions__vectors__avg': avg,
        'expansions__vectors__reorder': reorder,
        'labelled': corpus,
    }
    return [experiment.id for experiment in Experiment.objects.filter(**query_dict)]



In [13]:

    
# Collect the experiment ids for word2vec repeats 0, 1 and 2 (default corpus and composers of `get`)
ids = list(chain.from_iterable(get(rep=r) for r in [0, 1, 2]))
print(ids)
# Results table with the repeat number exposed as 'View' and the composer as 'Composer'
df = dataframe_from_exp_ids(ids, fields_to_include={'View':'expansions__vectors__rep',
                                                    'Composer': 'expansions__vectors__composer'})
# Grouped bar chart of accuracy per composer, one hue level per repeat.
# NOTE(review): ci=68 presumably asks for a 68% bootstrap confidence interval — confirm.
with sns.color_palette("cubehelix", 4):
    g = sns.factorplot(data=df, x='Composer', y='Accuracy', hue='View', 
                   hue_order='0 1 2'.split(), 
                   kind='bar', ci=68, aspect=2);
plt.savefig('plot-w2v_random_init_var.pdf', format='pdf', dpi=300, bbox_inches='tight', pad_inches=0.1)









    



[75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86]
Composer has 6000 values
Accuracy has 6000 values
View has 6000 values
folds has 6000 values



In [4]:

    
# Ids of the experiments that use the 'Add' composer, for repeats 0, 1 and 2
ids = list(chain.from_iterable(get(rep=r, composers=['Add']) for r in [0, 1, 2]))
ids









    Out[4]:





[75, 79, 83]



In [5]:

    
# Are the accuracy differences between repeats significant?
# get_demsar_params returns three values; only the first (the pairwise
# comparison table displayed below) is kept, labelled by repeat number.
sign_df, _, _ = get_demsar_params(ids, ['expansions__vectors__rep'])
sign_df









    Out[5]:






  
    
      
      name1
      acc1
      name2
      acc2
      mean_diff
      pval
      significant
    
  
  
    
      0
      0
      0.471414
      1
      0.469746
      0.001668
      0.8
      False
    
    
      1
      0
      0.471414
      2
      0.450353
      0.021061
      0.0
      True
    
    
      2
      1
      0.469746
      2
      0.450353
      0.019393
      0.0
      True

Repeats on R2 corpus

Does the smaller R2 dataset show differences between repeats on a sample of approximately the same size?

There is a difference up to 4%, but it is not significant



In [6]:

    
# Same comparison on the Reuters 'r8' corpus: experiment ids for repeats 0, 1 and 2
ids = list(chain.from_iterable(get(corpus='reuters21578/r8-tagged-grouped', rep=r) for r in [0, 1, 2]))
print(ids)
# [0] keeps the first element of the result (the pairwise comparison table displayed below)
get_demsar_params(ids, ['expansions__vectors__rep'])[0]









    



[364, 365, 366]






    Out[6]:






  
    
      
      name1
      acc1
      name2
      acc2
      mean_diff
      pval
      significant
    
  
  
    
      0
      0
      0.666391
      1
      0.658423
      0.007968
      0.676
      False
    
    
      1
      0
      0.666391
      2
      0.622785
      0.043607
      0.020
      False
    
    
      2
      1
      0.658423
      2
      0.622785
      0.035639
      0.066
      False



In [15]:

    
# Three groups of 'Add' experiments, each at k=3 and k=30:
#   * rep=0 with default settings,
#   * rep=3 with avg=True,
#   * rep=2..5 with reorder=True.
ids = get(rep=0, composers=['Add'], k=[3,30]) +\
      get(rep=3, avg=True, composers=['Add'], k=[3,30]) +\
      list(chain.from_iterable(get(rep=i, reorder=True, composers=['Add'], k=[3,30]) for i in [2,3,4,5]))
print(ids)
# NOTE(review): convert_objects is an old pandas API (deprecated, later removed);
# this cell needs the pandas version the notebook was written for.
df = dataframe_from_exp_ids(ids, fields_to_include={'rep':'expansions__vectors__rep',
                                                    'avg':'expansions__vectors__avg',
                                                    'dice':'expansions__vectors__reorder',
                                                    'k': 'expansions__k',
                                                    'Composer': 'expansions__vectors__composer'}).convert_objects(convert_numeric=True)

# Derive one label per row. Order matters: every row starts as 'avg3', rows with
# rep == 0 become 'std', and rows with dice == 1 become 'dice<rep>'. Rows left as
# 'avg3' are therefore those that match neither rule.
df['method'] = 'avg3'
df.loc[df.rep==0, 'method'] = 'std'
for i in [2,3,4,5]:
    df.loc[(df.rep==i) & (df.dice==1), 'method'] = 'dice%d'%i
# The helper columns are no longer needed once the label exists
df = df.drop('avg dice rep'.split(), axis=1)
# Capitalised copy, used as the hue column (and hence the legend title source) below
df['Method'] = df.method
with sns.color_palette("cubehelix", 6):
    g = sns.factorplot(data=df, x='k', y='Accuracy', hue='Method', 
                       kind='bar', ci=68, aspect=2);
    # Start the y axis at 0.3; the upper limit is left automatic
    g.set(ylim=(.3, None))
plt.savefig('plot-w2v_random_init_boost.pdf', format='pdf', dpi=300, bbox_inches='tight', pad_inches=0.1)









    



[75, 390, 71, 391, 296, 367, 297, 368, 298, 369, 299, 370]
Accuracy has 6000 values
k has 6000 values
avg has 6000 values
folds has 6000 values
Composer has 6000 values
rep has 6000 values
dice has 6000 values



In [12]:

    
# Id of the experiment with k=30, composer 'Add' and reorder=False
Experiment.objects.get(expansions__k=30, expansions__vectors__composer='Add',
                       expansions__vectors__reorder=False).id
# Reference ids:
#   55 -> k=30, 100% of wiki
#   75 -> k=3, 15% of wiki









    Out[12]:





55



In [16]:

    
# Compare element [1] of get_ci's result for experiments 55 and 370.
# NOTE(review): element [1] is presumably the mean accuracy — confirm against get_ci.
get_ci(55)[1], get_ci(370)[1]









    Out[16]:





(0.65484279141104296, 0.63607821193323533)

Are the differences significant?



In [13]:

    
# Keep only the experiments whose composer is 'Add'
ids1 = [i for i in ids if Experiment.objects.get(id=i).expansions.vectors.composer=='Add']
print(ids1)
# NOTE(review): the significance test below is run on `ids`, not on the filtered
# `ids1`, so `ids1` is only printed — confirm which list was intended.
# [0] keeps the first element of the result (the pairwise comparison table displayed below);
# rows are labelled by composer and repeat number.
get_demsar_params(ids, ['expansions__vectors__composer', 
                        'expansions__vectors__rep'])[0]









    



[75, 71, 296, 297, 298, 299]






    Out[13]:






  
    
      
      name1
      acc1
      name2
      acc2
      mean_diff
      pval
      significant
    
  
  
    
      0
      Add-0
      0.471414
      Mult-0
      0.452228
      0.019186
      0.000
      True
    
    
      1
      Add-0
      0.471414
      Left-0
      0.322392
      0.149022
      0.000
      True
    
    
      2
      Add-0
      0.471414
      Right-0
      0.313756
      0.157658
      0.000
      True
    
    
      3
      Add-0
      0.471414
      Add-3
      0.451337
      0.020077
      0.004
      True
    
    
      4
      Add-0
      0.471414
      Mult-3
      0.419040
      0.052374
      0.000
      True
    
    
      5
      Add-0
      0.471414
      Left-3
      0.307700
      0.163714
      0.000
      True
    
    
      6
      Add-0
      0.471414
      Right-3
      0.294694
      0.176720
      0.000
      True
    
    
      7
      Add-0
      0.471414
      Add-2
      0.487203
      0.015789
      0.014
      False
    
    
      8
      Add-0
      0.471414
      Mult-2
      0.481405
      0.009992
      0.088
      False
    
    
      9
      Add-0
      0.471414
      Left-2
      0.327719
      0.143695
      0.000
      True
    
    
      10
      Add-0
      0.471414
      Right-2
      0.325976
      0.145438
      0.000
      True
    
    
      11
      Add-0
      0.471414
      Add-3
      0.497166
      0.025752
      0.000
      True
    
    
      12
      Add-0
      0.471414
      Mult-3
      0.488409
      0.016995
      0.002
      True
    
    
      13
      Add-0
      0.471414
      Left-3
      0.337498
      0.133916
      0.000
      True
    
    
      14
      Add-0
      0.471414
      Right-3
      0.319933
      0.151481
      0.000
      True
    
    
      15
      Add-0
      0.471414
      Add-4
      0.509483
      0.038069
      0.000
      True
    
    
      16
      Add-0
      0.471414
      Add-5
      0.512566
      0.041152
      0.000
      True
    
    
      17
      Mult-0
      0.452228
      Left-0
      0.322392
      0.129836
      0.000
      True
    
    
      18
      Mult-0
      0.452228
      Right-0
      0.313756
      0.138472
      0.000
      True
    
    
      19
      Mult-0
      0.452228
      Add-3
      0.451337
      0.000891
      0.890
      False
    
    
      20
      Mult-0
      0.452228
      Mult-3
      0.419040
      0.033187
      0.000
      True
    
    
      21
      Mult-0
      0.452228
      Left-3
      0.307700
      0.144528
      0.000
      True
    
    
      22
      Mult-0
      0.452228
      Right-3
      0.294694
      0.157534
      0.000
      True
    
    
      23
      Mult-0
      0.452228
      Add-2
      0.487203
      0.034975
      0.000
      True
    
    
      24
      Mult-0
      0.452228
      Mult-2
      0.481405
      0.029178
      0.000
      True
    
    
      25
      Mult-0
      0.452228
      Left-2
      0.327719
      0.124509
      0.000
      True
    
    
      26
      Mult-0
      0.452228
      Right-2
      0.325976
      0.126252
      0.000
      True
    
    
      27
      Mult-0
      0.452228
      Add-3
      0.497166
      0.044938
      0.000
      True
    
    
      28
      Mult-0
      0.452228
      Mult-3
      0.488409
      0.036182
      0.000
      True
    
    
      29
      Mult-0
      0.452228
      Left-3
      0.337498
      0.114730
      0.000
      True
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      123
      Mult-2
      0.481405
      Add-4
      0.509483
      0.028078
      0.000
      True
    
    
      124
      Mult-2
      0.481405
      Add-5
      0.512566
      0.031161
      0.000
      True
    
    
      125
      Left-2
      0.327719
      Right-2
      0.325976
      0.001743
      0.810
      False
    
    
      126
      Left-2
      0.327719
      Add-3
      0.497166
      0.169447
      0.000
      True
    
    
      127
      Left-2
      0.327719
      Mult-3
      0.488409
      0.160691
      0.000
      True
    
    
      128
      Left-2
      0.327719
      Left-3
      0.337498
      0.009779
      0.126
      False
    
    
      129
      Left-2
      0.327719
      Right-3
      0.319933
      0.007786
      0.252
      False
    
    
      130
      Left-2
      0.327719
      Add-4
      0.509483
      0.181764
      0.000
      True
    
    
      131
      Left-2
      0.327719
      Add-5
      0.512566
      0.184848
      0.000
      True
    
    
      132
      Right-2
      0.325976
      Add-3
      0.497166
      0.171190
      0.000
      True
    
    
      133
      Right-2
      0.325976
      Mult-3
      0.488409
      0.162434
      0.000
      True
    
    
      134
      Right-2
      0.325976
      Left-3
      0.337498
      0.011522
      0.114
      False
    
    
      135
      Right-2
      0.325976
      Right-3
      0.319933
      0.006043
      0.400
      False
    
    
      136
      Right-2
      0.325976
      Add-4
      0.509483
      0.183507
      0.000
      True
    
    
      137
      Right-2
      0.325976
      Add-5
      0.512566
      0.186591
      0.000
      True
    
    
      138
      Add-3
      0.497166
      Mult-3
      0.488409
      0.008756
      0.120
      False
    
    
      139
      Add-3
      0.497166
      Left-3
      0.337498
      0.159668
      0.000
      True
    
    
      140
      Add-3
      0.497166
      Right-3
      0.319933
      0.177233
      0.000
      True
    
    
      141
      Add-3
      0.497166
      Add-4
      0.509483
      0.012317
      0.028
      False
    
    
      142
      Add-3
      0.497166
      Add-5
      0.512566
      0.015400
      0.012
      False
    
    
      143
      Mult-3
      0.488409
      Left-3
      0.337498
      0.150912
      0.000
      True
    
    
      144
      Mult-3
      0.488409
      Right-3
      0.319933
      0.168476
      0.000
      True
    
    
      145
      Mult-3
      0.488409
      Add-4
      0.509483
      0.021074
      0.000
      True
    
    
      146
      Mult-3
      0.488409
      Add-5
      0.512566
      0.024157
      0.000
      True
    
    
      147
      Left-3
      0.337498
      Right-3
      0.319933
      0.017565
      0.012
      False
    
    
      148
      Left-3
      0.337498
      Add-4
      0.509483
      0.171985
      0.000
      True
    
    
      149
      Left-3
      0.337498
      Add-5
      0.512566
      0.175069
      0.000
      True
    
    
      150
      Right-3
      0.319933
      Add-4
      0.509483
      0.189550
      0.000
      True
    
    
      151
      Right-3
      0.319933
      Add-5
      0.512566
      0.192633
      0.000
      True
    
    
      152
      Add-4
      0.509483
      Add-5
      0.512566
      0.003083
      0.618
      False
    
  

153 rows × 7 columns

Compare unigram vectors between multiple runs

How many of the top n neighbours of some entries are the same across multiple runs?



In [16]:

    
from glob import glob
from discoutils.thesaurus_loader import Vectors as V
from random import sample
from itertools import combinations

# Load every file matching the '...15perc.unigr.strings.rep*' pattern (one per repeat, judging by the name).
# NOTE(review): hard-coded absolute path — this cell only runs where that directory is available.
pattern = '/lustre/scratch/inf/mmb28/FeatureExtractionToolkit/word2vec_vectors/word2vec-wiki-15perc.unigr.strings.rep*'
# Sorted so that the position in `thes` is stable between runs (lexicographic file order)
files = sorted(glob(pattern))
thes = [V.from_tsv(f) for f in files]
# Initialise each vector set with n_neighbors=100 before nearest-neighbour queries are made
for t in thes:
    t.init_sims(n_neighbors=100)



In [17]:

    
def dice(n1, n2):
    """
    Dice coefficient between two collections, each treated as a set.

    :param n1: iterable of hashable items (e.g. a neighbour list or a keys view)
    :param n2: iterable of hashable items
    :return: 2 * |A & B| / (|A| + |B|), where A and B are the sets of distinct
     items in `n1` and `n2`; 0.0 when both are empty
    """
    first, second = set(n1), set(n2)
    total = len(first) + len(second)
    if not total:
        # Nothing to compare: report no overlap instead of raising ZeroDivisionError
        return 0.0
    # Sizes come from the de-duplicated sets, so repeated items cannot deflate
    # the score and inputs without len() (e.g. generators) are accepted
    return 2 * len(first & second) / total



In [7]:

    
# Overlap between the entry sets (vocabularies) of every pair of loaded vector sets
for i,j in combinations(range(len(thes)), 2):
    print(i, j, dice(thes[i].keys(), thes[j].keys()))









    



0 1 0.8159619711046169
0 2 0.8076966254293989
1 2 0.883361665324625



In [20]:

    
def dice_loop(words, thes, log=False):
    """
    Neighbour overlap of each word for every pair of thesauri.

    :param words: words to look up; iterated once per thesaurus pair
    :param thes: sequence of objects providing `get_nearest_neighbours(word)`
    :param log: if True, print a progress line for each pair
    :return: list of `[pair_id, word, dice_score]` rows; a word is skipped for a
     pair when either thesaurus yields no neighbours for it
    """
    def first_fields(thesaurus, entry):
        # Element 0 of every item returned for `entry` (presumably the neighbour's name)
        return [item[0] for item in thesaurus.get_nearest_neighbours(entry)]

    rows = []
    for left, right in combinations(range(len(thes)), 2):
        # 1-based labels; a second index of 3 or more is labelled 'A' (rendered via %r)
        right_label = right + 1 if right < 3 else 'A'
        pair_id = '%d & %r' % (left + 1, right_label)
        if log:
            print('Doing pair', pair_id, flush=True)
        for word in words:
            left_neighbours = first_fields(thes[left], word)
            right_neighbours = first_fields(thes[right], word)
            if not (left_neighbours and right_neighbours):
                continue
            rows.append([pair_id, word, dice(left_neighbours, right_neighbours)])
    return rows



In [22]:

    
# Draw a reproducible sample of 5000 entries from the first vector set: a fixed
# seed and a sorted vocabulary select the same words on every run.
from random import Random
sampled_words = Random(0).sample(sorted(thes[0].keys()), 5000)
# Neighbour overlap of each sampled word for every pair of vector sets
dice_data = dice_loop(sampled_words, thes, log=True)









    



Doing pair 1 & 2
Doing pair 1 & 3
Doing pair 2 & 3



In [23]:

    
# One row per (pair of vector sets, word) with the Dice overlap of the word's neighbour lists
df3 = pd.DataFrame(dice_data, columns='Views Word Dice'.split())
# One panel per pair of vector sets, each showing the distribution of Dice scores
with sns.axes_style("white"):
    g = sns.FacetGrid(df3, col="Views", col_wrap=3);
    g.map(sns.distplot, 'Dice', kde=True);

# Tidy each panel: thin out the tick labels (project helper), fix the x range
# just past 1 and hide the y tick labels
for ax in g.axes.flat:
    sparsify_axis_labels(ax)
    ax.set_xlim(0, 1.01)
    ax.set_yticklabels([])

sns.despine(left=True, bottom=True)
plt.savefig('plot-w2v_random_init_neigh_overlap.pdf', format='pdf', dpi=300, bbox_inches='tight', pad_inches=0.1)

Observations

Neighbours tend to be quite different over multiple runs, but the overall accuracy of the classification task changes very little

Qualitative analysis

See below. It seems to me that good neighbours (those that look sensible) tend to be the same across repeated runs.



In [27]:

    
def multiway_dice(entry, thesauri):
    """
    Mean of the pairwise Dice scores that `dice_loop` reports for a single entry.

    :param entry: the word to look up
    :param thesauri: thesauri to compare pairwise
    :return: mean Dice score over all pairs for which `dice_loop` produced a row
    """
    pair_rows = dice_loop([entry], thesauri)
    scores = pd.DataFrame(pair_rows, columns='Pair Word Dice'.split())
    return scores['Dice'].mean()



In [33]:

    
# Neighbours of a few hand-picked words in the vector sets at positions 0, 1, 2, 4 and 5 of `thes`.
# NOTE(review): positions 4 and 5 require `thes` to hold at least six vector sets — confirm
# which cell populated it, since the loading cell is not guaranteed to yield that many.
df = compare_neighbours(thes, [0, 1, 2, 4, 5],
                        words=['balkans/N', 'lesbian/J', 'ottawa/N', 'sneaker/N', 'essay/N', 'falsify/V', 'inborn/J'])
# Mean pairwise Dice overlap of each word's neighbours, computed over all of `thes`
df['mw_dice'] = [multiway_dice(feat, thes) for feat in df.index]
df.to_csv('compare_repeated_w2v.csv')
# Display ordered by overlap (the result is not assigned, so `df` itself keeps its order).
# NOTE(review): DataFrame.sort is an old pandas API, replaced by sort_values in later versions.
df.sort('mw_dice')









    Out[33]:






  
    
      
      0
      1
      2
      4
      5
      mw_dice
    
  
  
    
      inborn/J
      demonstrable/J, neuropsychological/J, contrain...
      posturing/N, physiologically/RB, self-percepti...
      pernicious/J, nonverbal/J, uncomplicated/J, un...
      internalization/N, generalised/J, flecainide/N...
      psychopathy/N, paraphilia/N, internalization/N...
      0.115
    
    
      balkans/N
      transoxiana/N, dagestan/N, arakan/N, ostsiedlu...
      cisalpine/N, interbellum/N, south-eastern/N, b...
      transcaucasia/N, dodecanese/N, carpathians/UNK...
      transcaucasia/N, rumelia/N, makran/N, ingria/N
      dodecanese/N, bashkortostan/N, north-eastern/N...
      0.176
    
    
      sneaker/N
      overalls/N, pinstripe/N, sleeveless/J, necktie/N
      sweatshirt/N, headband/N, chiffon/N, overalls/N
      bandana/N, stiletto/N, dreadlock/N, kimono/N
      sweatshirt/N, necktie/N, tights/N, leggings/N
      see-through/J, kimono/N, tattered/J, pajamas/N
      0.296
    
    
      falsify/V
      substantiate/V, cross-examine/V, reexamine/V, ...
      false/J, adduce/V, malfeasance/N, substantiate/V
      rebut/V, misrepresent/V, willfully/RB, groundl...
      falsified/J, adduce/V, mislead/V, retry/V
      falsification/N, falsified/J, misrepresent/V, ...
      0.357
    
    
      ottawa/N
      montreal/N, winnipeg/N, catharines/N, toronto/N
      catharines/N, oshawa/N, winnipeg/N, quebec/N
      toronto/N, montreal/N, catharines/N, winnipeg/N
      montreal/N, toronto/N, calgary/N, saskatoon/N
      mississauga/N, toronto/N, guelph/N, winnipeg/N
      0.368
    
    
      lesbian/J
      gay/J, lesbian/N, transgender/J, transgendered/J
      gay/J, transgender/J, bisexual/J, lgbt/N
      gay/J, bisexual/J, transgender/J, lesbian/N
      gay/J, transgender/J, bisexual/J, transsexual/J
      gay/J, bisexual/J, lesbian/N, transgender/J
      0.485
    
    
      essay/N
      pamphlet/N, monograph/N, poem/N, treatise/N
      book/N, pamphlet/N, monograph/N, two-volume/J
      book/N, pamphlet/N, poem/N, monograph/N
      book/N, monograph/N, pamphlet/N, poem/N
      monograph/N, pamphlet/N, book/N, poem/N
      0.650



In [34]:

    
# Reshape to one row per (word, column) pair and print the table as LaTeX
print(pd.DataFrame(df.stack()).to_latex())









    



\begin{tabular}{lll}
\toprule
          &   &                                                  0 \\
\midrule
balkans/N & 0 &  transoxiana/N, dagestan/N, arakan/N, ostsiedlu... \\
          & 1 &  cisalpine/N, interbellum/N, south-eastern/N, b... \\
          & 2 &  transcaucasia/N, dodecanese/N, carpathians/UNK... \\
          & 4 &     transcaucasia/N, rumelia/N, makran/N, ingria/N \\
          & 5 &  dodecanese/N, bashkortostan/N, north-eastern/N... \\
          & mw\_dice &                                              0.176 \\
lesbian/J & 0 &   gay/J, lesbian/N, transgender/J, transgendered/J \\
          & 1 &           gay/J, transgender/J, bisexual/J, lgbt/N \\
          & 2 &        gay/J, bisexual/J, transgender/J, lesbian/N \\
          & 4 &    gay/J, transgender/J, bisexual/J, transsexual/J \\
          & 5 &        gay/J, bisexual/J, lesbian/N, transgender/J \\
          & mw\_dice &                                              0.485 \\
ottawa/N & 0 &    montreal/N, winnipeg/N, catharines/N, toronto/N \\
          & 1 &       catharines/N, oshawa/N, winnipeg/N, quebec/N \\
          & 2 &    toronto/N, montreal/N, catharines/N, winnipeg/N \\
          & 4 &      montreal/N, toronto/N, calgary/N, saskatoon/N \\
          & 5 &     mississauga/N, toronto/N, guelph/N, winnipeg/N \\
          & mw\_dice &                                              0.368 \\
sneaker/N & 0 &   overalls/N, pinstripe/N, sleeveless/J, necktie/N \\
          & 1 &    sweatshirt/N, headband/N, chiffon/N, overalls/N \\
          & 2 &       bandana/N, stiletto/N, dreadlock/N, kimono/N \\
          & 4 &      sweatshirt/N, necktie/N, tights/N, leggings/N \\
          & 5 &     see-through/J, kimono/N, tattered/J, pajamas/N \\
          & mw\_dice &                                              0.296 \\
essay/N & 0 &        pamphlet/N, monograph/N, poem/N, treatise/N \\
          & 1 &      book/N, pamphlet/N, monograph/N, two-volume/J \\
          & 2 &            book/N, pamphlet/N, poem/N, monograph/N \\
          & 4 &            book/N, monograph/N, pamphlet/N, poem/N \\
          & 5 &            monograph/N, pamphlet/N, book/N, poem/N \\
          & mw\_dice &                                               0.65 \\
falsify/V & 0 &  substantiate/V, cross-examine/V, reexamine/V, ... \\
          & 1 &   false/J, adduce/V, malfeasance/N, substantiate/V \\
          & 2 &  rebut/V, misrepresent/V, willfully/RB, groundl... \\
          & 4 &          falsified/J, adduce/V, mislead/V, retry/V \\
          & 5 &  falsification/N, falsified/J, misrepresent/V, ... \\
          & mw\_dice &                                              0.357 \\
inborn/J & 0 &  demonstrable/J, neuropsychological/J, contrain... \\
          & 1 &  posturing/N, physiologically/RB, self-percepti... \\
          & 2 &  pernicious/J, nonverbal/J, uncomplicated/J, un... \\
          & 4 &  internalization/N, generalised/J, flecainide/N... \\
          & 5 &  psychopathy/N, paraphilia/N, internalization/N... \\
          & mw\_dice &                                              0.115 \\
\bottomrule
\end{tabular}



In [35]:

    
# Same comparison without an explicit word list; note that this rebinds `df`.
# NOTE(review): how compare_neighbours picks words when none are given is not visible here.
df = compare_neighbours(thes, [0, 1, 2, 4, 5])
df.head()









    Out[35]:






  
    
      
      0
      1
      2
      4
      5
    
  
  
    
      echolocation/N
      vocalization/N, vocalisation/N, bioluminescenc...
      locomotion/N, vocalization/N, lifeform/N, vert...
      well-designed/J, oscilloscope/N, anatomic/J, t...
      velociraptor/N, mimic/N, baleen/N, trackball/N
      multitasking/N, hallucinogen/N, mpt/N, spherom...
    
    
      outgassing/N
      vaporization/N, solidification/N, quenching/N,...
      None
      None
      None
      None
    
    
      mezzo-soprano/N
      coloratura/N, contralto/N, countertenor/N, bas...
      contralto/N, soprano/N, bass-baritone/N, color...
      contralto/N, flautist/N, soprano/N, coloratura/N
      contralto/N, bass-baritone/N, harpist/N, count...
      contralto/N, bass-baritone/N, countertenor/N, ...
    
    
      carnivora/N
      eudicot/N, procellariiformes/N, conspecific/N,...
      proteaceae/N, euphorbiaceae/N, ammonite/J, utr...
      None
      None
      None
    
    
      squire/N
      jorma/N, macrae/N, bostic/N, formby/N
      duncan/N, wilkie/N, tyrell/N, berryman/N
      donnell/N, monro/N, dashwood/N, underhill/N
      frohman/N, binns/N, henshall/N, comerford/N
      strutt/N, o'hagan/N, hardcastle/N, hannon/N



In [47]:

    
# Every entry that occurs in at least one of the first three vector sets
all_feats = set.union(*[set(v.keys()) for v in thes[:3]])
len(all_feats)









    Out[47]:





83267



In [48]:

    
from collections import Counter
# For each entry, count how many of the first three vector sets contain it,
# then tally how many entries fall under each count
Counter(sum(f in v for v in thes[:3]) for f in all_feats)









    Out[48]:





Counter({3: 51579, 1: 18480, 2: 13208})



In [50]:

    
# Load the vectors from the '...100perc.unigr.strings.rep0' file.
# NOTE(review): despite its name, `pattern` holds a single concrete path here, and it is
# a hard-coded absolute path that only exists on the original machine.
pattern = '/lustre/scratch/inf/mmb28/FeatureExtractionToolkit/word2vec_vectors/word2vec-wiki-100perc.unigr.strings.rep0'
v_avg = V.from_tsv(pattern)



In [52]:

    
# Initialise the 100perc vectors with n_neighbors=10 and show how many entries they hold
v_avg.init_sims(n_neighbors=10)
len(v_avg)









    Out[52]:





226627



In [66]:

    
# Entries of the 100perc vectors that are missing from thes[0] ...
new_entries = set(v_avg.keys()) - set(thes[0].keys())
# ... and entries that both have in common
old_entries = set(v_avg.keys()) & set(thes[0].keys())



In [63]:

    
# New entries whose name ends in '/N' (the noun tag used in the entry names)
new_nouns = [x for x in new_entries if x.endswith('/N')]



In [77]:

    
# Sizes of: entries only in the 100perc vectors, entries shared with thes[0], new noun entries
len(new_entries), len(old_entries), len(new_nouns)









    Out[77]:





(161314, 65313, 135837)



In [73]:

    
# Display the entries that are only in the 100perc vectors.
# NOTE(review): this dumps the whole set (over 160k items by the count reported earlier) into
# the output; consider showing a small slice, e.g. sorted(new_entries)[:50].
new_entries









    Out[73]:





{'jungleland/N',
 'us-hus/N',
 'blithely/RB',
 'font-weight/J',
 'pipelining/N',
 'saka/UNK',
 'nearshore/N',
 'limbe/N',
 'stigmellum/N',
 'hibbard/N',
 'kanta/N',
 'gommendy/N',
 'macedo/N',
 'waymark/N',
 'nadc/N',
 'claypole/N',
 'flagiocathlete/N',
 'limber/J',
 'zarif/N',
 'kenda/N',
 'oshiro/N',
 'pedrosa/N',
 'angreal/N',
 'kilworth/N',
 "anoa'i/N",
 'montfaucon/N',
 'autologous/J',
 'dotson/N',
 'freeskiing/N',
 'wae/N',
 'premaxillary/J',
 'bartkowski/N',
 'de-stalinization/N',
 'prince-bishopric/J',
 'm-g-m/N',
 'marj/N',
 'deporte/N',
 'liberal-conservative/J',
 'luweero/N',
 'privatised/J',
 'arnsberg/N',
 'headboard/N',
 'macfarren/N',
 'gladney/N',
 'slaves/UNK',
 'stepping-stone/N',
 'add-in/N',
 'sizer/N',
 'mitsuki/N',
 'deianeira/N',
 'zerelda/N',
 'lookahead/N',
 'unionised/J',
 'rayment/N',
 'multiphoton/J',
 'zebediah/N',
 'covergirl/N',
 'zululand/N',
 'ktu/N',
 'trinamool/N',
 'klimek/N',
 'hot-button/J',
 'simo/N',
 'jassi/N',
 'azariah/N',
 'risker/N',
 'gryce/N',
 'one-carbon/J',
 'urara/N',
 'nuclear-capable/J',
 'lulach/N',
 'convertase/N',
 'videoton/N',
 'majorat/N',
 'madhan/N',
 'upson/N',
 'laramide/N',
 'jiwa/N',
 'watkiss/N',
 'inestimable/J',
 'hanumangarh/N',
 'bridgeland/N',
 'macaronesian/J',
 'ganser/N',
 'l&yr/N',
 'doamna/N',
 'heraclides/N',
 'baudette/N',
 'fleetname/N',
 'bondoc/N',
 'plain-clothes/J',
 'litherland/N',
 'hallaur/N',
 'petras/N',
 'tonythetiger/N',
 'rajpoot/N',
 'parornix/N',
 'abstentionism/N',
 'eystein/N',
 'ishibashi/N',
 'lapillus/N',
 'propitiatory/J',
 'wildeshausen/N',
 'virsliga/N',
 'artistique/N',
 'xizong/N',
 'septet/N',
 'teuvo/N',
 'gatefold/J',
 'anti-treaty/N',
 'sinicized/J',
 'aberlour/N',
 'putrid/J',
 'shir/N',
 'mrb/N',
 'rigsdaler/N',
 'oryctolagus/N',
 'roseburg/N',
 'lovering/N',
 'wickersham/N',
 'giuffria/N',
 'lausd/N',
 'goria/N',
 'anti-indian/J',
 'nodosaurid/N',
 'quemoy/N',
 'seachnasaigh/N',
 'fritzlar/N',
 'souphanouvong/N',
 'bozkurt/N',
 'alloway/N',
 'darland/N',
 'beaux-arts/UNK',
 'wyllys/N',
 'atanasov/N',
 'rhumb/N',
 'barsky/N',
 'cella/N',
 'makossa/N',
 'cantarella/N',
 'adk/N',
 'padwa/N',
 'saraiva/N',
 'misattribution/N',
 'freiman/N',
 'amh/N',
 'nightrage/N',
 'vion/N',
 'indolence/N',
 'cibola/N',
 'playset/N',
 'gambela/N',
 'troponin/N',
 'community-owned/J',
 'silicide/N',
 'rishel/N',
 'hafs/N',
 'unlocking/N',
 'fistandantilus/N',
 'kulukundis/N',
 'hron/N',
 'superspeed/N',
 'single-purpose/J',
 'commentaries/UNK',
 'chavdar/N',
 'tamako/N',
 'campau/N',
 'defore/N',
 'selve/N',
 'ochraceous/N',
 'verses/UNK',
 'parcs/N',
 'stichting/N',
 'ausland/N',
 'greenspun/N',
 'misao/N',
 'xamax/N',
 'hamartium/N',
 'komorowski/N',
 'czarniecki/N',
 'zakai/N',
 'mcadams/N',
 'tanjong/N',
 'injuries/UNK',
 'voltio/N',
 'post-baccalaureate/J',
 'chaudhury/N',
 'thoms/N',
 'laziest/J',
 'eleventh-century/J',
 'nonchalance/N',
 'amby/N',
 'tree-kangaroo/N',
 'tarasov/N',
 'poppies/N',
 'dowell/N',
 'bhaun/N',
 'monist/N',
 'chest/V',
 'phasis/N',
 'treepy/N',
 'ignacia/N',
 'celtiberian/J',
 'cedarvale/N',
 'law-maker/N',
 'wancheng/N',
 'tristeza/N',
 'landskap/N',
 'florentino/N',
 'sarker/N',
 'steeg/N',
 'novisuccinea/N',
 'kinka/N',
 'sieger/N',
 'beste/N',
 'noncredit/J',
 'dismasted/J',
 'airdrome/N',
 'queen-in-council/N',
 'news-sentinel/N',
 'saperstein/N',
 'nine-story/J',
 'shesha/N',
 'skellington/N',
 'rambles/N',
 'ahmadis/UNK',
 'khirbet/N',
 'bengal/J',
 'inhalational/J',
 'giannitsa/N',
 'beams/UNK',
 'openside/N',
 'work-around/N',
 'darcis/N',
 'foxcroft/N',
 'pnau/N',
 'squibb/N',
 'deathrocker/N',
 'a-l/N',
 'co-religionist/N',
 'francisquito/N',
 'motorized/N',
 'eiph/N',
 'lobato/N',
 'moc/N',
 'servites/N',
 'brumel/N',
 'non-athletic/J',
 'talk-page/N',
 'hagiographer/N',
 'pekar/N',
 'irie/N',
 'besmirch/V',
 'taue/N',
 'ituano/N',
 'asato/N',
 'worsthorne/N',
 'nmvhw/N',
 'harpidae/N',
 'frode/N',
 'ziemke/N',
 'penglai/N',
 'dank/J',
 'schildkraut/N',
 'relist/V',
 'eyolf/N',
 'well-aimed/J',
 'johnnies/N',
 'nationalrat/N',
 'kandor/N',
 'hares/N',
 'bagnoli/N',
 'kamille/N',
 'guanches/N',
 'sequeira/N',
 'odissi/N',
 'microsurgery/N',
 'marilena/N',
 'aliza/N',
 'mabon/N',
 'al-megrahi/N',
 'polier/N',
 'guilmant/N',
 'sebum/N',
 'filmmuseum/N',
 'majka/N',
 'artos/N',
 'skytte/N',
 'mdh/N',
 'milot/N',
 'pokerstars/N',
 'smr/N',
 'crashdown/N',
 'evang/N',
 'evildoer/N',
 'tsutsui/N',
 'upstroke/N',
 'gonzague/N',
 'ashvin/N',
 'rave/J',
 'continents/UNK',
 'atar/N',
 'mirisch/N',
 'spheeris/N',
 'yeongjo/N',
 'dpr/N',
 'interlibrary/J',
 'vaisey/N',
 'zrenjanin/N',
 'cherones/N',
 'shudder/N',
 'downtown/J',
 'lichtenberger/N',
 'shivaratri/N',
 'ulp/N',
 'rogue/V',
 'fochabers/N',
 'vinayagar/N',
 'sarma/N',
 'aylestone/N',
 'bullwhip/N',
 'treasurers/UNK',
 'quinet/N',
 'ringpost/N',
 'flyhalf/N',
 'muiredach/N',
 'fat-free/N',
 'olli/N',
 'waechter/N',
 'eberhart/N',
 'tomoe/N',
 'stier/N',
 'cavea/N',
 'demokratische/N',
 'sherritt/N',
 'kall/N',
 'bathgate/N',
 'sterett/N',
 'january-february/N',
 'jeffress/N',
 'exegete/N',
 'free-flight/J',
 'vicars/UNK',
 'rhynie/N',
 'qarqar/N',
 'banteay/N',
 'mirosternus/N',
 'rhapsode/N',
 'muire/N',
 'pinn/N',
 'skiboard/N',
 'post-menopausal/J',
 'self-trained/J',
 'hoxsey/N',
 'khukri/N',
 'bhagavathy/N',
 'cusplet/N',
 'fiv/N',
 'hatting/N',
 'totternhoe/N',
 'harkleroad/N',
 'rajshahi/N',
 'exposed/N',
 'chno/N',
 'hyle/N',
 'balangay/N',
 'opening-day/J',
 'hydro-electric/N',
 'disengaged/J',
 'underconsumption/N',
 'saltfleet/N',
 'genzano/N',
 'spier/N',
 'lurianic/J',
 'loe/N',
 'torment/N',
 'circumfix/N',
 'masafumi/N',
 'had/N',
 'noughty/N',
 'subliminally/RB',
 'headstart/N',
 'vermes/N',
 'actualization/N',
 'fmn/N',
 'melanocephalus/N',
 'mycelial/J',
 'coby/N',
 'panegyrist/N',
 'parapsychological/J',
 'qaitbay/N',
 'beban/N',
 'zehn/N',
 'wakanohana/N',
 'itam/N',
 'sirian/N',
 'caridad/N',
 'zerah/N',
 'miracles/N',
 '#eee/N',
 'hewer/N',
 'mopsus/N',
 'naucratis/N',
 'phoblacht/N',
 'waterland/N',
 'petard/N',
 'koryu/N',
 'one/V',
 'taverna/N',
 'bovet/N',
 'maalaala/N',
 'kmm/N',
 'cauvery/N',
 'wavenumber/N',
 'muzika/UNK',
 'short-wheelbase/N',
 'tuberville/N',
 'westerveld/N',
 'calcot/N',
 'leba/N',
 'taenia/N',
 'eklavya/N',
 'playing/J',
 'brionne/N',
 'phlogopite/N',
 'incae/N',
 'sudeley/N',
 'umber/N',
 'kennebunk/N',
 'ticinese/N',
 'lexi/N',
 'ituri/N',
 'rewa/N',
 'dilated/N',
 'big-band/N',
 'holarctic/J',
 'nacion/N',
 'awwa/N',
 'torat/N',
 'hacksaw/N',
 'zhizn/N',
 'critchfield/N',
 'dinefwr/N',
 'nassif/N',
 'andrology/N',
 'swetnam/N',
 'ortsgemeinde/N',
 'zijlstra/N',
 'medcom/N',
 'orkestar/N',
 'blepharitis/N',
 'eberswalde/N',
 'belford/N',
 'prieur/N',
 'chickenfoot/N',
 'osh/N',
 'kerchak/N',
 'geovanni/N',
 'cradley/N',
 'gannaway/N',
 'bafut/N',
 'dues-paying/J',
 'jianzhi/N',
 'denisova/N',
 'citytrain/N',
 'gimbel/N',
 'trach/N',
 'ichthyosis/N',
 'daniello/N',
 'aul/N',
 'defreitas/N',
 'shinwa/N',
 'helwys/N',
 'ddr-oberliga/N',
 'ranni/N',
 'kishiwada/N',
 'blaha/N',
 'harless/N',
 'kagnew/N',
 'technologically-advanced/J',
 'triplane/N',
 'carlucci/N',
 'madding/J',
 'independant/J',
 'hirota/N',
 'chu/UNK',
 'saiyid/N',
 'moonface/N',
 'integumentary/J',
 'postdoc/N',
 't-top/N',
 'moist/V',
 'nolichucky/N',
 'vimalamitra/N',
 'toohey/N',
 'kieswetter/N',
 'aris/N',
 'corbeil/N',
 'woodyard/N',
 'pacem/N',
 'graun/N',
 'trilateral/J',
 'batibo/N',
 'tier-one/J',
 'krishnanagar/N',
 'mcclatchy/N',
 'myeongjong/N',
 'yaki/N',
 'civ/N',
 'nitrosamine/N',
 'kartuzy/N',
 'tolo/N',
 'cabrero/N',
 'kavango/N',
 'keiichi/N',
 'ansted/N',
 'malott/N',
 'dehra/N',
 'elliot-murray-kynynmound/N',
 'pauperism/N',
 'detsen/N',
 "d'oh/N",
 'ensis/N',
 'tsangpo/N',
 'maharal/N',
 'skandia/N',
 'szolnok/N',
 'userfication/N',
 'vergara/N',
 'surendranagar/N',
 'chitra/N',
 'evagoras/N',
 'nuria/N',
 'juliette/UNK',
 'ostracon/N',
 'merete/N',
 'steam-operated/J',
 'maxi-singles/N',
 'borosilicate/J',
 'stairlift/N',
 'sacajawea/N',
 'kohlschreiber/N',
 'kamala/N',
 'yildiz/N',
 'nstar/N',
 'soccer-specific/J',
 'webworm/N',
 'mte/N',
 'al-shabab/N',
 'wanaque/N',
 'part-time/N',
 'iwasawa/N',
 'hally/N',
 'decoded/N',
 'burhan/N',
 'irrationally/RB',
 'aloys/N',
 'quico/N',
 'rebellions/N',
 'footlocker/N',
 'prekmurian/J',
 'blackground/N',
 'frangieh/N',
 'mahanagar/N',
 'krishnamurthi/N',
 'solnhofen/N',
 'marilou/N',
 'edjohnston/N',
 'tgm/N',
 'hippodamia/N',
 'caparas/N',
 'leintwardine/N',
 'dibny/N',
 'wilen/N',
 'flavour/N',
 'ultratop/N',
 'myriokephalon/N',
 'buechler/N',
 'cablelabs/N',
 'multivariable/J',
 'rheinfelden/N',
 'compost/V',
 'take-out/J',
 'adjusted/N',
 'cussen/N',
 'anani/N',
 'passages/UNK',
 'tomasson/N',
 'streamflow/N',
 'foundling/J',
 'natin/N',
 'santosh/N',
 'hawken/N',
 'fair-skinned/J',
 'newsome/N',
 'lyssa/N',
 'mils/N',
 'wooley/N',
 'sarg/N',
 'laugier/N',
 'sure-fire/J',
 'sergeants/N',
 'crucifer/N',
 'incirlik/N',
 'mandamento/N',
 'corporatisation/N',
 'hmie/N',
 'radzymin/N',
 'non-sworn/J',
 'voy/N',
 'rur/N',
 'charro/N',
 'ceridwen/N',
 'hcus/N',
 'shifrin/N',
 'craterus/N',
 'deshpande/N',
 'shawsheen/N',
 'interpersonal/N',
 'forchheim/N',
 'justo/N',
 'world-wide/N',
 'ambient/N',
 'veale/N',
 'a-international/J',
 'matveyev/N',
 'bureaus/N',
 'pufferfish/N',
 'kaptol/N',
 'awen/N',
 'omnimon/N',
 'antifascist/J',
 'spui/N',
 'platanthera/N',
 'ghormley/N',
 'fuelling/N',
 'reequipp/V',
 'kouro/N',
 'komu/N',
 'tulasus/N',
 'cran/N',
 'kiet/N',
 'oraibi/N',
 'lowood/N',
 'flertydig/N',
 'eventide/N',
 'bagnolet/N',
 'dumbwaiter/N',
 'tschammerpokal/N',
 'yamaki/N',
 'cheboksary/N',
 'bindy/N',
 'manglerud/N',
 'lancasters/UNK',
 'ergon/N',
 'naruki/N',
 'bandem/N',
 'self-assembly/N',
 'mightier/J',
 'petrich/N',
 'syriza/N',
 'gynoecium/N',
 'astar/N',
 'superfamilium/N',
 'multi-drug/J',
 'rodley/N',
 'dapa/N',
 'ziz/N',
 'studiorum/N',
 'lavalle/N',
 'bayda/N',
 'osada/N',
 'yazawa/N',
 'howerdel/N',
 'basiliensis/N',
 'patriota/N',
 'proliferative/J',
 'sidenote/N',
 'mhv/N',
 'iguazu/N',
 'jasin/N',
 'osipovich/N',
 'tonalist/N',
 'free-speech/N',
 'harar/N',
 'cuscatlan/N',
 'fantasio/UNK',
 'hoveida/N',
 'borgwarner/N',
 'vice/RB',
 'oir/N',
 'utters/N',
 'gesso/N',
 'jukun/N',
 'hemanta/N',
 'sycorax/N',
 'glossa/N',
 'mintzberg/N',
 'balseiro/N',
 'flook/N',
 'wach/N',
 'devotional/N',
 'sey/N',
 'bencao/N',
 'beavers/N',
 'nelonen/N',
 'devante/N',
 'mpande/N',
 'misclassification/N',
 'penseroso/N',
 'khudadad/N',
 'bajwa/N',
 'purism/N',
 'jatte/N',
 'pasai/N',
 'ypbpr/N',
 'sepolcro/N',
 'northug/N',
 'vathek/N',
 'baddest/J',
 'twenty-eighth/N',
 'geschwaderkommodore/N',
 'partulidae/N',
 'zarathos/N',
 'hindon/N',
 'asthmatic/N',
 'mazar-e-sharif/N',
 'mosport/N',
 'beastly/J',
 'tammet/N',
 'sennen/N',
 'open-mouthed/J',
 'rosendale/N',
 'routt/N',
 'parvez/N',
 'alloying/J',
 'misdirected/J',
 'kidde/N',
 'izvolsky/N',
 'centrex/N',
 'santis/N',
 'closed-source/J',
 'light-harvesting/J',
 'shapely/J',
 'tulkarem/N',
 'danial/N',
 'deviantart/N',
 'soltis/N',
 'kantakouzene/N',
 'skeggs/N',
 'mandrel/N',
 'vlj/N',
 'rosoft/N',
 'industrialists/UNK',
 'bny/N',
 'decorous/J',
 'mohra/N',
 'yusof/N',
 'almaden/N',
 'waynflete/N',
 'uvarov/N',
 'issuant/N',
 'oguri/N',
 'varkey/N',
 'diwata/N',
 'glavin/N',
 'controversialist/N',
 'renaudot/N',
 'labrie/N',
 'newly-hired/J',
 'coolock/N',
 'lincolnville/N',
 'mladina/N',
 'trunks/UNK',
 'cahaba/N',
 'gsx-r/N',
 'tubulin/N',
 'bikash/N',
 'intercounty/N',
 'denliner/N',
 'activesync/N',
 'etcheverry/N',
 'seymore/N',
 'omo/N',
 'ensamble/N',
 'cardone/N',
 "nisga'a/N",
 'sonics/UNK',
 'ciaran/N',
 'torv/N',
 'winchilsea/N',
 'viimsi/N',
 'mclintock/N',
 'suncook/N',
 'benchrest/N',
 'sinugra/N',
 'alupas/N',
 'computed/N',
 'schoendoerffer/N',
 'caras/N',
 'cave-like/J',
 'tombo/N',
 'volokolamsk/N',
 'eucla/N',
 'reinke/N',
 'sirr/N',
 'negligee/N',
 'ngu/N',
 'yannis/N',
 'cittern/N',
 'mifflinburg/N',
 'loddiges/N',
 'stryn/N',
 'kurobe/N',
 'flatman/N',
 "o'daly/N",
 'zwicker/N',
 'macronutrient/N',
 'wolfie/N',
 'hallux/N',
 'eight-thousander/N',
 'mixed-income/J',
 'phs/N',
 'winxp/N',
 'upto/N',
 'veloz/N',
 'wencai/N',
 'sclerite/N',
 'maronite/J',
 'yhombi-opango/N',
 'elderton/N',
 'varietal/J',
 'varkaus/N',
 'campbellton/N',
 'gung/V',
 'razaf/N',
 'unpunished/J',
 'sub-urban/J',
 'egen/N',
 'gunning/N',
 'metallization/N',
 'salcido/N',
 'brittingham/N',
 'kasady/N',
 'saint-gobain/N',
 'kasauli/N',
 'vasanta/N',
 'gevaert/N',
 'kusti/N',
 'japygidae/N',
 'kronoberg/N',
 'landore/N',
 'aaadddaaammm/N',
 'houthis/N',
 'istres/N',
 'unlikely/N',
 'selk/N',
 'fmla/N',
 'government-approved/J',
 'theydon/N',
 'damsire/N',
 'caerulea/N',
 'lamanites/UNK',
 'jayalalithaa/N',
 'fangire/N',
 'swallows/N',
 'nisa/N',
 'catemaco/N',
 'coprosma/N',
 'shankaracharya/N',
 'triiodothyronine/N',
 'lustron/N',
 'grimaldo/N',
 'top-six/J',
 'soundstream/N',
 'labasa/N',
 'mindbender/N',
 'skipper/V',
 'pathologie/N',
 'gothicus/N',
 'child-rearing/N',
 'red-faced/J',
 'ves/UNK',
 'irredeemably/RB',
 'monssen/N',
 'ables/N',
 'maciste/N',
 'explore/N',
 'empennage/N',
 'traut/N',
 'settignano/N',
 'krakus/N',
 'sylviidae/N',
 "ev'ry/N",
 'neofolk/N',
 'pastis/N',
 'dray/N',
 'bicyclic/J',
 'falkor/N',
 'conard/N',
 'tumu/N',
 'presnell/N',
 'smagorinsky/N',
 'phosphorous/J',
 'gubernium/N',
 'german-americans/UNK',
 'purplish-red/J',
 'ntb/N',
 'unibody/N',
 'kephart/N',
 'lathus/N',
 'arngrim/N',
 'outremont/N',
 'trustkill/N',
 'toiyabe/N',
 'helpmate/N',
 'keyshawn/N',
 'criss-crossed/J',
 'nmmt/N',
 'elizaveta/N',
 'slingerlands/N',
 'celcom/N',
 'black-throated/J',
 'zohn/N',
 'maharajas/UNK',
 'repairer/N',
 'senatorial/N',
 'unexpurgated/J',
 'githyankus/N',
 'xuanwei/N',
 'confer/N',
 'cahit/N',
 'laff/N',
 'nesquehoning/N',
 'thurn/N',
 'ihrer/N',
 'regalado/N',
 'dimera/N',
 'riri/N',
 'splenectomy/N',
 'mujhe/N',
 'libration/N',
 'obituarist/N',
 'summer-long/J',
 'geologie/N',
 'measured/J',
 'gurdwaras/UNK',
 'urmila/N',
 'pandacan/N',
 'amyotrophic/J',
 'jizz/N',
 'cohost/V',
 'fortec/N',
 'whalum/N',
 'mortified/J',
 'maozhen/N',
 'bahrami/N',
 'popjustice/N',
 'warmoth/N',
 'paulownia/N',
 'colorist/N',
 'hyperstimulation/N',
 'hajjar/N',
 'yankel/N',
 'cusick/N',
 'sicker/J',
 'cti/N',
 'cornewall/N',
 'sketch-comedy/J',
 'keston/N',
 'gradualist/J',
 'ditlev/N',
 'manhattan-bound/J',
 'clinometer/N',
 'bisley/N',
 'keep/N',
 'polyrhythmic/J',
 'jianzhou/N',
 'conceptual/N',
 'versicolor/N',
 'adorable/J',
 'keshub/N',
 'carbon-oxygen/J',
 'sidiq/N',
 'trousseau/N',
 'hashshashin/N',
 'collum/N',
 'abjuration/N',
 'lethe/N',
 'halfa/N',
 'brynmawr/N',
 'non-playoff/J',
 'borno/N',
 'rupees/UNK',
 'kmtv/N',
 'carmi/N',
 'azz/N',
 'daubenton/N',
 'well-traveled/J',
 'bagot/N',
 ...}



In [80]:

    
# Look up the nearest neighbours of the entry 'measured/J' in `v_avg`
# (`v_avg` is not defined in this cell; it must already exist in the kernel).
# The recorded output is a list of (entry, score) tuples with scores in
# ascending order -- presumably a distance, so smaller means closer;
# TODO confirm against the vector store's implementation.
v_avg.get_nearest_neighbours('measured/J')









    Out[80]:





[('extravert/N', 1.2206645427572558),
 ('photoinhibition/N', 1.2268042261113046),
 ('inessential/J', 1.2355771198070054),
 ('spillover/J', 1.255496637360789),
 ('strangelet/N', 1.2661434508282658),
 ('retrogression/N', 1.2728655184461561),
 ('photoionization/N', 1.2739789904460517),
 ('self-recognition/N', 1.2808519646775918),
 ('microphysic/N', 1.2869837189422086),
 ('self-affirmation/N', 1.2886332268324958)]



In [ ]:

	name1	acc1	name2	acc2	mean_diff	pval	significant
0	0	0.471414	1	0.469746	0.001668	0.8	False
1	0	0.471414	2	0.450353	0.021061	0.0	True
2	1	0.469746	2	0.450353	0.019393	0.0	True

	name1	acc1	name2	acc2	mean_diff	pval	significant
0	0	0.666391	1	0.658423	0.007968	0.676	False
1	0	0.666391	2	0.622785	0.043607	0.020	False
2	1	0.658423	2	0.622785	0.035639	0.066	False

	name1	acc1	name2	acc2	mean_diff	pval	significant
0	Add-0	0.471414	Mult-0	0.452228	0.019186	0.000	True
1	Add-0	0.471414	Left-0	0.322392	0.149022	0.000	True
2	Add-0	0.471414	Right-0	0.313756	0.157658	0.000	True
3	Add-0	0.471414	Add-3	0.451337	0.020077	0.004	True
4	Add-0	0.471414	Mult-3	0.419040	0.052374	0.000	True
5	Add-0	0.471414	Left-3	0.307700	0.163714	0.000	True
6	Add-0	0.471414	Right-3	0.294694	0.176720	0.000	True
7	Add-0	0.471414	Add-2	0.487203	0.015789	0.014	False
8	Add-0	0.471414	Mult-2	0.481405	0.009992	0.088	False
9	Add-0	0.471414	Left-2	0.327719	0.143695	0.000	True
10	Add-0	0.471414	Right-2	0.325976	0.145438	0.000	True
11	Add-0	0.471414	Add-3	0.497166	0.025752	0.000	True
12	Add-0	0.471414	Mult-3	0.488409	0.016995	0.002	True
13	Add-0	0.471414	Left-3	0.337498	0.133916	0.000	True
14	Add-0	0.471414	Right-3	0.319933	0.151481	0.000	True
15	Add-0	0.471414	Add-4	0.509483	0.038069	0.000	True
16	Add-0	0.471414	Add-5	0.512566	0.041152	0.000	True
17	Mult-0	0.452228	Left-0	0.322392	0.129836	0.000	True
18	Mult-0	0.452228	Right-0	0.313756	0.138472	0.000	True
19	Mult-0	0.452228	Add-3	0.451337	0.000891	0.890	False
20	Mult-0	0.452228	Mult-3	0.419040	0.033187	0.000	True
21	Mult-0	0.452228	Left-3	0.307700	0.144528	0.000	True
22	Mult-0	0.452228	Right-3	0.294694	0.157534	0.000	True
23	Mult-0	0.452228	Add-2	0.487203	0.034975	0.000	True
24	Mult-0	0.452228	Mult-2	0.481405	0.029178	0.000	True
25	Mult-0	0.452228	Left-2	0.327719	0.124509	0.000	True
26	Mult-0	0.452228	Right-2	0.325976	0.126252	0.000	True
27	Mult-0	0.452228	Add-3	0.497166	0.044938	0.000	True
28	Mult-0	0.452228	Mult-3	0.488409	0.036182	0.000	True
29	Mult-0	0.452228	Left-3	0.337498	0.114730	0.000	True
...	...	...	...	...	...	...	...
123	Mult-2	0.481405	Add-4	0.509483	0.028078	0.000	True
124	Mult-2	0.481405	Add-5	0.512566	0.031161	0.000	True
125	Left-2	0.327719	Right-2	0.325976	0.001743	0.810	False
126	Left-2	0.327719	Add-3	0.497166	0.169447	0.000	True
127	Left-2	0.327719	Mult-3	0.488409	0.160691	0.000	True
128	Left-2	0.327719	Left-3	0.337498	0.009779	0.126	False
129	Left-2	0.327719	Right-3	0.319933	0.007786	0.252	False
130	Left-2	0.327719	Add-4	0.509483	0.181764	0.000	True
131	Left-2	0.327719	Add-5	0.512566	0.184848	0.000	True
132	Right-2	0.325976	Add-3	0.497166	0.171190	0.000	True
133	Right-2	0.325976	Mult-3	0.488409	0.162434	0.000	True
134	Right-2	0.325976	Left-3	0.337498	0.011522	0.114	False
135	Right-2	0.325976	Right-3	0.319933	0.006043	0.400	False
136	Right-2	0.325976	Add-4	0.509483	0.183507	0.000	True
137	Right-2	0.325976	Add-5	0.512566	0.186591	0.000	True
138	Add-3	0.497166	Mult-3	0.488409	0.008756	0.120	False
139	Add-3	0.497166	Left-3	0.337498	0.159668	0.000	True
140	Add-3	0.497166	Right-3	0.319933	0.177233	0.000	True
141	Add-3	0.497166	Add-4	0.509483	0.012317	0.028	False
142	Add-3	0.497166	Add-5	0.512566	0.015400	0.012	False
143	Mult-3	0.488409	Left-3	0.337498	0.150912	0.000	True
144	Mult-3	0.488409	Right-3	0.319933	0.168476	0.000	True
145	Mult-3	0.488409	Add-4	0.509483	0.021074	0.000	True
146	Mult-3	0.488409	Add-5	0.512566	0.024157	0.000	True
147	Left-3	0.337498	Right-3	0.319933	0.017565	0.012	False
148	Left-3	0.337498	Add-4	0.509483	0.171985	0.000	True
149	Left-3	0.337498	Add-5	0.512566	0.175069	0.000	True
150	Right-3	0.319933	Add-4	0.509483	0.189550	0.000	True
151	Right-3	0.319933	Add-5	0.512566	0.192633	0.000	True
152	Add-4	0.509483	Add-5	0.512566	0.003083	0.618	False

	0	1	2	4	5	mw_dice
inborn/J	demonstrable/J, neuropsychological/J, contrain...	posturing/N, physiologically/RB, self-percepti...	pernicious/J, nonverbal/J, uncomplicated/J, un...	internalization/N, generalised/J, flecainide/N...	psychopathy/N, paraphilia/N, internalization/N...	0.115
balkans/N	transoxiana/N, dagestan/N, arakan/N, ostsiedlu...	cisalpine/N, interbellum/N, south-eastern/N, b...	transcaucasia/N, dodecanese/N, carpathians/UNK...	transcaucasia/N, rumelia/N, makran/N, ingria/N	dodecanese/N, bashkortostan/N, north-eastern/N...	0.176
sneaker/N	overalls/N, pinstripe/N, sleeveless/J, necktie/N	sweatshirt/N, headband/N, chiffon/N, overalls/N	bandana/N, stiletto/N, dreadlock/N, kimono/N	sweatshirt/N, necktie/N, tights/N, leggings/N	see-through/J, kimono/N, tattered/J, pajamas/N	0.296
falsify/V	substantiate/V, cross-examine/V, reexamine/V, ...	false/J, adduce/V, malfeasance/N, substantiate/V	rebut/V, misrepresent/V, willfully/RB, groundl...	falsified/J, adduce/V, mislead/V, retry/V	falsification/N, falsified/J, misrepresent/V, ...	0.357
ottawa/N	montreal/N, winnipeg/N, catharines/N, toronto/N	catharines/N, oshawa/N, winnipeg/N, quebec/N	toronto/N, montreal/N, catharines/N, winnipeg/N	montreal/N, toronto/N, calgary/N, saskatoon/N	mississauga/N, toronto/N, guelph/N, winnipeg/N	0.368
lesbian/J	gay/J, lesbian/N, transgender/J, transgendered/J	gay/J, transgender/J, bisexual/J, lgbt/N	gay/J, bisexual/J, transgender/J, lesbian/N	gay/J, transgender/J, bisexual/J, transsexual/J	gay/J, bisexual/J, lesbian/N, transgender/J	0.485
essay/N	pamphlet/N, monograph/N, poem/N, treatise/N	book/N, pamphlet/N, monograph/N, two-volume/J	book/N, pamphlet/N, poem/N, monograph/N	book/N, monograph/N, pamphlet/N, poem/N	monograph/N, pamphlet/N, book/N, poem/N	0.650

	0	1	2	4	5
echolocation/N	vocalization/N, vocalisation/N, bioluminescenc...	locomotion/N, vocalization/N, lifeform/N, vert...	well-designed/J, oscilloscope/N, anatomic/J, t...	velociraptor/N, mimic/N, baleen/N, trackball/N	multitasking/N, hallucinogen/N, mpt/N, spherom...
outgassing/N	vaporization/N, solidification/N, quenching/N,...	None	None	None	None
mezzo-soprano/N	coloratura/N, contralto/N, countertenor/N, bas...	contralto/N, soprano/N, bass-baritone/N, color...	contralto/N, flautist/N, soprano/N, coloratura/N	contralto/N, bass-baritone/N, harpist/N, count...	contralto/N, bass-baritone/N, countertenor/N, ...
carnivora/N	eudicot/N, procellariiformes/N, conspecific/N,...	proteaceae/N, euphorbiaceae/N, ammonite/J, utr...	None	None	None
squire/N	jorma/N, macrae/N, bostic/N, formby/N	duncan/N, wilkie/N, tyrell/N, berryman/N	donnell/N, monro/N, dashwood/N, underhill/N	frohman/N, binns/N, henshall/N, comerford/N	strutt/N, o'hagan/N, hardcastle/N, hannon/N