How many neighbours of an entry overlap lexically?

Proportion of neigh that overlap in the first 100 neighbours.


In [4]:
%cd ~/NetBeansProjects/ExpLosion/
from notebooks.common_imports import *
from gui.output_utils import *
from gui.user_code import pretty_names
from discoutils.thesaurus_loader import Vectors
from random import sample


/Volumes/LocalDataHD/m/mm/mmb28/NetBeansProjects/ExpLosion

In [5]:
path = '../FeatureExtractionToolkit/word2vec_vectors/composed/AN_NN_word2vec-wiki_15percent-rep0_Add.events.filtered.strings'
w = Vectors.from_tsv(path, allow_lexical_overlap=True)

In [6]:
w.init_sims(n_neighbors=100)

In [7]:
unigrams = list(x for x in w.keys() if x.count('_') < 1)
phrases = list(x for x in w.keys() if x.count('_') >= 1)

In [39]:
w.get_nearest_neighbours_linear.cache_clear()
%lprun -f Vectors.get_nearest_neighbours_linear w.get_nearest_neighbours_linear('car/N')

In [8]:
len(unigrams), len(phrases), len(w)


Out[8]:
(65309, 1079757, 1145066)

In [9]:
ratios = []
for entry in random.sample(phrases, 100):
    before = w.get_nearest_neighbours(entry)
    after = Vectors.remove_overlapping_neighbours(entry, before)
    ratios.append(len(after) / len(before))

In [10]:
# plt.hist(ratios, bins=20);
ax = sns.distplot(np.array(ratios), bins=20, kde_kws=dict(cut=True))
ax.set_xlim(0, 1)


Out[10]:
(0, 1)

In [27]:
sns.kdeplot?