In [4]:
# Switch the working directory to the ExpLosion checkout (machine-specific path).
# NOTE(review): the project-local imports and the relative data path used later
# presumably depend on this working directory — confirm before running elsewhere.
%cd ~/NetBeansProjects/ExpLosion/
from notebooks.common_imports import *
from gui.output_utils import *
from gui.user_code import pretty_names
from discoutils.thesaurus_loader import Vectors
from random import sample
In [5]:
# Location of the composed AN/NN word2vec vectors file, relative to the
# current working directory.
path = '../FeatureExtractionToolkit/word2vec_vectors/composed/AN_NN_word2vec-wiki_15percent-rep0_Add.events.filtered.strings'

# Read the TSV file into a Vectors object; lexical overlap is explicitly allowed.
w = Vectors.from_tsv(
    path,
    allow_lexical_overlap=True,
)
In [6]:
# Initialise similarity search on the loaded vectors with n_neighbors=100.
# NOTE(review): presumably this prepares the structure that the nearest-neighbour
# queries below rely on — confirm against discoutils.
w.init_sims(n_neighbors=100)
In [7]:
# Partition the entries of `w` by underscore: entries without any underscore
# are treated as unigrams, entries with at least one as phrases.
unigrams = [entry for entry in w.keys() if '_' not in entry]
phrases = [entry for entry in w.keys() if '_' in entry]
In [39]:
# Profiling cell: line-profile a single nearest-neighbour lookup for 'car/N'.
# cache_clear() runs first, presumably so the profiled call is not answered from
# a memoisation cache — confirm. %lprun comes from the line_profiler extension,
# which must already be loaded in the session.
w.get_nearest_neighbours_linear.cache_clear()
%lprun -f Vectors.get_nearest_neighbours_linear w.get_nearest_neighbours_linear('car/N')
In [8]:
# Sizes of the two partitions (entries without / with an underscore) and of `w` itself.
len(unigrams), len(phrases), len(w)
Out[8]:
In [9]:
# For a random sample of up to 100 phrases, record the fraction of each phrase's
# nearest neighbours that is left after Vectors.remove_overlapping_neighbours.
ratios = []
# `sample` is the name imported explicitly at the top of the notebook; the
# min() keeps the draw valid when fewer than 100 phrases are available.
for entry in sample(phrases, min(100, len(phrases))):
    before = w.get_nearest_neighbours(entry)
    if before is None or len(before) == 0:
        # No neighbours to filter: the ratio is undefined (division by zero), skip.
        continue
    after = Vectors.remove_overlapping_neighbours(entry, before)
    ratios.append(len(after) / len(before))
In [10]:
# Histogram plus KDE of the retained-neighbour ratios (a bare
# plt.hist(ratios, bins=20) would be the simpler alternative).
# NOTE(review): seaborn documents `cut` as a numeric factor; True acts as 1 —
# confirm this is the intended value.
ax = sns.distplot(
    np.array(ratios),
    bins=20,
    kde_kws={'cut': True},
)
# Ratios are fractions, so restrict the x axis to [0, 1].
ax.set_xlim(0, 1)
Out[10]:
In [27]:
# Interactive IPython help lookup for sns.kdeplot; exploratory leftover that
# produces no analysis result.
sns.kdeplot?