In [4]:
from heapq import heapify, heappushpop

import numpy as np
from tabulate import tabulate

from utils import get_npmi_part_path, PICKLE_PATH, TFIDF_PATH, NPMI_PART_SIZE, VOCABULARY_PATH

In [12]:
vocabulary = np.load(VOCABULARY_PATH)
voc_size = vocabulary.shape[0]

In [15]:
%%time

closest_minus_1 = [(-2, None)] * 10
closest_0 = [(-2, None)] * 10
closest_1 = [(-2, None)] * 10

for left in range(voc_size)[::NPMI_PART_SIZE]:
    npmi_part_path = get_npmi_part_path(left)
    npmi_submatrix = np.load(npmi_part_path)
    for t1 in range(npmi_submatrix.shape[0]):
        for term2 in range(npmi_submatrix.shape[1]):
            npmi = npmi_submatrix[t1, term2]
            term1 = left + t1
            # Don't consider equal terms
            if term1 == term2:
                continue
            pair = (vocabulary[term1], vocabulary[term2])
            heappushpop(closest_minus_1, (-npmi, pair))
            heappushpop(closest_0, (-abs(npmi), pair))
            heappushpop(closest_1, (npmi, pair))


CPU times: user 3h 26min 58s, sys: 56.6 s, total: 3h 27min 55s
Wall time: 3h 27min 18s

In [20]:
[(-c[0], c[1]) for c in closest_minus_1]


Out[20]:
[(-1.0, ('zzz', 'zy')),
 (-1.0, ('zzz', 'zygmund')),
 (-1.0, ('zzz', 'zygalakis')),
 (-1.0, ('zzz', 'zyy')),
 (-1.0, ('zzz', 'zzajklz')),
 (-1.0, ('zzz', 'zylberberg')),
 (-1.0, ('zzz', 'zyx')),
 (-1.0, ('zzz', 'zzs')),
 (-1.0, ('zzz', 'zz')),
 (-1.0, ('zzz', 'zzt'))]

In [17]:
closest_0


Out[17]:
[(-0.0, ('zzt', '3')),
 (-0.0, ('zzt', '4')),
 (-0.0, ('zzt', 'a')),
 (-0.0, ('zzt', 'on')),
 (-0.0, ('zzz', '2')),
 (-0.0, ('zzz', '3')),
 (-0.0, ('zzz', 'on')),
 (-0.0, ('zzz', '1')),
 (-0.0, ('zzz', '4')),
 (-0.0, ('zzz', 'a'))]

In [18]:
closest_1


Out[18]:
[(1.0, ('zzz', 'thir')),
 (1.0, ('zzz', 'tsketch')),
 (1.0, ('zzz', 'tpower')),
 (1.0, ('zzz', 'wi21')),
 (1.0, ('zzz', 'varimax')),
 (1.0, ('zzz', 'wi2k')),
 (1.0, ('zzz', 'ujs')),
 (1.0, ('zzz', 'xxk')),
 (1.0, ('zzz', 'xax')),
 (1.0, ('zzz', 'xjaxj'))]