In [4]:
from heapq import heapify, heappushpop
import numpy as np
from tabulate import tabulate
from utils import get_npmi_part_path, PICKLE_PATH, TFIDF_PATH, NPMI_PART_SIZE, VOCABULARY_PATH
In [12]:
# Load the vocabulary array from disk; its length is the number of terms and
# is what the later cells use to index the NPMI matrix parts.
vocabulary = np.load(VOCABULARY_PATH)
voc_size = len(vocabulary)
In [15]:
%%time
# Three independent fixed-size (10 slot) min-heaps, pre-filled with a
# (-2, None) placeholder.  A list of equal items already satisfies the heap
# invariant, so no heapify call is needed.
# Assumes every real key lies in [-1, 1] (NPMI range), so -2 compares below
# all of them and placeholders are the first entries to be evicted -- TODO confirm.
closest_minus_1, closest_0, closest_1 = (
    [(-2, None) for _ in range(10)] for _ in range(3)
)
# Stream the NPMI matrix one stored part at a time and feed every
# off-diagonal entry to the three bounded heaps.  heappushpop pushes the new
# item and then evicts the smallest one, so each heap keeps its size and ends
# up holding the largest keys seen so far:
#   closest_minus_1 -> key -npmi       (most negative NPMI)
#   closest_0       -> key -abs(npmi)  (NPMI nearest to zero)
#   closest_1       -> key  npmi       (most positive NPMI)
# NOTE(review): if the stored matrix is symmetric, every pair is visited twice,
# as (a, b) and (b, a), and may occupy two heap slots -- confirm, and consider
# restricting the scan to term2 > term1.
# NOTE(review): a NaN entry would make the heap comparisons unreliable --
# confirm the stored parts contain none.
for left in range(0, voc_size, NPMI_PART_SIZE):
    # `left` is used below as the global row index of the part's first row.
    npmi_part_path = get_npmi_part_path(left)
    npmi_submatrix = np.load(npmi_part_path)
    for t1 in range(npmi_submatrix.shape[0]):
        # Row-level values are computed once per row instead of once per
        # column: the global row index, its vocabulary entry and the row itself.
        term1 = left + t1
        word1 = vocabulary[term1]
        npmi_row = npmi_submatrix[t1]
        for term2 in range(npmi_submatrix.shape[1]):
            # Don't consider equal terms
            if term1 == term2:
                continue
            npmi = npmi_row[term2]
            pair = (word1, vocabulary[term2])
            heappushpop(closest_minus_1, (-npmi, pair))
            heappushpop(closest_0, (-abs(npmi), pair))
            heappushpop(closest_1, (npmi, pair))
In [20]:
# Keys were stored negated, so flip the sign back to show the real NPMI.
# A heap list is only partially ordered, so sort it for display: the most
# negative NPMI comes first.  Sorting on the score alone avoids comparing the
# pair payloads (which may still be the None placeholder).
sorted(
    ((-key, pair) for key, pair in closest_minus_1),
    key=lambda entry: entry[0],
)
Out[20]:
In [17]:
# Entries are (-abs(npmi), pair).  A heap list is only partially ordered, so
# sort it for display: the largest key, i.e. the NPMI nearest to zero, comes
# first.  Sorting on the key alone avoids comparing the pair payloads.
sorted(closest_0, key=lambda entry: entry[0], reverse=True)
Out[17]:
In [18]:
# Entries are (npmi, pair).  A heap list is only partially ordered, so sort
# it for display: the highest NPMI comes first.  Sorting on the score alone
# avoids comparing the pair payloads.
sorted(closest_1, key=lambda entry: entry[0], reverse=True)
Out[18]: