notebook.community

Edit and run



In [4]:

    
from heapq import heapify, heappushpop

import numpy as np
from tabulate import tabulate

from utils import get_npmi_part_path, PICKLE_PATH, TFIDF_PATH, NPMI_PART_SIZE, VOCABULARY_PATH



In [12]:

    
# Load the vocabulary array; later cells index it by term id to recover the
# term string (presumably a 1-D array of words — TODO confirm against the
# code that writes VOCABULARY_PATH).
vocabulary = np.load(VOCABULARY_PATH)
# Number of entries along the first axis; used as the upper bound when
# stepping through the NPMI matrix parts.
voc_size = vocabulary.shape[0]



In [15]:

    
%%time

closest_minus_1 = [(-2, None)] * 10
closest_0 = [(-2, None)] * 10
closest_1 = [(-2, None)] * 10

for left in range(voc_size)[::NPMI_PART_SIZE]:
    npmi_part_path = get_npmi_part_path(left)
    npmi_submatrix = np.load(npmi_part_path)
    for t1 in range(npmi_submatrix.shape[0]):
        for term2 in range(npmi_submatrix.shape[1]):
            npmi = npmi_submatrix[t1, term2]
            term1 = left + t1
            # Don't consider equal terms
            if term1 == term2:
                continue
            pair = (vocabulary[term1], vocabulary[term2])
            heappushpop(closest_minus_1, (-npmi, pair))
            heappushpop(closest_0, (-abs(npmi), pair))
            heappushpop(closest_1, (npmi, pair))









    



CPU times: user 3h 26min 58s, sys: 56.6 s, total: 3h 27min 55s
Wall time: 3h 27min 18s



In [20]:

    
# The heap stores -npmi as its key; flip the sign back so the NPMI value
# itself is displayed. Entries appear in heap order, not sorted.
[(-negated_npmi, pair) for negated_npmi, pair in closest_minus_1]









    Out[20]:





[(-1.0, ('zzz', 'zy')),
 (-1.0, ('zzz', 'zygmund')),
 (-1.0, ('zzz', 'zygalakis')),
 (-1.0, ('zzz', 'zyy')),
 (-1.0, ('zzz', 'zzajklz')),
 (-1.0, ('zzz', 'zylberberg')),
 (-1.0, ('zzz', 'zyx')),
 (-1.0, ('zzz', 'zzs')),
 (-1.0, ('zzz', 'zz')),
 (-1.0, ('zzz', 'zzt'))]



In [17]:

    
# Raw heap contents: the first element of each tuple is the heap key
# -abs(npmi), so the displayed numbers are negated magnitudes (hence -0.0).
# Entries appear in heap order, not sorted.
closest_0









    Out[17]:





[(-0.0, ('zzt', '3')),
 (-0.0, ('zzt', '4')),
 (-0.0, ('zzt', 'a')),
 (-0.0, ('zzt', 'on')),
 (-0.0, ('zzz', '2')),
 (-0.0, ('zzz', '3')),
 (-0.0, ('zzz', 'on')),
 (-0.0, ('zzz', '1')),
 (-0.0, ('zzz', '4')),
 (-0.0, ('zzz', 'a'))]



In [18]:

    
# Raw heap contents: the first element of each tuple is the NPMI value itself
# (this heap is keyed on npmi directly). Entries appear in heap order, not sorted.
closest_1









    Out[18]:





[(1.0, ('zzz', 'thir')),
 (1.0, ('zzz', 'tsketch')),
 (1.0, ('zzz', 'tpower')),
 (1.0, ('zzz', 'wi21')),
 (1.0, ('zzz', 'varimax')),
 (1.0, ('zzz', 'wi2k')),
 (1.0, ('zzz', 'ujs')),
 (1.0, ('zzz', 'xxk')),
 (1.0, ('zzz', 'xax')),
 (1.0, ('zzz', 'xjaxj'))]