In [1]:
import json
import gensim
import numpy
json_path = '/home/marcel/projects/eco/src/python/nlp/digital_and_internet_theory_v3_vectors.json'
vectors = []
with open(json_path) as json_file:
    print('loading json')
    data = json.load(json_file)
    for line in data:
        sentence = line['sentence']
        point = line['point']  # list of floats: the precomputed sentence vector
        vector = numpy.array(point)
        vectors.append((sentence, vector))
print('done loading json. loading w2v model')
model_path = '/home/marcel/drive/data/eco/word2vec_models/wiki_plus_v3_combined.txt_word2vec.w2vmodel'
#util.enable_verbose_training(sys.argv[0])
try:
    # binary word2vec format; this raises an exception if the model
    # was saved in gensim's native format instead
    model = gensim.models.Word2Vec.load_word2vec_format(model_path, binary=True)
except Exception:
    # fall back to the other method of loading
    model = gensim.models.Word2Vec.load(model_path)
print('done')
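Note that in gensim 1.0 and later the word2vec-format loader moved to gensim.models.KeyedVectors.load_word2vec_format; the try/except above targets the older API.

For reference, the loader above expects a JSON array of records, each carrying a 'sentence' string and a 'point' list of floats. A minimal sketch of writing a compatible file, with placeholder sentences and dimensionality rather than the real data:

In [ ]:
import json
import numpy
dim = 200  # placeholder: use the model's actual vector dimensionality
records = [
    {'sentence': 'an example sentence', 'point': numpy.random.rand(dim).tolist()},
    {'sentence': 'another example sentence', 'point': numpy.random.rand(dim).tolist()},
]
with open('example_vectors.json', 'w') as f:
    json.dump(records, f)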
In [7]:
from numpy import dot
new_sentence = 'my ubuntu computer is really slow'
vector_words = []
words = []
word_count = 0
for word in new_sentence.split():
    try:
        vector_words.append(model[word])
        words.append(word)
        word_count += 1
    except KeyError:
        # skip words missing from the model vocabulary
        pass
print(words)
# sentence embedding: average the word vectors, then normalise to unit length
vector = gensim.matutils.unitvec(numpy.array(vector_words).mean(axis=0))
# quantise the unit vector to integers (4 decimal places of precision)
int_vector = [int(fl * 10000) for fl in vector]
similarities = []
for index, (sentence, point) in enumerate(vectors):
    #if index % 1000 == 0:
    #    print('processing ' + str(index) + '/' + str(len(vectors)))
    dist = dot(point, int_vector)
    similarities.append((dist, sentence))
similarities.sort(key=lambda pair: pair[0], reverse=True)
print('---- Most similar matches by dot product:')
for i in range(5):
    text = similarities[i]
    print(text)
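Because `vector` is unit-normalised and the stored points are assumed to be unit vectors scaled by the same constant factor, sorting by the raw dot product should give the same ranking as cosine similarity. A quick sketch to check that assumption against the loaded data:

In [ ]:
import numpy
def cosine(a, b):
    # plain cosine similarity between two vectors
    a = numpy.asarray(a, dtype=float)
    b = numpy.asarray(b, dtype=float)
    return float(numpy.dot(a, b) / (numpy.linalg.norm(a) * numpy.linalg.norm(b)))
# re-rank by true cosine similarity; the top results should match the list above
resorted = sorted(vectors, key=lambda pair: cosine(pair[1], vector), reverse=True)
for sentence, _ in resorted[:5]:
    print(sentence)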