In [1]:
import json
import gensim
import numpy
json_path = '/home/marcel/projects/eco/src/python/nlp/digital_and_internet_theory_v3_vectors.json'
vectors = []
with open(json_path) as json_file:
    print('loading json')
    data = json.load(json_file)
    for line in data:
        sentence = line['sentence']
        point = line['point']  # list of floats: the precomputed sentence vector
        vector = numpy.array(point)
        vectors.append((sentence, vector))
print('done loading json. loading w2v model')
model_path = '/home/marcel/drive/data/eco/word2vec_models/wiki_plus_v3_combined.txt_word2vec.w2vmodel'
#util.enable_verbose_training(sys.argv[0])
try:
    # binary word2vec format; this raises an exception if the model
    # was saved in gensim's native format instead
    model = gensim.models.Word2Vec.load_word2vec_format(model_path, binary=True)
except Exception:
    # fall back to the other method of loading
    model = gensim.models.Word2Vec.load(model_path)
print('done')
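Note that in gensim 1.0 and later the word2vec-format loader moved to gensim.models.KeyedVectors.load_word2vec_format; the try/except above targets the older API.

For reference, the loader above expects a JSON array of records, each carrying a 'sentence' string and a 'point' list of floats. A minimal sketch of writing a compatible file, with placeholder sentences and dimensionality rather than the real data:

In [ ]:
import json
import numpy
dim = 200  # placeholder: use the model's actual vector dimensionality
records = [
    {'sentence': 'an example sentence', 'point': numpy.random.rand(dim).tolist()},
    {'sentence': 'another example sentence', 'point': numpy.random.rand(dim).tolist()},
]
with open('example_vectors.json', 'w') as f:
    json.dump(records, f)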
In [7]:
from numpy import dot
new_sentence = 'my ubuntu computer is really slow'
vector_words = []
words = []
word_count = 0
for word in new_sentence.split():
    try:
        vector_words.append(model[word])
        words.append(word)
        word_count += 1
    except KeyError:
        # skip words missing from the model vocabulary
        pass
print(words)
# sentence embedding: average the word vectors, then normalise to unit length
vector = gensim.matutils.unitvec(numpy.array(vector_words).mean(axis=0))
# quantise the unit vector to integers (4 decimal places of precision)
int_vector = [int(fl * 10000) for fl in vector]
similarities = []
for index, (sentence, point) in enumerate(vectors):
    #if index % 1000 == 0:
    #    print('processing ' + str(index) + '/' + str(len(vectors)))
    dist = dot(point, int_vector)
    similarities.append((dist, sentence))
similarities.sort(key=lambda pair: pair[0], reverse=True)
print('---- Most similar matches by dot product:')
for i in range(5):
    text = similarities[i]
    print(text)
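Because `vector` is unit-normalised and the stored points are assumed to be unit vectors scaled by the same constant factor, sorting by the raw dot product should give the same ranking as cosine similarity. A quick sketch to check that assumption against the loaded data:

In [ ]:
import numpy
def cosine(a, b):
    # plain cosine similarity between two vectors
    a = numpy.asarray(a, dtype=float)
    b = numpy.asarray(b, dtype=float)
    return float(numpy.dot(a, b) / (numpy.linalg.norm(a) * numpy.linalg.norm(b)))
# re-rank by true cosine similarity; the top results should match the list above
resorted = sorted(vectors, key=lambda pair: cosine(pair[1], vector), reverse=True)
for sentence, _ in resorted[:5]:
    print(sentence)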