In [1]:
from gensim import corpora, models, similarities, utils
import numpy as np
In [2]:
# Load the saved gensim dictionary and the bag-of-words corpus (Matrix Market
# format). Both files are expected in the current working directory.
dictionary = corpora.Dictionary.load('brown.dict')
corpus = corpora.MmCorpus('brown.mm')
# Function-call form of print: valid on Python 3, and prints the same single
# value on Python 2.
print(corpus)
In [3]:
# First, build a 200-dimensional LSI model.
# The dimensionality is named so it is stated once here instead of as a bare
# literal inside the constructor call.
NUM_TOPICS = 200
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=NUM_TOPICS)
接著建立similarity matrix,並存檔後重新載入
In [4]:
# Index every corpus document, projected into LSI space, for similarity
# queries. num_features is read from the model rather than repeated as a
# literal, so the index width always follows the model's configured size.
index = similarities.MatrixSimilarity(lsi[corpus], num_features=lsi.num_topics)
# Persist the index to disk, then load it back to show the round trip; later
# cells use the reloaded object.
index.save('brown.lsi.sim')
index = similarities.MatrixSimilarity.load('brown.lsi.sim')
接受使用者輸入,並轉換為query
In [5]:
# Example user query: lower-case it, split on whitespace, convert the tokens
# to a bag-of-words vector, then project that vector into LSI space.
query = "Kids likes to watch dog"
query_tokens = query.lower().split()
qvec = dictionary.doc2bow(query_tokens)
qlsi = lsi[qvec]
從similarity matrix中尋找與query最接近的vector
In [6]:
# Similarity of the query against every document held in the index.
query_sims = index[qlsi]
query_sims
Out[6]:
In [7]:
def search_query(query):
    """Return the words of the indexed document most similar to ``query``.

    The query is lower-cased, split on whitespace, converted to a
    bag-of-words vector with the module-level ``dictionary``, projected into
    LSI space with ``lsi`` and compared against ``index``. The position of
    the highest similarity is used to look the document up in ``corpus``
    (the index was built from ``lsi[corpus]``, so positions line up).

    Args:
        query: Free-text query string.

    Returns:
        A list with the dictionary word for each term id stored in the
        best-matching document, or an empty list when none of the query
        tokens are present in the dictionary.
    """
    qvec = dictionary.doc2bow(query.lower().split())
    if not qvec:
        # No in-vocabulary tokens: there is nothing to compare, so report
        # "no match" instead of pushing an empty vector through the index.
        return []
    qlsi = lsi[qvec]
    best_doc = np.argmax(index[qlsi])
    # `word_id` rather than `id`, so the builtin is not shadowed.
    return [dictionary[word_id] for word_id, _ in corpus[best_doc]]
In [8]:
# Words of the document that best matches the example query.
best_match_words = search_query(query)
best_match_words
Out[8]:
In [ ]: