In [1]:
    
from gensim import corpora, models, similarities, utils
import numpy as np
    
In [2]:
    
# Load the saved token<->id dictionary and the bag-of-words corpus from the
# files in the working directory.
dictionary = corpora.Dictionary.load('brown.dict')
corpus = corpora.MmCorpus('brown.mm')
# print() with a single argument behaves identically on Python 2 and 3,
# whereas the bare `print corpus` statement is a SyntaxError on Python 3.
print(corpus)
    
    
In [3]:
    
# First, build a 200-dimensional LSI model over the corpus.
lsi = models.LsiModel(
    corpus=corpus,
    num_topics=200,
    id2word=dictionary,
)
    
先建立similarity matrix
In [4]:
    
# Index every corpus document in LSI space. The feature count is read from
# the model instead of repeating the literal 200, so the index cannot drift
# out of sync if the model's number of topics is changed.
index = similarities.MatrixSimilarity(lsi[corpus], num_features=lsi.num_topics)
# Write the index to disk, then read it back from that same file.
index.save('brown.lsi.sim')
index = similarities.MatrixSimilarity.load('brown.lsi.sim')
    
接受使用者輸入,並轉換為query
In [5]:
    
# Example query text (hard-coded in this cell).
query = "Kids likes to watch dog"
# Lower-case the query, split it on whitespace, and convert the tokens to a
# bag-of-words vector using the dictionary.
qvec = dictionary.doc2bow(query.lower().split())
# Project the bag-of-words vector through the LSI model.
qlsi = lsi[qvec]
    
從similarity matrix中尋找與query最接近的vector
In [6]:
    
# Look up the LSI-space query vector in the similarity index; as the last
# expression in the cell, the result is displayed as the cell output.
index[qlsi]
    
    Out[6]:
In [7]:
    
def search_query(query):
    """Return the words of the corpus document most similar to ``query``.

    The query is lower-cased and split on whitespace, converted to a
    bag-of-words vector with the module-level ``dictionary``, projected
    through the module-level ``lsi`` model and looked up in the module-level
    similarity ``index``. The document at the position of the highest score
    is then read back from ``corpus``.

    Parameters
    ----------
    query : str
        Free-text query.

    Returns
    -------
    list
        The dictionary tokens of the best-matching document, or an empty
        list when none of the query words are present in the dictionary.
    """
    qvec = dictionary.doc2bow(query.lower().split())
    if not qvec:
        # No query word is in the dictionary, so there is nothing to compare
        # against; report "no match" instead of returning whichever document
        # argmax would pick for an empty query vector.
        return []
    qlsi = lsi[qvec]
    # int() yields a plain Python integer for indexing into the corpus.
    best = int(np.argmax(index[qlsi]))
    # `word_id` rather than `id`, to avoid shadowing the builtin.
    return [dictionary[word_id] for word_id, _ in corpus[best]]
    
In [8]:
    
# Run the search helper on the example query; the returned word list is
# displayed as the cell output.
search_query(query)
    
    Out[8]:
In [ ]: