In [38]:
import sys
sys.path.reverse()  # reverse the import search order so the last sys.path entries are tried first
from gensim import corpora, models, similarities
In [51]:
# peek at the raw dog phrases
open('dogs.txt').readlines()
Out[51]:
In [39]:
# learn a vocabulary from the cat phrases
# (Python 2: .decode('ascii', 'ignore') strips any non-ASCII bytes before tokenizing)
dictionary = corpora.Dictionary(line.decode('ascii', 'ignore').lower().split()
                                for line in open('cats.txt').readlines())
In [40]:
dictionary.token2id
Out[40]:
In [42]:
len(dictionary)
Out[42]:
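Every token the Dictionary has seen gets an integer id (the token2id mapping above), and doc2bow turns a tokenized document into sparse (id, count) pairs against that vocabulary. A minimal sketch with a toy phrase, chosen only for illustration (the actual ids depend on cats.txt):
In [ ]:
# sketch: token ids and the bag-of-words encoding for a toy phrase
toy = "the cat sat on the mat".lower().split()
print([dictionary.token2id.get(w) for w in toy])  # per-token ids (None if unseen)
print(dictionary.doc2bow(toy))                    # sparse (token_id, count) pairs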
In [43]:
# learn more words from the dog phrases (same ASCII clean-up as for cats.txt)
dictionary.add_documents(line.decode('ascii', 'ignore').lower().split()
                         for line in open('dogs.txt').readlines())
In [44]:
dictionary.token2id
Out[44]:
In [45]:
len(dictionary)
Out[45]:
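Merging the dog vocabulary before building any corpus matters because doc2bow silently drops tokens the Dictionary has never seen (unless allow_update=True), so dog-only words would otherwise vanish from the vectors. A quick check, using a made-up token purely for illustration:
In [ ]:
# doc2bow ignores out-of-vocabulary tokens unless allow_update=True,
# which is why add_documents must run before the corpora are built
print(dictionary.doc2bow("zzzunseenword".split()))  # expected: []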
In [46]:
# make a bag-of-words corpus from the first 20,000 dog phrases only
corpusDog = [dictionary.doc2bow(line.decode('ascii', 'ignore').lower().split())
             for line in open('dogs.txt').readlines()[:20000]]
In [49]:
print(len(corpusDog))
print(corpusDog)
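Both corpora here fit comfortably in memory, but if they did not, gensim can serialize a bag-of-words corpus to disk and stream it back lazily. A sketch, where the /tmp/dogs.mm path is an assumption and not part of the original run:
In [ ]:
# optional sketch: persist the corpus in Matrix Market format and stream it back
corpora.MmCorpus.serialize('/tmp/dogs.mm', corpusDog)  # illustrative path
corpusDogStreamed = corpora.MmCorpus('/tmp/dogs.mm')
print(corpusDogStreamed)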
In [60]:
# latent semantic indexing (LSI) model: http://en.wikipedia.org/wiki/Latent_semantic_indexing
lsiDog = models.LsiModel(corpusDog, id2word=dictionary, num_topics=1000)
lsiDog.print_topics(1000)
Out[60]:
In [52]:
# same pipeline for the cat phrases: bag-of-words corpus, then an LSI model
corpusCat = [dictionary.doc2bow(line.decode('ascii', 'ignore').lower().split())
             for line in open('cats.txt').readlines()[:20000]]
lsiCat = models.LsiModel(corpusCat, id2word=dictionary, num_topics=1000)
lsiCat.print_topics(1000)
Out[52]:
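print_topics(1000) dumps every topic at once; individual topics can also be pulled out one at a time with print_topic. A sketch, where topic 0 and topn=5 are arbitrary choices for illustration:
In [ ]:
# sketch: inspect one topic per model instead of all 1000
print(lsiDog.print_topic(0, topn=5))  # heaviest words in the first dog topic
print(lsiCat.print_topic(0, topn=5))  # heaviest words in the first cat topic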
In [56]:
aPhrase = "let me be the person my dog thinks I am"
In [57]:
vecPhrase = dictionary.doc2bow(aPhrase.lower().split())  # bag-of-words for the query
veclsiDog = lsiDog[vecPhrase]  # convert the query to LSI space
In [58]:
from operator import itemgetter
# topics that respond most strongly to the query, largest weight first
sorted(veclsiDog, key=itemgetter(1), reverse=True)
Out[58]:
In [59]:
veclsiCat = lsiCat[vecPhrase]  # convert the query to LSI space
sorted(veclsiCat, key=itemgetter(1), reverse=True)
Out[59]:
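The two sorted lists above come from different models, so their topic weights are not directly comparable. One way to decide which corpus the phrase sits closer to is the similarities module imported at the top: index each corpus in its own LSI space and compare the query's best cosine match in each. A sketch under that assumption (indexDog, indexCat and the other names are new, not from the original run):
In [ ]:
# sketch: cosine similarity of the query against every phrase in each corpus
indexDog = similarities.MatrixSimilarity(lsiDog[corpusDog])
indexCat = similarities.MatrixSimilarity(lsiCat[corpusCat])
simsDog = indexDog[veclsiDog]  # similarity to every dog phrase
simsCat = indexCat[veclsiCat]  # similarity to every cat phrase
print(max(simsDog), max(simsCat))  # which corpus has the closest phrase?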
In [ ]: