In [40]:
import sys
# NOTE(review): this reverses the module search order before gensim is imported,
# presumably so that a different gensim install is picked up first. Confirm it is
# still needed; it affects every later import in this kernel.
sys.path.reverse()
from gensim import corpora, models, similarities
In [41]:
# Peek at the raw dog phrases. The context manager closes the file handle
# right away instead of leaving it open until garbage collection.
with open('dogs.txt') as dogs_file:
    dog_lines = dogs_file.readlines()
# Show only the first few lines so a large file does not flood the cell output.
dog_lines[:10]
Out[41]:
In [42]:
#learn words
# Build the vocabulary from cats.txt. The file is opened in binary mode so each
# line is a byte string that can be decoded; non-ASCII bytes are dropped before
# the line is lower-cased and split on whitespace. The `with` block closes the
# file as soon as the lines are read.
with open('cats.txt', 'rb') as cats_file:
    cat_lines = cats_file.readlines()
dictionary = corpora.Dictionary(
    line.decode('ascii', 'ignore').lower().split() for line in cat_lines
)
In [43]:
# Display the token -> integer id mapping held by the dictionary at this point.
dictionary.token2id
Out[43]:
In [44]:
# Number of distinct tokens currently in the dictionary.
len(dictionary)
Out[44]:
In [45]:
#learn some more words
# Extend the vocabulary with dogs.txt. Lines are decoded with
# decode('ascii', 'ignore') exactly like the cats.txt pass and the corpus cells,
# so the tokens learned here match the tokens later handed to doc2bow.
# NOTE: add_documents mutates `dictionary`; re-running this cell counts the
# dog documents again.
with open('dogs.txt', 'rb') as dogs_file:
    dog_lines = dogs_file.readlines()
dictionary.add_documents(
    line.decode('ascii', 'ignore').lower().split() for line in dog_lines
)
In [46]:
# Display the token -> integer id mapping again, now that more documents were added.
dictionary.token2id
Out[46]:
In [47]:
# Number of distinct tokens in the dictionary after the additional documents.
len(dictionary)
Out[47]:
In [48]:
#make a corpus of dog phrases only
# One bag-of-words vector per line, for at most the first 20000 lines of
# dogs.txt. The file is read as bytes (so .decode() applies) and is closed
# as soon as the corpus has been built.
with open('dogs.txt', 'rb') as dogs_file:
    corpusDog = [
        dictionary.doc2bow(line.decode('ascii', 'ignore').lower().split())
        for line in dogs_file.readlines()[0:20000]
    ]
In [49]:
# Report how many documents the dog corpus holds.
print(len(corpusDog))
# Preview only the first few documents; printing the whole corpus
# (up to 20000 bag-of-words vectors) would flood the notebook output.
print(corpusDog[:5])
In [50]:
# Latent semantic indexing (LSI) model fitted on the dog corpus.
# Background: http://en.wikipedia.org/wiki/Latent_semantic_indexing
lsiDog = models.LsiModel(
    corpus=corpusDog,
    id2word=dictionary,
    num_topics=1000,
)
# Ask for up to 1000 topics, i.e. as many as the model was requested to build.
lsiDog.print_topics(1000)
Out[50]:
In [51]:
# Same steps for the cat phrases: build a bag-of-words corpus from at most
# the first 20000 lines of cats.txt, then fit an LSI model on it. The file is
# read as bytes (so .decode() applies) and closed once the corpus is built.
with open('cats.txt', 'rb') as cats_file:
    corpusCat = [
        dictionary.doc2bow(line.decode('ascii', 'ignore').lower().split())
        for line in cats_file.readlines()[0:20000]
    ]
lsiCat = models.LsiModel(corpusCat, id2word=dictionary, num_topics=1000)
lsiCat.print_topics(1000)
Out[51]:
In [52]:
# Query phrase that is projected into each model below.
aPhrase = 'let me be the person my dog thinks I am'
In [53]:
# Lower-case and whitespace-split the query, then map the tokens to
# bag-of-words ids using the shared dictionary.
phraseTokens = aPhrase.lower().split()
vecPhrase = dictionary.doc2bow(phraseTokens)
# Convert the query to the dog model's LSI space.
veclsiDog = lsiDog[vecPhrase]
In [54]:
from operator import itemgetter

# Order the query's dog-LSI entries by their second element, largest first.
dogRanked = list(veclsiDog)
dogRanked.sort(key=itemgetter(1), reverse=True)
dogRanked
Out[54]:
In [55]:
# Convert the same query to the cat model's LSI space.
veclsiCat = lsiCat[vecPhrase]
# Order the entries by their second element, largest first.
catRanked = list(veclsiCat)
catRanked.sort(key=itemgetter(1), reverse=True)
catRanked
Out[55]:
In [76]:
# Inverse-frequency (TF-IDF) model on the cat corpus. It increases the value
# of rare words, which was judged a bad choice here.
model = models.tfidfmodel.TfidfModel(
    corpusCat,
    id2word=dictionary,
    normalize=True,
)
vectfid = model[vecPhrase]
# Entries ordered by their second element, largest first.
tfidfCatRanked = sorted(vectfid, key=itemgetter(1), reverse=True)
print(tfidfCatRanked)
# Look up the entry stored under id 26 in the model's id2word mapping.
print(model.id2word[26])
In [65]:
# TF-IDF model fitted on the dog corpus, applied to the same query.
model = models.tfidfmodel.TfidfModel(
    corpusDog,
    id2word=dictionary,
    normalize=True,
)
vectfid = model[vecPhrase]
# Entries ordered by their second element, largest first.
tfidfDogRanked = list(vectfid)
tfidfDogRanked.sort(key=itemgetter(1), reverse=True)
tfidfDogRanked
Out[65]:
In [82]:
#random projections
import numpy

# RpModel draws its projection matrix from NumPy's global random generator.
# Seed it once so that Restart & Run All reproduces the printed values.
# (It is seeded once, not per model, so the two models keep distinct matrices.)
RP_SEED = 42
numpy.random.seed(RP_SEED)

# Same steps for the cat corpus and then the dog corpus. `model` ends up bound
# to the dog model, as it was when the two copies were written out by hand.
for rpCorpus in (corpusCat, corpusDog):
    model = models.RpModel(rpCorpus, id2word=dictionary)
    # Sort by the second element, largest first, and print the last
    # (i.e. smallest) entry.
    print(sorted(model[vecPhrase], key=itemgetter(1), reverse=True)[-1])
    print(model)
In [58]: