In [1]:
import sys
# Kludge for a MacPorts issue (https://trac.macports.org/ticket/31891):
# reverse the module search path before gensim is imported.
# NOTE(review): presumably this lets a different installed copy of a
# dependency win the lookup -- confirm against the ticket.
# Gotcha: list.reverse() works in place, so re-running this cell in the same
# kernel flips sys.path back to its original order.
sys.path.reverse()
from gensim import corpora, models, similarities
In [2]:
# Sample 'documents' to convert to bag-of-words vectors, following the gensim
# tutorial: http://radimrehurek.com/gensim/tut1.html#from-strings-to-vectors
documents = [
    "Human machine interface for lab abc computer applications",
    "Another Document on machine learning applications",
]
In [3]:
# Common words to drop before building the vocabulary.
stoplist = set('for a of the and on to in'.split())

# Lowercase each document, split it on whitespace, and keep only the words
# that are not stopwords. The result holds one token list per document, in
# the same order as `documents`.
tokens = []
for document in documents:
    kept_words = []
    for word in document.lower().split():
        if word not in stoplist:
            kept_words.append(word)
    tokens.append(kept_words)
In [4]:
# Display the tokenized documents: one list of lowercased, non-stopword
# tokens per entry in `documents`.
tokens
Out[4]:
In [5]:
# Build the vocabulary from the tokenized documents. Per the gensim docs,
# Dictionary maps each distinct token to an integer id (exposed as token2id).
dictionary = corpora.Dictionary(tokens)
# Use the call form of print: the bare `print x` statement is a SyntaxError on
# Python 3, while print(x) with a single argument prints the same thing on
# Python 2. This also matches the print(...) call used later in the notebook.
print(dictionary.token2id)
In [6]:
# Look up the id stored for the token 'another' in the token -> id mapping.
# The key is lowercase because every document was lowercased before
# tokenization ("Another Document ..." contributes 'another').
dictionary.token2id['another']
Out[6]:
In [7]:
# Convert a previously unseen document into a vector using the existing
# dictionary.
new_doc = "Human computer human unknown learning"

# Normalize the same way as the original documents (lowercase + whitespace
# split). Note that the stoplist is not applied at this step.
new_doc_tokens = new_doc.lower().split()

# Per the gensim docs, doc2bow returns (token_id, count) pairs -- TODO confirm
# how tokens missing from the dictionary (e.g. 'unknown') are handled.
new_vec = dictionary.doc2bow(new_doc_tokens)
print(new_vec)
In [ ]: