In [1]:
import sys
# Kludge fix for a MacPorts issue: https://trac.macports.org/ticket/31891
# Reverses the module search path in place, so entries that were last are
# searched first by the gensim import below.
# NOTE: list.reverse() is not idempotent -- re-running this cell flips
# sys.path back to its original order.
sys.path.reverse() 
from gensim import corpora, models, similarities

In [2]:
# Sample set of 'documents' to convert to bag-of-words vectors, following
# http://radimrehurek.com/gensim/tut1.html#from-strings-to-vectors
documents = [
    "Human machine interface for lab abc computer applications",
    "Another Document on machine learning applications",
]

In [3]:
# Common function words that are dropped before building the vocabulary.
stoplist = set('for a of the and on to in'.split())

# Lowercase each document, split it on whitespace, and keep only the words
# that are not in the stoplist; one token list per document.
tokens = []
for document in documents:
    kept = [word for word in document.lower().split() if word not in stoplist]
    tokens.append(kept)

In [4]:
# Display the tokenised documents (one list of kept words per document).
tokens


Out[4]:
[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['another', 'document', 'machine', 'learning', 'applications']]

In [5]:
# Build the vocabulary from the tokenised documents; token2id maps each
# distinct token to the integer id the dictionary assigned to it.
dictionary = corpora.Dictionary(tokens)
# Use the print() call form rather than the Python 2 print statement so this
# cell also runs on Python 3 and matches the print(new_vec) usage later in
# the notebook. With a single argument the output is the same on Python 2.
print(dictionary.token2id)


{'learning': 9, 'abc': 0, 'another': 7, 'lab': 5, 'machine': 6, 'applications': 1, 'computer': 2, 'human': 3, 'interface': 4, 'document': 8}

In [6]:
# Look up the integer id that the dictionary assigned to the token 'another'.
dictionary.token2id['another']


Out[6]:
7

In [7]:
# Convert a new document into a bag-of-words vector using the existing
# dictionary: a list of (token_id, count) pairs.
new_doc = "Human computer human unknown learning"
new_tokens = new_doc.lower().split()
# As the printed result shows, 'unknown' is not in the dictionary and does
# not appear in the vector, while 'human' is counted twice.
new_vec = dictionary.doc2bow(new_tokens)
print(new_vec)


[(2, 1), (3, 2), (9, 1)]

In [ ]: