In [8]:
import os
import smart_open
import gensim

In [13]:
"""
Load train data and build train_corpus for Doc2Vec

"""
path = 'data'
train_file = path + os.sep + 'ingredient2vec'

def read_corpus(fname, tokens_only=False):
    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            if tokens_only:
                yield gensim.utils.simple_preprocess(line)
            else:
                # For training data, add tags
                line_split = line.split(' ')
                ingredient = line_split[0]
                compounds = ' '.join(line_split[1:])
                yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(compounds), [ingredient])

# Corpus tag to index
def tag_to_index(tags, corpus):
    for doc_id in range(len(corpus)):
        if tags == corpus[doc_id].tags[0]:
            return doc_id
        else:
            continue
    return

# Corpus index to tag                    
def index_to_tag(index, corpus):
    return corpus[index].tags
    


corpus = list(read_corpus(train_file))
corpus_th10 = []

# threshold
for doc_id in range(len(train_corpus)):
    if len(corpus[doc_id].words) > 10:
        corpus_th10.append(corpus[doc_id])

print "Total length of corpus:", len(corpus)
print "Total length of corpus_th10:", len(corpus_th10)


Total length of corpus: 1525
Total length of corpus_th10: 514

In [27]:
from gensim import corpora

documents = ["Human machine interface for lab abc computer applications",
             "A survey of user opinion of computer system response time",
             "The EPS user interface management system",
             "System and human system engineering testing of EPS",              
             "Relation of user perceived response time to error measurement",
             "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
             "Graph minors A survey"]

texts = [[word for word in document.lower().split()]
         for document in documents]

for text in texts:
    print text


dictionary = gensim.corpora.SvmLightCorpus(texts)
print dictionary[0]
#new_doc = "Human computer interaction"
#new_vec = dictionary.doc2bow(new_doc.lower().split())
#print new_vec


['human', 'machine', 'interface', 'for', 'lab', 'abc', 'computer', 'applications']
['a', 'survey', 'of', 'user', 'opinion', 'of', 'computer', 'system', 'response', 'time']
['the', 'eps', 'user', 'interface', 'management', 'system']
['system', 'and', 'human', 'system', 'engineering', 'testing', 'of', 'eps']
['relation', 'of', 'user', 'perceived', 'response', 'time', 'to', 'error', 'measurement']
['the', 'generation', 'of', 'random', 'binary', 'unordered', 'trees']
['the', 'intersection', 'graph', 'of', 'paths', 'in', 'trees']
['graph', 'minors', 'iv', 'widths', 'of', 'trees', 'and', 'well', 'quasi', 'ordering']
['graph', 'minors', 'a', 'survey']
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-27-9f9d3a2041cb> in <module>()
     19 
     20 dictionary = gensim.corpora.SvmLightCorpus(texts)
---> 21 print dictionary[0]
     22 #new_doc = "Human computer interaction"
     23 #new_vec = dictionary.doc2bow(new_doc.lower().split())

/usr/local/lib/python2.7/dist-packages/gensim/corpora/indexedcorpus.pyc in __getitem__(self, docno)
    121     def __getitem__(self, docno):
    122         if self.index is None:
--> 123             raise RuntimeError("cannot call corpus[docid] without an index")
    124 
    125         if isinstance(docno, (slice, list, numpy.ndarray)):

RuntimeError: cannot call corpus[docid] without an index

In [ ]: