In [8]:
import os
import smart_open
import gensim
In [13]:
"""
Load the training data and build the corpus for Doc2Vec
"""
path = 'data'
train_file = path + os.sep + 'ingredient2vec'
def read_corpus(fname, tokens_only=False):
    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            if tokens_only:
                yield gensim.utils.simple_preprocess(line)
            else:
                # For training data, tag each document with its ingredient name:
                # the first token on the line is the ingredient, the rest are its compounds
                line_split = line.split(' ')
                ingredient = line_split[0]
                compounds = ' '.join(line_split[1:])
                yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(compounds), [ingredient])
# Corpus tag to index
def tag_to_index(tags, corpus):
    for doc_id in range(len(corpus)):
        if tags == corpus[doc_id].tags[0]:
            return doc_id
    return None
# Corpus index to tag
def index_to_tag(index, corpus):
    return corpus[index].tags
corpus = list(read_corpus(train_file))

# Threshold: keep only documents with more than 10 compound tokens
corpus_th10 = []
for doc_id in range(len(corpus)):
    if len(corpus[doc_id].words) > 10:
        corpus_th10.append(corpus[doc_id])

print "Total length of corpus:", len(corpus)
print "Total length of corpus_th10:", len(corpus_th10)
In [27]:
from gensim import corpora
documents = ["Human machine interface for lab abc computer applications",
"A survey of user opinion of computer system response time",
"The EPS user interface management system",
"System and human system engineering testing of EPS",
"Relation of user perceived response time to error measurement",
"The generation of random binary unordered trees",
"The intersection graph of paths in trees",
"Graph minors IV Widths of trees and well quasi ordering",
"Graph minors A survey"]
texts = [[word for word in document.lower().split()]
         for document in documents]
for text in texts:
    print text
dictionary = corpora.Dictionary(texts)
print dictionary[0]
#new_doc = "Human computer interaction"
#new_vec = dictionary.doc2bow(new_doc.lower().split())
#print new_vec
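To exercise the commented-out lines above, here is a small hedged sketch (standard Dictionary.doc2bow usage, not from the original cell) that converts a new document and the full texts into bag-of-words vectors; bow_corpus is an illustrative name.
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print new_vec
# Bag-of-words representation of every tokenized document
bow_corpus = [dictionary.doc2bow(text) for text in texts]
print bow_corpus[0]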
In [ ]: