In [8]:
import logging
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import os
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
In [4]:
class InputSentences(object):
    """Restartable iterable over the tokenised lines of every file in a directory.

    Each iteration pass walks the regular files directly inside ``dirname``
    and yields one list of whitespace-separated tokens per line. Because the
    files are re-read on every call to ``__iter__``, the object can be
    iterated several times (e.g. once per training pass) without holding
    the corpus in memory.
    """

    def __init__(self, dirname):
        """Remember the directory whose files will be streamed.

        Args:
            dirname: path of the directory containing the text files.
        """
        self.dirname = dirname

    def __iter__(self):
        """Yield ``line.split()`` for every line of every file in ``dirname``.

        Yields:
            list of str: the tokens of one line; a blank line yields ``[]``.
        """
        # Sort the names: os.listdir returns entries in arbitrary order, which
        # would make the sentence order differ between runs and machines.
        for fname in sorted(os.listdir(self.dirname)):
            path = os.path.join(self.dirname, fname)
            # Skip sub-directories and other non-regular entries; open() would
            # raise on them.
            if not os.path.isfile(path):
                continue
            # The context manager closes each file as soon as it is exhausted
            # (or the generator is discarded) instead of leaking the handle.
            with open(path) as handle:
                for line in handle:
                    yield line.split()
In [9]:
# A deliberately tiny use case: two "sentences" of two tokens each.
sentences = [['Rome', 'Italy'], ['Beijing', 'China']]
# Train word2vec on the two sentences.
# min_count=1 keeps every token (each one occurs only once in this corpus).
# workers=1 together with a fixed seed keeps the trained vectors repeatable
# across "Restart & Run All"; multi-threaded training is not deterministic.
# seed=1 is spelled out rather than left implicit.
model = Word2Vec(sentences, min_count=1, workers=1, seed=1)
In [7]:
# Query the nearest neighbour of 'Rome' through the model's KeyedVectors
# (`model.wv`): calling `most_similar` directly on the Word2Vec object was
# deprecated in gensim 1.0 and removed in gensim 4.0.
model.wv.most_similar(positive=['Rome'], topn=1)
Out[7]: