In [4]:
from gensim import corpora, models
# inp = sc.textFile('./data/new_parsed_no_spam.txt').map(lambda row: row.split(" "))
# word2vec = Word2Vec()
# model = word2vec.fit(inp)
class MySentences(object):
    """Re-iterable stream of tokenised sentences read from a text file.

    Each line of the file is treated as one sentence and split on
    whitespace. The file is re-opened on every iteration, so the same
    instance can be walked several times without holding the corpus in
    memory.
    """

    def __init__(self, fname):
        """Store the corpus path; nothing is opened until iteration.

        fname -- path to a text file with one sentence per line.
        """
        self.fname = fname

    def __iter__(self):
        """Yield every line of the file as a list of whitespace-separated tokens."""
        # The context manager closes the handle once the pass is finished
        # (or the generator is discarded), so repeated passes do not
        # accumulate open file descriptors.
        with open(self.fname) as corpus:
            for line in corpus:
                yield line.split()
# Stream the corpus through MySentences instead of loading it into a list,
# then pass that iterator to the gensim Word2Vec constructor.
sentences = MySentences('./data/new_parsed_no_spam.txt') # a memory-friendly iterator
model = models.Word2Vec(sentences)
In [5]:
# Query the word vectors with two positive terms and one negative term,
# keep only the single best hit (topn=1) and unpack it into word / score.
word, score = model.most_similar(
    positive=['台北', '好玩'],
    negative=['高雄'],
    topn=1,
)[0]
print("%s %s" % (word, score))
In [6]:
from gensim.models.doc2vec import LabeledSentence
class LabeledLineSentence(object):
    """Re-iterable stream of LabeledSentence objects, one per line of a file.

    Every line is split on whitespace to form the sentence's words and is
    given a single label of the form 'SENT_<n>', where <n> is the
    zero-based line number.
    """

    def __init__(self, filename):
        """Store the corpus path; nothing is opened until iteration.

        filename -- path to a text file with one document per line.
        """
        self.filename = filename

    def __iter__(self):
        """Yield a LabeledSentence for each line of the file."""
        # The context manager closes the handle when the pass ends, so
        # repeated passes do not leak file descriptors.
        with open(self.filename) as corpus:
            for uid, line in enumerate(corpus):
                yield LabeledSentence(words=line.split(), labels=['SENT_%s' % uid])
# Wrap the same corpus file so that each line becomes one labelled document,
# then pass the iterator to the gensim Doc2Vec constructor.
# NOTE(review): size / window / min_count / workers are forwarded unchanged;
# presumably vector dimensionality, context window, minimum word frequency
# and worker-thread count -- confirm against the installed gensim version.
labeledLineSentence = LabeledLineSentence('./data/new_parsed_no_spam.txt') # a memory-friendly iterator
docmodel = models.Doc2Vec(labeledLineSentence, size=100, window=8, min_count=5, workers=4)
In [17]:
# Write the Doc2Vec model to disk under ./data.
docmodel.save('./data/my_model.doc2vec')
In [18]:
# Write the Word2Vec model to disk under ./data.
# NOTE(review): `model` is the Word2Vec instance, yet the file name ends in
# ".doc2vec" -- confirm the path is intentional before relying on it.
model.save('./data/model.doc2vec')
In [24]:
# Look up the nearest neighbours of a single query word and list each
# (word, score) pair on its own line.
slist = model.most_similar("商家")
for word, score in slist:
    print("%s %s" % (word, score))
In [ ]: