In [4]:
from gensim import corpora, models 
# inp = sc.textFile('./data/new_parsed_no_spam.txt').map(lambda row: row.split(" "))
# word2vec = Word2Vec()
# model = word2vec.fit(inp)
class MySentences(object):
    def __init__(self, fname):
        self.fname = fname
    def __iter__(self):
#         for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.fname)):
                yield line.split()
sentences = MySentences('./data/new_parsed_no_spam.txt') # a memory-friendly iterator
model = models.Word2Vec(sentences)

In [5]:
word , score = model.most_similar(positive=['台北', '好玩'], negative=['高雄'], topn=1)[0]
print word ,score


手拿 0.643935918808

In [6]:
from gensim.models.doc2vec import LabeledSentence
class LabeledLineSentence(object):
    def __init__(self, filename):
        self.filename = filename
    def __iter__(self):
        for uid, line in enumerate(open(self.filename)):
            yield LabeledSentence(words=line.split(), ,labels=['SENT_%s' % uid])
            
labeledLineSentence = LabeledLineSentence('./data/new_parsed_no_spam.txt') # a memory-friendly iterator
docmodel = models.Doc2Vec(labeledLineSentence, size=100, window=8, min_count=5, workers=4)


  File "<ipython-input-6-254ecc99c29e>", line 7
    yield LabeledSentence(words=line.split(), ,labels=['SENT_%s' % uid])
                                              ^
SyntaxError: invalid syntax

In [17]:
docmodel.save('./data/my_model.doc2vec')

In [18]:
model.save('./data/model.doc2vec')

In [24]:
slist = model.most_similar("商家")
for word , score in slist :
    print word , score


洗染 0.845958471298
餐飲業 0.839558362961
洗衣 0.831826210022
展店 0.820595145226
knowhow 0.819600582123
管理模式 0.807650923729
扯上 0.804563522339
節日 0.801911354065
不彰 0.795998036861
授課 0.791624903679

In [ ]: