In [1]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

from gensim.models.doc2vec import TaggedDocument
from gensim.models.doc2vec import Doc2Vec
from tweetokenize import Tokenizer

class LabeledDocumentSentence(object):
    """Streams a file line by line, yielding one TaggedDocument per line."""
    def __init__(self, filename):
        self.filename = filename
        self.data = {}
        self.tokenizer = Tokenizer()

    def __iter__(self):
        for uid, line in enumerate(open(self.filename)):
            sent = self.tokenizer.tokenize(line.decode('latin-1'))
            klass = 'positive'  # every line of rt-polarity.pos is a positive review
            self.data['SENT_%s' % uid] = klass
            yield TaggedDocument(sent, ['SENT_%s' % uid])

    def get_data(self):
        return self.data
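Iterating the stream once shows what each yielded item looks like; the quick check below is illustrative only and was not part of the original run.

In [ ]:
sentences = LabeledDocumentSentence('rt-polaritydata/rt-polaritydata/rt-polarity.pos')
first = next(iter(sentences))      # TaggedDocument is a namedtuple with fields 'words' and 'tags'
print first.words[:5], first.tags  # e.g. the first few tokens of line 0 and the tag ['SENT_0']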
In [ ]:
model = Doc2Vec(alpha=0.025, min_alpha=0.025, hs=1, min_count=0, workers=4, iter=100)
sentences = LabeledDocumentSentence('rt-polaritydata/rt-polaritydata/rt-polarity.pos')
model.build_vocab(sentences)
model.train_words = True
model.train_labels = True

# Run several passes over the corpus, decaying the learning rate manually between them
# (alpha == min_alpha keeps the rate fixed within each call to train()).
for epoch in range(10):
    model.train(sentences)
    model.alpha -= 0.002
    model.min_alpha = model.alpha
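Once the passes are finished, a vector for an unseen sentence can be inferred with the trained model; a minimal sketch, assuming a gensim version that provides Doc2Vec.infer_vector (the review snippet is made up for illustration).

In [ ]:
gettokens = Tokenizer()
new_sent = gettokens.tokenize(u'an enjoyable and well acted film')  # hypothetical review snippet
new_vec = model.infer_vector(new_sent)  # same dimensionality as the vectors learned in training
print new_vec[:5]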
In [ ]:
'''
Collect the learned sentence vectors from the model along with their class labels.
'''
vectors = []
klasses = []
for i in range(len(sentences.data)):
    vectors.append(model['SENT_%s' % i])                 # vector learned for the i-th sentence tag
    klasses.append(sentences.data.get('SENT_%s' % i))    # its class label ('positive' here)
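The collected vectors and labels are in the shape a downstream classifier expects. A minimal sketch with scikit-learn's LogisticRegression follows; scikit-learn is an assumption here (it is not imported anywhere in this notebook), and since only the positive file was loaded, klasses holds a single class, so a real run would also need negative examples.

In [ ]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Illustrative only: a classifier needs at least two classes, so this assumes
# vectors/klasses also include examples streamed from a negative-review file.
X_train, X_test, y_train, y_test = train_test_split(vectors, klasses, test_size=0.2)
clf = LogisticRegression()
clf.fit(X_train, y_train)
print clf.score(X_test, y_test)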
In [ ]:
sources = 'rt-polaritydata/rt-polaritydata/rt-polarity.pos'
documents = LabeledDocumentSentence(sources)  # tags each line with its line number as 'SENT_<n>'
model = Doc2Vec(dm=0,           # dm=0 selects the DBOW architecture
                size=400,
                window=8,
                min_count=10,
                dbow_words=1)   # also train word vectors alongside the document vectors
model.build_vocab(documents)
#model.train(documents)
for epoch in range(10):
    print epoch
    model.train(documents)
    # model.alpha -= 0.002
    # model.min_alpha = model.alpha
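After training, the DBOW model holds one vector per document tag and, because dbow_words=1, word vectors as well. The quick look below assumes a gensim version that exposes the docvecs attribute; 'movie' is just an example query word.

In [ ]:
print model.docvecs['SENT_0'][:5]                       # learned vector for the first sentence
print model.docvecs.most_similar(positive=['SENT_0'])   # sentences closest to it in doc-vector space
print model.most_similar('movie')                       # word vectors trained alongside (dbow_words=1)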
In [ ]:
print list(documents)[0:10]  # inspect the first ten TaggedDocuments produced by the iterator