In [ ]:
'''
Classe que lê as mensagens e retorna um objeto da classe LabeledSentence
usa o tokenizador do NLTK
'''


from gensim.models.doc2vec import LabeledSentence
from nltk.tokenize import word_tokenize
import codecs

class LabeledLineSentence(object):
    """Stream a labelled-message file as ``LabeledSentence`` objects.

    Every line after the first is split on its first ``{``: the text before
    it is stored as the message class, and the text after it is evaluated as
    a Python expression that supplies the token list. The first line
    (index 0) is skipped -- presumably a header row.
    """

    def __init__(self, filename):
        """Store the path of the file to read.

        :param filename: path of the UTF-8 encoded input file.
        """
        self.filename = filename
        # Maps 'SENT_<line index>' -> text found before the first '{'.
        # Filled (and refreshed) as a side effect of iterating.
        self.data = {}

    def __iter__(self):
        """Yield one ``LabeledSentence`` per data line.

        Each sentence is labelled ``SENT_<line index>`` and the same label is
        recorded in ``self.data`` together with the line's class text.

        :raises IndexError: if a data line contains no ``{``.
        """
        # The context manager closes the handle once iteration finishes or
        # the generator is closed, instead of leaking one handle per pass.
        with codecs.open(self.filename, 'r', 'utf-8') as handle:
            for uid, line in enumerate(handle):
                if uid == 0:
                    continue
                # Split on the first '{' only, so a token that itself
                # contains '{' does not truncate the token list.
                elem = line.split('{', 1)
                label = 'SENT_%s' % uid
                self.data[label] = elem[0]
                # SECURITY: eval() executes whatever expression the file
                # contains. Only feed this class trusted files; if the payload
                # is always a plain list literal, ast.literal_eval is safer.
                yield LabeledSentence(words=eval(elem[1]), labels=[label])

    def get_data(self):
        """Return the ``{label: class text}`` mapping collected so far."""
        return self.data

In [ ]:
'''
Cria um modelo usando o Doc2Vec
'''



from gensim.models.doc2vec import Doc2Vec

# Lazily stream the labelled messages from the corpus file
sentences = LabeledLineSentence('SMSSpamAnalytics')

# Train the document-embedding model directly on the streamed sentences.
# NOTE(review): min_count=0 presumably keeps every token regardless of
# frequency, and iter=100 presumably sets the number of training passes --
# confirm against the installed gensim version.
model = Doc2Vec(
    sentences,
    min_count=0,
    workers=2,
    iter=100,
)

In [ ]:
'''
Pega os vetores do modelo e armazena
'''

# Parallel lists: vectors[n] is the model vector for the message whose
# class text is klasses[n]
vectors, klasses = [], []

# The label -> class mapping is filled while the sentences are iterated,
# so it is only populated after the model has consumed the corpus
for key, klass in sentences.get_data().items():
    vectors.append(model[key])
    klasses.append(klass)

In [ ]:
# Inspect the model entry stored under one specific message label.
# The parenthesised form prints the same text under Python 2 and Python 3.
print(model['SENT_2909'])

In [ ]:
# Preview the label format used for the messages.
# Note: the reader skips line 0, so 'SENT_0' is never one of its labels.
for i in range(5):
    key = 'SENT_%s' % i
    # Call form prints identically under Python 2 and Python 3
    print(key)

In [ ]:
# Ask the trained model for the entries most similar to "lar"
# (presumably a token that occurs in the corpus -- verify against the data)
model.most_similar("lar")

In [ ]:
'''
Cria classificadores usando SVM, Gaussian Naive Bayes e kNN 
'''

from sklearn import cross_validation
from sklearn.metrics import matthews_corrcoef, make_scorer
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier



# Candidate classifiers, keyed by the name shown in the report
classifiers = {
    'SVM': svm.SVC(kernel='linear', C=3),
    'GaussianNB': GaussianNB(),
    'kNN': KNeighborsClassifier(),
}

# Score every classifier with 5-fold cross-validation, using the Matthews
# correlation coefficient as the metric
for name, clf in classifiers.items():
    scores = cross_validation.cross_val_score(
        clf, vectors, klasses, cv=5,
        scoring=make_scorer(matthews_corrcoef))
    # One pre-formatted string (name, two spaces, metrics) reproduces the
    # output of the former `print name, "..."` statement and also runs
    # unchanged under Python 3. Spread is reported as two standard deviations.
    print("%s  MCC: %0.3f (+/- %0.3f)" % (name, scores.mean(), scores.std() * 2))

In [ ]:
# Display the first collected vector as a quick sanity check
vectors[0]

In [ ]: