In [ ]:
'''
Classe que lê as mensagens e retorna um objeto da classe LabeledSentence;
usa o tokenizador do NLTK
'''
from gensim.models.doc2vec import LabeledSentence
from nltk.tokenize import word_tokenize
import codecs
class LabeledLineSentence(object):
    """Stream a corpus file as gensim ``LabeledSentence`` objects.

    The first line of the file is skipped (presumably a header row -- confirm
    against the data file). Every other line is split on ``'{'``: the text
    before the first ``'{'`` is recorded in ``self.data`` under the key
    ``'SENT_<line index>'``, and the second piece is evaluated as a Python
    expression to obtain the ``words`` passed to ``LabeledSentence``.
    """

    def __init__(self, filename):
        """Remember the corpus path and start with an empty label map.

        Args:
            filename: Path of the corpus file; it is read as UTF-8.
        """
        self.filename = filename
        # Maps 'SENT_<n>' -> text found before the first '{' on line n.
        # Only populated while the object is being iterated.
        self.data = {}

    def __iter__(self):
        """Yield one ``LabeledSentence`` per line after the first.

        Each call re-opens the file, so the object can be iterated more than
        once. As a side effect, ``self.data`` is filled in line by line.

        Raises:
            IndexError: If a non-header line contains no ``'{'``.
        """
        # The context manager closes the handle when the generator finishes
        # (or is discarded); the original left every opened file unclosed.
        with codecs.open(self.filename, 'r', 'utf-8') as corpus:
            for uid, line in enumerate(corpus):
                if uid == 0:
                    # The first line is never turned into a sentence.
                    continue
                elem = line.split('{')
                label = 'SENT_%s' % uid
                self.data[label] = elem[0]
                # SECURITY NOTE(review): eval() executes whatever expression
                # the file contains. Only use this with trusted corpus files;
                # consider ast.literal_eval once the exact line format has
                # been confirmed.
                yield LabeledSentence(words=eval(elem[1]), labels=[label])

    def get_data(self):
        """Return the label map built so far (the same dict as ``self.data``)."""
        return self.data
In [ ]:
'''
Cria um modelo usando o Doc2Vec
'''
from gensim.models.doc2vec import Doc2Vec
# Wrap the corpus file in the streaming reader, then train a Doc2Vec model
# directly from it.
sentences = LabeledLineSentence('SMSSpamAnalytics')
model = Doc2Vec(
    sentences,
    min_count=0,
    workers=2,
    iter=100,
)
In [ ]:
'''
Pega os vetores do modelo e armazena
'''
# Build two parallel lists from the labels recorded while the corpus was
# read: the model entry for each label, and the text stored for that label.
label_keys = list(sentences.data)
vectors = [model[label] for label in label_keys]
klasses = [sentences.data[label] for label in label_keys]
In [ ]:
# Spot check: print the model entry stored under one specific label.
# NOTE(review): the label is hard-coded; the lookup fails if the corpus has
# no line with that index.
# The parenthesised single-argument form prints the same thing under
# Python 2 and Python 3.
print(model['SENT_2909'])
In [ ]:
# Print the label keys 'SENT_0' .. 'SENT_4' to show the naming scheme.
# print(...) with a single argument behaves identically on Python 2 and 3.
for i in range(5):
    key = 'SENT_%s' % i
    print(key)
In [ ]:
# Query the trained model for the entries it ranks as most similar to "lar".
# NOTE(review): kept as a bare expression so an interactive session displays
# the result; when run as a plain script the return value is discarded.
model.most_similar("lar")
In [ ]:
'''
Cria classificadores usando SVM, Gaussian Naive Bayes e kNN
'''
from sklearn import cross_validation
from sklearn.metrics import matthews_corrcoef, make_scorer
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
# Candidate classifiers, keyed by the name shown in the report line.
classifiers = {
    'SVM': svm.SVC(kernel='linear', C=3),
    'GaussianNB': GaussianNB(),
    'kNN': KNeighborsClassifier(),
}

# The scorer does not depend on the classifier, so build it once.
mcc_scorer = make_scorer(matthews_corrcoef)

# 5-fold cross-validation of every classifier on the collected vectors,
# scored with the Matthews correlation coefficient.
for name, clf in classifiers.items():
    scores = cross_validation.cross_val_score(
        clf, vectors, klasses, cv=5, scoring=mcc_scorer)
    # One pre-formatted string keeps the output identical on Python 2 and 3.
    # The two spaces after %s reproduce the separator that the old
    # `print name, " MCC: ..."` statement inserted before the leading space.
    print("%s  MCC: %0.3f (+/- %0.3f)"
          % (name, scores.mean(), scores.std() * 2))
In [ ]:
# Inspect the first collected entry of `vectors`.
# NOTE(review): bare expression -- only displayed in an interactive session.
vectors[0]
In [ ]: