In [ ]:
# Smoke-test the tweet tokenizer on a sample containing a user mention,
# emoticons and concatenated hashtags.
from tweetokenize import Tokenizer
import string
gettokens = Tokenizer()
# `result` holds the token list produced for the sample tweet.
result = gettokens.tokenize('@raulfreiresi hey play :) #teste#loooll :D')
In [ ]:
import codecs
import numpy as np
open_file = codecs.open('Baseline/en-balanced/baseline_nohashtag/Baseline-total-balanced-nohashtag')
header = open_file.next()
data = []
for row in open_file:
a, b, c, d, e = row.split('{')
#data.append(row.split('{'))
data = np.array(data)
print len(data)
In [ ]:
from gensim.models.doc2vec import LabeledSentence
from gensim.models.doc2vec import Doc2Vec
from tweetokenize import Tokenizer
import unicodedata
class LabeledLineSentence(object):
    """Stream gensim LabeledSentence objects from a '{'-delimited dataset file.

    Each line has the layout: class { _ { _ { tweet { _ . While iterating,
    the class label of sentence `uid` is remembered in
    self.data['SENT_<uid>'] so it can be retrieved after training.
    """

    def __init__(self, filename):
        # Path to the dataset file; re-opened on every iteration pass.
        self.filename = filename
        # Maps 'SENT_<uid>' -> class label, filled lazily by __iter__.
        self.data = {}

    def __iter__(self):
        # Build the tokenizer once per pass instead of once per line
        # (the original recreated it on every iteration).
        gettokens = Tokenizer()
        for uid, line in enumerate(open(self.filename)):
            klass, _, _, twit, _ = line.split('{')
            self.data['SENT_%s' % uid] = klass
            # BUG FIX: unicodedata.normalize() requires a unicode string,
            # but the original passed it the token *list* returned by
            # tokenize(), which raises TypeError. Normalize and ASCII-fold
            # the raw tweet text first, then tokenize, so `words` is the
            # token list LabeledSentence expects.
            # NOTE(review): assumes the tweetokenize Tokenizer accepts the
            # ASCII-encoded str produced here — confirm against its docs.
            normalized = unicodedata.normalize(
                'NFKD', twit.decode('utf8').lower()).encode('ASCII', 'ignore')
            yield LabeledSentence(words=gettokens.tokenize(normalized),
                                  labels=['SENT_%s' % uid])

    def get_data(self):
        # Accessor for the uid -> class mapping gathered during iteration.
        return self.data
In [ ]:
# Train a Doc2Vec model over the labeled tweets, manually annealing the
# learning rate: after each pass, alpha is lowered by a fixed step and
# min_alpha is pinned to it so gensim does not decay it further within
# the pass.
model = Doc2Vec(alpha=0.025, min_alpha=0.025, hs=1, min_count=0, workers=4, iter=100)
sentences = LabeledLineSentence('Baseline/en-balanced/baseline_nohashtag/Baseline-total-balanced-nohashtag')
model.build_vocab(sentences)
model.train_words = True
model.train_labels = True

n_epochs = 10
alpha_step = 0.002
for _ in range(n_epochs):
    model.train(sentences)
    model.alpha -= alpha_step
    model.min_alpha = model.alpha
In [ ]:
# Exploratory checks kept for reference (uncomment as needed):
#   model.most_similar('internet')
#   model.save('Baseline/en-balanced/baseline_nohashtag/my_model.doc2vec')
#   print model["SENT_0"]
#   print sentences.data.get("SENT_0")
# Number of labeled sentences gathered while iterating the corpus.
len(sentences.data)
In [ ]:
'''
Collect each learned sentence vector and its class label from the model,
index-aligned via the SENT_<i> tags assigned during corpus iteration.
'''
sent_ids = ['SENT_%s' % i for i in range(len(sentences.data))]
vectors = [model[sid] for sid in sent_ids]
klasses = [sentences.data.get(sid) for sid in sent_ids]
In [ ]:
# Materialize the vector list as a NumPy array (cell output only — the
# result is not assigned, so this just displays it for inspection).
np.array(vectors)
In [ ]:
'''
Cria classificadores usando SVM, Gaussian Naive Bayes, kNN, Regression Logistic, Bagging, Decision Tree(CART)
'''
from sklearn import cross_validation
from sklearn.metrics import matthews_corrcoef, make_scorer
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
classifiers = {'SVM' : svm.SVC(kernel = 'linear', C=3),
'Logistic' :LogisticRegression(),
'GaussianNB' :GaussianNB(),
'BernoulliNB' :BernoulliNB(),
'Bagging' :BaggingClassifier(),
'1-NN' : KNeighborsClassifier(n_neighbors=1),
'3-NN' : KNeighborsClassifier(n_neighbors=3),
'5-NN' : KNeighborsClassifier(n_neighbors=5),
#'RF' : RandomForestClassifier(max_depth=10),
'CART' : DecisionTreeClassifier(max_depth=5)}
for name, clf in classifiers.items():
scores = cross_validation.cross_val_score(clf, vectors, klasses, cv=5, scoring=make_scorer(matthews_corrcoef))
print name, " MCC: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2)
In [ ]: