In [ ]:
from tweetokenize import Tokenizer

# tweetokenize handles Twitter-specific tokens (usernames, hashtags, emoticons)
gettokens = Tokenizer()
result = gettokens.tokenize('@raulfreiresi hey play :) #teste#loooll :D')
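
In [ ]:
'''
Quick sanity check of the tokenizer output; the exact token list depends on
tweetokenize's defaults (e.g. whether usernames and emoticons are replaced or kept).
'''
print result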

In [ ]:
import codecs
import numpy as np

open_file = codecs.open('Baseline/en-balanced/baseline_nohashtag/Baseline-total-balanced-nohashtag')
header = open_file.next()  # skip the header line
data = []

for row in open_file:
    # rows are '{'-delimited: class, two unused fields, tweet text, trailing field;
    # the original loop unpacked the fields but never stored them, so data stayed empty
    data.append(row.split('{'))
data = np.array(data)
print len(data)
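
In [ ]:
'''
Peek at the header and the first parsed row to confirm the '{'-delimited
layout (assumes the five-field format unpacked in the class below).
'''
print header
print data[0]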

In [ ]:
from gensim.models.doc2vec import LabeledSentence
from gensim.models.doc2vec import Doc2Vec
from tweetokenize import Tokenizer
import unicodedata

class LabeledLineSentence(object):
    def __init__(self, filename):
        self.filename = filename
        self.data = {}
        self.tokenizer = Tokenizer()  # build the tokenizer once, not once per line

    def __iter__(self):
        for uid, line in enumerate(open(self.filename)):
            klass, _, _, twit, _ = line.split('{')
            self.data['SENT_%s' % uid] = klass
            # normalize the *string* to ASCII first, then tokenize;
            # unicodedata.normalize cannot be applied to a token list
            text = unicodedata.normalize('NFKD', twit.decode('utf8').lower()).encode('ASCII', 'ignore')
            yield LabeledSentence(words=self.tokenizer.tokenize(text), labels=['SENT_%s' % uid])

    def get_data(self):
        return self.data
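
In [ ]:
'''
Smoke test: pull the first LabeledSentence off the iterator. Illustrative
check only; assumes the old-gensim LabeledSentence exposes .words and .labels.
'''
sentences_check = LabeledLineSentence('Baseline/en-balanced/baseline_nohashtag/Baseline-total-balanced-nohashtag')
first = next(iter(sentences_check))
print first.words[:10], first.labels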

In [ ]:
model = Doc2Vec(alpha=0.025, min_alpha=0.025, hs=1, min_count=0, workers=4, iter=100)
sentences = LabeledLineSentence('Baseline/en-balanced/baseline_nohashtag/Baseline-total-balanced-nohashtag')
model.build_vocab(sentences)
model.train_words = True
model.train_labels = True

# old-gensim idiom: call train() repeatedly while decaying the learning rate
# by hand, pinning min_alpha to alpha so each epoch runs at a fixed rate
for epoch in range(10):
    model.train(sentences)
    model.alpha -= 0.002
    model.min_alpha = model.alpha
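
In [ ]:
'''
Hedged check that the document labels were learned; assumes this old gensim
version keeps labels alongside words in model.vocab, which is also what makes
the model['SENT_0'] lookups below work.
'''
print 'SENT_0' in model.vocab
print len(model['SENT_0'])  # dimensionality of one document vector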

In [ ]:
#model.most_similar('internet')
#len(sentences.data)
#model.save('Baseline/en-balanced/baseline_nohashtag/my_model.doc2vec')
#print model["SENT_0"]
#print sentences.data.get("SENT_0")
len(sentences.data)  # filled in as a side effect of iterating the corpus during build_vocab/train

In [ ]:
'''
Grab each document vector from the model and store it with its class label.
'''

vectors = []
klasses = []

for i in range(len(sentences.data)):
    vectors.append(model['SENT_%s' % i])
    klasses.append(sentences.data.get('SENT_%s' % i))
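
In [ ]:
'''
Quick class-distribution check (the corpus name suggests it is balanced);
Counter is just an illustrative sanity check, not part of the pipeline.
'''
from collections import Counter
print Counter(klasses)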

In [ ]:
vectors = np.array(vectors)  # stack into an (n_documents, vector_size) matrix
print vectors.shape

In [ ]:
'''
Build classifiers with SVM, Gaussian Naive Bayes, Bernoulli Naive Bayes, kNN,
Logistic Regression, Bagging, and a Decision Tree (CART), scored by MCC.
'''

from sklearn import cross_validation
from sklearn.metrics import matthews_corrcoef, make_scorer
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier


classifiers = {'SVM': svm.SVC(kernel='linear', C=3),
               'Logistic': LogisticRegression(),
               'GaussianNB': GaussianNB(),
               'BernoulliNB': BernoulliNB(),
               'Bagging': BaggingClassifier(),
               '1-NN': KNeighborsClassifier(n_neighbors=1),
               '3-NN': KNeighborsClassifier(n_neighbors=3),
               '5-NN': KNeighborsClassifier(n_neighbors=5),
               #'RF': RandomForestClassifier(max_depth=10),
               'CART': DecisionTreeClassifier(max_depth=5)}

for name, clf in classifiers.items():
    scores = cross_validation.cross_val_score(clf, vectors, klasses, cv=5, scoring=make_scorer(matthews_corrcoef))
    print name, " MCC: %0.3f (+/- %0.3f)" % (scores.mean(),  scores.std() * 2)
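
In [ ]:
'''
Hedged sketch: a single train/test split for a closer look at one classifier.
The 0.2 test fraction, the random_state, and the choice of the linear SVM are
illustrative assumptions, not part of the original pipeline.
'''
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(vectors, klasses, test_size=0.2, random_state=42)
clf = svm.SVC(kernel='linear', C=3)
clf.fit(X_train, y_train)
print classification_report(y_test, clf.predict(X_test))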

In [ ]: