In [ ]:
from gensim.models.doc2vec import LabeledSentence
from gensim.models.doc2vec import Doc2Vec
from nltk.tokenize import word_tokenize
import codecs

class LabeledLineSentence(object):
    """Reads a pre-tokenized file (one message per line: the class label,
    then '{', then the token list) and yields one labeled sentence per line,
    tagged as SENT_<line number>."""
    def __init__(self, filename):
        self.filename = filename
        self.data = {}

    def __iter__(self):
        for uid, line in enumerate(codecs.open(self.filename, 'r', 'utf-8')):
            coluna = line.split('{')
            self.data['SENT_%s' % uid] = coluna[0]  # remember the class label for later lookup
            yield LabeledSentence(words=eval(coluna[1]), tags=['SENT_%s' % uid])
            
    def get_data(self):
        return self.data
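
In [ ]:
'''
Side note (sketch, not part of the original pipeline): eval() on file contents is
unsafe. Assuming each line stores the token list as a Python list literal after the
opening '{', ast.literal_eval parses it without executing arbitrary code.
'''
import ast

def parse_tokens(line):
    # hypothetical helper: "<label>{['tok1', 'tok2', ...]" -> (label, token list)
    label, rest = line.split('{', 1)
    return label, ast.literal_eval(rest.strip())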

In [1]:
## FOR THE ORIGINAL DATASET ##

'''
Class that reads the messages and yields one TaggedDocument per line,
using the NLTK tokenizer.
'''
import sys
import gzip
import logging
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import TaggedDocument, Doc2Vec

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


class TaggedLineSentence(object):
    def __init__(self, filename):
        self.filename = filename
        self.data = {}
        
    def __iter__(self):
        for uid, line in enumerate(open(self.filename)):
            # each line of the corpus is a class label, a tab, then the raw message
            klass, sms = line.split('\t')
            self.data['SENT_%s' % uid] = klass
            yield TaggedDocument(words=word_tokenize(sms.decode("utf8").lower()), tags=['SENT_%s' % uid])
            
    def get_data(self):
        return self.data

## FOR THE ORIGINAL DATASET ##
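
In [ ]:
'''
Sanity check (sketch): SMSSpamCollection is expected to be tab-separated, one class
label, a tab, then the message text per line, which is what TaggedLineSentence
assumes. This just inspects the first TaggedDocument produced by the reader.
'''
check = TaggedLineSentence('SMSSpamCollection')
first = next(iter(check))
print first.tags, first.words[:10]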

In [2]:
## FOR THE ORIGINAL DATASET ##

'''
Builds a Doc2Vec model on the tagged sentences.
'''

from gensim.models.doc2vec import Doc2Vec

sentences = TaggedLineSentence('SMSSpamCollection')

model = Doc2Vec(alpha=0.025, min_alpha=0.025, docvecs_mapfile='mapfile')  # use fixed learning rate
model.build_vocab(sentences)
 
for epoch in range(1):
    model.train(sentences)
    #model.precalc_sampling()
    model.alpha -= 0.003  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay
    
#model = Doc2Vec(sentences,min_count  = 0, workers =2, iter= 100)

## FOR THE ORIGINAL DATASET ##
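
In [ ]:
'''
Quick check (sketch, assuming the same old gensim API used above): the trained model
exposes one vector per tag through model.docvecs, and most_similar can compare tags.
'''
print model.docvecs['SENT_0'].shape
print model.docvecs.most_similar(['SENT_0'])[:3]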

In [3]:
sentences.data.get('SENT_1')


Out[3]:
'ham'

In [5]:
model.syn0_lockf


Out[5]:
array([ 1.,  1.,  1., ...,  1.,  1.,  1.], dtype=float32)

In [ ]:
import numpy as np
ham_vecs = []
spam_vecs = []
# iterate over all messages in the corpus and split the learned document vectors by class
for i in range(5572):
    if sentences.data.get('SENT_%s' % i) == 'ham':
        ham_vecs.append(model.docvecs.doctag_syn0[i])
    else:
        spam_vecs.append(model.docvecs.doctag_syn0[i])
ham_vecs = np.array(ham_vecs, dtype='float')
spam_vecs = np.array(spam_vecs, dtype='float')
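
In [ ]:
'''
Sanity check (sketch): class distribution of the corpus, and confirmation that the
ham/spam split above covers every document vector.
'''
from collections import Counter
print Counter(sentences.data.values())
print ham_vecs.shape, spam_vecs.shape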

In [ ]:
spam_vecs.shape

In [ ]:
from sklearn.manifold import TSNE
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt


ts = TSNE(2)
reduced_vecs = ts.fit_transform(np.concatenate((ham_vecs, spam_vecs)))

In [ ]:
reduced_vecs.shape

In [ ]:
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# color points by class to see whether Doc2Vec can separate them
for i in range(len(reduced_vecs)):
    if i < len(ham_vecs):
        # ham_vecs colored blue
        color = 'b'
    else:
        # spam_vecs colored red
        color = 'r'
    plt.plot(reduced_vecs[i,0], reduced_vecs[i,1], marker = 'o', color = color, markersize = 4)
plt.savefig('fourfig')
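
In [ ]:
'''
Alternative plot (sketch): the same figure drawn with two scatter calls instead of one
plot call per point, which is faster and keeps the ham/spam split explicit. The output
filename is illustrative.
'''
plt.figure()
n_ham = len(ham_vecs)
plt.scatter(reduced_vecs[:n_ham, 0], reduced_vecs[:n_ham, 1], c='b', s=16, label='ham')
plt.scatter(reduced_vecs[n_ham:, 0], reduced_vecs[n_ham:, 1], c='r', s=16, label='spam')
plt.legend()
plt.savefig('fourfig_scatter')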

In [ ]:
plt.close()

In [ ]:
model = Doc2Vec(alpha=0.025, min_alpha=0.025, hs=1, min_count=0, workers=2, iter=100)  # use fixed learning rate
sentences = LabeledLineSentence('SMSSpamAnalytics_noHeader_current')
model.build_vocab(sentences)
model.train_words=True
model.train_labels=True
 
for epoch in range(10):
    model.train(sentences)
    #model.precalc_sampling()
    model.alpha -= 0.003  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay
    
#model = Doc2Vec(sentences,min_count  = 0, workers =2, iter= 100)

In [ ]:
#print model["SENT_2"]
model.most_similar(["lar"])
#print sentences.data.get("SENT_1")

In [ ]:
'''
Retrieves the document vectors from the model and stores them,
together with their class labels.
'''

vectors = []
klasses = []

for i in range(5572):
    vectors.append(model['SENT_%s' % i])
    klasses.append(sentences.data.get('SENT_%s' % i))

In [ ]:
# Just to check how many PCA components to keep
from sklearn.decomposition import PCA
import numpy as np
pca = PCA()
pca.fit_transform(vectors)
vetor = np.cumsum(pca.explained_variance_ratio_)
vetor[40]
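
In [ ]:
'''
Optional visualization (sketch): cumulative explained variance versus the number of PCA
components, to justify the 40 dimensions chosen below. The output filename is illustrative.
'''
plt.figure()
plt.plot(vetor)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.savefig('pca_variance')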

In [ ]:
# Reduce the matrix to X dimensions (40 here)
from sklearn.decomposition import PCA
pca = PCA(n_components=40)
matriz_reduzida = pca.fit_transform(vectors)

In [ ]:
# WITHOUT PCA (use the raw vectors directly)
matriz_reduzida = vectors

In [ ]:
import numpy as np
import scipy.sparse as ssp
from sklearn.datasets import load_svmlight_file, dump_svmlight_file
X_train, y_train = load_svmlight_file("e:\\teste\\tf-idf(SpamAnalytics)\\basetreino.txt")
matrix = ssp.hstack([X_train, matriz_reduzida])
dump_svmlight_file(matrix, y_train, "e:\\teste\\tf-idf(SpamAnalytics)\\matrix_dense.txt")

In [ ]:
print matriz_reduzida.shape
np.array([klasses]).T.shape

In [ ]:
'''
Builds classifiers using SVM, Gaussian Naive Bayes, Bernoulli Naive Bayes, kNN,
Logistic Regression, Bagging and Decision Tree (CART).
'''

from sklearn import cross_validation
from sklearn.metrics import matthews_corrcoef, make_scorer
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier


classifiers = {'SVM' : svm.SVC(kernel = 'linear', C=3),
               'Logistic' :LogisticRegression(),
               'GaussianNB' :GaussianNB(),
               'BernoulliNB' :BernoulliNB(),
               'Bagging' :BaggingClassifier(),
               '1-NN' : KNeighborsClassifier(n_neighbors=1),
               '3-NN' : KNeighborsClassifier(n_neighbors=3),
               '5-NN' : KNeighborsClassifier(n_neighbors=5),
               #'RF' : RandomForestClassifier(max_depth=10), 
               'CART' : DecisionTreeClassifier(max_depth=5)}

for name, clf in classifiers.items():
    scores = cross_validation.cross_val_score( clf, matrix.todense(), klasses, cv=5, scoring=make_scorer(matthews_corrcoef))
    print name, " MCC: %0.3f (+/- %0.3f)" % (scores.mean(),  scores.std() * 2)
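
In [ ]:
'''
Extra check (sketch, not part of the original evaluation): confusion matrix for one
classifier on a single hold-out split, to complement the cross-validated MCC above.
'''
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix

X_tr, X_te, y_tr, y_te = train_test_split(matrix.todense(), klasses, test_size=0.3, random_state=42)
clf = svm.SVC(kernel='linear', C=3).fit(X_tr, y_tr)
print confusion_matrix(y_te, clf.predict(X_te), labels=['ham', 'spam'])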

In [ ]:
atrnames = ['atr'+str(i) for i in range(40)]

In [ ]:


In [ ]:
def write_to_weka(filename,relationname,attributenames,attributes,comment=''):
    """ writes NumPy arrays with data to WEKA format .arff files
    
        input: relationname (string with a description), attributenames (list 
        of the names of different attributes), attributes (array of attributes, 
        one row for each attribute, WEKA treats last row as classlabels by 
        default), comment (short description of the content)."""
    
    nbrattributes = len(attributenames)
    if attributes.shape[1] != nbrattributes+1:
        raise Exception('Number of attribute names is not equal to length of attributes')
    
    f = open(filename, 'w')
    f.write('% '+comment+'\n')
    f.write('\n')
    f.write('@RELATION '+relationname+'\n')
    
    for a in attributenames:
        f.write('@ATTRIBUTE '+str(a)+' NUMERIC\n') #assume values are numeric
        
    f.write('@ATTRIBUTE class {spam, ham}\n')
    
    f.write('\n')    
    f.write('@DATA\n') #write the data, one attribute vector per line
    for i in range(attributes.shape[0]):
        for j in range(nbrattributes+1):
            f.write(str(attributes[i,j]))
            if j < nbrattributes:
                f.write(', ')
        f.write('\n') 
    f.close()

In [ ]:
from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random
from weka.core.converters import Loader, Saver
import weka.core.jvm as jvm
jvm.start()

data = np.hstack([matriz_reduzida, np.array([klasses]).T])
write_to_weka('Teste.arff', 'teste', atrnames, data)
loader = Loader(classname='weka.core.converters.ArffLoader')
data = loader.load_file("Teste.arff")
data.class_is_last()

classifier = Classifier(classname="weka.classifiers.meta.AdaBoostM1", options=['-W', 'weka.classifiers.trees.J48'])
evaluation = Evaluation(data)                     # initialize with priors
evaluation.crossvalidate_model(classifier, data, 10, Random(42))  # 10-fold CV
#print(evaluation.summary())
#print("pctCorrect: " + str(evaluation.percent_correct))
#print("incorrect: " + str(evaluation.incorrect))
print("MCC: %0.3f " % (evaluation.matthews_correlation_coefficient(1)))

In [ ]: