In [ ]:
from gensim.models.doc2vec import LabeledSentence
from gensim.models.doc2vec import Doc2Vec
from nltk.tokenize import word_tokenize
import codecs

class LabeledLineSentence(object):
    """Reads a pre-tokenized corpus where each line holds the class label,
    a '{' separator, and a Python list literal of tokens, and yields one
    LabeledSentence per line tagged 'SENT_<line number>'."""
    def __init__(self, filename):
        self.filename = filename
        self.data = {}

    def __iter__(self):
        for uid, line in enumerate(codecs.open(self.filename, 'r', 'utf-8')):
            coluna = line.split('{')
            self.data['SENT_%s' % uid] = coluna[0]  # class label (ham/spam)
            yield LabeledSentence(words=eval(coluna[1]), labels=['SENT_%s' % uid])

    def get_data(self):
        return self.data
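In [ ]:
# Hedged usage sketch, not part of the original analysis: it assumes each line
# of the file (the same one used further below) really does contain the label,
# a '{' separator, and a token-list literal, which is what eval(coluna[1])
# requires. It just prints the first yielded item as a sanity check.
demo_sentences = LabeledLineSentence('SMSSpamAnalytics_noHeader_current')
for sent in demo_sentences:
    print sent.labels, sent.words[:5]   # e.g. ['SENT_0'] and its first tokens
    break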
In [1]:
## FOR THE ORIGINAL DATASET ##
'''
Class that reads the messages and yields one TaggedDocument per line;
uses the NLTK tokenizer.
'''
import sys
import gzip
import logging
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import TaggedDocument, Doc2Vec

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

class TaggedLineSentence(object):
    def __init__(self, filename):
        self.filename = filename
        self.data = {}

    def __iter__(self):
        # SMSSpamCollection lines are "label<TAB>message"
        for uid, line in enumerate(open(self.filename)):
            klass, sms = line.split('\t')
            self.data['SENT_%s' % uid] = klass  # ham or spam
            yield TaggedDocument(words=word_tokenize(sms.decode("utf8").lower()), tags=['SENT_%s' % uid])

    def get_data(self):
        return self.data
## FOR THE ORIGINAL DATASET ##
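In [ ]:
# Hedged sanity-check sketch (not part of the original run): SMSSpamCollection
# is tab-separated, "label<TAB>message", so the iterator should yield one
# TaggedDocument per SMS, tagged 'SENT_<line number>'.
sents = TaggedLineSentence('SMSSpamCollection')
first = next(iter(sents))
print first.tags, first.words[:8]
print sents.get_data().get('SENT_0')   # class label of the first message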
In [2]:
## FOR THE ORIGINAL DATASET ##
'''
Builds a Doc2Vec model over the tagged SMS corpus.
'''
from gensim.models.doc2vec import Doc2Vec

sentences = TaggedLineSentence('SMSSpamCollection')
model = Doc2Vec(alpha=0.025, min_alpha=0.025, docvecs_mapfile='mapfile')  # use a fixed learning rate
model.build_vocab(sentences)
for epoch in range(1):
    model.train(sentences)
    #model.precalc_sampling()
    model.alpha -= 0.003           # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay
#model = Doc2Vec(sentences, min_count=0, workers=2, iter=100)
## FOR THE ORIGINAL DATASET ##
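In [ ]:
# Hedged version note (not part of the original run): the cell above relies on
# the pre-1.0 gensim API, where train() takes only the corpus. In recent gensim
# (>= 4) the equivalent would look roughly like the sketch below; vector_size,
# min_count and epochs are illustrative values, not the original settings.
model_v4 = Doc2Vec(vector_size=100, min_count=2, workers=2)
model_v4.build_vocab(sentences)
model_v4.train(sentences, total_examples=model_v4.corpus_count, epochs=20)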
In [3]:
sentences.data.get('SENT_1')
Out[3]:
In [5]:
model.syn0_lockf  # per-vector learning-rate lock factors (old gensim attribute)
Out[5]:
In [ ]:
import numpy as np

# Split the learned document vectors into ham and spam groups
# (5572 is the number of messages in SMSSpamCollection).
ham_vecs = []
spam_vecs = []
for i in range(5572):
    if sentences.data.get('SENT_%s' % i) == 'ham':
        ham_vecs.append(model.docvecs.doctag_syn0[i])
    else:
        spam_vecs.append(model.docvecs.doctag_syn0[i])
#ham_vecs = np.concatenate(ham_vecs)
#spam_vecs = np.concatenate(spam_vecs)
ham_vecs = np.array(ham_vecs, dtype='float')
spam_vecs = np.array(spam_vecs, dtype='float')
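In [ ]:
# Hedged alternative (not part of the original run): the same ham/spam split
# done with a boolean mask over the doctag matrix; it produces the same two
# arrays as the loop above.
labels = np.array([sentences.data.get('SENT_%s' % i) for i in range(5572)])
all_vecs = np.asarray(model.docvecs.doctag_syn0[:5572], dtype='float')
ham_vecs = all_vecs[labels == 'ham']
spam_vecs = all_vecs[labels != 'ham']
print ham_vecs.shape, spam_vecs.shape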
In [ ]:
spam_vecs.shape
In [ ]:
from sklearn.manifold import TSNE
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# Project the ham and spam document vectors onto 2D with t-SNE.
ts = TSNE(2)
reduced_vecs = ts.fit_transform(np.concatenate((ham_vecs, spam_vecs)))
In [ ]:
reduced_vecs.shape
In [ ]:
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# Color points by class to see whether Doc2Vec separates ham from spam:
# the first len(ham_vecs) rows of reduced_vecs are ham, the rest are spam.
for i in range(len(reduced_vecs)):
    if i < len(ham_vecs):
        color = 'b'  # ham points in blue
    else:
        color = 'r'  # spam points in red
    plt.plot(reduced_vecs[i, 0], reduced_vecs[i, 1], marker='o', color=color, markersize=4)
plt.savefig('fourfig')
In [ ]:
plt.close()
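In [ ]:
# Hedged alternative (not part of the original run): the same figure drawn with
# two scatter calls instead of one plt.plot per point; equivalent output, but
# much faster for a few thousand points. The output filename is illustrative.
import matplotlib.pyplot as plt
fig = plt.figure()
n_ham = len(ham_vecs)
plt.scatter(reduced_vecs[:n_ham, 0], reduced_vecs[:n_ham, 1], c='b', s=16, label='ham')
plt.scatter(reduced_vecs[n_ham:, 0], reduced_vecs[n_ham:, 1], c='r', s=16, label='spam')
plt.legend()
plt.savefig('fourfig_scatter')
plt.close(fig)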
In [ ]:
# Second model: trained on the pre-tokenized base via LabeledLineSentence.
model = Doc2Vec(alpha=0.025, min_alpha=0.025, hs=1, min_count=0, workers=2, iter=100)  # use a fixed learning rate
sentences = LabeledLineSentence('SMSSpamAnalytics_noHeader_current')
model.build_vocab(sentences)
model.train_words = True
model.train_labels = True
for epoch in range(10):
    model.train(sentences)
    #model.precalc_sampling()
    model.alpha -= 0.003           # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay
#model = Doc2Vec(sentences, min_count=0, workers=2, iter=100)
In [ ]:
#print model["SENT_2"]
model.most_similar(["lar"])
#print sentences.data.get("SENT_1")
In [ ]:
'''
Collects the document vectors from the model and stores them
together with their class labels.
'''
vectors = []
klasses = []
for i in range(5572):
    vectors.append(model['SENT_%s' % i])
    klasses.append(sentences.data.get('SENT_%s' % i))
In [ ]:
# Only to check how many principal components to keep.
from sklearn.decomposition import PCA
import numpy as np

pca = PCA()
pca.fit_transform(vectors)
vetor = np.cumsum(pca.explained_variance_ratio_)
vetor[40]  # cumulative explained variance ratio at component index 40
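In [ ]:
# Hedged helper (not part of the original run): picks the smallest number of
# components whose cumulative explained variance reaches a target, here 0.95
# (an illustrative threshold). Note np.argmax returns 0 if the target is never
# reached, so the result is only meaningful when vetor[-1] >= 0.95.
n_components = int(np.argmax(vetor >= 0.95)) + 1
print n_components, vetor[n_components - 1]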
In [ ]:
# Reduce the matrix to X dimensions (here X = 40).
from sklearn.decomposition import PCA
pca = PCA(n_components=40)
matriz_reduzida = pca.fit_transform(vectors)
In [ ]:
# WITHOUT PCA: use the raw document vectors instead of the reduced matrix.
matriz_reduzida = vectors
In [ ]:
import numpy as np
import scipy.sparse as ssp
from sklearn.datasets import load_svmlight_file, dump_svmlight_file

# Load the tf-idf training matrix and append the Doc2Vec features column-wise.
X_train, y_train = load_svmlight_file("e:\\teste\\tf-idf(SpamAnalytics)\\basetreino.txt")
matrix = ssp.hstack([X_train, matriz_reduzida])
dump_svmlight_file(matrix, y_train, "e:\\teste\\tf-idf(SpamAnalytics)\\matrix_dense.txt")
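In [ ]:
# Hedged sanity check (not part of the original run): reload the combined matrix
# that was just dumped and compare the column count with tf-idf + Doc2Vec
# features. The reloaded width can be smaller if the trailing columns are all
# zero, since the svmlight format stores only nonzero entries.
X_check, y_check = load_svmlight_file("e:\\teste\\tf-idf(SpamAnalytics)\\matrix_dense.txt")
print X_check.shape, X_train.shape[1] + np.asarray(matriz_reduzida).shape[1]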
In [ ]:
print matriz_reduzida.shape
np.array([klasses]).T.shape
In [ ]:
'''
Builds classifiers using SVM, Gaussian Naive Bayes, Bernoulli Naive Bayes,
kNN, Logistic Regression, Bagging and Decision Tree (CART), and evaluates
them with 5-fold cross-validation using the Matthews correlation coefficient.
'''
from sklearn import cross_validation  # old API; see the sketch after this cell for the model_selection equivalent
from sklearn.metrics import matthews_corrcoef, make_scorer
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

classifiers = {'SVM': svm.SVC(kernel='linear', C=3),
               'Logistic': LogisticRegression(),
               'GaussianNB': GaussianNB(),
               'BernoulliNB': BernoulliNB(),
               'Bagging': BaggingClassifier(),
               '1-NN': KNeighborsClassifier(n_neighbors=1),
               '3-NN': KNeighborsClassifier(n_neighbors=3),
               '5-NN': KNeighborsClassifier(n_neighbors=5),
               #'RF': RandomForestClassifier(max_depth=10),
               'CART': DecisionTreeClassifier(max_depth=5)}

for name, clf in classifiers.items():
    scores = cross_validation.cross_val_score(clf, matrix.todense(), klasses, cv=5, scoring=make_scorer(matthews_corrcoef))
    print name, " MCC: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2)
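In [ ]:
# Hedged version note (not part of the original run): sklearn.cross_validation
# was removed in scikit-learn 0.20; with sklearn >= 0.18 the same evaluation is
# available through sklearn.model_selection, as sketched below.
from sklearn.model_selection import cross_val_score
for name, clf in classifiers.items():
    scores = cross_val_score(clf, matrix.todense(), klasses, cv=5,
                             scoring=make_scorer(matthews_corrcoef))
    print name, " MCC: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2)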
In [ ]:
atrnames = ['atr' + str(i) for i in range(40)]  # attribute names for the ARFF export
In [ ]:
def write_to_weka(filename, relationname, attributenames, attributes, comment=''):
    """Writes a NumPy array of data to a WEKA-format .arff file.
    Input: relationname (string with a description), attributenames (list of
    the names of the different attributes), attributes (array with one row per
    instance; the last column is treated as the class label), comment (short
    description of the content)."""
    nbrattributes = len(attributenames)
    if attributes.shape[1] != nbrattributes + 1:
        raise Exception('Number of attribute names is not equal to number of attribute columns')
    f = open(filename, 'w')
    f.write('% ' + comment + '\n')
    f.write('\n')
    f.write('@RELATION ' + relationname + '\n')
    for a in attributenames:
        f.write('@ATTRIBUTE ' + str(a) + ' NUMERIC\n')  # assume values are numeric
    f.write('@ATTRIBUTE class {spam, ham}\n')
    f.write('\n')
    f.write('@DATA\n')  # write the data, one attribute vector per line
    for i in range(attributes.shape[0]):
        for j in range(nbrattributes + 1):
            f.write(str(attributes[i, j]))
            if j < nbrattributes:
                f.write(', ')
        f.write('\n')
    f.close()
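In [ ]:
# Hedged self-check (not part of the original run): writes a tiny synthetic
# two-attribute ARFF file to verify the header/data layout produced by
# write_to_weka. The filenames and values below are illustrative only.
import numpy as np
demo = np.array([[0.1, 0.2, 'ham'],
                 [0.9, 0.8, 'spam']], dtype=object)
write_to_weka('demo.arff', 'demo', ['atr0', 'atr1'], demo, comment='sanity check')
print open('demo.arff').read()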
In [ ]:
from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random
from weka.core.converters import Loader, Saver
import weka.core.jvm as jvm

jvm.start()

# Export the reduced matrix plus class labels to ARFF and evaluate
# AdaBoost.M1 over J48 with 10-fold cross-validation in WEKA.
data = np.hstack([matriz_reduzida, np.array([klasses]).T])
write_to_weka('Teste.arff', 'teste', atrnames, data)
loader = Loader(classname='weka.core.converters.ArffLoader')
data = loader.load_file("Teste.arff")
data.class_is_last()
classifier = Classifier(classname="weka.classifiers.meta.AdaBoostM1", options=['-W', 'weka.classifiers.trees.J48'])
evaluation = Evaluation(data)  # initialize with priors
evaluation.crossvalidate_model(classifier, data, 10, Random(42))  # 10-fold CV
#print(evaluation.summary())
#print("pctCorrect: " + str(evaluation.percent_correct))
#print("incorrect: " + str(evaluation.incorrect))
print("MCC: %0.3f " % (evaluation.matthews_correlation_coefficient(1)))
In [ ]: