In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
from gensim import corpora, models, similarities
from gensim.utils import simple_preprocess, lemmatize
from gensim.models.doc2vec import TaggedDocument,Doc2Vec
from sklearn.multiclass import OneVsRestClassifier,OneVsOneClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from gensim.models.ldamodel import LdaModel
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from os import listdir
import pandas as pd
import numpy as np
import codecs
import re
from sklearn.model_selection import learning_curve
In [ ]:
class MyCorpus(object):
def __init__(self,path):
self.path = path
self.text, self.klasses = self.get_klass_ids()
self.dictionary = corpora.Dictionary(self.text.get(text) for text in self.klasses.keys())
def __iter__(self):
for sent in self.klasses.keys():
#yield self.dictionary.doc2bow(self.text.get(sent))
yield self.text.get(sent)
def text_bow(self,sent):
return self.dictionary.doc2bow(self.text.get(sent))
    def get_data(self,fname):
        data = open(self.path+fname).readlines()
for row in data:
yield self.pre_process(row)
def get_klass_ids(self):
ids_text = {}
ids_klasses = {}
i=0
for klass in listdir(self.path):
for row in self.get_data(klass):
ids_text['SENT_%s'%i] = row
if(klass == 'rt-polarity.neg'):
ids_klasses['SENT_%s'%i] = 'neg'
else:
ids_klasses['SENT_%s'%i] = 'pos'
i=i+1
return ids_text, ids_klasses
def pre_process(self,text):
        sentence = re.sub('[.,"]','',text)
        #sentence = sentence.lower().decode('ISO-8859-7').split()
        # stopword removal (disabled):
        #stopCached = set(stopwords.words('english'))
        #sentence = [word for word in sentence.lower().decode('ISO-8859-7').split() if word not in stopCached]
        # lemmatize the text (requires the `pattern` package):
        sentence = lemmatize(sentence.decode('ISO-8859-7'))
        return sentence
        #return simple_preprocess(text)
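gensim's `lemmatize` helper depends on the optional `pattern` package and returns tokens as b'word/POS' strings. If `pattern` is unavailable, a minimal fallback sketch is plain tokenization (note it changes the token format, so the rest of the pipeline must be run consistently with it):
In [ ]:
# Fallback sketch when the `pattern` package is not installed:
# plain lowercase tokenization instead of word/POS lemmas.
def pre_process_fallback(text):
    return simple_preprocess(re.sub('[.,"]', '', text).decode('ISO-8859-7'))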
In [ ]:
#DOC2VEC variant of MyCorpus -- run this cell INSTEAD of the previous one when
#building the Doc2Vec model: it yields TaggedDocument objects and does not
#build the dictionary/text_bow that the BoW/LDA/LSI cells below rely on.
class MyCorpus(object):
def __init__(self,path):
self.path = path
self.text, self.klasses = self.get_klass_ids()
def __iter__(self):
for sent in self.klasses.keys():
yield TaggedDocument(words = self.text.get(sent), tags = [sent])
    def get_data(self,fname):
        data = open(self.path+fname).readlines()
for row in data:
yield self.pre_process(row)
def get_klass_ids(self):
ids_text = {}
ids_klasses = {}
i=0
for klass in listdir(self.path):
for row in self.get_data(klass):
ids_text['SENT_%s'%i] = row
if(klass == 'rt-polarity.neg'):
ids_klasses['SENT_%s'%i] = 'neg'
else:
ids_klasses['SENT_%s'%i] = 'pos'
i=i+1
return ids_text, ids_klasses
def pre_process(self,text):
        sentence = re.sub('[.,"]','',text)
        #sentence = sentence.lower().decode('ISO-8859-7').split()
        # stopword removal (disabled):
        #stopCached = set(stopwords.words('english'))
        #sentence = [word for word in sentence.lower().decode('ISO-8859-7').split() if word not in stopCached]
        # lemmatize the text (requires the `pattern` package):
        sentence = lemmatize(sentence.decode('ISO-8859-7'))
        return sentence
        #return simple_preprocess(text)
In [ ]:
mc = MyCorpus('Bases/nltk/sentence_polarity/')
In [ ]:
cont = []
cortados = []
i = 0
# iterate over a copy of the keys: entries are popped during the loop
for sent in list(mc.klasses.keys()):
    temp = mc.text.get(sent)
    if temp is None or len(temp) == 0:
        # drop empty or missing sentences from both maps
        cortados.append(sent)
        mc.text.pop(sent)
        mc.klasses.pop(sent)
        i = i+1
    else:
        cont.append(len(temp))
print('%s empty sentences were removed'%i)
print('Number of documents: %s'%len(cont))
print('Total number of terms: %s'%np.sum(cont))
print('Terms in the longest document: %s'%np.max(cont))
print('Terms in the shortest document: %s'%np.min(cont))
print('Mean terms per document: %8.6f'%np.mean(cont))
print('Standard deviation of terms per document: %8.5f'%np.std(cont))
print('Skewness of terms per document: %6.4f'%skew(cont))
print('Kurtosis of terms per document: %6.4f'%kurtosis(cont))
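A quick sketch to visualize the length distribution summarized above:
In [ ]:
# histogram of terms per document
plt.hist(cont, bins=30)
plt.xlabel('terms per document')
plt.ylabel('number of documents')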
In [ ]:
dictionary = corpora.Dictionary(mc)
corpus = [dictionary.doc2bow(text) for text in mc]
print('Dictionary size: %s'%len(dictionary.keys()))
In [ ]:
# build the TF-IDF model
tfidf = models.TfidfModel(corpus)
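As a quick sanity check (a sketch; it assumes the key 'SENT_0' survived the empty-sentence filter), the model maps a bag-of-words vector to its TF-IDF weights:
In [ ]:
# TF-IDF weights of a single sentence
print(tfidf[dictionary.doc2bow(mc.text.get('SENT_0'))])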
In [ ]:
# build the LDA model (default num_topics=100)
lda = models.LdaModel(corpus, id2word=dictionary, passes=10)
print lda.show_topics(2)
In [ ]:
# build the LSI model (default num_topics=200)
lsi = models.LsiModel(corpus, id2word=dictionary)
print lsi.show_topics(2)
In [ ]:
# build the Doc2Vec model
model = Doc2Vec(mc, size=300, window=8, min_count=1, workers=2)
In [ ]:
model = Doc2Vec(dm=1, dbow_words=0, alpha=0.025, min_alpha=0.025, hs=1, size=300, min_count=0, workers=4, iter=20)#use fixed learning rate
model.build_vocab(mc)
model.train_words=True
model.train_labels=True
for epoch in range(7):
model.train(mc)
model.alpha -= 0.003 # decrease the learning rate
model.min_alpha = model.alpha # fix the learning rate, no decay
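The manual alpha-decay loop above follows the old (pre-1.0) gensim API. Under gensim >= 4 the same training is a one-liner, since `size`/`iter` were renamed and the learning-rate schedule is handled internally (a sketch for newer environments only):
In [ ]:
# gensim >= 4 equivalent sketch: size -> vector_size, iter -> epochs;
# train() applies the alpha decay internally
model_v4 = Doc2Vec(mc, dm=1, vector_size=300, min_count=0, workers=4, epochs=20)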
In [ ]:
# build a directed graph (digraph)
import networkx as nx
G = nx.DiGraph()
# each text is a node of the graph;
# the text's class is stored as a node attribute
for k,v in mc.klasses.items():
    G.add_node(k,klass=v)
In [ ]:
# add the edges to the graph
# helper: the document keys, in a fixed order
names = mc.klasses.keys()
# build the similarity index used to find each node's k nearest neighbours;
# num_best is the number of neighbours + 1 (each node is its own neighbour),
# so num_best=11 yields a graph with 10 neighbours per node
#index = similarities.Similarity(None,tfidf[corpus],num_features=len(dictionary.keys()),num_best=11)
# for the LDA/LSI models; num_features should match the number of topics
index = similarities.Similarity(None,lda[corpus],num_features=lda.num_topics,num_best=11)
for k in names:
for nn in index[lda[mc.text_bow(k)]]:
if not k==names[nn[0]]:
G.add_edge(k,names[nn[0]],weight=nn[1])
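A small sanity check on the graph just built (a sketch): every node should have out-degree 10 by construction.
In [ ]:
# node/edge counts of the k-NN graph
print('Nodes: %s, Edges: %s'%(G.number_of_nodes(), G.number_of_edges()))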
In [ ]:
#For the Doc2Vec model: most_similar excludes the query document itself,
#so topn=10 (not 11) gives the same 10 neighbours per node as above
names = mc.klasses.keys()
for k in names:
    for nn in model.docvecs.most_similar(k, topn=10):
        G.add_edge(k,nn[0],weight=nn[1])
In [ ]:
# compute the in-degree distribution of the nodes
from collections import Counter
# out_degree is always 10 by construction,
# so only in_degree is informative
degree = G.in_degree().values()
cdegree = Counter(degree)
In [ ]:
# skewness and kurtosis measure how non-uniform the distribution is
print skew(degree), kurtosis(degree)
In [ ]:
plt.plot(cdegree.keys(),cdegree.values(),'bo-')
plt.savefig('Pictures/Doc2Vec-DBOW_SKIPGRAM/Com Stemming/sentencepolarity-GoodDegrees-k11')
In [ ]:
good_bad_edges = {}
for k in names:
good_bad_edges[k] = {}
good_bad_edges[k]['good'] = 0
good_bad_edges[k]['bad'] = 0
good_bad_edges[k]['all'] = 0
for edge in G.in_edges(k):
if G.node[edge[0]]['klass'] == G.node[edge[1]]['klass']:
good_bad_edges[k]['good']+=1
else:
good_bad_edges[k]['bad']+=1
good_bad_edges[k]['all']+=1
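The good/bad counts measure class homophily edge by edge; networkx can summarize the same idea in a single coefficient (a sketch): values near 1 mean edges mostly connect same-class nodes.
In [ ]:
# overall class homophily of the graph in one number
print('Assortativity over klass: %6.4f'%nx.attribute_assortativity_coefficient(G, 'klass'))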
In [ ]:
baddegree = [d['bad'] for d in good_bad_edges.values()]
CBad = Counter(baddegree)
plt.plot(cdegree.keys(),cdegree.values(),'bo-')
plt.plot(CBad.keys(),CBad.values(),'ro-')
plt.savefig('Pictures/Doc2Vec-DBOW_SKIPGRAM/Com Stemming/sentencepolarity-GoodBadDegrees-k11')
In [ ]:
print skew(baddegree), kurtosis(baddegree)
In [ ]:
from scipy.stats import spearmanr,pearsonr
corr = np.array([[d['bad'], d['all']] for d in good_bad_edges.values()])
print('Spearman correlation: %8.6f, p-value: %.3g'% spearmanr(corr[:,0],corr[:,1]))
print('Pearson correlation: %8.6f, p-value: %.3g'% pearsonr(corr[:,0],corr[:,1]))
In [ ]:
############# ---------------------- TRAINING MODEL ------------------##################
In [ ]:
## FOR TFIDF MODEL SKLEARN
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# identity analyzer: documents from the corpus are already tokenized
def split_mc_corpus(words):
    return [word for word in words]
bow_transformer = CountVectorizer(analyzer=split_mc_corpus).fit(mc)
messages_bow = bow_transformer.transform(mc)
tfidf_transformer = TfidfTransformer().fit(messages_bow)
print 'sparse matrix shape:', messages_bow.shape
print 'number of non-zeros:', messages_bow.nnz
print 'sparsity: %.2f%%' % (100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1]))
vectors = tfidf_transformer.transform(messages_bow)
klasses = np.array(mc.klasses.values())
In [ ]:
## FOR DOC2VEC MODEL
'''
Doc2Vec: extract each document's vector from the model and store it
'''
vectors = []
klasses = []
for key in mc.klasses.keys():
vectors.append(model.docvecs[key])
klasses.append(mc.klasses.get(key))
vectors = np.array(vectors)
klasses = np.array(klasses)
In [ ]:
'''
LSI: extract each document's vector from the model and store it
'''
vectorsX = []
vecs = []
klasses = []
k = 0
for key in mc.klasses.keys():
vecs = np.array(lsi[mc.text_bow(key)], dtype=np.float32).T
try:
        if len(vecs[1]) == 200:  # keep only sentences with weights for all 200 (default) LSI topics
vectorsX.append(vecs[1])
klasses.append(mc.klasses.get(key))
else:
k = k+1
except IndexError:
k = k+1
continue
vectors = np.array(vectorsX)
klasses = np.array(klasses)
print "%s: sentenças não continham pesos LSI"%k
vectors.shape
In [ ]:
'''
LDA: extract each document's vector from the model and store it
'''
## MATRIX FOR LDA AND TFIDF MODELS
def addMatrix(A,B):
    """ Adds two matrices, zero-padding B's shorter rows to A's width."""
    sizeL=len(A)
    sizeC=len(A[0])
    s = (sizeL,sizeC)
    C = np.zeros(s, dtype=np.float32)
    # element-wise sum
    for i in range(sizeL):
        for j in range(len(B[i])):
            C[i][j]=A[i][j]+B[i][j]
    return C
vectors = []
klasses = []
temp = 0  # width of the widest vector seen so far
q = 0
for key in mc.klasses.keys():
    try:
        vecs = np.array(lda[mc.text_bow(key)], dtype=np.float32)
        # binary topic-presence vector: C[0][t] = 1 for each topic t returned
        # by the LDA model (note: the weights in vecs[:,1] are discarded)
        C = np.zeros((1, int(vecs[:,0].max()+1)), dtype=np.float32)
        for row in vecs[:,0]:
            dim = int(row)+1
            C[0][dim-1] = 1
            if temp < dim:
                temp = dim
        vectors.append(np.hstack(C))
        klasses.append(mc.klasses.get(key))
    except IndexError:
        # empty bag-of-words: LDA returned no topics for this sentence
        q=q+1
        continue
# zero-pad every vector to the common width via addMatrix
A = np.zeros((len(vectors),temp))
B = np.array(vectors)
klasses = np.array(klasses)
vectors = addMatrix(A,B)
print "%s sentences were missing LDA weights"%q
print vectors.shape
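Both padding loops above (LSI and LDA) can be avoided, and the LDA probabilities preserved rather than binarized, with `gensim.matutils.sparse2full`, which returns a fixed-width dense vector. An alternative sketch, assuming the BoW variant of MyCorpus is active (it defines `text_bow`):
In [ ]:
# dense, fixed-width topic vectors that keep the LDA weights;
# missing topics are zero-padded, so no sentence is dropped
from gensim import matutils
keys = list(mc.klasses.keys())
vectors_alt = np.array([matutils.sparse2full(lda[mc.text_bow(key)], lda.num_topics)
                        for key in keys], dtype=np.float32)
klasses_alt = np.array([mc.klasses[key] for key in keys])
print(vectors_alt.shape)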
In [ ]:
# for each representation:
# set aside a held-out test split
vecs_train, vecs_test, label_train, label_test = \
train_test_split(vectors, klasses, test_size=0.4)
print len(vecs_train), len(vecs_test), len(vecs_train) + len(vecs_test)
In [ ]:
from sklearn import svm
abc_detector = svm.LinearSVC().fit(vectors, klasses)
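Note that `abc_detector` is fitted on all of `vectors` and, in the cells below, also evaluated on it, which inflates the scores; a sketch that scores on the held-out split created above:
In [ ]:
# fit on the training split, score on the held-out test split
heldout_detector = svm.LinearSVC().fit(vecs_train, label_train)
print('held-out accuracy: %0.3f'%accuracy_score(label_test, heldout_detector.predict(vecs_test)))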
In [ ]:
## Query a single sentence with the LDA + LinearSVC pipeline
phrase = mc.text.get('SENT_15')
it = lda.id2word.doc2bow(phrase)
tvec = np.array(lda[it]).T
tvec = tvec[1,]  # keep only the topic weights (width must match the training matrix)
print 'Class predicted:', abc_detector.predict(tvec.reshape(1, -1))[0]
In [ ]:
print 'predicted:', abc_detector.predict(vectors)[4]
print 'expected:', klasses[4]
In [ ]:
all_predictions = abc_detector.predict(vectors)
print all_predictions[0:20]
In [ ]:
#CONVERT KLASSES TO BINARY
#DEFINE MCC: for binary labels this covariance ratio equals the Pearson
#correlation of the label vectors, i.e. the Matthews correlation coefficient
def multiclass_matthews_corrcoef(y_true,y_pred):
cov_mat = np.cov(y_true,y_pred)
mcc = cov_mat[0][1]/np.sqrt(cov_mat[0][0]*cov_mat[1][1])
return mcc
pe = LabelEncoder()
#pe.fit(all_predictions)
le = LabelEncoder()
le.fit(klasses)
bin_klasses = le.transform(klasses)
#bin_predictions = pe.transform(all_predictions)
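For binary labels the covariance ratio above equals the Pearson correlation of the label vectors, which is exactly sklearn's Matthews correlation; a quick cross-check sketch:
In [ ]:
# the two implementations should agree for binary labels
from sklearn.metrics import matthews_corrcoef
bin_predictions = le.transform(all_predictions)
print(multiclass_matthews_corrcoef(bin_klasses, bin_predictions))
print(matthews_corrcoef(bin_klasses, bin_predictions))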
In [ ]:
modelo = 'LDA'
data = 'sentencepolarity'
process = 'SemStop'
In [ ]:
import sys
temp = sys.stdout
sys.stdout = open('Logs/log.txt', 'a')
print 'Confusion Matrix '+modelo+' Model >['+data+' dataset]< --'+process+'--'
print 'accuracy', accuracy_score(klasses, all_predictions)
print 'confusion matrix\n', confusion_matrix(klasses, all_predictions)
print '(row=expected, col=predicted)'
print 'Classification Report'
print classification_report(klasses, all_predictions)
sys.stdout.close()
sys.stdout = temp # restore print commands to interactive prompt
## Back to Normal
print 'accuracy', accuracy_score(klasses, all_predictions)
print 'confusion matrix\n', confusion_matrix(klasses, all_predictions)
print '(row=expected, col=predicted)'
In [ ]:
plt.matshow(confusion_matrix(klasses, all_predictions), cmap=plt.cm.binary, interpolation='nearest')
plt.title('confusion matrix')
plt.colorbar()
plt.ylabel('expected label')
plt.xlabel('predicted label')
plt.savefig('Pictures/Doc2Vec-DBOW_SKIPGRAM/CMatrix/ConfusionMatrix-'+data+'-'+process+'')
In [ ]:
print classification_report(klasses, all_predictions)
In [ ]:
# Doc2Vec-DM_CBOW
# Doc2Vec-DBOW_SKIPGRAM
modelo = 'Doc2Vec-DBOW_SKIPGRAM'
data = 'sentencepolarity'
process = 'Com Stemming'
log = 'log3'
import sys
temp = sys.stdout
sys.stdout = open('Logs/'+log+'.txt', 'a')
print 'Matrix '+modelo+' Model >['+data+' dataset]< --'+process+'--'
In [ ]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.metrics import f1_score
import time
X = vectors
y = bin_klasses
skf = StratifiedKFold(n_splits=2)
skf.get_n_splits(X, y)
classifiers = {'SVM Linear' : LinearSVC(),
#'RBF SVM' : SVC(gamma=2, C=1),
'3-NN' : KNeighborsClassifier(n_neighbors=3),
'5-NN' : KNeighborsClassifier(n_neighbors=5),
'AdaBoost' : AdaBoostClassifier(),
'Logistic' :LogisticRegression(),
'BernoulliNB' :BernoulliNB(),
'RF' : RandomForestClassifier(max_depth=100, max_features='auto'),
}
for name, clf in classifiers.items():
mccs = []
accs = []
f1s = []
for train_index, test_index in skf.split(X, y):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
clf.fit(X_train,y_train)
preds = clf.predict(X_test)
mccs.append(multiclass_matthews_corrcoef(y_test,preds))
accs.append(accuracy_score(y_test,preds))
        f1s.append(f1_score(y_test,preds,average='macro'))
print name, "Accuracy: %0.3f"% np.mean(accs)
print name, "F1: %0.3f"% np.mean(f1s)
print name, "MCC: %0.3f"% np.mean(mccs)
print '=================================================================================================='
sys.stdout.close()
sys.stdout = temp # restore print commands to interactive prompt
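When a single metric suffices, the fold loop above collapses to `cross_val_score` (a sketch for one classifier, scoring MCC via sklearn's own implementation):
In [ ]:
# compact alternative to the manual fold loop
from sklearn.model_selection import cross_val_score
from sklearn.metrics import matthews_corrcoef
scores = cross_val_score(LinearSVC(), X, y, cv=skf, scoring=make_scorer(matthews_corrcoef))
print('SVM Linear MCC: %0.3f'%scores.mean())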
In [ ]: