In [3]:
import os
from os import listdir

from gensim.utils import simple_preprocess
In [4]:
from gensim.models.doc2vec import TaggedDocument,Doc2Vec
"""
The texts of each class are stored in
sub-directories inside path (one sub-directory per class).
"""
class MyCorpus(object):
    """Iterable corpus of labelled text files, yielding gensim TaggedDocuments.

    The texts of each class live in their own sub-directory of ``path``.
    Every file is identified by the string ``'<klass>/<fname>'``; that id is
    both the key of ``klasses`` and the tag attached to the yielded document.
    """

    def __init__(self, path):
        """Remember the corpus root and index the files found below it.

        path -- directory whose sub-directories each hold the files of one class.
        """
        self.path = path
        # Maps file id ('klass/fname') -> class name (the sub-directory name).
        self.klasses = self.file_ids_klass()

    def __iter__(self):
        """Yield one TaggedDocument per indexed file, tagged with its file id."""
        for fname in self.klasses:
            yield TaggedDocument(words=self.get_text(fname), tags=[fname])

    def file_ids_klass(self):
        """Return a dict mapping every file id ('klass/fname') to its class."""
        ids = {}
        # listdir returns entries in arbitrary order; sort for a stable scan.
        for klass in sorted(listdir(self.path)):
            klass_dir = os.path.join(self.path, klass)
            # Regular files sitting in the corpus root (README, .DS_Store, ...)
            # are not classes; calling listdir on them would raise an error.
            if not os.path.isdir(klass_dir):
                continue
            for fname in sorted(listdir(klass_dir)):
                ids[klass + '/' + fname] = klass
        return ids

    def get_text(self, fname):
        """Read the file with id ``fname`` and return its pre-processed tokens.

        fname -- file id relative to the corpus root, i.e. 'klass/fname'.
        """
        # os.path.join avoids a doubled separator when path ends with '/'.
        with open(os.path.join(self.path, fname)) as finput:
            text = finput.read()
        return self.pre_process(text)

    def pre_process(self, text):
        """Tokenise ``text`` with gensim's simple_preprocess.

        For now a simple pre-processing step; it can be swapped for a more
        elaborate one later.
        """
        return simple_preprocess(text)
In [6]:
# Root directory of the corpus (one sub-directory per class). The location can
# be overridden with the REVIEW_POLARITY_DIR environment variable so the
# notebook is not tied to one machine; the original path remains the default.
CORPUS_DIR = os.environ.get(
    'REVIEW_POLARITY_DIR',
    '/home/kadnoise/Downloads/review_polarity/txt_sentoken/',
)
mc = MyCorpus(CORPUS_DIR)
In [ ]:
In [4]:
# Train a Doc2Vec model on the corpus. Each document is tagged with its
# 'klass/fname' id, which is later used to query the document vectors.
# NOTE(review): keyword names follow the gensim version this notebook was
# written for (e.g. `size`) -- confirm against the installed release.
model = Doc2Vec(
    mc,
    size=300,
    window=8,
    min_count=1,
    workers=2,
)
In [5]:
# Build a directed graph (digraph).
import networkx as nx
G = nx.DiGraph()
# Every text is a node of the graph;
# the class of the text is stored as the node attribute 'klass'.
for doc_id, doc_klass in mc.klasses.items():
    G.add_node(doc_id, klass=doc_klass)
In [6]:
# Add the edges to the graph: every document points to the documents that
# the model reports as most similar to it, weighted by the similarity score.
# `names` is an auxiliary variable holding the file ids.
names = mc.klasses.keys()
for doc_id in names:
    # NOTE(review): relies on most_similar's default number of neighbours
    # (presumably 10) -- confirm.
    for neighbour, similarity in model.docvecs.most_similar(doc_id):
        G.add_edge(doc_id, neighbour, weight=similarity)
In [7]:
# Compute the degree distribution of the nodes.
from collections import Counter
# The out-degree is always 10 (by construction),
# so the in-degree alone is enough.
# dict(...) accepts both the plain dict returned by networkx 1.x and the
# degree view returned by networkx 2.x; list(...) materialises the values so
# they can be handed to numpy/scipy under Python 3 as well.
degree = list(dict(G.in_degree()).values())
# cdegree maps an in-degree value -> number of nodes having that in-degree.
cdegree = Counter(degree)
In [8]:
# Skewness and kurtosis summarise how far from uniform the in-degree
# distribution is.
from scipy.stats import skew, kurtosis
# A single formatted string prints the same text under Python 2 and Python 3
# (the bare print statement is a SyntaxError on Python 3).
print("%s %s" % (skew(degree), kurtosis(degree)))
In [9]:
%matplotlib inline
import matplotlib.pyplot as plt
In [10]:
# In-degree histogram: x = in-degree, y = number of nodes with that in-degree.
# The points are sorted by in-degree so that the connecting line runs left to
# right instead of following the Counter's internal key order.
in_degrees = sorted(cdegree)
plt.xlabel('in-degree')
plt.ylabel('number of documents')
plt.plot(in_degrees, [cdegree[deg] for deg in in_degrees], 'bo-')
Out[10]:
In [11]:
# For every document, count its incoming edges:
#   'good' -- the edge comes from a document of the same class,
#   'bad'  -- the edge comes from a document of a different class,
#   'all'  -- every incoming edge.
# get_node_attributes is available in both networkx 1.x and 2.x, whereas the
# G.node accessor was removed in networkx 2.4.
node_klass = nx.get_node_attributes(G, 'klass')
good_bad_edges = {}
for doc_id in mc.klasses.keys():
    counts = {'good': 0, 'bad': 0, 'all': 0}
    for source, target in G.in_edges(doc_id):
        if node_klass[source] == node_klass[target]:
            counts['good'] += 1
        else:
            counts['bad'] += 1
        counts['all'] += 1
    good_bad_edges[doc_id] = counts
In [12]:
# Number of 'bad' incoming edges of every document.
# The loop variable is deliberately NOT called `degree`: under Python 2 a list
# comprehension leaks its loop variable, which would overwrite the global
# `degree` list built in the in-degree cell.
baddegree = [counts['bad'] for counts in good_bad_edges.values()]
# CBad maps a bad-in-degree value -> number of documents having that value.
CBad = Counter(baddegree)
In [13]:
# Blue: distribution of the total in-degree; red: distribution of the number
# of 'bad' incoming edges. Both series are sorted by their x value so the
# connecting lines run left to right instead of following dict key order.
in_degrees = sorted(cdegree)
bad_in_degrees = sorted(CBad)
plt.xlabel('number of incoming edges')
plt.ylabel('number of documents')
plt.plot(in_degrees, [cdegree[deg] for deg in in_degrees], 'bo-')
plt.plot(bad_in_degrees, [CBad[deg] for deg in bad_in_degrees], 'ro-')
Out[13]:
In [14]:
# Skewness and kurtosis of the 'bad' in-degree distribution.
# A single formatted string prints the same text under Python 2 and Python 3.
print("%s %s" % (skew(baddegree), kurtosis(baddegree)))
In [15]:
from scipy.stats import spearmanr,pearsonr
import numpy as np
# One row per document: column 0 = 'bad' incoming edges, column 1 = all
# incoming edges. The loop variable is not named `degree`, because a Python 2
# list comprehension leaks it and would overwrite the global `degree` list.
corr = np.array([[counts['bad'], counts['all']] for counts in good_bad_edges.values()])
bad_in, all_in = corr[:, 0], corr[:, 1]
# Rank (Spearman) and linear (Pearson) correlation between the two columns;
# a single formatted string prints identically under Python 2 and Python 3.
print("%s %s" % (spearmanr(bad_in, all_in), pearsonr(bad_in, all_in)))
In [ ]: