In [3]:
import os
from os import listdir

from gensim.utils import simple_preprocess

In [4]:
from gensim.models.doc2vec import TaggedDocument,Doc2Vec

""" 
the texts of each class are split into
sub-directories inside path
"""
class MyCorpus(object):
    """Iterable corpus of labelled text files, yielding gensim ``TaggedDocument`` objects.

    The texts of each class live in their own sub-directory of ``path``; the
    name of the sub-directory is used as the class label.

    Attributes:
        path: root directory passed to the constructor.
        klasses: dict mapping every document id (``'<class>/<file name>'``)
            to its class label.
    """

    def __init__(self, path):
        """Index every document found below ``path``.

        Args:
            path: directory whose sub-directories each hold the files of one class.
        """
        self.path = path
        self.klasses = self.file_ids_klass()

    def __iter__(self):
        """Yield one ``TaggedDocument`` per indexed file, tagged with its document id."""
        for fname in self.klasses:
            yield TaggedDocument(words=self.get_text(fname), tags=[fname])

    def file_ids_klass(self):
        """Scan ``self.path`` and map document ids to class labels.

        Returns:
            dict of ``'<class>/<file name>'`` -> ``'<class>'``.
        """
        ids = {}
        for klass in listdir(self.path):
            klass_dir = os.path.join(self.path, klass)
            # Stray files next to the class directories (README, .DS_Store, ...)
            # are not classes; calling listdir() on them would raise OSError.
            if not os.path.isdir(klass_dir):
                continue
            for fname in listdir(klass_dir):
                # Ids are always built with '/' so the tags stay the same on
                # every platform.
                ids[klass + '/' + fname] = klass
        return ids

    def get_text(self, fname):
        """Read one document and return it pre-processed.

        Args:
            fname: document id relative to ``self.path`` (a key of ``self.klasses``).

        Returns:
            Whatever ``pre_process`` returns for the file content.
        """
        with open(os.path.join(self.path, fname)) as finput:
            return self.pre_process(finput.read())

    def pre_process(self, text):
        """Tokenize ``text`` with gensim's ``simple_preprocess``.

        A simple pre-processing step for now; it can be swapped for a more
        elaborate one later.
        """
        return simple_preprocess(text)

In [6]:
# Root directory of the corpus handed to MyCorpus.
# NOTE(review): absolute, machine-specific path -- adjust it before re-running.
CORPUS_PATH = '/home/kadnoise/Downloads/review_polarity/txt_sentoken/'
mc = MyCorpus(CORPUS_PATH)

In [ ]:


In [4]:
# Build the Doc2Vec model from the corpus. The hyper-parameters are gathered
# in one dict so they are easy to find and tune.
# NOTE(review): with workers > 1 the resulting vectors are presumably not
# bit-for-bit reproducible between runs -- confirm against the gensim docs.
doc2vec_params = dict(size=300, window=8, min_count=1, workers=2)
model = Doc2Vec(mc, **doc2vec_params)

In [5]:
# Build a directed graph (DiGraph).
import networkx as nx

G = nx.DiGraph()

# Every text becomes a node of the graph; the class of the text is stored
# in the node attribute 'klass'.
for doc_id, klass in mc.klasses.items():
    G.add_node(doc_id, klass=klass)

In [6]:
# Add the edges to the graph.

# Auxiliary variable holding the file names (document ids).
names = mc.klasses.keys()

# Link every document to each of the neighbours returned by most_similar();
# the similarity score is stored as the edge weight.
for doc_id in names:
    for neighbour_id, similarity in model.docvecs.most_similar(doc_id):
        G.add_edge(doc_id, neighbour_id, weight=similarity)

In [7]:
# Compute the degree distribution of the nodes.

from collections import Counter

# Since the out-degree is always 10 (by construction), the in-degree is
# all that is needed.
# dict(...) accepts both the plain dict returned by networkx 1.x and the
# degree view returned by networkx >= 2.0 (which has no .values()).
# list(...) guarantees a real sequence on Python 3, where .values() is a view.
degree = list(dict(G.in_degree()).values())

# cdegree maps an in-degree value to the number of nodes that have it.
cdegree = Counter(degree)

In [8]:
# Skewness and kurtosis measure how far from uniform the in-degree
# distribution is.

from scipy.stats import skew, kurtosis

# Explicit formatting prints "<skew> <kurtosis>" exactly like the former
# Python 2 print statement, but is also valid syntax on Python 3.
print("%s %s" % (skew(degree), kurtosis(degree)))


2.45741589366 10.4265653779

In [9]:
%matplotlib inline
import matplotlib.pyplot as plt

In [10]:
# Plot the number of nodes (y) for every in-degree value (x).
# Counter keys come back in arbitrary order, so they are sorted first;
# otherwise the connecting line of the 'bo-' style may zig-zag.
degree_values = sorted(cdegree)
plt.plot(degree_values, list(map(cdegree.get, degree_values)), 'bo-')


Out[10]:
[<matplotlib.lines.Line2D at 0x11537f110>]

In [11]:
# For every document, count its incoming edges:
#   'good' -> the source node has the same class as the target node,
#   'bad'  -> the classes differ,
#   'all'  -> every incoming edge.
good_bad_edges = {}

for doc_id in mc.klasses:
    tally = good_bad_edges[doc_id] = {'good': 0, 'bad': 0, 'all': 0}
    for source, target in G.in_edges(doc_id):
        same_class = G.node[source]['klass'] == G.node[target]['klass']
        tally['good' if same_class else 'bad'] += 1
        tally['all'] += 1

In [12]:
# Number of 'bad' incoming edges of every document.
# The loop variable is deliberately NOT called `degree`: on Python 2 a list
# comprehension leaks its variable into the enclosing scope, which would
# overwrite the global `degree` list with a dict.
baddegree = [counts['bad'] for counts in good_bad_edges.values()]

# CBad maps a 'bad' in-degree value to the number of nodes that have it.
CBad = Counter(baddegree)

In [13]:
# Overlay the full in-degree distribution (blue) and the distribution of
# 'bad' incoming edges (red). Keys are sorted so that each connecting line
# runs from left to right instead of following arbitrary dict order.
all_degree_values = sorted(cdegree)
bad_degree_values = sorted(CBad)
plt.plot(all_degree_values, list(map(cdegree.get, all_degree_values)), 'bo-')
plt.plot(bad_degree_values, list(map(CBad.get, bad_degree_values)), 'ro-')


Out[13]:
[<matplotlib.lines.Line2D at 0x11537f950>]

In [14]:
# Skewness and kurtosis of the 'bad' in-degree distribution.
# Explicit formatting prints "<skew> <kurtosis>" exactly like the former
# Python 2 print statement, but is also valid syntax on Python 3.
print("%s %s" % (skew(baddegree), kurtosis(baddegree)))


2.43675245653 9.93744053028

In [15]:
from scipy.stats import spearmanr,pearsonr
import numpy as np

# One row per document: column 0 = 'bad' incoming edges, column 1 = all
# incoming edges.
# The loop variable is deliberately NOT called `degree`: on Python 2 a list
# comprehension leaks its variable and would overwrite the global `degree`.
corr = np.array([[counts['bad'], counts['all']] for counts in good_bad_edges.values()])

# Rank (Spearman) and linear (Pearson) correlation between the two columns.
# Explicit formatting keeps the Python 2 output and is valid on Python 3.
print("%s %s" % (spearmanr(corr[:,0],corr[:,1]), pearsonr(corr[:,0],corr[:,1])))


SpearmanrResult(correlation=0.84189281234697155, pvalue=0.0) (0.8759661812223466, 0.0)

In [ ]: