In [ ]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
In [1]:
documents = [
('I love this sandwich', 'pos'),
('I feel very good about these beers', 'pos'),
('This is my best work', 'pos'),
("What an awesome view", 'pos'),
('I do not like this restaurant', 'neg'),
('I am tired of this stuff', 'neg'),
("I cannot deal with this", 'neg'),
('He is my sworn enemy', 'neg'),
('My boss is horrible', 'neg'),
('The beer was good', 'pos'),
('This is an amazing place', 'pos'),
("I cannot believe I am doing this", 'neg'),
('I do not enjoy my job', 'neg'),
("I feel amazing", 'pos'),
("I am not feeling dandy today", 'neg'),
('Gary is a friend of mine', 'pos')
]
In [2]:
from gensim import corpora, models, similarities
texts = [text.lower().split() for text, label in documents]
In [3]:
dictionary = corpora.Dictionary(texts)
In [4]:
#print(dictionary.token2id)
corpus = [dictionary.doc2bow(text) for text in texts]
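Each bag-of-words entry is a list of (token id, count) pairs. As a quick added illustration (not part of the original notebook), the ids of the first document can be mapped back to words with the dictionary built above:
In [ ]:
# illustrative check: show the first document's bag-of-words vector with readable tokens
print([(dictionary[token_id], count) for token_id, count in corpus[0]])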
In [ ]:
tfidf = models.TfidfModel(corpus)
In [ ]:
corpus_tfidf = tfidf[corpus]
#for doc in corpus_tfidf:
#print(doc)
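As an added sanity check, a single document can be passed through the tf-idf model directly to see which tokens carry the most weight (rare tokens get higher weights than common ones):
In [ ]:
# illustrative check: tf-idf weights of the first document, with token ids mapped back to words
print([(dictionary[token_id], round(weight, 3)) for token_id, weight in tfidf[corpus[0]]])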
In [ ]:
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=5)
lsi.print_topics(5)
In [5]:
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=5)
lda.print_topics(2)
Out[5]:
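As an added illustration (not part of the original output), the per-document topic mixture can be read off by passing a bag-of-words vector through the trained LDA model, which returns (topic id, probability) pairs:
In [ ]:
# illustrative check: topic distribution of the first document under the trained model
print(lda[corpus[0]])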
In [ ]:
index = similarities.MatrixSimilarity(tfidf[corpus], num_features=len(dictionary), num_best=6)
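To make the index concrete, here is an added query example (the sentence is only illustrative): a new document is converted to a bag of words, transformed with the tf-idf model, and compared against the index, which returns the num_best most similar documents as (document index, cosine similarity) pairs.
In [ ]:
# illustrative query against the similarity index
query = 'the beer was amazing'
query_bow = dictionary.doc2bow(query.lower().split())
print(index[tfidf[query_bow]])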
In [ ]:
import networkx as nx
G = nx.DiGraph()
# each text is a node of the graph
# the text's class (pos/neg) is stored as a node attribute
for i, (text, klass) in enumerate(documents):
    G.add_node(i, klass=klass)
In [ ]:
# connect each document to its most similar documents (excluding itself),
# using the cosine similarity as the edge weight; the query is tf-idf
# transformed to match the vectors the index was built from
for k in range(len(documents)):
    for doc_id, sim in index[tfidf[corpus[k]]]:
        if k != doc_id:
            G.add_edge(k, doc_id, weight=sim)
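A quick added check on the resulting graph: every node has at most num_best - 1 outgoing edges (its nearest neighbours, excluding itself), each weighted by cosine similarity.
In [ ]:
# illustrative check: graph size and a few weighted edges
print(G.number_of_nodes(), G.number_of_edges())
print(list(G.edges(data='weight'))[:5])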
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
In [ ]:
import numpy as np
# colour the nodes by class: red = positive, blue = negative
color = ['r' if data['klass'] == 'pos' else 'b' for _, data in G.nodes(data=True)]
# node size grows with in-degree (how often a document appears among the neighbours of others)
degree = [d for _, d in G.in_degree()]
pos = nx.spring_layout(G)
ec = nx.draw_networkx_edges(G, pos, alpha=0.2)
nc = nx.draw_networkx_nodes(G, pos, node_color=color, node_size=3**np.array(degree), alpha=0.3)
nl = nx.draw_networkx_labels(G, pos, dict(zip(range(len(documents)), range(1, len(documents) + 1))))
plt.axis('off')
#plt.savefig('lda.png', dpi=200)
plt.show()
In [ ]:
degree
In [ ]:
# compute the in-degree distribution over the nodes
from collections import Counter
# the out-degree is essentially fixed by construction (each node links to its
# num_best nearest neighbours), so only the in-degree is informative
degree = [d for _, d in G.in_degree()]
cdegree = Counter(degree)
In [ ]:
# skewness and kurtosis measure the asymmetry and tail weight of the in-degree distribution
from scipy.stats import skew, kurtosis
print(skew(degree), kurtosis(degree))
In [ ]:
# plot the in-degree distribution
ks, vs = zip(*sorted(cdegree.items()))
plt.plot(ks, vs, 'bo-')
In [ ]:
# for each document (node), count incoming edges coming from documents of the
# same class ('good') and from the opposite class ('bad')
good_bad_edges = {}
for i in range(len(documents)):
    good_bad_edges[i] = {'good': 0, 'bad': 0, 'all': 0}
    for u, v in G.in_edges(i):
        if G.nodes[u]['klass'] == G.nodes[v]['klass']:
            good_bad_edges[i]['good'] += 1
        else:
            good_bad_edges[i]['bad'] += 1
        good_bad_edges[i]['all'] += 1
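The same question, whether documents tend to link to documents of the same class, can also be summarised in a single number with networkx's attribute assortativity coefficient (an added cross-check, not part of the original analysis): values near 1 mean edges mostly connect same-class nodes, values near -1 mean the opposite.
In [ ]:
# added cross-check: class assortativity over the directed similarity edges
print(nx.attribute_assortativity_coefficient(G, 'klass'))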
In [ ]:
# number of cross-class ('bad') in-edges per document
baddegree = [counts['bad'] for counts in good_bad_edges.values()]
CBad = Counter(baddegree)
In [ ]:
# overall in-degree distribution (blue) vs. distribution of 'bad' in-edges (red)
ks, vs = zip(*sorted(cdegree.items()))
plt.plot(ks, vs, 'bo-')
kb, vb = zip(*sorted(CBad.items()))
plt.plot(kb, vb, 'ro-')
In [ ]:
print(skew(baddegree), kurtosis(baddegree))
In [ ]:
from scipy.stats import spearmanr, pearsonr
import numpy as np
# correlation between the number of cross-class ('bad') in-edges and the total in-degree
corr = np.array([[counts['bad'], counts['all']] for counts in good_bad_edges.values()])
print(spearmanr(corr[:, 0], corr[:, 1]), pearsonr(corr[:, 0], corr[:, 1]))