In [ ]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
In [1]:
documents = [
('I love this sandwich', 'pos'),
('I feel very good about these beers', 'pos'),
('This is my best work', 'pos'),
("What an awesome view", 'pos'),
('I do not like this restaurant', 'neg'),
('I am tired of this stuff', 'neg'),
("I cannot deal with this", 'neg'),
('He is my sworn enemy', 'neg'),
('My boss is horrible', 'neg'),
('The beer was good', 'pos'),
('This is an amazing place', 'pos'),
("I cannot believe I am doing this", 'neg'),
('I do not enjoy my job', 'neg'),
("I feel amazing", 'pos'),
("I am not feeling dandy today", 'neg'),
('Gary is a friend of mine', 'pos')
]
In [2]:
from gensim import corpora, models, similarities
texts = [text.lower().split() for text, label in documents]
In [3]:
dictionary = corpora.Dictionary(texts)
In [4]:
#print(dictionary.token2id)
corpus = [dictionary.doc2bow(text) for text in texts]
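Each bag-of-words entry is a list of (token id, count) pairs. As a quick added illustration (not part of the original notebook), the ids of the first document can be mapped back to words with the dictionary built above:
In [ ]:
# illustrative check: show the first document's bag-of-words vector with readable tokens
print([(dictionary[token_id], count) for token_id, count in corpus[0]])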
In [ ]:
tfidf = models.TfidfModel(corpus)
In [ ]:
corpus_tfidf = tfidf[corpus]
#for doc in corpus_tfidf:
#print(doc)
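As an added sanity check, a single document can be passed through the tf-idf model directly to see which tokens carry the most weight (rare tokens get higher weights than common ones):
In [ ]:
# illustrative check: tf-idf weights of the first document, with token ids mapped back to words
print([(dictionary[token_id], round(weight, 3)) for token_id, weight in tfidf[corpus[0]]])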
In [ ]:
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=5)
lsi.print_topics(5)
In [5]:
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=5)
lda.print_topics(2)
Out[5]:
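As an added illustration (not part of the original output), the per-document topic mixture can be read off by passing a bag-of-words vector through the trained LDA model, which returns (topic id, probability) pairs:
In [ ]:
# illustrative check: topic distribution of the first document under the trained model
print(lda[corpus[0]])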
In [ ]:
index = similarities.MatrixSimilarity(tfidf[corpus], num_features=len(dictionary), num_best=6)
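To make the index concrete, here is an added query example (the sentence is only illustrative): a new document is converted to a bag of words, transformed with the tf-idf model, and compared against the index, which returns the num_best most similar documents as (document index, cosine similarity) pairs.
In [ ]:
# illustrative query against the similarity index
query = 'the beer was amazing'
query_bow = dictionary.doc2bow(query.lower().split())
print(index[tfidf[query_bow]])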
In [ ]:
import networkx as nx
G = nx.DiGraph()
# each text is a node of the graph
# the text's class (pos/neg) is stored as a node attribute
for i, (text, klass) in enumerate(documents):
    G.add_node(i, klass=klass)
In [ ]:
# connect each document to its most similar documents (excluding itself),
# using the cosine similarity as the edge weight; the query is tf-idf
# transformed to match the vectors the index was built from
for k in range(len(documents)):
    for doc_id, sim in index[tfidf[corpus[k]]]:
        if k != doc_id:
            G.add_edge(k, doc_id, weight=sim)
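A quick added check on the resulting graph: every node has at most num_best - 1 outgoing edges (its nearest neighbours, excluding itself), each weighted by cosine similarity.
In [ ]:
# illustrative check: graph size and a few weighted edges
print(G.number_of_nodes(), G.number_of_edges())
print(list(G.edges(data='weight'))[:5])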
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
In [ ]:
import numpy as np
# colour the nodes by class: red = positive, blue = negative
color = ['r' if data['klass'] == 'pos' else 'b' for _, data in G.nodes(data=True)]
# node size grows with in-degree (how often a document appears among the neighbours of others)
degree = [d for _, d in G.in_degree()]
pos = nx.spring_layout(G)
ec = nx.draw_networkx_edges(G, pos, alpha=0.2)
nc = nx.draw_networkx_nodes(G, pos, node_color=color, node_size=3**np.array(degree), alpha=0.3)
nl = nx.draw_networkx_labels(G, pos, dict(zip(range(len(documents)), range(1, len(documents) + 1))))
plt.axis('off')
#plt.savefig('lda.png', dpi=200)
plt.show()
In [ ]:
degree
In [ ]:
# compute the in-degree distribution over the nodes
from collections import Counter
# the out-degree is essentially fixed by construction (each node links to its
# num_best nearest neighbours), so only the in-degree is informative
degree = [d for _, d in G.in_degree()]
cdegree = Counter(degree)
In [ ]:
# skewness and kurtosis measure the asymmetry and tail weight of the in-degree distribution
from scipy.stats import skew, kurtosis
print(skew(degree), kurtosis(degree))
In [ ]:
# plot the in-degree distribution
ks, vs = zip(*sorted(cdegree.items()))
plt.plot(ks, vs, 'bo-')
In [ ]:
# for each document (node), count incoming edges coming from documents of the
# same class ('good') and from the opposite class ('bad')
good_bad_edges = {}
for i in range(len(documents)):
    good_bad_edges[i] = {'good': 0, 'bad': 0, 'all': 0}
    for u, v in G.in_edges(i):
        if G.nodes[u]['klass'] == G.nodes[v]['klass']:
            good_bad_edges[i]['good'] += 1
        else:
            good_bad_edges[i]['bad'] += 1
        good_bad_edges[i]['all'] += 1
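The same question, whether documents tend to link to documents of the same class, can also be summarised in a single number with networkx's attribute assortativity coefficient (an added cross-check, not part of the original analysis): values near 1 mean edges mostly connect same-class nodes, values near -1 mean the opposite.
In [ ]:
# added cross-check: class assortativity over the directed similarity edges
print(nx.attribute_assortativity_coefficient(G, 'klass'))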
In [ ]:
# number of cross-class ('bad') in-edges per document
baddegree = [counts['bad'] for counts in good_bad_edges.values()]
CBad = Counter(baddegree)
In [ ]:
# overall in-degree distribution (blue) vs. distribution of 'bad' in-edges (red)
ks, vs = zip(*sorted(cdegree.items()))
plt.plot(ks, vs, 'bo-')
kb, vb = zip(*sorted(CBad.items()))
plt.plot(kb, vb, 'ro-')
In [ ]:
print(skew(baddegree), kurtosis(baddegree))
In [ ]:
from scipy.stats import spearmanr, pearsonr
import numpy as np
# correlation between the number of cross-class ('bad') in-edges and the total in-degree
corr = np.array([[counts['bad'], counts['all']] for counts in good_bad_edges.values()])
print(spearmanr(corr[:, 0], corr[:, 1]), pearsonr(corr[:, 0], corr[:, 1]))