In [ ]:
from gensim import corpora, models, similarities
from gensim.utils import simple_preprocess
import pandas as pd
class MyCorpus(object):
    """Re-iterable corpus of tokenized tweets read from a tab-separated file.

    The file is loaded once by the constructor; each iteration tokenizes the
    ``tweet`` column again, so the corpus can be consumed several times.
    """

    def __init__(self, path):
        """Load the tab-separated file at *path* and drop duplicated rows.

        The first line of the file is used as the header. The index is not
        reset after de-duplication, so ``self.data.index`` may contain gaps.
        """
        self.data = pd.read_csv(path, sep="\t", header=0).drop_duplicates()

    def __iter__(self):
        """Yield one list of tokens per entry of the ``tweet`` column."""
        for sent in self.data['tweet']:
            yield self.pre_process(sent)

    def pre_process(self, text):
        """Return the tokens of *text*; a missing value yields no tokens.

        Without the guard, ``str()`` would turn a missing cell (NaN/None)
        into the literal text "nan"/"None", which would then be tokenized
        and indexed as if it were a real word of the tweet.
        """
        if pd.isnull(text):
            return []
        return simple_preprocess(str(text))
In [ ]:
import time

# Load the dataset and report how long the load took, in seconds.
load_started = time.time()
mc = MyCorpus('Bases/Prontas/superbow/sentic.patter.en-superbow2013.txt')
print(time.time() - load_started)
In [ ]:
# Vocabulary built from every tokenized tweet of the corpus.
dictionary = corpora.Dictionary(mc)

# Bag-of-words vector of each tweet, then the TF-IDF model fitted on them.
bow_docs = [dictionary.doc2bow(tokens) for tokens in mc]
tfidf_model = models.TfidfModel(bow_docs, normalize=True)
tfidf_corpus = tfidf_model[bow_docs]

# Only ``dictionary`` and ``tfidf_corpus`` are used from here on; remove the
# intermediate names.
del bow_docs
del tfidf_model
In [ ]:
# Create a directed graph (digraph) with one node per text. The node key is
# the row's index label and the text's class (``sentiment`` column) is stored
# as the node attribute ``klass``.
import networkx as nx

G = nx.DiGraph()
G.add_nodes_from(
    (row.Index, {'klass': row.sentiment}) for row in mc.data.itertuples()
)
In [ ]:
def chunks(l, n):
    """Yield ``(start, stop)`` index bounds of successive n-sized slices of l.

    The sequence itself is never sliced here; callers apply the bounds
    themselves (``l[start:stop]``). The last pair is clipped to ``len(l)``,
    and an empty ``l`` yields nothing.

    Raises:
        ValueError: if ``n`` is not positive (raised when iteration starts).
    """
    if n <= 0:
        # range() already rejects a zero step; a negative step would silently
        # produce no bounds at all, hiding the caller's mistake.
        raise ValueError("chunk size must be positive, got %r" % (n,))
    total = len(l)
    for start in range(0, total, n):
        yield (start, min(start + n, total))
In [ ]:
# Build the similarity index over the TF-IDF vectors (queries are limited to
# their 11 best matches) and print the elapsed time in seconds.
index_started = time.time()
index = similarities.Similarity(
    None,
    tfidf_corpus,
    num_features=len(dictionary.keys()),
    num_best=11
)
# The vocabulary was only needed for its size.
del dictionary
print(time.time() - index_started)
In [ ]:
# Number of documents (rows kept after de-duplication). Queried on the index
# directly because ``names`` is only assigned in a later cell.
len(mc.data.index)
In [ ]:
# Connect every document to its most similar documents. Queries are sent to
# the index in batches of 25000; for each query the index returns up to 11
# (position, similarity) hits.
names = mc.data.index
max_neighbours = 10
for first, last in chunks(names, 25000):
    hits_per_doc = index[tfidf_corpus[first:last]]
    for offset, hits in enumerate(hits_per_doc):
        # The source node is the query document itself, identified by its
        # position in the batch. Taking it from the top hit is wrong whenever
        # another document ties with (or is identical to) the query.
        source_pos = first + offset
        source = names[source_pos]
        # Drop the self-match, then keep at most ``max_neighbours`` hits.
        neighbours = [(pos, sim) for pos, sim in hits if pos != source_pos]
        for pos, sim in neighbours[:max_neighbours]:
            G.add_edge(source, names[pos], weight=sim)
In [ ]:
# Compute the degree distribution of the graph's nodes.
from collections import Counter

# The out-degree is fixed (10) by construction, so the in-degree alone is
# enough to describe the distribution.
# dict(...) accepts both the plain dict returned by NetworkX 1.x and the
# degree view returned by NetworkX 2.x; list(...) yields a real sequence
# instead of a dict view on Python 3.
degree = list(dict(G.in_degree()).values())
cdegree = Counter(degree)
In [ ]:
# Skewness and kurtosis measure how non-uniform the degree distribution is.
from scipy.stats import skew, kurtosis

# A formatted string prints the same "skew kurtosis" text on Python 2 and 3;
# the bare print statement is a syntax error on Python 3.
print("%s %s" % (skew(degree), kurtosis(degree)))
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt

# Plot how many nodes have each in-degree. The points are sorted by degree
# first: Counter keys come in arbitrary order, and the connecting line of the
# 'bo-' style would otherwise zigzag between unrelated degrees.
degree_counts = sorted(cdegree.items())
plt.plot(
    [deg for deg, _ in degree_counts],
    [count for _, count in degree_counts],
    'bo-'
)
In [ ]: