In [ ]:
from gensim import corpora, models, similarities
from gensim.utils import simple_preprocess
import pandas as pd
class MyCorpus(object):
    """Streaming corpus over a tab-separated tweet file.

    Loads the file once (dropping exact duplicate rows), then yields one
    token list per tweet each time the corpus is iterated, so gensim can
    stream it without holding all token lists in memory.
    """

    def __init__(self, path):
        # Tab-separated file; first row is the header, first column the index.
        frame = pd.read_csv(path, sep="\t", header=0, index_col=0)
        self.data = frame.drop_duplicates()

    def __iter__(self):
        # One tokenised document per tweet, in file order.
        for tweet in self.data['tweet']:
            yield self.pre_process(tweet)

    def pre_process(self, text):
        # str() guards against non-string cells (e.g. NaN) before tokenising.
        return simple_preprocess(str(text))
In [ ]:
mc = MyCorpus('Bases/Prontas/superbow/sentic.patter.en-superbow2013-2.txt')
In [ ]:
# Vocabulary: maps each token to an integer id (one full pass over the corpus).
dictionary = corpora.Dictionary(mc)
# Bag-of-words representation of every tweet (second pass over the corpus).
corpus = [dictionary.doc2bow(text) for text in mc]
# Fit a tf-idf model on the BoW corpus, then reweight the whole corpus with it.
tfidf = models.TfidfModel(corpus)
tfidf_corpus = tfidf[corpus]
In [ ]:
import networkx as nx

# Directed similarity graph: one node per tweet, keyed by the DataFrame
# index, with the tweet's sentiment label stored as the node attribute
# 'klass' ('class' is a reserved word in Python).
G = nx.DiGraph()
for idx, row in mc.data.iterrows():
    G.add_node(idx, klass=row['sentiment'])
In [ ]:
# Disk-backed similarity index over the tf-idf corpus ('tmp' is the prefix
# for the shard files it writes). num_best=11 keeps the 11 strongest matches
# per query — presumably the document itself plus its 10 nearest neighbours.
# len(dictionary) is the vocabulary size (Dictionary implements __len__;
# no need to materialise .keys()).
index = similarities.Similarity('tmp', tfidf_corpus, num_features=len(dictionary), num_best=11)
# Node labels in corpus order, so similarity positions map back to tweet ids.
names = mc.data.index
In [ ]:
# Connect each tweet to its most similar tweets, weighted by tf-idf cosine
# similarity. The original code took item[0] as the query document itself,
# which silently relies on every query being its own top hit — a tie or a
# near-duplicate tweet can break that and attach edges to the wrong source
# node. enumerate() gives the query's corpus position explicitly, and only
# the genuine self-match is skipped.
for query_id, neighbours in enumerate(index[tfidf_corpus]):
    source = names[query_id]
    for doc_id, score in neighbours:
        if doc_id == query_id:
            continue  # skip the self-similarity hit
        G.add_edge(source, names[doc_id], weight=score)