In [ ]:
import sframe as sf
In [ ]:
import sframe as sf
from gensim import corpora, models, similarities
from gensim.utils import simple_preprocess
class MyCorpus(object):
    """Streaming corpus backed by a tab-separated file loaded into an SFrame.

    Column X1 holds the raw tweet text; column X2 holds its class label
    (SFrame auto-names columns X1, X2, ... when header=False).
    """

    def __init__(self, path):
        # Load the whole TSV once; iteration below streams over the text column.
        self.data = sf.SFrame.read_csv(path, sep="\t", header=False)

    def __iter__(self):
        # Yield one tokenized document (list of lowercase tokens) per row.
        for raw_text in self.data['X1']:
            yield simple_preprocess(raw_text)

    def klass(self):
        # Class labels, aligned row-for-row with the documents yielded above.
        return self.data['X2']
In [ ]:
tweets = MyCorpus("Bases/Prontas/superbow/sentic.patter.en-superbow2013.txt")
In [ ]:
tweets.data.stack
In [ ]:
# Map every token to an integer id, turn each document into a bag-of-words
# vector, and fit a TF-IDF model over the whole corpus.
dictionary = corpora.Dictionary(tweets)
corpus = list(map(dictionary.doc2bow, tweets))
tfidf = models.TfidfModel(corpus)
In [ ]:
# Build a graph with one vertex per tweet; the tweet's class label is stored
# as a vertex attribute.
G = sf.SGraph()
# enumerate() replaces the manual `k = k + 1` counter, and a single batched
# add_vertices call replaces rebuilding the (immutable) SGraph once per tweet.
vertices = [sf.Vertex(vid, attr={'klass': label})
            for vid, label in enumerate(tweets.klass())]
G = G.add_vertices(vertices)
In [ ]:
# Link every tweet to its 10 most similar tweets (num_best=11 includes the
# query tweet itself, filtered out below), weighted by TF-IDF cosine similarity.
index = similarities.Similarity('tmp', tfidf[corpus],
                                num_features=len(dictionary.keys()),
                                num_best=11)
for tid, tokens in enumerate(tweets):
    edges = [sf.Edge(tid, neighbour_id, attr={'weight': score})
             for neighbour_id, score in index[tfidf[dictionary.doc2bow(tokens)]]
             if neighbour_id != tid]
    # Workaround for the 0.0-neutral case: edge insertion can fail for
    # degenerate results, so it is treated as best-effort. Narrowed from a
    # bare `except:` so KeyboardInterrupt/SystemExit still propagate.
    try:
        G = G.add_edges(edges)
    except Exception:
        pass
In [ ]:
In [ ]:
# Rebuild the graph from scratch: one vertex per tweet, labelled with its class.
G = sf.SGraph()
klass = tweets.klass()
# enumerate() replaces index-based access via range(0, klass.size()), and a
# single batched add_vertices avoids rebuilding the immutable SGraph per vertex.
G = G.add_vertices([sf.Vertex(vid, attr={'klass': label})
                    for vid, label in enumerate(klass)])
In [ ]: