In [ ]:
import sframe as sf
In [ ]:
import sframe as sf
from gensim import corpora, models, similarities
from gensim.utils import simple_preprocess
class MyCorpus(object):
    """Streaming corpus backed by a tab-separated file loaded into an SFrame.

    Column X1 holds the raw tweet text; column X2 holds its class label
    (SFrame auto-names columns X1, X2, ... when header=False).
    """

    def __init__(self, path):
        # Load the whole TSV once; iteration below streams over the text column.
        self.data = sf.SFrame.read_csv(path, sep="\t", header=False)

    def __iter__(self):
        # Yield one tokenized document (list of lowercase tokens) per row.
        for raw_text in self.data['X1']:
            yield simple_preprocess(raw_text)

    def klass(self):
        # Class labels, aligned row-for-row with the documents yielded above.
        return self.data['X2']
In [ ]:
tweets = MyCorpus("Bases/Prontas/superbow/sentic.patter.en-superbow2013.txt")
In [ ]:
tweets.data.stack
In [ ]:
# Map every token to an integer id, turn each document into a bag-of-words
# vector, and fit a TF-IDF model over the whole corpus.
dictionary = corpora.Dictionary(tweets)
corpus = list(map(dictionary.doc2bow, tweets))
tfidf = models.TfidfModel(corpus)
In [ ]:
# Build a graph with one vertex per tweet; the tweet's class label is stored
# as a vertex attribute.
G = sf.SGraph()
# enumerate() replaces the manual `k = k + 1` counter, and a single batched
# add_vertices call replaces rebuilding the (immutable) SGraph once per tweet.
vertices = [sf.Vertex(vid, attr={'klass': label})
            for vid, label in enumerate(tweets.klass())]
G = G.add_vertices(vertices)
In [ ]:
# Link every tweet to its 10 most similar tweets (num_best=11 includes the
# query tweet itself, filtered out below), weighted by TF-IDF cosine similarity.
index = similarities.Similarity('tmp', tfidf[corpus],
                                num_features=len(dictionary.keys()),
                                num_best=11)
for tid, tokens in enumerate(tweets):
    edges = [sf.Edge(tid, neighbour_id, attr={'weight': score})
             for neighbour_id, score in index[tfidf[dictionary.doc2bow(tokens)]]
             if neighbour_id != tid]
    # Workaround for the 0.0-neutral case: edge insertion can fail for
    # degenerate results, so it is treated as best-effort. Narrowed from a
    # bare `except:` so KeyboardInterrupt/SystemExit still propagate.
    try:
        G = G.add_edges(edges)
    except Exception:
        pass
In [ ]:
In [ ]:
# Rebuild the graph from scratch: one vertex per tweet, labelled with its class.
G = sf.SGraph()
klass = tweets.klass()
# enumerate() replaces index-based access via range(0, klass.size()), and a
# single batched add_vertices avoids rebuilding the immutable SGraph per vertex.
G = G.add_vertices([sf.Vertex(vid, attr={'klass': label})
                    for vid, label in enumerate(klass)])
In [ ]: