In [1]:
import sframe as sf

In [2]:
import sframe as sf
from gensim import corpora, models, similarities
from gensim.utils import simple_preprocess

class MyCorpus(object):
    
    def __init__(self,path):
        self.data = sf.SFrame.read_csv(path,sep="\t",header=False)
        
    def __iter__(self):
        for sent in self.data['X1']:
            yield simple_preprocess(sent)
            
    def klass(self):
        return self.data['X2']


2016-05-16 13:56:33,444 [INFO] sframe.cython.cy_server, 172: SFrame v1.9 started. Logging /tmp/sframe_server_1463421393.log

In [7]:
tweets = MyCorpus("Bases/Prontas/superbow/sentic.patter.en-superbow2013.txt")


Finished parsing file /home/kadnoise/#Final_Project/Bases/Prontas/superbow/sentic.patter.en-superbow2013.txt
Parsing completed. Parsed 100 lines in 0.620554 secs.
Read 737499 lines. Lines per second: 1.25731e+06
Finished parsing file /home/kadnoise/#Final_Project/Bases/Prontas/superbow/sentic.patter.en-superbow2013.txt
Parsing completed. Parsed 3332459 lines in 1.88273 secs.
------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------

In [ ]:


In [8]:
dictionary = corpora.Dictionary(tweets)
corpus = [dictionary.doc2bow(text) for text in tweets]
tfidf = models.TfidfModel(corpus)

In [9]:
G = sf.SGraph()

# cada texto é um nó do grafo
# a classe do texto é um atributo do nó do grafo
for k,v in enumerate(tweets.klass()):
    G.add_vertices(sf.Vertex(k, attr={'klass' : v}))


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-9-46477dc58643> in <module>()
      4 # a classe do texto é um atributo do nó do grafo
      5 for k,v in enumerate(tweets.klass()):
----> 6     G.add_vertices(sf.Vertex(k, attr={'klass' : v}))

/usr/local/lib/python2.7/dist-packages/sframe/data_structures/sgraph.pyc in add_vertices(self, vertices, vid_field)
    651         with cython_context():
    652             proxy = self.__proxy__.add_vertices(sf.__proxy__, _VID_COLUMN)
--> 653             return SGraph(_proxy=proxy)
    654 
    655     def add_edges(self, edges, src_field=None, dst_field=None):

/usr/local/lib/python2.7/dist-packages/sframe/cython/context.pyc in __exit__(self, exc_type, exc_value, traceback)
     47             if not self.show_cython_trace:
     48                 # To hide cython trace, we re-raise from here
---> 49                 raise exc_type(exc_value)
     50             else:
     51                 # To show the full trace, we do nothing and let exception propagate

KeyboardInterrupt: 

In [ ]:
index = similarities.Similarity('tmp',tfidf[corpus],num_features=len(dictionary.keys()),num_best=11)

for tid, tweet in enumerate(tweets):
    for nn in index[tfidf[dictionary.doc2bow(tweet)]]:
        if not tid==nn[0]:
            G.add_edges(sf.Edge(tid,nn[0],attr={'weight':nn[1]}))

In [ ]: