In [ ]:
from gensim import corpora, models, similarities
from gensim.utils import simple_preprocess
import pandas as pd

class MyCorpus(object):
    """Streams tweets from a tab-separated file, one pre-processed document at a time."""

    def __init__(self, path):
        # expected columns: an id (used as the index), 'tweet' and 'sentiment'
        self.data = pd.read_csv(path, sep="\t", header=0, index_col=0).drop_duplicates()

    def __iter__(self):
        for sent in self.data['tweet']:
            yield self.pre_process(sent)

    def pre_process(self, text):
        # lower-case, tokenize and drop very short or very long tokens
        return simple_preprocess(str(text))

In [ ]:
mc = MyCorpus('/home/kadnoise/Desktop/Sample2.txt')
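
In [ ]:
# Peek at the first pre-processed document (a quick sketch): simple_preprocess
# lower-cases, tokenizes and drops very short or very long tokens.
print(next(iter(mc)))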

In [ ]:
dictionary = corpora.Dictionary(mc)
corpus = [dictionary.doc2bow(text) for text in mc]

tfidf = models.TfidfModel(corpus, id2word=dictionary, normalize=True)
tfidf_corpus = tfidf[corpus]

lsi_model = models.LsiModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=len(dictionary.keys()))
lsi_corpus = lsi_model[tfidf_corpus]
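
In [ ]:
# Optional sanity check (a sketch, not part of the original pipeline): how a
# single tweet maps through bag-of-words -> tf-idf -> LSI; each step yields a
# list of (id, weight) pairs.
sample_bow = dictionary.doc2bow(mc.pre_process(mc.data['tweet'].iloc[0]))
print(tfidf[sample_bow][:5])
print(lsi_model[tfidf[sample_bow]][:5])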

In [ ]:
import networkx as nx

G = nx.DiGraph()

# each text is a node of the graph
# the text's class (sentiment) is stored as a node attribute
for row in mc.data.itertuples():
    G.add_node(row.Index, klass=row.sentiment)
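
In [ ]:
# Quick check (sketch): node count and class distribution, assuming the
# 'sentiment' column holds the class labels as above.
print(G.number_of_nodes())
print(mc.data['sentiment'].value_counts())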

In [ ]:
index = similarities.Similarity(None, lsi_corpus,num_features=len(dictionary.keys()),num_best=11)
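
In [ ]:
# Example query (a sketch): the num_best=11 documents most similar to the first
# tweet, returned as (corpus position, cosine similarity) pairs; the top hit is
# the document itself.
query = lsi_model[tfidf[corpus[0]]]
print(index[query])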

In [ ]:
# add the edges to the graph

# document names (the DataFrame index)...
# auxiliary variable...
names = mc.data.index

#index = similarities.MatrixSimilarity(tfidf[corpus],num_features=len(dictionary.keys()),num_best=11)

# build the similarity model
# to find the k nearest neighbours of each node
# num_best is the number of k neighbours + 1 (since each node is its own neighbour)
# num_best=11 yields a graph with 10 neighbours per node
#index = similarities.Similarity('tmp',tfidf[corpus],num_features=len(dictionary.keys()),num_best=11)

for k in names:
    # project the query through tf-idf and then LSI, the same pipeline used to build the index
    cap = lsi_model[tfidf[dictionary.doc2bow(mc.pre_process(mc.data['tweet'][k]))]]
    for nn in index[cap]:
        if not k == names[nn[0]]:
            G.add_edge(k, names[nn[0]], weight=nn[1])
    del cap
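
In [ ]:
# Quick check (sketch): by construction each node links to its 10 nearest
# neighbours, so the edge count should be close to 10 * number of nodes.
print(G.number_of_nodes(), G.number_of_edges())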

In [ ]:
# compute the degree distribution of the nodes

from collections import Counter

# since the out_degree is always 10 (by construction),
# it is enough to look at the in_degree
degree = list(dict(G.in_degree()).values())
cdegree = Counter(degree)

In [ ]:
# skewness and kurtosis measure how asymmetric and heavy-tailed the degree distribution is

from scipy.stats import skew, kurtosis

print(skew(degree), kurtosis(degree))

In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt

# plot the in-degree distribution (sorted by degree so the line is monotone in x)
xs = sorted(cdegree)
plt.plot(xs, [cdegree[x] for x in xs], 'bo-')

In [ ]:
good_bad_edges = {}

# for each node, count in-edges coming from neighbours of the same class ("good")
# and from neighbours of a different class ("bad")
for k in names:
    good_bad_edges[k] = {'good': 0, 'bad': 0, 'all': 0}
    for edge in G.in_edges(k):
        if G.nodes[edge[0]]['klass'] == G.nodes[edge[1]]['klass']:
            good_bad_edges[k]['good'] += 1
        else:
            good_bad_edges[k]['bad'] += 1
        good_bad_edges[k]['all'] += 1
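
In [ ]:
# Aggregate view (a sketch): overall fraction of cross-class ("bad") in-edges,
# a rough measure of how well the k-NN neighbourhoods respect the sentiment labels.
total = sum(d['all'] for d in good_bad_edges.values())
bad = sum(d['bad'] for d in good_bad_edges.values())
print(bad / float(total) if total else 0.0)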

In [ ]:
baddegree = [d['bad'] for d in good_bad_edges.values()]
CBad = Counter(baddegree)

# overlay the "bad" (cross-class) in-degree distribution on the full in-degree distribution
xs = sorted(cdegree)
plt.plot(xs, [cdegree[x] for x in xs], 'bo-')
xb = sorted(CBad)
plt.plot(xb, [CBad[x] for x in xb], 'ro-')

In [ ]:
# another data set: convert a two-column (tweet, sentiment) file into the
# id / tweet / sentiment format read by MyCorpus
import pandas as pd

data = pd.read_csv('Bases/Prontas/superbow/sentic.patter.en-superbow2013.txt', sep='\t', header=None)

writex = open('Bases/Prontas/superbow/sentic.patter.en-superbow2013-2.txt', 'w')
writex.write('id' + '\t' + 'tweet' + '\t' + 'sentiment' + '\n')

for i, row in enumerate(data.values):
    writex.write('SENT_%s' % i + '\t' + str(row[0]) + '\t' + str(row[1]) + '\n')
writex.close()
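
In [ ]:
# Equivalent conversion (a sketch) using pandas directly, assuming the same
# two-column (tweet, sentiment) layout; writes the same file as the loop above.
data2 = data.rename(columns={0: 'tweet', 1: 'sentiment'})
data2.index = ['SENT_%d' % i for i in range(len(data2))]
data2.to_csv('Bases/Prontas/superbow/sentic.patter.en-superbow2013-2.txt', sep='\t', index_label='id')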

In [2]:
# gensim.utils.lemmatize wraps the external `pattern` package
# (and has been removed in gensim 4.x)
from gensim.utils import lemmatize

In [11]:
# lemmatize keeps only content words (nouns, verbs, adjectives and adverbs),
# so a function word like "that" returns an empty list
lemmatize("that")


Out[11]:
[]

In [ ]: