In [ ]:
from gensim import corpora, models, similarities
from gensim.utils import simple_preprocess
import pandas as pd

class MyCorpus(object):
    """Streams one tokenized tweet at a time."""
    def __init__(self, path):
        # tab-separated file with a header row; the first column is the index
        self.data = pd.read_csv(path, sep="\t", header=0, index_col=0).drop_duplicates()

    def __iter__(self):
        for sent in self.data['tweet']:
            yield self.pre_process(sent)

    def pre_process(self, text):
        # lowercase, strip punctuation and tokenize
        return simple_preprocess(str(text))
In [ ]:
mc = MyCorpus('/home/kadnoise/Desktop/Sample2.txt')
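In [ ]:
# Optional sanity check (a sketch, not part of the original pipeline):
# peek at the first two tokenized tweets to confirm the corpus streams.
import itertools
for doc in itertools.islice(iter(mc), 2):
    print(doc)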
In [ ]:
# map each token to an integer id, build the bag-of-words corpus,
# weight it with TF-IDF and project it into LSI space
dictionary = corpora.Dictionary(mc)
corpus = [dictionary.doc2bow(text) for text in mc]
tfidf = models.TfidfModel(corpus, id2word=dictionary, normalize=True)
tfidf_corpus = tfidf[corpus]
lsi_model = models.LsiModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=len(dictionary.keys()))
lsi_corpus = lsi_model[tfidf_corpus]
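In [ ]:
# Optional sketch: peek at the LSI vector of the first document, i.e. the
# (topic_id, weight) pairs that the similarity index will consume.
first_vec = next(iter(lsi_corpus))
print(first_vec[:5])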
In [ ]:
import networkx as nx
G = nx.DiGraph()
# each text is a node of the graph;
# the class (sentiment) of the text is stored as a node attribute
for row in mc.data.itertuples():
    G.add_node(row.Index, klass=row.sentiment)
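In [ ]:
# Optional sketch: confirm every tweet became a node and inspect the class
# balance (assumes the 'sentiment' column holds the class labels).
from collections import Counter
print(len(G), Counter(nx.get_node_attributes(G, 'klass').values()))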
In [ ]:
# build the similarity index used to find the k nearest neighbours of each
# node; num_best is k + 1 because every document is its own nearest
# neighbour, so num_best=11 yields a graph with 10 neighbours per node
index = similarities.Similarity(None, lsi_corpus, num_features=len(dictionary.keys()), num_best=11)
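In [ ]:
# Optional sketch: query the index with one document; each hit is a
# (document_position, cosine_similarity) pair, and the top hit should be
# the query document itself with similarity close to 1.0.
query = next(iter(lsi_corpus))
print(index[query])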
In [ ]:
# add the edges to the graph: connect each document to its nearest
# neighbours, weighting every edge by the cosine similarity
names = mc.data.index
for k in names:
    # apply the same TF-IDF + LSI transform used to build the index
    cap = lsi_model[tfidf[dictionary.doc2bow(mc.pre_process(mc.data['tweet'][k]))]]
    for nn in index[cap]:
        if k != names[nn[0]]:
            G.add_edge(k, names[nn[0]], weight=nn[1])
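In [ ]:
# Optional sketch: by construction every node should have out-degree 10
# (its 10 nearest neighbours); verify that before analysing in-degrees.
print(all(d == 10 for d in dict(G.out_degree()).values()))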
In [ ]:
# compute the in-degree distribution of the nodes;
# since the out-degree is always 10 (by construction),
# only the in-degree is informative
from collections import Counter
degree = list(dict(G.in_degree()).values())
cdegree = Counter(degree)
In [ ]:
# skewness and kurtosis measure how far from uniform the distribution is
from scipy.stats import skew, kurtosis
print(skew(degree), kurtosis(degree))
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
# sort the degrees so the line plot is monotonic along the x axis
xs = sorted(cdegree)
plt.plot(xs, [cdegree[x] for x in xs], 'bo-')
In [ ]:
good_bad_edges = {}
for k in names:
good_bad_edges[k] = {}
good_bad_edges[k]['good'] = 0
good_bad_edges[k]['bad'] = 0
good_bad_edges[k]['all'] = 0
for edge in G.in_edges(k):
if G.node[edge[0]]['klass'] == G.node[edge[1]]['klass']:
good_bad_edges[k]['good']+=1
else:
good_bad_edges[k]['bad']+=1
good_bad_edges[k]['all']+=1
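In [ ]:
# Optional sketch: the overall fraction of 'bad' (class-crossing) edges is a
# rough inverse measure of homophily; lower values mean neighbours tend to
# share the same sentiment.
total_bad = sum(d['bad'] for d in good_bad_edges.values())
total_all = sum(d['all'] for d in good_bad_edges.values())
print(float(total_bad) / total_all)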
In [ ]:
baddegree = [d['bad'] for d in good_bad_edges.values()]
CBad = Counter(baddegree)
# overlay the 'bad'-edge distribution (red) on the in-degree distribution (blue)
xs = sorted(cdegree)
plt.plot(xs, [cdegree[x] for x in xs], 'bo-')
xb = sorted(CBad)
plt.plot(xb, [CBad[x] for x in xb], 'ro-')
In [ ]:
# another dataset: convert it to the tab-separated format MyCorpus expects
import pandas as pd
data = pd.read_csv('Bases/Prontas/superbow/sentic.patter.en-superbow2013.txt', sep='\t', header=None)
with open('Bases/Prontas/superbow/sentic.patter.en-superbow2013-2.txt', 'w') as writex:
    writex.write('id' + '\t' + 'tweet' + '\t' + 'sentiment' + '\n')
    for i, row in enumerate(data.values):
        writex.write('SENT_%s' % i + '\t' + str(row[0]) + '\t' + str(row[1]) + '\n')
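In [ ]:
# Optional sketch: the same conversion with pandas alone; assumes the two
# raw columns are (tweet, sentiment), matching the loop above.
out = data.rename(columns={0: 'tweet', 1: 'sentiment'})
out.index = ['SENT_%s' % i for i in range(len(out))]
out.to_csv('Bases/Prontas/superbow/sentic.patter.en-superbow2013-2.txt', sep='\t', index_label='id')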
In [ ]:
# gensim's lemmatizer; it depends on the external 'pattern' package
# and was removed in gensim 4.x
from gensim.utils import lemmatize
lemmatize("that")
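In [ ]:
# Optional sketch (hypothetical, not part of the original pipeline): a
# MyCorpus variant whose pre_process lemmatizes instead of just tokenizing;
# lemmatize() yields 'token/POS' byte strings, so the POS tag is stripped.
class MyLemmaCorpus(MyCorpus):
    def pre_process(self, text):
        return [tok.split(b'/')[0].decode('utf8') for tok in lemmatize(str(text))]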