In [76]:
import string
import time

import numpy as n

from SPARQLWrapper import SPARQLWrapper, JSON
In [6]:
# Namespace prefixes prepended to every SPARQL query below:
# W3C RDF/RDFS, the social-participation ontologies (ops/opa),
# FOAF, Dublin Core terms, SIOC types, and schema.org.
PREFIX="""PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX ops: <http://purl.org/socialparticipation/ops#>
PREFIX opa: <http://purl.org/socialparticipation/opa#>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX tsioc: <http://rdfs.org/sioc/types#>
PREFIX schema: <http://schema.org/>
"""
In [10]:
# Fetch every comment from the local Participa.br SPARQL endpoint,
# timing the round trip.  Title and body text are OPTIONAL in the query,
# so result bindings may lack the "titulo" and/or "texto" keys.
NOW=time.time()
q="SELECT ?comentario ?titulo ?texto WHERE {?comentario dc:type tsioc:Comment. OPTIONAL {?comentario dc:title ?titulo . } OPTIONAL {?comentario schema:text ?texto .}}"
# NOTE(review): hardcoded local endpoint (port 82) — runs only where that
# Fuseki/endpoint is up; consider making the URL a config constant.
sparql3 = SPARQLWrapper("http://localhost:82/participabr/query")
sparql3.setQuery(PREFIX+q)
sparql3.setReturnFormat(JSON)
# results4 is the standard SPARQL JSON results dict; rows live under
# results4["results"]["bindings"].
results4 = sparql3.query().convert()
print("%.2f segundos para puxar todos os comentários do Participa.br"%(time.time()-NOW,))
In [92]:
# Selecting messages with longer words and a minimum word-count threshold.
# Each element of msgs is one SPARQL result row (a dict of variable bindings).
msgs=results4["results"]["bindings"]
# Characters stripped from message bodies before tokenizing: ASCII
# punctuation plus common typographic marks (curly quotes, bullet, en dash).
exclude = set(string.punctuation + u'\u201c' + u'\u2018' + u'\u201d' + u'\u2022' + u'\u2013')

def atributos(__msg):
    """Return (word count, mean word length, std of word lengths) for a message.

    ``__msg`` is a SPARQL result binding; ``texto`` is OPTIONAL in the query,
    so a missing binding is treated as an empty message instead of raising
    KeyError (as the original did).
    """
    texto = __msg.get("texto", {}).get("value", "")
    # Remove punctuation, then split on whitespace to get words.
    texto_ = ''.join(ch for ch in texto if ch not in exclude)
    tams = [len(palavra) for palavra in texto_.split()]
    if not tams:
        # Empty message: avoid numpy's nan + RuntimeWarning on mean/std of [].
        return 0, 0.0, 0.0
    return len(tams), n.mean(tams), n.std(tams)
In [97]:
# Per-message attribute tuples (word count, mean length, std of lengths).
atrs = [atributos(msg) for msg in msgs]
In [98]:
atrs_=n.array(atrs)
In [117]:
# Word-count window for selecting mid-length messages.
max_palavras = 115
min_palavras = 110
# Combine the boolean masks with &, numpy's idiomatic elementwise AND
# (the original multiplied the masks, which works but obscures intent).
n_msgs = ((atrs_[:, 0] > min_palavras) & (atrs_[:, 0] < max_palavras)).sum()
print(u"são %i mensagens com mais de %i palavras e menos de %i" %
      (n_msgs, min_palavras, max_palavras))
In [127]:
# Histogram of message word counts, 100 bins.
# NOTE(review): mid-notebook import; pylab is deprecated in favor of
# matplotlib.pyplot — fine for exploration, worth moving to the top cell.
import pylab as p
p.hist(atrs_[:,0],100)
Out[127]:
In [122]:
p.show()
In [101]:
atrs[1]
Out[101]:
In [103]:
atrs_[10]
Out[103]:
In [105]:
msgs[0]["texto"]["value"]
Out[105]:
In [102]:
atrs[2]
Out[102]:
In [67]:
# Data loaded; now process: flat list of every word from every comment,
# lowercased and with punctuation stripped.
NOW=time.time()
import string, nltk as k
# " ".join(...) replaces the Python 2-only string.join(); .get() guards
# against rows whose OPTIONAL "texto" binding is absent.
palavras = " ".join(i.get("texto", {}).get("value", "").lower()
                    for i in results4["results"]["bindings"])
exclude = set(string.punctuation + u'\u201c' + u'\u2018' + u'\u201d' + u'\u2022' + u'\u2013')
palavras = ''.join(ch for ch in palavras if ch not in exclude)
palavras_ = palavras.split()
print(u"feita lista de todas as palavras de todos os comentários em %.2f"%(time.time()-NOW,))
In [68]:
# Remove Portuguese stopwords, then count word frequencies.
NOW=time.time()
stopwords = set(k.corpus.stopwords.words('portuguese'))
palavras__ = [palavra for palavra in palavras_ if palavra not in stopwords]
fdist_ = k.FreqDist(palavras__)
print("retiradas stopwords feita contagem das palavras em %.2f"%(time.time()-NOW,))
In [69]:
for fd,ii in [(fdist_[i],i) for i in fdist_.keys()[:20]]: print fd, ii
In [70]:
fdist_.keys()[:20]
Out[70]:
In [ ]: