In [69]:
# define separate function to use recursion
def listComments(artikkel, reg, url, commentList):
    """Scrape one page of Delfi article comments and append them to commentList.

    Follows the pager's "next page" link recursively, so a single call collects
    all remaining pages of comments for the given registration type.

    Parameters:
        artikkel    -- article URL, stored with each comment for provenance
        reg         -- 0 for anonymous comments, 1 for registered users
        url         -- comment-page URL to fetch
        commentList -- list of dicts, mutated in place (one dict per comment)
    """
    # %-formatted print() call: same output as the original Python 2 print
    # statement, but also valid under Python 3
    print('requesting: %s' % url)
    page = requests.get(url)
    tree = html.fromstring(page.content)
    comments = tree.xpath('//div[@class="comment-content-inner"]')
    authors = tree.xpath('//div[@class="comment-author"]')
    commentDates = tree.xpath('//div[@class="comment-date"]/text()')
    commentVotesUp = tree.xpath('//div[@class="comment-votes-up"]/a/span[@class="comment-votes-count"]/text()')
    commentVotesDown = tree.xpath('//div[@class="comment-votes-down"]/a/span[@class="comment-votes-count"]/text()')
    commentsPagerNext = tree.xpath('//div[@class="comments-pager comments-pager-top"]/a[@class="comments-pager-arrow-last"]')
    # collect every comment found on this page
    # NOTE(review): the original computed an unused index ii = i * 2 + 1 with a
    # comment saying vote counts appear twice; the code actually indexes the
    # vote lists with i. If vote spans really are duplicated per comment, the
    # alignment below is wrong — TODO confirm against the live page markup.
    for i in range(len(comments)):
        # strip the wrapper div markup and unescape HTML entities in the body
        comment = h.unescape(html.tostring(comments[i])).replace("</div>", "").replace("<div class=\"comment-content-inner\">", "").strip()
        # make line breaks visible as a single marker character
        comment = comment.replace("<br>\n", u'⏎')
        # collapse DELFI's styled self-mention back to plain text
        comment = comment.replace('<font class="delfiCoB">D</font><font class="delfiCoY">E</font><font class="delfiCoB">LFI</font>', 'DELFI')
        # strip the author name out of its HTML wrapper / profile link
        author = h.unescape(html.tostring(authors[i])).replace("</div>", "").replace("<div class=\"comment-author\">", "").replace('<a href="javascript:void(1)" onclick="CommentList.showUserComments(this)">', "").replace("</a>", "").strip()
        ts = datetime.strptime(commentDates[i].strip(), '%d.%m.%Y %H:%M')
        commentItem = {'artikkel':artikkel, 'reg':reg, 'indx_page':i, 'author': author.encode('utf-8'),'date': ts, 'commentVotesUp': int(commentVotesUp[i].strip()), 'commentVotesDown': int(commentVotesDown[i].strip()), 'comment': comment.encode('utf-8'),'commentUrl': url}
        commentList.append(commentItem)
    # if there are further comment pages, take the pager URL and fetch it too,
    # reusing this same function (single-branch recursion)
    for pager in commentsPagerNext:  # in practice 1 link, or 0 on the last page
        newUrl = pager.attrib.get('href')
        listComments(artikkel, reg, newUrl, commentList)  # recurse
In [70]:
# actual loading of comments of a page
from lxml import html
import requests
import HTMLParser
from datetime import datetime

# Base article URL. The paging/registration query parameters are appended in
# the loop below. Two fixes versus the original: the literal "&reg=" had been
# mangled into the single character "®" by HTML-entity decoding (breaking the
# query string), and the base URL itself carried a duplicated copy of the
# com/reg/no/s parameters that the loop then re-appended.
artikkel = "http://publik.delfi.ee/news/inimesed/video-blogi-ja-fotod-luisa-roivas-oma-abikaasa-umber-lahvatanud-ahistamisskandaalist-olen-hairitud-ja-taavis-pettunud?id=79814146"

# global settings
h = HTMLParser.HTMLParser()  # used by listComments to unescape HTML entities
commentList = []
for reg in range(2):  # 0 - anonymous commenters, 1 - registered users
    url = artikkel + "&com=1&reg=%s&no=0&s=1" % (reg)
    listComments(artikkel, reg, url, commentList)
In [72]:
# convert dict list to pandas DataFrame for easier manipulation
import pandas as pd

df = pd.DataFrame(commentList)
# Check data: structure/fields, and first rows. Only the LAST expression of a
# cell is rendered, so the original bare `df.head()` line produced no output
# at all — wrap it in display() so both the sample rows and dtypes show.
display(df.head())
df.dtypes
Out[72]:
In [73]:
# total engagement per comment = up-votes plus down-votes
df['totalVotes'] = df.commentVotesUp + df.commentVotesDown
# widen the column display so comment texts are not truncated
pd.set_option('display.max_colwidth', 500)
# show the most-voted comments first
dfSorted = df.sort_values(by='totalVotes', ascending=False)
dfSorted.head()
Out[73]:
In [63]:
# print graph for comment votes over time.
# Not sure how to interpret this really
%matplotlib inline
import matplotlib
matplotlib.style.use('ggplot')
df.plot(x='date', y=['commentVotesUp','commentVotesDown','totalVotes'])
Out[63]:
In [ ]: