Mikhail Kolodin. Project: Internet temperature. 2015-12-15 1.1.2
IPython research notebook for the "Internet temperature" project. For now only the fontanka.ru website is used; other sites and methods will be added later.
In [117]:
import datetime
from urllib.parse import urljoin

import lxml.html as lh
import requests
# Local date/time at the moment this cell runs; its year/month/day select
# which daily archive page is requested below.
now = datetime.datetime.now()
In [118]:
# Base URL of the fontanka.ru archive; a date path and page name are
# appended to it, so the trailing slash is significant.
url = "http://www.fontanka.ru/fontanka/"
In [124]:
# Build the address of today's "all news" page:
#   <url>YYYY/MM/DD/all.html
myyear = now.year
mymonth = now.month
myday = now.day

# Zero-padded date path, e.g. "2015/12/15".
date_fields = (myyear, mymonth, myday)
plus = "{0:04d}/{1:02d}/{2:02d}".format(*date_fields)

fullurl = "".join((url, plus, '/all.html'))
print("Getting data from {}".format(fullurl))
In [125]:
# Download the daily archive page and parse it into an lxml element tree.
# An explicit timeout is passed because requests otherwise waits without
# limit on a stalled connection, which would hang the notebook cell.
page = requests.get(fullurl, timeout=30)
tree = lh.fromstring(page.text)
#print(tree.text_content())
In [126]:
# Select the news entry <div> elements of each category. The match is on
# the exact, complete value of the class attribute.
bloks_spb = tree.xpath(
    "//div[@class='entry article switcher-all-news switcher-spb-news']"
)
bloks_rus = tree.xpath(
    "//div[@class='entry article switcher-all-news switcher-russian-news']"
)
bloks_world = tree.xpath(
    "//div[@class='entry article switcher-all-news switcher-world-news']"
)

# Every entry in a single list: spb first, then Russian, then world news.
bloks = []
for group in (bloks_spb, bloks_rus, bloks_world):
    bloks.extend(group)
In [127]:
# Pair every entry element with a short category tag so the origin of an
# entry is still known after the three lists are merged.
blogs_spb = [("spb", entry) for entry in bloks_spb]
blogs_rus = [("rus", entry) for entry in bloks_rus]
blogs_world = [("mir", entry) for entry in bloks_world]

# Merged list of (tag, element) tuples, in spb / rus / mir order.
blogs = blogs_spb + blogs_rus + blogs_world
#print (blogs)
In [183]:
def procref(addr, timeout=30):
    """Fetch a news article page and print its body paragraphs.

    Downloads ``addr``, looks for the first ``<div class="article_fulltext">``
    and prints the list of its direct ``<p>`` child elements. Prints the
    string ``None`` when the page has no such div.

    Args:
        addr: URL of the article page. An empty string or ``None`` makes the
            function return immediately without any network access.
        timeout: Seconds to wait for the server before requests gives up.

    Returns:
        None. The result is only printed.
    """
    if not addr:
        return
    page = requests.get(addr, timeout=timeout)
    tree = lh.fromstring(page.text)
    full = tree.xpath("//div[@class='article_fulltext']")
    # A missing article body is the only expected failure, so it is tested
    # for explicitly instead of hiding every error behind a bare except.
    if not full:
        print("None")
        return
    # TODO: this prints the <p> element objects, not their text.
    print(full[0].xpath("./p"))
In [184]:
# Print a one-line summary (category tag, date, time, headline, link) for
# the first ten collected entries.
for blog in blogs[:10]:
    blok = blog[1]
    dt = blok.xpath("div[@class='entry_date']")
    # Keep only entries whose date field has ':' as its third character
    # (it then looks like a time such as "12:34"). A missing, empty or
    # too-short date field skips the entry instead of raising.
    stamp = (dt[0].text or "").strip() if dt else ""
    if len(stamp) < 3 or stamp[2] != ":":
        continue
    tit = blok.xpath("div[@class='entry_title']")
    ref = tit[0].xpath("a[@href]") if tit else []
    if not ref:
        # No linked headline in this entry, nothing to report.
        continue
    print(blog[0], plus, stamp, end=" ")
    print("text = [{}]".format((ref[0].text or "").strip()), end=" ")
    # Resolve the link against the base URL. Plain concatenation turned a
    # root-relative href such as "/2015/12/15/001/" into
    # "http://www.fontanka.ru/fontanka//2015/12/15/001/"; urljoin resolves
    # it from the site root and leaves absolute URLs unchanged.
    goes = urljoin(url, ref[0].get("href"))
    print("goto = [{}]".format(goes))
#    procref(goes)
# The total counts every collected entry, not only the ten listed above.
print("...\nTotal records: {}".format(len(blogs)))
In [ ]: