# In [37]:
import os
import re
import urllib
import urllib.request

from bs4 import BeautifulSoup
# In [42]:
# Search-results page of the nsoud.cz "Judikatura" database.  URL-decoded, the
# Query parameter reads:
#   [datum_rozhodnuti]>=01/01/2007 AND [ARozhodnutiRT]=nutná obrana
# NOTE(review): start=1 / Count=500 / SearchMax=1000 presumably control paging
# and the hit limit on the server side -- confirm against the site.
link = "http://www.nsoud.cz/Judikatura/judikatura_ns.nsf/$$WebSearch1?SearchView&Query=%5Bdatum_rozhodnuti%5D%3E%3D01%2F01%2F2007%20AND%20%5BARozhodnutiRT%5D%3Dnutn%C3%A1%20obrana&SearchMax=1000&pohled=1&start=1&Count=500"

# Use the response as a context manager so the connection is closed as soon
# as the body has been read, even if read() raises.
with urllib.request.urlopen(link) as response:
    page = response.read()

# Name the parser explicitly: without it BeautifulSoup warns and silently picks
# whichever parser happens to be installed, so the parsed tree can differ from
# one machine to another.  'html.parser' ships with the standard library.
soup = BeautifulSoup(page, 'html.parser')
# In [71]:
# Download the Word export of every decision listed in the parsed results page
# (`soup`) into data/<file number>.doc.
#
# For each table row the code looks for two anchors:
#   * one whose href contains 'WebSearch'         -> its text names the file
#   * one whose href contains 'CreateWordDocBody' -> link to the .doc export

# Sent with every download request.
# NOTE(review): this is the search URL with Count=50, whereas the page itself
# was fetched with Count=500 -- kept exactly as it was; confirm whether the
# server cares about the difference.
referer = 'http://www.nsoud.cz/Judikatura/judikatura_ns.nsf/$$WebSearch1?SearchView&Query=%5Bdatum_rozhodnuti%5D%3E%3D01%2F01%2F2007%20AND%20%5BARozhodnutiRT%5D%3Dnutn%C3%A1%20obrana&SearchMax=1000&pohled=1&start=1&Count=50'

# Compile the href filters once instead of on every row.
websearch_href = re.compile('WebSearch')
word_doc_href = re.compile('CreateWordDocBody')

# Make sure the target directory exists before the first file is opened.
os.makedirs('data', exist_ok=True)

for row in soup.find_all('tr'):
    spzn_anchor = row.find('a', href=websearch_href)
    doc_anchor = row.find('a', href=word_doc_href)
    # find() returns None for rows that lack either anchor (e.g. header or
    # layout rows); skip those instead of crashing on `.text`.
    if spzn_anchor is None or doc_anchor is None:
        continue

    spzn = spzn_anchor.text
    # Read the attribute directly rather than slicing the tag's serialized
    # HTML.  BeautifulSoup already decodes '&amp;' to '&' in attribute values,
    # so no manual un-escaping is needed.
    doc_link = 'http://www.nsoud.cz' + doc_anchor['href']

    req = urllib.request.Request(doc_link)
    req.add_header('Referer', referer)

    # '/' in the file number would be treated as a path separator, so it is
    # replaced by '-'.  Both the HTTP response and the output file are closed
    # even if the transfer fails part-way.
    with urllib.request.urlopen(req) as r, \
            open('data/' + spzn.replace('/', '-') + '.doc', 'wb') as output:
        output.write(r.read())