In [1]:
import urllib2
import time
import pandas as pd
from bs4 import BeautifulSoup
import re
import os
import csv
import numpy as np
from geopy import geocoders
from sys import exit
import sys
In [21]:
# Download the daily-menu listing of every region from lunchtime.cz and store
# the raw HTML on disk: the first page as <region>.html and every page linked
# from its pagination section as <region>_<n>.html.
BASE_URL = 'http://www.lunchtime.cz'
SCRAPED_DIR = '/Users/jancibulka/DEVEL/DATA/lunchtime-ceny/data/scraped/'
REQUEST_PAUSE = 1  # seconds to wait after each request

mesta = ['404-hlavni-mesto-praha', '400-jihomoravsky-kraj', '402-moravskoslezsky-kraj', '392-jihocesky-kraj', '394-karlovarsky-kraj', '397-kralovehradecky-kraj', '396-liberecky-kraj', '401-olomoucky-kraj', '398-pardubicky-kraj', '393-plzensky-kraj', 'stredocesky-kraj', '395-ustecky-kraj', 'vysocina', '403-zlinsky-kraj', 'z-dalsich-v-cr']


def download_page(page_url, target_path):
    """Fetch `page_url`, write the raw body to `target_path` and return it.

    The HTTP response is closed even if reading fails, and the output file is
    closed even if writing fails. Sleeps REQUEST_PAUSE seconds afterwards.
    """
    response = urllib2.urlopen(page_url)
    try:
        body = response.read()
    finally:
        response.close()
    with open(target_path, 'w') as target:
        target.write(body)
    time.sleep(REQUEST_PAUSE)
    return body


for mesto in mesta:
    print(mesto)
    content = download_page(BASE_URL + '/denni-menu/' + mesto,
                            SCRAPED_DIR + mesto + '.html')
    soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
    pagination = soup.find('section', class_='pagination')
    if pagination is None:
        # find() returns None when the page has no pagination section;
        # there is nothing more to download for this region.
        continue
    # Follow-up pages are numbered from 1 in the order their links appear.
    for counter, link in enumerate(pagination.find_all('a', class_='step'), 1):
        download_page(BASE_URL + link.get('href'),
                      SCRAPED_DIR + mesto + '_' + str(counter) + '.html')
In [9]:
# Extract data from the downloaded pages
In [24]:
# Extract the id, address and mean dish price of every venue found in the
# scraped HTML files and write them to podniky_ceny.csv (one row per venue).
DATA_DIR = '/Users/jancibulka/DEVEL/DATA/lunchtime-ceny/data/'
SCRAPED_DIR = DATA_DIR + 'scraped/'
MISSING_ID = 9999  # placeholder id for venues without a usable data-id

with open(DATA_DIR + 'podniky_ceny.csv', 'w') as out:
    writer = csv.writer(out, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['id', 'adresa', 'prum_cena'])
    # sorted() makes the row order reproducible; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(SCRAPED_DIR)):
        # Only the saved pages are parsed; this also skips .DS_Store and any
        # other stray entry in the directory.
        if not filename.endswith('.html'):
            continue
        with open(SCRAPED_DIR + filename) as f:
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            soup = BeautifulSoup(f, 'html.parser', from_encoding='utf-8')
        for hospoda in soup.find_all('section', class_='facility hidden'):
            favorite_link = hospoda.find('a', class_='add-to-favorites')
            data_id = favorite_link.get('data-id') if favorite_link is not None else None
            idcko = data_id.encode('utf-8') if data_id is not None else MISSING_ID

            address_tag = hospoda.find('span', class_='address')
            if address_tag is None:
                # A venue without an address is not written at all.
                continue
            adresa = address_tag.text.encode('utf-8')

            ceny = []
            for cena in hospoda.find_all('span', class_='price'):
                pricetag = cena.text.encode('utf-8').replace(' Kč', '')
                # Only tags of 2 to 4 bytes are treated as prices.
                if 1 < len(pricetag) < 5:
                    try:
                        ceny.append(int(pricetag))
                    except ValueError:
                        # Non-numeric tag: skip it instead of aborting the run.
                        continue
            # Venues without any usable price still get a row with 'nan';
            # np.mean([]) would give the same value but emit a RuntimeWarning.
            prum_cena = np.mean(ceny) if ceny else float('nan')
            writer.writerow([idcko, adresa, prum_cena])
In [25]:
# Geocode every address from the input CSV and write id, address, mean price
# and coordinates (x = longitude, y = latitude) to podniky_ceny_geo.csv.
#
# NOTE(review): the input is podniky_ceny_pha_nedele.csv, while the extraction
# cell in this notebook writes podniky_ceny.csv -- confirm where this file
# comes from before relying on Restart & Run All.
# NOTE(review): GoogleV3() is created without credentials, as in the original;
# newer geopy releases may require an api_key -- confirm for your version.
DATA_DIR = '/Users/jancibulka/DEVEL/DATA/lunchtime-ceny/data/'
RETRY_DELAYS = (0, 2, 5)  # seconds to wait before attempt 1, 2 and 3
SUCCESS_PAUSE = 1  # seconds to wait after each geocoded row

g = geocoders.GoogleV3()  # one client for all rows instead of one per request

with open(DATA_DIR + 'podniky_ceny_geo.csv', 'w') as out:
    writer = csv.writer(out, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['id', 'adresa', 'prum_cena', 'x', 'y'])
    with open(DATA_DIR + 'podniky_ceny_pha_nedele.csv', 'r') as csvfile:
        loaded = csv.reader(csvfile, delimiter=',', quotechar='"')
        for addr in loaded:
            # Skip blank or short rows and the header row.
            if len(addr) < 3 or addr[1] == 'adresa':
                continue

            # Try the request up to three times with growing pauses.
            # Exception (not a bare except) keeps Ctrl-C working.
            for delay in RETRY_DELAYS:
                time.sleep(delay)
                try:
                    geo = g.geocode(addr[1])
                except Exception:
                    continue
                break
            else:
                # Every attempt raised: report the row and move on.
                print('Chyba u ID ' + addr[0])
                continue

            if geo is None:
                print('Nenalezeno ID: ' + addr[0])
                continue

            place, (lat, lng) = geo
            writer.writerow([addr[0], addr[1], addr[2], lng, lat])
            time.sleep(SUCCESS_PAUSE)