In [1]:
import urllib2
import time
import pandas as pd
from bs4 import BeautifulSoup
import re
import os
import csv
import numpy as np
from geopy import geocoders
from sys import exit
import sys

In [21]:
# Region slugs used in the lunchtime.cz daily-menu listing URLs.
mesta = [
    '404-hlavni-mesto-praha',
    '400-jihomoravsky-kraj',
    '402-moravskoslezsky-kraj',
    '392-jihocesky-kraj',
    '394-karlovarsky-kraj',
    '397-kralovehradecky-kraj',
    '396-liberecky-kraj',
    '401-olomoucky-kraj',
    '398-pardubicky-kraj',
    '393-plzensky-kraj',
    'stredocesky-kraj',
    '395-ustecky-kraj',
    'vysocina',
    '403-zlinsky-kraj',
    'z-dalsich-v-cr',
]

BASE_URL = 'http://www.lunchtime.cz'
SCRAPED_DIR = '/Users/jancibulka/DEVEL/DATA/lunchtime-ceny/data/scraped'
REQUEST_PAUSE = 1  # seconds to wait after every request, to be polite to the server


def _fetch_and_save(page_url, path):
    """Download `page_url`, store the raw bytes at `path` and return them.

    The HTTP response and the output file are both closed even if reading or
    writing fails. Sleeps `REQUEST_PAUSE` seconds after a successful download.
    """
    response = urllib2.urlopen(page_url)
    try:
        body = response.read()
    finally:
        response.close()
    # Binary mode: the payload is raw bytes straight off the wire.
    with open(path, 'wb') as handle:
        handle.write(body)
    time.sleep(REQUEST_PAUSE)
    return body


if not os.path.isdir(SCRAPED_DIR):
    os.makedirs(SCRAPED_DIR)

for mesto in mesta:
    print(mesto)
    content = _fetch_and_save(BASE_URL + '/denni-menu/' + mesto,
                              os.path.join(SCRAPED_DIR, mesto + '.html'))

    soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
    pagination = soup.find('section', class_='pagination')
    if pagination is None:
        # No pagination section on the page: nothing more to download.
        continue

    # Follow-up pages are saved as <region>_<n>.html, numbered from 1 in link order.
    for counter, link in enumerate(pagination.find_all('a', class_='step'), 1):
        href = link.get('href')
        if not href:
            # An anchor without a target cannot be fetched; skip it instead of crashing.
            continue
        _fetch_and_save(BASE_URL + href,
                        os.path.join(SCRAPED_DIR, mesto + '_' + str(counter) + '.html'))


404-hlavni-mesto-praha
400-jihomoravsky-kraj
402-moravskoslezsky-kraj
392-jihocesky-kraj
394-karlovarsky-kraj
397-kralovehradecky-kraj
396-liberecky-kraj
401-olomoucky-kraj
398-pardubicky-kraj
393-plzensky-kraj
stredocesky-kraj
395-ustecky-kraj
vysocina
403-zlinsky-kraj
z-dalsich-v-cr

In [9]:
# Extract data from the downloaded pages

In [24]:
# Extract every venue's id, address and average menu price from the scraped
# HTML pages and write them to a CSV file.
SCRAPED_DIR = '/Users/jancibulka/DEVEL/DATA/lunchtime-ceny/data/scraped'
PRICES_CSV = '/Users/jancibulka/DEVEL/DATA/lunchtime-ceny/data/podniky_ceny.csv'
MISSING_ID = 9999  # placeholder id for venues without a usable "add-to-favorites" link

# Python 2's csv module expects its file objects to be opened in binary mode.
with open(PRICES_CSV, 'wb') as out:
    writer = csv.writer(out, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['id', 'adresa', 'prum_cena'])

    # Sorted so that the row order is the same on every run.
    for filename in sorted(os.listdir(SCRAPED_DIR)):
        if filename == '.DS_Store':
            continue

        with open(os.path.join(SCRAPED_DIR, filename)) as f:
            # Name the parser explicitly (same one the scraping cell uses) so the
            # result does not depend on which parsers happen to be installed.
            soup = BeautifulSoup(f, 'html.parser', from_encoding='utf-8')

        for hospoda in soup.find_all('section', class_='facility hidden'):
            favorite = hospoda.find('a', class_='add-to-favorites')
            data_id = favorite.get('data-id') if favorite is not None else None
            idcko = data_id.encode('utf-8') if data_id is not None else MISSING_ID

            address_tag = hospoda.find('span', class_='address')
            if address_tag is None:
                # A venue without an address cannot be geocoded later; skip it.
                continue
            adresa = address_tag.text.encode('utf-8')

            ceny = []
            for cena in hospoda.find_all('span', class_='price'):
                pricetag = cena.text.encode('utf-8').replace(' Kč', '')
                # Only 2-4 character price tags are accepted (original heuristic).
                if 1 < len(pricetag) < 5:
                    try:
                        ceny.append(int(pricetag))
                    except ValueError:
                        # Right length but not a whole number: ignore this price.
                        continue

            # np.mean([]) would also yield nan, but with a RuntimeWarning.
            prum_cena = np.mean(ceny) if ceny else float('nan')
            writer.writerow([idcko, adresa, prum_cena])

In [25]:
# Geocoding: look up coordinates for every venue address and write the rows,
# extended with x (longitude) and y (latitude), to a new CSV file.

GEOCODED_CSV = '/Users/jancibulka/DEVEL/DATA/lunchtime-ceny/data/podniky_ceny_geo.csv'
# NOTE(review): the input is podniky_ceny_pha_nedele.csv, not the
# podniky_ceny.csv written by the extraction cell -- confirm this is intended.
ADDRESSES_CSV = '/Users/jancibulka/DEVEL/DATA/lunchtime-ceny/data/podniky_ceny_pha_nedele.csv'
RETRY_DELAYS = (0, 2, 5)  # seconds to wait before the 1st, 2nd and 3rd attempt
REQUEST_PAUSE = 1         # seconds to wait after every completed lookup


def _geocode_with_retry(geocoder, address, delays=RETRY_DELAYS):
    """Geocode `address`, retrying after each pause listed in `delays`.

    Returns whatever `geocoder.geocode` returns (None when nothing was found).
    Re-raises the last error if every attempt fails. `delays` must be non-empty.
    """
    last_attempt = len(delays) - 1
    for attempt, delay in enumerate(delays):
        if delay:
            time.sleep(delay)
        try:
            return geocoder.geocode(address)
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C still interrupts.
            if attempt == last_attempt:
                raise


# NOTE(review): recent geopy / Google API versions presumably need an API key
# passed to GoogleV3 -- confirm against the installed geopy version.
g = geocoders.GoogleV3()

# The input is opened first so that a missing input does not truncate the output.
# Python 2's csv module expects its file objects to be opened in binary mode.
with open(ADDRESSES_CSV, 'rb') as csvfile, open(GEOCODED_CSV, 'wb') as out:
    writer = csv.writer(out, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['id', 'adresa', 'prum_cena', 'x', 'y'])

    loaded = csv.reader(csvfile, delimiter=',', quotechar='"')
    for addr in loaded:
        # Skip blank or short rows and the header line.
        if len(addr) < 3 or addr[1] == 'adresa':
            continue

        try:
            geo = _geocode_with_retry(g, addr[1])
        except Exception:
            print('Chyba u ID ' + addr[0])
            continue

        if geo is None:
            # Reported on every attempt, including the last one.
            print('Nenalezeno ID: ' + addr[0])
        else:
            place, (lat, lng) = geo
            writer.writerow([addr[0], addr[1], addr[2], lng, lat])
        time.sleep(REQUEST_PAUSE)


Chyba u ID 5459
Chyba u ID 5415
Chyba u ID 5401