In [26]:
from selenium import webdriver
import time
import json

In [27]:
# Start a Firefox session through Selenium and load the page at `url_eva`.
# `driver` is a module-level name: the scraping cell below reads elements from
# it and clicks through the pager, so this cell must be run first.
url_eva = 'http://www.evavzw.be/resto'
driver = webdriver.Firefox()
driver.get(url_eva)

In [ ]:
# Walk through the paginated listing and collect one tuple per restaurant.
#
# Resulting tuple shapes (the later cells rely on these):
#   (name, street, zipcode, city, tags)  -- when the element text has 3+ lines
#   (name, None, None)                   -- when address lines are missing;
#                                           flagged downstream as a bad record
restaurants = []
number_of_pages = 83  # hard-coded page count of the listing

# Visit *every* page. The pager is only clicked when another page remains, so
# the last page is scraped too (previously the loop stopped one page short:
# it navigated to the final page and closed the browser without reading it).
for i in range(number_of_pages):
    print(i)
    selections = driver.find_elements_by_class_name('node-restaurant')
    for s in selections:
        info = s.text.split('\n')
        if len(info) > 2:
            # Third line: the first four characters are taken as the zipcode,
            # the remainder as the city; any further lines are kept as tags.
            restaurant = (info[0], info[1], info[2][:4], info[2][4:], info[3:])
        else:
            # str.split always returns at least one item, so info[0] exists.
            restaurant = (info[0], None, None)

        restaurants.append(restaurant)

    if i < number_of_pages - 1:
        selection = driver.find_element_by_class_name('pager-next')
        selection.find_element_by_tag_name('a').click()
        # Fixed pause after clicking before the next page is read.
        time.sleep(3)

driver.close()
print(len(restaurants))

In [29]:
# Display the first scraped record to eyeball the tuple layout.
restaurants[0]


Out[29]:
('Biesbemd',
 'Kamstraat 33',
 '3040',
 'Neerijse',
 ['Approved by EVA', 'EVA voordeel', 'Veganvriendelijk'])

In [45]:
# Convert the scraped tuples into dictionaries and gather every tag seen.
# Records that do not have exactly five fields are reported and left out.
restaurant_objects = []
tags = []
for record in restaurants:
    if len(record) != 5:
        print('Bad record: ' + str(record))
        continue

    name, street, zipcode, city, record_tags = record
    restaurant_objects.append({
        'name': name,
        'street': street,
        'zipcode': zipcode,
        'city': city,
        'tags': record_tags,
    })

    if record_tags is not None:
        tags.extend(record_tags)

# Collapse the collected tags into the set of distinct values.
unique_tags = set(tags)
print(unique_tags)


Bad record: ('Al Barmaki', None, None)
Bad record: ('Bazilikum', None, None)
Bad record: ('Tlemcen', None, None)
{'Eethuis', 'EVA voordeel', '100% vegetarisch', 'Snack', 'Cateraar', 'Approved by EVA', 'Veganvriendelijk', '100% plantaardig', 'Gastronomisch'}

In [47]:
# Persist the restaurant dictionaries as pretty-printed JSON with sorted keys.
restodata_json = json.dumps(
    restaurant_objects,
    sort_keys=True,
    indent=4,
    separators=(',', ': '),
)
with open('EVA_restodata.json', 'w') as out_file:
    out_file.write(restodata_json)

In [48]:
# Wrap each distinct tag in a single-key dictionary so it can be dumped as JSON.
tag_objects = [{"tag": tag_name} for tag_name in unique_tags]

print(tag_objects)


[{'tag': 'Eethuis'}, {'tag': 'EVA voordeel'}, {'tag': '100% vegetarisch'}, {'tag': 'Snack'}, {'tag': 'Cateraar'}, {'tag': 'Approved by EVA'}, {'tag': 'Veganvriendelijk'}, {'tag': '100% plantaardig'}, {'tag': 'Gastronomisch'}]

In [49]:
# Persist the tag dictionaries as pretty-printed JSON with sorted keys.
tags_json = json.dumps(
    tag_objects,
    sort_keys=True,
    indent=4,
    separators=(',', ': '),
)
with open('EVA_tags.json', 'w') as out_file:
    out_file.write(tags_json)

In [ ]: