In [1]:
from pattern.web import URL, DOM, plaintext, Element, extension, Crawler, DEPTH
import re
import pickle
import random
import PIL
from PIL import Image
import datetime
from io import BytesIO  # PIL needs a file-like object, not a URL, to open downloaded images
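For reference, the pattern.web helpers used throughout: Element parses an HTML string into a traversable node tree, by_class selects descendant nodes by CSS class, and plaintext strips tags from a node's content. A small self-contained illustration (the HTML snippet is invented for the example):

from pattern.web import Element, plaintext

html = '<div><span class="ingredient">2 cups flour</span></div>'
element = Element(html)
print(plaintext(element.by_class("ingredient")[0].content))  # prints: 2 cups flour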
In [2]:
class Scraper():
    def save_image(self):
        pass
    def get_recipe(self):
        pass
In [3]:
class RecipeDotComScraper(Scraper):
    def save_image(self, element, idx, basewidth=300):
        # Take the first image on the page tagged with the "photo" class.
        first_rec_photo = element.by_class("photo")[0]
        url = first_rec_photo.attributes.get('src', '')
        img_url = URL(url)
        # PIL cannot open a URL directly, so download the bytes first.
        img = Image.open(BytesIO(img_url.download()))
        # Scale to a fixed width while preserving the aspect ratio.
        wpercent = basewidth / float(img.size[0])
        hsize = int(float(img.size[1]) * wpercent)
        img = img.resize((basewidth, hsize), PIL.Image.ANTIALIAS)
        img.save("img/" + str(idx) + extension(img_url.page))
    def get_ingredients(self, element):
        ing_nodes = element.by_class("floatleft ingredient")
        return "\n".join([plaintext(a.content) for a in ing_nodes])
    def get_instructions(self, element):
        instr_nodes = element.by_class("stepbystepInstruction instruction")
        return "\n".join([plaintext(a.content) for a in instr_nodes])
    def get_recipe(self, element):
        return self.get_ingredients(element) + "\n" + self.get_instructions(element)
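One fragility worth noting: by_class("photo") returns an empty list on pages without a matching image, so save_image above raises an IndexError on such pages. A minimal defensive variant, purely for illustration (the early return is an addition, not part of the original scraper):

def save_image(self, element, idx, basewidth=300):
    photos = element.by_class("photo")
    if not photos:
        # No recipe photo on this page; skip it instead of crashing.
        return
    url = photos[0].attributes.get('src', '')
    img_url = URL(url)
    img = Image.open(BytesIO(img_url.download()))
    wpercent = basewidth / float(img.size[0])
    hsize = int(float(img.size[1]) * wpercent)
    img = img.resize((basewidth, hsize), PIL.Image.ANTIALIAS)
    img.save("img/" + str(idx) + extension(img_url.page))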
In [4]:
test_urls = ["http://www.recipe.com/broccoli-spinach-soup-with-avocado-toasts/",
             "http://www.recipe.com/lemon-ginger-poached-halibut-with-leeks-and-spinach/?sessionTemplate=full"]
In [ ]:
scraper = RecipeDotComScraper()
data = []
for idx, test_url in enumerate(test_urls):
    url = URL(test_url)
    html = url.download(cached=True)
    element = Element(html)
    # get_ingredients, get_instructions and save_image are methods of the
    # scraper instance, so they must be called through it.
    ing_str = scraper.get_ingredients(element)
    instruction_str = scraper.get_instructions(element)
    data.append(ing_str + "\n" + instruction_str)
    scraper.save_image(element, idx)
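pickle is imported at the top but not used in this section, so presumably the scraped recipes get persisted for later cells. A minimal sketch of how data might be saved; the filename recipes.pkl is an illustrative choice, not from the original:

with open("recipes.pkl", "wb") as f:
    # data is the list of ingredient/instruction strings built above.
    pickle.dump(data, f)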