In [1]:
from pattern.web import URL, DOM, plaintext, Element, extension, Crawler, DEPTH
import re
import pickle
import random
import PIL
from PIL import Image
import datetime
from io import BytesIO  # needed below to open a downloaded image from memory
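
pattern.web supplies the URL downloading and DOM helpers, PIL the image resizing; several of the other imports (re, random, Crawler, DEPTH) go unused in this section.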

In [2]:
class Scraper(object):
    """Base interface: site-specific scrapers override these hooks."""
    def save_image(self, element, idx):
        raise NotImplementedError
    def get_recipe(self, element):
        raise NotImplementedError
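
Scraper is a bare interface: each recipe site gets its own subclass that knows that site's CSS class names, so the download loop at the bottom of the notebook works against any of them.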

In [3]:
class RecipeDotComScraper(Scraper):
    def save_image(self, element, idx, basewidth=300):
        # Grab the first recipe photo on the page.
        first_rec_photo = element.by_class("photo")[0]
        url = first_rec_photo.attributes.get('src', '')
        img_url = URL(url)
        # PIL cannot read from a pattern.web URL directly: download the
        # bytes first and open them from an in-memory buffer.
        img = Image.open(BytesIO(img_url.download(cached=True)))
        # Scale to a fixed width, preserving the aspect ratio.
        wpercent = basewidth / float(img.size[0])
        hsize = int(float(img.size[1]) * wpercent)
        # Note: Image.ANTIALIAS was renamed Image.LANCZOS in newer Pillow releases.
        img = img.resize((basewidth, hsize), PIL.Image.ANTIALIAS)
        # Assumes an img/ directory exists next to the notebook.
        img.save("img/" + str(idx) + extension(img_url.page))
        
    def get_ingredients(self, element):
        # Ingredient nodes carry both the "floatleft" and "ingredient" classes.
        ing_nodes = element.by_class("floatleft ingredient")
        return "\n".join([plaintext(a.content) for a in ing_nodes])

    def get_instructions(self, element):
        # Instruction steps carry the "stepbystepInstruction instruction" classes.
        instr_nodes = element.by_class("stepbystepInstruction instruction")
        return "\n".join([plaintext(a.content) for a in instr_nodes])

    def get_recipe(self, element):
        # Full recipe text: ingredient list followed by the instructions.
        return self.get_ingredients(element) + "\n" + self.get_instructions(element)
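
A quick way to sanity-check the extraction logic without a network round trip is to run the same pattern.web calls on an inline snippet. The markup below is made up for illustration; only the class names mirror what the scraper expects.

In [ ]:
# Hypothetical markup mimicking recipe.com's class names, to exercise by_class().
snippet = """
<div class="floatleft ingredient">1 bunch broccoli</div>
<div class="floatleft ingredient">2 cups spinach</div>
<div class="stepbystepInstruction instruction">Simmer until tender.</div>
"""
# Prints the two ingredients followed by the instruction step, one per line.
print(RecipeDotComScraper().get_recipe(Element(snippet)))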

In [4]:
test_urls = ["http://www.recipe.com/broccoli-spinach-soup-with-avocado-toasts/",
             "http://www.recipe.com/lemon-ginger-poached-halibut-with-leeks-and-spinach/?sessionTemplate=full"]

In [ ]:
scraper = RecipeDotComScraper()
data = []
for idx, test_url in enumerate(test_urls):
    url = URL(test_url)
    html = url.download(cached=True)
    element = Element(html)
    # The helpers are methods on the scraper instance, not free functions.
    ing_str = scraper.get_ingredients(element)
    instruction_str = scraper.get_instructions(element)
    data.append(ing_str + "\n" + instruction_str)
    scraper.save_image(element, idx)
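
pickle and datetime are imported at the top but never used; presumably the scraped recipes were meant to be stored. A minimal sketch of saving data, assuming a timestamped filename (the naming scheme is an assumption):

In [ ]:
# Save the scraped recipes; the timestamped filename is an assumption.
fname = "recipes_%s.pkl" % datetime.datetime.now().strftime("%Y-%m-%d")
with open(fname, "wb") as f:
    pickle.dump(data, f)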