In [1]:
from pattern.web import URL, DOM, plaintext, Element, extension, Crawler, DEPTH
import re
import pickle
import random
import io
import PIL
from PIL import Image
import datetime
In [2]:
class Scraper():
    """Abstract base class: site-specific scrapers implement these hooks."""
    def save_image(self, element, idx):
        pass
    def get_recipe(self, element):
        pass
In [3]:
class AllRecipesScraper(Scraper):
    def save_image(self, element, idx, basewidth=300):
        print("Saving image", idx)
        first_rec_photo = element.by_class("rec-photo")[0]
        url = first_rec_photo.attributes.get('src', '')
        img_url = URL(url)
        # PIL cannot open a pattern.web URL directly; download the bytes first.
        img = Image.open(io.BytesIO(img_url.download()))
        # Scale to a fixed width, preserving the aspect ratio.
        wpercent = basewidth / float(img.size[0])
        hsize = int(float(img.size[1]) * wpercent)
        img = img.resize((basewidth, hsize), PIL.Image.ANTIALIAS)
        img.save("img/" + str(idx) + extension(img_url.page))
    def get_ingredients(self, element):
        ing_nodes = element.by_class("recipe-ingred_txt added")
        # Skip the "Add all ingredients to list" pseudo-item that shares the class.
        return "\n".join([plaintext(a.content) for a in ing_nodes
                          if "Add all ingredients to list" not in plaintext(a.content)])
    def get_instructions(self, element):
        instr_nodes = element.by_class("recipe-directions__list--item")
        return "\n".join([plaintext(a.content) for a in instr_nodes])
    def get_recipe(self, element):
        return self.get_ingredients(element) + "\n" + self.get_instructions(element)
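To sanity-check the scraper outside the crawler, here is a minimal sketch that fetches a single page and prints the extracted text; the recipe id and slug in the URL are made up for illustration:
In [ ]:
# Sketch: scrape one recipe page directly. The recipe URL below is hypothetical.
url = URL("http://allrecipes.com/recipe/12345/example-recipe/")
element = Element(url.download(cached=True))
scraper = AllRecipesScraper()
print(scraper.get_recipe(element))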
In [4]:
class AllRecipesCrawler(Crawler):
    def __init__(self, links, delay, recipe_list=None):
        super(AllRecipesCrawler, self).__init__(links=links, delay=delay)
        self.scraper = AllRecipesScraper()
        # recipe_list maps recipe id -> scraped text; pass one in to resume a crawl.
        if recipe_list is None:
            self.recipe_list = {}
        else:
            self.recipe_list = recipe_list
        self.count = 0
    def reset_count(self):
        self.count = 0
    def follow(self, link):
        # Crawler.follow should return a boolean; yielding from it would hand back
        # a generator, which is always truthy, so every link would get followed.
        return "recipes/" in str(link.url)
    def visit(self, link, source=None):
        if "recipe/" in str(link.url):
            print("visiting", str(link.url), self.count)
            try:
                # The numeric id sits between "recipe/" and the slug,
                # e.g. /recipe/12345/chicken-pot-pie/ -> "12345".
                rec_id = re.search(".*recipe/(.*)(/.*/)+", str(link.url)).group(1)
                if rec_id not in self.recipe_list.keys():
                    self.scrape(source, rec_id)
                else:
                    print("already scraped", rec_id)
            except Exception as detail:
                print('Run-time error:', detail)
    def scrape(self, source, rec_id):
        print("scraping", rec_id)
        element = Element(source)
        try:
            recipe = self.scraper.get_recipe(element)
            self.scraper.save_image(element, rec_id)
            self.recipe_list[rec_id] = recipe
            self.count += 1
        except Exception as detail:
            print('Handling run-time error:', detail)
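The crawl cell below assumes a crawler instance already exists; on a first run, before any recipe list has been saved, it can be created as in this sketch (using the same base URL and delay as the restart cell further down):
In [ ]:
# First-run setup (sketch): start at the site root with an empty recipe dict.
base_url = "http://allrecipes.com/"
crawler = AllRecipesCrawler(links=[base_url], delay=1)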
In [ ]:
# Rerun this cell for each crawl batch: it crawls until `limit` new recipes
# have been scraped, then pickles the accumulated dict with a timestamp.
crawler.reset_count()
limit = 2
while (not crawler.done) and crawler.count < limit:
    crawler.crawl(method=DEPTH, cached=False)
save_as = "recipe_lists/recipe_list" + str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M")) + ".p"
pickle.dump(crawler.recipe_list, open(save_as, "wb"))
print("Saved as", save_as)
In [ ]:
#In case of a kernel restart, run this (the saved dicts need to be combined)
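A minimal sketch of that combining step, assuming all saved pickles sit under recipe_lists/ with the naming scheme used above:
In [ ]:
# Sketch: merge every saved recipe_list pickle into one dict
# (entries from later files overwrite earlier ones).
import glob
recipe_list = {}
for path in sorted(glob.glob("recipe_lists/recipe_list*.p")):
    recipe_list.update(pickle.load(open(path, "rb")))
print(len(recipe_list), "recipes combined")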
In [7]:
recipe_list = pickle.load(open("recipe_lists/recipe_list2016-04-24 16:55.p", "rb"))
In [ ]:
base_url = "http://allrecipes.com/"
limit = 5000
crawler = AllRecipesCrawler(links=[base_url], delay=1, recipe_list=recipe_list)
crawler.reset_count()
while (not crawler.done) and crawler.count < limit:
    crawler.crawl(method=DEPTH, cached=False)
save_as = "recipe_lists/recipe_list" + str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M")) + ".p"
pickle.dump(crawler.recipe_list, open(save_as, "wb"))
print("Saved as", save_as)