In [1]:
from pattern.web import URL, DOM, plaintext, Element, extension, Crawler, DEPTH
import re
import io  # used to wrap downloaded image bytes for PIL
import pickle
import random
import PIL
from PIL import Image
import datetime

In [2]:
class Scraper:
    """Base interface: subclasses implement per-site image saving and recipe extraction."""
    def save_image(self, element, idx):
        pass

    def get_recipe(self, element):
        pass

In [3]:
class AllRecipesScraper(Scraper):
    def save_image(self, element, idx, basewidth=300):
        print("Saving image", idx)
        first_rec_photo = element.by_class("rec-photo")[0]
        url = first_rec_photo.attributes.get('src', '')
        img_url = URL(url)
        # PIL cannot open a pattern.web URL object directly; download the bytes first.
        img = Image.open(io.BytesIO(img_url.download()))
        # Scale to a fixed width, preserving the aspect ratio.
        wpercent = basewidth / float(img.size[0])
        hsize = int(float(img.size[1]) * wpercent)
        # Note: ANTIALIAS was removed in Pillow 10; use Image.LANCZOS there.
        img = img.resize((basewidth, hsize), PIL.Image.ANTIALIAS)
        img.save("img/" + str(idx) + extension(img_url.page))
        
    def get_ingredients(self, element):
        ing_nodes = element.by_class("recipe-ingred_txt added")
        # Skip the "Add all ingredients to list" button, which shares this CSS class.
        return "\n".join([plaintext(a.content) for a in ing_nodes
                          if "Add all ingredients to list" not in plaintext(a.content)])

    def get_instructions(self, element):
        instr_nodes = element.by_class("recipe-directions__list--item")
        return "\n".join([plaintext(a.content) for a in instr_nodes])

    def get_recipe(self, element):
        return self.get_ingredients(element) + "\n" + self.get_instructions(element)
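
To sanity-check the scraper before wiring it into the crawler, it can be run on a single downloaded recipe page (a sketch: the example URL is hypothetical, and the allrecipes markup may have changed since this notebook was written):

In [ ]:
# Download one recipe page and run the scraper on it in isolation.
test_url = URL("http://allrecipes.com/recipe/8669/chicken-pot-pie/")  # hypothetical URL
test_scraper = AllRecipesScraper()
print(test_scraper.get_recipe(Element(test_url.download(cached=True))))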

In [4]:
class AllRecipesCrawler(Crawler):
    def __init__(self, links, delay, recipe_list=None):
        super(AllRecipesCrawler, self).__init__(links=links, delay=delay)
        self.scraper = AllRecipesScraper()
        if recipe_list is None:
            self.recipe_list = {}
        else:
            self.recipe_list = recipe_list
        self.count = 0

    def reset_count(self):
        self.count = 0
        
    def follow(self, link):
        # Crawler.follow() must return a boolean: the original `yield True/False`
        # returned a generator object, which is always truthy, so every link
        # would have been followed.
        return "recipes/" in str(link.url)
            
    def visit(self, link, source=None):
        if "recipe/" in str(link.url):
            print("visiting", str(link.url), self.count)
            try:
                rec_id = re.search(r".*recipe/(.*)(/.*/)+", str(link.url)).group(1)
                if rec_id not in self.recipe_list:
                    self.scrape(source, rec_id)
                else:
                    print("already scraped", rec_id)
            except Exception as detail:
                print('Run-time error:', detail)
            
            
    def scrape(self, source, rec_id):
        print("scraping", rec_id)
        element = Element(source)
        try:
            recipe = self.scraper.get_recipe(element)
            self.scraper.save_image(element, rec_id)
            self.recipe_list[rec_id] = recipe
            self.count += 1
        except Exception as detail:
            print('Handling run-time error:', detail)
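
The Crawler base class calls follow() to decide whether a discovered link is queued, and visit() with the downloaded source of each page it fetches; each crawl() call processes one link from the queue. That makes the hooks easy to smoke-test before a long run (a sketch, assuming the site's "recipes/" URL structure is unchanged):

In [ ]:
# One crawl step: fetch the seed page and queue any matching links.
test_crawler = AllRecipesCrawler(links=["http://allrecipes.com/recipes/"], delay=1)
test_crawler.crawl(method=DEPTH, cached=True)
print(test_crawler.count, "recipes scraped so far")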

In [ ]:
# Re-run this cell to resume crawling (assumes `crawler` already exists,
# either from the cell below or from an earlier session).
crawler.reset_count()
limit = 2
while (not crawler.done) and crawler.count < limit:
    crawler.crawl(method=DEPTH, cached=False)
save_as = "recipe_lists/recipe_list" + str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M")) + ".p"
pickle.dump(crawler.recipe_list, open(save_as, "wb"))
print("Saved as", save_as)

In [ ]:
# In case of a kernel restart, run this (the saved dicts need to be combined).
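# A minimal sketch of that merge, assuming all snapshots live in recipe_lists/
# and later files should win on duplicate ids (the timestamped names sort
# chronologically). Alternatively, load a single snapshot as in the next cell.
import glob
recipe_list = {}
for path in sorted(glob.glob("recipe_lists/recipe_list*.p")):
    recipe_list.update(pickle.load(open(path, "rb")))
print(len(recipe_list), "recipes combined")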

In [7]:
recipe_list = pickle.load( open( "recipe_lists/recipe_list2016-04-24 16:55.p", "rb" ) )

In [ ]:
base_url = "http://allrecipes.com/"
limit = 5000
# Resume the crawl, seeding it with the previously scraped recipes.
crawler = AllRecipesCrawler(links=[base_url], delay=1, recipe_list=recipe_list)
crawler.reset_count()
while (not crawler.done) and crawler.count < limit:
    crawler.crawl(method=DEPTH, cached=False)
save_as = "recipe_lists/recipe_list" + str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M")) + ".p"
pickle.dump(crawler.recipe_list, open(save_as, "wb"))
print("Saved as", save_as)
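
Once a crawl has finished, it is worth spot-checking a few entries (random is already imported above; a sketch, assuming crawler.recipe_list is non-empty at this point):

In [ ]:
# Print one randomly chosen recipe to eyeball the scraped text quality.
sample_id = random.choice(list(crawler.recipe_list.keys()))
print(sample_id)
print(crawler.recipe_list[sample_id])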
