In [36]:
import datetime
import os
import pickle
import random
import re
import time

import PIL
from PIL import Image

from pattern.web import URL, DOM, plaintext, Element, extension, Crawler, DEPTH
Remove dummy placeholder images from the image folder
In [ ]:
dummy = Image.open("img/9908.png").histogram() #insert new dummy image
In [19]:
# Walk the image folder and delete every file whose histogram exactly
# matches the reference `dummy` histogram computed above.
indir = 'img/'
dummies = []
for root, dirs, filenames in os.walk(indir):
    for f in filenames:
        # Bug fix: the original used `indir + f`, which produces a wrong
        # path for any file found inside a subdirectory of `indir`.
        path = os.path.join(root, f)
        try:
            img = Image.open(path).histogram()
            if img == dummy:
                print("dummy", f)
                # `f` is already a bare filename under os.walk, so no
                # extra os.path.basename() call is needed.
                dummies.append(f)
                os.remove(path)
        except Exception as detail:
            # Best-effort: skip unreadable/non-image files but keep scanning.
            print("Error with file", f, detail)
# Persist the removed ids so the recipe list can be filtered later.
pickle.dump(dummies, open("dummies.p", "wb"))
In [22]:
# Drop every recipe whose id appears in the dummy list, then save the result.
# NOTE(review): dummy ids are read from 'dummies.csv' here, while the cell
# above pickled them to "dummies.p" — presumably converted offline; verify.
with open('dummies.csv') as f:
    dummies = f.read().splitlines()

recipe_list = pickle.load(open("recipe_lists/recipe_list2016-04-22.p", "rb"))
print("orig size", len(recipe_list.keys()))
print("dummy size", len(dummies))

# NOTE(review): `dummies` holds strings; this assumes the recipe_list keys
# are strings as well — confirm, otherwise nothing would be filtered.
recipe_list_new = {k: v for k, v in recipe_list.items() if k not in dummies}
print("new size", len(recipe_list_new.keys()))

pickle.dump(recipe_list_new, open("recipe_lists/recipe_list2016-04-25_remove_dummies.p", "wb"))
In [24]:
print(5000-293)
In [25]:
class Scraper():
    """Abstract base for site-specific recipe scrapers.

    Subclasses override these hooks; the base implementations do nothing.
    """

    def save_image(self):
        """Persist the recipe image; no-op in the base class."""
        return None

    def get_recipe(self):
        """Return the recipe text; no-op in the base class."""
        return None
In [26]:
class AllRecipesScraper(Scraper):
    """Scraper for allrecipes.com recipe pages (pattern.web DOM elements)."""

    def get_ingredients(self, element):
        """Join the plain text of every ingredient node, one per line."""
        nodes = element.by_class("recipe-ingred_txt added")
        lines = [plaintext(node.content) for node in nodes]
        return "\n".join(lines)

    def get_instructions(self, element):
        """Join the plain text of every direction-step node, one per line."""
        nodes = element.by_class("recipe-directions__list--item")
        lines = [plaintext(node.content) for node in nodes]
        return "\n".join(lines)

    def get_recipe(self, element):
        """Full recipe text: ingredients followed by instructions."""
        return self.get_ingredients(element) + "\n" + self.get_instructions(element)
In [28]:
class AllRecipesRandomSearch():
    """Visit allrecipes.com recipe pages by numeric id and collect recipe text.

    Every attempted id is recorded in `tried_ids` so it is never re-fetched;
    successful scrapes are stored in `recipe_list` keyed by id.
    """

    def __init__(self, tried_ids=None, recipe_list=None):
        """
        tried_ids   -- optional set of ids already attempted (default: new empty set)
        recipe_list -- optional dict mapping id -> recipe text (default: new empty dict)
        """
        self.scraper = AllRecipesScraper()
        # None sentinels avoid the mutable-default-argument pitfall.
        self.tried_ids = set() if tried_ids is None else tried_ids
        self.recipe_list = {} if recipe_list is None else recipe_list
        self.count = 0  # number of successful scrapes this session

    def reset_count(self):
        """Zero the per-session success counter."""
        self.count = 0

    def new_id(self, rec_id):
        """Return True if `rec_id` has not been attempted yet."""
        return rec_id not in self.tried_ids

    def visit(self, rec_id):
        """Download and scrape one recipe page; always marks the id as tried."""
        url = URL("http://allrecipes.com/recipe/" + str(rec_id))
        try:
            source = url.download(cached=True)
            self.scrape(source, rec_id)
        except Exception as detail:
            # Bug fix: this was a Python 2 `print` statement, a SyntaxError
            # under Python 3 (the rest of the notebook uses print()).
            print('Unable to Scrape:', rec_id, detail)
        self.tried_ids.add(rec_id)

    def scrape(self, source, rec_id):
        """Parse `source` HTML and store the extracted recipe under `rec_id`."""
        print("scraping", rec_id)
        element = Element(source)
        recipe = self.scraper.get_recipe(element)
        self.recipe_list[rec_id] = recipe
        self.tried_ids.add(rec_id)
        self.count += 1
Get recipes for images that were saved on disk but are missing from the recipe list
In [1]:
# Scrape recipes for image files on disk whose ids are missing from the list.
recipe_list = pickle.load(open("recipe_lists/recipe_list2016-04-25_remove_dummies.p", "rb"))
keys = recipe_list.keys()
print("keys size", len(keys))

search = AllRecipesRandomSearch(recipe_list=recipe_list)
for root, dirs, filenames in os.walk(indir):
    for f in filenames:
        # Image files are named "<recipe_id>.<ext>"; recover the id.
        rec_id = os.path.splitext(os.path.basename(f))[0]
        # NOTE(review): rec_id is a string; this assumes recipe_list keys
        # are strings too, otherwise every file would be re-scraped — confirm.
        if rec_id not in keys:
            print("Getting recipe for", rec_id)
            search.visit(rec_id)
            time.sleep(1)  # be polite to the server between requests

# Bug fixes: `datetime` is now imported at the top of the file; the results
# live on `search` — the original referenced an undefined `crawler`; the
# original `print "Saved as"` was Python 2 syntax (SyntaxError in Python 3).
save_as = "recipe_lists/recipe_list" + str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M")) + ".p"
pickle.dump(search.recipe_list, open(save_as, "wb"))
print("Saved as", save_as)
print("new Keys size", len(search.recipe_list.keys()))
In [39]:
# Persist the final recipe list and the full set of attempted ids.
save_as_list = "recipe_lists/recipe_list_AL.p"
save_as_ids = "recipe_lists/tried_ids_AL.p"
# Bug fix: this was a Python 2 `print` statement (SyntaxError in Python 3).
print("Saving the final work")
pickle.dump(search.recipe_list, open(save_as_list, "wb"))
pickle.dump(search.tried_ids, open(save_as_ids, "wb"))
print("new Keys size", len(search.recipe_list.keys()))