%matplotlib inline
import matplotlib.pyplot as plt

import sys, os, re, time
import urllib

import numpy as np

from IPython import parallel

Downloading from flickr

This flickr parsing code is adapted from here

def extract_urls(html):
    """Return the image URLs found in the ``src="..."`` attributes of a page.

    `html` is the page source as a string.  The result is a list of the
    matched URL strings, in the order they occur; it is empty when nothing
    matches.
    """
    pattern = r'src="(http://farm\d+.static.?\d+/\d+_\w+.jpg)"'
    # findall always hands back a list, so "no matches" needs no special case
    return re.findall(pattern, html, re.IGNORECASE | re.DOTALL)

def urls_for_tag(tag='face', min_images=100, max_pages=20):
    """Get URLs of flickr images with the given tag(s).

    Scrapes successive flickr search result pages, collecting the image
    URLs that `extract_urls` finds on each one.  Stops as soon as at least
    `min_images` URLs have been collected, `max_pages` pages have been
    fetched, or a page yields no image URLs at all.

    Returns the list of collected URLs; it may be longer than `min_images`
    (whole pages are added at a time) or shorter (search ran dry).
    """
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    # NOTE(review): the URL template was an empty string in the source, which
    # made the `%` formatting below raise TypeError.  This template is a
    # best-effort reconstruction -- confirm against the real search endpoint.
    search_url = 'http://www.flickr.com/search/?q=%s&page=%i'

    urls = []
    page = 1
    while len(urls) < min_images and page <= max_pages:
        url = search_url % (tag, page)
        print("fetching %s" % url)
        urlfile = urlopen(url)
        try:
            html = urlfile.read()
        finally:
            urlfile.close()
        if not isinstance(html, str):
            # Python 3 returns bytes; extract_urls matches with a text pattern
            html = html.decode('utf-8', 'replace')
        page_urls = extract_urls(html)
        urls.extend(page_urls)
        print("found %i images" % len(urls))
        if not page_urls:
            # an empty page means later pages will be empty too
            print("no new images")
            break
        page += 1
    return urls

# Collect at least 500 'portrait' image URLs (fewer if the search runs dry).
urls = urls_for_tag(tag='portrait', min_images=500)

def download_image(url, dest_dir='images'):
    """Download an image from a url into a directory.

    The file name is the last path component of `url`.  If a file with that
    name already exists in `dest_dir` nothing is downloaded.  `dest_dir` is
    created if it does not exist yet.

    Returns the path to the (downloaded or already present) image.
    """
    # Imports are kept local -- presumably so the function still works when
    # it is shipped to remote engines that have not imported these modules.
    import os
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    basename = url.rsplit('/', 1)[-1]
    dest = os.path.join(dest_dir, basename)
    if os.path.exists(dest):
        print("already have %s" % dest)
        return dest

    # Create-then-check instead of check-then-create: several workers may
    # try to create the directory at the same moment.
    try:
        os.makedirs(dest_dir)
    except OSError:
        if not os.path.isdir(dest_dir):
            raise

    print("downloading %s -> %s" % (url, dest))
    urlf = urlopen(url)
    try:
        data = urlf.read()
    finally:
        urlf.close()
    # image data is binary: text mode would corrupt it on some platforms
    with open(dest, 'wb') as f:
        f.write(data)
    return dest

First, initialize OpenCV for simple facial detection

import cv

# Haar cascade definition used for frontal-face detection.  The path is
# relative, so the file must be in the current working directory.
# With OpenCV installed via homebrew, the cascades can be found in
# /usr/local/share/OpenCV/haarcascades/
HAAR_CASCADE_PATH = "haarcascade_frontalface_default.xml"

# Module-level detector state, read by detect_faces().
storage = cv.CreateMemStorage()
cascade = cv.Load(HAAR_CASCADE_PATH)

Then define a few functions for extracting faces from images

def extract_faces(image, faces):
    """Return the face regions of an image as a list of numpy arrays.

    `image` must provide `tostring()`, `height`, `width` and `nChannels`
    (as the OpenCV image loaded in `detect_faces` does).  `faces` is an
    iterable of `(x, y, w, h)` rectangles.

    Each returned array is the `h` x `w` crop of the image for one
    rectangle, with the channel axis reversed.  An empty `faces` gives
    an empty list.
    """
    import numpy as np
    pixels = np.frombuffer(image.tostring(), dtype=np.uint8)
    pixels = pixels.reshape((image.height, image.width, image.nChannels))
    # Reverse the channel axis -- presumably BGR (OpenCV) -> RGB for display.
    pixels = pixels[:, :, ::-1]
    face_arrays = []
    for face in faces:
        x, y, w, h = face[:4]
        # rows are indexed by y, columns by x
        face_arrays.append(pixels[y:y + h, x:x + w])
    return face_arrays

def detect_faces(filename):
    """Load an image into OpenCV and detect faces in it.

    Uses the module-level `cascade` and `storage` objects.

    Returns None if no face is found, or
    (filename, [list of numpy arrays]) if there are faces, with one array
    per detected face as produced by `extract_faces`.
    """
    image = cv.LoadImage(filename)
    # NOTE(review): the positional arguments after `storage` are presumably
    # scale factor, minimum neighbours, flags and minimum window size --
    # confirm against the OpenCV documentation.
    detected = cv.HaarDetectObjects(image, cascade, storage, 1.2, 2, cv.CV_HAAR_DO_CANNY_PRUNING, (100,100))
    faces = []
    if detected:
        # each detection is ((x, y, w, h), n); only the rectangle is needed
        for (x, y, w, h), n in detected:
            faces.append((x, y, w, h))
    if not faces:
        return None
    return filename, extract_faces(image, faces)

And finally, a two-step function that downloads an image from a url, and detects faces in it.

def faces_in_url(url):
    """Fetch the image at `url`, then run face detection on the local copy.

    Returns whatever `detect_faces` returns for the downloaded file.
    """
    return detect_faces(download_image(url))

If the network doesn't work, you can just generate a list of paths to images on your computer. For instance, these pictures are everything from my iPhoto thumbnails directory, so they vary in size from ~320x240 to 1024x768.

import glob
# Gather every .jpg below the iPhoto library's Thumbnails folder.
library = os.path.expanduser("~/Pictures/2013.iphotolibrary")
thumbnails_root = os.path.join(library, 'Thumbnails')
pictures = []
for directory, subdirs, files in os.walk(thumbnails_root):
    pictures.extend(
        os.path.join(directory, fname)
        for fname in files
        if fname.endswith('.jpg')
    )

Or this one, which globs pictures from a particular folder:

import glob
# Every .jpg exactly one directory level below images/
pictures = list(glob.iglob("images/*/*.jpg"))

Let's test our face detection on the images we found on flickr:

# NOTE(review): the bodies of `if found:` and `for face in faces:` were
# missing in the source; `break` and the imshow calls are reconstructions.
found = None
for url in urls:
    found = faces_in_url(url)
    if found:
        # one image with a face is enough for this test
        break

if found:
    filename, faces = found
    for face in faces:
        # one figure per detected face
        plt.figure()
        plt.imshow(face)

If the network isn't kind to you, we can skip the downloads, and just use pictures we have on the filesystem:

# NOTE(review): the bodies of `if found:` and `for face in faces:` were
# missing in the source; `break` and the imshow calls are reconstructions.
found = None
for p in pictures:
    found = detect_faces(p)
    if found:
        # one image with a face is enough for this test
        break

if found:
    filename, faces = found
    for face in faces:
        # one figure per detected face
        plt.figure()
        plt.imshow(face)

Hey, that looks like a face!

Now in parallel

First, we connect our parallel Client

# Connect to the cluster.
rc = parallel.Client()
# Load-balanced view: used below to farm out one detection task per image.
view = rc.load_balanced_view()
# View over all engines at once (rc[:]).
all_engines = rc[:]

Then we initialize OpenCV on all of the engines (identical to what we did above)

# NOTE(review): the accompanying text says this setup runs on every engine,
# but as written it executes in the local session only.  Presumably this was
# a `%%px` cell whose first line got lost -- confirm.
%cd notebooks/parallel

# Relative path: the cascade file is looked up in the directory entered above.
HAAR_CASCADE_PATH = "haarcascade_frontalface_default.xml"

import os, urllib
import cv
# Detector state read by detect_faces() (same setup as the earlier local one).
storage = cv.CreateMemStorage()
cascade = cv.Load(HAAR_CASCADE_PATH)

and make sure extract_faces is defined everywhere

Now we can iterate through all of our pictures, and detect and display any faces we find

# Run face detection over all sources on the load-balanced view, showing
# faces as results arrive, and time the whole run.
# NOTE(review): the bodies of `if not r:` and `for face in faces:` were
# missing in the source; `continue` and the imshow calls are reconstructions.
tic = time.time()
# if you are running offline, do this one:
# f = detect_faces
# source = pictures

# or you can download each image as part of the task:
f = faces_in_url
source = urls

# ordered=False: results are yielded as tasks finish, not in submission order
amr = view.map_async(f, source[:1000], ordered=False)
nfound = 0
for r in amr:
    if not r:
        # no face in this image
        continue
    filename, faces = r
    nfound += len(faces)
    print("%i faces found in %s" % (len(faces), filename))
    for face in faces:
        plt.figure()
        plt.imshow(face)

toc = time.time()

print("found %i faces in %i images in %f s" % (nfound, len(amr), toc - tic))