In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
In [ ]:
import sys, os, re, time
import urllib
import numpy as np
from IPython import parallel
This flickr parsing code is adapted from here
In [ ]:
def extract_urls(html):
    """Extract Flickr image URLs from the HTML of a page.

    Args:
        html: page source as a string.

    Returns:
        A list with the URL captured from every ``src="..."`` attribute that
        points at a ``farmN.staticflickr.com`` / ``farmN.static.flickr.com``
        JPEG, in the order they appear. Empty list when nothing matches.
    """
    # Dots are escaped so they only match a literal "." in the host name and
    # in the ".jpg" extension; unescaped they would match any character.
    re_imageurl = re.compile(
        r'src="(http://farm\d+\.static\.?flickr\.com/\d+/\d+_\w+\.jpg)"',
        re.IGNORECASE,
    )
    # findall already returns [] when there is no match.
    return re_imageurl.findall(html)
In [ ]:
def urls_for_tag(tag='face', min_images=100, max_pages=20):
    """Collect Flickr image URLs for the given tag(s) by scraping search pages.

    Args:
        tag: search term, interpolated into the query string as-is.
        min_images: stop fetching once at least this many URLs are collected.
        max_pages: maximum number of result pages to request.

    Returns:
        A list of image URLs. It can be shorter than ``min_images`` when a
        page yields no images or ``max_pages`` is reached first.
    """
    urls = []
    page = 1
    while len(urls) < min_images and page <= max_pages:
        url = 'http://www.flickr.com/search/?q=%s&l=cc&ss=0&ct=0&mt=photos&w=all&adv=1&m=tags&page=%i' % (tag, page)
        print("fetching %s" % url)
        urlfile = urllib.urlopen(url)
        try:
            html = urlfile.read()
        finally:
            # Release the connection even when reading the response fails.
            urlfile.close()
        page_urls = extract_urls(html)
        urls.extend(page_urls)
        print("found %i images" % len(urls))
        if not page_urls:
            # An empty page means the results are exhausted; stop early.
            print("no new images")
            break
        page += 1
    return urls
In [ ]:
# Scrape until at least 500 'portrait' image URLs are collected (or pages run out).
urls = urls_for_tag(tag='portrait', min_images=500)
In [ ]:
def download_image(url, dest_dir='images'):
    """Download an image from a URL into a directory.

    The file name is the last path component of ``url``. If a file with that
    name already exists in ``dest_dir`` nothing is downloaded.

    Args:
        url: address of the image to fetch.
        dest_dir: directory to store the image in; created if missing.

    Returns:
        The path to the downloaded (or already present) image.

    Raises:
        OSError: if ``dest_dir`` cannot be created.
    """
    # Imports are local so the function is self-contained when it is shipped
    # to a remote engine.
    import os
    try:
        from urllib import urlopen  # Python 2, as used by the rest of this file
    except ImportError:
        from urllib.request import urlopen  # Python 3

    basename = url.rsplit('/', 1)[-1]
    dest = os.path.join(dest_dir, basename)
    try:
        os.makedirs(dest_dir)
    except OSError:
        # The directory already exists, possibly created a moment ago by
        # another worker running this function; anything else is an error.
        if not os.path.isdir(dest_dir):
            raise
    if os.path.exists(dest):
        print("already have %s" % dest)
        return dest
    print("downloading %s -> %s" % (url, dest))
    urlf = urlopen(url)
    try:
        data = urlf.read()
    finally:
        urlf.close()
    # Image data is binary: text mode would translate newlines on some
    # platforms and corrupt the file.
    with open(dest, 'wb') as f:
        f.write(data)
    return dest
First, initialize OpenCV for simple facial detection
In [ ]:
# Haar cascade definition file for frontal faces. The path is relative, so it
# is resolved against the current working directory.
HAAR_CASCADE_PATH = "haarcascade_frontalface_default.xml"
# if you have opencv installed via homebrew, this would be in
# /usr/local/share/OpenCV/haarcascades/
import cv
# Module-level objects read by detect_faces: the storage handed to the
# detector and the cascade loaded from HAAR_CASCADE_PATH.
storage = cv.CreateMemStorage()
cascade = cv.Load(HAAR_CASCADE_PATH)
Then define a few functions for extracting faces from images
In [ ]:
def extract_faces(image, faces):
    """Crop each face rectangle out of an image.

    Args:
        image: object exposing ``tostring()``, ``height``, ``width`` and
            ``nChannels``; its bytes are read as uint8 in
            (height, width, channel) order.
        faces: iterable of ``(x, y, w, h)`` rectangles.

    Returns:
        A list with one numpy array per rectangle, each a view of the image
        with its channel axis reversed.
    """
    # Local import so the function still works after being pushed to an engine.
    import numpy as np

    flat = np.frombuffer(image.tostring(), dtype=np.uint8)
    pixels = flat.reshape((image.height, image.width, image.nChannels))
    # Reverse the channel axis (presumably BGR -> RGB for display; confirm).
    flipped = pixels[:, :, ::-1]
    return [
        flipped[face[1]:face[1] + face[3], face[0]:face[0] + face[2]]
        for face in faces
    ]
def detect_faces(filename):
    """Load an image file with OpenCV and run the Haar face detector on it.

    Uses the module-level ``cascade`` and ``storage`` objects.

    Returns:
        ``(filename, [numpy arrays])`` with one cropped array per detected
        face, or ``None`` when the detector reports no faces.
    """
    image = cv.LoadImage(filename)
    detected = cv.HaarDetectObjects(
        image, cascade, storage, 1.2, 2,
        cv.CV_HAAR_DO_CANNY_PRUNING, (100,100),
    )
    # Each detection is ((x, y, w, h), n); only the rectangle is kept.
    boxes = [(x, y, w, h) for (x, y, w, h), n in detected] if detected else []
    if not boxes:
        return None
    return filename, extract_faces(image, boxes)
And finally, a two-step function that downloads an image from a url, and detects faces in it.
In [ ]:
def faces_in_url(url):
    """Download the image at ``url`` and run face detection on the saved file.

    Returns whatever ``detect_faces`` returns for the downloaded path.
    """
    return detect_faces(download_image(url))
If the network doesn't work, you can just generate a list of paths to images on your computer. For instance, these pictures are everything from my iPhoto thumbnails directory, so they vary in size from roughly 320x240 to 1024x768.
In [ ]:
import glob
library = os.path.expanduser("~/Pictures/2013.iphotolibrary")
# Gather every .jpg found anywhere below the library's Thumbnails folder.
pictures = []
for directory, subdirs, files in os.walk(os.path.join(library, 'Thumbnails')):
    pictures.extend(
        os.path.join(directory, fname)
        for fname in files
        if fname.endswith('.jpg')
    )
Or this one, which globs pictures from a particular folder:
In [ ]:
import glob
# Matches .jpg files exactly one directory level below images/, relative to
# the current working directory.
# NOTE(review): download_image saves straight into images/, so files it wrote
# are not matched by this pattern.
pictures = glob.glob("images/*/*.jpg")
Let's test our face detection on the first downloaded image that contains a face:
In [ ]:
# Try the scraped URLs in order and stop at the first one with a detection.
# `found` starts as None so an empty `urls` list is handled as well.
found = None
for url in urls:
    found = faces_in_url(url)
    if found:
        break

if found:
    filename, faces = found
    # One figure per face so that later faces do not overwrite earlier ones.
    for face in faces:
        plt.figure()
        plt.imshow(face)
else:
    # Without this guard the tuple unpacking above fails on None.
    print("no faces found in any of the %i urls" % len(urls))
If the network isn't kind to you, we can skip the downloads, and just use pictures we have on the filesystem:
In [ ]:
# Try the local pictures in order and stop at the first one with a detection.
# `found` starts as None so an empty `pictures` list is handled as well.
found = None
for p in pictures:
    found = detect_faces(p)
    if found:
        break

if found:
    filename, faces = found
    # One figure per face so that later faces do not overwrite earlier ones.
    for face in faces:
        plt.figure()
        plt.imshow(face)
else:
    # Without this guard the tuple unpacking above fails on None.
    print("no faces found in any of the %i pictures" % len(pictures))
Hey, that looks like a face!
First, we connect our parallel Client
In [ ]:
# Client created with no explicit connection arguments.
rc = parallel.Client()
# View obtained by slicing the whole client; used later to push the helper
# functions to the engines.
all_engines = rc[:]
# Load-balanced view; used later to map the detection task over the inputs.
view = rc.load_balanced_view()
Then we initialize OpenCV on all of the engines (identical to what we did above)
In [ ]:
%%px
# Executed via %%px: change the working directory there. Presumably needed so
# the relative cascade path and the images/ directory resolve - confirm.
%cd notebooks/parallel
In [ ]:
%%px
# Same OpenCV setup as in the local session, executed via %%px so that
# `cv`, `storage` and `cascade` exist as globals for detect_faces.
HAAR_CASCADE_PATH = "haarcascade_frontalface_default.xml"
# os and urllib are imported here as well; download_image references urllib
# as a global.
import os, urllib
import cv
storage = cv.CreateMemStorage()
cascade = cv.Load(HAAR_CASCADE_PATH)
and make sure extract_faces, detect_faces, and download_image are defined on every engine:
In [ ]:
# Send the helper functions to the engines under their own names, so that
# tasks running there can call them.
all_engines.push({
    'extract_faces': extract_faces,
    'detect_faces': detect_faces,
    'download_image': download_image,
})
Now we can iterate through all of our pictures, and detect and display any faces we find
In [ ]:
tic = time.time()

# if you are running offline, do this one:
# f = detect_faces
# source = pictures

# or you can download each image as part of the task:
f = faces_in_url
source = urls

# Submit at most the first 1000 inputs; ordered=False yields results as they
# complete rather than in submission order.
amr = view.map_async(f, source[:1000], ordered=False)

nfound = 0
for r in amr:
    if r:
        # Falsy results (no faces in that image) are skipped.
        filename, faces = r
        nfound += len(faces)
        print("%i faces found in %s" % (len(faces), filename))
        for face in faces:
            plt.imshow(face)
            plt.show()

toc = time.time()
print("found %i faces in %i images in %f s" % (nfound, len(amr), toc-tic))