In [106]:
import json
from urllib2 import urlopen
from urllib import urlencode
from cStringIO import StringIO
from PIL import Image
from numpy import array, float32

Define a function that can be imported by other modules:


In [70]:
# Scratch cell: run one title search against the JSON search endpoint and keep
# the decoded response in `results`.
criteria={'title':'''Harry Potter and the Philosopher's Stone'''}
legalKeys=set(("title", "author", "q"))
searchUrlBase='http://openlibrary.org/search.json?'
# Reject an empty criteria dict or any key outside legalKeys before hitting the network.
assert len(criteria)>0 and set(criteria.keys()) <= legalKeys, "No keys or illegal keys found in criteria"
response=urlopen(searchUrlBase+urlencode(criteria))
try:
    # read() returns the whole body in one go; no need to join readlines().
    results=json.loads(response.read())
finally:
    # Always release the connection, even if reading or JSON decoding fails.
    response.close()

In [107]:
# Scratch cell: download the size-'M' cover for edition OL9142275M and show the
# shape of the decoded pixel array.
coverBaseUrl='http://covers.openlibrary.org/b/olid/{0}-{1}.jpg'
response = urlopen(coverBaseUrl.format('OL9142275M','M'))
try:
    # NOTE(review): `file` shadows the Python 2 builtin; the name is kept so any
    # later cell that reads it keeps working -- consider renaming.
    file = StringIO(response.read())
finally:
    # Always release the connection, even if the download fails midway.
    response.close()
img = Image.open(file)
# Use the ndarray's own .shape attribute: a bare `shape` function is never
# imported in this notebook, so `shape(...)` fails on a fresh kernel.
array(img, float32).shape


Out[107]:
(187, 114, 3)

In [111]:
# Module-level settings read by searchAndGetBookInfo.

# Search criteria keys the function accepts.
legalKeys = {"title", "author", "q"}

# Endpoint prefixes; the url-encoded query is appended to the search prefix and
# the cover template is filled with (edition key, size letter).
searchBaseUrl = 'http://openlibrary.org/' + 'search.json?'
coverBaseUrl = 'http://covers.openlibrary.org/' + 'b/olid/{0}-{1}.jpg'

# Fields copied from each search hit into the returned record.
fieldsToKeep = (
    'cover_edition_key',
    'title',
    'author_name',
)

def _readUrl(url):
    '''Fetch ``url`` and return its raw body, closing the connection afterwards.'''
    response = urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def searchAndGetBookInfo(criteria, coverImgSize='M', limit=1):
    '''
    Search for books and return the matching records together with their cover images.

    Input:
        criteria: A dictionary that specifies the search criteria. Legal keys are: "title", "author", and "q"
        for general search. Must contain at least one key.

        coverImgSize: Size letter substituted into the cover URL. There are 'L', 'M', 'S'.

        limit: When there are multiple hits for the search, the maximum number of records to return (with
        book cover images). None means "no cap": every hit that has a cover is returned.
    Returns:
        numOfHits: Integer represents the number of hits of this search (the 'numFound' field of the response).
        results: List of at most `limit` dictionaries with fields: "cover_edition_key", "title", "author_name",
        "cover_img". Hits without a "cover_edition_key" are skipped and do not count towards `limit`.
        "title" / "author_name" are None when the hit lacks them. "cover_img" is a float32 numpy array
        built from the downloaded cover image.
    Raises:
        AssertionError: if criteria is empty or contains a key outside legalKeys.
    '''
    assert len(criteria)>0 and set(criteria.keys()) <= legalKeys, "No keys or illegal keys found in criteria"

    # Use searchBaseUrl, the constant defined next to this function; the previous
    # spelling (searchUrlBase) only resolved through a leftover scratch-cell variable.
    response = json.loads(_readUrl(searchBaseUrl + urlencode(criteria)))
    numOfHits = response['numFound']

    results = []
    for doc in response['docs']:
        # Count collected records, not the raw hit index, so skipped hits
        # (no cover) do not eat into the limit.
        if limit is not None and len(results) >= limit:
            break
        if 'cover_edition_key' not in doc: # if we can't fetch book cover image, ignore this record
            continue
        # Keep only the wanted fields; .get() tolerates hits missing title/author_name.
        record = {f: doc.get(f) for f in fieldsToKeep}
        # create numpy array from remote jpg files
        coverUrl = coverBaseUrl.format(doc['cover_edition_key'], coverImgSize)
        img = Image.open(StringIO(_readUrl(coverUrl)))
        record['cover_img'] = array(img, float32)
        results.append(record)
    return numOfHits, results

In [119]:
# Demo: search for one title and print the hit count plus the first record's
# title, authors and cover-array shape.
criteria={'title':'''Harry Potter and the Philosopher's Stone'''}
# The function is named searchAndGetBookInfo; the former call to
# searchAndBookInfo referred to a name that is not defined in this notebook.
n, results=searchAndGetBookInfo(criteria)
print((n, results[0]['title'], results[0]['author_name'], results[0]['cover_img'].shape))


(19, u"Harry Potter and the Philosopher's Stone", [u'J. K. Rowling'], (187, 114, 3))