How to download and save images from url


In [5]:
import urllib.request
from bs4 import BeautifulSoup

In [6]:
def make_soup(url):
    with urllib.request.urlopen(url) as web:
        html = web.read()
    return BeautifulSoup(html,'html.parser')

In [7]:
def get_images(url):
    soup = make_soup(url)
    images = [img for img in soup.findAll('img')]
    images = images[1:len(images)]
    print(str(len(images)) + "images found.")
    print('Downloading images to current working directory.')
    image_links = [each.get('src') for each in images]
    for each in image_links:
        filename=each.split('/')[-1]
        filename=filename.split('?')[0]
        urllib.request.urlretrieve(each, filename)
    return image_links

In [8]:
soup = get_images('https://www.google.com.br/search?q=bar+graph&tbm=isch')


---------------------------------------------------------------------------
HTTPError                                 Traceback (most recent call last)
<ipython-input-8-f03afc1d2802> in <module>()
----> 1 soup = get_images('https://www.google.com.br/search?q=bar+graph&tbm=isch')

<ipython-input-7-d9e097da9fd0> in get_images(url)
      1 def get_images(url):
----> 2     soup = make_soup(url)
      3     images = [img for img in soup.findAll('img')]
      4     images = images[1:len(images)]
      5     print(str(len(images)) + "images found.")

<ipython-input-6-f020dc3b1f34> in make_soup(url)
      1 def make_soup(url):
----> 2     with urllib.request.urlopen(url) as web:
      3         html = web.read()
      4     return BeautifulSoup(html,'html.parser')

/Users/mviana/anaconda3/lib/python3.5/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    160     else:
    161         opener = _opener
--> 162     return opener.open(url, data, timeout)
    163 
    164 def install_opener(opener):

/Users/mviana/anaconda3/lib/python3.5/urllib/request.py in open(self, fullurl, data, timeout)
    469         for processor in self.process_response.get(protocol, []):
    470             meth = getattr(processor, meth_name)
--> 471             response = meth(req, response)
    472 
    473         return response

/Users/mviana/anaconda3/lib/python3.5/urllib/request.py in http_response(self, request, response)
    579         if not (200 <= code < 300):
    580             response = self.parent.error(
--> 581                 'http', request, response, code, msg, hdrs)
    582 
    583         return response

/Users/mviana/anaconda3/lib/python3.5/urllib/request.py in error(self, proto, *args)
    507         if http_err:
    508             args = (dict, 'default', 'http_error_default') + orig_args
--> 509             return self._call_chain(*args)
    510 
    511 # XXX probably also want an abstract factory that knows when it makes

/Users/mviana/anaconda3/lib/python3.5/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    441         for handler in handlers:
    442             func = getattr(handler, meth_name)
--> 443             result = func(*args)
    444             if result is not None:
    445                 return result

/Users/mviana/anaconda3/lib/python3.5/urllib/request.py in http_error_default(self, req, fp, code, msg, hdrs)
    587 class HTTPDefaultErrorHandler(BaseHandler):
    588     def http_error_default(self, req, fp, code, msg, hdrs):
--> 589         raise HTTPError(req.full_url, code, msg, hdrs, fp)
    590 
    591 class HTTPRedirectHandler(BaseHandler):

HTTPError: HTTP Error 403: Forbidden

In [14]:
import json
import os
import time
import requests
from PIL import Image
from StringIO import StringIO
from requests.exceptions import ConnectionError

def go(query, path):
  """Download full size images from Google image search.
  Don't print or republish images without permission.
  I used this to train a learning algorithm.
  """
  BASE_URL = 'https://ajax.googleapis.com/ajax/services/search/images?'\
             'v=1.0&q=' + query + '&start=%d'

  BASE_PATH = os.path.join(path, query)

  if not os.path.exists(BASE_PATH):
    os.makedirs(BASE_PATH)

  start = 0 # Google's start query string parameter for pagination.
  while start < 60: # Google will only return a max of 56 results.
    r = requests.get(BASE_URL % start)
    for image_info in json.loads(r.text)['responseData']['results']:
      url = image_info['unescapedUrl']
      try:
        image_r = requests.get(url)
      except ConnectionError, e:
        print 'could not download %s' % url
        continue

      # Remove file-system path characters from name.
      title = image_info['titleNoFormatting'].replace('/', '').replace('\\', '')

      file = open(os.path.join(BASE_PATH, '%s.jpg') % title, 'w')
      try:
        Image.open(StringIO(image_r.content)).save(file, 'JPEG')
      except IOError, e:
        # Throw away some gifs...blegh.
        print 'could not save %s' % url
        continue
      finally:
        file.close()

    print start
    start += 4 # 4 images per page.

    # Be nice to Google and they'll be nice back :)
    time.sleep(1.5)

# Example use
go('landscape', '/Users/mviana/anaconda3/MyBooks/DownloadImages')


  File "<ipython-input-14-a942706b5975>", line 28
    except ConnectionError, e:
                          ^
SyntaxError: invalid syntax

In [ ]: