This notebook automatically downloads a sequence of numbered PDF documents from the archive website at aomol.msa.maryland.gov and saves each one to a local folder.


In [5]:
import urllib.request
import re

In [6]:
# Common URL prefix of every document; the page number and the extension
# are appended to it to form the full download address.
base_url = "http://aomol.msa.maryland.gov/000001/000538/pdf/am538--"

# Folder (relative to the working directory) that receives the downloads.
output_path = "../download/"

# File extension shared by the remote documents and the local copies.
extension = ".pdf"

In [7]:
# Download documents 1 through 9. range(1, 10) excludes its upper bound,
# so raise the stop value to 11 if document 10 is wanted as well.
for i in range(1,10):
    print ("opening page ", i)

    # Fetch the remote PDF. The 'with' block closes the HTTP response even
    # if reading fails part-way through.
    with urllib.request.urlopen( base_url + str(i) + extension ) as response:
        data = response.read()

    # Name of the local copy of the downloaded file.
    output_name = "am538--" + str(i) + extension

    # Write the bytes to disk only after the download has succeeded, so a
    # failed request does not leave an empty file behind. The 'with' block
    # closes the file even if the write raises.
    with open( output_path + output_name, 'wb' ) as target:
        target.write( data )


opening page  1
opening page  2
opening page  3
opening page  4
opening page  5
opening page  6
opening page  7
opening page  8
opening page  9

In [ ]:


In [ ]: