In [4]:
from multiprocessing.dummy import Pool as ThreadPool
from ftplib import FTP
import os.path

In [5]:
# Open an anonymous FTP session with a 60-second socket timeout.
ftp = FTP('ftp.epa.gov', timeout=60)
ftp.login()

# Descend one directory level at a time to the folder that holds the
# per-year subdirectories.
for segment in ('dmdnload', 'emissions', 'hourly', 'monthly'):
    ftp.cwd(segment)

# NLST sends back one name per line; gather them into a list and show it.
years = []
collect_year = years.append
ftp.retrlines('NLST', collect_year)
print(years)

# Remember the absolute path of this folder for use by later cells.
parent_directory = ftp.pwd()


['1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017']

In [ ]:
# Settings for the bulk download. The host and timeout mirror the values used
# by the listing cell that produced `parent_directory`.
FTP_HOST = 'ftp.epa.gov'
FTP_TIMEOUT = 60
DATA_DIR = 'data'
SUCCESS_LOG = 'successes.txt'
N_WORKERS = 3


def load_visited(path):
    """Return the set of file names already recorded as downloaded.

    A missing log (first run) yields an empty set instead of raising.
    Only the trailing newline is stripped, so a final line written without
    one is still read back intact.
    """
    if not os.path.exists(path):
        return set()
    with open(path, 'r') as log:
        return {line.rstrip('\n') for line in log if line.strip()}


def list_remote(remote_dir):
    """Return the names NLST reports for `remote_dir`, using a fresh session."""
    with FTP(FTP_HOST, timeout=FTP_TIMEOUT) as conn:
        conn.login()
        conn.cwd(remote_dir)
        return conn.nlst()


def zip_fetch(entry, remote_dir):
    """Download `entry` from `remote_dir` into DATA_DIR.

    Every call opens its own FTP session: an ftplib connection carries a
    single command/response stream, so sharing one between pool threads
    interleaves commands and corrupts transfers.

    The payload is written to a temporary ``.part`` file and renamed only
    after the transfer completes, so a failed download never leaves a
    truncated archive under the final name.

    Returns `entry` on success and None on failure (the error is printed).
    """
    target = os.path.join(DATA_DIR, entry)
    partial = target + '.part'
    try:
        with FTP(FTP_HOST, timeout=FTP_TIMEOUT) as conn:
            conn.login()
            conn.cwd(remote_dir)
            with open(partial, 'wb') as outfile:
                conn.retrbinary('RETR ' + entry, outfile.write)
        os.replace(partial, target)
    except Exception as exc:
        # Best effort: drop the incomplete file, report why, and carry on
        # with the remaining downloads.
        try:
            os.remove(partial)
        except OSError:
            pass
        print(entry + " failed: " + repr(exc))
        return None
    print(entry + " succeeded")
    return entry


os.makedirs(DATA_DIR, exist_ok=True)
visited = load_visited(SUCCESS_LOG)

for year in range(2001, 2017):
    # Address each year by absolute path so nothing depends on the current
    # directory of a long-lived shared session.
    remote_dir = parent_directory.rstrip('/') + '/' + str(year)
    files = list_remote(remote_dir)
    pending = [entry for entry in files if entry not in visited]

    pool = ThreadPool(N_WORKERS)
    try:
        with open(SUCCESS_LOG, 'a') as log:
            # Record each success as soon as it completes so an interrupted
            # run does not forget finished downloads.
            fetch = lambda entry, remote_dir=remote_dir: zip_fetch(entry, remote_dir)
            for result in pool.imap_unordered(fetch, pending):
                if result is not None:
                    log.write(result + '\n')
                    log.flush()
                    visited.add(result)
    finally:
        pool.close()
        pool.join()


2001al01.zip failed

In [2]:



[2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016]

In [3]:
# Scratch check: an integer year converts to the directory-name string.
str(2001)


Out[3]:
'2001'

In [ ]: