In [4]:
from multiprocessing.dummy import Pool as ThreadPool
from ftplib import FTP
import os.path
In [5]:
# Open an anonymous session on the EPA server and descend, one path
# segment at a time, to the directory holding the monthly archives.
ftp = FTP('ftp.epa.gov', timeout=60)
ftp.login()
for segment in ('dmdnload', 'emissions', 'hourly', 'monthly'):
    ftp.cwd(segment)

# Ask the server (NLST) for the names in the current directory.
years = ftp.nlst()
print(years)

# Keep the server-side path of this directory for use by later cells.
parent_directory = ftp.pwd()
In [ ]:
# Download every archive found under <parent_directory>/<year> for the
# configured year range into DATA_DIR.  Names of completed downloads are
# appended to SUCCESS_LOG so that re-running the cell resumes where the
# previous run stopped instead of fetching everything again.
#
# An ftplib.FTP object drives a single control connection, so it must not
# be used by several threads at once.  Every directory listing and every
# download therefore opens its own short-lived session rather than
# sharing the notebook-level `ftp` object between the pool workers.

FTP_HOST = 'ftp.epa.gov'        # same server as the connection cell
FTP_TIMEOUT = 60                # seconds, socket timeout per session
DATA_DIR = 'data'
SUCCESS_LOG = 'successes.txt'
FIRST_YEAR, LAST_YEAR = 2001, 2016   # inclusive range of year directories
N_WORKERS = 3                   # concurrent downloads


def _open_session(directory):
    """Return a new anonymous FTP session whose cwd is `directory`.

    The session is closed before re-raising if login or cwd fails, so a
    failed attempt does not leak a socket.
    """
    session = FTP(FTP_HOST, timeout=FTP_TIMEOUT)
    try:
        session.login()
        session.cwd(directory)
    except Exception:
        session.close()
        raise
    return session


def _download(remote_dir, entry):
    """Fetch `entry` from `remote_dir` into DATA_DIR.

    Returns the entry name on success and None on failure.  A partially
    written local file is removed on failure so that a truncated archive
    is never mistaken for a complete one.
    """
    target = os.path.join(DATA_DIR, entry)
    try:
        with _open_session(remote_dir) as session, open(target, 'wb') as outfile:
            session.retrbinary('RETR ' + entry, outfile.write)
    except Exception:
        # Best effort: report and carry on with the remaining files.
        # (Exception, not a bare except, so Ctrl-C still interrupts.)
        if os.path.exists(target):
            os.remove(target)
        print(entry + " failed")
        return None
    print(entry + " succeeded")
    return entry


def zip_fetch(n):
    """Download the n-th entry of the current `files` listing.

    Reads the module-level `files`, `visited` and `year_directory` that
    the loop below sets for the year being processed.  Returns the entry
    name when it was downloaded, or None when it was skipped or failed.
    """
    entry = files[n]
    if entry in visited:
        return None
    return _download(year_directory, entry)


os.makedirs(DATA_DIR, exist_ok=True)

# Names already downloaded by earlier runs.  The log does not exist on
# the very first run, and its last line may lack a trailing newline.
visited = set()
if os.path.exists(SUCCESS_LOG):
    with open(SUCCESS_LOG, 'r') as log:
        visited = {line.rstrip('\n') for line in log}

for year in range(FIRST_YEAR, LAST_YEAR + 1):
    year_directory = parent_directory.rstrip('/') + '/' + str(year)

    with _open_session(year_directory) as listing:
        files = listing.nlst()

    pool = ThreadPool(N_WORKERS)
    try:
        with open(SUCCESS_LOG, 'a') as log:
            # Results are consumed in the main thread as they complete, so
            # each success is persisted immediately and no lock is needed.
            for result in pool.imap_unordered(zip_fetch, range(len(files))):
                if result is not None:
                    log.write(result + '\n')
                    log.flush()
    finally:
        pool.close()
        pool.join()
In [2]:
In [3]:
# Scratch check: string form of a year, as used for the directory names.
str(2001)
Out[3]:
In [ ]: