Download script for OMI data

Example of how to download monthly summaries of asc.gz files from temis.nl server


In [ ]:
import os
from glob import glob
import wget
import pandas as pd
import requests
from tqdm import *

In [ ]:
def make_url(year, month, kind):
    """Return the download link for the file of a given year and month.

    year, month and kind are all strings (e.g. '2016', '01', '.asc.gz').
    The link has the form <server folder>/<year>/<month>/no2_<year><month><kind>.
    """
    server_folder = "http://temis.nl/airpollution/no2col/data/omi/data_v2/"
    # Files live in a per-year / per-month sub-folder on the server
    remote_dir = server_folder + year + "/" + month + "/"
    remote_name = "no2_" + year + month + kind
    return remote_dir + remote_name

def make_target(year, month, kind):
    """Return the local path, inside Data/, where a downloaded file is stored.

    year, month and kind are strings; the result is Data/<year><month><kind>.
    """
    local_path = "Data/" + year
    local_path = local_path + month + kind
    return local_path

def download_file(year, month, kind):
    """
    Download one monthly file from the server into the local Data/ folder.

    Must specify:
    year = string of YYYY
    month = string of MM (must be two digit, e.g. '01')
    kind = string of file extension type (e.g. '.asc.gz' or '.grd.gz')

    The Data/ folder is created first if it does not exist yet.

    Raises IOError("File not found") when the download fails, so callers
    can wrap this in try/except to skip months that are unavailable.
    KeyboardInterrupt and SystemExit are deliberately NOT converted, so
    that Ctrl-C still stops the program instead of being reported as a
    missing file.

    e.g.
    >>> download_file('2016', '01', '.asc.gz')
    """
    if not os.path.isdir("Data"):
        os.makedirs("Data")
    url = make_url(year, month, kind)
    target = make_target(year, month, kind)
    try:
        wget.download(url, out=target)
    except Exception:
        # `except Exception` (not a bare `except`) lets KeyboardInterrupt /
        # SystemExit propagate; every ordinary failure is still reported to
        # the caller as the same IOError as before.
        raise IOError("File not found")

In [ ]:
# Single-file example: fetch the '.grd.gz' file for May 2016
download_file(year='2016', month='05', kind='.grd.gz')

If there is no Data/ folder in the local folder, one will be made. If the requested [year][month] file (e.g. no2_201605.grd.gz) doesn't exist on the server, the download will fail and an IOError will be raised instead. (This is so you can use a try: except: syntax to run a loop.)

Download multiple files

To download multiple files you will need to iterate over a range of date-time objects. Each element of the list can then be used to get a month / year pair, used as inputs to the downloader.


In [ ]:
def download_batch(start, end, kind):
    """
    Download every monthly file between a start and an end date.

    A local Data folder will be created if none exists.
    All files present in temis.nl/airpollution/no2col/data/omi/data_v2/
    for the requested months will be downloaded there.
    (Even though dates are given to days, the time steps are monthly.)

    start = date string 'YYYYMMDD'
    end =  date string 'YYYYMMDD'
    kind = file type: '.asc.gz' or '.grd.gz'

    Months are enumerated by their month-end date, so a month is included
    only if its last day lies within [start, end]; e.g. end='20161001'
    stops at September 2016, while end='20161031' includes October.

    Months whose download fails are collected and listed at the end
    instead of aborting the batch. Afterwards, leftover 'Data/*.tmp' files
    and files matching 'Data/*(?)*' (duplicate copies) are deleted.

    e.g.
    >>> download_batch(start='20041001', end='20161001', kind='.asc.gz')
    """
    # The MonthEnd offset object is equivalent to the 'M' alias, which is
    # deprecated for offsets in newer pandas releases; the object form is
    # accepted by old and new versions alike.
    dates = pd.date_range(start=start, end=end, freq=pd.offsets.MonthEnd())
    missing = []
    for date in tqdm(dates):
        month = "{0:02d}".format(date.month)
        year = str(date.year)
        try:
            download_file(year, month, kind)
        except Exception:
            # Best effort: note the failure and carry on with the next month.
            # Not a bare `except`, so Ctrl-C can still interrupt the batch.
            missing.append((year, month))
    # Print info on missing files, if any exist
    for pair in missing:
        print("{0}: No corresponding file found".format(pair))
    # Clean up any partial files
    for badfile in glob('Data/*.tmp'):
        os.remove(badfile)
    # Clean up any duplicated files (names containing e.g. "(1)")
    for duplicate in glob('Data/*(?)*'):
        # os.remove returns None, so there is nothing useful to print here
        os.remove(duplicate)

In [ ]:
# Batch example: fetch the monthly '.grd.gz' files for the 2016 date range below
download_batch(kind='.grd.gz', start='20160101', end='20161001')