CSW access with OWSLib using ISO queryables

Demonstration of how to use an OGC Catalog Service for the Web (CSW) to find all datasets containing a specified variable that fall within a specified date range and geospatial bounding box, and then use the data access service contained in the returned metadata to retrieve and visualize the data.

Here we are accessing a Geoportal Server CSW, but in the future we should be able to point it toward any other CSW service, such as the one provided by catalog.data.gov.
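
For orientation, the core OWSLib pattern used throughout this notebook boils down to a few lines. This is only a minimal sketch; the endpoint, keyword, and filter are simply the ones exercised step by step in the cells below.

from owslib.csw import CatalogueServiceWeb
from owslib import fes

# connect to the NGDC Geoportal CSW endpoint used in this notebook
csw = CatalogueServiceWeb('http://www.ngdc.noaa.gov/geoportal/csw', timeout=60)

# build a keyword filter against the ISO 'anyText' queryable and issue a GetRecords request
filt = fes.PropertyIsLike(propertyname='anyText', literal='sea_water_temperature')
csw.getrecords2(constraints=[filt], maxrecords=5, esn='full')
print csw.records.keys()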


In [1]:
from pylab import *
from owslib.csw import CatalogueServiceWeb
from owslib import fes
import netCDF4
import pandas as pd
import datetime as dt

In [2]:
from IPython.core.display import HTML
HTML('<iframe src=http://www.ngdc.noaa.gov/geoportal/ width=950 height=400></iframe>')


Out[2]:

In [3]:
# connect to CSW, explore its properties

endpoint = 'http://www.ngdc.noaa.gov/geoportal/csw' # NGDC Geoportal

#endpoint = 'http://www.nodc.noaa.gov/geoportal/csw'   # NODC Geoportal: granule level
#endpoint = 'http://data.nodc.noaa.gov/geoportal/csw'  # NODC Geoportal: collection level   
#endpoint = 'http://geodiscover.cgdi.ca/wes/serviceManagerCSW/csw'  # NRCAN CUSTOM
#endpoint = 'http://geoport.whoi.edu/gi-cat/services/cswiso' # USGS Woods Hole GI_CAT
#endpoint = 'http://cida.usgs.gov/gdp/geonetwork/srv/en/csw' # USGS CIDA Geonetwork
#endpoint = 'http://cmgds.marine.usgs.gov/geonetwork/srv/en/csw' # USGS Coastal and Marine Program
#endpoint = 'http://geoport.whoi.edu/geoportal/csw' # USGS Woods Hole Geoportal 
#endpoint = 'http://geo.gov.ckan.org/csw'  # CKAN testing site for new Data.gov
#endpoint = 'https://edg.epa.gov/metadata/csw'  # EPA
#endpoint = 'http://cwic.csiss.gmu.edu/cwicv1/discovery'  # CWIC

csw = CatalogueServiceWeb(endpoint,timeout=60)
csw.version


Out[3]:
'2.0.2'

In [4]:
[op.name for op in csw.operations]


Out[4]:
['GetCapabilities',
 'DescribeRecord',
 'GetRecords',
 'GetRecordById',
 'Transaction']
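
Beyond listing the operations, the catalog can also be asked which values a given ISO queryable supports via the GetDomain operation. A small sketch, assuming this endpoint implements GetDomain for the apiso:ServiceType property:

# ask the CSW which values the apiso:ServiceType queryable can take (GetDomain)
csw.getdomain('apiso:ServiceType', 'property')
print csw.results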

In [5]:
# hopefully something like this will be implemented in fes soon
def dateRange(start_date='1900-01-01',stop_date='2100-01-01',constraint='overlaps'):
    if constraint == 'overlaps':
        start = fes.PropertyIsLessThanOrEqualTo(propertyname='startDate', literal=stop_date)
        stop = fes.PropertyIsGreaterThanOrEqualTo(propertyname='endDate', literal=start_date)
    elif constraint == 'within':
        start = fes.PropertyIsGreaterThanOrEqualTo(propertyname='startDate', literal=start_date)
        stop = fes.PropertyIsLessThanOrEqualTo(propertyname='endDate', literal=stop_date)
    else:
        raise ValueError("constraint must be 'overlaps' or 'within'")
    return start,stop
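
For reference, the same helper can also express the stricter behaviour; a quick usage sketch (the dates are just the ones defined further down in this notebook):

# default 'overlaps': keep datasets whose time range intersects [start_date, stop_date]
start, stop = dateRange('2012-05-01', '2012-06-01')
# 'within': keep only datasets whose time range falls entirely inside [start_date, stop_date]
start, stop = dateRange('2012-05-01', '2012-06-01', constraint='within')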

In [6]:
# Perform the CSW query, using Kyle's cool new filters on ISO queryables
# find all datasets in a bounding box and temporal extent that have 
# specific keywords and also can be accessed via OPeNDAP  

box=[-120, 2.0, -110.0, 6.0]
start_date='2012-05-01'
stop_date='2012-06-01'
std_name = 'sea_water_temperature'
service_type='opendap'

In [7]:
# convert User Input into FES filters
start,stop = dateRange(start_date,stop_date)
bbox = fes.BBox(box)
keywords = fes.PropertyIsLike(propertyname='anyText', literal=std_name)
serviceType = fes.PropertyIsLike(propertyname='apiso:ServiceType', literal=('*%s*' % service_type))

In [8]:
# try simplest request first (no constraints)
csw.getrecords2(maxrecords=5,esn='full')
csw.records.keys()


Out[8]:
['ww3_wave/multi_1.at_10m.d20100509t12z.grib2',
 'ww3_wave/multi_1.at_10m.d20100510t00z.grib2',
 'ww3_wave/multi_1.at_10m.d20100510t06z.grib2',
 'ww3_wave/multi_1.at_10m.d20100509t06z.grib2',
 'ww3_wave/multi_1.at_10m.d20100510t12z.grib2']
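
Only the first page of matches comes back because of maxrecords=5; after any getrecords2 call, csw.results reports the total hit count and paging information:

# total matches vs. records returned in this page, plus the index of the next record
print csw.results['matches'], csw.results['returned'], csw.results['nextrecord']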

In [9]:
for rec,item in csw.records.iteritems():
    print item.title


US National Weather Service - NCEP(WMC) Global Multi-Grid Wave Model Forecast products
US National Weather Service - NCEP(WMC) Global Multi-Grid Wave Model Forecast products
US National Weather Service - NCEP(WMC) Global Multi-Grid Wave Model Forecast products
US National Weather Service - NCEP(WMC) Global Multi-Grid Wave Model Forecast products
US National Weather Service - NCEP(WMC) Global Multi-Grid Wave Model Forecast products

In [10]:
# next, apply just the keyword constraint
csw.getrecords2(constraints=[keywords],maxrecords=15,esn='full')
csw.records.keys()


Out[10]:
['data/oceansites/DATA/T2N95W/OS_T2N95W_QM003A_R_TEMP.nc',
 'data/oceansites/DATA/T8S95W/OS_T8S95W_DM032A_R_TEMP.nc',
 'data/oceansites/DATA/T0N110W/OS_T0N110W_PM979B_R_TEMP.nc',
 'data/oceansites/DATA/T5S140W/OS_T5S140W_DM035A_R_TEMP.nc',
 'data/oceansites/DATA/T5S125W/OS_T5S125W_DM023B_R_TEMP.nc',
 'ndbcSosWTemp',
 'data/oceansites/DATA/T5N110W/OS_T5N110W_PM997A_R_TEMP.nc',
 'data/oceansites/DATA/T8S125W/OS_T8S125W_DM034A_R_TEMP.nc',
 'data/oceansites/DATA/T2S95W/OS_T2S95W_QM001A_R_TEMP.nc',
 'data/oceansites/DATA/T2N110W/OS_T2N110W_PM978B_R_TEMP.nc',
 'data/oceansites/DATA/T5N140W/OS_T5N140W_DM036A_R_TEMP.nc',
 'data/oceansites/DATA/T5N95W/OS_T5N95W_QM004A_R_TEMP.nc',
 'data/oceansites/DATA/T2S125W/OS_T2S125W_PM983B_R_TEMP.nc',
 'data/oceansites/DATA/T8N125W/OS_T8N125W_QM006A_R_TEMP.nc',
 'data/oceansites/DATA/T2S110W/OS_T2S110W_PM998A_R_TEMP.nc']

In [11]:
# apply all the filters using the "and" syntax: [[filter1,filter2]]
csw.getrecords2(constraints=[[keywords,start,stop,serviceType,bbox]],maxrecords=15,esn='full')
csw.records.keys()


Out[11]:
['data/oceansites/DATA/T2N110W/OS_T2N110W_PM978B_R_TEMP.nc',
 'data/oceansites/DATA/T5N110W/OS_T5N110W_PM997A_R_TEMP.nc']
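
For reference, OWSLib's getrecords2 treats each inner list as an AND group and combines the top-level entries with OR, so an "A or B" query would look like the sketch below. It is left commented out (matching the style of the alternative endpoints above) so that the two TAO records found here remain in csw.records for the cells that follow.

# flat top-level entries = OR; an inner list = AND group
# this would match records satisfying (keywords AND bbox) OR serviceType:
# csw.getrecords2(constraints=[[keywords, bbox], serviceType], maxrecords=15, esn='full')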

In [12]:
for rec,item in csw.records.iteritems():
    print item.title


TAO Legacy Production Mooring Data
TAO Legacy Production Mooring Data

In [13]:
# get specific ServiceType URL from records
def service_urls(records,service_string='urn:x-esri:specification:ServiceType:odp:url'):
    urls=[]
    for key,rec in records.iteritems():
        #create a generator object, and iterate through it until the match is found
        #if not found, gets the default value (here "none")
        url = next((d['url'] for d in rec.references if d['scheme'] == service_string), None)
        if url is not None:
            urls.append(url)
    return urls
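
If the OPeNDAP URL is not found for a record, it helps to see which scheme strings the record actually advertises; a small inspection sketch over the references list of each returned record:

# list every (scheme, url) pair advertised by each record
for key, rec in csw.records.iteritems():
    print key
    for ref in rec.references:
        print '   ', ref['scheme'], ref['url']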

In [14]:
dap_urls = service_urls(csw.records,service_string='urn:x-esri:specification:ServiceType:odp:url')
print "\n".join(dap_urls)


http://dods.ndbc.noaa.gov/thredds/dodsC/data/oceansites/DATA/T2N110W/OS_T2N110W_PM978B_R_TEMP.nc
http://dods.ndbc.noaa.gov/thredds/dodsC/data/oceansites/DATA/T5N110W/OS_T5N110W_PM997A_R_TEMP.nc

In [15]:
def standard_names(nc):
    '''
    get a dictionary mapping each standard_name to the list of variables that use it
    '''
    d = {}
    for k, v in nc.iteritems():
        try:
            standard_name = v.getncattr('standard_name')
            d.setdefault(standard_name, []).append(k)
        except AttributeError:
            # variable has no standard_name attribute
            pass
    return d
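
A quick way to sanity-check the helper is to run it against one of the OPeNDAP endpoints found above (illustrative only; assumes dap_urls is non-empty):

# map standard_name -> list of variable names for the first dataset found above
nc = netCDF4.Dataset(dap_urls[0]).variables
print standard_names(nc)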

In [16]:
for url in dap_urls:
    nc = netCDF4.Dataset(url).variables
    lat = nc['LATITUDE'][:]
    lon = nc['LONGITUDE'][:]
    tvar = nc['TIME']
    
    istart = netCDF4.date2index(dt.datetime.strptime(start_date,'%Y-%m-%d'),tvar,select='nearest')
    istop = netCDF4.date2index(dt.datetime.strptime(stop_date,'%Y-%m-%d'),tvar,select='nearest')
    dtime = netCDF4.num2date(tvar[istart:istop],tvar.units)
    # find the list of variables for each standard_name in this dataset
    d = standard_names(nc)
    # read all variables matching standard_name=std_name into a dictionary
    data_dict={}
    lev=0
    for v in d[std_name]:
        data_dict[v]=nc[v][istart:istop,lev].flatten()
    # create a Pandas data frame with a time index and plot it
    ts = pd.DataFrame.from_dict(data_dict)
    ts.index=dtime
    ts.plot(figsize=(12,4));
    title(std_name)