CSW access with OWSLib using ISO queryables

Demonstration of how to use the OGC Catalogue Service for the Web (CSW) to find all datasets containing a specified variable that fall within a specified date range and geospatial bounding box, and then use the data access service contained in the returned metadata to retrieve and visualize the data.

Here we are accessing a Geoportal Server CSW, but in the future we should be able to point it toward any other CSW service, such as the one provided by catalog.data.gov.


In [1]:
from pylab import *
from owslib.csw import CatalogueServiceWeb
from owslib import fes
import netCDF4
import pandas as pd

In [2]:
from IPython.core.display import HTML
# Embed the geoportal web page in the notebook output as an iframe;
# being the last expression in the cell, the HTML object is what gets displayed.
HTML('<iframe src=http://geoport.whoi.edu/geoportal/ width=950 height=400></iframe>')


Out[2]:

In [3]:
# connect to CSW, explore its properties
# Candidate CSW endpoints are listed below; only the uncommented assignment
# takes effect. To try another service, comment that one out and enable a
# different line.
#endpoint = 'http://www.ngdc.noaa.gov/geoportal/csw' # NGDC Geoportal
#endpoint = 'http://www.nodc.noaa.gov/geoportal/csw'   # NODC Geoportal: granule level
#endpoint = 'http://data.nodc.noaa.gov/geoportal/csw'  # NODC Geoportal: collection level
    
#endpoint = 'http://geodiscover.cgdi.ca/wes/serviceManagerCSW/csw'  # NRCAN CUSTOM
#endpoint = 'http://geoport.whoi.edu/gi-cat/services/cswiso' # USGS Woods Hole GI_CAT
#endpoint = 'http://cida.usgs.gov/gdp/geonetwork/srv/en/csw' # USGS CIDA Geonetwork
#endpoint = 'http://cmgds.marine.usgs.gov/geonetwork/srv/en/csw' # USGS Coastal and Marine Program

#endpoint = 'http://geoport.whoi.edu/geoportal/csw' # USGS Woods Hole Geoportal 

#endpoint = 'http://geo.gov.ckan.org/csw'  # CKAN testing site for new Data.gov
#endpoint = 'https://edg.epa.gov/metadata/csw'  # EPA
endpoint = 'http://cwic.csiss.gmu.edu/cwicv1/discovery' # CEOS CWIC

# Create the CSW client for the selected endpoint, passing a timeout of 60
# (presumably seconds -- confirm against the OWSLib documentation).
csw = CatalogueServiceWeb(endpoint,timeout=60)
# Last expression in the cell: displays the CSW version reported by the client.
csw.version


Out[3]:
'2.0.2'

In [4]:
# List the names of the operations exposed on csw.operations.
[op.name for op in csw.operations]


Out[4]:
['GetCapabilities', 'DescribeRecord', 'GetRecords', 'GetRecordById']

In [5]:
# hopefully something like this will be implemented in fes soon
def dateRange(start_date='1900-01-01',stop_date='2100-01-01',constraint='overlaps'):
    """Build the pair of FES filters that restrict records to a date range.

    Parameters
    ----------
    start_date, stop_date : str
        Bounds of the requested time window. They are passed to the filters
        unchanged as literals.
    constraint : {'overlaps', 'within'}
        'overlaps' selects records whose extent intersects the window
        (startDate <= stop_date and endDate >= start_date).
        'within' selects records whose extent lies inside the window
        (startDate >= start_date and endDate <= stop_date).

    Returns
    -------
    tuple
        ``(start, stop)``: the filter on the 'startDate' property and the
        filter on the 'endDate' property, in that order.

    Raises
    ------
    ValueError
        If ``constraint`` is not one of the supported values.
    """
    # Validate first so a typo fails with a clear message instead of an
    # UnboundLocalError at the return statement.
    if constraint not in ('overlaps', 'within'):
        raise ValueError(
            "constraint must be 'overlaps' or 'within', got %r" % (constraint,))

    if constraint == 'overlaps':
        start = fes.PropertyIsLessThanOrEqualTo(propertyname='startDate', literal=stop_date)
        stop = fes.PropertyIsGreaterThanOrEqualTo(propertyname='endDate', literal=start_date)
    else:  # 'within'
        start = fes.PropertyIsGreaterThanOrEqualTo(propertyname='startDate', literal=start_date)
        stop = fes.PropertyIsLessThanOrEqualTo(propertyname='endDate', literal=stop_date)
    return start, stop

In [6]:
# Build the filters for the CSW query, using the fes filters on ISO queryables.
# The goal is to find all datasets in a bounding box and temporal extent that
# have specific keywords and also can be accessed via OPeNDAP.
# (The filters are only constructed here; the queries are issued in later cells.)

# presumably [min lon, min lat, max lon, max lat] -- confirm against fes.BBox
bbox = fes.BBox([-71.5, 39.5, -63.0, 46])
# dateRange is called with its default constraint ('overlaps')
start,stop = dateRange('1970-01-01','1979-02-01')
std_name = 'sea_water_temperature'
# match the standard name against the 'anyText' queryable
keywords = fes.PropertyIsLike(propertyname='anyText', literal=std_name)
# NOTE(review): '*' is used as the wildcard in this literal; confirm it matches
# the wildcard character that fes.PropertyIsLike declares in the request.
serviceType = fes.PropertyIsLike(propertyname='apiso:ServiceType', literal='*opendap*')

In [7]:
# Issue a GetRecords request with no constraints, limited to two records.
# In the recorded run below, the selected endpoint answered with HTTP 500.
csw.getrecords2(maxrecords=2)


---------------------------------------------------------------------------
HTTPError                                 Traceback (most recent call last)
<ipython-input-7-3cf19d5d4035> in <module>()
----> 1 csw.getrecords2(maxrecords=2)

C:\Users\rsignell\AppData\Local\Enthought\Canopy32\User\lib\site-packages\owslib-0.7_dev-py2.7.egg\owslib\csw.pyc in getrecords2(self, constraints, sortby, typenames, esn, outputschema, format, startposition, maxrecords, cql, xml, resulttype)
    339             self.request = node0
    340 
--> 341         self._invoke()
    342 
    343         if self.exceptionreport is None:

C:\Users\rsignell\AppData\Local\Enthought\Canopy32\User\lib\site-packages\owslib-0.7_dev-py2.7.egg\owslib\csw.pyc in _invoke(self)
    574             print 'self_request:'
    575             print self.request
--> 576             self.response = util.http_post(self.url, self.request, self.lang, self.timeout)
    577 
    578         # parse result see if it's XML

C:\Users\rsignell\AppData\Local\Enthought\Canopy32\User\lib\site-packages\owslib-0.7_dev-py2.7.egg\owslib\util.pyc in http_post(url, request, lang, timeout)
    285 
    286         try:
--> 287             up = urllib2.urlopen(r,timeout=timeout);
    288         except TypeError:
    289             import socket

C:\Users\rsignell\AppData\Local\ENTHOU~1\Canopy32\App\appdata\canopy-1.0.3.1262.win-x86\lib\urllib2.pyc in urlopen(url, data, timeout)
    124     if _opener is None:
    125         _opener = build_opener()
--> 126     return _opener.open(url, data, timeout)
    127 
    128 def install_opener(opener):

C:\Users\rsignell\AppData\Local\ENTHOU~1\Canopy32\App\appdata\canopy-1.0.3.1262.win-x86\lib\urllib2.pyc in open(self, fullurl, data, timeout)
    404         for processor in self.process_response.get(protocol, []):
    405             meth = getattr(processor, meth_name)
--> 406             response = meth(req, response)
    407 
    408         return response

C:\Users\rsignell\AppData\Local\ENTHOU~1\Canopy32\App\appdata\canopy-1.0.3.1262.win-x86\lib\urllib2.pyc in http_response(self, request, response)
    517         if not (200 <= code < 300):
    518             response = self.parent.error(
--> 519                 'http', request, response, code, msg, hdrs)
    520 
    521         return response

C:\Users\rsignell\AppData\Local\ENTHOU~1\Canopy32\App\appdata\canopy-1.0.3.1262.win-x86\lib\urllib2.pyc in error(self, proto, *args)
    442         if http_err:
    443             args = (dict, 'default', 'http_error_default') + orig_args
--> 444             return self._call_chain(*args)
    445 
    446 # XXX probably also want an abstract factory that knows when it makes

C:\Users\rsignell\AppData\Local\ENTHOU~1\Canopy32\App\appdata\canopy-1.0.3.1262.win-x86\lib\urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
    376             func = getattr(handler, meth_name)
    377 
--> 378             result = func(*args)
    379             if result is not None:
    380                 return result

C:\Users\rsignell\AppData\Local\ENTHOU~1\Canopy32\App\appdata\canopy-1.0.3.1262.win-x86\lib\urllib2.pyc in http_error_default(self, req, fp, code, msg, hdrs)
    525 class HTTPDefaultErrorHandler(BaseHandler):
    526     def http_error_default(self, req, fp, code, msg, hdrs):
--> 527         raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
    528 
    529 class HTTPRedirectHandler(BaseHandler):

HTTPError: HTTP Error 500: Internal Server Error
self_request:
<?xml version="1.0" encoding="ISO-8859-1" standalone="no"?>
<csw:GetRecords xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:csw="http://www.opengis.net/cat/csw/2.0.2" outputSchema="http://www.opengis.net/cat/csw/2.0.2" outputFormat="application/xml" version="2.0.2" service="CSW" resultType="results" maxRecords="2" xsi:schemaLocation="http://www.opengis.net/cat/csw/2.0.2 http://schemas.opengis.net/csw/2.0.2/CSW-discovery.xsd"><csw:Query typeNames="csw:Record"><csw:ElementSetName>summary</csw:ElementSetName></csw:Query></csw:GetRecords>

In [ ]:
# try simplest request first (no constraints): at most 5 records,
# requesting the 'full' element set name (esn)
csw.getrecords2(maxrecords=5,esn='full')
# last expression in the cell: the keys of csw.records
# (presumably the record identifiers -- confirm)
csw.records.keys()

In [ ]:
# Print the title of every record returned by the last query.
# Only the record objects are needed, so iterate over the values; .values()
# and the single-argument print(...) form behave the same on Python 2 and 3.
for item in csw.records.values():
    print(item.title)

In [ ]:
# try simple request first: a single constraint (the keyword filter only),
# at most 15 records, 'full' element set name
csw.getrecords2(constraints=[keywords],maxrecords=15,esn='full')
# last expression in the cell: the keys of csw.records
csw.records.keys()

In [ ]:
# apply all the filters using the "and" syntax: [[filter1,filter2]]
# (keyword, start/stop date, service type and bounding box filters together,
# passed as one inner list)
csw.getrecords2(constraints=[[keywords,start,stop,serviceType,bbox]],maxrecords=15,esn='full')
# last expression in the cell: the keys of csw.records
csw.records.keys()

In [ ]:
# Print the title of every record returned by the filtered query.
# Only the record objects are needed, so iterate over the values; .values()
# and the single-argument print(...) form behave the same on Python 2 and 3.
for item in csw.records.values():
    print(item.title)

In [ ]:
# get specific ServiceType URL from records
def service_urls(records,service_string='urn:x-esri:specification:ServiceType:odp:url'):
    """Collect one service URL per record for the given reference scheme.

    Parameters
    ----------
    records : mapping
        Mapping whose values expose a ``references`` iterable of dicts, each
        with 'scheme' and 'url' keys (e.g. ``csw.records``).
    service_string : str
        Scheme identifier to look for in each record's references.

    Returns
    -------
    list
        For every record that has a reference whose 'scheme' equals
        ``service_string``, the 'url' of the first such reference. Records
        without a match, or whose matching URL is None, contribute nothing.
    """
    urls = []
    # Only the record objects are used, so iterate over the values.
    # .values() works on both Python 2 and Python 3 mappings.
    for rec in records.values():
        # Take the first reference with a matching scheme, or None if the
        # record has no such reference.
        url = next((ref['url'] for ref in rec.references
                    if ref['scheme'] == service_string), None)
        if url is not None:
            urls.append(url)
    return urls

In [ ]:
# Collect the URLs of the references that use the ODP service scheme.
dap_urls = service_urls(csw.records,service_string='urn:x-esri:specification:ServiceType:odp:url')
# Print one URL per line with ".html" appended. The suffix is added to each
# URL rather than used as the join separator, so the last URL gets it too.
print("\n".join(url + ".html" for url in dap_urls))

In [ ]:
def standard_names(nc):
    '''
    Get a dictionary of variables grouped by their standard_name.

    nc is a mapping of variable name -> variable object, where each object
    provides getncattr(name) (e.g. the ``variables`` mapping of a
    netCDF4 Dataset).

    Returns a dict mapping each standard_name value to a flat list of the
    names of all variables that carry it. Variables without a
    standard_name attribute are skipped.
    '''
    d = {}
    # .items() works on both Python 2 and Python 3 mappings.
    for k, v in nc.items():
        try:
            standard_name = v.getncattr('standard_name')
        except AttributeError:
            # no standard_name attribute on this variable: skip it
            continue
        # Append to one flat list per standard_name, so that several variables
        # sharing a name yield ['a', 'b', 'c'] rather than nested lists.
        d.setdefault(standard_name, []).append(k)
    return d

In [ ]:
# hack for speed of access and plotting in this demo -- select data with '-A1H' in the URL
dap_urls = [url for url in dap_urls if '-A1H' in url]
# Print one URL per line with ".html" appended. The suffix is added to each
# URL rather than used as the join separator, so the last URL gets it too.
print("\n".join(url + ".html" for url in dap_urls))

In [ ]:
# For every selected URL: open the dataset, read the variables whose
# standard_name equals std_name, and plot them as a time series.
for url in dap_urls:
    # only the variables mapping of the dataset is used below
    nc = netCDF4.Dataset(url).variables
    # lat/lon are read here but not used later in this cell
    lat = nc['lat'][:]
    lon = nc['lon'][:]
    time_var = nc['time']
    # convert the numeric time values to dates using the variable's units
    dtime = netCDF4.num2date(time_var[:],time_var.units)
    # find list of variables for each standard_name
    d = standard_names(nc)
    # read all the data for the variables matching standard_name=std_name
    # into a dictionary of flattened arrays, keyed by variable name
    data_dict = {v: nc[v][:].flatten() for v in d[std_name]}
    # Create Pandas data frame, with time index
    ts = pd.DataFrame.from_dict(data_dict)
    ts.index=dtime
    ts.plot(figsize=(12,4))
    title(std_name)

In [ ]: