CSW access with OWSLib using ISO queryables

Test to see if we can SEARCH, ACCESS and USE data using IOOS supported service protocols.

  1. SEARCH a CSW Endpoint to find all datasets that: are containing within a specific bounding box have a specified variable of interest fall withing a specified date range contain a specific data access service type

  2. ACCESS data from the discovered data endpoints

  3. USE the data and see if we can use the data (to the extent of making a time series plot with a labeled x-axis).


In [1]:
from pylab import *
from owslib.csw import CatalogueServiceWeb
from owslib import fes
import random
import netCDF4
import pandas as pd
import datetime as dt

These are tags NGDC would like tested:

apiso:ResponsiblePartyName
apiso:Keywords
apiso:KeywordType
apiso:VertExtent_min
apiso:VertExtent_max
apiso:ServiceType
apiso:ServiceTypeVersion
apiso:OperatesOn
resource.url
website.url

In [2]:
#from IPython.core.display import HTML
#HTML('<iframe src=http://www.ngdc.noaa.gov/geoportal/ width=950 height=400></iframe>')

In [3]:
# connect to CSW, explore it's properties

endpoint = 'http://www.ngdc.noaa.gov/geoportal/csw' # NGDC Geoportal

#endpoint = 'http://www.nodc.noaa.gov/geoportal/csw'   # NODC Geoportal: granule level
#endpoint = 'http://data.nodc.noaa.gov/geoportal/csw'  # NODC Geoportal: collection level   
#endpoint = 'http://geodiscover.cgdi.ca/wes/serviceManagerCSW/csw'  # NRCAN CUSTOM
#endpoint = 'http://geoport.whoi.edu/gi-cat/services/cswiso' # USGS Woods Hole GI_CAT
#endpoint = 'http://cida.usgs.gov/gdp/geonetwork/srv/en/csw' # USGS CIDA Geonetwork
#endpoint = 'http://cmgds.marine.usgs.gov/geonetwork/srv/en/csw' # USGS Coastal and Marine Program
#endpoint = 'http://geoport.whoi.edu/geoportal/csw' # USGS Woods Hole Geoportal 
#endpoint = 'http://geo.gov.ckan.org/csw'  # CKAN testing site for new Data.gov
#endpoint = 'https://edg.epa.gov/metadata/csw'  # EPA
#endpoint = 'http://cwic.csiss.gmu.edu/cwicv1/discovery'  # CWIC

csw = CatalogueServiceWeb(endpoint,timeout=60)
csw.version


Out[3]:
'2.0.2'

In [4]:
[op.name for op in csw.operations]


Out[4]:
['GetCapabilities',
 'DescribeRecord',
 'GetRecords',
 'GetRecordById',
 'Transaction']

In [5]:
# hopefully something like this will be implemented in fes soon
def dateRange(start_date='1900-01-01',stop_date='2100-01-01',constraint='overlaps'):
    if constraint == 'overlaps':
        start = fes.PropertyIsLessThanOrEqualTo(propertyname='apiso:TempExtent_begin', literal=stop_date)
        stop = fes.PropertyIsGreaterThanOrEqualTo(propertyname='apiso:TempExtent_end', literal=start_date)
    elif constraint == 'within':
        start = fes.PropertyIsGreaterThanOrEqualTo(propertyname='apiso:TempExtent_begin', literal=start_date)
        stop = fes.PropertyIsLessThanOrEqualTo(propertyname='apiso:TempExtent_end', literal=stop_date)
    return start,stop

In [6]:
# hopefully something like this will be implemented in fes soon
def zRange(min_z='-5000', max_z='0', constraint='overlaps'):
    if constraint == 'overlaps':
        zmin = fes.PropertyIsLessThanOrEqualTo(propertyname='apiso:VertExtent_min', literal=min_z)
        zmax = fes.PropertyIsGreaterThanOrEqualTo(propertyname='apiso:VertExtent_max', literal=max_z)
    elif constraint == 'within':
        zmin = fes.PropertyIsGreaterThanOrEqualTo(propertyname='apiso:VertExtent_min', literal=min_z)
        zmax = fes.PropertyIsLessThanOrEqualTo(propertyname='apiso:VertExtent_max', literal=max_z)
    return zmin,zmax

In [7]:
# User Input for query

#box=[-120, 2.0, -110.0, 6.0] #  oceansites
box=[-160, 19, -156, 23]   # pacioos
start_date='2012-05-01'
stop_date='2012-06-01'
min_z = '-5000'
max_z = '0'
responsible_party = 'Margaret McManus'
std_name = 'sea_water_temperature'
service_type='opendap'

In [8]:
# convert User Input into FES filters
start,stop = dateRange(start_date,stop_date)
zmin,zmax = zRange(min_z,max_z,constraint='within')
bbox = fes.BBox(box)
#keywords = fes.PropertyIsLike(propertyname='anyText', literal=std_name)
#keywords = fes.PropertyIsLike(propertyname='apiso:Keywords', literal=std_name)
keywords = fes.PropertyIsEqualTo(propertyname='apiso:Keywords', literal=std_name)
serviceType = fes.PropertyIsLike(propertyname='apiso:ServiceType', literal=('*%s*' % service_type))
ResponsiblePartyName = fes.PropertyIsEqualTo(propertyname='apiso:ResponsiblePartyName', literal=responsible_party)
#serviceType = fes.PropertyIsEqualTo(propertyname='apiso:ServiceType', literal=service_type)

In [9]:
# try simple request using only keywords

csw.getrecords2(constraints=[keywords],maxrecords=5)
csw.records.keys()


Out[9]:
['PWS_L1_FCST',
 'Central and Northern California Ocean Observing System SOS',
 'MB_FCST',
 'PWS_L0_FCST',
 'SCB_DAS']

In [10]:
# try request using multiple filters "and" syntax: [[filter1,filter2]]
csw.getrecords2(constraints=[[keywords,start,stop,serviceType,bbox]],maxrecords=5,esn='full')
csw.records.keys()


Out[10]:
['NS10agg', 'NS01agg', 'NS02agg', 'NS03agg', 'NS04agg']

In [11]:
# try request using multiple filters "and" syntax: [[filter1,filter2]]
csw.getrecords2(constraints=[[keywords,start,stop,bbox]],maxrecords=5,esn='full')
csw.records.keys()


Out[11]:
['National Data Buoy Center SOS',
 'NOAA.NOS.CO-OPS SOS',
 'NS01agg',
 'NS02agg',
 'NS03agg']

In the next cell we take a look at the references for a random record. The ServiceType for the OPeNDAP Data URL below is ServiceType:odp:url, which seems correct!


In [12]:
choice=random.choice(list(csw.records.keys()))
print choice
csw.records[choice].references


National Data Buoy Center SOS
Out[12]:
[{'scheme': 'urn:x-esri:specification:ServiceType:distribution:url',
  'url': 'http://sdf.ndbc.noaa.gov/sos/server.php?service=SOS&request=GetCapabilities'},
 {'scheme': 'urn:x-esri:specification:ServiceType:distribution:url',
  'url': 'http://sdf.ndbc.noaa.gov/sos/server.php?service=SOS&request=GetCapabilities'},
 {'scheme': 'urn:x-esri:specification:ServiceType:sos:url',
  'url': 'http://sdf.ndbc.noaa.gov/sos/server.php?service=SOS&request=GetCapabilities'},
 {'scheme': 'urn:x-esri:specification:ServiceType:sos:url',
  'url': 'http://sdf.ndbc.noaa.gov/sos/server.php?service=SOS&request=GetCapabilities'}]

In the next cell we try using the ResponsiblePartyName, we are looking for 'Jim Potemra', who we see in the ISO record for 'NS10agg' here
http://oos.soest.hawaii.edu/thredds/iso/hioos/nss/ns10agg?catalog=http%3A%2F%2Foos.soest.hawaii.edu%2Fthredds%2Fidd%2Fnss_hioos.html&dataset=NS10agg


In [13]:
# try adding responsible party role 
csw.getrecords2(constraints=[ResponsiblePartyName],maxrecords=10,esn='full')
csw.records.keys()


Out[13]:
['NS05agg',
 'NS10agg',
 'NS09agg',
 'NS15agg',
 'NS04agg',
 'NS06agg',
 'NS03agg',
 'NS11agg',
 'NS07agg',
 'NS08agg']

Now print out some titles


In [14]:
for rec,item in csw.records.iteritems():
    print item.title


PacIOOS Nearshore Sensor 05: Pago Pago, American Samoa
PacIOOS Nearshore Sensor 10: Maunalua Bay, Oahu, Hawaii
PacIOOS Nearshore Sensor 09: Cetti Bay, Guam
PacIOOS Nearshore Sensor 15: Pago Bay, Guam
PacIOOS Nearshore Sensor 04: Waikiki Aquarium, Oahu, Hawaii
PacIOOS Nearshore Sensor 06: Pohnpei, Micronesia
PacIOOS Nearshore Sensor 03: Hilton Hawaiian Pier, Oahu, Hawaii
PacIOOS Nearshore Sensor 11: Saipan, CNMI
PacIOOS Nearshore Sensor 07: Majuro, Marshall Islands
PacIOOS Nearshore Sensor 08: Koror, Palau

In the next cell we try using a vertical extent range, which doesn't seem to be working, as we are looking for z values between [-5000, 0] and we see in the ISO record for 'NS10agg' here:
http://oos.soest.hawaii.edu/thredds/iso/hioos/nss/ns10agg?catalog=http%3A%2F%2Foos.soest.hawaii.edu%2Fthredds%2Fidd%2Fnss_hioos.html&dataset=NS10agg
that the vertical extent is from [-2, -2]


In [15]:
csw.getrecords2(constraints=[[keywords,start,stop,serviceType,bbox,zmin,zmax]],maxrecords=5,esn='full')
csw.records.keys()


Out[15]:
['NS10agg', 'NS01agg', 'NS02agg', 'NS03agg', 'NS04agg']

Define a function that will return the endpoint for a specified service type


In [16]:
# get specific ServiceType URL from records
def service_urls(records,service_string='urn:x-esri:specification:ServiceType:odp:url'):
    urls=[]
    for key,rec in records.iteritems():
        #create a generator object, and iterate through it until the match is found
        #if not found, gets the default value (here "none")
        url = next((d['url'] for d in rec.references if d['scheme'] == service_string), None)
        if url is not None:
            urls.append(url)
    return urls

Print out all the OPeNDAP Data URL endpoints


In [17]:
dap_urls = service_urls(csw.records,service_string='urn:x-esri:specification:ServiceType:odp:url')
print "\n".join(dap_urls)


http://oos.soest.hawaii.edu/thredds/dodsC/hioos/nss/ns10agg
http://oos.soest.hawaii.edu/thredds/dodsC/hioos/nss/ns01agg
http://oos.soest.hawaii.edu/thredds/dodsC/hioos/nss/ns02agg
http://oos.soest.hawaii.edu/thredds/dodsC/hioos/nss/ns03agg
http://oos.soest.hawaii.edu/thredds/dodsC/hioos/nss/ns04agg

In [18]:
def standard_names(ncv):
    '''
    get dictionary of variables with standard_names
    '''
    d={}
    for k,v in ncv.iteritems():
        try:
            standard_name=v.getncattr('standard_name')
            try:
                d[standard_name]=[d[standard_name],[k]]
            except:
                d[standard_name]=[k]
        except:
            pass
    return d

In [19]:
for url in dap_urls:
    ncv = netCDF4.Dataset(url).variables
    print ncv.keys()
    lat = ncv['lon'][:]
    lon = ncv['lat'][:]
    tvar = ncv['time']
    istart = netCDF4.date2index(dt.datetime.strptime(start_date,'%Y-%m-%d'),tvar,select='nearest')
    istop = netCDF4.date2index(dt.datetime.strptime(stop_date,'%Y-%m-%d'),tvar,select='nearest')
    if istart != istop:
        dtime = netCDF4.num2date(tvar[istart:istop],tvar.units)
        # make a dictionary containing all data from variables that matched the standard_name
        # find list of variables for each standard_name
        d = standard_names(ncv)
        # find all the variables matching standard_name=std_name
        print d[std_name]
        # read all the data into a dictionary
        data_dict={}
        lev=0
        for v in d[std_name]:
            data_dict[v]=ncv[v][istart:istop].flatten()
        # Create Pandas data frame, with time index
        ts = pd.DataFrame.from_dict(data_dict)
        ts.index=dtime
        ts.plot(figsize=(12,4));
        title(std_name)


[u'z', u'lat', u'lon', u'time', u'temp', u'cond', u'turb', u'flor', u'salt', u'pres']
[u'temp']
[u'z', u'lat', u'lon', u'time', u'temp', u'cond', u'turb', u'flor', u'salt', u'pres']
[u'temp']
[u'z', u'lat', u'lon', u'time', u'temp', u'cond', u'turb', u'flor', u'salt']
[u'z', u'lat', u'lon', u'time', u'temp', u'cond', u'salt', u'pres']
[u'temp']
[u'z', u'lat', u'lon', u'time', u'temp', u'cond', u'salt', u'pres']
[u'temp']