Demonstration of how to use OGC Catalog Services for the Web (CSW) to find all datasets containing a specified variable that fall within a specified date range and geospatial bounding box, and then use the data access service contained in the returned metadata to retrieve and visualize the data.
Here we are accessing a Geoportal Server CSW, but in the future we should be able to point it toward any other CSW service, such as the one provided by catalog.data.gov.
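Swapping catalogs should only require changing the endpoint URL handed to owslib. As a minimal sketch (the catalog.data.gov CSW URL below is an assumption and may differ from the service that eventually goes live), pointing the same client at another CSW looks like this:
In [ ]:
# sketch: point the owslib CSW client at a different catalog
# (the data.gov URL is an assumption -- substitute any CSW 2.0.2 base URL)
from owslib.csw import CatalogueServiceWeb
other_endpoint = 'http://catalog.data.gov/csw'   # hypothetical endpoint
other_csw = CatalogueServiceWeb(other_endpoint, timeout=60)
print other_csw.identification.title, other_csw.version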
In [2]:
from pylab import *
from owslib.csw import CatalogueServiceWeb
from owslib import fes
import netCDF4
import pandas as pd
In [3]:
from IPython.core.display import HTML
HTML('<iframe src=http://geoport.whoi.edu/geoportal/ width=950 height=400></iframe>')
Out[3]:
In [4]:
# connect to the CSW and explore its properties
#endpoint = 'http://www.ngdc.noaa.gov/geoportal/csw' # NGDC Geoportal
#endpoint = 'http://www.nodc.noaa.gov/geoportal/csw' # NODC Geoportal: granule level
#endpoint = 'http://data.nodc.noaa.gov/geoportal/csw' # NODC Geoportal: collection level
#endpoint = 'http://geodiscover.cgdi.ca/wes/serviceManagerCSW/csw' # NRCAN CUSTOM
#endpoint = 'http://geoport.whoi.edu/gi-cat/services/cswiso' # USGS Woods Hole GI_CAT
#endpoint = 'http://cida.usgs.gov/gdp/geonetwork/srv/en/csw' # USGS CIDA Geonetwork
#endpoint = 'http://cmgds.marine.usgs.gov/geonetwork/srv/en/csw' # USGS Coastal and Marine Program
#endpoint = 'http://geo.gov.ckan.org/csw' # CKAN testing site for new Data.gov
#endpoint = 'https://edg.epa.gov/metadata/csw' # EPA
#endpoint = 'http://cwic.csiss.gmu.edu/cwicv1/discovery' # CWIC
endpoint = 'http://geoport.whoi.edu/geoportal/csw' # USGS Woods Hole Geoportal (the Geoportal CSW used in this demo)
csw = CatalogueServiceWeb(endpoint,timeout=60)
csw.version
Out[4]:
In [5]:
[op.name for op in csw.operations]
Out[5]:
In [6]:
# hopefully something like this will be implemented in fes soon
def dateRange(start_date='1900-01-01', stop_date='2100-01-01', constraint='overlaps'):
    if constraint == 'overlaps':
        start = fes.PropertyIsLessThanOrEqualTo(propertyname='startDate', literal=stop_date)
        stop = fes.PropertyIsGreaterThanOrEqualTo(propertyname='endDate', literal=start_date)
    elif constraint == 'within':
        start = fes.PropertyIsGreaterThanOrEqualTo(propertyname='startDate', literal=start_date)
        stop = fes.PropertyIsLessThanOrEqualTo(propertyname='endDate', literal=stop_date)
    return start, stop
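For instance (a minimal sketch reusing the function above), the two modes simply swap which comparison is applied to startDate and endDate: 'overlaps' keeps records whose extent intersects the window, 'within' keeps records entirely inside it.
In [ ]:
# sketch: build both flavors of the temporal constraint
start_o, stop_o = dateRange('1970-01-01', '1979-02-01', constraint='overlaps')
start_w, stop_w = dateRange('1970-01-01', '1979-02-01', constraint='within')
print type(start_o).__name__, type(stop_o).__name__
print type(start_w).__name__, type(stop_w).__name__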
In [7]:
# Perform the CSW query, using Kyle's cool new filters on ISO queryables
# find all datasets in a bounding box and temporal extent that have
# specific keywords and also can be accessed via OPeNDAP
bbox = fes.BBox([-71.5, 39.5, -63.0, 46])
start,stop = dateRange('1970-01-01','1979-02-01')
std_name = 'sea_water_temperature'
keywords = fes.PropertyIsLike(propertyname='anyText', literal=std_name)
serviceType = fes.PropertyIsLike(propertyname='apiso:ServiceType', literal='*opendap*')
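PropertyIsLike performs a pattern match, so if the catalog expects explicit wildcards the literal can be wrapped in them. A hedged sketch (the wildCard keyword and the '*' convention are assumptions about how this particular server interprets the filter):
In [ ]:
# sketch: explicit-wildcard variant of the keyword filter
# (assumes the server honors '*' as the wildcard character)
keywords_wild = fes.PropertyIsLike(propertyname='anyText',
                                   literal=('*%s*' % std_name),
                                   wildCard='*')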
In [8]:
# try simplest request first (no constraints)
csw.getrecords2(maxrecords=5,esn='full')
csw.records.keys()
In [41]:
for rec, item in csw.records.iteritems():
    print item.title
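Each entry in csw.records is an owslib CswRecord, so beyond the title we can also peek at the abstract and the access links a record advertises. A minimal sketch (field contents obviously depend on the catalog):
In [ ]:
# sketch: inspect one record in more detail
first_key = csw.records.keys()[0]
rec = csw.records[first_key]
print rec.title
print rec.abstract
for ref in rec.references:   # list of {'scheme': ..., 'url': ...} dicts
    print ref['scheme'], ref['url']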
In [33]:
# next, a simple request with just the keyword constraint
csw.getrecords2(constraints=[keywords],maxrecords=15,esn='full')
csw.records.keys()
Out[33]:
In [ ]:
# apply all the filters using the "and" syntax: [[filter1,filter2]]
csw.getrecords2(constraints=[[keywords,start,stop,serviceType,bbox]],maxrecords=15,esn='full')
csw.records.keys()
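In owslib the outer constraint list is OR'ed and each inner list is AND'ed, so [[filter1,filter2]] means filter1 AND filter2, while [filter1,filter2] would mean filter1 OR filter2. A small sketch of an OR-of-ANDs constraint list (sea_water_salinity is just an illustrative second keyword; this cell only builds the list and does not re-run the query):
In [ ]:
# sketch: OR two AND-groups together (outer list = OR, inner lists = AND)
keywords2 = fes.PropertyIsLike(propertyname='anyText', literal='sea_water_salinity')
or_of_ands = [[keywords, start, stop, bbox], [keywords2, start, stop, bbox]]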
In [19]:
for rec, item in csw.records.iteritems():
    print item.title
In [20]:
# get specific ServiceType URL from records
def service_urls(records, service_string='urn:x-esri:specification:ServiceType:odp:url'):
    urls = []
    for key, rec in records.iteritems():
        # create a generator object, and iterate through it until the match is found
        # if not found, get the default value (here None)
        url = next((d['url'] for d in rec.references if d['scheme'] == service_string), None)
        if url is not None:
            urls.append(url)
    return urls
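The same helper works for any service scheme the catalog records advertise. For example (a sketch, and the WMS urn below is an assumption about how this Geoportal labels its references -- check rec.references for the exact scheme strings):
In [ ]:
# sketch: pull a different service type out of the same records
wms_urls = service_urls(csw.records,
                        service_string='urn:x-esri:specification:ServiceType:wms:url')
print "\n".join(wms_urls)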
In [21]:
dap_urls = service_urls(csw.records,service_string='urn:x-esri:specification:ServiceType:odp:url')
print ".html\n".join(dap_urls)
In [22]:
def standard_names(nc):
    '''
    get dictionary mapping each standard_name to the list of variable names that carry it
    '''
    d = {}
    for k, v in nc.iteritems():
        try:
            standard_name = v.getncattr('standard_name')
            d.setdefault(standard_name, []).append(k)
        except AttributeError:
            # variable has no standard_name attribute
            pass
    return d
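A quick usage sketch, applying the helper to the first OPeNDAP endpoint found above (assuming at least one dap_url was returned):
In [ ]:
# sketch: map standard_names to variable names for one dataset
nc = netCDF4.Dataset(dap_urls[0]).variables
d = standard_names(nc)
for name, var_list in d.iteritems():
    print name, var_list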
In [23]:
# hack for speed of access and plotting in this demo -- select data with 'A1H' in the URL
dap_urls = [url for url in dap_urls if '-A1H' in url]
print ".html\n".join(dap_urls)
In [24]:
for url in dap_urls:
    nc = netCDF4.Dataset(url).variables
    lat = nc['lat'][:]
    lon = nc['lon'][:]
    time_var = nc['time']
    dtime = netCDF4.num2date(time_var[:], time_var.units)
    # map each standard_name to its list of variable names
    d = standard_names(nc)
    # read the data for all variables matching standard_name=std_name into a dictionary
    data_dict = {}
    for v in d[std_name]:
        data_dict[v] = nc[v][:].flatten()
    # create a Pandas DataFrame with a time index and plot it
    ts = pd.DataFrame.from_dict(data_dict)
    ts.index = dtime
    ts.plot(figsize=(12, 4))
    title(std_name)
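Since these are hourly ('A1H') series, it can help to smooth them before plotting. A hedged sketch using pandas resampling on the last time series from the loop above (the how= keyword matches the pandas vintage of this notebook; newer versions use .resample('D').mean()):
In [ ]:
# sketch: resample the last time series to daily means and replot
ts_daily = ts.resample('D', how='mean')
ts_daily.plot(figsize=(12, 4))
title(std_name + ' (daily mean)')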
In [12]: