We will use the owslib library to construct queries and parse responses from a CSW (Catalogue Service for the Web) endpoint.
In [1]:
from owslib.csw import CatalogueServiceWeb
from owslib import fes
import numpy as np
Specify a CSW endpoint. You can test whether it is working with a GetCapabilities request:
<endpoint>?request=GetCapabilities&service=CSW
for example:
http://catalog.data.gov/csw-all?service=CSW&version=2.0.2&request=GetCapabilities
In [2]:
# Candidate CSW endpoints; leave exactly one assignment uncommented.
#endpoint = 'http://catalog.data.gov/csw-all' #granule level production catalog
#endpoint = 'https://data.ioos.us/csw'
endpoint = 'https://dev-catalog.ioos.us/csw'
#endpoint = 'http://geoport.whoi.edu/csw'
#endpoint = 'http://www.ngdc.noaa.gov/geoportal/csw'

# Create the catalogue client used by every query below.
csw = CatalogueServiceWeb(endpoint, timeout=60)

# Parenthesised single-argument print works on both Python 2 and Python 3.
print(csw.version)
In [3]:
val = 'sea_water_salinity'
#val = 'NODC'

# Free-text filter: the '*' wildcard on both sides of the search term makes
# this a "contains" match against the apiso:AnyText queryable.
filter1 = fes.PropertyIsLike(propertyname='apiso:AnyText', literal=('*%s*' % val),
                             escapeChar='\\', wildCard='*', singleChar='?')
filter_list = [filter1]

# Ask for up to 100 full records in a single request.
csw.getrecords2(constraints=filter_list, maxrecords=100, esn='full')

# Report how many records came back, then list their titles.
# (print(...) with one argument is valid on both Python 2 and Python 3.)
print(len(csw.records))
for rec in csw.records:
    print(csw.records[rec].title)
Hmmm..... In the query above, we only get 10 records, even though we specified maxrecords=100.
What's up with that?
It turns out that the CSW service specifies a MaxRecordDefault constraint that a single request cannot exceed. For example, checking https://dev-catalog.ioos.us/csw?request=GetCapabilities&service=CSW we find:
<ows:Constraint name="MaxRecordDefault">
<ows:Value>10</ows:Value>
</ows:Constraint>
So we need to loop the getrecords request, incrementing the startposition:
In [4]:
from owslib.fes import SortBy, SortProperty

# Paging limits: records requested per call, and the position at which the
# paging loops in later cells stop asking for more.
pagesize = 10
maxrecords = 50

# Sort specification shared by the paged requests below.
sort_property = 'dc:title'  # a supported queryable of the CSW
sort_order = 'ASC'          # should be 'ASC' or 'DESC' (ascending or descending)
sortby = SortBy([
    SortProperty(sort_property, sort_order),
])
In [5]:
# Page through the results `pagesize` records at a time.
# NOTE(review): the CSW spec describes start positions as 1-based; confirm the
# server treats 0 like 1, otherwise consecutive pages may overlap by one record.
startposition = 0
while True:
    print('getting records %d to %d' % (startposition, startposition + pagesize))
    csw.getrecords2(constraints=filter_list,
                    startposition=startposition, maxrecords=pagesize, sortby=sortby)
    # .items() exists on both Python 2 and Python 3 (.iteritems() is Python 2 only).
    for rec, item in csw.records.items():
        print(item.title)
    # Blank separator line; a bare `print` is a no-op expression on Python 3.
    print('')
    # Stop when the response reports nextrecord == 0 ...
    if csw.results['nextrecord'] == 0:
        break
    startposition += pagesize
    # ... or when the configured overall cap has been reached.
    if startposition >= maxrecords:
        break
Okay, now let's create another query filter and combine it with the first one:
In [6]:
val = 'CariCOOS'
#val = '0115145'

# Second free-text filter, built the same way as filter1 but for a new term.
filter2 = fes.PropertyIsLike(
    propertyname='apiso:AnyText',
    literal=('*%s*' % val),
    wildCard='*',
    singleChar='?',
    escapeChar='\\',
)

# A record must satisfy both text filters to be returned.
both_terms = fes.And([filter1, filter2])
filter_list = [both_terms]
In [7]:
# Page through the records matching the combined filter.
startposition = 0
maxrecords = 50
while True:
    print('getting records %d to %d' % (startposition, startposition + pagesize))
    csw.getrecords2(constraints=filter_list,
                    startposition=startposition, maxrecords=pagesize, sortby=sortby)
    # .items() exists on both Python 2 and Python 3 (.iteritems() is Python 2 only).
    for rec, item in csw.records.items():
        print(item.title)
    # Blank separator line; a bare `print` is a no-op expression on Python 3.
    print('')
    # Stop when the response reports nextrecord == 0 ...
    if csw.results['nextrecord'] == 0:
        break
    startposition += pagesize
    # ... or when the overall cap has been reached.
    if startposition >= maxrecords:
        break
In [8]:
# Pick one record identifier at random from the most recent response.
record_ids = list(csw.records.keys())
choice = np.random.choice(record_ids)

# Show the chosen record's title, then its references as the cell output.
picked = csw.records[choice]
print(picked.title)
picked.references
Out[8]:
Let's see what the full XML record looks like:
In [9]:
# Display the `xml` attribute of the record selected by `choice` as the cell output.
csw.records[choice].xml
Out[9]:
Yuk! That's why we use OWSlib! :-)
Now add a constraint to return only records that have either the OPeNDAP or SOS service.
Let's first see what services are advertised:
In [10]:
# Probe the GetDomain operation and, if available, list the advertised
# values of the apiso:ServiceType property.
try:
    csw.get_operation_by_name('GetDomain')
    csw.getdomain('apiso:ServiceType', 'property')
    print(csw.results['values'])
except Exception:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate.
    print('GetDomain not supported')
In [11]:
#val = 'OPeNDAP'
val = 'SOS'

# Filter on a single service type.
filter3 = fes.PropertyIsLike(
    propertyname='apiso:ServiceType',
    literal=('*%s*' % val),
    wildCard='*',
    singleChar='?',
    escapeChar='\\',
)

# Filter that accepts any one of several service types.
services = ['OPeNDAP','SOS']
service_matches = []
for val in services:
    service_matches.append(
        fes.PropertyIsLike(
            propertyname='apiso:ServiceType',
            literal=('*%s*' % val),
            wildCard='*',
            singleChar='?',
            escapeChar='\\',
        )
    )
service_filt = fes.Or(service_matches)

# Active constraint: both text filters plus the single-service filter.
filter_list = [fes.And([filter1, filter2, filter3])]
#filter_list = [fes.And([filter1, filter3])]
#filter_list = [fes.And([filter1, filter2, service_filt])]
In [12]:
# Page through the records matching the text + service-type constraint.
startposition = 0
while True:
    print('getting records %d to %d' % (startposition, startposition + pagesize))
    csw.getrecords2(constraints=filter_list,
                    startposition=startposition, maxrecords=pagesize, sortby=sortby)
    # .items() exists on both Python 2 and Python 3 (.iteritems() is Python 2 only).
    for rec, item in csw.records.items():
        print(item.title)
    # Blank separator line; a bare `print` is a no-op expression on Python 3.
    print('')
    # Stop when the response reports nextrecord == 0 ...
    if csw.results['nextrecord'] == 0:
        break
    startposition += pagesize
    # ... or when the overall cap has been reached.
    if startposition >= maxrecords:
        break
Let's try adding a search for a nonexistent service, which should return no records:
In [13]:
val = 'not_a_real_service'

# Same service-type filter as before, but for a name no record should carry.
filter3 = fes.PropertyIsLike(propertyname='apiso:ServiceType', literal=('*%s*' % val),
                             escapeChar='\\', wildCard='*', singleChar='?')
filter_list = [fes.And([filter1, filter2, filter3])]
csw.getrecords2(constraints=filter_list, maxrecords=100, esn='full')

# Report the record count and any titles.
# (print(...) with one argument is valid on both Python 2 and Python 3.)
print(len(csw.records))
for rec in csw.records:
    print(csw.records[rec].title)
Good!
Now add a bounding box constraint. To specify lon,lat order for the bbox (which we want to do so that we can use the same bbox with either geoportal server or pycsw requests), we need to request the bounding box specifying the CRS84 coordinate reference system. The CRS84 option is available in pycsw 1.1.10+. The ability to specify the crs in the bounding box request is available in owslib 0.8.12+. For more info on the bounding box problem and how it was solved, see this pycsw issue, this geoportal server issue, and this owslib issue.
In [14]:
bbox = [-158.4, 21.24, -157.5, 21.77]  # [lon_min, lat_min, lon_max, lat_max]
# Naming the CRS84 reference system lets the bbox be given in lon,lat order.
bbox_filter = fes.BBox(bbox, crs='urn:ogc:def:crs:OGC:1.3:CRS84')
filter_list = [fes.And([filter1, filter2, service_filt, bbox_filter])]

# Page through the records matching text + service + bounding box.
startposition = 0
while True:
    print('getting records %d to %d' % (startposition, startposition + pagesize))
    csw.getrecords2(constraints=filter_list,
                    startposition=startposition, maxrecords=pagesize, sortby=sortby)
    # .items() exists on both Python 2 and Python 3 (.iteritems() is Python 2 only).
    for rec, item in csw.records.items():
        print(item.title)
    # Blank separator line; a bare `print` is a no-op expression on Python 3.
    print('')
    # Stop when the response reports nextrecord == 0 ...
    if csw.results['nextrecord'] == 0:
        break
    startposition += pagesize
    # ... or when the overall cap has been reached.
    if startposition >= maxrecords:
        break
Now add time constraints. Here we first define a function that returns a pair of filters matching records whose data overlaps (or, optionally, falls entirely within) the specified time period.
In [15]:
def dateRange(start_date='1900-01-01', stop_date='2100-01-01', constraint='overlaps'):
    """Build the pair of temporal-extent filters for a CSW query.

    Parameters
    ----------
    start_date, stop_date : str
        Bounds of the period of interest; passed unchanged as the filter
        literals.
    constraint : {'overlaps', 'within'}
        'overlaps' requires TempExtent_begin <= stop_date and
        TempExtent_end >= start_date.
        'within' requires TempExtent_begin >= start_date and
        TempExtent_end <= stop_date.

    Returns
    -------
    tuple
        ``(start, stop)``: the filter on ``apiso:TempExtent_begin`` followed
        by the filter on ``apiso:TempExtent_end``.

    Raises
    ------
    ValueError
        If ``constraint`` is neither 'overlaps' nor 'within'. Without this
        check the function would fail with an UnboundLocalError at ``return``.
    """
    if constraint == 'overlaps':
        start = fes.PropertyIsLessThanOrEqualTo(propertyname='apiso:TempExtent_begin',
                                                literal=stop_date)
        stop = fes.PropertyIsGreaterThanOrEqualTo(propertyname='apiso:TempExtent_end',
                                                  literal=start_date)
    elif constraint == 'within':
        start = fes.PropertyIsGreaterThanOrEqualTo(propertyname='apiso:TempExtent_begin',
                                                   literal=start_date)
        stop = fes.PropertyIsLessThanOrEqualTo(propertyname='apiso:TempExtent_end',
                                               literal=stop_date)
    else:
        raise ValueError("constraint must be 'overlaps' or 'within', got %r" % (constraint,))
    return start, stop
In [16]:
import datetime as dt

# Fixed-period alternatives. To use one, uncomment the pair and comment out
# the relative window below (which would otherwise overwrite it).
#jd_start = dt.datetime(1988,1,1)
#jd_stop = dt.datetime(1988,3,1)
#jd_start = dt.datetime(2013,4,20)
#jd_stop = dt.datetime(2013,4,24)

# Window of three days either side of the current UTC time.
jd_now = dt.datetime.utcnow()
jd_start = jd_now - dt.timedelta(days=3)
jd_stop = jd_now + dt.timedelta(days=3)

# Format with minutes forced to "00"; these strings become the filter literals.
start_date = jd_start.strftime('%Y-%m-%d %H:00')
stop_date = jd_stop.strftime('%Y-%m-%d %H:00')

# Re-parse so jd_start / jd_stop agree with the hour-truncated strings.
jd_start = dt.datetime.strptime(start_date, '%Y-%m-%d %H:%M')
jd_stop = dt.datetime.strptime(stop_date, '%Y-%m-%d %H:%M')

# A formatted string prints the same on Python 2 and 3; print(a, 'to', b)
# would print a tuple on Python 2.
print('%s to %s' % (start_date, stop_date))

start, stop = dateRange(start_date, stop_date)
In [17]:
# Combine every constraint so far: text, service type, bounding box and time.
filter_list = [fes.And([filter1, filter2, service_filt, bbox_filter, start, stop])]

startposition = 0
while True:
    print('getting records %d to %d' % (startposition, startposition + pagesize))
    csw.getrecords2(constraints=filter_list,
                    startposition=startposition, maxrecords=pagesize, sortby=sortby)
    # .items() exists on both Python 2 and Python 3 (.iteritems() is Python 2 only).
    for rec, item in csw.records.items():
        print(item.title)
    # Blank separator line; a bare `print` is a no-op expression on Python 3.
    print('')
    # Stop when the response reports nextrecord == 0 ...
    if csw.results['nextrecord'] == 0:
        break
    startposition += pagesize
    # ... or when the overall cap has been reached.
    if startposition >= maxrecords:
        break
Now add a NOT filter to eliminate some entries
In [18]:
# Keyword arguments shared by free-text PropertyIsLike filters.
kw = {
    'propertyname': 'apiso:AnyText',
    'wildCard': '*',
    'singleChar': '?',
    'escapeChar': '\\',
}

# Negate a "contains Waikiki" match so that such records are excluded.
waikiki_match = fes.PropertyIsLike(literal='*Waikiki*', **kw)
not_filt = fes.Not([waikiki_match])
In [19]:
# All previous constraints plus the NOT filter.
filter_list = [fes.And([filter1, filter2, service_filt, bbox_filter, start, stop, not_filt])]

startposition = 0
while True:
    print('getting records %d to %d' % (startposition, startposition + pagesize))
    csw.getrecords2(constraints=filter_list,
                    startposition=startposition, maxrecords=pagesize, sortby=sortby)
    # .items() exists on both Python 2 and Python 3 (.iteritems() is Python 2 only).
    for rec, item in csw.records.items():
        print(item.title)
    # Blank separator line; a bare `print` is a no-op expression on Python 3.
    print('')
    # Stop when the response reports nextrecord == 0 ...
    if csw.results['nextrecord'] == 0:
        break
    startposition += pagesize
    # ... or when the overall cap has been reached.
    if startposition >= maxrecords:
        break
Hopefully this notebook demonstrated some of the power (and complexity) of CSW! ;-)
In [ ]: