In [1]:
from xml.etree import cElementTree
from urlparse import urljoin

import requests

# XML namespace URIs, used to build the qualified "{uri}name" form that
# ElementTree expects when looking up namespaced tags and attributes.
#   thredds -- namespace of the THREDDS InvCatalog v1.0 elements
#              (catalogRef, dataset)
#   xlink   -- namespace of the XLink attributes (href on catalogRef)
thredds = "http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0"
xlink = "http://www.w3.org/1999/xlink"

def crawl(catalog, timeout=30, _visited=None):
    """Recursively yield the datasets reachable from a THREDDS catalog.

    The catalog XML is downloaded and every ``catalogRef`` it contains is
    followed (depth first) before the catalog's own datasets are yielded.
    Only ``dataset`` elements carrying a ``urlPath`` attribute are yielded.

    Parameters
    ----------
    catalog : str
        URL of the catalog XML document to start from.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get`` for every catalog
        download, so an unresponsive server cannot block forever.
    _visited : set, optional
        Internal bookkeeping shared across the recursion: the catalog URLs
        that have already been fetched.  Callers should leave this unset.

    Yields
    ------
    xml.etree.ElementTree.Element
        One ``dataset`` element per accessible dataset.  A catalog that is
        referenced more than once is only crawled the first time.

    Raises
    ------
    requests.exceptions.RequestException
        If a catalog cannot be downloaded (connection error, timeout, or a
        4xx/5xx response).
    KeyError
        If a ``catalogRef`` element has no ``xlink:href`` attribute.
    """
    # Remember which catalogs were already fetched: catalogs that reference
    # each other would otherwise recurse without bound.
    if _visited is None:
        _visited = set()
    if catalog in _visited:
        return
    _visited.add(catalog)

    r = requests.get(catalog, timeout=timeout)
    # Surface HTTP errors here rather than feeding an error page to the
    # XML parser.
    r.raise_for_status()
    xml = cElementTree.fromstring(r.content)

    # depth first traversal: descend into sub-catalogs before yielding the
    # datasets declared in this catalog
    for subdir in xml.iterfind(".//{%s}catalogRef" % thredds):
        # hrefs may be relative (e.g. "/thredds/catalog/.../catalog.xml"),
        # so resolve them against the URL of the current catalog
        link = subdir.attrib["{%s}href" % xlink]
        for dataset in crawl(urljoin(catalog, link), timeout, _visited):
            yield dataset

    # container datasets have no urlPath; only accessible ones are yielded
    for dataset in xml.iterfind(".//{%s}dataset[@urlPath]" % thredds):
        yield dataset

In [2]:
# Root THREDDS catalog used as the starting point for the crawl.
catalog='http://geoport.whoi.edu/thredds/COAWST_catalog.xml'
# crawl() is a generator function, so this line only creates the generator;
# no HTTP request is made until it is iterated.
foo = crawl(catalog)

In [9]:
# Sanity check: pull just the first dataset element out of a fresh crawl.
# The parenthesised single-argument form prints the same text on Python 2
# and is also valid on Python 3, unlike the bare print statement.
print(next(crawl(catalog)))


<Element '{http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0}dataset' at 0x26c6330>

In [10]:
# Fetch the root catalog directly so its raw XML can be inspected.
# The timeout stops the cell from hanging on an unresponsive server, and
# raise_for_status() reports an HTTP error here instead of leaving an
# error page in `r` to be mistaken for the catalog.
r = requests.get(catalog, timeout=30)
r.raise_for_status()

In [12]:
# Display the raw response body to see how the catalog XML is structured
# (services, nested datasets, catalogRef links).
r.content


Out[12]:
'<?xml version="1.0" encoding="UTF-8"?>\r\n<catalog xmlns="http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0" xmlns:xlink="http://www.w3.org/1999/xlink" name="OPeNDAP Data Server" version="1.0.1">\r\n  <service name="allServices" serviceType="Compound" base="">\r\n    <service name="ncdods" serviceType="OPENDAP" base="/thredds/dodsC/" />\r\n    <service name="ncss" serviceType="NetcdfSubset" base="/thredds/ncss/grid/" />\r\n    <service name="wms" serviceType="WMS" base="/thredds/wms/" />\r\n    <service name="iso" serviceType="ISO" base="/thredds/iso/" />\r\n    <service name="ncml" serviceType="NCML" base="/thredds/ncml/" />\r\n    <service name="uddc" serviceType="UDDC" base="/thredds/uddc/" />\r\n  </service>\r\n  <dataset name="COAWST" ID="coawst">\r\n    <metadata inherited="true">\r\n      <serviceName>allServices</serviceName>\r\n      <authority>gov.usgs.er.whsc</authority>\r\n      <dataType>GRID</dataType>\r\n      <dataFormat>netCDF</dataFormat>\r\n      <documentation xlink:href="http://woodshole.er.usgs.gov/project-pages/cccp/index.html" xlink:title="Carolinas Coastal Change Program" />\r\n      <documentation xlink:href="http://geoport.whoi.edu:8081/ReadMeCOAWST.html" xlink:title="ReadMe.txt" />\r\n      <creator>\r\n        <name vocabulary="DIF">OM/WHSC/USGS</name>\r\n        <contact url="http://www.usgs.gov/" email="jcwarner@usgs.gov" />\r\n      </creator>\r\n      <publisher>\r\n        <name vocabulary="DIF">OM/WHSC/USGS</name>\r\n        <contact url="http://www.usgs.gov/" email="jcwarner@usgs.gov" />\r\n      </publisher>\r\n    </metadata>\r\n    <dataset name="COAWST Nowcast/Forecast Runs" ID="coast_fc">\r\n      <catalogRef xlink:href="/thredds/catalog/coawst_2_2/fmrc/catalog.xml" xlink:title="coawst_2_2" name="" />\r\n      <catalogRef xlink:href="/thredds/catalog/coawst_3/car/fmrc/catalog.xml" xlink:title="coawst_3_car" name="" />\r\n      <catalogRef xlink:href="/thredds/catalog/coawst_4/use/fmrc/catalog.xml" 
xlink:title="coawst_4_use" name="" />\r\n      <catalogRef xlink:href="/thredds/catalog/coawst_4/gom/fmrc/catalog.xml" xlink:title="coawst_4_gom" name="" />\r\n    </dataset>\r\n  </dataset>\r\n</catalog>\r\n\r\n'

In [ ]: