In [1]:
import datetime
import os
import pathlib
import xml.etree.ElementTree

import lxml.etree
import requests
import tqdm
In [2]:
# Location of the THREDDS catalog list to check.
# The default is the original development checkout; set the
# THREDDS_CATALOG_LIST environment variable to point the notebook at another
# file without editing this cell.
catalog = pathlib.Path(
    os.environ.get(
        'THREDDS_CATALOG_LIST',
        '/Users/baart_f/src/thredds-docker/catalogList.xml',
    )
)
In [3]:
# Parse the catalog list into an lxml element tree.
# Whitespace-only text nodes are dropped while parsing: lxml can only
# re-indent on output (pretty_print=True) when the tree carries no such
# nodes, and elements or comments inserted later would otherwise end up
# glued to their neighbours in the written file.
catalog_parser = lxml.etree.XMLParser(remove_blank_text=True)
tree = lxml.etree.parse(str(catalog), catalog_parser)
In [6]:
# ElementPath query: every catalogRef element, at any depth, in the
# THREDDS InvCatalog v1.0 namespace.
catalog_ref_path = './/{http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0}catalogRef'
refs = tree.findall(catalog_ref_path)
# Report how many catalog references were found.
print(len(refs))
In [7]:
def _url_problem(url, timeout=5):
    """Return '' when ``url`` answers with HTTP 200, else a short failure text.

    Every ``requests`` failure is turned into a message instead of being
    raised, so one bad reference cannot abort the loop below and leave the
    tree half-annotated.
    """
    try:
        resp = requests.get(url, timeout=timeout)
    except requests.exceptions.Timeout:
        return "result not ok (timeout)"
    except requests.exceptions.ConnectionError:
        return "result not ok (connection error)"
    except requests.exceptions.RequestException as exc:
        # Anything else requests can raise, e.g. MissingSchema / InvalidURL
        # for relative or malformed hrefs, or too many redirects.
        return "result not ok (%s)" % (type(exc).__name__, )
    if resp.status_code != 200:
        return "result not ok (status %s)" % (resp.status_code, )
    return ''


def _comment_safe(text):
    """Return ``text`` adjusted so it is legal inside an XML comment.

    XML forbids '--' inside a comment and a '-' directly before the closing
    marker, so runs of dashes are spaced out and a trailing dash is padded.
    """
    while '--' in text:
        text = text.replace('--', '- -')
    if text.endswith('-'):
        text += ' '
    return text


# Check every catalog reference and annotate the tree in place:
#   * reachable url   -> a "checked: ok" comment is inserted before the element
#   * unreachable url -> a "checked: not ok" comment is inserted and the element
#                        itself is replaced by a comment holding its XML source
for ref in tqdm.tqdm_notebook(refs):
    url = ref.attrib['{http://www.w3.org/1999/xlink}href']
    msg = _url_problem(url)
    checked_at = datetime.datetime.now()
    if msg:
        status = lxml.etree.Comment(
            'Url checked: not ok: %s (%s)' % (msg, checked_at, )
        )
        ref.addprevious(status)
        # Serialise the element only (no tail) as text, so the whitespace that
        # follows it stays in the tree instead of moving inside the comment.
        ref_string = lxml.etree.tostring(ref, encoding='unicode', with_tail=False)
        disabled = lxml.etree.Comment(_comment_safe(ref_string))
        disabled.tail = ref.tail
        # Replace the unreachable reference by its commented-out source.
        ref.getparent().replace(ref, disabled)
    else:
        status = lxml.etree.Comment('Url checked: ok (%s)' % (checked_at, ))
        ref.addprevious(status)
In [8]:
# Serialise the annotated tree to catalog.xml in the current working
# directory, with an XML declaration and UTF-8 encoding.
write_options = dict(xml_declaration=True, pretty_print=True, encoding='UTF-8')
tree.write('catalog.xml', **write_options)
In [9]:
# Shell escape: runs `open .` on the current working directory, where
# catalog.xml was written (presumably the macOS `open` command, which shows
# the folder in Finder -- TODO confirm; not portable to other platforms).
!open .
In [14]:
# Bare expression: the notebook shows the value of `tree` as this cell's output.
tree
Out[14]:
In [ ]: