In [3]:
from utilities import css_styles
# Last expression of the cell, so whatever css_styles() returns is rendered as
# the cell output (presumably the notebook's CSS styling -- confirm in utilities).
css_styles()
Out[3]:
A common task is to search for what information is available for further research. Programmatically, this is done using a list of text fields (strings) to query common data catalogs and find out what services are available. This notebook tries to determine the most basic metrics about those services (e.g., how many services are available in each catalog).
Which model records are available via each endpoint in our list of CSW (Catalogue Service for the Web) servers, and how many?
In [4]:
# https://github.com/ioos/system-test/wiki/Service-Registries-and-Data-Catalogs
# CSW endpoints that the rest of the notebook queries. Both names below are
# bound to the very same list object.
endpoints = [
    'http://data.nodc.noaa.gov/geoportal/csw',
    'http://cwic.csiss.gmu.edu/cwicv1/discovery',
    'http://geoport.whoi.edu/geoportal/csw',
    'https://edg.epa.gov/metadata/csw',
    'http://www.ngdc.noaa.gov/geoportal/csw',
    'http://cmgds.marine.usgs.gov/geonetwork/srv/en/csw',
    'http://www.nodc.noaa.gov/geoportal/csw',
    'http://cida.usgs.gov/gdp/geonetwork/srv/en/csw',
    'http://geodiscover.cgdi.ca/wes/serviceManagerCSW/csw',
    'http://geoport.whoi.edu/gi-cat/services/cswiso',
    #'https://data.noaa.gov/csw',
]
known_csw_servers = endpoints
This notebook is primarily concerned with models, so we generate a list of terms over which to search the catalogs.
In [5]:
# Model names used as search terms; each one is later wrapped in wildcards and
# matched against catalog record titles and subjects.
# NOTE(review): 'wrams' is reproduced as written -- confirm it is the intended term.
known_model_strings = [
    'roms',
    'selfe',
    'adcirc',
    'ncom',
    'hycom',
    'fvcom',
    'pom',
    'wrams',
    'wrf',
]
In [6]:
from owslib import fes
# One filter per model name, in the same order as known_model_strings.
# A record matches when the wildcard-wrapped name is found in its title OR in
# its subject.
model_name_filters = []
for model in known_model_strings:
    pattern = '*%s*' % model
    matchers = [
        fes.PropertyIsLike(propertyname=field, literal=pattern, wildCard='*')
        for field in ('apiso:Title', 'apiso:Subject')
    ]
    model_name_filters.append(fes.Or(matchers))
In [7]:
from owslib.csw import CatalogueServiceWeb
# Query every CSW endpoint once per model filter and flatten the references of
# each returned record into one dict per reference:
#   model  -- the search term that produced the record
#   scheme -- the reference's 'scheme' entry
#   url    -- the reference's 'url' entry
#   server -- the CSW endpoint that returned the record
# A failing endpoint is reported and skipped so the remaining endpoints are
# still harvested.
# Note: a new client is constructed for every (model, server) pair.
model_results = []
for model_name, single_model_filter in zip(known_model_strings, model_name_filters):
    for url in known_csw_servers:
        try:
            csw = CatalogueServiceWeb(url, timeout=20)
            csw.getrecords2(constraints=[single_model_filter], maxrecords=1000, esn='full')
            for item in csw.records.values():
                for d in item.references:
                    result = dict(model=model_name,
                                  scheme=d['scheme'],
                                  url=d['url'],
                                  server=url)
                    model_results.append(result)
        except Exception as e:
            # Catch Exception rather than BaseException so that interrupting
            # the kernel (KeyboardInterrupt) still stops this long-running cell.
            # Format the exception itself: most exceptions have no `.msg`
            # attribute, and looking it up would raise inside the handler and
            # abort the whole harvest.
            print("- FAILED: %s - %s" % (url, e))
In [8]:
%matplotlib inline
import pandas as pd
# Limit how many columns / rows pandas renders when a frame is displayed.
for option_name, limit in (('display.max_columns', 20),
                           ('display.max_rows', 50)):
    pd.set_option(option_name, limit)
from IPython.display import HTML
# One row per harvested reference, with exact duplicate rows removed.
# The columns are named explicitly so the frame always has the
# model/scheme/url/server schema -- even when every server failed and
# model_results is empty. Without this, the later groupby("scheme") calls
# would fail with a KeyError on a column-less frame.
df = pd.DataFrame(model_results, columns=['model', 'scheme', 'url', 'server'])
df = df.drop_duplicates()
In [9]:
# Count the de-duplicated references per raw scheme string and chart the
# counts as horizontal bars.
total_services = pd.DataFrame(df.groupby("scheme").size(), columns=("Number of services",))
#HTML(total_services.to_html())
# DataFrame.sort() was removed from pandas; sort_values() is its replacement
# for ordering rows by a column.
total_services.sort_values('Number of services', ascending=False).plot(kind="barh", figsize=(10,8,))
Out[9]:
In [10]:
from utilities import normalize_service_urn
# Work on a deep copy so the raw scheme strings in `df` are left untouched,
# then replace every scheme with the result of normalize_service_urn().
normalized_urns = df.copy(deep=True)
normalized_urns["scheme"] = normalized_urns["scheme"].map(normalize_service_urn)
In [11]:
# Same per-scheme count as before, but computed on the normalized scheme
# values, then charted as horizontal bars.
normalized_urns_summary = pd.DataFrame(normalized_urns.groupby("scheme").size(), columns=("Number of services",))
# DataFrame.sort() was removed from pandas; sort_values() is its replacement
# for ordering rows by a column.
normalized_urns_summary.sort_values('Number of services', ascending=False).plot(kind="barh", figsize=(10,6,))
Out[11]:
In [22]:
import math
# Count references for every (model, scheme) pair, then move the model level
# into the columns so the chart has one row group per scheme and one bar per
# model.
model_service_summary = (
    normalized_urns
    .groupby(["model", "scheme"], sort=True)
    .size()
    .to_frame("Number of services")
)
#HTML(model_service_summary.to_html())
model_service_plotter = model_service_summary.unstack("model")
model_service_plot = model_service_plotter.plot(kind='barh', figsize=(10, 8), sharey=True)
In [23]:
# Count references for every (model, server) pair, then move the model level
# into the columns so the chart has one row group per CSW server and one bar
# per model.
records_per_csw = (
    normalized_urns
    .groupby(["model", "server"])
    .size()
    .to_frame("Number of services")
)
#HTML(records_per_csw.to_html())
model_csw_plotter = records_per_csw.unstack("model")
model_csw_plot = model_csw_plotter.plot(kind='barh', figsize=(10, 8), sharey=True)
In [24]:
# Count references for every (scheme, server) pair, then move the server level
# into the columns; with subplots=True each column (i.e. each server) is drawn
# in its own panel. Note that this rebinds records_per_csw, model_csw_plotter
# and model_csw_plot.
records_per_csw = (
    normalized_urns
    .groupby(["scheme", "server"])
    .size()
    .to_frame("Number of services")
)
#HTML(records_per_csw.to_html())
model_csw_plotter = records_per_csw.unstack("server")
model_csw_plot = model_csw_plotter.plot(kind='barh', subplots=True, figsize=(12, 30), sharey=True)