Estimate how many TB of data are served by IOOS

Estimate dataset size from the OPeNDAP DDS. Here we use regular expressions to parse the DDS and compute each variable's size (32- or 64-bit Int or Float) from its shape. This represents the size in memory, not on disk, since the data may be compressed. But the size in memory is, in some sense, a truer measure of the quantity of data made available by the service.
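
For example (a hypothetical variable, not taken from the catalog), a single Float32 variable with shape [time = 100][y = 50][x = 60] occupies 32 * 100 * 50 * 60 bits, or 1.2 MB in memory:


In [ ]:
# Hypothetical worked example of the size arithmetic used below:
# bit width times the product of the dimension lengths gives total bits.
bits = 32 * 100 * 50 * 60
print('{} MB'.format(bits / 8 / 1.0e6))  # -> 1.2 MB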


In [1]:
from __future__ import print_function
from owslib.csw import CatalogueServiceWeb
from owslib import fes
import pandas as pd
import datetime as dt
import requests
import random
import re
import time

In [2]:
def service_urls(records, service_string='urn:x-esri:specification:ServiceType:odp:url'):
    """
    Get all URLs matching a specific ServiceType.

    Unfortunately the ServiceType strings differ between CSW-ISO services.
    For example, OPeNDAP is specified as:
    NODC geoportal: 'urn:x-esri:specification:ServiceType:OPeNDAP'
    NGDC geoportal: 'urn:x-esri:specification:ServiceType:odp:url'
    """
    urls = []
    for key, rec in records.iteritems():
        # Create a generator and iterate through it until a match is found;
        # if no reference matches, fall back to the default (here None).
        url = next((d['url'] for d in rec.references if d['scheme'] == service_string), None)
        if url is not None:
            urls.append(url)
    return urls
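
To make the next(generator, default) idiom above concrete, here is a toy illustration (the reference dicts are made up, not real CSW records):


In [ ]:
# Toy records illustrating the next(..., default) idiom: the first
# reference whose scheme matches is returned; otherwise None.
refs = [{'scheme': 'some:other:service', 'url': 'http://example.com/wms'},
        {'scheme': 'urn:x-esri:specification:ServiceType:odp:url',
         'url': 'http://example.com/dodsC/test'}]
print(next((d['url'] for d in refs
            if d['scheme'] == 'urn:x-esri:specification:ServiceType:odp:url'), None))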

Find OPeNDAP endpoints from the NGDC CSW


In [3]:
endpoint = 'http://www.ngdc.noaa.gov/geoportal/csw' #  NGDC/IOOS Geoportal
dap_timeout=4  # timeout for DAP response
csw_timeout=60 # timeout for CSW response
csw = CatalogueServiceWeb(endpoint,timeout=csw_timeout)
csw.version


Out[3]:
'2.0.2'

In [4]:
[op.name for op in csw.operations]


Out[4]:
['GetCapabilities',
 'DescribeRecord',
 'GetRecords',
 'GetRecordById',
 'Transaction']

In [5]:
csw.get_operation_by_name('GetRecords').constraints


Out[5]:
[Constraint: SupportedCommonQueryables - ['Subject', 'Title', 'Abstract', 'AnyText', 'Format', 'Identifier', 'Modified', 'Type', 'BoundingBox'],
 Constraint: SupportedISOQueryables - ['apiso:Subject', 'apiso:Title', 'apiso:Abstract', 'apiso:AnyText', 'apiso:Format', 'apiso:Identifier', 'apiso:Modified', 'apiso:Type', 'apiso:BoundingBox', 'apiso:CRS.Authority', 'apiso:CRS.ID', 'apiso:CRS.Version', 'apiso:RevisionDate', 'apiso:AlternateTitle', 'apiso:CreationDate', 'apiso:PublicationDate', 'apiso:OrganizationName', 'apiso:HasSecurityConstraints', 'apiso:Language', 'apiso:ResourceIdentifier', 'apiso:ParentIdentifier', 'apiso:KeywordType', 'apiso:TopicCategory', 'apiso:ResourceLanguage', 'apiso:GeographicDescriptionCode', 'apiso:Denominator', 'apiso:DistanceValue', 'apiso:DistanceUOM', 'apiso:TempExtent_begin', 'apiso:TempExtent_end', 'apiso:ServiceType', 'apiso:ServiceTypeVersion', 'apiso:Operation', 'apiso:OperatesOn', 'apiso:OperatesOnIdentifier', 'apiso:OperatesOnName', 'apiso:CouplingType'],
 Constraint: AdditionalQueryables - ['apiso:Degree', 'apiso:AccessConstraints', 'apiso:OtherConstraints', 'apiso:Classification', 'apiso:ConditionApplyingToAccessAndUse', 'apiso:Lineage', 'apiso:ResponsiblePartyRole', 'apiso:ResponsiblePartyName', 'apiso:SpecificationTitle', 'apiso:SpecificationDate', 'apiso:SpecificationDateType']]

Since the supported ISO queryables include apiso:ServiceType, we can use CSW to find all datasets with services whose ServiceType contains the string "dap".


In [8]:
try:
    csw.get_operation_by_name('GetDomain')
    csw.getdomain('apiso:ServiceType', 'property')
    print(csw.results['values'])
except Exception:
    print('GetDomain not supported')


GetDomain not supported

Since this CSW service doesn't provide a list of possible values for apiso:ServiceType, we guess 'opendap', which seems to work:


In [9]:
val = 'opendap'
service_type = fes.PropertyIsLike(propertyname='apiso:ServiceType',
                                  literal=('*%s*' % val),
                                  escapeChar='\\', wildCard='*', singleChar='?')
filter_list = [service_type]

In [10]:
csw.getrecords2(constraints=filter_list,maxrecords=10000,esn='full')
len(csw.records.keys())


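Requesting maxrecords=10000 in a single GetRecords call can be slow, and some CSW servers cap the number of records returned per request. A hedged alternative (just a sketch; it relies on getrecords2's startposition/maxrecords paging and the 'nextrecord' entry that owslib stores in csw.results) is to page through the results in chunks:


In [ ]:
# Sketch: page through the CSW results in smaller chunks; owslib's
# getrecords2 supports startposition/maxrecords paging and reports
# 'nextrecord' in csw.results.
all_records = {}
startposition = 0
while True:
    csw.getrecords2(constraints=filter_list, startposition=startposition,
                    maxrecords=1000, esn='full')
    all_records.update(csw.records)
    if csw.results['nextrecord'] == 0 or csw.results['nextrecord'] > csw.results['matches']:
        break
    startposition = csw.results['nextrecord']
print(len(all_records))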

By printing the references from a random record, we see that for this CSW the DAP URL is identified by the scheme urn:x-esri:specification:ServiceType:odp:url.


In [ ]:
choice = random.choice(list(csw.records.keys()))
print(choice)
csw.records[choice].references

Get all the OPeNDAP endpoints


In [ ]:
dap_urls = service_urls(csw.records,service_string='urn:x-esri:specification:ServiceType:odp:url')
len(dap_urls)

In [ ]:
def calc_dsize(txt):
    '''
    Calculate approximate dataset size from the OPeNDAP DDS:
    multiply each 32|64 bit Int|Float variable's bit width by its shape.
    '''
    # Split the OPeNDAP DDS on ';' so each piece holds one variable declaration.
    parts = re.split(';', txt)
    '''
    Use a regex to find numbers following Float or Int (e.g. Float32, Int64)
    and numbers immediately preceding a "]".  The idea is that in a line like:

    Float32 Total_precipitation_surface_6_Hour_Accumulation[time2 = 74][y = 303][x = 491];

    we want only the numbers that are not part of a variable or dimension name
    (we want [32, 74, 303, 491], *not* [32, 6, 2, 74, 303, 491]).
    '''
    m = re.compile(r'\d+(?=])|(?<=Float)\d+|(?<=Int)\d+')
    dsize = 0
    for var in parts:
        c = map(int, m.findall(var))
        if len(c) >= 2:
            # product of bit width and dimension lengths = total bits
            vsize = reduce(lambda x, y: x * y, c)
            dsize += vsize

    return dsize / 1.0e6 / 8.   # bits -> megabytes
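
As a quick sanity check (using the example line from the docstring above), calc_dsize should report 32 * 74 * 303 * 491 bits, about 44 megabytes:


In [ ]:
# Sanity check calc_dsize on the docstring's example line:
sample = 'Float32 Total_precipitation_surface_6_Hour_Accumulation[time2 = 74][y = 303][x = 491];'
print(calc_dsize(sample))  # ~44 (megabytes)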

In [ ]:
def tot_dsize(url, dap_timeout=2):
    dds = url + '.dds'
    tot = 0
    try:
        response = requests.get(dds, verify=True, timeout=dap_timeout)
    except requests.exceptions.RequestException:
        # timeouts, connection errors, etc. are flagged with status code -1
        return tot, -1
    if response.status_code == 200:
        # calculate the total size for all variables:
        tot = calc_dsize(response.text)
        # calculate the size of the MAPS (coordinate) variables and
        # subtract it from the total, since Grids repeat them:
        maps = re.compile('MAPS:(.*?)}', re.MULTILINE | re.DOTALL)
        map_text = ''.join(maps.findall(response.text))
        if map_text:
            map_tot = calc_dsize(map_text)
            tot -= map_tot

    return tot, response.status_code
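
To see why the MAPS block is subtracted, consider this hypothetical Grid fragment (made up for illustration): the coordinate variables under MAPS duplicate arrays that are already counted, so leaving them in would inflate the estimate:


In [ ]:
# Hypothetical Grid DDS fragment: subtracting the MAPS (coordinate) block
# leaves just the 32 * 10 * 20 * 30 bits of the data array itself.
grid_dds = '''Grid {
 ARRAY:
    Float32 salinity[time = 10][lat = 20][lon = 30];
 MAPS:
    Float64 time[time = 10];
    Float32 lat[lat = 20];
    Float32 lon[lon = 30];
} salinity;'''
maps = re.compile('MAPS:(.*?)}', re.MULTILINE | re.DOTALL)
map_text = ''.join(maps.findall(grid_dds))
print(calc_dsize(grid_dds) - calc_dsize(map_text))  # MB for the data array alone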

In [ ]:
time0 = time.time()
good_data = []
bad_data = []
count = 0
for url in dap_urls:
    count += 1
    dtot, status_code = tot_dsize(url, dap_timeout=dap_timeout)
    if status_code == 200:
        good_data.append([url, dtot])
        print('[{}]Good:{},{}'.format(count, url, dtot), end='\r')
    else:
        bad_data.append([url, status_code])
        print('[{}]Fail:{},{}'.format(count, url, status_code), end='\r')

print('Elapsed time={} minutes'.format((time.time() - time0) / 60.))
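
Endpoints that fail here are not necessarily dead: with a 4-second dap_timeout, slow-but-working servers are misclassified. An optional follow-up (a sketch reusing tot_dsize; the 30-second timeout is an arbitrary choice) is to retry the failures with a more generous timeout:


In [ ]:
# Optional sketch: retry failed endpoints with a longer DAP timeout to
# recover slow servers misclassified by the short cutoff.
retry_good = []
for url, code in bad_data:
    dtot, status_code = tot_dsize(url, dap_timeout=30)
    if status_code == 200:
        retry_good.append([url, dtot])
print('{} of {} recovered on retry'.format(len(retry_good), len(bad_data)))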

In [ ]:
len(bad_data)

In [ ]:
bad_data[0][0]

So how much data are we serving?


In [ ]:
# sizes are in megabytes; 1e6 MB = 1 TB
total = sum(size for url, size in good_data)

print('{} terabytes'.format(total / 1.e6))

In [ ]:
good = pd.DataFrame(good_data, columns=['url', 'size'])

In [ ]:
good.head()
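
As a cross-check on the loop-based total above (the size column is in megabytes):


In [ ]:
# Cross-check the earlier total using pandas:
print('{} terabytes'.format(good['size'].sum() / 1.e6))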

In [ ]:
good = good.sort(['size'], ascending=False)  # DataFrame.sort_values in newer pandas

In [ ]:
good.head()

In [ ]:
bad = pd.DataFrame(bad_data, columns=['url', 'code'])

In [ ]:
bad.head()

In [ ]:
cd /usgs/data2/notebook/system-test/Theme_1_Baseline

In [ ]:
td = dt.datetime.today().strftime('%Y-%m-%d')

In [ ]:
bad.to_csv('bad'+td+'.csv')

In [ ]:
good.to_csv('good'+td+'.csv')

In [ ]:
bad = bad.sort(['url', 'code'], ascending=[False, False])  # DataFrame.sort_values in newer pandas

In [ ]:
bad = pd.read_csv('bad'+td+'.csv',index_col=0)
good = pd.read_csv('good'+td+'.csv',index_col=0)

In [ ]:
bad.head()

In [ ]:
recs = bad[bad['url'].str.contains('neracoos')]
print(len(recs))

In [ ]:
recs = bad[bad['url'].str.contains('ucar')]
print(len(recs))

In [ ]:
recs = bad[bad['url'].str.contains('tamu')]
print(len(recs))

In [ ]:
recs = bad[bad['url'].str.contains('axiom')]
print(len(recs))
recs.to_csv('axiom.csv')

In [ ]:
recs = bad[bad['url'].str.contains('caricoos')]
print(len(recs))
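
Rather than checking providers one at a time, a hedged alternative (a sketch using Python 2's standard-library urlparse module) is to tally the failures by hostname:


In [ ]:
# Sketch: count failing endpoints per hostname to see which providers
# account for the most failures.
from urlparse import urlparse
hosts = bad['url'].map(lambda u: urlparse(u).netloc)
print(hosts.value_counts().head(10))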

In [ ]:
!git add *.csv

In [ ]:
!git commit -m 'new csv'

In [ ]:
!git push
