In [22]:
import datetime
import re


import netCDF4
import pandas
import numpy as np

In [23]:
def df2nc(df, filename, attributes=None, prefix="", mode='w', standard_names=None, units=None):
    if attributes is None:
        attributes = {}
    if units is None:
        units = {}
    if standard_names is None:
        standard_names = {}
    nc = netCDF4.Dataset(filename, mode)

    ## ADD DIMENSIONS
    records_dim = nc.createDimension(prefix + 'records', len(df))

    ## ADD GLOBAL ATTRIBUTES
    # see http://www.unidata.ucar.edu/software/thredds/current/netcdf-java/formats/DataDiscoveryAttConvention.html

    for key, val in attributes.items():
        setattr(nc, key, val)

    ## ADD VARIABLES
    for name, column in df.iteritems():
        assert isinstance(column, pandas.core.series.Series), "expected to iterate over columns"
        if column.dtype.char == 'O':
            # try and cast to strong
            column = column.astype('S')
        # no longs in opendap
        if column.dtype.char == 'l':
            column = column.astype('int32')
        var = nc.createVariable(prefix + name, column.dtype.str, (prefix + 'records', ))
        var.long_name = name
        if name in units:
            var.units = units[name]
        if name in standard_names:
            var.standard_name = standard_names[name]
            
            
        var[:] = np.asarray(column)

    nc.close()

In [24]:
"""
%   METADATA FIELDS
%   id                PSMSL id of station
%   latitude          Latitude of station
%   longitude         Longitude of station
%   name              Name of station
%   coastline         Old coastline code of station
%   stationcode       Old stationcode of station
%       (So, coastline/stationcode is the old PSMSL id of the station)
%   stationflag       Is entire station flagged for attention? True
%                       indicates yes: refer to station documentation for
%                       further information
"""
metadata_fields = ['id', 'latitude', 'longitude', 'name', 'coastline', 'stationcode', 'stationflag']
dtypes = {
    "id": int,
    "latitude": float,
    "longitude": float,
    "name": str,
    "coastline": int,
    "stationcode": int,
    "stationflag": str
}
converters = {
    "name": lambda x: x.strip()
}
stations = pandas.read_csv('../data/psmsl/rlr_annual/filelist.txt', sep=';', 
                     names=metadata_fields, dtype=dtypes,
                    converters=converters)
attributes = {
    "Conventions": 'CF-1.6',
    "Metadata_Conventions": 'Unidata Dataset Discovery v1.0',
    "standard_name_vocabulary": 'CF-1.6',
    "title": 'PSMSL Tide Gauge Dataset',
    "summary": '',
    "source": 'http://www.psmsl.org/data/',
    "uuid": '97110a34-ae15-11e5-868e-3c15c2d29372',
    "id": '',
    "institution": 'Deltares',
    "creator_name": 'Fedor Baart',
    "creator_url": 'https://www.linkedin.com/in/fedorbaart',
    "creator_email": 'fedor.baart@deltares.nl',
    "date_created": '2015-12-29T10:11Z',
    "date_modified": '2015-12-29T10:11Z',
    "date_issued": '2015-12-29T10:11Z',
    "publisher_name": 'PSMSL',
    "publisher_email": 'psmsl@noc.ac.uk',
    "publisher_url": 'http://www.psmsl.org/',
    "history": 'Created n %s' % (datetime.datetime.now(),),
    "references": 'http://www.jstor.org/stable/4299170',
    "license": 'Undefined'
}

In [25]:
filename = '../data/catalogue.dat'
lines = open(filename).readlines()
gloss_pattern = re.compile(r'GLOSS\s*(?P<gloss>\d+)')
station_id_pattern = re.compile(r'STATION ID\s*:\s*(?P<station_id>\d+)')
station_gloss = {}
for i, line in enumerate(lines):
    gloss_match = gloss_pattern.search(line)
    if gloss_match:
        station_id_match = station_id_pattern.search(lines[i+1])
        if station_id_match:
            gloss = int(gloss_match.group('gloss'))
            station = int(station_id_match.group('station_id'))
            station_gloss[station] = gloss
        else:
            raise ValueError

In [26]:
filename = '../data/psmsl/metadata.nc'
df2nc(stations, filename, attributes)

In [27]:
def store_station(station):
    """
    %   DATA FIELDS
    %   year              Year of data
    %   height            Annual height relative to RLR, in millimetres
    %                       NaN indicates that data is missing
    %   interpolated      Does the value include large amounts of
    %                       inferred data (see PSMSL help file for information)
    %   dataflag          Quality control flag - true values indicate problems
    %                       with the data - see station documentation for
    %                       further information
    """
    ncfile = '../data/psmsl/station_%d.nc' % (station.id)

    station['gloss_id'] = str(station_gloss.get(station.id))

    station_attributes = attributes.copy()
    station_attributes.update(station.to_dict())
    # name is not updated
    station_attributes['station_name'] = station['name']

    units = {'latitude': 'degrees_north', 'longitude': 'degreas_east'}
    standard_names = {'longitude': 'longitude', 'latitude': 'latitude'}

    df = pandas.DataFrame.from_records([station])
    
    df2nc(df, ncfile, attributes=station_attributes, prefix="", mode='w', units=units, standard_names=standard_names)

    names = names=('year', 'height', 'interpolated', 'dataflag')
    na_values = {"height": [-99999]}
    units = {'height': 'm'}
    standard_names = {'height': 'sea_surface_height'}

    filename = "../data/psmsl/rlr_annual/data/%d.rlrdata" % (station.id,)
    df = pandas.read_csv(filename, sep=';', names=names, na_values=na_values)
    df2nc(df, ncfile, prefix="annual_", mode='a', units=units, standard_names=standard_names)

    filename = "../data/psmsl/rlr_monthly/data/%d.rlrdata" % (station.id,)
    df = pandas.read_csv(filename, sep=';', names=names, na_values=na_values)
    df2nc(df, ncfile, prefix="monthly_", mode='a', units=units, standard_names=standard_names)

    filename = "../data/psmsl/met_monthly/data/%d.metdata" % (station.id,)
    df = pandas.read_csv(filename, sep=';', names=names, na_values=na_values)
    df2nc(df, ncfile, prefix="metric_", mode='a', units=units, standard_names=standard_names)

In [28]:
for i, station in stations.iterrows():
    station.id
    store_station(station)

In [29]:
names = names=('year', 'height', 'interpolated', 'dataflag')
na_values = {"height": [-99999]}
units = {'height': 'm'}
standard_names = {'height': 'sea_surface_height'}

filename = "../data/psmsl/rlr_monthly/data/%d.rlrdata" % (station.id,)
df = pandas.read_csv(filename, sep=';', names=names, na_values=na_values)
df.interpolated.dtype.char


Out[29]:
'l'

In [ ]:
def store_station(station):
    """
    %   DATA FIELDS
    %   year              Year of data
    %   height            Annual height relative to RLR, in millimetres
    %                       NaN indicates that data is missing
    %   interpolated      Does the value include large amounts of
    %                       inferred data (see PSMSL help file for information)
    %   dataflag          Quality control flag - true values indicate problems
    %                       with the data - see station documentation for
    %                       further information
    """

    df = pandas.DataFrame.from_records([station])
    
    df2nc(df, ncfile, attributes=station_attributes, prefix="", mode='w', units=units, standard_names=standard_names)

    names = names=('year', 'height', 'interpolated', 'dataflag')
    na_values = {"height": [-99999]}
    units = {'height': 'm'}
    standard_names = {'height': 'sea_surface_height'}

    filename = "../data/psmsl/rlr_annual/data/%d.rlrdata" % (station.id,)
    df = pandas.read_csv(filename, sep=';', names=names, na_values=na_values)
    df2nc(df, ncfile, prefix="annual_", mode='a', units=units, standard_names=standard_names)

    filename = "../data/psmsl/rlr_monthly/data/%d.rlrdata" % (station.id,)
    df = pandas.read_csv(filename, sep=';', names=names, na_values=na_values)
    df2nc(df, ncfile, prefix="monthly_", mode='a', units=units, standard_names=standard_names)

    filename = "../data/psmsl/met_monthly/data/%d.metdata" % (station.id,)
    df = pandas.read_csv(filename, sep=';', names=names, na_values=na_values)
    df2nc(df, ncfile, prefix="metric_", mode='a', units=units, standard_names=standard_names)

In [42]:
import geojson
import json
features = []
for i, station in stations.iterrows():
    geometry = geojson.Point(coordinates=(station.longitude, station.latitude))
    feature = geojson.Feature(
        id=station.id, 
        geometry=geometry, 
        properties=station.to_dict()
    )
    features.append(feature)
fc = geojson.FeatureCollection(features)
with open("../data/psmsl/stations.json", "w") as f:
    json.dump(fc, f)

In [35]:
for i, station in stations.iterrows():
    filename = "../data/psmsl/rlr_monthly/data/%d.rlrdata" % (station.id,)
    df = pandas.read_csv(filename, sep=';', names=names, na_values=na_values)


Out[35]:
geojson.feature.Feature

In [ ]: