In [22]:
import datetime
import re
import netCDF4
import pandas
import numpy as np
In [23]:
def df2nc(df, filename, attributes=None, prefix="", mode='w', standard_names=None, units=None):
    if attributes is None:
        attributes = {}
    if units is None:
        units = {}
    if standard_names is None:
        standard_names = {}
    nc = netCDF4.Dataset(filename, mode)
    ## ADD DIMENSIONS
    records_dim = nc.createDimension(prefix + 'records', len(df))
    ## ADD GLOBAL ATTRIBUTES
    # see http://www.unidata.ucar.edu/software/thredds/current/netcdf-java/formats/DataDiscoveryAttConvention.html
    for key, val in attributes.items():
        setattr(nc, key, val)
    ## ADD VARIABLES
    for name, column in df.iteritems():
        assert isinstance(column, pandas.core.series.Series), "expected to iterate over columns"
        if column.dtype.char == 'O':
            # try and cast object columns to fixed-width strings
            column = column.astype('S')
        # no longs in opendap, downcast to 32-bit integers
        if column.dtype.char == 'l':
            column = column.astype('int32')
        var = nc.createVariable(prefix + name, column.dtype.str, (prefix + 'records', ))
        var.long_name = name
        if name in units:
            var.units = units[name]
        if name in standard_names:
            var.standard_name = standard_names[name]
        var[:] = np.asarray(column)
    nc.close()
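A quick way to sanity-check the helper is to write a toy DataFrame and read it back. This is a minimal sketch with a hypothetical file name and made-up values, assuming the current directory is writable:

# sketch: smoke test for df2nc on a toy DataFrame (hypothetical file name)
example = pandas.DataFrame({
    "year": np.array([2013, 2014, 2015], dtype='int32'),
    "height": np.array([7001.0, 7003.5, 6998.2])
})
df2nc(example, 'df2nc_example.nc', attributes={"title": "df2nc smoke test"},
      prefix="annual_", units={"height": "mm"})
nc = netCDF4.Dataset('df2nc_example.nc')
print(nc.variables.keys())                    # expect annual_year and annual_height
print(nc.variables['annual_height'].units)    # 'mm'
nc.close()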
In [24]:
"""
% METADATA FIELDS
% id PSMSL id of station
% latitude Latitude of station
% longitude Longitude of station
% name Name of station
% coastline Old coastline code of station
% stationcode Old stationcode of station
% (So, coastline/stationcode is the old PSMSL id of the station)
% stationflag Is entire station flagged for attention? True
% indicates yes: refer to station documentation for
% further information
"""
metadata_fields = ['id', 'latitude', 'longitude', 'name', 'coastline', 'stationcode', 'stationflag']
dtypes = {
    "id": int,
    "latitude": float,
    "longitude": float,
    "name": str,
    "coastline": int,
    "stationcode": int,
    "stationflag": str
}
converters = {
    "name": lambda x: x.strip()
}
stations = pandas.read_csv('../data/psmsl/rlr_annual/filelist.txt', sep=';',
                           names=metadata_fields, dtype=dtypes,
                           converters=converters)
attributes = {
    "Conventions": 'CF-1.6',
    "Metadata_Conventions": 'Unidata Dataset Discovery v1.0',
    "standard_name_vocabulary": 'CF-1.6',
    "title": 'PSMSL Tide Gauge Dataset',
    "summary": '',
    "source": 'http://www.psmsl.org/data/',
    "uuid": '97110a34-ae15-11e5-868e-3c15c2d29372',
    "id": '',
    "institution": 'Deltares',
    "creator_name": 'Fedor Baart',
    "creator_url": 'https://www.linkedin.com/in/fedorbaart',
    "creator_email": 'fedor.baart@deltares.nl',
    "date_created": '2015-12-29T10:11Z',
    "date_modified": '2015-12-29T10:11Z',
    "date_issued": '2015-12-29T10:11Z',
    "publisher_name": 'PSMSL',
    "publisher_email": 'psmsl@noc.ac.uk',
    "publisher_url": 'http://www.psmsl.org/',
    "history": 'Created on %s' % (datetime.datetime.now(),),
    "references": 'http://www.jstor.org/stable/4299170',
    "license": 'Undefined'
}
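A quick look at what was parsed (shape, dtypes and the first rows) confirms the column names and converters behaved as intended:

# inspect the parsed PSMSL file list
print(stations.shape)
print(stations.dtypes)
stations.head()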
In [25]:
filename = '../data/catalogue.dat'
lines = open(filename).readlines()
gloss_pattern = re.compile(r'GLOSS\s*(?P<gloss>\d+)')
station_id_pattern = re.compile(r'STATION ID\s*:\s*(?P<station_id>\d+)')
station_gloss = {}
for i, line in enumerate(lines):
    gloss_match = gloss_pattern.search(line)
    if gloss_match:
        station_id_match = station_id_pattern.search(lines[i + 1])
        if station_id_match:
            gloss = int(gloss_match.group('gloss'))
            station = int(station_id_match.group('station_id'))
            station_gloss[station] = gloss
        else:
            raise ValueError("GLOSS number at line %d is not followed by a STATION ID" % (i,))
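The two patterns assume that catalogue.dat puts the GLOSS number on one line and the matching PSMSL station id on the next. A small illustration of that assumed layout, with made-up numbers:

# sketch: the assumed two-line layout in catalogue.dat (numbers are made up)
sample = [
    "GLOSS  123",
    "STATION ID :  456",
]
print(gloss_pattern.search(sample[0]).group('gloss'))             # '123'
print(station_id_pattern.search(sample[1]).group('station_id'))   # '456'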
In [26]:
filename = '../data/psmsl/metadata.nc'
df2nc(stations, filename, attributes)
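Reopening the file shows the global attributes and one variable per column of the stations table:

# verify the metadata file that was just written
nc = netCDF4.Dataset(filename)
print(nc.title)
print(list(nc.variables.keys()))
nc.close()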
In [27]:
def store_station(station):
    """
    % DATA FIELDS
    % year          Year of data
    % height        Annual height relative to RLR, in millimetres
    %               NaN indicates that data is missing
    % interpolated  Does the value include large amounts of
    %               inferred data (see PSMSL help file for information)
    % dataflag      Quality control flag - true values indicate problems
    %               with the data - see station documentation for
    %               further information
    """
    ncfile = '../data/psmsl/station_%d.nc' % (station.id,)
    station['gloss_id'] = str(station_gloss.get(station.id))
    station_attributes = attributes.copy()
    station_attributes.update(station.to_dict())
    # the 'name' attribute is not updated on the dataset, so store it separately as station_name
    station_attributes['station_name'] = station['name']
    units = {'latitude': 'degrees_north', 'longitude': 'degrees_east'}
    standard_names = {'longitude': 'longitude', 'latitude': 'latitude'}
    df = pandas.DataFrame.from_records([station])
    df2nc(df, ncfile, attributes=station_attributes, prefix="", mode='w', units=units, standard_names=standard_names)
    names = ('year', 'height', 'interpolated', 'dataflag')
    na_values = {"height": [-99999]}
    # heights are in millimetres (see the field description above)
    units = {'height': 'mm'}
    standard_names = {'height': 'sea_surface_height'}
    filename = "../data/psmsl/rlr_annual/data/%d.rlrdata" % (station.id,)
    df = pandas.read_csv(filename, sep=';', names=names, na_values=na_values)
    df2nc(df, ncfile, prefix="annual_", mode='a', units=units, standard_names=standard_names)
    filename = "../data/psmsl/rlr_monthly/data/%d.rlrdata" % (station.id,)
    df = pandas.read_csv(filename, sep=';', names=names, na_values=na_values)
    df2nc(df, ncfile, prefix="monthly_", mode='a', units=units, standard_names=standard_names)
    filename = "../data/psmsl/met_monthly/data/%d.metdata" % (station.id,)
    df = pandas.read_csv(filename, sep=';', names=names, na_values=na_values)
    df2nc(df, ncfile, prefix="metric_", mode='a', units=units, standard_names=standard_names)
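Before looping over all stations it can help to try a single record first. A sketch using the first row (an arbitrary choice) and the same file-name pattern as above:

# sketch: smoke test store_station on one station before the full loop
first = stations.iloc[0]
store_station(first)
nc = netCDF4.Dataset('../data/psmsl/station_%d.nc' % (first.id,))
print(nc.station_name, nc.gloss_id)
print([name for name in nc.variables if name.startswith('annual_')])
nc.close()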
In [28]:
for i, station in stations.iterrows():
    store_station(station)
In [29]:
names = ('year', 'height', 'interpolated', 'dataflag')
na_values = {"height": [-99999]}
units = {'height': 'm'}
standard_names = {'height': 'sea_surface_height'}
filename = "../data/psmsl/rlr_monthly/data/%d.rlrdata" % (station.id,)
df = pandas.read_csv(filename, sep=';', names=names, na_values=na_values)
df.interpolated.dtype.char
Out[29]:
In [42]:
import geojson
import json
features = []
for i, station in stations.iterrows():
    geometry = geojson.Point(coordinates=(station.longitude, station.latitude))
    feature = geojson.Feature(
        id=station.id,
        geometry=geometry,
        properties=station.to_dict()
    )
    features.append(feature)
fc = geojson.FeatureCollection(features)
with open("../data/psmsl/stations.json", "w") as f:
    json.dump(fc, f)
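Assuming the dump above succeeded, reading the file back confirms it is a valid FeatureCollection with one feature per station:

# read the GeoJSON back and check the feature count
with open("../data/psmsl/stations.json") as f:
    fc_check = json.load(f)
print(fc_check['type'], len(fc_check['features']))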
In [35]:
for i, station in stations.iterrows():
    filename = "../data/psmsl/rlr_monthly/data/%d.rlrdata" % (station.id,)
    df = pandas.read_csv(filename, sep=';', names=names, na_values=na_values)
Out[35]:
In [ ]: