This notebook contains everything you need to create a nice, neat list of metadata dictionaries out of netCDF files. In this case we make one metadata dictionary for each day in a five-year span. A dictionary is only created when there is data available on the given day, and up to 8 datafiles are represented on each day. Each file contains data from various sensors, reported out in a whole slew of variables. Each variable has attributes associated with it in the netCDF file. These attributes are carried over into the dict, and other attributes are added, such as a flags list that records various problematic data situations (missing data, unreasonable data, ...).
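To make the target structure concrete, here is a hedged sketch of what one day's dictionary looks like; the field names mirror the code below, but the values are invented:

In [ ]:
# Illustrative only -- invented values; the structure mirrors the code below
example_day = {
    'year': 2014, 'month': 1, 'doy': 1, 'date': '2014-01-01',
    'files': [
        {'filename': 'Table1',
         'frequency': 60.0,  # mean measured sample spacing, in seconds
         'variables': [
             {'var': 'AirTC_Avg', 'units': 'Deg C',
              'count': '1440', 'mean': '21', 'flags': []},
         ]},
    ],
}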
In [1]:
from __future__ import print_function
import pandas as pd
import datetime as dt
import numpy as np
import os
import xray
from posixpath import join
ROOTDIR = 'C:/Users/Julia/Documents/GitHub/MpalaTower/raw_netcdf_output/'
data = 'Table1'
datas = ['upper', 'Table1', 'lws', 'licor6262', 'WVIA',
'Manifold', 'flux', 'ts_data', 'Table1Rain']
non_static_attrs = ['instrument', 'source', 'program', 'logger']
static_attrs = ['station_name', 'lat', 'lon', 'elevation',
'Year', 'Month', 'DOM', 'Minute', 'Hour',
'Day_of_Year', 'Second', 'uSecond', 'WeekDay']
# Set expected ranges by unit. It is fine to include multiple spellings of
# the same unit; just put all of them in the list.
flag_by_units = {}
temp_min = 0
temp_max = 40
temp = ['Deg C', 'C']
for unit in temp:
    flag_by_units.update({unit: {'min': temp_min, 'max': temp_max}})
percent_min = 0
percent_max = 100
percent = ['percent', '%']
for unit in percent:
    flag_by_units.update({unit: {'min': percent_min, 'max': percent_max}})
# No ranges defined yet for soil heat flux or its calibration, so these
# units are never registered in flag_by_units and are never range-checked
shf_min = ''
shf_max = ''
shf = ['W/m^2']
shf_cal_min = ''
shf_cal_max = ''
shf_cal = ['W/(m^2 mV)']
batt_min = 11
batt_max = 240
batt = ['Volts', 'V']
for unit in batt:
    flag_by_units.update({unit: {'min': batt_min, 'max': batt_max}})
PA_min = 15
PA_max = 25
PA = ['uSec']
for unit in PA:  # register the period-average (uSec) range as well
    flag_by_units.update({unit: {'min': PA_min, 'max': PA_max}})
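A quick sanity check of the lookups that generate_local_attrs performs further down (the commented results are what the dict above implies):

In [ ]:
flag_by_units['Deg C']      # {'min': 0, 'max': 40}
flag_by_units.get('W/m^2')  # None -- no range registered, so never flagged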
In [2]:
def process_netcdf(input_dir, data, f, static_attrs):
    ds = xray.open_dataset(join(input_dir, data, f),
                           decode_cf=True, decode_times=True)
    df = ds.to_dataframe()
    # drop columns that don't change with time (lat, lon, etc.)
    exclude = [var for var in static_attrs if var in df.columns]
    df_var = df.drop(exclude, axis=1)
    df_clean = df_var.dropna(axis=1, how='all')  # drop all-NaN vars
    # get some descriptive statistics on each of the variables
    df_int = df_clean.describe()
    df_summ = pd.DataFrame(df_int, dtype=str)
    for i in df_int:
        # store rounded string values instead of floats;
        # this helps to control the size of the final array
        for k in range(len(df_int)):
            try:
                precision = 2  # higher numbers improve precision
                sigfig = precision - int(np.log10(abs(df_int[i][k])))
                command = '%2.' + str(sigfig) + 'f'
                df_summ[i][k] = command % df_int[i][k]
            except:  # leave zero/non-finite entries as their string default
                pass
    return ds, df_var, df_clean, df_summ
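A hedged usage sketch; the filename here is hypothetical, just following the raw_MpalaTower_YYYY_DDD.nc pattern used in the daily loop below:

In [ ]:
# Hypothetical call -- the file name is invented for illustration
ds, df_var, df_clean, df_summ = process_netcdf(
    ROOTDIR, 'Table1', 'raw_MpalaTower_2014_001.nc', static_attrs)
df_summ.head()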
In [3]:
def convert_to_sec(num, units):
    if units.startswith(('Min', 'min')):
        out = int(num) * 60
    elif units.startswith(('ms', 'mS')):  # prefixes must be passed as a tuple
        out = float(num) / 1000
    elif units.startswith(('s', 'S')):
        out = int(num)
    else:
        print('couldn\'t parse units')
        return (num, units)
    return out
In [4]:
convert_to_sec(10, 'Min')
Out[4]:
600
In [5]:
def generate_local_attrs(ds, df_summ, var, flag_by_units):
    local_attrs = {'var': var}
    local_attrs.update(ds[var].attrs)
    local_attrs.update(df_summ[var].to_dict())
    # check status of data and raise flags
    var_count = float(local_attrs['count'])
    var_max = float(local_attrs['max'])
    var_min = float(local_attrs['min'])
    var_mean = float(local_attrs['mean'])
    flags = []
    n = len(ds['time'])
    if n * 11 / 12 < var_count < n:
        flags.append('missing a little data')
    elif n / 2 < var_count <= n * 11 / 12:
        flags.append('missing some data')
    elif var_count <= n / 2:
        flags.append('missing lots of data')
    if var.startswith('del'):
        pass
    elif local_attrs.get('comment') == 'Std':  # don't range-check std devs
        pass
    else:
        try:
            if var_max > flag_by_units[local_attrs['units']]['max']:
                flags.append('contains high values')
            if var_min < flag_by_units[local_attrs['units']]['min']:
                flags.append('contains low values')
        except KeyError:  # units missing, or not registered in flag_by_units
            pass
    if var.endswith(('total', 'Total', 'tot', 'Tot')):
        local_attrs.update({'total': var_mean * var_count})
    local_attrs['flags'] = flags
    return local_attrs
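Assuming ds and df_summ from the process_netcdf sketch above, a single variable's attributes and flags come out like this ('AirTC_Avg' is a hypothetical column name):

In [ ]:
# Hypothetical: 'AirTC_Avg' stands in for any column of df_summ
local_attrs = generate_local_attrs(ds, df_summ, 'AirTC_Avg', flag_by_units)
local_attrs['flags']  # e.g. ['missing a little data']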
In [6]:
def programmed_frequency(ROOTDIR, data_dict, j):
    data = data_dict['files'][j]['filename']
    program = data_dict['files'][j]['source'].split('CPU:')[1].split(',')[0]
    try:
        f = open(join(ROOTDIR, 'programs', program))
    except IOError:
        print('program not found', program)
        raise
    lines = f.readlines()
    k = 0
    freq = {}
    DT = 'DataTable'
    DI = 'DataInterval'
    for i in range(len(lines)):
        # match the declarations at any indentation level
        if lines[i].lstrip().startswith(DT) and data in lines[i]:
            freq.update({'datatable': {data: {}}})
            k = i
        if lines[i].lstrip().startswith(DI) and i <= (k + 2):
            interval = lines[i].split(',')[1]
            print(interval)
            units = lines[i].split(',')[2]
            freq['datatable'][data].update({'interval': interval, 'units': units})
    try:
        num = int(interval)
    except ValueError:
        # the interval is a named constant; look up its Const definition
        scan = freq['datatable'][data]['interval']
        for line in lines:
            if line.startswith('Const ' + scan):
                num = int(line.split('=')[1].split()[0])
    freq['datatable'][data].update({'num': num})
    return freq
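For reference, programmed_frequency expects CRBasic program source shaped roughly like the invented fragment below: a DataInterval declaration within two lines of its DataTable, with an optional Const line when the interval is a named constant.

In [ ]:
# Invented CRBasic fragment illustrating what programmed_frequency parses:
#
#   Const SCAN = 30
#   DataTable (Table1,True,-1)
#     DataInterval (0,SCAN,Min,10)
#
# split(',')[1] gives the interval ('SCAN'), split(',')[2] the units ('Min');
# int('SCAN') fails, so the Const line is searched to resolve num = 30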
In [7]:
def generate_datafile_attrs(data, ds, df_var, df_clean, df_summ,
                            flag_by_units, non_static_attrs):
    datafile_attrs = {'filename': data, 'variables': []}
    # add the non-static datafile attributes from the global ones
    for attr in [d for d in non_static_attrs if d in ds.attrs]:
        datafile_attrs.update({attr: ds.attrs[attr]})
    # calculate the average measured sampling interval, in seconds
    times = ds.coords['time'].values
    freq = np.diff(times, axis=-1).mean().astype('timedelta64[s]')
    datafile_attrs['frequency'] = freq.astype(float)
    # populate the variable list with dataless (all-NaN) variables first
    empty_vars = [{'var': var, 'flags': ['no data']}
                  for var in df_var if var not in df_clean]
    empty_vars.sort(key=lambda d: d['var'])  # sort the dicts by variable name
    for var in empty_vars:
        datafile_attrs['variables'].append(var)
    # then populate it with local attributes for variables that have data
    full_vars = sorted(df_summ.columns)
    for var in full_vars:
        local_attrs = generate_local_attrs(ds, df_summ, var, flag_by_units)
        datafile_attrs['variables'].append(local_attrs)
    return datafile_attrs
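Continuing the hypothetical process_netcdf call above, one file's full attribute dict would be assembled like so:

In [ ]:
# Hypothetical: build the per-file attribute dict for one datafile
datafile_attrs = generate_datafile_attrs('Table1', ds, df_var, df_clean,
                                         df_summ, flag_by_units,
                                         non_static_attrs)
datafile_attrs['frequency'], len(datafile_attrs['variables'])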
In [10]:
data_dict = None
data_list = []
start = '2010-01-01'
end = dt.datetime.utcnow()
rng = pd.date_range(start, end, freq='D')
for date in rng:
    i = 0
    y = date.year
    m = date.month
    d = date.dayofyear
    f = 'raw_MpalaTower_%i_%03d.nc' % (y, d)
    if any(f in os.listdir(join(ROOTDIR, data)) for data in datas):
        print(date)
        data_dict = {'year': y, 'month': m, 'doy': d, 'date': date, 'files': []}
        i += 1
        for data in datas:
            if f in os.listdir(join(ROOTDIR, data)):
                print(f, data)
                ds, df_var, df_clean, df_summ = process_netcdf(
                    ROOTDIR, data, f, static_attrs)
                datafile_attrs = generate_datafile_attrs(
                    data, ds, df_var, df_clean, df_summ,
                    flag_by_units, non_static_attrs)
                data_dict['files'].append(datafile_attrs)
                if i == 1:
                    # copy the static global attributes into the day's dict
                    for attr in [d for d in ds.attrs if d not in non_static_attrs]:
                        data_dict.update({attr: ds.attrs[attr]})
        data_list.append(data_dict)
In [187]:
for data_dict in data_list:
    for j in range(len(data_dict['files'])):
        data = data_dict['files'][j]['filename']
        print(data)
        big_freq = programmed_frequency(ROOTDIR, data_dict, j)
        num = big_freq['datatable'][data]['num']
        units = big_freq['datatable'][data]['units']
        prog_freq = convert_to_sec(num, units)
        data_dict['files'][j]['prog_freq'] = prog_freq
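With both the measured and programmed frequencies stored, a quick comparison (a sketch, not part of the original pipeline) can surface files whose mean measured spacing drifts well away from the programmed interval, e.g. because of logging gaps:

In [ ]:
# Sketch: report files where measured spacing exceeds 1.5x the programmed one
for data_dict in data_list:
    for fd in data_dict['files']:
        pf = fd.get('prog_freq')
        if isinstance(pf, (int, float)) and fd['frequency'] > 1.5 * pf:
            print(data_dict['date'], fd['filename'], fd['frequency'], pf)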
In [180]:
big_freq
Out[180]:
In [6]:
def send(data_list):
    from pymongo import MongoClient
    from numpy import array  # array gives an easy nbytes size check
    db_uri = 'mongodb://joey:joejoe@dogen.mongohq.com:10097/mpala_tower_metadata'
    client = MongoClient(db_uri)
    db = client.mpala_tower_metadata
    Metadata = db.metadata
    A = array(data_list)
    print(A.nbytes, 'bytes')
    Metadata.remove({})         # clear the collection (pymongo 2.x API)
    Metadata.insert(data_list)  # bulk-insert the day dicts
In [6]:
from numpy import array
A = array(data_list)
print(A.nbytes, 'bytes')
In [7]:
send(data_list)
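A hedged read-back to confirm the upload landed, reusing the connection details from send(); count() is the pymongo 2.x collection API:

In [ ]:
# Hypothetical sanity check against the same database
from pymongo import MongoClient
client = MongoClient('mongodb://joey:joejoe@dogen.mongohq.com:10097/mpala_tower_metadata')
client.mpala_tower_metadata.metadata.count()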
In [139]:
freq = programmed_frequency(ROOTDIR, data_dict, j)
In [146]:
freq['datatable'][data]['num']
Out[146]:
In [143]:
freq
Out[143]:
In [9]:
data_list[0]
Out[9]: