In [1]:
%matplotlib inline
import nivapy3 as nivapy
import numpy as np
import pandas as pd
import os
import seaborn as sn
import matplotlib.pyplot as plt
import toc_trends_analysis as resa2_trends
import warnings

# Suppress the (harmless) numpy warning raised when a station/period
# combination contains no observations
warnings.filterwarnings("ignore", message="Mean of empty slice")

plt.style.use('ggplot')
In [2]:
# Connect to NIVABASE. 'eng' is the database engine/connection used by
# all the data queries below
eng = nivapy.da.connect()
In [3]:
# Select projects. Displays a selection grid of RESA projects; the
# ticked selection is read back in the next cell via get_selected_df()
prj_grid = nivapy.da.select_resa_projects(eng)
prj_grid
In [4]:
# Read back the projects ticked in the grid above
prj_df = prj_grid.get_selected_df()
print(len(prj_df))
prj_df
Out[4]:
In [5]:
# Get stations associated with the selected projects
stn_df = nivapy.da.select_resa_project_stations(prj_df, eng)
print(len(stn_df))
stn_df.head()
Out[5]:
In [6]:
# Map the station locations; each marker is labelled with its
# station code
nivapy.spatial.quickmap(stn_df, popup='station_code')
Out[6]:
In [7]:
# User input
# Specify projects of interest: one 'ICPWaters XX' project per country
country_codes = ['US', 'NO', 'CA', 'UK', 'FI', 'SE', 'CZ', 'IT',
                 'PL', 'CH', 'LV', 'EE', 'IE', 'MD', 'DE']
proj_list = ['ICPWaters ' + code for code in country_codes]

# Specify results folder
res_fold = r'../../../Thematic_Trends_Report_2019/results'
In [8]:
# Specify period of interest
st_yr, end_yr = 1990, 2016

# Period label used in all output file and folder names
period = '%s-%s' % (st_yr, end_yr)

# Build output paths
plot_fold = os.path.join(res_fold, 'trends_plots_%s' % period)
res_csv = os.path.join(res_fold, 'res_%s.csv' % period)
dup_csv = os.path.join(res_fold, 'dup_%s.csv' % period)
nd_csv = os.path.join(res_fold, 'nd_%s.csv' % period)

# Run analysis. Returns the trend results, any duplicated samples and
# (possibly None) the stations with no data in the period
res_df, dup_df, nd_df = resa2_trends.run_trend_analysis(proj_list,
                                                        eng,
                                                        st_yr=st_yr,
                                                        end_yr=end_yr,
                                                        plot=False,
                                                        fold=plot_fold)

# The Mann-Kendall std. dev. column is not relevant here
del res_df['mk_std_dev']

# Write output
res_df.to_csv(res_csv, index=False)
dup_df.to_csv(dup_csv, index=False)
if nd_df is not None:
    nd_df.to_csv(nd_csv, index=False)
There are lots of warnings printed above, but the main one of interest is:
Some stations have no relevant data in the period specified.
Which station(s) are missing data?
In [9]:
# Get stations with no data in the period specified.
# nd_df is None when every station has data (the analysis cell above
# only writes nd_csv after the same check), so guard before indexing -
# the original crashed with a TypeError in that case.
no_data_ids = nd_df['station_id'] if nd_df is not None else []
stn_df[stn_df['station_id'].isin(no_data_ids)]
Out[9]:
It seems that one Irish station has no associated data. This is as expected, because all the data supplied by Julian for this site comes from "near-shore" sampling (rather than "open water") and these have been omitted from the data upload - see here for details.
In [10]:
# Set up plot with an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(20, 10))
sn.set(style="ticks", palette="muted",
       color_codes=True, font_scale=2)

# Horizontal boxplots: distribution of station means per parameter
sn.boxplot(x="mean", y="par_id", data=res_df,
           whis=np.inf, color="c", ax=ax)

# Add "raw" data points for each observation, with some "jitter"
# to make them visible
sn.stripplot(x="mean", y="par_id", data=res_df, jitter=True,
             size=3, color=".3", linewidth=0, ax=ax)

# Remove axis lines
sn.despine(trim=True)
The code below is taken from here. It is used to generate output files in the format requested by Heleen.
In [11]:
# Change 'period' col to 'data_period' and add 'analysis_period'.
# Build the label from st_yr/end_yr (set when the analysis was run)
# rather than hard-coding '1990-2016', so it always matches the period
# actually analysed above
res_df['data_period'] = res_df['period']
del res_df['period']
res_df['analysis_period'] = '%s-%s' % (st_yr, end_yr)

# Join station details onto the trend results
df = pd.merge(res_df, stn_df, how='left', on='station_id')

# Re-order columns
df = df[['station_id',
         'station_code', 'station_name',
         'latitude', 'longitude', 'analysis_period', 'data_period',
         'par_id', 'non_missing', 'n_start', 'n_end', 'mean', 'median',
         'std_dev', 'mk_stat', 'norm_mk_stat', 'mk_p_val', 'trend',
         'sen_slp']]

df.head()
Out[11]:
In [12]:
def include(row):
    """
    Decide whether a single trend result has enough data to be used.

    A row is included when it has at least 2 values at both the start
    and the end of the analysis period, plus a minimum total number of
    non-missing values: 18 for the full 1990-2016 period, 10 for the
    shorter 1990-2004 and 2002-2016 periods. Any other period is never
    included.

    Args:
        row: Row of the results dataframe. Must provide the keys
             'analysis_period', 'n_start', 'n_end' and 'non_missing'.

    Returns:
        'yes' if the row meets the criteria, otherwise 'no'.
    """
    # Minimum number of non-missing values required for each period
    # (the longer period demands more data points)
    min_non_missing = {'1990-2016': 18,
                       '1990-2004': 10,
                       '2002-2016': 10}

    thresh = min_non_missing.get(row['analysis_period'])
    if thresh is None:
        # Unknown analysis period => never included
        return 'no'

    # Short-circuiting 'and' rather than bitwise '&' on comparisons
    if (row['n_start'] >= 2) and (row['n_end'] >= 2) and \
            (row['non_missing'] >= thresh):
        return 'yes'
    else:
        return 'no'
# Flag each row according to the data-availability criteria above
df['include'] = df.apply(include, axis=1)
The sulphate (SO4) series for this station (Abiskojaure, 'station_id=36458' in the "core" dataset) should be removed. See here for details.
In [13]:
# Remove sulphate-related series at Abiskojaure
so4_pars = ['ESO4', 'ESO4X', 'ESO4_ECl']
abisko_so4 = (df['station_id'] == 36458) & df['par_id'].isin(so4_pars)
df = df[~abisko_so4]
In [14]:
# Relative slope: Sen's slope expressed as a proportion of the series
# median. NOTE(review): rows where median == 0 will produce inf/NaN
# here - confirm how these are handled downstream
df['rel_sen_slp'] = df['sen_slp'] / df['median']
In [15]:
# Remove unwanted cols
df = df.drop(columns=['mean', 'n_end', 'n_start', 'mk_stat',
                      'norm_mk_stat'])

# Reorder columns
col_order = ['station_id', 'station_code',
             'station_name', 'latitude', 'longitude', 'analysis_period',
             'data_period', 'par_id', 'non_missing', 'median', 'std_dev',
             'mk_p_val', 'trend', 'sen_slp', 'rel_sen_slp', 'include']
df = df[col_order]

# Write to output
out_path = os.path.join(res_fold, 'toc_core_trends_long_format.csv')
df.to_csv(out_path, index=False, encoding='utf-8')
df.head()
Out[15]:
In [16]:
# 'data_period' is not needed in the wide-format output
del df['data_period']

# Melt to "long" format: one row per (station, parameter, statistic)
melt_df = pd.melt(df,
                  id_vars=['station_id', 'station_code',
                           'station_name', 'latitude', 'longitude',
                           'analysis_period', 'par_id', 'include'],
                  var_name='stat')

# Get only values where include='yes'
melt_df = melt_df.query('include == "yes"')
del melt_df['include']

# Build multi-index on everything except "value"
melt_df.set_index(['station_id', 'station_code',
                   'station_name', 'latitude', 'longitude', 'par_id',
                   'analysis_period',
                   'stat'], inplace=True)

# Unstack levels of interest to columns, giving one column per
# (par_id, analysis_period, stat) combination
wide_df = melt_df.unstack(level=['par_id', 'analysis_period', 'stat'])

# Drop unwanted "value" level in index (left over from pd.melt)
wide_df.columns = wide_df.columns.droplevel(0)

# Replace multi-index with separate components concatenated with '_',
# e.g. 'TOC_1990-2016_sen_slp'
wide_df.columns = ["_".join(item) for item in wide_df.columns]

# Reset multiindex on rows
wide_df = wide_df.reset_index()

# Save output
out_path = os.path.join(res_fold, 'toc_trends_wide_format.csv')
wide_df.to_csv(out_path, index=False, encoding='utf-8')
wide_df.head()
Out[16]: