In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import nivapy3 as nivapy
import seaborn as sn
import matplotlib.pyplot as plt
import warnings
import os
import pickle
import itertools
import toc_trends_analysis as toc_trends
import plotnine as pn
import geopandas as gpd
from plotnine import ggplot, geom_point, aes, facet_grid, facet_wrap, theme, element_text, scale_x_log10, scale_x_continuous
from rpy2.robjects.packages import importr
from rpy2.robjects import r, pandas2ri
from IPython.display import Image
#warnings.filterwarnings("ignore")
plt.style.use('ggplot')
pn.options.figure_size = (10, 15)
pandas2ri.activate()
bcp = importr('bcp')
In [3]:
# Connect to db
eng = nivapy.da.connect()
This notebook implements more-or-less the same workflow as described in notebook 6a, but using a broader range of stations based on less strict selection criteria. See the original notebook for details.
Heleen has suggested applying the same selection criteria as used for the TOC trends analysis with Don and John. This involves aggregating each series to annual frequency (annual medians over 1990 to 2016) and requiring that:
each series has at least one annual value in the period 1990 to 1994
each series has at least one annual value in the period 2012 to 2016
each series has at least 18 annual values in total
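These rules translate directly into a small check on each annual series. The helper below is a compact summary of the test applied by the selection loop further down; it is an illustrative sketch only, not part of the notebook's workflow.

import pandas as pd

def meets_criteria(annual_series):
    """Return True if an annual series (indexed by year, 1990-2016, with NaN
    for missing years) satisfies the relaxed selection criteria."""
    n_start = annual_series[annual_series.index < 1995].notnull().sum()  # 1990-1994
    n_end = annual_series[annual_series.index > 2011].notnull().sum()    # 2012-2016
    n_total = annual_series.notnull().sum()                              # all years
    return (n_start >= 1) and (n_end >= 1) and (n_total >= 18)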
In [4]:
# Read stations
stn_path = r'../../../all_icpw_sites_may_2019.xlsx'
stn_df = pd.read_excel(stn_path, sheet_name='all_icpw_stns')
stn_df.head()
Out[4]:
In [5]:
# Load saved chem data
wc_csv = r'../../../Thematic_Trends_Report_2019/working_chem.csv'
wc_df = pd.read_csv(wc_csv, encoding='utf-8')
wc_df['sample_date'] = pd.to_datetime(wc_df['sample_date'])
wc_df.head()
Out[5]:
In [6]:
# Melt to long format
df = wc_df.copy()
del df['station_code'], df['station_name'], df['depth1'], df['depth2']
df = pd.melt(df, id_vars=['station_id', 'sample_date'])
df.head()
Out[6]:
The code below creates a dataframe showing which series at which sites meet the criteria defined above.
In [7]:
# Dict for results
inc_dict = {'station_id':[],
'variable':[],
'include':[],
}
# Loop over time series
for stn_id in df['station_id'].unique():
# Loop over variables
for par in df['variable'].unique():
# Get data
df2 = df.query("(station_id == @stn_id) and (variable == @par)")
df2.set_index('sample_date', inplace=True)
del df2['station_id'], df2['variable']
# Annual dates from 1990 - 2016
all_dates = pd.date_range('1990-01-01', '2016-12-31', freq='A')
dates_df = pd.DataFrame(index=all_dates)
# Resample to annual
df2 = df2.resample('A').median()
df2 = dates_df.join(df2)
df2.index = df2.index.year
if pd.isna(df2['value']).all():
# Not suitable
inc_dict['station_id'].append(stn_id)
inc_dict['variable'].append(par)
inc_dict['include'].append('no')
else:
n_start = pd.notnull(df2[df2.index<(1995)]['value']).sum()
n_end = pd.notnull(df2[df2.index>(2011)]['value']).sum()
non_missing = pd.notnull(df2['value']).sum()
if (n_start >= 1) and (n_end >= 1) and (non_missing >= 18):
# Meets criteria - include
inc_dict['station_id'].append(stn_id)
inc_dict['variable'].append(par)
inc_dict['include'].append('yes')
else:
# Does not meet criteria
inc_dict['station_id'].append(stn_id)
inc_dict['variable'].append(par)
inc_dict['include'].append('no')
# Build df
inc_df = pd.DataFrame(inc_dict)
print('The number of stations with at least some time series meeting the '
'specified criteria is', len(inc_df.query('include == "yes"')['station_id'].unique()))
inc_df.head()
Out[7]:
497 out of 555 stations have at least one time series (i.e. one parameter) that meets the specified requirements.
In [8]:
# Map of selected stations
sel_stn_list = list(inc_df.query('include == "yes"')['station_id'].unique())
sel_stns = stn_df.query('station_id in @sel_stn_list')
nivapy.spatial.quickmap(sel_stns,
cluster=True,
popup='station_code')
Out[8]:
In [9]:
# Save station list
sel_stns.to_csv(r'../../../Thematic_Trends_Report_2019/results/selected_stations_relaxed_criteria.csv',
index=False,
encoding='utf-8')
The code below produces a "grid plot" showing the Sen's slope trend estimate for each data series at each site (only data series meeting the selection criteria are included). Each row corresponds to a parameter, and the first column is based on annual medians; the second column uses annual minima.
M-K and Sen's slope results for all series are also saved to a CSV file for later use.
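The Mann-Kendall test and Sen's slope used throughout are the standard nonparametric trend statistics: the M-K statistic S counts concordant minus discordant pairs of annual values, and the Sen's (Theil-Sen) slope is the median of all pairwise slopes. The sketch below shows these standard forms for a single annual series, ignoring ties and serial correlation; I am assuming nivapy.stats.mk_test() and nivapy.stats.sens_slope() implement essentially the same calculations.

import numpy as np
from scipy import stats

def mk_sen(years, values):
    """Simplified Mann-Kendall test and Theil-Sen slope for one annual series
    (no tie correction, no autocorrelation adjustment)."""
    years = np.asarray(years, dtype=float)
    values = np.asarray(values, dtype=float)
    ok = ~np.isnan(values)
    years, values = years[ok], values[ok]
    n = len(values)
    # Mann-Kendall S: sum of signs over all pairs (i < j)
    s = sum(np.sign(values[j] - values[i])
            for i in range(n - 1) for j in range(i + 1, n))
    var_s = n * (n - 1) * (2 * n + 5) / 18.0
    z = (s - np.sign(s)) / np.sqrt(var_s) if s != 0 else 0.0
    p = 2 * (1 - stats.norm.cdf(abs(z)))
    # Theil-Sen slope: median of all pairwise slopes; intercept from medians
    slopes = [(values[j] - values[i]) / (years[j] - years[i])
              for i in range(n - 1) for j in range(i + 1, n)]
    sslp = np.median(slopes)
    icpt = np.median(values) - sslp * np.median(years)
    return {'s': s, 'z': z, 'p': p, 'sslp': sslp, 'icpt': icpt}

For example, mk_sen(df_stat.index.values, df_stat['value'].values) should give broadly the same slope as the fitted lines in the grid plots below, up to the simplifications noted.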
In [10]:
%%capture
# Output folder
out_fold = r'../../../Thematic_Trends_Report_2019/results'
# Dicts for results
res_dict = {'station_id':[],
'variable':[],
'metric':[],
'mk_p_val':[],
'mk_z':[],
'mk_trend':[],
'sen_slp':[],
'sen_incpt':[],
'sen_trend':[],
}
series_dict = {}
for stn_id in inc_df['station_id'].unique():
# Determine whether to process this site
inc_site_df = inc_df.query("(station_id == @stn_id) and (include == 'yes')")
if len(inc_site_df) > 0:
# Setup plot
fig, axes = plt.subplots(nrows=8, ncols=2, figsize=(15, 20))
# Loop over variables
for row_idx, par in enumerate(inc_df['variable'].unique()):
# Determine whether to plot series
inc = inc_df.query("(station_id == @stn_id) and (variable == @par)")['include'].values[0]
if inc == 'no':
# Plot "omitted" text
axes[row_idx, 0].text(0.5, 0.5,
'Omitted due to lack of data',
verticalalignment='center',
horizontalalignment='center',
transform=axes[row_idx, 0].transAxes,
fontsize=18)
axes[row_idx, 1].text(0.5, 0.5,
'Omitted due to lack of data',
verticalalignment='center',
horizontalalignment='center',
transform=axes[row_idx, 1].transAxes,
fontsize=18)
else:
# Get data
df2 = df.query("(station_id == @stn_id) and (variable == @par)")
df2.set_index('sample_date', inplace=True)
del df2['station_id'], df2['variable']
series_dict[(stn_id, par)] = df2
# Resample to annual medians and mins
for col_idx, stat in enumerate(['annual median', 'annual minimum']):
if stat == 'annual median':
df_stat = df2.resample('A').median()
else:
df_stat = df2.resample('A').min()
df_stat.index = df_stat.index.year
# MK test
mk_df = nivapy.stats.mk_test(df_stat, 'value')
# Sen's slope
res_df, sen_df = nivapy.stats.sens_slope(df_stat,
value_col='value',
index_col=df_stat.index)
# Add results to dict
res_dict['station_id'].append(stn_id)
res_dict['variable'].append(par)
res_dict['metric'].append(stat)
res_dict['mk_p_val'].append(mk_df.loc['p'].value)
res_dict['mk_z'].append(mk_df.loc['z'].value)
res_dict['mk_trend'].append(mk_df.loc['trend'].value)
sslp = res_df.loc['sslp'].value
sincpt = res_df.loc['icpt'].value
res_dict['sen_slp'].append(sslp)
res_dict['sen_incpt'].append(sincpt)
res_dict['sen_trend'].append(res_df.loc['trend'].value)
# Plot
axes[row_idx, col_idx].plot(sen_df.index.values, sen_df['value'].values, 'bo-')
axes[row_idx, col_idx].plot(sen_df.index.values,
sen_df.index.values*sslp + sincpt, 'k-')
if par == 'TOC':
unit = 'mg-C/l'
else:
unit = 'µEq/l'
axes[row_idx, 0].set_title('Annual median %s (%s)' % (par, unit))
axes[row_idx, 1].set_title('Annual minimum %s (%s)' % (par, unit))
plt.tight_layout()
# Save
out_png = os.path.join(out_fold, 'trends_plots_1990-2016_relaxed_criteria/%s_trends_1990-2016_relaxed_criteria.png' % stn_id)
plt.savefig(out_png, dpi=200)
plt.close()
# Combine results
res_df = pd.DataFrame(res_dict)
out_csv = os.path.join(out_fold, 'trends_summary_1990-2016_relaxed_criteria.csv')
res_df.to_csv(out_csv, index=False, encoding='utf-8')
# Save series
out_pkl = os.path.join(out_fold, 'series_relaxed_criteria.pkl')
with open(out_pkl, 'wb') as handle:
pickle.dump(series_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
In [11]:
res_df.head()
Out[11]:
In [32]:
%%capture
# Output folder
out_fold = r'../../../Thematic_Trends_Report_2019/results'
# Dicts for results
res_dict = {'station_id':[],
'variable':[],
'period':[],
'sen_slp':[],
'sen_incpt':[],
'sen_trend':[],
}
for stn_id in inc_df['station_id'].unique():
# Determine whether to process this site
inc_site_df = inc_df.query("(station_id == @stn_id) and (include == 'yes')")
if len(inc_site_df) > 0:
# Loop over variables
for row_idx, par in enumerate(inc_df['variable'].unique()):
# Determine whether to plot series
inc = inc_df.query("(station_id == @stn_id) and (variable == @par)")['include'].values[0]
if inc == 'yes':
# Get data
df2 = df.query("(station_id == @stn_id) and (variable == @par)")
df2.set_index('sample_date', inplace=True)
del df2['station_id'], df2['variable']
df_stat = df2.resample('A').median()
df_stat.index = df_stat.index.year
for period in ['1990-2004', '2002-2016']:
# Get data for period
st_yr, end_yr = [int(i) for i in period.split('-')]
df3 = df_stat.loc[st_yr:end_yr]
# Sen's slope
res_df, sen_df = nivapy.stats.sens_slope(df3,
value_col='value',
index_col=df3.index)
# Add results to dict
res_dict['station_id'].append(stn_id)
res_dict['variable'].append(par)
res_dict['period'].append(period)
sslp = res_df.loc['sslp'].value
sincpt = res_df.loc['icpt'].value
res_dict['sen_slp'].append(sslp)
res_dict['sen_incpt'].append(sincpt)
res_dict['sen_trend'].append(res_df.loc['trend'].value)
# Combine results
res_df_pairwise = pd.DataFrame(res_dict)
out_csv = os.path.join(out_fold, 'pairwise_trends_long_1990-2016_relaxed_criteria.csv')
res_df_pairwise.to_csv(out_csv, index=False, encoding='utf-8')
In [33]:
# Convert to wide
res_df_pairwise_wide = res_df_pairwise.copy()
res_df_pairwise_wide['par_period'] = res_df_pairwise_wide['variable'] + '_' + res_df_pairwise_wide['period']
res_df_pairwise_wide.drop(['variable', 'period', 'sen_incpt', 'sen_trend'], axis=1, inplace=True)
res_df_pairwise_wide.set_index(['station_id', 'par_period'], inplace=True)
res_df_pairwise_wide = res_df_pairwise_wide.unstack('par_period')
res_df_pairwise_wide.reset_index(inplace=True)
res_df_pairwise_wide.index.name = ''
cols = [res_df_pairwise_wide.columns.get_level_values(0)[0]] + list(res_df_pairwise_wide.columns.get_level_values(1)[1:])
res_df_pairwise_wide.columns = cols
res_df_pairwise_wide = pd.merge(stn_df, res_df_pairwise_wide, how='inner', on='station_id')
res_df_pairwise_wide.head()
out_csv = os.path.join(out_fold, 'pairwise_trends_wide_1990-2016_relaxed_criteria.csv')
res_df_pairwise_wide.to_csv(out_csv, index=False, encoding='utf-8')
Øyvind would like to know whether the trends in medians and minima are consistent. One way to do this is to plot the Sen's slopes derived from medians against those based on minima, split by parameter. Øyvind is primarily interested in EH, ANC and alkalinity, so I'll just focus on those here.
In [12]:
# Cols of interest
cols = ['station_id', 'variable', 'metric', 'sen_slp']
# Setup plot
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(10,15))
# Loop over pars
for idx, par in enumerate(['EH', 'ANC', 'ALK-E']):
# Get data
cols = ['station_id', 'variable', 'metric', 'sen_slp']
comp_df = res_df.query("variable == @par")[cols].copy()
del comp_df['variable']
comp_df.set_index(['station_id', 'metric'], inplace=True)
# Convert to 'wide' format
comp_df = comp_df.unstack('metric')
comp_df.columns = comp_df.columns.get_level_values('metric')
# Plot
axes[idx].plot(comp_df['annual median'], comp_df['annual minimum'], 'bo', label="Sen's slopes")
axes[idx].plot(comp_df['annual median'], comp_df['annual median'], 'k-', label='1:1 line')
axes[idx].set_xlabel('Slope for annual medians ($yr^{-1}$)', fontsize=14)
axes[idx].set_ylabel('Slope for annual minima ($yr^{-1}$)', fontsize=14)
axes[idx].legend(loc='best', fontsize=14)
axes[idx].set_title('%s (µEq/l)' % par)
plt.tight_layout()
# Save
out_png = os.path.join(out_fold, 'trend_minima_vs_medians_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)
In general, the slopes obtained from annual medians are very similar to those based on annual minima.
Note: There are a few outliers on the plots above that probably warrant further investigation.
Another visualisation option is to create heatmaps showing whether trends in annual minima are classified as being "significant" in the same way as trends in annual medians. The labels on the plots below show the percentage of the total sites for each variable in each of 9 classes. High proportions along the diagonal indicate that results based on minima and medians are giving essentially the same overall picture. The overall direction of the trends is also very clear.
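As a cross-check on the counting loop in the next cell, the same 3x3 percentage matrix can be produced with a normalised crosstab. A toy, self-contained example is shown below (hypothetical trend labels; in the real case the two columns come from the wide-format comp_df built in the next cell).

import pandas as pd

# Hypothetical trend classifications for four sites
toy = pd.DataFrame({'annual_median':  ['increasing', 'increasing', 'no trend', 'decreasing'],
                    'annual_minimum': ['increasing', 'no trend', 'no trend', 'decreasing']})
order = ['decreasing', 'no trend', 'increasing']
pct = pd.crosstab(toy['annual_minimum'], toy['annual_median'], normalize='all') * 100
pct = pct.reindex(index=order[::-1], columns=order).fillna(0)  # same layout as the heatmaps
print(pct)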
In [13]:
# Cols of interest
cols = ['station_id', 'variable', 'metric', 'sen_trend']
# Setup plot
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15,4))
# Loop over pars
for idx, par in enumerate(['EH', 'ANC', 'ALK-E']):
# Get data
cols = ['station_id', 'variable', 'metric', 'sen_trend']
comp_df = res_df.query("variable == @par")[cols].copy()
del comp_df['variable']
comp_df.set_index(['station_id', 'metric'], inplace=True)
# Convert to 'wide' format
comp_df = comp_df.unstack('metric')
comp_df.columns = comp_df.columns.get_level_values('metric')
comp_df.columns = [i.replace(' ', '_') for i in comp_df.columns]
# Get counts for combos
opt_list = ['decreasing', 'no trend', 'increasing']
hmap = np.zeros(shape=(3,3))
# Map options to array indices (med is x; min is y)
med_map = {'decreasing':0,
'no trend':1,
'increasing':2,
}
min_map = {'decreasing':2,
'no trend':1,
'increasing':0,
}
# Assign counts to array
for pair in itertools.product(opt_list, repeat=2):
ann_med, ann_min = pair
cnt = len(comp_df.query("(annual_median == @ann_med) and (annual_minimum == @ann_min)"))
pct = 100*cnt/len(comp_df)
hmap[min_map[ann_min], med_map[ann_med]] = pct
# Plot
g = sn.heatmap(hmap,
ax=axes[idx],
square=True,
annot=True,
xticklabels=opt_list,
yticklabels=opt_list[::-1],
cmap='coolwarm',
)
g.set_yticklabels(g.get_yticklabels(), rotation=0)
axes[idx].set_xlabel('Annual medians', fontsize=16)
axes[idx].set_ylabel('Annual minima', fontsize=16)
axes[idx].set_title('%s (%%)' % par)
plt.tight_layout()
# Save
out_png = os.path.join(out_fold, 'trend_minima_vs_medians_heatmap_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)
To present results concisely, we need to aggregate output from the analysis above. A simple way to get an overall picture of the dataset is to create box plots, violin plots and histograms illustrating the range of significant Sen's slopes for each parameter in each region (or country or continent). I will focus on annual medians here since, based on the output above, results for minima should be similar.
In [14]:
# Get data
reg_df = res_df.query("(metric == 'annual median') and "
"(sen_trend in ('increasing', 'decreasing'))")
# Join regions
reg_df = pd.merge(reg_df, stn_df[['station_id', 'continent', 'country', 'region']],
how='left', on='station_id')
In [15]:
# Boxplots by region
g = sn.catplot(data=reg_df,
row='variable',
x='region',
y='sen_slp',
kind='box',
order=['NoNord', 'SoNord', 'Baltic', 'UK-IE-NL',
'WCE', 'ECE', 'Alps', 'AtlCan', 'QuMaVt',
'AdsCsk', 'Apps', 'BRM', 'Ont'],
sharex=False,
sharey=False,
aspect=3
)
# Save
out_png = os.path.join(out_fold, 'slope_box_plots_by_region_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)
In [16]:
# Violin plots by continent
g = sn.catplot(data=reg_df,
col='variable',
col_wrap=4,
x='continent',
y='sen_slp',
kind='violin',
sharex=False,
sharey=False,
aspect=1
)
g.map(plt.axhline, y=0, lw=2, ls='--', c='k')
# Save
out_png = os.path.join(out_fold, 'slope_violin_plots_by_continent_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)
In [17]:
# KDE-smoothed plots by continent
g = sn.FacetGrid(reg_df,
row="variable",
col='continent',
aspect=2,
sharex='row',
sharey=False)
g.map(sn.distplot, 'sen_slp', hist=False, rug=True)
g.map(plt.axvline, x=0, lw=2, ls='--', c='k')
# Save
out_png = os.path.join(out_fold, 'slope_kde_plots_by_continent_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)
Remember that all the plots above are different ways of illustrating the distributions of significant Sen's slopes. I think there are some interesting patterns here - especially comparing Europe with North America.
Added 27/06/2019
I have now extended nivapy.stats to include the regional/seasonal Mann-Kendall test, plus a regional/seasonal estimate of the Sen's slope - see nivapy.stats.seasonal_regional_mk_sen() for details. The code below estimates regional trends in each region for each parameter, based on the 486 stations meeting the selection criteria.
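In outline, the regional test follows the usual regional/seasonal Kendall construction: the Mann-Kendall S statistic and its variance are computed separately for each station (block) and then summed, and the regional Sen's slope is the median of all within-station pairwise slopes. The sketch below illustrates this idea; it is an assumption that nivapy.stats.seasonal_regional_mk_sen() follows the same construction (the real function may also correct for ties and covariance between blocks).

import numpy as np
from scipy import stats

def regional_mk_sen(data, time_col='year', value_col='value', block_col='station_id'):
    """Simplified regional Mann-Kendall test with a regional Sen's slope.
    Stations are treated as independent blocks; ties are ignored."""
    s_tot, var_tot, slopes = 0.0, 0.0, []
    for _, grp in data.groupby(block_col):
        grp = grp.dropna(subset=[value_col]).sort_values(time_col)
        t = grp[time_col].values.astype(float)
        v = grp[value_col].values.astype(float)
        n = len(v)
        # Per-station S and variance, accumulated over blocks
        s_tot += sum(np.sign(v[j] - v[i])
                     for i in range(n - 1) for j in range(i + 1, n))
        var_tot += n * (n - 1) * (2 * n + 5) / 18.0
        # Within-station pairwise slopes pooled across blocks
        slopes += [(v[j] - v[i]) / (t[j] - t[i])
                   for i in range(n - 1) for j in range(i + 1, n) if t[j] != t[i]]
    z = (s_tot - np.sign(s_tot)) / np.sqrt(var_tot) if s_tot != 0 else 0.0
    p = 2 * (1 - stats.norm.cdf(abs(z)))
    return {'s': s_tot, 'var_s': var_tot, 'p': p, 'sslp': np.median(slopes)}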
In [18]:
%%capture
res_dict = {'region': [],
'param': [],
'stat': [],
'n_stns': [],
'mk_stat': [],
'mk_stat_var': [],
'mk_pval': [],
'sen_slp': [],
'mk_trend': [],
}
# Loop over parameters
for par in inc_df['variable'].unique():
# Get stations to include for this parameter
stns_par = inc_df.query('(variable == @par) and (include == "yes")')['station_id']
stns_par = stn_df.query('station_id in @stns_par')
# Loop over regions
for reg in stns_par['region'].unique():
stns_reg = stns_par.query('region == @reg')['station_id']
n_stns_reg = len(stns_reg)
# Get data
df2 = df.query('(station_id in @stns_reg) and (variable == @par)').copy()
df2['year'] = df2['sample_date'].dt.year
del df2['sample_date'], df2['variable']
# Consider medians, minima and maxima
for stat in ['annual median', 'annual minimum', 'annual maximum']:
if stat == 'annual median':
df3 = df2.groupby(['station_id', 'year']).median().reset_index()
elif stat == 'annual maximum':
df3 = df2.groupby(['station_id', 'year']).max().reset_index()
else:
df3 = df2.groupby(['station_id', 'year']).min().reset_index()
# Regional M-K test
res_df = nivapy.stats.seasonal_regional_mk_sen(df3,
time_col='year',
value_col='value',
block_col='station_id')
# Append to results
res_dict['region'].append(reg)
res_dict['param'].append(par)
res_dict['stat'].append(stat)
res_dict['n_stns'].append(n_stns_reg)
res_dict['mk_stat'].append(res_df.loc['s']['value'])
res_dict['mk_stat_var'].append(res_df.loc['var_s']['value'])
res_dict['mk_pval'].append(res_df.loc['p']['value'])
res_dict['sen_slp'].append(res_df.loc['sslp']['value'])
res_dict['mk_trend'].append(res_df.loc['trend']['value'])
res_df = pd.DataFrame(res_dict)
# Save
out_path = os.path.join(out_fold, 'regional_mk_sen_slope_relaxed_criteria.csv')
res_df.to_csv(out_path, encoding='utf-8', index=False)
In [19]:
res_df.head(10)
Out[19]:
The code below uses the BCP R package and is modified from notebook 5.
For each of the selected time series at each site, I will run a change point analysis (without regression - see here for details) and produce a single plot showing all the data for each site. For each variable, I will also record years when the probability of change is (i) greater than 50% and (ii) greater than 75%. This will make it possible to look for consistent change points across regions.
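A note on the mechanics before the function itself: bcp is called through rpy2, the p0 argument controls the prior on the probability of a change point at each position (lower values make change points less likely a priori), and the returned R object exposes 'posterior.mean' (the fitted partition means) and 'posterior.prob' (the posterior probability of a change at each time step), which are extracted with rx2() exactly as in the function below. A minimal, self-contained example on a toy series (hypothetical data):

import numpy as np
import pandas as pd
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr

pandas2ri.activate()
bcp = importr('bcp')

# Toy series with a step change half-way through
y = pd.Series(np.r_[np.random.normal(10, 1, 15), np.random.normal(5, 1, 15)],
              index=range(1990, 2020))

res = bcp.bcp(y, p0=0.1)  # constant mean within each partition (no regression)
post = pd.DataFrame({'mean': res.rx2('posterior.mean').flatten(),
                     'prob': res.rx2('posterior.prob')},
                    index=y.index)
change_years = post.index[post['prob'] > 0.5]  # candidate change-point years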
In [22]:
def plot_bcp_change_pts(stn_id, reg=True, p0=0.2):
""" Run BCP analysis to detect change-points.
"""
res_dict = {'station_id':[],
'variable':[],
'prob':[],
'year':[],
}
# Setup plot
fig, axes = plt.subplots(nrows=8, ncols=2,
sharex=False,
sharey=False,
figsize=(15,15))
# Loop over variables
for row_idx, par in enumerate(inc_df['variable'].unique()):
# Determine whether to plot series
inc = inc_df.query("(station_id == @stn_id) and (variable == @par)")['include'].values[0]
if inc == 'no':
# Plot "omitted" text
axes[row_idx, 0].text(0.5, 0.5,
'Omitted due to lack of data',
verticalalignment='center',
horizontalalignment='center',
transform=axes[row_idx, 0].transAxes,
fontsize=18)
axes[row_idx, 0].set_ylabel(par)
axes[row_idx, 1].text(0.5, 0.5,
'Omitted due to lack of data',
verticalalignment='center',
horizontalalignment='center',
transform=axes[row_idx, 1].transAxes,
fontsize=18)
axes[row_idx, 1].set_ylabel('Probability')
else:
# Get data
df2 = df.query("(station_id == @stn_id) and (variable == @par)")
df2.set_index('sample_date', inplace=True)
del df2['station_id'], df2['variable']
# Resample
df2 = df2.resample('A').median().reset_index()
df2['year'] = df2['sample_date'].dt.year
del df2['sample_date']
# Interpolate years with missing data
index_df = pd.DataFrame({'year':range(1990, 2017)})
df2 = pd.merge(index_df, df2, how='left', on='year')
df2.interpolate(method='linear', inplace=True)
df2.fillna(method='backfill', inplace=True)
# Change point analysis
if reg:
# Perform regression within each partition
res = bcp.bcp(df2['value'],
x=df2['year'],
p0=p0)
else:
# Assume constant mean within each partition
res = bcp.bcp(df2['value'],
p0=p0)
res = pd.DataFrame({'raw':df2['value'].values,
'mean':res.rx2('posterior.mean').flatten(),
'prob':res.rx2('posterior.prob')},
index=df2['year'].values)
# Add to results
gt50 = np.array(res.index[res['prob'] > 0.5])
for year in gt50:
res_dict['station_id'].append(stn_id)
res_dict['variable'].append(par)
res_dict['prob'].append('gt50')
res_dict['year'].append(year)
gt75 = np.array(res.index[res['prob'] > 0.75])
for year in gt75:
res_dict['station_id'].append(stn_id)
res_dict['variable'].append(par)
res_dict['prob'].append('gt75')
res_dict['year'].append(year)
# Plot
res['raw'].plot(ax=axes[row_idx, 0], marker='o', linestyle=':', label='Raw')
res['mean'].plot(ax=axes[row_idx, 0], label='Fitted')
axes[row_idx, 0].set_ylabel(par)
axes[row_idx, 0].legend(loc='best')
res['prob'].plot(ax=axes[row_idx, 1], marker='o')
axes[row_idx, 1].set_ylabel('Probability')
axes[row_idx, 1].set_ylim((0, 1))
axes[row_idx, 1].axhline(0.5, c='k', linestyle='--')
axes[row_idx, 1].axhline(0.95, c='k', linestyle='--')
axes[0, 0].set_title('Data series')
axes[0, 1].set_title('Change probability')
plt.tight_layout()
out_png = os.path.join(out_fold, 'bcp_selected_sites_relaxed_criteria/bcp_stn_%s_relaxed_criteria.png' % stn_id)
plt.savefig(out_png, dpi=200)
plt.close()
res_df = pd.DataFrame(res_dict)
return res_df
In [23]:
%%capture
# Container for results
df_list = []
# Loop over stations
for stn_id in inc_df['station_id'].unique():
# Determine whether to process this site
print(stn_id)
inc_site_df = inc_df.query("(station_id == @stn_id) and (include == 'yes')")
if len(inc_site_df) > 0:
# Run BCP
res_bcp = plot_bcp_change_pts(stn_id, reg=False, p0=0.1)
df_list.append(res_bcp)
# Combine results
res_bcp = pd.concat(df_list, axis=0)
In [24]:
# Show example BCP output
Image(r'../../../Thematic_Trends_Report_2019/results/bcp_selected_sites_relaxed_criteria/bcp_stn_38242_relaxed_criteria.png')
Out[24]:
These results can now be aggregated regionally. I will produce four sets of plots:
1. KDE-smoothed histograms showing years where the probability of change is >50%
a. Split by region
b. Split by continent
2. KDE-smoothed histograms showing years where the probability of change is >75%
a. Split by region
b. Split by continent
For ease of comparison, columns represent parameters and rows are regions. Considering one column at a time therefore makes it possible to identify similar patterns between regions.
In [25]:
# Join regions
res_bcp = pd.merge(res_bcp, stn_df[['station_id', 'continent', 'country', 'region']],
how='left', on='station_id')
res_bcp.head()
Out[25]:
In [26]:
# KDE-smoothed plots by region
# Change prob > 50%
gt50_df = res_bcp.query('prob == "gt50"')
g = sn.FacetGrid(gt50_df,
row='region',
col='variable',
aspect=1.5,
sharex='row',
sharey=False,
row_order=['NoNord', 'SoNord', 'UK-IE-NL',
'WCE', 'ECE', 'Alps', 'QuMaVt',
'AdsCsk', 'BRM', 'Ont'])
g.map(sn.distplot, 'year')
g.set(xlim=(1990, 2016))
# Save
out_png = os.path.join(out_fold, 'change_prob_kde_gt50_by_region_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)
In [27]:
# KDE-smoothed plots by region
# Change prob > 75%
gt50_df = res_bcp.query('prob == "gt75"')
g = sn.FacetGrid(gt50_df,
row='region',
col='variable',
aspect=1.5,
sharex='row',
sharey=False,
row_order=['NoNord', 'SoNord', 'UK-IE-NL',
'WCE', 'ECE', 'Alps', 'QuMaVt',
'AdsCsk', 'BRM', 'Ont'])
g.map(sn.distplot, 'year')
g.set(xlim=(1990, 2016))
# Save
out_png = os.path.join(out_fold, 'change_prob_kde_gt75_by_region_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)
In [28]:
# KDE-smoothed plots by continent
# Change prob > 50%
gt50_df = res_bcp.query('prob == "gt50"')
g = sn.FacetGrid(gt50_df,
row='continent',
col='variable',
aspect=1.5,
sharex='row',
sharey=False)
g.map(sn.distplot, 'year')
g.set(xlim=(1990, 2016))
# Save
out_png = os.path.join(out_fold, 'change_prob_kde_gt50_by_continent_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)
In [29]:
# KDE-smoothed plots by continent
# Change prob > 75%
gt50_df = res_bcp.query('prob == "gt75"')
g = sn.FacetGrid(gt50_df,
row='continent',
col='variable',
aspect=1.5,
sharex='row',
sharey=False)
g.map(sn.distplot, 'year')
g.set(xlim=(1990, 2016))
# Save
out_png = os.path.join(out_fold, 'change_prob_kde_gt75_by_continent_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)
I haven't had time to look at this output in detail, but there appears to be evidence for consistent regional changes in at least some variables. For example, ENO3X, ECaX_EMgX and ESO4X all show similar patterns in the SoNord, NoNord and UK-IE-NL regions (and perhaps for Europe as a whole too), with many change points occurring in the mid-1990s. For TOC, meanwhile, the most common European change points are about a decade later, in the mid-2000s.
Updated 08/11/2019. The original code only considered differences between medians and minima. Øyvind asked for this to be extended to include (maxima - medians) - see e-mail received 05.11.2019 at 15.18.
Added 02/06/2019.
Øyvind would like to explore trends in the absolute differences between annual median and minimum values for ANC and pH. This is motivated by some interesting patterns previously documented at Øygardsbekken (station ID 38313) - see e-mail from Øyvind received 01/06/2019 at 17.47 for details.
As a sanity check, it's a good idea to first make sure I can reproduce the patterns in Øyvind's e-mail.
In [30]:
# Load series
pkl = os.path.join(out_fold, 'series_relaxed_criteria.pkl')
with open(pkl, 'rb') as handle:
series_dict = pickle.load(handle)
# Get data for Øygardsbekken
stn_id = 38313
# Loop over dfs
df_list = []
for par in ['ANC', 'EH']:
# Get data
df = series_dict[(stn_id, par)]
# Convert EH to pH
if par == 'EH':
df['value'] = -np.log10(df['value']/1E6)
par = 'pH'
# Resample to annual medians and mins
for col_idx, stat in enumerate(['annual median', 'annual minimum']):
if stat == 'annual median':
df_stat = df.resample('A').median()
else:
df_stat = df.resample('A').min()
df_stat['year'] = df_stat.index.year
df_stat['metric'] = stat
df_stat['variable'] = par
df_stat.reset_index(inplace=True, drop=True)
df_list.append(df_stat)
ann_df = pd.concat(df_list, axis=0)
ann_df.head()
Out[30]:
In [31]:
# Plot
g = sn.lmplot(x='year',
y='value',
data=ann_df,
col='variable',
hue='metric',
sharey=False,
lowess=True)
g.axes[0, 0].set_ylim((-125, 75))
g.axes[0, 1].set_ylim((4, 6))
Out[31]:
These plots look the same as in Øyvind's e-mail, which is good. The difference between minimum and median ANC looks fairly steady, whereas differences for pH are becoming larger through time, apparently because median pH is increasing more rapidly than minimum pH. The next step is to calculate absolute differences between median and minimum values for all sites and then test for trends.
In [32]:
%%capture
# Load series
pkl = os.path.join(out_fold, 'series_relaxed_criteria.pkl')
with open(pkl, 'rb') as handle:
series_dict = pickle.load(handle)
# Dicts for results
res_dict = {'station_id':[],
'variable':[],
'mk_p_val':[],
'mk_trend':[],
'sen_slp':[],
'sen_incpt':[],
'sen_trend':[],
}
for stn_id in inc_df['station_id'].unique():
for par in ['ANC', 'EH']:
try:
# Get data
df = series_dict[(stn_id, par)]
# Convert EH to pH
if par == 'EH':
df['value'] = -np.log10(df['value']/1E6)
par = 'pH'
# Resample to annual medians and mins
df_med = df.resample('A').median()
df_min = df.resample('A').min()
df_diff = df_med - df_min
df_diff.index = df_diff.index.year
# MK test
mk_df = nivapy.stats.mk_test(df_diff, 'value')
# Sen's slope
res_df, sen_df = nivapy.stats.sens_slope(df_diff,
value_col='value',
index_col=df_diff.index)
# Add results to dict
res_dict['station_id'].append(stn_id)
res_dict['variable'].append(par)
res_dict['mk_p_val'].append(mk_df.loc['p'].value)
res_dict['mk_trend'].append(mk_df.loc['trend'].value)
sslp = res_df.loc['sslp'].value
sincpt = res_df.loc['icpt'].value
res_dict['sen_slp'].append(sslp)
res_dict['sen_incpt'].append(sincpt)
res_dict['sen_trend'].append(res_df.loc['trend'].value)
except KeyError:
pass
# Combine results
res_df = pd.DataFrame(res_dict)
out_csv = os.path.join(out_fold, 'trends_in_differences_med-min_relaxed_criteria.csv')
res_df.to_csv(out_csv, index=False, encoding='utf-8')
In [33]:
res_df.head()
Out[33]:
As a further quick check, here are the results for Øygardsbekken:
In [34]:
# Get results for Øygardsbekken
res_df.query('station_id == 38313')
Out[34]:
As expected, there is no trend for differences in ANC, but a significant increasing trend for differences in pH.
Next, join in the region data and explore patterns for all sites.
In [35]:
# Just significant
df = res_df.query("sen_trend in ('increasing', 'decreasing')")
# Join regions
df = pd.merge(df, stn_df[['station_id', 'continent', 'country', 'region']],
how='left', on='station_id')
In [36]:
# Box plots of significant slopes in differences by region
g = sn.catplot(data=df,
x='region',
y='sen_slp',
row='variable',
kind='box',
order=['NoNord', 'SoNord', 'Baltic', 'UK-IE-NL',
'WCE', 'ECE', 'Alps', 'AtlCan', 'QuMaVt',
'AdsCsk', 'Apps', 'BRM', 'Ont'],
sharex=False,
sharey=False,
aspect=3,
)
g.map(plt.axhline, y=0, lw=2, ls='--', c='k', alpha=0.4)
# Save
out_png = os.path.join(out_fold, 'med_min_diff_box_plots_by_region_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)
In [37]:
# KDE-smoothed plots by continent
g = sn.FacetGrid(df,
row='variable',
col='continent',
aspect=2,
sharex='row',
sharey=False)
g.map(sn.distplot, 'sen_slp', hist=False, rug=True)
g.map(plt.axvline, x=0, lw=2, ls='--', c='k')
# Save
out_png = os.path.join(out_fold, 'med_min_diff_kde_plots_by_continent_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)
In [38]:
%%capture
# Load series
pkl = os.path.join(out_fold, 'series_relaxed_criteria.pkl')
with open(pkl, 'rb') as handle:
series_dict = pickle.load(handle)
# Dicts for results
res_dict = {'station_id':[],
'variable':[],
'mk_p_val':[],
'mk_trend':[],
'sen_slp':[],
'sen_incpt':[],
'sen_trend':[],
}
for stn_id in inc_df['station_id'].unique():
for par in ['ANC', 'EH']:
try:
# Get data
df = series_dict[(stn_id, par)]
# Convert EH to pH
if par == 'EH':
df['value'] = -np.log10(df['value']/1E6)
par = 'pH'
# Resample to annual medians and maxes
df_med = df.resample('A').median()
df_max = df.resample('A').max()
df_diff = df_max - df_med
df_diff.index = df_diff.index.year
# MK test
mk_df = nivapy.stats.mk_test(df_diff, 'value')
# Sen's slope
res_df, sen_df = nivapy.stats.sens_slope(df_diff,
value_col='value',
index_col=df_diff.index)
# Add results to dict
res_dict['station_id'].append(stn_id)
res_dict['variable'].append(par)
res_dict['mk_p_val'].append(mk_df.loc['p'].value)
res_dict['mk_trend'].append(mk_df.loc['trend'].value)
sslp = res_df.loc['sslp'].value
sincpt = res_df.loc['icpt'].value
res_dict['sen_slp'].append(sslp)
res_dict['sen_incpt'].append(sincpt)
res_dict['sen_trend'].append(res_df.loc['trend'].value)
except KeyError:
pass
# Combine results
res_df = pd.DataFrame(res_dict)
out_csv = os.path.join(out_fold, 'trends_in_differences_max-med_relaxed_criteria.csv')
res_df.to_csv(out_csv, index=False, encoding='utf-8')
In [39]:
# Get just significant
df = res_df.query("sen_trend in ('increasing', 'decreasing')")
# Join regions
df = pd.merge(df, stn_df[['station_id', 'continent', 'country', 'region']],
how='left', on='station_id')
In [40]:
# Box plots of significant slopes in differences by region
g = sn.catplot(data=df,
x='region',
y='sen_slp',
row='variable',
kind='box',
order=['NoNord', 'SoNord', 'Baltic', 'UK-IE-NL',
'WCE', 'ECE', 'Alps', 'AtlCan', 'QuMaVt',
'AdsCsk', 'Apps', 'BRM', 'Ont'],
sharex=False,
sharey=False,
aspect=3,
)
g.map(plt.axhline, y=0, lw=2, ls='--', c='k', alpha=0.4)
# Save
out_png = os.path.join(out_fold, 'max_med_diff_box_plots_by_region_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)
In [41]:
# KDE-smoothed plots by continent
g = sn.FacetGrid(df,
row='variable',
col='continent',
aspect=2,
sharex='row',
sharey=False)
g.map(sn.distplot, 'sen_slp', hist=False, rug=True)
g.map(plt.axvline, x=0, lw=2, ls='--', c='k')
# Save
out_png = os.path.join(out_fold, 'max_med_diff_kde_plots_by_continent_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)
In [42]:
# Read trend results
csv_path = os.path.join(out_fold, 'trends_summary_1990-2016_relaxed_criteria.csv')
df = pd.read_csv(csv_path)
# Join regions
df = pd.merge(df, stn_df[['station_id', 'region']], how='left', on='station_id')
# Just medians
df = df.query('metric == "annual median"')
df.head()
Out[42]:
In [43]:
# Plot
p = (ggplot(df, aes('sen_slp', 'mk_z'))
+ geom_point(aes(color='mk_p_val'))
+ facet_grid('region~variable', scales='free_x', shrink=True)
+ theme(axis_text_x=element_text(rotation=90)))
p
Out[43]:
In [44]:
# List of selected stn IDs
sel_stn_list = list(sel_stns['station_id'].unique())
# Subset by stns and dates (>=2010)
wc_df2 = wc_df.query('station_id in @sel_stn_list')
wc_df2 = wc_df2.query('sample_date >= "2010-01-01"')
# Mean ESO4
wc_df2 = wc_df2.groupby('station_id').mean()[['ESO4']].dropna().reset_index()
# Save
out_path = os.path.join(out_fold, 'mean_eso4_2010-2016_relaxed_criteria.csv')
wc_df2.to_csv(out_path, index=False, encoding='utf-8')
wc_df2.head()
Out[44]:
In [45]:
# Load saved chem data
wc_csv = r'../../../Thematic_Trends_Report_2019/working_chem.csv'
wc_df = pd.read_csv(wc_csv, encoding='utf-8')
wc_df['sample_date'] = pd.to_datetime(wc_df['sample_date'])
wc_df.head()
Out[45]:
In [46]:
# Read stations
stn_csv = r'../../../Thematic_Trends_Report_2019/results/selected_stations_relaxed_criteria.csv'
sel_stns = pd.read_csv(stn_csv)
sel_stns.head()
Out[46]:
In [47]:
def assign_period(row):
if row['year'] <= 1994:
return 'start'
elif row['year'] >= 2012:
return 'end'
else:
return np.nan
In [48]:
# Join regions
df = pd.merge(wc_df, sel_stns[['station_id', 'region']],
how='left', on='station_id')
# Filter to stations of interest
df = df[df['station_id'].isin(sel_stns['station_id'])]
# Get period of interest
df['year'] = df['sample_date'].dt.year
df['period'] = df.apply(assign_period, axis=1)
df.dropna(subset=['period'], inplace=True)
# Aggregate to regions
df = df.groupby(['region', 'period']).mean().reset_index()
# Get pars of interest
df = df[['region', 'period', 'ECl', 'ESO4', 'ENO3', 'ECa_EMg', 'EH', 'TOC']]
# Restructure
df = df.melt(id_vars=['region', 'period'])
df.set_index(['region', 'period', 'variable'], inplace=True)
df = df.unstack('period')
df.reset_index(inplace=True)
df.columns = ['region', 'variable', 'end', 'start']
df = df[['region', 'variable', 'start', 'end']]
# Pct change
df['pct_change_1990_2016'] = 100 * (df['end'] - df['start']) / df['start']
# Save
out_csv = r'../../../Thematic_Trends_Report_2019/results/pct_change_1990-2016_relaxed_criteria.csv'
df.to_csv(out_csv, encoding='utf-8', index=False)
df
Out[48]:
In [49]:
sn.catplot(x='region',
y='pct_change_1990_2016',
data=df,
kind='bar',
row='variable',
aspect=3,
order=['Ont', 'QuMaVt', 'Apps', 'BRM', 'AdsCsk', 'AtlCan',
'UK-IE-NL', 'Alps', 'WCE', 'SoNord', 'NoNord', 'ECE', 'Baltic']
)
# Save
out_png = r'../../../Thematic_Trends_Report_2019/results/pct_change_1990-2016_relaxed_criteria_bar_chart.png'
plt.savefig(out_png, dpi=300)
In [50]:
# Read stations
stn_csv = r'../../../Thematic_Trends_Report_2019/results/selected_stations_relaxed_criteria.csv'
sel_stns = pd.read_csv(stn_csv)
# Read trends summary
trends_csv = r'../../../Thematic_Trends_Report_2019/results/trends_summary_1990-2016_relaxed_criteria.csv'
trends_df = pd.read_csv(trends_csv)
# Get data of interest
trends_df = trends_df.query('metric == "annual median"')
trends_df = trends_df[['station_id', 'variable', 'sen_slp', 'sen_trend']]
# Unstack
trends_df.set_index(['station_id', 'variable'], inplace=True)
trends_df = trends_df.unstack('variable')
# Merge column multi-index
cols = []
for idx in trends_df.columns:
idx = list(idx)
idx[1] = idx[1].replace("_", "")
idx[1] = idx[1].replace("ESO4", "ESO4X")
if idx[0][-1] == 'p':
cols.append(idx[1] + '_slp')
else:
cols.append(idx[1] + '_trend')
trends_df.columns = cols
trends_df.reset_index(inplace=True)
# Join site data
df = pd.merge(sel_stns, trends_df,
how='left', on='station_id')
# Build geodataframe
gdf = gpd.GeoDataFrame(df,
geometry=gpd.points_from_xy(df.longitude, df.latitude),
crs={'init': 'epsg:4326'})
# Save
out_shp = (r'../../../Thematic_Trends_Report_2019/Trends_Maps/GIS/'
r'thematic_report_selected_stns_relaxed_criteria_trends.shp')
gdf.to_file(out_shp)
gdf.head()
Out[50]: