In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import nivapy3 as nivapy
import seaborn as sn
import matplotlib.pyplot as plt
import warnings
import os
import pickle
import itertools
import toc_trends_analysis as toc_trends
import plotnine as pn
import geopandas as gpd
from plotnine import ggplot, geom_point, aes, facet_grid, facet_wrap, theme, element_text, scale_x_log10, scale_x_continuous
from rpy2.robjects.packages import importr
from rpy2.robjects import r, pandas2ri
from IPython.display import Image

#warnings.filterwarnings("ignore")
plt.style.use('ggplot')
pn.options.figure_size = (10, 15)
pandas2ri.activate()
bcp = importr('bcp')

In [3]:
# Connect to db
eng = nivapy.da.connect()


Connection successful.

TOC Thematic Report - February 2019 (Part 6b: analysis of annual trends; relaxed selection criteria)

This notebook implements more-or-less the same workflow as described in notebook 6a, but using a broader range of stations based on less strict selection criteria. See the original notebook for details.

1. Select sites

Heleen has suggested applying the same selection criteria as used for the TOC trends analysis with Don and John. This involves aggregating to annual frequency and requiring that:

  • There is at least one data point within the first 5 years (start of 1990 to end of 1994)
  • There is at least one data point within the last 5 years (start of 2012 to end of 2016)
  • At least 65% of years have data (i.e. there are at least $0.65 \times 27 \approx 18$ non-null data points in the aggregated series); a minimal sketch of this check is given below
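A minimal sketch of this check for a single series, assuming 'ann' is a pandas Series of annual values indexed by year (1990 - 2016); the full implementation over all stations and parameters is in the cells further below:

def meets_criteria(ann):
    """Return True if one annual series meets the relaxed selection criteria."""
    has_start = ann.loc[1990:1994].notna().any()   # >= 1 value in the first 5 years
    has_end = ann.loc[2012:2016].notna().any()     # >= 1 value in the last 5 years
    enough = ann.notna().sum() >= 18               # >= 65% of the 27 years have data
    return bool(has_start and has_end and enough)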

In [4]:
# Read stations
stn_path = r'../../../all_icpw_sites_may_2019.xlsx'
stn_df = pd.read_excel(stn_path, sheet_name='all_icpw_stns')
stn_df.head()


Out[4]:
station_id station_code station_name latitude longitude altitude continent country region group
0 38115 Tr18_CA_DO1 Blue Chalk Lake 45.1999 -78.9432 344.0 North America Canada Ont Trends
1 38116 Tr18_CA_DO2 Chub Lake 45.2138 -78.9836 343.0 North America Canada Ont Trends
2 38117 Tr18_CA_DO3 Crosson Lake 45.0840 -79.0360 371.0 North America Canada Ont Trends
3 38118 Tr18_CA_DO4 Dickie Lake 45.1510 -79.0876 379.0 North America Canada Ont Trends
4 38119 Tr18_CA_DO5 Harp Lake 45.3798 -79.1335 327.0 North America Canada Ont Trends

In [5]:
# Load saved chem data
wc_csv = r'../../../Thematic_Trends_Report_2019/working_chem.csv'
wc_df = pd.read_csv(wc_csv, encoding='utf-8')
wc_df['sample_date'] = pd.to_datetime(wc_df['sample_date'])
wc_df.head()


Out[5]:
station_id station_code station_name sample_date depth1 depth2 TOC ECl EH ESO4 ENO3 ECa_EMg ANC ALK-E
0 23472 CH03 Lago di Tomè 1990-10-08 0.0 0.0 NaN 4.857143 3.630781 45.000000 23.571429 64.333333 10.939799 NaN
1 23472 CH03 Lago di Tomè 1993-08-18 0.0 0.0 NaN 0.857143 3.630781 47.708333 35.000000 47.166667 -15.749821 NaN
2 23472 CH03 Lago di Tomè 1995-09-05 0.0 0.0 NaN 4.571429 2.630268 46.666667 36.428571 81.666667 5.850613 4.0
3 23472 CH03 Lago di Tomè 1995-09-15 0.0 0.0 NaN 8.571429 4.073803 42.083333 32.142857 59.333333 -3.319358 NaN
4 23472 CH03 Lago di Tomè 1997-08-05 0.0 0.0 NaN 5.428571 2.089296 38.541667 23.571429 53.500000 3.383879 1.0

In [6]:
# Melt to long format
df = wc_df.copy()
del df['station_code'], df['station_name'], df['depth1'], df['depth2']
df = pd.melt(df, id_vars=['station_id', 'sample_date'])
df.head()


Out[6]:
station_id sample_date variable value
0 23472 1990-10-08 TOC NaN
1 23472 1993-08-18 TOC NaN
2 23472 1995-09-05 TOC NaN
3 23472 1995-09-15 TOC NaN
4 23472 1997-08-05 TOC NaN

The code below creates a dataframe showing which series at which sites meet the criteria defined above.


In [7]:
# Dict for results
inc_dict = {'station_id':[],
            'variable':[],
            'include':[],
           }

# Loop over time series
for stn_id in df['station_id'].unique():       
    # Loop over variables
    for par in df['variable'].unique():    
        # Get data
        df2 = df.query("(station_id == @stn_id) and (variable == @par)")
        df2.set_index('sample_date', inplace=True)
        del df2['station_id'], df2['variable']

        # Annual dates from 1990 - 2016
        all_dates = pd.date_range('1990-01-01', '2016-12-31', freq='A')
        dates_df = pd.DataFrame(index=all_dates)

        # Resample to annual
        df2 = df2.resample('A').median()
        df2 = dates_df.join(df2)
        df2.index = df2.index.year

        if pd.isna(df2['value']).all():
            # No data at all - not suitable
            inc_dict['station_id'].append(stn_id)
            inc_dict['variable'].append(par)
            inc_dict['include'].append('no')

        else:
            n_start = pd.notnull(df2[df2.index < 1995]['value']).sum()
            n_end = pd.notnull(df2[df2.index > 2011]['value']).sum()
            non_missing = pd.notnull(df2['value']).sum()

            if (n_start >= 1) and (n_end >= 1) and (non_missing >= 18):
                # Criteria met - include
                inc_dict['station_id'].append(stn_id)
                inc_dict['variable'].append(par)
                inc_dict['include'].append('yes')

            else:
                # Criteria not met - not suitable
                inc_dict['station_id'].append(stn_id)
                inc_dict['variable'].append(par)
                inc_dict['include'].append('no')

# Build df
inc_df = pd.DataFrame(inc_dict)

print('The number of stations with at least some time series meeting the '
      'specified criteria is', len(inc_df.query('include == "yes"')['station_id'].unique()))

inc_df.head()


The number of stations with at least some time series meeting the specified criteria is 497
Out[7]:
station_id variable include
0 23472 TOC no
1 23472 ECl yes
2 23472 EH yes
3 23472 ESO4 yes
4 23472 ENO3 yes

497 out of 555 stations have at least one time series (i.e. one parameter) that meets the specified requirements.


In [8]:
# Map of selected stations
sel_stn_list = list(inc_df.query('include == "yes"')['station_id'].unique())
sel_stns = stn_df.query('station_id in @sel_stn_list')
nivapy.spatial.quickmap(sel_stns,
                        cluster=True,
                        popup='station_code')


Out[8]:

In [9]:
# Save station list
sel_stns.to_csv(r'../../../Thematic_Trends_Report_2019/results/selected_stations_relaxed_criteria.csv',
                index=False,
                encoding='utf-8')

2. M-K and Sen's slope for selected sites

The code below produces a "grid plot" showing the Sen's slope trend estimate for each data series at each site (only data series meeting the selection criteria are included). Each row corresponds to a parameter, and the first column is based on annual medians; the second column uses annual minima.

M-K and Sen's slope results for all series are also saved to a CSV file for later use.
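As a quick, optional cross-check (not part of the original workflow), the single-series statistics from nivapy can be approximated with scipy. Here 'df_stat' stands for one of the annual series built in the cell below:

from scipy.stats import kendalltau, theilslopes

ser = df_stat['value'].dropna()                                # one annual series, indexed by year
tau, p_val = kendalltau(ser.index, ser.values)                 # Mann-Kendall-style monotonic trend test
slope, intercept, lo, hi = theilslopes(ser.values, ser.index)  # Theil-Sen slope with 95% CI
print("Kendall p = %.4f, Sen's slope = %.4f" % (p_val, slope))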


In [10]:
%%capture
# Output folder
out_fold = r'../../../Thematic_Trends_Report_2019/results'
    
# Dicts for results
res_dict = {'station_id':[],
            'variable':[],
            'metric':[],
            'mk_p_val':[],
            'mk_z':[],
            'mk_trend':[],
            'sen_slp':[],
            'sen_incpt':[],
            'sen_trend':[],
           }

series_dict = {}

for stn_id in inc_df['station_id'].unique():
    # Determine whether to process this site
    inc_site_df = inc_df.query("(station_id == @stn_id) and (include == 'yes')")
    
    if len(inc_site_df) > 0:    
        # Setup plot
        fig, axes = plt.subplots(nrows=8, ncols=2, figsize=(15, 20))

        # Loop over variables
        for row_idx, par in enumerate(inc_df['variable'].unique()):
            # Determine whether to plot series
            inc = inc_df.query("(station_id == @stn_id) and (variable == @par)")['include'].values[0]

            if inc == 'no':
                # Plot "omitted" text
                axes[row_idx, 0].text(0.5, 0.5, 
                                      'Omitted due to lack of data',
                                      verticalalignment='center', 
                                      horizontalalignment='center',
                                      transform=axes[row_idx, 0].transAxes,
                                      fontsize=18)

                axes[row_idx, 1].text(0.5, 0.5, 
                                      'Omitted due to lack of data',
                                      verticalalignment='center', 
                                      horizontalalignment='center',
                                      transform=axes[row_idx, 1].transAxes,
                                      fontsize=18)
                
            else:
                # Get data
                df2 = df.query("(station_id == @stn_id) and (variable == @par)")
                df2.set_index('sample_date', inplace=True)
                del df2['station_id'], df2['variable']
                series_dict[(stn_id, par)] = df2                
                
                # Resample to annual medians and mins
                for col_idx, stat in enumerate(['annual median', 'annual minimum']):
                    if stat == 'annual median':
                        df_stat = df2.resample('A').median()
                    else:
                        df_stat = df2.resample('A').min()
                
                    df_stat.index = df_stat.index.year
                    
                    # MK test
                    mk_df = nivapy.stats.mk_test(df_stat, 'value')
                
                    # Sen's slope
                    res_df, sen_df = nivapy.stats.sens_slope(df_stat, 
                                                             value_col='value',
                                                             index_col=df_stat.index)
                
                    # Add results to dict
                    res_dict['station_id'].append(stn_id)
                    res_dict['variable'].append(par)
                    res_dict['metric'].append(stat)
                    res_dict['mk_p_val'].append(mk_df.loc['p'].value)
                    res_dict['mk_z'].append(mk_df.loc['z'].value)
                    res_dict['mk_trend'].append(mk_df.loc['trend'].value)
                    
                    sslp = res_df.loc['sslp'].value
                    sincpt = res_df.loc['icpt'].value
                    res_dict['sen_slp'].append(sslp)
                    res_dict['sen_incpt'].append(sincpt)
                    res_dict['sen_trend'].append(res_df.loc['trend'].value)
                                        
                    # Plot
                    axes[row_idx, col_idx].plot(sen_df.index.values, sen_df['value'].values, 'bo-')
                    axes[row_idx, col_idx].plot(sen_df.index.values, 
                                                sen_df.index.values*sslp + sincpt, 'k-')                
                
            if par == 'TOC':
                unit = 'mg-C/l'
            else:
                unit = 'µEq/l'
                
            axes[row_idx, 0].set_title('Annual median %s (%s)' % (par, unit))
            axes[row_idx, 1].set_title('Annual minimum %s (%s)' % (par, unit))
        
        plt.tight_layout()
        
        # Save
        out_png = os.path.join(out_fold, 'trends_plots_1990-2016_relaxed_criteria/%s_trends_1990-2016_relaxed_criteria.png' % stn_id)
        plt.savefig(out_png, dpi=200)
        plt.close()        

# Combine results
res_df = pd.DataFrame(res_dict)
out_csv = os.path.join(out_fold, 'trends_summary_1990-2016_relaxed_criteria.csv')
res_df.to_csv(out_csv, index=False, encoding='utf-8')

# Save series
out_pkl = os.path.join(out_fold, 'series_relaxed_criteria.pkl')
with open(out_pkl, 'wb') as handle:
    pickle.dump(series_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
res_df.head()


Out[11]:
station_id variable metric mk_p_val mk_z mk_trend sen_slp sen_incpt sen_trend
0 23472 ECl annual median 0.099902 -1.645329 no trend -0.088458 181.097016 no trend
1 23472 ECl annual minimum 0.027423 -2.205441 decreasing -0.088100 179.811742 decreasing
2 23472 EH annual median 0.004075 -2.872335 decreasing -0.069545 141.432722 decreasing
3 23472 EH annual minimum 0.005565 -2.772357 decreasing -0.061452 124.796353 decreasing
4 23472 ESO4 annual median 0.000001 -4.828009 decreasing -0.920324 1876.644477 decreasing

Added 20.12.2019: Sen's slope for time periods 1990 - 2004 and 2002 - 2016

See e-mail from Heleen received 13.12.2019 at 15.05.


In [32]:
%%capture

# Output folder
out_fold = r'../../../Thematic_Trends_Report_2019/results'

# Dicts for results
res_dict = {'station_id':[],
            'variable':[],
            'period':[],
            'sen_slp':[],
            'sen_incpt':[],
            'sen_trend':[],
           }

for stn_id in inc_df['station_id'].unique():
    # Determine whether to process this site
    inc_site_df = inc_df.query("(station_id == @stn_id) and (include == 'yes')")
    
    if len(inc_site_df) > 0:    
        # Loop over variables
        for row_idx, par in enumerate(inc_df['variable'].unique()):
            # Determine whether to plot series
            inc = inc_df.query("(station_id == @stn_id) and (variable == @par)")['include'].values[0]

            if inc == 'yes':
                # Get data
                df2 = df.query("(station_id == @stn_id) and (variable == @par)")
                df2.set_index('sample_date', inplace=True)
                del df2['station_id'], df2['variable']            
                
                df_stat = df2.resample('A').median()
                df_stat.index = df_stat.index.year
                
                for period in ['1990-2004', '2002-2016']:
                    # Get data for period
                    st_yr, end_yr = [int(i) for i in period.split('-')]                    
                    df3 = df_stat.loc[st_yr:end_yr]
                                  
                    # Sen's slope
                    res_df, sen_df = nivapy.stats.sens_slope(df3, 
                                                             value_col='value',
                                                             index_col=df3.index)
                
                    # Add results to dict
                    res_dict['station_id'].append(stn_id)
                    res_dict['variable'].append(par)
                    res_dict['period'].append(period)
                    
                    sslp = res_df.loc['sslp'].value
                    sincpt = res_df.loc['icpt'].value
                    res_dict['sen_slp'].append(sslp)
                    res_dict['sen_incpt'].append(sincpt)
                    res_dict['sen_trend'].append(res_df.loc['trend'].value)       

# Combine results
res_df_pairwise = pd.DataFrame(res_dict)
out_csv = os.path.join(out_fold, 'pairwise_trends_long_1990-2016_relaxed_criteria.csv')
res_df_pairwise.to_csv(out_csv, index=False, encoding='utf-8')

In [33]:
# Convert to wide
res_df_pairwise_wide = res_df_pairwise.copy()
res_df_pairwise_wide['par_period'] = res_df_pairwise_wide['variable'] + '_' + res_df_pairwise_wide['period']
res_df_pairwise_wide.drop(['variable', 'period', 'sen_incpt', 'sen_trend'], axis=1, inplace=True)
res_df_pairwise_wide.set_index(['station_id', 'par_period'], inplace=True)
res_df_pairwise_wide = res_df_pairwise_wide.unstack('par_period')
res_df_pairwise_wide.reset_index(inplace=True)
res_df_pairwise_wide.index.name = ''
cols = [res_df_pairwise_wide.columns.get_level_values(0)[0]] + list(res_df_pairwise_wide.columns.get_level_values(1)[1:])
res_df_pairwise_wide.columns = cols

res_df_pairwise_wide = pd.merge(stn_df, res_df_pairwise_wide, how='inner', on='station_id')
res_df_pairwise_wide.head()

out_csv = os.path.join(out_fold, 'pairwise_trends_wide_1990-2016_relaxed_criteria.csv')
res_df_pairwise_wide.to_csv(out_csv, index=False, encoding='utf-8')

Øyvind would like to know whether the trends in medians and minima are consistent. One way to check this is to plot the Sen's slopes derived from medians against those based on minima, split by parameter. Øyvind is primarily interested in EH, ANC and alkalinity, so I'll focus on those here.


In [12]:
# Cols of interest
cols = ['station_id', 'variable', 'metric', 'sen_slp']

# Setup plot
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(10,15))

# Loop over pars
for idx, par in enumerate(['EH', 'ANC', 'ALK-E']):
    # Get data
    cols = ['station_id', 'variable', 'metric', 'sen_slp']
    comp_df = res_df.query("variable == @par")[cols].copy()
    del comp_df['variable']
    comp_df.set_index(['station_id', 'metric'], inplace=True)
    
    # Convert to 'wide' format
    comp_df = comp_df.unstack('metric')
    comp_df.columns = comp_df.columns.get_level_values('metric')

    # Plot
    axes[idx].plot(comp_df['annual median'], comp_df['annual minimum'], 'bo', label="Sen's slopes")
    axes[idx].plot(comp_df['annual median'], comp_df['annual median'], 'k-', label='1:1 line')
    axes[idx].set_xlabel('Slope for annual medians ($yr^{-1}$)', fontsize=14)
    axes[idx].set_ylabel('Slope for annual minima ($yr^{-1}$)', fontsize=14)
    axes[idx].legend(loc='best', fontsize=14)
    axes[idx].set_title('%s (µEq/l)' % par)

plt.tight_layout()

# Save
out_png = os.path.join(out_fold, 'trend_minima_vs_medians_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)


In general, the slopes obtained from annual medians are very similar to those based on annual minima.

Note: There are a few outliers on the plots above that probably warrant further investigation.

Another visualisation option is to create heatmaps showing whether trends in annual minima are classified as "significant" in the same way as trends in annual medians. The annotations on the plots below show, for each variable, the percentage of sites falling into each of the nine possible classes. High proportions along the diagonal indicate that results based on minima and medians give essentially the same overall picture. The overall direction of the trends is also very clear.
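A roughly equivalent way of producing the same percentage table, assuming 'comp_df' is the wide frame built in the cell below (with 'annual_median' and 'annual_minimum' columns), is pandas' crosstab; the heatmap code below additionally reverses the row order for display:

pct_tab = 100 * pd.crosstab(comp_df['annual_minimum'],
                            comp_df['annual_median'],
                            normalize='all')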


In [13]:
# Cols of interest
cols = ['station_id', 'variable', 'metric', 'sen_trend']

# Setup plot
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15,4))

# Loop over pars
for idx, par in enumerate(['EH', 'ANC', 'ALK-E']):
    # Get data
    cols = ['station_id', 'variable', 'metric', 'sen_trend']
    comp_df = res_df.query("variable == @par")[cols].copy()
    del comp_df['variable']
    comp_df.set_index(['station_id', 'metric'], inplace=True)
    
    # Convert to 'wide' format
    comp_df = comp_df.unstack('metric')
    comp_df.columns = comp_df.columns.get_level_values('metric')
    comp_df.columns = [i.replace(' ', '_') for i in comp_df.columns]
    
    # Get counts for combos
    opt_list = ['decreasing', 'no trend', 'increasing']
    hmap = np.zeros(shape=(3,3))
    
    # Map options to array indices (med is x; min is y)
    med_map = {'decreasing':0,
               'no trend':1,
               'increasing':2,
              }
    
    min_map = {'decreasing':2,
               'no trend':1,
               'increasing':0,
              }
    
    # Assign percentages to array
    for pair in itertools.product(opt_list, repeat=2):
        ann_med, ann_min = pair
        cnt = len(comp_df.query("(annual_median == @ann_med) and (annual_minimum == @ann_min)"))
        pct = 100*cnt/len(comp_df)
        hmap[min_map[ann_min], med_map[ann_med]] = pct

    # Plot
    g = sn.heatmap(hmap, 
                   ax=axes[idx], 
                   square=True, 
                   annot=True,
                   xticklabels=opt_list,
                   yticklabels=opt_list[::-1],
                   cmap='coolwarm',
                  )
    g.set_yticklabels(g.get_yticklabels(), rotation=0)
    
    axes[idx].set_xlabel('Annual medians', fontsize=16)
    axes[idx].set_ylabel('Annual minima', fontsize=16)
    axes[idx].set_title('%s (%%)' % par)

plt.tight_layout()

# Save
out_png = os.path.join(out_fold, 'trend_minima_vs_medians_heatmap_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)


4. Summarising by region

To present results concisely, we need to aggregate output from the analysis above. A simple way to get an overall picture of the dataset is to create box plots, violin plots and histograms illustrating the range of significant Sen's slopes for each parameter in each region (or country or continent). I will focus on annual medians here since, based on the output above, results for minima should be similar.


In [14]:
# Get data
reg_df = res_df.query("(metric == 'annual median') and "
                      "(sen_trend in ('increasing', 'decreasing'))")

# Join regions
reg_df = pd.merge(reg_df, stn_df[['station_id', 'continent', 'country', 'region']],
                  how='left', on='station_id')

In [15]:
# Boxplots by region
g = sn.catplot(data=reg_df,
               row='variable',
               x='region',
               y='sen_slp',
               kind='box',
               order=['NoNord', 'SoNord', 'Baltic', 'UK-IE-NL', 
                      'WCE', 'ECE', 'Alps', 'AtlCan', 'QuMaVt',
                      'AdsCsk', 'Apps', 'BRM', 'Ont'],
               sharex=False,
               sharey=False,
               aspect=3
              )

# Save
out_png = os.path.join(out_fold, 'slope_box_plots_by_region_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)



In [16]:
# Violin plots by continent
g = sn.catplot(data=reg_df,
               col='variable',
               col_wrap=4,
               x='continent',
               y='sen_slp',
               kind='violin',
               sharex=False,
               sharey=False,
               aspect=1
              )

g.map(plt.axhline, y=0, lw=2, ls='--', c='k')

# Save
out_png = os.path.join(out_fold, 'slope_violin_plots_by_continent_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)



In [17]:
# KDE-smoothed plots by continent
g = sn.FacetGrid(reg_df,
                 row="variable",
                 col='continent',
                 aspect=2,
                 sharex='row',
                 sharey=False)
g.map(sn.distplot, 'sen_slp', hist=False, rug=True)
g.map(plt.axvline, x=0, lw=2, ls='--', c='k')

# Save
out_png = os.path.join(out_fold, 'slope_kde_plots_by_continent_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)


Remember that all the plots above are different ways of illustrating the distributions of significant Sen's slopes. I think there are some interesting patterns here - especially comparing Europe with North America.

4b. Regional M-K and Sen's slope tests

Added 27/06/2019

I have now extended nivapy.stats to include the regional/seasonal Mann-Kendall test, plus a regional/seasonal estimate of the Sen's slope - see nivapy.stats.seasonal_regional_mk_sen() for details. The code below estimates regional trends for each parameter in each region, based on the 497 stations meeting the selection criteria.
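For reference (an assumption about the implementation, not something verified here), the standard regional/seasonal Kendall formulation sums the Mann-Kendall statistic over blocks - here, stations: $S_R = \sum_k S_k$ and $\mathrm{Var}(S_R) = \sum_k \mathrm{Var}(S_k)$, with test statistic $Z = (S_R - \mathrm{sgn}(S_R)) / \sqrt{\mathrm{Var}(S_R)}$, and the regional Sen's slope taken as the median of all within-station pairwise slopes. I assume nivapy.stats.seasonal_regional_mk_sen() follows this formulation; see its docstring for the definitive description.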


In [18]:
%%capture

res_dict = {'region':      [],
            'param':       [],
            'stat':        [],
            'n_stns':      [],
            'mk_stat':     [],
            'mk_stat_var': [],
            'mk_pval':     [],
            'sen_slp':     [],
            'mk_trend':    [],
            }

# Loop over parameters
for par in inc_df['variable'].unique():
    # Get stations to include for this parameter
    stns_par = inc_df.query('(variable == @par) and (include == "yes")')['station_id']
    stns_par = stn_df.query('station_id in @stns_par')
    
    # Loop over regions
    for reg in stns_par['region'].unique():
        stns_reg = stns_par.query('region == @reg')['station_id']
        n_stns_reg = len(stns_reg)
        
        # Get data
        df2 = df.query('(station_id in @stns_reg) and (variable == @par)').copy()
        df2['year'] = df2['sample_date'].dt.year
        del df2['sample_date'], df2['variable']
        
        # Consider medians, minima and maxima
        for stat in ['annual median', 'annual minimum', 'annual maximum']:
            if stat == 'annual median':
                df3 = df2.groupby(['station_id', 'year']).median().reset_index()
            elif stat == 'annual maximum':
                df3 = df2.groupby(['station_id', 'year']).max().reset_index()
            else:
                df3 = df2.groupby(['station_id', 'year']).min().reset_index()

            # Regional M-K test
            res_df = nivapy.stats.seasonal_regional_mk_sen(df3, 
                                                           time_col='year',
                                                           value_col='value',
                                                           block_col='station_id')  

            # Append to results
            res_dict['region'].append(reg)
            res_dict['param'].append(par)
            res_dict['stat'].append(stat)
            res_dict['n_stns'].append(n_stns_reg)
            res_dict['mk_stat'].append(res_df.loc['s']['value'])
            res_dict['mk_stat_var'].append(res_df.loc['var_s']['value'])
            res_dict['mk_pval'].append(res_df.loc['p']['value'])
            res_dict['sen_slp'].append(res_df.loc['sslp']['value'])
            res_dict['mk_trend'].append(res_df.loc['trend']['value'])
                
res_df = pd.DataFrame(res_dict)

# Save
out_path = os.path.join(out_fold, 'regional_mk_sen_slope_relaxed_criteria.csv')
res_df.to_csv(out_path, encoding='utf-8', index=False)

In [19]:
res_df.head(10)


Out[19]:
region param stat n_stns mk_stat mk_stat_var mk_pval sen_slp mk_trend
0 Ont TOC annual median 15 2036.0 32483.333333 0.000000e+00 0.022300 increasing
1 Ont TOC annual minimum 15 2392.0 32424.000000 0.000000e+00 0.029273 increasing
2 Ont TOC annual maximum 15 1009.0 32491.000000 2.242719e-08 0.019346 increasing
3 QuMaVt TOC annual median 62 3754.0 118256.666667 0.000000e+00 0.018000 increasing
4 QuMaVt TOC annual minimum 62 4375.0 118124.333333 0.000000e+00 0.022000 increasing
5 QuMaVt TOC annual maximum 62 3060.0 118190.666667 0.000000e+00 0.016667 increasing
6 AtlCan TOC annual median 63 3567.0 135021.000000 0.000000e+00 0.033515 increasing
7 AtlCan TOC annual minimum 63 5692.0 134820.000000 0.000000e+00 0.055556 increasing
8 AtlCan TOC annual maximum 63 1394.0 134764.666667 1.478968e-04 0.015556 increasing
9 ECE TOC annual median 29 2343.0 42270.333333 0.000000e+00 0.016667 increasing

5. Change points

The code below uses the BCP R package and is modified from notebook 5.

For each of the selected time series at each site, I will run a change point analysis (without regression - see here for details) and produce a single plot showing all the data for each site. For each variable, I will also record years when the probability of change is (i) greater than 50% and (ii) greater than 75%. This will make it possible to look for consistent change points across regions.
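As a purely illustrative aside (not what this notebook uses), a comparable change-point search can be sketched in pure Python with the ruptures package; unlike bcp, it returns break locations rather than posterior probabilities. 'values' here is assumed to be one of the gap-filled annual series (1990 - 2016) built inside the function below:

import numpy as np
import ruptures as rpt

values = np.asarray(df2['value'], dtype=float)      # gap-filled annual values
algo = rpt.Pelt(model='rbf').fit(values)            # penalised change-point search
break_idx = algo.predict(pen=5)                     # segment end indices (last = len(values))
break_years = [1990 + i for i in break_idx[:-1]]    # convert indices to years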


In [22]:
def plot_bcp_change_pts(stn_id, reg=True, p0=0.2):
    """ Run BCP analysis to detect change-points.
    """
    res_dict = {'station_id':[],
                'variable':[],
                'prob':[],
                'year':[],
               }
    
    # Setup plot
    fig, axes = plt.subplots(nrows=8, ncols=2, 
                             sharex=False,
                             sharey=False,
                             figsize=(15,15))
    
    # Loop over variables
    for row_idx, par in enumerate(inc_df['variable'].unique()):
        # Determine whether to plot series
        inc = inc_df.query("(station_id == @stn_id) and (variable == @par)")['include'].values[0]

        if inc == 'no':
            # Plot "omitted" text
            axes[row_idx, 0].text(0.5, 0.5, 
                                  'Omitted due to lack of data',
                                  verticalalignment='center', 
                                  horizontalalignment='center',
                                  transform=axes[row_idx, 0].transAxes,
                                  fontsize=18)
            axes[row_idx, 0].set_ylabel(par)

            axes[row_idx, 1].text(0.5, 0.5, 
                                  'Omitted due to lack of data',
                                  verticalalignment='center', 
                                  horizontalalignment='center',
                                  transform=axes[row_idx, 1].transAxes,
                                  fontsize=18)
            axes[row_idx, 1].set_ylabel('Probability')
            
        else:
            # Get data
            df2 = df.query("(station_id == @stn_id) and (variable == @par)")
            df2.set_index('sample_date', inplace=True)
            del df2['station_id'], df2['variable']
    
            # Resample
            df2 = df2.resample('A').median().reset_index()
            df2['year'] = df2['sample_date'].dt.year
            del df2['sample_date']

            # Interpolate years with missing data
            index_df = pd.DataFrame({'year':range(1990, 2017)})
            df2 = pd.merge(index_df, df2, how='left', on='year')
            df2.interpolate(method='linear', inplace=True)
            df2.fillna(method='backfill', inplace=True)

            # Change point analysis
            if reg:
                # Perform regression within each partition
                res = bcp.bcp(df2['value'], 
                              x=df2['year'], 
                              p0=p0)
            else:
                # Assume constant mean within each partition
                res = bcp.bcp(df2['value'], 
                              p0=p0)            
            
            res = pd.DataFrame({'raw':df2['value'].values,
                                'mean':res.rx2('posterior.mean').flatten(),
                                'prob':res.rx2('posterior.prob')},
                               index=df2['year'].values)

            # Add to results
            gt50 = np.array(res.index[res['prob'] > 0.5])
            for year in gt50:                        
                res_dict['station_id'].append(stn_id)
                res_dict['variable'].append(par)
                res_dict['prob'].append('gt50')
                res_dict['year'].append(year)
            
            gt75 = np.array(res.index[res['prob'] > 0.75])
            for year in gt75:                        
                res_dict['station_id'].append(stn_id)
                res_dict['variable'].append(par)
                res_dict['prob'].append('gt75')
                res_dict['year'].append(year)  
                
            # Plot
            res['raw'].plot(ax=axes[row_idx, 0], marker='o', linestyle=':', label='Raw')
            res['mean'].plot(ax=axes[row_idx, 0], label='Fitted')
            axes[row_idx, 0].set_ylabel(par)
            axes[row_idx, 0].legend(loc='best')

            res['prob'].plot(ax=axes[row_idx, 1], marker='o')
            axes[row_idx, 1].set_ylabel('Probability')
            axes[row_idx, 1].set_ylim((0, 1))
            axes[row_idx, 1].axhline(0.5, c='k', linestyle='--')
            axes[row_idx, 1].axhline(0.95, c='k', linestyle='--')

        axes[0, 0].set_title('Data series')    
        axes[0, 1].set_title('Change probability')
        
    plt.tight_layout()
    out_png = os.path.join(out_fold, 'bcp_selected_sites_relaxed_criteria/bcp_stn_%s_relaxed_criteria.png' % stn_id)
    plt.savefig(out_png, dpi=200)
    plt.close()
    
    res_df = pd.DataFrame(res_dict)
    
    return res_df

In [23]:
%%capture
# Container for results
df_list = []

# Loop over stations
for stn_id in inc_df['station_id'].unique():
    # Determine whether to process this site
    print(stn_id)
    inc_site_df = inc_df.query("(station_id == @stn_id) and (include == 'yes')")
    
    if len(inc_site_df) > 0: 
        # Run BCP
        res_bcp = plot_bcp_change_pts(stn_id, reg=False, p0=0.1)  
        df_list.append(res_bcp)
        
# Combine results
res_bcp = pd.concat(df_list, axis=0)

In [24]:
# Show example BCP output
Image(r'../../../Thematic_Trends_Report_2019/results/bcp_selected_sites_relaxed_criteria/bcp_stn_38242_relaxed_criteria.png')


Out[24]:

These results can now be aggregated regionally. I will produce four sets of plots:

  1. KDE-smoothed histograms showing years where the probability of change is >50%

    a. Split by region

    b. Split by continent

  2. KDE-smoothed histograms showing years where the probability of change is >75%

    a. Split by region

    b. Split by continent

For ease of comparison, columns represent parameters and rows are regions. Considering one column at a time therefore makes it possible to identify similar patterns between regions.


In [25]:
# Join regions
res_bcp = pd.merge(res_bcp, stn_df[['station_id', 'continent', 'country', 'region']],
                   how='left', on='station_id')

res_bcp.head()


Out[25]:
station_id variable prob year continent country region
0 23472 ECl gt50 2005 Europe Switzerland Alps
1 23472 ECl gt50 2006 Europe Switzerland Alps
2 23472 ECl gt75 2005 Europe Switzerland Alps
3 23472 ECl gt75 2006 Europe Switzerland Alps
4 23472 EH gt50 1995 Europe Switzerland Alps

In [26]:
# KDE-smoothed plots by region
# Change prob > 50%
gt50_df = res_bcp.query('prob == "gt50"')

g = sn.FacetGrid(gt50_df,
                 row='region',
                 col='variable',
                 aspect=1.5,
                 sharex='row',
                 sharey=False,
                 row_order=['NoNord', 'SoNord', 'UK-IE-NL', 
                            'WCE', 'ECE', 'Alps', 'QuMaVt',
                            'AdsCsk', 'BRM', 'Ont'])
g.map(sn.distplot, 'year')
g.set(xlim=(1990, 2016))

# Save
out_png = os.path.join(out_fold, 'change_prob_kde_gt50_by_region_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)


/opt/conda/lib/python3.7/site-packages/statsmodels/nonparametric/kde.py:487: RuntimeWarning: invalid value encountered in true_divide
  binned = fast_linbin(X, a, b, gridsize) / (delta * nobs)
/opt/conda/lib/python3.7/site-packages/statsmodels/nonparametric/kdetools.py:34: RuntimeWarning: invalid value encountered in double_scalars
  FAC1 = 2*(np.pi*bw/RANGE)**2
/opt/conda/lib/python3.7/site-packages/numpy/core/_methods.py:217: RuntimeWarning: Degrees of freedom <= 0 for slice
  keepdims=keepdims)
/opt/conda/lib/python3.7/site-packages/numpy/core/_methods.py:209: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)

In [27]:
# KDE-smoothed plots by region
# Change prob > 75%
gt75_df = res_bcp.query('prob == "gt75"')

g = sn.FacetGrid(gt75_df,
                 row='region',
                 col='variable',
                 aspect=1.5,
                 sharex='row',
                 sharey=False,
                 row_order=['NoNord', 'SoNord', 'UK-IE-NL', 
                            'WCE', 'ECE', 'Alps', 'QuMaVt',
                            'AdsCsk', 'BRM', 'Ont'])
g.map(sn.distplot, 'year')
g.set(xlim=(1990, 2016))

# Save
out_png = os.path.join(out_fold, 'change_prob_kde_gt75_by_region_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)



In [28]:
# KDE-smoothed plots by continent
# Change prob > 50%
gt50_df = res_bcp.query('prob == "gt50"')

g = sn.FacetGrid(gt50_df,
                 row='continent',
                 col='variable',
                 aspect=1.5,
                 sharex='row',
                 sharey=False)
g.map(sn.distplot, 'year')
g.set(xlim=(1990, 2016))

# Save
out_png = os.path.join(out_fold, 'change_prob_kde_gt50_by_continent_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)



In [29]:
# KDE-smoothed plots by continent
# Change prob > 75%
gt75_df = res_bcp.query('prob == "gt75"')

g = sn.FacetGrid(gt75_df,
                 row='continent',
                 col='variable',
                 aspect=1.5,
                 sharex='row',
                 sharey=False)
g.map(sn.distplot, 'year')
g.set(xlim=(1990, 2016))

# Save
out_png = os.path.join(out_fold, 'change_prob_kde_gt75_by_continent_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)


I haven't had time to look at this output in detail, but there appears to be evidence for consistent regional changes in at least some variables. For example, ENO3X, ECaX_EMgX and ESO4X all show similar patterns in the SoNord, NoNord and UK-IE-NL regions (and perhaps for Europe as a whole too), with many change points occurring in the mid-1990s. For TOC, meanwhile, the most common European change points are about a decade later, in the mid-2000s.

6. Differences between medians and minima/maxima

Updated 08/11/2019. The original code only considered differences between medians and minima. Øyvind asked for this to be extended to include (maxima - medians) - see e-mail received 05.11.2019 at 15.18.

6.1. Medians and minima

Added 02/06/2019.

Øyvind would like to explore trends in the absolute differences between annual median and minimum values for ANC and pH. This is motivated by some interesting patterns previously documented at Øygardsbekken (station ID 38313) - see e-mail from Øyvind received 01/06/2019 at 17.47 for details.

As a sanity check, it's a good idea to first make sure I can reproduce the patterns in Øyvind's e-mail.
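For reference, EH is stored as the hydrogen-ion concentration in µEq/l, so the conversion used in the cells below is $\mathrm{pH} = -\log_{10}(\mathrm{EH} \times 10^{-6})$.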


In [30]:
# Load series
pkl = os.path.join(out_fold, 'series_relaxed_criteria.pkl')
with open(pkl, 'rb') as handle:
    series_dict = pickle.load(handle)
    
# Get data for Øygardsbekken
stn_id = 38313

# Loop over dfs
df_list = []

for par in ['ANC', 'EH']:
    # Get data
    df = series_dict[(stn_id, par)]   
    
    # Convert EH to pH
    if par == 'EH':
        df['value'] = -np.log10(df['value']/1E6) 
        par = 'pH'
        
    # Resample to annual medians and mins
    for col_idx, stat in enumerate(['annual median', 'annual minimum']):
        if stat == 'annual median':
            df_stat = df.resample('A').median()
        else:
            df_stat = df.resample('A').min()

        df_stat['year'] = df_stat.index.year
        df_stat['metric'] = stat
        df_stat['variable'] = par
        df_stat.reset_index(inplace=True, drop=True)
            
        df_list.append(df_stat)
        
ann_df = pd.concat(df_list, axis=0)        
ann_df.head()


Out[30]:
value year metric variable
0 -51.247305 1992 annual median ANC
1 -82.105765 1993 annual median ANC
2 -31.137920 1994 annual median ANC
3 -40.616714 1995 annual median ANC
4 -33.383755 1996 annual median ANC

In [31]:
# Plot
g = sn.lmplot(x='year', 
              y='value', 
              data=ann_df,
              col='variable',
              hue='metric',
              sharey=False,
              lowess=True)

g.axes[0, 0].set_ylim((-125, 75))
g.axes[0, 1].set_ylim((4, 6))


Out[31]:
(4, 6)

These plots look the same as in Øyvind's e-mail, which is good. The difference between minimum and median ANC looks fairly steady, whereas differences for pH are becoming larger through time, apparently because median pH is increasing more rapidly than minimum pH. The next step is to calculate absolute differences between median and minimum values for all sites and then test for trends.


In [32]:
%%capture
# Load series
pkl = os.path.join(out_fold, 'series_relaxed_criteria.pkl')
with open(pkl, 'rb') as handle:
    series_dict = pickle.load(handle)
    
# Dicts for results
res_dict = {'station_id':[],
            'variable':[],
            'mk_p_val':[],
            'mk_trend':[],
            'sen_slp':[],
            'sen_incpt':[],
            'sen_trend':[],
           }

for stn_id in inc_df['station_id'].unique():
    for par in ['ANC', 'EH']:
        try:
            # Get data
            df = series_dict[(stn_id, par)]   

            # Convert EH to pH
            if par == 'EH':
                df['value'] = -np.log10(df['value']/1E6) 
                par = 'pH'

            # Resample to annual medians and mins
            df_med = df.resample('A').median()
            df_min = df.resample('A').min()
            df_diff = df_med - df_min
            df_diff.index = df_diff.index.year

            # MK test
            mk_df = nivapy.stats.mk_test(df_diff, 'value')

            # Sen's slope
            res_df, sen_df = nivapy.stats.sens_slope(df_diff, 
                                                     value_col='value',
                                                     index_col=df_diff.index)

            # Add results to dict
            res_dict['station_id'].append(stn_id)
            res_dict['variable'].append(par)
            res_dict['mk_p_val'].append(mk_df.loc['p'].value)
            res_dict['mk_trend'].append(mk_df.loc['trend'].value)

            sslp = res_df.loc['sslp'].value
            sincpt = res_df.loc['icpt'].value
            res_dict['sen_slp'].append(sslp)
            res_dict['sen_incpt'].append(sincpt)
            res_dict['sen_trend'].append(res_df.loc['trend'].value)
                                       
        except KeyError:
            pass

# Combine results
res_df = pd.DataFrame(res_dict)
out_csv = os.path.join(out_fold, 'trends_in_differences_med-min_relaxed_criteria.csv')
res_df.to_csv(out_csv, index=False, encoding='utf-8')

In [33]:
res_df.head()


Out[33]:
station_id variable mk_p_val mk_trend sen_slp sen_incpt sen_trend
0 23472 ANC 0.481774 no trend 0.028164 -54.284790 no trend
1 23472 pH 0.004313 increasing 0.005417 -10.800417 increasing
2 23474 ANC 0.586128 no trend 0.083036 -159.982919 no trend
3 23474 pH 0.045897 increasing 0.011307 -22.498864 no trend
4 23475 ANC 0.045644 increasing 0.281697 -560.516635 no trend

As a further quick check, here are the results for Øygardsbekken:


In [34]:
# Get results for Øygardsbekken
res_df.query('station_id == 38313')


Out[34]:
station_id variable mk_p_val mk_trend sen_slp sen_incpt sen_trend
763 38313 ANC 2.157853e-01 no trend 0.687044 -1340.522993 no trend
764 38313 pH 7.221532e-07 increasing 0.016603 -32.951538 increasing

As expected, there is no trend for differences in ANC, but a significant increasing trend for differences in pH.

Next, join in the region data and explore patterns for all sites.


In [35]:
# Just significant
df = res_df.query("sen_trend in ('increasing', 'decreasing')")

# Join regions
df = pd.merge(df, stn_df[['station_id', 'continent', 'country', 'region']],
              how='left', on='station_id')

In [36]:
# Box plots of significant slopes in differences by region
g = sn.catplot(data=df,
               x='region',
               y='sen_slp',
               row='variable',
               kind='box', 
               order=['NoNord', 'SoNord', 'Baltic', 'UK-IE-NL', 
                      'WCE', 'ECE', 'Alps', 'AtlCan', 'QuMaVt',
                      'AdsCsk', 'Apps', 'BRM', 'Ont'],
               sharex=False,
               sharey=False,
               aspect=3,
              ) 

g.map(plt.axhline, y=0, lw=2, ls='--', c='k', alpha=0.4)

# Save
out_png = os.path.join(out_fold, 'med_min_diff_box_plots_by_region_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)



In [37]:
# KDE-smoothed plots by continent
g = sn.FacetGrid(df,
                 row='variable',
                 col='continent',
                 aspect=2,
                 sharex='row',
                 sharey=False)
g.map(sn.distplot, 'sen_slp', hist=False, rug=True)
g.map(plt.axvline, x=0, lw=2, ls='--', c='k')

# Save
out_png = os.path.join(out_fold, 'med_min_diff_kde_plots_by_continent_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)


6.2. Medians and maxima

This section repeats the code from above, but this time using (maximum - median) instead of (median - minimum).


In [38]:
%%capture
# Load series
pkl = os.path.join(out_fold, 'series_relaxed_criteria.pkl')
with open(pkl, 'rb') as handle:
    series_dict = pickle.load(handle)
    
# Dicts for results
res_dict = {'station_id':[],
            'variable':[],
            'mk_p_val':[],
            'mk_trend':[],
            'sen_slp':[],
            'sen_incpt':[],
            'sen_trend':[],
           }

for stn_id in inc_df['station_id'].unique():
    for par in ['ANC', 'EH']:
        try:
            # Get data
            df = series_dict[(stn_id, par)]   

            # Convert EH to pH
            if par == 'EH':
                df['value'] = -np.log10(df['value']/1E6) 
                par = 'pH'

            # Resample to annual medians and maxes
            df_med = df.resample('A').median()
            df_max = df.resample('A').max()
            df_diff = df_max - df_med
            df_diff.index = df_diff.index.year

            # MK test
            mk_df = nivapy.stats.mk_test(df_diff, 'value')

            # Sen's slope
            res_df, sen_df = nivapy.stats.sens_slope(df_diff, 
                                                     value_col='value',
                                                     index_col=df_diff.index)

            # Add results to dict
            res_dict['station_id'].append(stn_id)
            res_dict['variable'].append(par)
            res_dict['mk_p_val'].append(mk_df.loc['p'].value)
            res_dict['mk_trend'].append(mk_df.loc['trend'].value)

            sslp = res_df.loc['sslp'].value
            sincpt = res_df.loc['icpt'].value
            res_dict['sen_slp'].append(sslp)
            res_dict['sen_incpt'].append(sincpt)
            res_dict['sen_trend'].append(res_df.loc['trend'].value)
                                       
        except KeyError:
            pass

# Combine results
res_df = pd.DataFrame(res_dict)
out_csv = os.path.join(out_fold, 'trends_in_differences_max-med_relaxed_criteria.csv')
res_df.to_csv(out_csv, index=False, encoding='utf-8')

In [39]:
# Get just significant
df = res_df.query("sen_trend in ('increasing', 'decreasing')")

# Join regions
df = pd.merge(df, stn_df[['station_id', 'continent', 'country', 'region']],
              how='left', on='station_id')

In [40]:
# Box plots of significant slopes in differences by region
g = sn.catplot(data=df,
               x='region',
               y='sen_slp',
               row='variable',
               kind='box', 
               order=['NoNord', 'SoNord', 'Baltic', 'UK-IE-NL', 
                      'WCE', 'ECE', 'Alps', 'AtlCan', 'QuMaVt',
                      'AdsCsk', 'Apps', 'BRM', 'Ont'],
               sharex=False,
               sharey=False,
               aspect=3,
              ) 

g.map(plt.axhline, y=0, lw=2, ls='--', c='k', alpha=0.4)

# Save
out_png = os.path.join(out_fold, 'max_med_diff_box_plots_by_region_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)



In [41]:
# KDE-smoothed plots by continent
g = sn.FacetGrid(df,
                 row='variable',
                 col='continent',
                 aspect=2,
                 sharex='row',
                 sharey=False)
g.map(sn.distplot, 'sen_slp', hist=False, rug=True)
g.map(plt.axvline, x=0, lw=2, ls='--', c='k')

# Save
out_png = os.path.join(out_fold, 'max_med_diff_kde_plots_by_continent_relaxed_criteria.png')
plt.savefig(out_png, dpi=200)


7. Scatter plots of z-scores

Added 24.08.2019

See e-mail from Øyvind received 23.08.2019 at 14.27.


In [42]:
# Read trend results
csv_path = os.path.join(out_fold, 'trends_summary_1990-2016_relaxed_criteria.csv')
df = pd.read_csv(csv_path)

# Join regions
df = pd.merge(df, stn_df[['station_id', 'region']], how='left', on='station_id')

# Just medians
df = df.query('metric == "annual median"')

df.head()


Out[42]:
station_id variable metric mk_p_val mk_z mk_trend sen_slp sen_incpt sen_trend region
0 23472 ECl annual median 0.099902 -1.645329 no trend -0.088458 181.097016 no trend Alps
2 23472 EH annual median 0.004075 -2.872335 decreasing -0.069545 141.432722 decreasing Alps
4 23472 ESO4 annual median 0.000001 -4.828009 decreasing -0.920324 1876.644477 decreasing Alps
6 23472 ENO3 annual median 0.141726 -1.469394 no trend -0.416541 862.736852 no trend Alps
8 23472 ECa_EMg annual median 0.004120 -2.868817 decreasing -0.992593 2040.764815 decreasing Alps

In [43]:
# Plot
p = (ggplot(df, aes('sen_slp', 'mk_z'))
     + geom_point(aes(color='mk_p_val'))
     + facet_grid('region~variable', scales='free_x', shrink=True)
     + theme(axis_text_x=element_text(rotation=90)))
p


/opt/conda/lib/python3.7/site-packages/plotnine/scales/scale.py:93: MatplotlibDeprecationWarning: 
The iterable function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use np.iterable instead.
  if cbook.iterable(self.breaks) and cbook.iterable(self.labels):
/opt/conda/lib/python3.7/site-packages/plotnine/utils.py:553: MatplotlibDeprecationWarning: 
The iterable function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use np.iterable instead.
  return cbook.iterable(var) and not is_string(var)
Out[43]:
<ggplot: (-9223363271091864611)>

8. Mean ESO4* for 2010 to 2016

Added 24.08.2019

See e-mail from Øyvind received 23.08.2019 at 14.27.


In [44]:
# List of selected stn IDs
sel_stn_list = list(sel_stns['station_id'].unique())

# Subset by stns and dates (>=2010)
wc_df2 = wc_df.query('station_id in @sel_stn_list')
wc_df2 = wc_df2.query('sample_date >= "2010-01-01"')

# Mean ESO4
wc_df2 = wc_df2.groupby('station_id').mean()[['ESO4']].dropna().reset_index()


# Save
out_path = os.path.join(out_fold, 'mean_eso4_2010-2016_relaxed_criteria.csv')
wc_df2.to_csv(out_path, index=False, encoding='utf-8')

wc_df2.head()


Out[44]:
station_id ESO4
0 23472 24.659861
1 23474 23.770000
2 23475 20.949028
3 23478 54.350893
4 23488 34.887222

9. Average change over whole data period

See e-mail from Øyvind received 05.11.2019 at 09.54. For each region, I need to calculate the percentage change between regional averages for (1990 - 1994) and averages for (2012 - 2016).
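The percentage change is simply $100 \times (\mathrm{end} - \mathrm{start}) / \mathrm{start}$. For example, in the output table below, ESO4 in AdsCsk drops from 107.698 to 51.289 µEq/l, giving $100 \times (51.289 - 107.698) / 107.698 \approx -52.4\%$, which matches the pct_change_1990_2016 column.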


In [45]:
# Load saved chem data
wc_csv = r'../../../Thematic_Trends_Report_2019/working_chem.csv'
wc_df = pd.read_csv(wc_csv, encoding='utf-8')
wc_df['sample_date'] = pd.to_datetime(wc_df['sample_date'])
wc_df.head()


Out[45]:
station_id station_code station_name sample_date depth1 depth2 TOC ECl EH ESO4 ENO3 ECa_EMg ANC ALK-E
0 23472 CH03 Lago di Tomè 1990-10-08 0.0 0.0 NaN 4.857143 3.630781 45.000000 23.571429 64.333333 10.939799 NaN
1 23472 CH03 Lago di Tomè 1993-08-18 0.0 0.0 NaN 0.857143 3.630781 47.708333 35.000000 47.166667 -15.749821 NaN
2 23472 CH03 Lago di Tomè 1995-09-05 0.0 0.0 NaN 4.571429 2.630268 46.666667 36.428571 81.666667 5.850613 4.0
3 23472 CH03 Lago di Tomè 1995-09-15 0.0 0.0 NaN 8.571429 4.073803 42.083333 32.142857 59.333333 -3.319358 NaN
4 23472 CH03 Lago di Tomè 1997-08-05 0.0 0.0 NaN 5.428571 2.089296 38.541667 23.571429 53.500000 3.383879 1.0

In [46]:
# Read stations
stn_csv = r'../../../Thematic_Trends_Report_2019/results/selected_stations_relaxed_criteria.csv'
sel_stns = pd.read_csv(stn_csv)
sel_stns.head()


Out[46]:
station_id station_code station_name latitude longitude altitude continent country region group
0 38115 Tr18_CA_DO1 Blue Chalk Lake 45.1999 -78.9432 344.0 North America Canada Ont Trends
1 38116 Tr18_CA_DO2 Chub Lake 45.2138 -78.9836 343.0 North America Canada Ont Trends
2 38117 Tr18_CA_DO3 Crosson Lake 45.0840 -79.0360 371.0 North America Canada Ont Trends
3 38118 Tr18_CA_DO4 Dickie Lake 45.1510 -79.0876 379.0 North America Canada Ont Trends
4 38119 Tr18_CA_DO5 Harp Lake 45.3798 -79.1335 327.0 North America Canada Ont Trends

In [47]:
def assign_period(row):
    if row['year'] <= 1994:
        return 'start'
    elif row['year'] >= 2012:
        return 'end'
    else:
        return np.nan

In [48]:
# Join regions
df = pd.merge(wc_df, sel_stns[['station_id', 'region']], 
              how='left', on='station_id')

# Filter to stations of interest
df = df[df['station_id'].isin(sel_stns['station_id'])]

# Get period of interest
df['year'] = df['sample_date'].dt.year
df['period'] = df.apply(assign_period, axis=1)
df.dropna(subset=['period'], inplace=True)

# Aggregate to regions
df = df.groupby(['region', 'period']).mean().reset_index()

# Get pars of interest
df = df[['region', 'period', 'ECl', 'ESO4', 'ENO3', 'ECa_EMg', 'EH', 'TOC']]

# Restructure
df = df.melt(id_vars=['region', 'period'])
df.set_index(['region', 'period', 'variable'], inplace=True)
df = df.unstack('period')
df.reset_index(inplace=True)
df.columns = ['region', 'variable', 'end', 'start']
df = df[['region', 'variable', 'start', 'end']]

# Pct change
df['pct_change_1990_2016'] = 100 * (df['end'] - df['start']) / df['start']

# Save
out_csv = r'../../../Thematic_Trends_Report_2019/results/pct_change_1990-2016_relaxed_criteria.csv'
df.to_csv(out_csv, encoding='utf-8', index=False)

df


Out[48]:
region variable start end pct_change_1990_2016
0 AdsCsk ECa_EMg 130.552003 96.746356 -25.894392
1 AdsCsk ECl 11.661530 9.434127 -19.100434
2 AdsCsk EH 8.901589 4.623513 -48.059690
3 AdsCsk ENO3 19.223053 13.908321 -27.647700
4 AdsCsk ESO4 107.698417 51.288994 -52.377207
... ... ... ... ... ...
73 WCE ECl 102.328042 108.632429 6.160957
74 WCE EH 10.192357 4.880321 -52.117836
75 WCE ENO3 54.940549 38.894034 -29.207052
76 WCE ESO4 251.779314 124.399702 -50.591770
77 WCE TOC 2.916162 4.608473 58.032150

78 rows × 5 columns


In [49]:
sn.catplot(x='region', 
           y='pct_change_1990_2016', 
           data=df,
           kind='bar',
           row='variable',
           aspect=3,
           order=['Ont', 'QuMaVt', 'Apps', 'BRM', 'AdsCsk', 'AtlCan',
                  'UK-IE-NL', 'Alps', 'WCE', 'SoNord', 'NoNord', 'ECE', 'Baltic']
          )

# Save
out_png = r'../../../Thematic_Trends_Report_2019/results/pct_change_1990-2016_relaxed_criteria_bar_chart.png'
plt.savefig(out_png, dpi=300)


10. Maps showing rates of change

Øyvind would like versions of the rate-of-change maps, updated to show all 497 stations. Generating the shapefiles for this is most easily done using GeoPandas.
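One caveat worth noting as an aside (the existing workflow does not address it): shapefile attribute names are limited to 10 characters, so longer column names such as ECaEMg_trend will be truncated on export. If the full names matter, the same GeoDataFrame could be written to a GeoPackage instead; a minimal sketch, with a hypothetical path mirroring the shapefile path used below:

out_gpkg = (r'../../../Thematic_Trends_Report_2019/Trends_Maps/GIS/'
            r'thematic_report_selected_stns_relaxed_criteria_trends.gpkg')
gdf.to_file(out_gpkg, driver='GPKG')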


In [50]:
# Read stations
stn_csv = r'../../../Thematic_Trends_Report_2019/results/selected_stations_relaxed_criteria.csv'
sel_stns = pd.read_csv(stn_csv)

# Read trends summary
trends_csv = r'../../../Thematic_Trends_Report_2019/results/trends_summary_1990-2016_relaxed_criteria.csv'
trends_df = pd.read_csv(trends_csv)

# Get data of interest
trends_df = trends_df.query('metric == "annual median"')
trends_df = trends_df[['station_id', 'variable', 'sen_slp', 'sen_trend']]

# Unstack
trends_df.set_index(['station_id', 'variable'], inplace=True)
trends_df = trends_df.unstack('variable')

# Merge column multi-index
cols = []
for idx in trends_df.columns:
    idx = list(idx)
    idx[1] = idx[1].replace("_", "")
    idx[1] = idx[1].replace("ESO4", "ESO4X")
    if idx[0][-1] == 'p':
        cols.append(idx[1] + '_slp')
    else:
        cols.append(idx[1] + '_trend')
trends_df.columns = cols

trends_df.reset_index(inplace=True)

# Join site data
df = pd.merge(sel_stns, trends_df,
              how='left', on='station_id')

# Build geodataframe
gdf = gpd.GeoDataFrame(df, 
                       geometry=gpd.points_from_xy(df.longitude, df.latitude),
                       crs={'init': 'epsg:4326'})

# Save
out_shp = (r'../../../Thematic_Trends_Report_2019/Trends_Maps/GIS/'
           r'thematic_report_selected_stns_relaxed_criteria_trends.shp')
gdf.to_file(out_shp)

gdf.head()


Out[50]:
station_id station_code station_name latitude longitude altitude continent country region group ... TOC_slp ALK-E_trend ANC_trend ECaEMg_trend ECl_trend EH_trend ENO3_trend ESO4X_trend TOC_trend geometry
0 38115 Tr18_CA_DO1 Blue Chalk Lake 45.1999 -78.9432 344.0 North America Canada Ont Trends ... 0.019100 increasing increasing decreasing decreasing no trend no trend decreasing increasing POINT (-78.94320 45.19990)
1 38116 Tr18_CA_DO2 Chub Lake 45.2138 -78.9836 343.0 North America Canada Ont Trends ... 0.067221 increasing increasing decreasing decreasing decreasing no trend decreasing increasing POINT (-78.98360 45.21380)
2 38117 Tr18_CA_DO3 Crosson Lake 45.0840 -79.0360 371.0 North America Canada Ont Trends ... 0.033258 increasing increasing decreasing decreasing decreasing no trend decreasing increasing POINT (-79.03600 45.08400)
3 38118 Tr18_CA_DO4 Dickie Lake 45.1510 -79.0876 379.0 North America Canada Ont Trends ... 0.037833 increasing increasing no trend increasing decreasing decreasing decreasing increasing POINT (-79.08760 45.15100)
4 38119 Tr18_CA_DO5 Harp Lake 45.3798 -79.1335 327.0 North America Canada Ont Trends ... 0.024888 no trend increasing decreasing no trend decreasing no trend decreasing increasing POINT (-79.13350 45.37980)

5 rows × 27 columns