In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
import altair as alt
import plotnine as pn
import warnings
from plotnine import ggplot, geom_point, aes, stat_smooth, facet_grid, theme, element_text, scale_x_discrete

warnings.simplefilter('ignore')
pn.options.figure_size = (10, 15)
plt.style.use('ggplot')

TOC Thematic Report - February 2019 (part 4: relative change scatterplots)

This is a rough/quick notebook to generate plots for Heleen's presentation at the WGE meeting in March 2019. The broad aim is to recreate Fig. 2 from Øyvind G's 2014 trends paper.

Update - 21/05/2019: I have now modified the code in this notebook to include data from all 555 sites in the "full" ICPW dataset.

Update 2 - 03/06/2019: Øyvind's Fig. 2 does not include ANC or alkalinity. His method is to divide each value in the series by the first value, giving numbers relative to the first year. However, this does not work for ANC or alkalinity, which can be negative (see e-mail sent to Øyvind on 02.06.2019 at 23.33 for an explanation). This can be corrected by shifting the data prior to scaling. The most common approach is "standardisation", which involves subtracting the mean and dividing by the standard deviation. For the application here, I will instead subtract the first value in the series and divide by the standard deviation, which gives results more comparable with Øyvind's original plots, because each series then starts from a common point at zero (rather than 1, as on Øyvind's version).
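As a minimal sketch (using a made-up ANC-like series, not project data), the snippet below illustrates why dividing by the first value breaks for series that cross zero, and how the subtract-first/divide-by-std alternative behaves:

import pandas as pd

s = pd.Series([-2.0, -1.0, 1.0, 3.0])  # hypothetical ANC-like series

# Divide-by-first: a negative first value flips the sign of the whole
# series, so this increasing trend appears to decrease
print(s / s.iloc[0])               # 1.0, 0.5, -0.5, -1.5

# Subtract-first, divide-by-std: direction is preserved and the
# series starts from a common point at zero
print((s - s.iloc[0]) / s.std())   # 0.00, 0.45, 1.35, 2.25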

1. Stations


In [2]:
# Read station data
stn_path = r'../../../all_icpw_sites_may_2019.xlsx'
stn_df = pd.read_excel(stn_path, sheet_name='all_icpw_stns')
stn_df.dropna(subset=['region'], inplace=True)
print(len(stn_df))
stn_df.head()


555
Out[2]:
station_id station_code station_name latitude longitude altitude continent country region group
0 38115 Tr18_CA_DO1 Blue Chalk Lake 45.1999 -78.9432 344.0 North America Canada Ont Trends
1 38116 Tr18_CA_DO2 Chub Lake 45.2138 -78.9836 343.0 North America Canada Ont Trends
2 38117 Tr18_CA_DO3 Crosson Lake 45.0840 -79.0360 371.0 North America Canada Ont Trends
3 38118 Tr18_CA_DO4 Dickie Lake 45.1510 -79.0876 379.0 North America Canada Ont Trends
4 38119 Tr18_CA_DO5 Harp Lake 45.3798 -79.1335 327.0 North America Canada Ont Trends

2. Water chemistry


In [3]:
# Load saved data
wc_csv = r'../../../Thematic_Trends_Report_2019/working_chem.csv'
wc_df = pd.read_csv(wc_csv, encoding='utf-8')
wc_df['sample_date'] = pd.to_datetime(wc_df['sample_date'])

# Annual medians by site
wc_df['year'] = wc_df['sample_date'].dt.year
agg_df = wc_df.groupby(['station_id', 'year']).median().reset_index()
agg_df.head()


Out[3]:
station_id year depth1 depth2 TOC ECl EH ESO4 ENO3 ECa_EMg ANC ALK-E
0 23472 1990 0.0 0.0 NaN 4.857143 3.630781 45.000000 23.571429 64.333333 10.939799 NaN
1 23472 1993 0.0 0.0 NaN 0.857143 3.630781 47.708333 35.000000 47.166667 -15.749821 NaN
2 23472 1995 0.0 0.0 NaN 6.571429 3.352035 44.375000 34.285714 70.500000 1.265627 4.0
3 23472 1997 0.0 0.0 NaN 4.000000 2.041738 38.541667 25.000000 53.500000 3.383879 4.0
4 23472 2000 0.0 0.0 NaN 3.523828 1.584893 34.252307 36.161583 57.478071 4.324865 4.0
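A side note in case this notebook is re-run on a newer environment: recent pandas versions no longer silently drop non-numeric columns when taking a groupby median, so the aggregation above would need numeric_only=True (an assumption about the target pandas version, not part of the original run):

# Equivalent aggregation for newer pandas releases
agg_df = (wc_df
          .groupby(['station_id', 'year'])
          .median(numeric_only=True)
          .reset_index())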

Added 12.12.2019

Remove alkalinity and include ECl - see e-mail from Øyvind received 11.12.2019 at 11.59.


In [4]:
del agg_df['ALK-E']

In [5]:
# Loop over sites
df_list = []
for idx, row in stn_df.iterrows():
    # Get station metadata
    stn_id = row['station_id']
    region = row['region']

    # Read series
    df = agg_df.query('station_id == @stn_id').copy()

    # Delete unwanted cols and tidy
    del df['station_id'], df['depth1'], df['depth2']
    df.set_index('year', inplace=True)
    df.sort_index(inplace=True)
    
    # Get first value in series
    for col in df.columns:
        # Values equal to exactly zero must be wrong;
        # convert them to NaN (use .loc to avoid chained assignment)
        df.loc[df[col] == 0, col] = np.nan
        vals = df[col].dropna().values
        
        # If at least one valid value in series, get the first
        if len(vals) > 0:
            first = vals[0]
        else:
            first = np.nan
            
        # Apply scaling: subtract the first valid value and divide
        # by the standard deviation (see notebook introduction)
        df[col] = (df[col] - first) / df[col].std()

    # Build results df
    df['region'] = region
    df['station_id'] = stn_id
    df.reset_index(inplace=True)
    df_list.append(df)
    
# Combine and tidy
df = pd.concat(df_list, axis=0)
df['year'] = df['year'].astype(int)
df.reset_index(inplace=True, drop=True)
df = df.melt(id_vars=['year', 'region', 'station_id'])

# Ignore Moldova (not in Øyvind's paper and some strange data).
# Note that the 'Baltic' region *is* kept - it appears in the 
# facet plots below
df = df.query('region != "Moldova"')

df.head()


Out[5]:
year region station_id variable value
0 1990 Ont 38115 TOC 0.000000
1 1991 Ont 38115 TOC 0.178090
2 1992 Ont 38115 TOC -0.095895
3 1993 Ont 38115 TOC 0.945247
4 1994 Ont 38115 TOC -0.246586
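As a quick sanity check on the scaling (a hypothetical verification step, not part of the original workflow), the first valid value of every scaled series should now be zero:

# First non-NaN value of each (station, variable) series should be ~0
firsts = (df.dropna(subset=['value'])
            .sort_values('year')
            .groupby(['station_id', 'variable'])['value']
            .first())
assert np.allclose(firsts.dropna(), 0)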

3. Explore data

3.1. All data

The code below uses Altair to create an interactive plot. Hovering the mouse over any point will display information, including the station code. This makes it much easier to identify problem data series.


In [ ]:
## Altair plot
#alt.data_transformers.enable('default', max_rows=None)
#c = alt.Chart(df).mark_point().encode(
#      x='year:O',
#      y=alt.Y('value:Q', axis=alt.Axis(title='')),
#      color=alt.Color('station_id:O', legend=None),
#      tooltip='station_id:O',
#).properties(
#      width=60,
#      height=150,
#).facet(
#      column=alt.Column('region', sort=['NoNord', 'SoNord', 'Baltic', 'UK-IE-NL', 'WCE', 'ECE', 'Alps', 
#                                        'AtlCan', 'QuMaVt', 'AdsCsk', 'Apps', 'BRM', 'Ont']),
#      row=alt.Row('variable', sort=['ECl', 'ESO4', 'ENO3', 'ECa_EMg', 'TOC', 'EH', 'ANC'])
#).resolve_scale(
#      y='independent'
#).interactive()
#c

In [6]:
# Convert 'region' and 'variable' to custom categorical data types
reg_type = pd.api.types.CategoricalDtype(categories=['NoNord', 'SoNord', 'Baltic', 'UK-IE-NL', 'WCE', 'ECE', 'Alps', 
                                                     'AtlCan', 'QuMaVt', 'AdsCsk', 'Apps', 'BRM', 'Ont'],
                                         ordered=True)
df['region'] = df['region'].astype(reg_type)

var_type = pd.api.types.CategoricalDtype(categories=['ECl', 'ESO4', 'ENO3', 'ECa_EMg', 'TOC', 'EH', 'ANC'],
                                         ordered=True)
df['variable'] = df['variable'].astype(var_type)

df.head()


Out[6]:
year region station_id variable value
0 1990 Ont 38115 TOC 0.000000
1 1991 Ont 38115 TOC 0.178090
2 1992 Ont 38115 TOC -0.095895
3 1993 Ont 38115 TOC 0.945247
4 1994 Ont 38115 TOC -0.246586

The code below uses plotnine to generate a facet-grid of the data with a similar layout to the version in Øyvind's paper. Note, however, that I'm using lowess smoothing here rather than cubic splines, and the approach to scaling is also different (see notebook introduction above).


In [7]:
# Static plot
p = (ggplot(df, aes('year', 'value'))
     + geom_point(alpha=0.2, stroke=0.1, color='red')
     + stat_smooth(method='lowess', se=False)
     + facet_grid('variable~region', scales='free_y', shrink=True)
     + theme(axis_text_x=element_text(rotation=90)))
p


Out[7]:
[Facet-grid: one panel per variable (rows) and region (columns), showing scaled annual medians as points with lowess smoothers]

In [8]:
out_png = r'../../../Thematic_Trends_Report_2019/results/raw_wq_trends_all.png'
p.save(out_png, width=8, height=12, units='in', dpi=300)