In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import mpld3
import seaborn as sn
import imp
import os
from collections import defaultdict
from scipy.stats import theilslopes
sn.set_context('notebook')
This notebook follows on from icpw_climate_data_proc_py32.ipynb, which can be found here. That notebook extracted point time series with annual frequency from CRU netCDF files. The aim of this notebook is to calculate trends and summaries based on these series.
For the period from 1990 to 2012, calculate:
The average of annual average temperature and total precipitation
The average of summer (JJA) average temperature and total precipitation
The average of late-summer (JAS: July to September) average temperature and total precipitation
In addition, for the time periods 1990 to 2012, 1990 to 2004 and 1998 to 2012, calculate:
Trends in the above 6 quantities over the duration of the time period of interest, estimated using the Theil-Sen method
Trend significance for the above 6 quantities estimated using the Mann-Kendall method
Note that the mean temperature estimates should all be corrected for the difference between pixel elevation and actual site elevation according to the lapse rate. The actual lapse rate is highly variable, but Heleen would like to use a value of 0.6C/100m (see e-mail received 23/01/2017 at 11.49).
To do: Heleen would also like to add a fourth time period (1990 to 1992) to the trends analysis. For this period we are only interested in median TOC, ECa_EMg and ECaX_EMgX. I haven't done this yet as time is very tight, and it's probably easier to do separately anyway. Come back to this.
The code below calculates all the desired statistics.
In [2]:
# Import my earlier code for the M-K test.
#
# NOTE: the original used imp.load_source(), but the 'imp' module has been
# deprecated since Python 3.4 and was removed in 3.12. importlib.util is the
# supported way to load a module from an explicit file path, and gives an
# identical result here: a module object bound to 'resa2_trends'.
import importlib.util

resa2_trends_path = (r'C:\Data\James_Work\Staff\Heleen_d_W\ICP_Waters\TOC_Trends_Analysis_2015'
                     r'\Python\icpw\toc_trends_analysis.py')

# Build a module spec from the file path, then execute it to create the module
_spec = importlib.util.spec_from_file_location('toc_trends_analysis',
                                               resa2_trends_path)
resa2_trends = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(resa2_trends)
In [3]:
# ----------------------------------------------------------------------------
# Configuration
# ----------------------------------------------------------------------------
# CRU variable codes ('pre' = precipitation, 'tmp' = temperature) and the
# [start, end] year ranges over which trends are calculated
var_list = ['pre', 'tmp']
per_list = [[1990, 2012], [1990, 2004], [1998, 2012]]

# Input: Excel workbook of extracted climate series and CSV of station
# versus pixel elevations
clim_xls = (r'C:\Data\James_Work\Staff\Heleen_d_W\ICP_Waters\TOC_Trends_Analysis_2015'
            r'\CRU_Climate_Data\cru_climate_summaries.xlsx')
stn_xls = (r'C:\Data\James_Work\Staff\Heleen_d_W\ICP_Waters\TOC_Trends_Analysis_2015'
           r'\CRU_Climate_Data\cru_stn_elevs.csv')

# Output: CSV of summary statistics
out_csv = (r'C:\Data\James_Work\Staff\Heleen_d_W\ICP_Waters\TOC_Trends_Analysis_2015'
           r'\CRU_Climate_Data\icpw_climate_stats.csv')

# Output: folder for diagnostic trend plots
plot_fold = (r'C:\Data\James_Work\Staff\Heleen_d_W\ICP_Waters\TOC_Trends_Analysis_2015'
             r'\CRU_Climate_Data\plots')

# Toggle per-station plot generation (slow; only useful for testing)
make_plots = False
In [4]:
# Main processing loop: for every (variable, season, station, period)
# combination, calculate the long-term average (1990-2012 only), the
# Theil-Sen slope and the Mann-Kendall significance, and accumulate the
# results into one wide table (one row per station).

# Read stn elev data
stn_df = pd.read_csv(stn_xls)

# Get list of sites
stn_list = stn_df['stn_id'].unique()

# Dict to store output. defaultdict(list) means each new statistic key gets
# an empty list; values are appended in stn_list order, so all lists stay
# aligned with the station index used when building the output df below
data_dict = defaultdict(list)

# Loop over data: 'ann' = annual, 'jja' = Jun-Aug, 'jas' = Jul-Sep
for var in var_list:
    for tm in ['ann', 'jja', 'jas']:
        # Open the climate data. One worksheet per (variable, season) pair.
        # NOTE(review): 'sheetname' was renamed 'sheet_name' in pandas 0.21,
        # so this cell requires the older pandas this notebook was written for
        clim_df = pd.read_excel(clim_xls, sheetname='%s_%s' % (var, tm))

        # Loop over stations
        for stn in stn_list:
            # Filter the climate data for this station
            stn_clim_df = clim_df.query('stn_id == @stn')

            # Set date index so truncate() below can slice by date string
            stn_clim_df.index = stn_clim_df['time']
            del stn_clim_df['time']
            stn_clim_df = stn_clim_df.sort_index()

            # Correct temperatures according to lapse rate (0.6C/100m, per
            # Heleen's e-mail of 23/01/2017); precipitation is not corrected
            if var == 'tmp':
                # Get station and CRU-pixel elevations
                stn_elev = stn_df.query('stn_id == @stn')['elev_m'].values[0]
                px_elev = stn_df.query('stn_id == @stn')['px_elev_m'].values[0]

                # If pixel elev is negative (i.e. in sea), correct back to s.l.
                if px_elev < 0:
                    px_elev = 0

                # Calculate temperature difference based on 0.6C/100m.
                # Positive when the pixel is higher than the station, which
                # warms the series towards the (lower) station elevation
                t_diff = 0.6 * (px_elev - stn_elev) / 100.

                # Apply correction
                stn_clim_df['tmp'] = stn_clim_df['tmp'] + t_diff

            # Loop over time periods
            for per in per_list:
                # Truncate to the period of interest (inclusive of both ends)
                df = stn_clim_df.truncate(before='%s-01-01' % per[0],
                                          after='%s-12-31' % per[1])

                # Only need averages for 1990-2012
                if (per[0]==1990) and (per[1]==2012):
                    # Calculate long-term averages
                    key = '%s_%s_%s-%s_avg' % (var, tm, per[0], per[1])
                    val = df.mean()[var]
                    data_dict[key].append(val)

                # Calculate Sen's slope and add to dict
                # (theilslopes returns slope, intercept and the 95% CI bounds)
                sslp, icpt, lb, ub = theilslopes(df[var].values,
                                                 df['year'], 0.95)
                key = '%s_%s_%s-%s_slp' % (var, tm, per[0], per[1])
                data_dict[key].append(sslp)

                # Calculate MK signif and add to dict.
                # NOTE(review): res[3] is assumed to be the M-K p-value -
                # confirm against toc_trends_analysis.mk_test's return order
                res = resa2_trends.mk_test(df[var].values, str(stn), var)
                sig = res[3]
                key = '%s_%s_%s-%s_sig' % (var, tm, per[0], per[1])
                data_dict[key].append(sig)

                # Plot the series with the fitted Theil-Sen line (optional)
                if make_plots:
                    plt.plot(df['year'], df[var].values, 'bo-')
                    plt.plot(df['year'],
                             sslp*df['year'] + icpt,
                             'k-')
                    plt.title('%s %s at station %s (%s-%s)' % (tm, var, stn, per[0], per[1]),
                              fontsize=20)

                    # Save
                    png_path = os.path.join(plot_fold,
                                            '%s_%s_%s_%s-%s.png' % (stn, tm, var,
                                                                    per[0], per[1]))
                    plt.savefig(png_path, dpi=150)
                    plt.close()

# Build output df (one row per station, one column per statistic)
df = pd.DataFrame(data_dict, index=stn_list)

# Reorder columns alphabetically for readability
cols = df.columns
cols = sorted(cols)
df = df[cols]

# Save
df.to_csv(out_csv, index_label='stn_id')

df.head()
Out[4]:
In [5]:
# Process raw climate data from the previous (2016) analysis so it can be
# compared against the new statistics calculated above.

# File paths to the old precipitation and temperature summaries
pptn_csv = (r'K:\Prosjekter\langtransporterte forurensninger\O-23300 - ICP-WATERS - HWI'
            r'\Database\2015 DOC analysis\climate data\2016-02-02 from don\Precip_res_NEW_corr.csv')
temp_csv = (r'K:\Prosjekter\langtransporterte forurensninger\O-23300 - ICP-WATERS - HWI'
            r'\Database\2015 DOC analysis\climate data\2016-02-02 from don\Temp_res_NEW_corr.csv')

# Reshape each file to a station-by-statistic table and collect the results
frames = []
for csv_path in (pptn_csv, temp_csv):
    # Semicolon-delimited raw file
    raw_df = pd.read_csv(csv_path, delimiter=';')

    # Wide -> long: one row per (station, variable, statistic)
    melted = pd.melt(raw_df, id_vars=['StationID', 'Variable'],
                     var_name='Param', value_name='value')

    # Build combined lower-case column labels, e.g. 'precip_yr_senslope90_12'
    melted['variable'] = melted['Variable'].str.lower() + '_' + melted['Param'].str.lower()
    melted['station_id'] = melted['StationID']
    melted = melted.drop(['Param', 'Variable', 'StationID'], axis=1)

    # Long -> wide: stations as rows, combined labels as columns
    frames.append(melted.pivot(index='station_id', columns='variable',
                               values='value'))

# Concat pptn and temp tables side-by-side
df = pd.concat(frames, axis=1)

# Tidy: re-index by station_id and drop the residual columns-axis name
df = df.reset_index()
df.columns.name = None
df.index = df['station_id']
del df['station_id']

df.head()
Out[5]:
In [6]:
# Read the new data again (calculated earlier in this notebook and written
# to out_csv), indexed by station ID
new_df = pd.read_csv(out_csv, index_col=0, encoding='utf-8')

# Join the old-analysis statistics (df, built in the previous cell) onto the
# new statistics, matching on the station-ID index
df = new_df.join(df)
Plot a few examples to see how similar the results are.
In [7]:
# Compare estimates from the new and old analyses.
# The four panels below were originally four near-identical copy-pasted
# blocks; factoring them into a helper removes the duplication and keeps
# the panel definitions in one easy-to-scan list.

def compare_panel(ax, new_col, old_col, title):
    """Scatter old vs. new values for one statistic on the given axes.

    Args:
        ax:      matplotlib Axes to draw on
        new_col: column name of the newly calculated statistic (x-axis)
        old_col: column name of the old-analysis statistic (y-axis)
        title:   panel title

    Also draws new vs. new as a reference, which renders as the 1:1 line.
    """
    ax.scatter(df[new_col], df[old_col], label='')
    ax.plot(df[new_col], df[new_col], 'k-', label='Line with gradient 1')
    ax.set_title(title)
    ax.set_xlabel('New climate data')
    ax.set_ylabel('Old climate data')
    ax.legend(loc='best')

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))

# Annual slopes (pptn)
compare_panel(axes[0, 0], 'pre_ann_1990-2012_slp', 'precip_yr_senslope90_12',
              'Slope for annual precipitation (1990 to 2012)')

# Annual slopes p-vals (pptn)
compare_panel(axes[0, 1], 'pre_ann_1990-2012_sig', 'precip_yr_manken_pval_90_12',
              'p-values for annual precipitation (1990 to 2012)')

# Annual slopes (temp)
compare_panel(axes[1, 0], 'tmp_ann_1990-2012_slp', 'temp_yr_senslope90_12',
              'Slope for annual temperature (1990 to 2012)')

# Annual slopes p-vals (temp)
compare_panel(axes[1, 1], 'tmp_ann_1990-2012_sig', 'temp_yr_manken_pval_90_12',
              'p-values for annual temperature (1990 to 2012)')

plt.tight_layout()
The slope estimates (left column) are not identical, but the values are clearly related and are more-or-less evenly scattered around the 1:1 line. These differences are probably due to changes in the climate data as it has been updated, so I am not too worried about these.
The p-values, however, seem very different indeed: there is essentially no relationship between the values calculated in my script and those obtained in the previous climate analysis. Because the data has changed, I do not expect the p-values to be identical, but I'm a little surprised to see just how different they are.
As a check on my approach, I've manually extracted annual time series for two parameters at two of the sites and then calculated M-K statistics using the "trend" package in R. This is a completely independent package from anything in my analysis, and I would expect the R output for something as common as M-K to be pretty reliable. R reports the p-values to 5 decimal places and the results agree exactly with the output from my script. I am therefore reasonably confident that my code is correctly estimating significance levels.
What code was used for calculating p-values in the previous analysis?
Was the previous analysis analysing the same quantities/variables (i.e. means of annual means)?