In [1]:
import os
import numpy as np
from scipy.stats import gaussian_kde
from scipy.spatial import distance
from scipy.cluster import hierarchy
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas.tools.plotting as pdplt
import seaborn as sns
# D3 plots with pandas objects in the notebook
# https://github.com/mlunacek/smash
import smash
%matplotlib inline
The analysis performed in this notebook is based on the following NA direct-search run:
In [2]:
na_run_name = "na_run20"
In this run, the NA parameters were chosen for exhaustive exploration of the model parameter space rather than fine optimization (1024 samples randomly generated at the first iteration, then 3 samples generated in each of the 32 best-fit Voronoi cells at each subsequent iteration, i.e., 3 × 32 = 96 new samples per iteration).
In [3]:
# Inversion run directory
na_run_dir = os.path.abspath(
    os.path.join(os.pardir, na_run_name)
)

# NA inversion parameters
na_params_file = os.path.join(na_run_dir, "input", "na.in")
na_in = pd.read_table(na_params_file, sep=':', comment='#',
                      engine='python', header=None, usecols=[0])
pnames = ('algorithm', 'n_iter', 'n_spl_init', 'n_spl_iter',
          'n_cell_iter', 'seed', 'init_type', 'verbose',
          'time_mode', 'debug_mode')
ptypes = (int, int, int, int, int, str, int, int, str, str)
pvalues = (t(p) for t, p in zip(ptypes, na_in.get(0)))
na_params = dict(zip(pnames, pvalues))

# Model parameter search ranges
p_ranges_file = os.path.join(na_run_dir, "input", "p_ranges.in")
p_ranges = pd.read_csv(p_ranges_file,
                       index_col='name',
                       dtype={'log': bool})

# NA search samples
na_samples_file = os.path.join(na_run_dir, "output", "na_samples.out")
na_samples = pd.read_table(na_samples_file)
Some variables need to be re-calculated, like the intermediate statistics $\overline{D}*$ and RMSD that were used to compute the misfit $M$.
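For reference, here are the two statistics as implemented in the cell below, with $D_i$ the K-S statistics computed for each soil class and topography derivative, $c_i$ the corresponding cell counts used as coverage weights, and $p_k$ the relative coverage of the $n = 4$ soil thickness classes:

$$\overline{D}^* = \frac{\sum_i w_i D_i}{\sum_i w_i}, \qquad w_i = \frac{2}{\pi} \arctan\!\left(\frac{\pi c_i}{80}\right), \qquad \mathrm{RMSD} = \sqrt{\frac{1}{n} \sum_{k=1}^{n} \left(p_k^{\mathrm{obs}} - p_k^{\mathrm{sim}}\right)^2}$$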
In [4]:
# correct a bug in CLICHE (wrong `kappa` values),
# which fortunately had no impact on the NA search results
na_samples['P_0'] /= 4.
p_ranges.loc['P_0', 'min'] /= 4.
p_ranges.loc['P_0', 'max'] /= 4.
# select dataset columns for calculations below
ks_cols = [c for c in na_samples.columns if '_D_' in c]
na_samples_ks = na_samples.loc[:, ks_cols]
sp_cols = [c for c in na_samples.columns if c.startswith('SP_')]
na_samples_sp = na_samples.loc[:, sp_cols]
# re-calculate the weighted mean of the K-S statistics that were
# computed for each soil class and topography derivative
n_cells = 183 # total number of cells in the mesh used for forward modelling
na_samples_sc = (na_samples_sp * n_cells).astype('int')
ci = pd.concat([na_samples_sc] * (len(ks_cols) // len(sp_cols)),
               axis=1)
# the commented line below reproduces the equation error in the
# thesis manuscript; the correct equation follows
#weights = (2. / np.pi) * np.arctan(ci * 80 / np.pi)
weights = 2.0 * np.arctan(ci / (80.0 / np.pi)) / np.pi
weights.columns = ks_cols
d_mean = (weights.values * na_samples_ks.values).sum(axis=1)
d_mean /= weights.sum(axis=1)
na_samples.loc[:, 'D_mean'] = d_mean
# re-calculate the RMSD values from the sample proportions of each model
sp_obs = [0.03693368, 0.17780696, 0.55456336, 0.23069599] # observed sample prop
ssd = ((sp_obs - na_samples_sp)**2).sum(axis=1)
na_samples.loc[:, 'RMSD'] = np.sqrt(ssd / len(sp_obs))
# assign the iteration number of the NA direct-search to each sample
na_iter = np.concatenate(
    [np.zeros(na_params['n_spl_init']),
     np.repeat(np.arange(1, na_params['n_iter']),
               na_params['n_spl_iter'])]
)
na_samples.loc[:, 'NA_iter'] = na_iter[:len(na_samples)]
# drop columns that are not very useful anymore
na_dataset = na_samples.drop(ks_cols + sp_cols + ['mname'],
                             axis=1)
In [5]:
# model parameters that are searched on a log scale
p = p_ranges[p_ranges['log']].index
Below are some functions to transform the scale of the model parameters (useful for some of the analyses performed in this notebook).
In [6]:
# transform log search scales
def log_df(df):
    pnames = p_ranges[p_ranges['log']].index
    pnames_log = ["log({})".format(p) for p in pnames]
    df_log = df.copy()
    df_log.loc[:, pnames] = np.log(df.loc[:, pnames])
    df_log.rename(columns=dict(zip(pnames, pnames_log)),
                  inplace=True)
    return df_log

def delog_df(df_log):
    pnames = p_ranges[p_ranges['log']].index
    pnames_log = ["log({})".format(p) for p in pnames]
    df = df_log.copy()
    df.loc[:, pnames_log] = np.exp(df_log.loc[:, pnames_log])
    df.rename(columns=dict(zip(pnames_log, pnames)),
              inplace=True)
    return df

# normalize or de-normalize each parameter scale
def norm_df(df):
    return (df - df.mean()) / (df.max() - df.min())

def denorm_df(df_norm, df):
    return df_norm * (df.max() - df.min()) + df.mean()
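As a quick sanity check, each pair of helpers should round-trip (a minimal sketch, assuming the log-scaled parameters only take strictly positive values, which their search ranges guarantee):

# round-trip checks (illustrative)
df = na_dataset.loc[:, p_ranges.index]
assert np.allclose(delog_df(log_df(df)).values, df.values)
assert np.allclose(denorm_df(norm_df(df), df).values, df.values)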
Some additional things for fancy visualization.
In [7]:
# map dataframe column names to math-formatted variables
na_labels = {
'P_0': '$P_0$',
'K_r12': '$K_r$',
'K_d': '$K_d$',
'K_dd': '$K_{dd}$',
'K_g': '$K_g$',
'h_init': '$h_{t0}$',
'misfit': '$M$',
'D_mean': '$\overline{D}*$',
'RMSD': 'RMSD',
'cluster': 'cluster'
}
# format all floats when displaying Pandas dataframes
pd.options.display.float_format = '{:,.3e}'.format
# get rid of Pandas's SettingWithCopyWarning
pd.options.mode.chained_assignment = None
# seaborn's categorical and quantitative color palettes
cat_clr = [mpl.colors.rgb2hex(rgb)
           for rgb in sns.color_palette(n_colors=20)]
quant_cmap = sns.cubehelix_palette(start=.5, rot=-.75,
                                   as_cmap=True, reverse=True)
Below are the inputs used for the NA direct-search run analysed here (model parameter search ranges/scales and NA parameters).
In [8]:
p_ranges
Out[8]:
In [9]:
na_params
Out[9]:
The misfit function $M$ that we use for our inversion is a complex function that combines several "intermediate" statistics. Before going any further and seeing if - and how - the NA direct-search algorithm has converged towards one or several solutions, we should first look at how the intermediate statistics interact with each other and how this influences the misfit. It may help us to better understand how the NA direct-search has evolved towards regions of better data fit in the model parameter space.
The code below produces three scatter plots showing the relationships between the RMSD, $\overline{D}*$ and $M$ values that were computed for each NA-sampled model.
In [10]:
def scatter_misfit_stats(dataset):
    """
    Scatter plots of misfit vs. intermediate statistics.
    """
    fig, axes = plt.subplots(1, 3, figsize=(10, 3))
    col_pairs = (('RMSD', 'D_mean'),
                 ('RMSD', 'misfit'),
                 ('D_mean', 'misfit'))
    for (x, y), ax in zip(col_pairs, axes):
        dataset.plot(kind='scatter', x=x, y=y,
                     ax=ax, alpha=0.2, c=cat_clr[0])
        plt.setp(ax, xlabel=na_labels[x],
                 ylabel=na_labels[y])
    fig.tight_layout()
    return fig, axes

_ = scatter_misfit_stats(na_dataset)
Looking at the first plot, we see that the relationship between $\overline{D}*$ and RMSD is globally positive and thus consistent, i.e., a good (bad) fit to the observed distributions of slope/curvature/drainage corresponds to a good (bad) fit to the relative coverage of each soil thickness class. This positive relationship is, however, strongly non-linear and much weaker for low RMSD values, where the $\overline{D}*$ values are stuck in the [0.2, 0.3] interval (no better fit could be obtained). It is indeed clear in the last two scatter plots of $M$ vs. $\overline{D}*$ and $M$ vs. RMSD that RMSD mostly contributes to the variability of $M$ for $M$ values below -3.5. $M$ is also very sensitive to small differences in low RMSD values.
Note that two categories of models don't follow the positive relationship between the two statistics: (1) a set of models with the lowest $\overline{D}*$ values and RMSD values between 0.2 and 0.3, and (2) another set of models with abnormally low $\overline{D}*$ values at RMSD values around 0.45.
Let's see if these models can be considered as "outliers", i.e., whether we can rely on their computed value of $\overline{D}*$. The latter is indeed less meaningful in cases where one or more simulated soil thickness classes don't have enough coverage (i.e., a very low number of low-resolution regridded cells) to adequately characterize their distributions, hence the use of an average weighted by the coverage. Due to their low computed weight values, these classes don't contribute much to the computation of $\overline{D}*$.
The code below generates the same scatter plots as above, but from a dataset in which we removed the NA-sampled models that have at least 2 soil thickness classes with a relative coverage < 5%.
In [11]:
# cell count proportion threshold
sp_thresh = 0.05
# minimum number of classes where proportion < threshold
nc_min = 2
outliers = (na_samples_sp < sp_thresh).sum(axis=1) >= nc_min
_ = scatter_misfit_stats(na_dataset[~outliers])
In these last scatter plots, we effectively see that the second category of models identified above is no longer shown; these models can therefore be considered as outliers! Note that these identified outliers had no real influence on the behavior of the NA direct-search, as they all have quite high misfit values.
The models of the first category (lowest $\overline{D}*$ values and RMSD values between 0.2 and 0.3) are still valid given the exclusion criterion above, but the simulations from these models resulted in soil thickness configurations on the synthetic hill that are too inconsistent with the overall configuration seen on the soil map, hence the need to compare the relative coverage of the soil thickness classes (RMSD) in addition to comparing the distributions of terrain derivatives ($\overline{D}*$).
A good way to visualize the convergence of the NA is to generate trace plots of the models sampled at each NA iteration. It is interesting to show the convergence of the algorithm for the misfit and its intermediate statistics, as well as for each model parameter forming the search space. In the subplots below, each dot represents a NA-sampled model. These dots are drawn with some degree of alpha transparency so that it is easier to identify the areas of high sampling density.
In [12]:
fig, axes = plt.subplots(3, 3, figsize=(10, 6))
for col, ax in zip(na_dataset.columns, axes.flatten()):
    if col in p_ranges.index and p_ranges.loc[col, 'log']:
        logy = True
    else:
        logy = False
    if col in ['misfit', 'D_mean', 'RMSD']:
        pt_clr = cat_clr[1]
    else:
        pt_clr = cat_clr[0]
    na_dataset.plot(kind='scatter', x='NA_iter', y=col, ax=ax,
                    logy=logy, s=50, alpha=0.15, color=pt_clr)
    if col in p_ranges.index:
        plt.setp(ax, ylim=p_ranges.loc[col, ['min', 'max']])
    plt.setp(ax, xlim=[-1, na_params['n_iter']])
    plt.setp(ax, ylabel=na_labels[col], xlabel='NA iteration')
fig.tight_layout()
These trace plots show remarkable convergence patterns, even though the NA has not converged towards a unique solution. The $\overline{D}*$ values were rapidly minimized after only a few iterations, though they remain stuck in the [0.2, 0.3] interval. Although it took more iterations to minimize the RMSD values, the last iterations don't show much evolution in either the computed $\overline{D}*$ or RMSD values. The trace of the $M$ (misfit) values, however, shows a significant evolution even during the last iterations, but this is mainly due to the very small lowering of the computed RMSD values, which were already close to 0 at the end of the run. Given our purpose, which consists of exploring the parameter space rather than performing a fine calibration of the model parameters, we can thus reasonably say that the NA direct-search has run for a sufficient number of iterations.
The trace plots represent only a limited view of the convergence patterns. We would have a clearer view of these patterns if we plotted the distribution of the whole ensemble of NA-sampled models. This would allow us to see more precisely the regions of best data fit, i.e., the regions that have been preferentially sampled by the NA direct-search algorithm as the iterations proceeded. The plots below show a histogram and a Kernel Density Estimate (KDE) of the NA-sampled models for each model parameter. These plotted distributions don't include the models that were generated randomly during the first NA iteration.
In [13]:
fig, axes = plt.subplots(2, 3, figsize=(8, 5))
for col, ax in zip(na_dataset.columns, axes.flatten()):
    # don't include samples from 1st NA iteration
    col_data = na_dataset[na_params['n_spl_init']:].get(col)
    xlim = p_ranges.loc[col, ['min', 'max']].values.astype('f')
    # calculate KDE on log or linear scale
    if p_ranges.loc[col, 'log']:
        bins = np.logspace(*np.log10(xlim), num=20)
        kde_x = np.logspace(*np.log10(xlim), num=200)
        kde_gen = gaussian_kde(np.log10(col_data))
        kde_y = kde_gen.evaluate(np.log10(kde_x))
        plt.setp(ax, xscale='log')
    else:
        bins = np.linspace(*xlim, num=20)
        kde_x = np.linspace(*xlim, num=200)
        kde_gen = gaussian_kde(col_data)
        kde_y = kde_gen.evaluate(kde_x)
    sns.distplot(col_data, bins=bins, kde=False, hist=True, ax=ax)
    ax2 = ax.twinx()
    ax2.plot(kde_x, kde_y, c=cat_clr[0])
    plt.setp(ax, yticks=[], xlim=xlim, xlabel=na_labels[col])
    plt.setp(ax2, yticks=[])
fig.tight_layout()
We see that the distribution of models is at least bimodal for most of the parameters, though we note a well-defined mode of high sampling density for the $P_0$, $K_d$ and $K_g$ parameters. In the trace plots above, we see that these modes correspond to value intervals that were already sampled early in the evolution of the NA run.
Because they are 1-dimensional, the trace plots and histograms above give us only limited information about the distribution of the NA-sampled models in the 6-dimensional parameter search space. A conventional, though incomplete, way to view the multi-dimensional search space is to draw a scatterplot matrix showing the model distributions projected on the 2-dimensional spaces formed by pairs of model parameters. This allows us to see possible relationships between parameters.
In [14]:
def scatter_matrix_p(dataset, carray, **kwargs):
    """
    Scatter plot matrix of (a subset of the)
    NA sampled models (`dataset`), colored with
    `carray`.
    """
    # scatter matrix
    axes = pdplt.scatter_matrix(dataset, c=carray,
                                edgecolors='none',
                                grid=False, **kwargs)
    fig = axes[0][0].figure
    # colorbar
    dummy_scatter = axes[0][0].scatter(
        carray, carray, c=carray,
        cmap=kwargs.get('cmap'),
        norm=kwargs.get('norm')
    )
    cax = fig.add_axes([0.2, 0.05, 0.6, 0.02])
    clb = plt.colorbar(dummy_scatter, cax=cax,
                       orientation='horizontal')
    clb.set_label(na_labels[carray.name])
    # clear diagonal subplots
    n_diag = len(dataset.columns)
    for i in range(n_diag):
        plt.setp(axes[i][i], axis_bgcolor='w')
        axes[i][i].clear()
    # axis limits (and log-scale if needed)
    for i in range(n_diag):
        icol = dataset.columns[i]
        ylim = p_ranges.loc[icol, ['min', 'max']]
        for j in range(n_diag):
            jcol = dataset.columns[j]
            xlim = p_ranges.loc[jcol, ['min', 'max']]
            plt.setp(axes[i][j], xlim=xlim, ylim=ylim)
            if p_ranges.loc[icol, 'log']:
                plt.setp(axes[i][j], yscale='log')
            if p_ranges.loc[jcol, 'log']:
                plt.setp(axes[i][j], xscale='log')
    # axis labels
    for i in range(n_diag):
        icol = dataset.columns[i]
        label = na_labels[icol]
        plt.setp(axes[-1][i], xlabel=label)
        plt.setp(axes[i][0], ylabel=label)
    # hide grid
    for ax in axes.flatten():
        ax.grid(False)
    # subplots spacing
    fig.subplots_adjust(wspace=0.02, hspace=0.02, bottom=0.13)
    return fig, axes

# sort and subset data (model parameters only)
na_sorted = na_dataset.sort('misfit', ascending=False)
na_subset = na_sorted.loc[:, p_ranges.index]
_ = scatter_matrix_p(na_subset, na_sorted['misfit'],
                     s=50, cmap=quant_cmap, alpha=0.4,
                     figsize=(10, 11))
These 2-dimensional scatter plots show us that the misfit distribution in the 6-dimensional parameter space is definitely not as simple as what the histograms plotted above suggest. Especially when looking at the plots of $P_0$ vs. the other parameters, what is striking is that the convergence patterns only incompletely describe some complex relationships between the model parameters. Models of relatively good data fit show, for example, a well-defined linear relationship between $\log(P_0)$ and $\log(K_{dd})$, although better-fitted models at $P_0 \sim 3 \cdot 10^{-5}$ show that the relationship is even more complex. All plots of $P_0$ vs. the transport parameters also tend to partially show a positive correlation.
From what we can see in the plots generated so far, it is likely that the NA search has been trapped in local minima at some point, despite the exploration-oriented choice of the NA parameters. This is due to the complexity of the misfit rather than to a lack of power of the search algorithm, which is based on a Gibbs sampler that is itself not the most efficient at handling high misfit correlations among the parameters.
We see, however, that the sporadic areas of best data fit are quite well spread over the parameter search space. The results are thus still satisfactory and hopefully form a good basis for studying and characterizing the main features of the shape of the misfit in its - possibly continuous - low-value regions.
Another way to visualize the multi-dimensional search space is the parallel-coordinate plot. Unlike the scatterplot matrix, it has the advantage of showing all dimensions (parameters) at once. While it is not very efficient at characterizing continuous relationships between parameters, it can greatly help to identify discrete patterns among the multi-dimensional data, which makes it perfectly suited for our problem.
The code below generates a parallel-coordinate plot of the NA dataset. As too much data can make the plot unreadable, we only show the sampled models for which the misfit is below the characteristic value of -3.5 identified in the $M$ vs. $\overline{D}*$ and $M$ vs. RMSD plots above.
The plot is interactive: it is possible to click and drag along any axis to specify a filter for that dimension (brushing). The plot is built using d3.js and smash.
In [15]:
# show only models for misfit values lower than
misfit_max = -3.5
# select the subset and transform log search scales
na_misfit_sel = na_dataset.misfit < misfit_max
na_subset_log = log_df(na_dataset[na_misfit_sel])
# build and show the interactive plot
smash.parallel(data=na_subset_log).show()
Some interesting patterns appear when filtering on the misfit axis. If we filter the models on that axis from the bottom limit up to around -5.4, we see a small number of well-defined clusters. Enlarging the misfit filter interval to values higher than -5.4 doesn't seem to increase the number of clusters in the plot; it only increases the variance of the cluster distributions along each dimension.
Hence, we can apply automatic clustering methods to delineate the clusters. Since we don't know a priori the number of clusters (some clusters seem to overlap in the parallel-coordinate plot, which may be confusing), we choose a hierarchical clustering method.
Clustering is applied only to models which have a misfit value lower than -5.4. In order to give equal weight to each model parameter, the scales are first log-transformed (if needed) and normalized. Clustering is then based on the Euclidean distances between models in this normalized parameter space. The distance between two clusters is defined as the average distance between each pair of models taken from these two clusters (average linkage).
In [16]:
# apply clustering only to models with misfit lower than
misfit_max = -5.4
# remove the following variables from the dataset before clustering
drop_cols = ['misfit', 'D_mean', 'RMSD']
# select the subset, transform log search scales, normalize
# and select the variables used for clustering
na_misfit_sel = na_dataset.misfit < misfit_max
na_subset = na_dataset[na_misfit_sel]
na_subset.drop('NA_iter', axis=1, inplace=True)
na_subset_log = log_df(na_subset)
na_subset_norm = norm_df(na_subset_log)
na_subset_norm_p = na_subset_norm.drop(drop_cols, axis=1)
# compute the distance matrix (Euclidean distances by default)
dist_matrix = distance.cdist(na_subset_norm_p, na_subset_norm_p)
# compute the linkage matrix (average method)
link_matrix = hierarchy.linkage(distance.squareform(dist_matrix),
                                method='average')
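Note that the condensed distance matrix can also be computed directly with `distance.pdist`, avoiding the full square matrix and the `squareform` round-trip; under the default Euclidean metric this should yield an identical linkage:

# equivalent: build the linkage from the condensed distance matrix
link_matrix = hierarchy.linkage(distance.pdist(na_subset_norm_p),
                                method='average')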
The code below generates the dendrogram, i.e., the cluster tree obtained by the hierarchical clustering algorithm. The x-axis of the plot is proportional to the distances between the clusters. We also define a cluster distance threshold so that we can use different colors to arbitrarily distinguish between clusters in the dendrogram. The y-axis labels are the model ids (NA sample indices).
In [17]:
# cluster distance threshold
dist_thresh = 0.6
fig, ax = plt.subplots(figsize=(6, 9))
hierarchy.set_link_color_palette(cat_clr)
dendrogram = hierarchy.dendrogram(link_matrix,
                                  color_threshold=dist_thresh,
                                  labels=na_subset_norm.index,
                                  orientation='left', ax=ax)
The dendrogram shows that 5 clusters quickly form. They are all quite homogeneous, i.e., the distances between their lower-level sub-clusters are small (< 0.2, except for one cluster). The higher-level clusters are much more distant from each other.
We therefore choose a number of clusters = 5 as the criterion for cluster delineation.
In [18]:
# clustering criterion = number of clusters
n_clusters = 5
clusters = hierarchy.fcluster(link_matrix, n_clusters,
                              criterion='maxclust')
na_subset['cluster'] = clusters
na_subset_norm['cluster'] = clusters
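As a quick, optional check of the delineation, we can look at how many models fall into each cluster:

# number of models assigned to each cluster
na_subset['cluster'].value_counts().sort_index()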
We then calculate the cluster centroids, which consist of the means of the model parameter values in each cluster. Note that the table below also shows the centroid values for $M$, $\overline{D}*$ and RMSD, even though these variables were not taken into account in the clustering. Their centroid values are also mean values and thus don't correspond to the $M$, $\overline{D}*$ and RMSD values that would be computed from the centroid models.
In [19]:
centroids_norm = na_subset_norm.groupby('cluster').mean()
centroids = delog_df(denorm_df(centroids_norm, na_subset_log))
centroids
Out[19]:
We then generate a second parallel-coordinate plot, this time with a color code for each cluster (the same as in the dendrogram). The thick lines represent the centroids, while the thin, transparent lines represent the individual models that belong to each cluster. Note that normalized values are used on each axis.
In [20]:
fig, ax = plt.subplots(figsize=(8, 6))
pdplt.parallel_coordinates(
    na_subset_norm.sort('cluster'), 'cluster',
    alpha=0.25, color=cat_clr, ax=ax
)
ax.legend_.remove()
lines = []
for c, ctr in enumerate(centroids_norm.values):
    l, = ax.plot(ctr, linewidth=3,
                 label='cluster {}'.format(c + 1))
    lines.append(l)
leg = ax.legend(handles=lines, loc='lower right')
A scatterplot matrix of the clusters may also help us to visualize them.
In [21]:
# define discrete colormap from categorical color list
cmap = mpl.colors.ListedColormap(cat_clr[:n_clusters])
bounds = np.arange(1, n_clusters + 2)
norm = mpl.colors.BoundaryNorm(bounds, cmap.N)
# scatter plots
fig, axes = scatter_matrix_p(na_subset.loc[:, p_ranges.index],
                             na_subset['cluster'],
                             cmap=cmap, norm=norm,
                             alpha=0.2, s=160,
                             figsize=(10, 11))
# add cluster centroids to the scatter plots
na_labels_rev = {v: k for k, v in na_labels.items()}
axes_lu = axes.copy()
np.fill_diagonal(axes_lu, 0)
for ax in axes_lu[np.nonzero(axes_lu)]:
    xlabel = na_labels_rev.get(ax.get_xlabel()) or ax.get_xlabel()
    ylabel = na_labels_rev.get(ax.get_ylabel()) or ax.get_ylabel()
    x_centroids = centroids[xlabel]
    y_centroids = centroids[ylabel]
    ax.scatter(x_centroids, y_centroids,
               c=centroids.index, s=40,
               cmap=cmap, norm=norm)
We note two main characteristics of the cluster configuration: (1) the values of $P_0$ and $K_g$ are nearly the same for clusters 3, 4 and 5, despite the diverging values for the other parameters, and (2) the $\overline{D}*$ and RMSD values are also well clustered, even though they were not taken into account in the clustering.
The results from forward-modelling in each cluster will be further investigated.
Authors: B. Bovy, J. Braun, A. Demoulin
F.R.S-FNRS | UGPQ-Ulg | ISTerre-Université de Grenoble
This work is licensed under a Creative Commons Attribution 4.0 International License.
Software and versions used:
In [22]:
%load_ext version_information
%version_information numpy, scipy, pandas, matplotlib, seaborn
Out[22]: