In [1]:
import openturns as ot
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext autoreload
%autoreload 2
random_state = 123
np.random.seed(random_state)
The first example of conservative estimation considers an additive model $\eta : \mathbb R^d \rightarrow \mathbb R$ with Gaussian margins. The objective is to estimate a quantity of interest $\mathcal C(Y)$ of the model output distribution. Unfortunately, the dependence structure of the inputs is unknown. In order to be conservative, we aim to give bounds on $\mathcal C(Y)$.
This example considers a simple additive model.
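Since the model is additive with standard Gaussian margins, the effect of dependence is known in closed form for a Gaussian copula, which gives a useful reference for what follows. This block is a side check, not part of the depimpact workflow:
In [ ]:
alpha = 0.05
for rho in [-0.5, 0., 0.5, 0.9]:
    # Var(X1 + X2) = 2 * (1 + rho) for standard Gaussians with correlation rho.
    q = ot.Normal(0., np.sqrt(2. * (1. + rho))).computeQuantile(alpha)[0]
    print('rho = %+.1f -> %d%% quantile = %.3f' % (rho, alpha * 100, q))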
In [2]:
from depimpact.tests import func_sum
help(func_sum)
In [3]:
dim = 2
margins = [ot.Normal()]*dim
In [4]:
# Pair-copula family matrix: a nonzero entry below the diagonal sets a
# copula for the corresponding pair (family 1 is the Gaussian copula).
families = np.zeros((dim, dim), dtype=int)
families[1, 0] = 1
In [5]:
from depimpact import ConservativeEstimate
quant_estimate = ConservativeEstimate(model_func=func_sum, margins=margins, families=families)
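For intuition, here is an illustrative sketch of one inner step of such an estimation, not the actual depimpact implementation: fix a copula parameter, sample the joint input distribution, and evaluate the model.
In [ ]:
# Gaussian copula with correlation rho between the two standard Gaussian margins.
rho = 0.5
copula = ot.NormalCopula(ot.CorrelationMatrix(2, [1., rho, rho, 1.]))
joint = ot.ComposedDistribution([ot.Normal()] * 2, copula)
input_sample = np.asarray(joint.getSample(1000))
output_sample = input_sample.sum(axis=1)  # assuming func_sum adds its inputs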
First, we compute the quantile at independence
In [6]:
n = 1000
indep_result = quant_estimate.independence(n_input_sample=n, random_state=random_state)
We aim at minimizing the output quantile. To do that, we create a q_func object from the function quantile_func, which associates a probability $\alpha$ with a function that computes the empirical $\alpha$-quantile of a given sample.
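As a minimal sketch, such a quantity function could look like the following (the actual quantile_func implementation may differ):
In [ ]:
def sketch_quantile_func(alpha):
    # Maps a probability level to an empirical-quantile estimator.
    def q_func(sample, axis=0):
        return np.percentile(sample, alpha * 100., axis=axis)
    return q_func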
In [7]:
from depimpact import quantile_func
alpha = 0.05
q_func = quantile_func(alpha)
indep_result.q_func = q_func
The computation returns a DependenceResult instance. This object gathers the information of the computation and also computes the output quantity of interest (which can be changed afterwards).
In [8]:
sns.jointplot(indep_result.input_sample[:, 0], indep_result.input_sample[:, 1]);
In [9]:
h = sns.distplot(indep_result.output_sample_id, axlabel='Model output', label="Output Distribution")
plt.plot([indep_result.quantity]*2, h.get_ylim(), label='Quantile at %d%%' % (alpha*100))
plt.legend(loc=0)
print('Output quantile:', indep_result.quantity)
A bootstrap can be performed on the output quantity.
In [10]:
indep_result.compute_bootstrap(n_bootstrap=5000)
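As a rough sketch of what such a bootstrap presumably does, one can resample the output sample with replacement and recompute the empirical quantile each time:
In [ ]:
def sketch_bootstrap_quantile(output_sample, alpha, n_bootstrap=5000, seed=123):
    rng = np.random.RandomState(seed)
    n = len(output_sample)
    # n_bootstrap resamples with replacement, one empirical quantile each.
    idx = rng.randint(0, n, size=(n_bootstrap, n))
    return np.percentile(output_sample[idx], alpha * 100., axis=1)
A percentile confidence interval is then simply np.percentile of this bootstrap sample at the requested levels.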
And we can plot it
In [11]:
sns.distplot(indep_result.bootstrap_sample, axlabel='Output quantile');
In [12]:
ci = [0.025, 0.975]
quantity_ci = indep_result.compute_quantity_bootstrap_ci(ci)
h = sns.distplot(indep_result.output_sample_id, axlabel='Model output', label="Output Distribution")
plt.plot([indep_result.quantity]*2, h.get_ylim(), 'g-', label='Quantile at %d%%' % (alpha*100))
plt.plot([quantity_ci[0]]*2, h.get_ylim(), 'g--', label='%d%% confidence interval' % ((1. - (ci[0] + 1. - ci[1]))*100))
plt.plot([quantity_ci[1]]*2, h.get_ylim(), 'g--')
plt.legend(loc=0)
print('Quantile at independence: %.2f with a C.O.V. of %.1f %%' % (indep_result.boot_mean, indep_result.boot_cov))
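The C.O.V. here is presumably the coefficient of variation of the bootstrap sample (standard deviation over absolute mean); it can be checked directly:
In [ ]:
boot = indep_result.bootstrap_sample
print('C.O.V: %.1f %%' % (100. * np.std(boot) / np.abs(np.mean(boot))))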
First, we consider a grid-search approach in order to compare its performance with the iterative algorithm. The discretization can be done on the parameter space or on a concordance measure such as Kendall's tau. The example below shows a grid search on the parameter space.
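For a Gaussian copula, the two discretizations are equivalent up to a change of variable, since Kendall's tau and the correlation parameter are linked in closed form:
In [ ]:
# Greiner's relation for the Gaussian copula: rho = sin(pi * tau / 2).
tau_grid = np.linspace(-0.9, 0.9, 7)
rho_grid = np.sin(np.pi * tau_grid / 2.)
print(np.round(rho_grid, 3))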
In [31]:
K = 500
n = 10000
grid_type = 'lhs'
dep_measure = 'parameter'
grid_result = quant_estimate.gridsearch(n_dep_param=K, n_input_sample=n, grid_type=grid_type, dep_measure=dep_measure,
random_state=random_state)
The computation returns a ListDependenceResult instance, which is a list of DependenceResult instances with some additional aggregated attributes.
In [14]:
print('The computation did %d model evaluations.' % (grid_result.n_evals))
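With $K$ parameters and $n$ observations per parameter, the cost should presumably be about $K \times n$ model evaluations:
In [ ]:
# Expected cost of the grid search: one sample of size n per grid point.
print('Expected number of evaluations: %d' % (K * n))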
Let's set the quantity function and search for the minimum among the grid results.
In [15]:
grid_result.q_func = q_func
min_result = grid_result.min_result
print('Minimum quantile: {} at param: {}'.format(min_result.quantity, min_result.dep_param))
We can now plot the grid results. The figure below shows the output quantiles as a function of the dependence parameter.
In [17]:
plt.plot(grid_result.dep_params, grid_result.quantities, '.', label='Quantiles')
plt.plot(min_result.dep_param[0], min_result.quantity, 'ro', label='minimum')
plt.xlabel('Dependence parameter')
plt.ylabel('Quantile value')
plt.legend(loc=0);
As for the individual problem, we can also bootstrap, this time for each parameter. Because we have $K$ parameters, we bootstrap the $K$ samples, compute the $K$ quantiles for each bootstrap replication, and keep the minimum quantile of each replication.
In [18]:
grid_result.compute_bootstraps(n_bootstrap=5000)
boot_min_quantiles = grid_result.bootstrap_samples.min(axis=0)
boot_argmin_quantiles = grid_result.bootstrap_samples.argmin(axis=0).ravel().tolist()
boot_min_params = [grid_result.dep_params[idx][0] for idx in boot_argmin_quantiles]
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
sns.distplot(boot_min_quantiles, axlabel="Minimum quantiles", ax=axes[0])
sns.distplot(boot_min_params, axlabel="Parameters of the minimum", ax=axes[1])
Out[18]:
For the parameter with the most occurrences of the minimum, we compute its bootstrap mean.
In [19]:
# The parameter with the most occurrences of the minimum
boot_id_min = max(set(boot_argmin_quantiles), key=boot_argmin_quantiles.count)
boot_min_result = grid_result[boot_id_min]
boot_mean = boot_min_result.bootstrap_sample.mean()
boot_std = boot_min_result.bootstrap_sample.std()
print('Worst Quantile: {} at {} with a C.O.V. of {} %'.format(boot_min_result.boot_mean, boot_min_result.dep_param, boot_min_result.boot_cov*100.))
In [20]:
plt.plot(grid_result.kendalls, grid_result.quantities, '.', label='Quantiles')
plt.plot(min_result.kendall_tau, min_result.quantity, 'ro', label='Minimum quantile')
plt.xlabel("Kendall's tau")
plt.ylabel('Quantile')
plt.legend(loc=0);
As we can see, the bounds of the dependence measure can be restricted. We now bound the Kendall's tau of the pair and rerun the grid search.
In [21]:
# Bounds on the pair's Kendall's tau (presumably: the entry above the
# diagonal is the upper bound, the entry below is the lower bound).
bounds_tau = np.asarray([[0., 0.7], [0.1, 0.]])
quant_estimate.bounds_tau = bounds_tau
K = 20
n = 10000
grid_type = 'lhs'
grid_result = quant_estimate.gridsearch(n_dep_param=K, n_input_sample=n, grid_type=grid_type, random_state=random_state)
In [22]:
grid_result.q_func = q_func
min_result = grid_result.min_result
print('Minimum quantile: {} at param: {}'.format(min_result.quantity, min_result.dep_param))
In [23]:
plt.plot(grid_result.dep_params, grid_result.quantities, '.', label='Quantiles')
plt.plot(min_result.dep_param[0], min_result.quantity, 'ro', label='minimum')
plt.xlabel('Dependence parameter')
plt.ylabel('Quantile value')
plt.legend(loc=0);
The results can be saved in an HDF file and loaded back later.
In [25]:
filename = './result.hdf'
grid_result.to_hdf(filename)
In [26]:
from depimpact import ListDependenceResult
load_grid_result = ListDependenceResult.from_hdf(filename, q_func=q_func, with_input_sample=False)
In [27]:
np.testing.assert_array_equal(grid_result.output_samples, load_grid_result.output_samples)
In [28]:
import os
os.remove(filename)
The grid can also be restricted to the vertices of the dependence-parameter space; in that case the number of grid points is implied by the space, so K is set to None.
In [43]:
K = None
n = 1000
grid_type = 'vertices'
grid_result = quant_estimate.gridsearch(n_dep_param=K, n_input_sample=n, grid_type=grid_type, random_state=random_state)
In [50]:
grid_result.q_func = q_func
print("Kendall's tau: {}, Quantile: {}".format(grid_result.kendalls.ravel(), grid_result.quantities))
In [53]:
from depimpact.plots import matrix_plot_input
matrix_plot_input(grid_result.min_result);
We now increase the problem dimension to 5.
In [54]:
dim = 5
margins = [ot.Normal()]*dim
quant_estimate.margins = margins
In [55]:
families = np.zeros((dim, dim), dtype=int)
families[2, 0] = 1
quant_estimate.families = families
families
Out[55]:
In [56]:
quant_estimate.bounds_tau = None
quant_estimate.bounds_tau
Out[56]:
We reset the families and bounds on the current instance, rather than creating a new one, to check that the setters behave correctly.
In [57]:
quant_estimate.vine_structure
Out[57]:
Let's run the grid search and look at the results.
In [59]:
K = 20
n = 10000
grid_type = 'vertices'
grid_result = quant_estimate.gridsearch(n_dep_param=K, n_input_sample=n, grid_type=grid_type, random_state=random_state)
The quantile is lower compared to the two-dimensional problem. Indeed, with more variables there is more uncertainty, hence a larger deviation of the output.
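A quick analytic check of that claim, assuming func_sum simply adds its inputs: at independence the output variance equals the dimension $d$, so the quantile scales like $\sqrt d$.
In [ ]:
# At independence, Y is the sum of d standard Gaussians, i.e. N(0, d).
for d in [2, 5]:
    q = ot.Normal(0., np.sqrt(float(d))).computeQuantile(alpha)[0]
    print('d = %d -> %d%% quantile at independence = %.3f' % (d, alpha * 100, q))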
In [64]:
grid_result.q_func = q_func
min_result = grid_result.min_result
print('Worst Quantile: {} at {}'.format(min_result.quantity, min_result.dep_param))
matrix_plot_input(min_result)
Out[64]:
In [67]:
plt.plot(grid_result.dep_params, grid_result.quantities, '.', label='Quantiles')
plt.plot(min_result.dep_param[0], min_result.quantity, 'ro', label='Minimum')
plt.xlabel('Dependence parameter')
plt.ylabel('Quantile value')
plt.legend(loc=0);
We now set a pair-copula on every pair of inputs.
In [68]:
families = np.zeros((dim, dim), dtype=int)
# Gaussian copula (family 1) for every pair of inputs
for i in range(1, dim):
    for j in range(i):
        families[i, j] = 1
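Equivalently, this strictly lower-triangular matrix of ones can be built in one line:
In [ ]:
# Same family matrix, built with numpy: ones strictly below the diagonal.
families_alt = np.tril(np.ones((dim, dim), dtype=int), k=-1)
assert (families_alt == families).all()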
In [69]:
quant_estimate.margins = margins
quant_estimate.families = families
quant_estimate.vine_structure = None
quant_estimate.bounds_tau = None
quant_estimate.bounds_tau
In [71]:
K = 100
n = 1000
grid_type = 'lhs'
grid_result = quant_estimate.gridsearch(n_dep_param=K, n_input_sample=n, grid_type=grid_type, random_state=random_state)
In [153]:
min_result = grid_result.min_result
print('Worst Quantile: {0} at {1}'.format(min_result.quantity, min_result.dep_param))
We now set one pair back to independence (family 0) and create a new estimation instance.
In [72]:
families[3, 2] = 0
quant_estimate = ConservativeEstimate(model_func=func_sum, margins=margins, families=families)
In [73]:
K = 100
n = 10000
grid_type = 'lhs'
grid_result = quant_estimate.gridsearch(n_dep_param=K, n_input_sample=n, grid_type=grid_type,
q_func=q_func, random_state=random_state)
In [74]:
min_result = grid_result.min_result
print('Worst Quantile: {0} at {1}'.format(min_result.quantity, min_result.dep_param))
In [75]:
grid_result.vine_structure
Out[75]:
In [77]:
from depimpact.plots import matrix_plot_input
matrix_plot_input(min_result)
Out[77]:
The grid of dependence parameters can be saved and reused in a later run.
In [78]:
K = 100
n = 1000
grid_type = 'lhs'
grid_result_1 = quant_estimate.gridsearch(n_dep_param=K, n_input_sample=n, grid_type=grid_type, save_grid=True, grid_path='./output')
In [79]:
grid_result_2 = quant_estimate.gridsearch(n_dep_param=K, n_input_sample=n, grid_type=grid_type,
q_func=q_func, use_grid=0, grid_path='./output')
In [43]:
grid_result_1.n_input_sample, grid_result_2.n_input_sample
Out[43]:
Because the configurations are the same, we can gather the results of the two runs.
In [44]:
grid_result = grid_result_1 + grid_result_2
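The addition presumably concatenates, for each dependence parameter, the output samples of both runs; a rough sketch with plain arrays (sketch_merge is a hypothetical helper, not part of depimpact):
In [ ]:
# Hypothetical helper illustrating the aggregation: per-parameter
# concatenation of the output samples of two runs.
def sketch_merge(samples_1, samples_2):
    return [np.concatenate([s1, s2]) for s1, s2 in zip(samples_1, samples_2)]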
In [45]:
grid_result.n_input_sample
Out[45]: