The confidence intervals of a distribution parameters is implemented using a bootstrap technique in the confidence_intervals_bootstrap module, or cib.
The bootstrap technique according to [1] is implemented in the cib module using numpy and scipy.
Here this implementation is benchmarked against the equivalent R module fitdistrplus [2].
This is a python notebook using R code from the rpy2 module [3].
[1] Bootstrap confidence intervals
[2] R package fitdistrplus
https://cran.r-project.org/web/packages/fitdistrplus/fitdistrplus.pdf
[3] Using R inside Python
http://rpy2.readthedocs.io/en/version_2.8.x/introduction.html
This notebook checks the critical seastates found for the ITS deployment for the Visund Nord project, August 2017.
Document 072110C001 AB.00.00.300-RT-5804-0001 - General Deployment Analyses, Campaign 1, rev. 2
I:\DCC_STA\072110C001\TNO\Docs_PDF\AB.00.00.300-RT-5804-0001_221-2A-FM-O16-00010-.pdf
@author: rarossi
Created on Mon Oct 9 13:14:17 2017
In [23]:
import sys
import time
from scipy import stats as ss
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import itertools
# Import my modules
import quantilelib as ql
sys.path.append(r'C:\Users\rarossi\Documents\git\yoshimi\Qt\ResultsVisualiser')
import confidence_interval_bootstrap as cib
# Import of R objects
from rpy2 import robjects
from rpy2.robjects.packages import importr
rfitdistrplus = importr('fitdistrplus')
rstats = importr('stats')
rbase = importr('base')
In [24]:
# R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R
# Define the Gumbel distribution in R
robjects.r('''
# create functions for `gumbel`
dgumbel <- function(x, a, b) 1/b*exp((a-x)/b)*exp(-exp((a-x)/b))
pgumbel <- function(q, a, b) exp(-exp((a-q)/b))
qgumbel <- function(p, a, b) a-b*log(-log(p))
dgumbelmin <- function(x, a, b) 1/b*exp((x-a)/b)*exp(-exp((x-a)/b))
pgumbelmin <- function(q, a, b) 1-exp(-exp((q-a)/b))
qgumbelmin <- function(p, a, b) a+b*log(-log(1-p))
''')
# R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R
Out[24]:
In [25]:
# Load a Results.txt to use as test sample
# resfile = r'C:\Users\rarossi\Documents\git\yoshimi\Qt\ResultsVisualiser\Results.txt'
resfile = r'T:\Temp\AOA\ITS IDC\ITS splash zone - 18m radius\Results.txt'
df = pd.read_table(resfile)
variables = df.columns[3:5]
hss = set(df.WaveHs)
tps = set(df.WaveTp)
wds = set(df.WaveDirection)
# critical sea states for the ITS deployment
seastates = {'Winch1 Max Tension': [(2.0, 8, 195), (2.0, 9, 195), (2.2, 6, 195), (2.2, 7 , 165), (2.2, 7 , 180),
(2.2, 7 , 195), (2.2, 8 , 165), (2.2, 8 , 180), (2.2, 9 , 165), (2.2, 9 , 180),
(2.2, 10, 195), (2.4, 6 , 165), (2.4, 10, 165), (2.4, 10, 180), (2.4, 11, 180),
(2.4, 11, 195), (2.4, 12, 195), (2.4, 13, 195), (2.6, 11, 165), (2.6, 14, 195)],
'Sling3 Min Tension': [(2.4, 10, 165), (2.6, 7, 195), (2.6, 8, 195), (2.6, 10, 165), (2.6, 11, 165)]
}
In [32]:
def params_fit_cib(sample, tail):
"""Returns the distribution parameters, and the confidence interval for these parameters."""
stats_fun = ss.gumbel_r.fit if tail == 'upper' else ss.gumbel_l.fit
pfit = stats_fun(sample)
pci = cib.confidence_interval(sample, stats_fun, relative=False, repeat=200)
return pfit, pci
In [33]:
# R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R
# R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R
def params_fit_R(sample, tail):
"""Returns the distribution parameters, and the confidence intervals for these parameters."""
# Convert sample into a R FloatVector
rsample = robjects.FloatVector(sample)
# use method of moments for first estimate of parameters
b = np.std(sample)*np.sqrt(6)/np.pi
a = np.mean(sample) - b * np.euler_gamma
# Use R fitdist to fit a Gumbel distribution to sample
Rfun = "gumbel" if tail == 'upper' else "gumbelmin"
rfit = rfitdistrplus.fitdist(rsample, Rfun, start=rbase.list(a=a, b=b))
# Use R confint to calculate the confidence intervals for the fitted
# distro parameters
rci = np.array(rstats.confint(rfit)).transpose()
rfit = tuple(rfit[0])
# ATTENTION HERE
# The order of the returned rfit[0] seems to vary!
# Sometimes (loc, scale) is returned, and sometime (scale, loc).
# Need to check every time! And un/comment the lines below when needed.
#rfit = tuple(np.flipud(rfit))
#rci = np.fliplr(rci)
return rfit, rci
# R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R
# R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R
In [34]:
def get_y(params, x, tail):
"""distribution params, variate -> -log(-log(cdf))
If tail is 'lower', sf = 1 - cdf, is used instead of cdf.
"""
if tail == 'upper':
return -np.log(-ss.gumbel_r(*params).logcdf(x))
else:
return -np.log(-ss.gumbel_l(*params).logsf(x))
In [35]:
count = 0
for var, seasts in seastates.items():
for hs, tp, wd in seasts:
count += 1
sample = df[(df.WaveHs == hs) & (df.WaveTp == tp) & (df.WaveDirection == wd)][var]
if sample.empty:
continue
print('#'*80)
print('{} - Hs {} - Tp {} - wd {}'.format(var, hs, tp, wd))
tail = 'upper' if 'Max' in var else 'lower'
sample = np.array(sample)
# Now move into R and fit statistics on sample, and do the same in cib
rfit, rci = params_fit_R(sample, tail)
pfit, pci = params_fit_cib(sample, tail)
# ###
# Compare what has been done so far
print('Pack \t Loc \t Scale')
print('R \t %.3f \t %.3f' % (rfit))
print('Scipy \t %.3f \t %.3f' % (pfit))
print('Confidence Interval from R')
print(' 2.5%%\t %.4f\t%.4f' % tuple(rci[0]))
print('97.5%%\t %.4f\t%.4f' % tuple(rci[1]))
print('Confidence Interval from cib')
print(' 2.5%%\t %.4f\t%.4f' % tuple(pci[0]))
print('97.5%%\t %.4f\t%.4f' % tuple(pci[1]))
print('Relative % differences')
print(str((100 - 100*pci/rci).round(2)))
# Show graphically the differences
# plot sample
sample.sort()
y = ql.llquantiles(len(sample)) if tail == 'upper' else ql.llsurvivals(len(sample))
plt.plot(sample, y, 'o', label='sample')
# plot best fit and confidence intervals from cib
x = sample.min(), sample.max()
y = get_y(pfit, x, tail).transpose()
plt.plot(x, y, '--b', label='cib')
y = np.array([get_y(p, x, tail) for p in pci]).transpose()
plt.plot(x, y, '--b')
# plot confidence intervals from R
y = get_y(rfit, x, tail).transpose()
plt.plot(x, y, 'r', label='R')
y = np.array([get_y(p, x, tail) for p in rci]).transpose()
plt.plot(x, y, 'r')
plt.title('{} - Hs {} - Tp {} - wd {}'.format(var, hs, tp, wd))
plt.legend(loc='best')
plt.grid()
#plt.savefig(r'R_in_py\{}-Hs{}-Tp{}-wd{}.png'.format(var, hs, tp, wd))
plt.show()
plt.close()
In [38]:
print(time.asctime())
In [ ]: