jupyter nbconvert mv_kecdf_frechet.ipynb --to slides --post serve
In [1]:
from __future__ import division
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as spst
import statsmodels.api as sm
from scipy import optimize
from statsmodels.nonparametric import kernels
kernel_func = dict(wangryzin=kernels.wang_ryzin,
                   aitchisonaitken=kernels.aitchison_aitken,
                   gaussian=kernels.gaussian,
                   aitchison_aitken_reg=kernels.aitchison_aitken_reg,
                   wangryzin_reg=kernels.wang_ryzin_reg,
                   gauss_convolution=kernels.gaussian_convolution,
                   wangryzin_convolution=kernels.wang_ryzin_convolution,
                   aitchisonaitken_convolution=kernels.aitchison_aitken_convolution,
                   gaussian_cdf=kernels.gaussian_cdf,
                   aitchisonaitken_cdf=kernels.aitchison_aitken_cdf,
                   wangryzin_cdf=kernels.wang_ryzin_cdf,
                   d_gaussian=kernels.d_gaussian)
def gpke(bwp, dataxx, data_predict, var_type, ckertype='gaussian',
         okertype='wangryzin', ukertype='aitchisonaitken', tosum=True):
    r"""Returns the non-normalized Generalized Product Kernel Estimator"""
    kertypes = dict(c=ckertype, o=okertype, u=ukertype)
    Kval = np.empty(dataxx.shape)
    for ii, vtype in enumerate(var_type):
        func = kernel_func[kertypes[vtype]]
        Kval[:, ii] = func(bwp[ii], dataxx[:, ii], data_predict[ii])
    iscontinuous = np.array([c == 'c' for c in var_type])
    dens = Kval.prod(axis=1) / np.prod(bwp[iscontinuous])
    #dens = np.nanprod(Kval, axis=1) / np.prod(bwp[iscontinuous])
    if tosum:
        return dens.sum(axis=0)
    else:
        return dens
class LeaveOneOut(object):
    def __init__(self, X):
        self.X = np.asarray(X)

    def __iter__(self):
        X = self.X
        nobs, k_vars = np.shape(X)
        for i in range(nobs):
            index = np.ones(nobs, dtype=bool)  # np.bool is deprecated; use the builtin
            index[i] = False
            yield X[index, :]
def loo_likelihood(bww, datax, var_type, func=lambda x: x):
    #print(bww)
    LOO = LeaveOneOut(datax)
    L = 0
    for i, X_not_i in enumerate(LOO):
        f_i = gpke(bww, dataxx=-X_not_i, data_predict=-datax[i, :],
                   var_type=var_type)
        L += func(f_i)
    return -L
def get_bw(datapfft, var_type, reference):
    # Using leave-one-out likelihood
    # the initial value for the optimization is the normal_reference
    # h0 = normal_reference()
    data = adjust_shape(datapfft, len(var_type))
    h0 = reference
    fmin = lambda bb, funcx: loo_likelihood(bb, data, var_type, func=funcx)
    bw = optimize.fmin(fmin, x0=h0, args=(np.log, ),
                       maxiter=1e3, maxfun=1e3, disp=0, xtol=1e-3)
    # bw = self._set_bw_bounds(bw)  # bound bw if necessary
    return bw
def adjust_shape(dat, k_vars):
    """Returns an array of shape (nobs, k_vars) for use with `gpke`."""
    dat = np.asarray(dat)
    if dat.ndim > 2:
        dat = np.squeeze(dat)
    if dat.ndim == 1 and k_vars > 1:     # one obs, many vars
        nobs = 1
    elif dat.ndim == 1 and k_vars == 1:  # one var, many obs
        nobs = len(dat)
    else:
        if np.shape(dat)[0] == k_vars and np.shape(dat)[1] != k_vars:
            dat = dat.T
        nobs = np.shape(dat)[0]          # ndim > 1, so many obs, many vars
    dat = np.reshape(dat, (nobs, k_vars))
    return dat
%alias_magic t timeit
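For a quick sanity check, here is a minimal usage sketch of the helpers above on toy data (my own example, not from the original notebook; the Silverman-style starting value mirrors what statsmodels' `_normal_reference()` does):
In [ ]:
# Toy example (assumption: 2 continuous variables, standard normal draws).
toy = np.column_stack([spst.norm.rvs(size=50), spst.norm.rvs(size=50)])
# Rule-of-thumb starting bandwidths, mirroring statsmodels' _normal_reference().
h0 = 1.06 * toy.std(axis=0) * len(toy) ** (-1. / (4 + toy.shape[1]))
# Leave-one-out maximum-likelihood bandwidths via the get_bw() defined above.
bw_ml = get_bw(toy, 'cc', reference=h0)
print(h0, bw_ml)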
There are different methods for BW selection, many based on some kind of optimization.
In [18]:
n=1000
distr=spst.beta #<-- From SciPy
smpl=np.linspace(0,1,num=n)
params={'horns':(0.5,0.5),'horns1':(0.5,0.55),
'shower':(5.,2.),'grower':(2.,5.)}
v_type=f'{"c"*len(params)}' #<-- Statsmodels wants to know if data is
# continuous (c)
# discrete ordered (o)
# discrete unordered (u)
fig, ax = plt.subplots(1,2,figsize=(10,5))
list(map(lambda x: ax[0].plot(distr.cdf(smpl,*x),smpl) , params.values()))
list(map(lambda x: ax[1].plot(smpl,distr.pdf(smpl,*x)) , params.values()))
ax[0].legend(list(params.keys()));ax[1].legend(list(params.keys()))
ax[0].grid(); ax[1].grid()
fig.suptitle(f'CDFs & PDFs for different marginals (Beta distributed)')
plt.show()
This is a list of the kernel functions available in the package
kernel_func = dict(
    wangryzin=kernels.wang_ryzin,
    aitchisonaitken=kernels.aitchison_aitken,
    gaussian=kernels.gaussian,
    aitchison_aitken_reg=kernels.aitchison_aitken_reg,
    wangryzin_reg=kernels.wang_ryzin_reg,
    gauss_convolution=kernels.gaussian_convolution,
    wangryzin_convolution=kernels.wang_ryzin_convolution,
    aitchisonaitken_convolution=kernels.aitchison_aitken_convolution,
    gaussian_cdf=kernels.gaussian_cdf,
    aitchisonaitken_cdf=kernels.aitchison_aitken_cdf,
    wangryzin_cdf=kernels.wang_ryzin_cdf,
    d_gaussian=kernels.d_gaussian)
Different kernels are selected for different reasons: the type of variable (continuous, ordered, unordered) and whether a pdf or a cdf is being fit.
(probably)
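As a small illustration (my own toy snippet, not from the original notebook): the same `gpke()` defined above gives a pdf estimate with the default `gaussian` kernel and a cdf estimate when the `gaussian_cdf` variant is selected; the bandwidth of 0.1 is purely illustrative, not tuned.
In [ ]:
# Toy 1-D sample from the 'horns' marginal defined above.
xs = distr.rvs(*params['horns'], size=200).reshape(-1, 1)
bw = np.array([0.1])  # illustrative bandwidth

# Product-kernel pdf estimate at x=0.5 (default ckertype='gaussian').
pdf_at_half = gpke(bw, dataxx=xs, data_predict=np.array([0.5]), var_type='c') / len(xs)
# Same call with the cdf kernel variant gives an estimate of F(0.5).
cdf_at_half = gpke(bw, dataxx=xs, data_predict=np.array([0.5]), var_type='c',
                   ckertype='gaussian_cdf') / len(xs)
print(pdf_at_half, cdf_at_half)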
cv_ml uses the bandwidth estimate that maximizes the leave-one-out likelihood
$$ \ln L=\sum_{i=1}^{n}\ln\hat{f}_{-i}(X_{i}) $$
where $\hat{f}_{-i}(X_{i})$ is the leave-one-out kernel estimator of $f(X_{i})$:
$$ \hat{f}_{-i}(X_{i})=\frac{1}{(n-1)h}\sum_{j=1,j\neq i}K_{h}(X_{i},X_{j}) $$
and $K_{h}$ represents the Generalized Product Kernel estimator:
$$ K_{h}(X_{i},X_{j})=\prod_{s=1}^{q}h_{s}^{-1}k\left(\frac{X_{is}-X_{js}}{h_{s}}\right) $$
cv_ls returns the value of the bandwidth that minimizes the integrated mean square error (IMSE) between the estimated and actual distribution:
$$ \int\left[\hat{f}(x)-f(x)\right]^{2}dx $$
Let's compare times and values for bandwidth selection for each method available in StatsModels, considering 4 dimensions and 1000 samples.
In [20]:
import statsmodels.api as sm
#Generate some independent data for each parameter set
mvdata={k:distr.rvs(*params[k],size=n) for k in params}
rd=np.array(list(mvdata.values()))
%timeit -n3 sm.nonparametric.KDEMultivariate(data=rd,var_type=v_type, bw='normal_reference')
dens_u_rot = sm.nonparametric.KDEMultivariate(data=rd,var_type=v_type, bw='normal_reference')
print('bw with reference',dens_u_rot.bw, '(only available for gaussian kernels)')
%timeit -n3 sm.nonparametric.KDEMultivariate(data=rd,var_type=v_type, bw='cv_ml')
dens_u_ml = sm.nonparametric.KDEMultivariate(data=rd,var_type=v_type, bw='cv_ml')
print('bw with maximum likelihood',dens_u_ml.bw)
# BW with least squares takes >8x more than with ml
%timeit -n3 sm.nonparametric.KDEMultivariate(data=rd,var_type=v_type, bw='cv_ls')
dens_u_ls = sm.nonparametric.KDEMultivariate(data=rd,var_type=v_type, bw='cv_ls')
print('bw with least squares',dens_u_ls.bw)
Under the hood, the `bw` argument goes through `self._compute_bw(bw)` (or `self._compute_efficient(bw)`), which calls one of:

- `_normal_reference()` <- Silverman's rule
- `_cv_ml()` <- Cross validation maximum likelihood
- `_cv_ls()` <- Cross validation least squares

`_cv_ml()` and `_cv_ls()` are almost the same method, except for:
| `_cv_ml()` | `_cv_ls()` |
|---|---|
| `h0 = self._normal_reference()` | `h0 = self._normal_reference()` |
| `bw = optimize.fmin(self.loo_likelihood,` | `bw = optimize.fmin(self.imse,` |
| `x0=h0, args=(np.log, ),` | `x0=h0,` |
| `maxiter=1e3,` | `maxiter=1e3,` |
| `maxfun=1e3,` | `maxfun=1e3,` |
| `disp=0,` | `disp=0,` |
| `xtol=1e-3)` | `xtol=1e-3)` |
A bummer: no direct way to feed ranges of hyperparameters in order to constrain the search space!
`optimize.fmin` comes from `scipy.optimize`.
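Since `optimize.fmin` itself takes no box constraints, one workaround is to clamp the selected bandwidths after the fact, which is roughly what the commented-out `_set_bw_bounds` hints at. A minimal sketch (the bounds below are illustrative assumptions, not recommendations):
In [ ]:
# Clamp the cv_ml bandwidths to an assumed plausible range after the search.
bw_box = np.clip(dens_u_ml.bw, a_min=1e-3, a_max=1.0)
print(bw_box)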
In [43]:
def loo_likelihood(self, bw, func=lambda x: x):
    LOO = LeaveOneOut(self.data)            #<- iterator for a leave-one-out over the data
    L = 0
    for i, X_not_i in enumerate(LOO):       #<- per leave-one-out subset of the data (ouch!)
        f_i = gpke(bw,                      #<- provided by the optimization algorithm
                   data=-X_not_i,           #<- dataset minus one sample, as given by LOO
                   data_predict=-self.data[i, :],  #<- REAL dataset, ith point
                   var_type=self.var_type)  #<- 'cccc' or something similar
        L += func(f_i)                      #<- _cv_ml() passed np.log, so it's the log-likelihood
                                            #   of gpke at the ith point.
    return -L
In [ ]:
def gpke(bw, data, data_predict, var_type, ckertype='gaussian',
         okertype='wangryzin', ukertype='aitchisonaitken', tosum=True):
    kertypes = dict(c=ckertype, o=okertype, u=ukertype)   #<- kernel selection
    Kval = np.empty(data.shape)
    for ii, vtype in enumerate(var_type):                 #<- per dimension ii
        func = kernel_func[kertypes[vtype]]
        Kval[:, ii] = func(bw[ii], data[:, ii], data_predict[ii])
    iscontinuous = np.array([c == 'c' for c in var_type])
    dens = Kval.prod(axis=1) / np.prod(bw[iscontinuous])  #<- Ta-da, kernel products.
    if tosum:
        return dens.sum(axis=0)
    else:
        return dens
Things I tried for feeding the Fréchet-bound violations to the optimizer:
Tried using the violations as a way to prune "unpromising" bandwidths before running the gpke through the whole LOO iteration.
It made the optimization algorithm go coo-coo, because scipy.optimize.fmin was expecting a number out of that function.
To return something "proportionally punishing", I would probably need to keep track of the previous estimates.
That would require more work.
Tried feeding the number of violations to the algorithm.
The algorithm got lost.
Maybe too little information?
Tried feeding the sum of the size of all violations.
It kinda worked but the final steps of the algorithm were unstable.
Can we make it a bit more informative?
Tried feeding a weighted sum of the sizes of all violations.
Each violation was scaled by the width of the Fréchet band at that point (narrower band, larger penalty).
The rationale is that a violation at a narrow point should be punished more than a violation at an already wide point.
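In symbols, my reading of the `frechet_likelihood` cell shown further down (assuming `calc_frechet_fails` returns the sizes of the top/bottom bound violations), with $\hat{F}_{h}$ the estimated CDF and $W_{i}\le M_{i}$ the Fréchet bounds at sample $i$:
$$ L(h)=\sum_{i}\frac{\max\left(\hat{F}_{h}(x_{i})-M_{i},\,0\right)+\max\left(W_{i}-\hat{F}_{h}(x_{i}),\,0\right)}{M_{i}-W_{i}} $$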
It still takes between 20% and 200% more time than cv_ml, when it should be at least an order of magnitude faster (cdf estimation is faster than leave-one-out).
Gee, I wonder if I have a bug somewhere?
;D I will repeat the tests with the right call and show the results for the next presentation.
In [41]:
def get_frechets(dvars):
    d = len(dvars)
    n = len(dvars[0])
    dimx = np.array(range(d))
    un = np.ones(d, dtype=int)
    bottom_frechet = np.array([max(np.sum(dvars[dimx, un * i]) + 1 - d, 0)
                               for i in range(n)])
    top_frechet = np.array([min([y[i] for y in dvars]) for i in range(n)])
    return {'top': top_frechet, 'bottom': bottom_frechet}
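For reference, `get_frechets` computes the pointwise Fréchet–Hoeffding bounds from the marginal CDF values $u_{1},\dots,u_{d}$ of each sample:
$$ W(u)=\max\left(\sum_{s=1}^{d}u_{s}-(d-1),\,0\right)\;\leq\;C(u)\;\leq\;\min_{s}u_{s}=M(u) $$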
In [29]:
cdfs={fname :distr.cdf(smpl,*params[fname]) for fname in params}
frechets=get_frechets(np.array(list(cdfs.values())))
In [30]:
def check_frechet_fails(guinea_cdf, frechets):
    fails = {'top': [], 'bottom': []}
    for n in range(len(guinea_cdf)):
        #n_hyper_point=np.array([x[n] for x in rd])
        if guinea_cdf[n] > frechets['top'][n]:
            fails['top'].append(True)
        else:
            fails['top'].append(False)
        if guinea_cdf[n] < frechets['bottom'][n]:
            fails['bottom'].append(True)
        else:
            fails['bottom'].append(False)
    return {'top': np.array(fails['top']),
            'bottom': np.array(fails['bottom'])}
Given 4 dimensions and 1000 samples, we got:

- Silverman: 58.8% violations
- cv_ml: 58.0% violations
- cv_ls: 57.0% violations
In [35]:
# For Silverman
violations_silverman=check_frechet_fails(dens_u_rot.cdf(),frechets)
violations_silverman=np.sum(violations_silverman['top'])+ np.sum(violations_silverman['bottom'])
print(f'violations:{violations_silverman} ({100.*violations_silverman/len(smpl)}%)')
In [36]:
# For cv_ml
violations_cv_ml=check_frechet_fails(dens_u_ml.cdf(),frechets)
violations_cv_ml=np.sum(violations_cv_ml['top'])+ np.sum(violations_cv_ml['bottom'])
print(f'violations:{violations_cv_ml} ({100.*violations_cv_ml/len(smpl)}%)')
In [40]:
# For cv_ls
violations_cv_ls=check_frechet_fails(dens_u_ls.cdf(),frechets)
violations_cv_ls=np.sum(violations_cv_ls['top'])+ np.sum(violations_cv_ls['bottom'])
print(f'violations:{violations_cv_ls} ({100.*violations_cv_ls/len(smpl)}%)')
In [42]:
import pandas as pd  # needed for the DataFrame export below

def generate_experiments(reps, n, params, distr, dims):
    bws_frechet = {f'bw_{x}': [] for x in params}
    bws_cv_ml = {f'bw_{x}': [] for x in params}
    for iteration in range(reps):
        mvdata = {k: distr.rvs(*params[k], size=n) for k in params}
        rd = np.array(list(mvdata.values()))  #<---- THIS IS NOT A CDF!!!!!
        # get frechets and thresholds
        frechets = get_frechets(rd)           #<------- THEREFORE THIS IS A BUG !!!!!
        bw_frechets, bw_cv_ml = profile_run(rd, frechets, iteration)
        for ix, x in enumerate(params):
            bws_frechet[f'bw_{x}'].append(bw_frechets[ix])
            bws_cv_ml[f'bw_{x}'].append(bw_cv_ml[ix])
    pd.DataFrame(bws_frechet).to_csv(f'/home/lia/liaProjects/outs/bws_frechet_d{dims}-n{n}-iter{reps}.csv')
    pd.DataFrame(bws_cv_ml).to_csv(f'/home/lia/liaProjects/outs/bws_cv_ml_d{dims}-n{n}-iter{reps}.csv')
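A hedged sketch of what the corrected call probably looks like: the Fréchet bounds should be computed from the marginal CDF values (here using the known Beta parameters, as in the `cdfs` cell above), not from the raw draws:
In [ ]:
# Sketch only: map each simulated marginal to the CDF scale before computing
# the Fréchet bounds (the Beta parameters are known because the data is simulated).
u = np.array([distr.cdf(mvdata[k], *params[k]) for k in params])  # shape (d, n), values in [0, 1]
frechets = get_frechets(u)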
In [ ]:
def get_bw(datapfft, var_type, reference, frech_bounds=None):
    # Using leave-one-out likelihood
    # the initial value for the optimization is the normal_reference
    # h0 = normal_reference()
    data = adjust_shape(datapfft, len(var_type))
    if not frech_bounds:
        fmin = lambda bw, funcx: loo_likelihood(bw, data, var_type, func=funcx)
        argsx = (np.log,)   # feeding the logarithm for leave-one-out ML
    else:
        fmin = lambda bw, funcx: frechet_likelihood(bw, data, var_type,
                                                    frech_bounds, func=funcx)
        argsx = (None,)     # second element of the tuple would be the debug-mode flag
    h0 = reference
    bw = optimize.fmin(fmin, x0=h0, args=argsx,
                       maxiter=1e3, maxfun=1e3, disp=0, xtol=1e-3)
    # bw = self._set_bw_bounds(bw)  # bound bw if necessary
    return bw
In [ ]:
def frechet_likelihood(bww, datax, var_type, frech_bounds, func=None, debug_mode=False):
    cdf_est = cdf(datax, bww, var_type)   # <- calls gpke underneath, but is a short call
    d_violations = calc_frechet_fails(cdf_est, frech_bounds)
    width_bound = frech_bounds['top'] - frech_bounds['bottom']
    viols = (d_violations['top'] + d_violations['bottom']) / width_bound
    L = np.sum(viols)
    return L
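The `cdf()` call above is not defined in these slides; a minimal sketch of what it could look like, mirroring how statsmodels' `KDEMultivariate.cdf` combines `gpke` with the `*_cdf` kernel variants:
In [ ]:
def cdf(data, bw, var_type):
    """Sketch: product-kernel CDF estimate evaluated at every sample point."""
    data = adjust_shape(data, len(var_type))
    nobs = len(data)
    return np.array([gpke(bw, data, data[i, :], var_type,
                          ckertype='gaussian_cdf',
                          okertype='wangryzin_cdf',
                          ukertype='aitchisonaitken_cdf') / nobs
                     for i in range(nobs)])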
In [ ]:
import cProfile, io, pstats  # needed for the profiling below

def profile_run(rd, frechets, iterx):
    dims = len(rd)
    n = len(rd[0])
    v_type = f'{"c"*dims}'
    # threshold: number of violations by the cheapest method.
    dens_u_rot = sm.nonparametric.KDEMultivariate(data=rd, var_type=v_type, bw='normal_reference')
    cdf_dens_u_rot = dens_u_rot.cdf()
    violations_rot = count_frechet_fails(cdf_dens_u_rot, frechets)
    # profile frechets
    pr = cProfile.Profile()
    pr.enable()
    bw_frechets = get_bw(rd, v_type, dens_u_rot.bw, frech_bounds=frechets)
    pr.disable()
    s = io.StringIO()
    ps = pstats.Stats(pr, stream=s).sort_stats('cumtime')
    ps.print_stats()
    s = s.getvalue()
    with open(f'/home/lia/liaProjects/outs/frechet-profile-d{dims}-n{n}-iter{iterx}.txt', 'w+') as f:
        f.write(s)
    # profile cv_ml
    pr = cProfile.Profile()
    pr.enable()
    bw_cv_ml = get_bw(rd, v_type, dens_u_rot.bw)
    pr.disable()
    s = io.StringIO()
    ps = pstats.Stats(pr, stream=s).sort_stats('cumtime')
    ps.print_stats()
    s = s.getvalue()
    with open(f'/home/lia/liaProjects/outs/loo-ml-profile-d{dims}-n{n}-iter{iterx}.txt', 'w+') as f:
        f.write(s)
    return bw_frechets, bw_cv_ml