In [1]:
%load_ext Cython
%pylab qt4 --no-import-all
In [2]:
from scipy import stats
In [11]:
%%cython
"""
Implementation of the methods in cython.
"""
cimport numpy as np
import numpy as np
from libc.math cimport exp, sqrt, M_PI, log
cimport cython
@cython.boundscheck(False)
@cython.cdivision(True)
def cython_lognormal_kde(np.ndarray[np.float_t, ndim=1] data, np.ndarray[np.float_t, ndim=1] x, double sigma):
    """
    cython_lognormal_kde(np.ndarray[np.float_t, ndim=1] data, np.ndarray[np.float_t, ndim=1] x, double sigma)

    Evaluates the lognormal kernel density estimate of `data` with bandwidth
    `sigma` at the points `x`.

    Parameters
    ----------
    data : positive samples the KDE is built from.
    x : points at which to evaluate the density; non-positive entries get
        density zero (the log-normal kernel has no support there).
    sigma : kernel bandwidth on the log scale.

    Returns
    -------
    Array of density values, one per entry of `x`.
    """
    cdef int i, j, n=data.shape[0], m = x.shape[0]
    # np.log emits -inf (with a warning) for non-positive x, but those entries
    # are guarded below and never read from `logx`.
    cdef np.ndarray[np.float_t, ndim=1] logx = np.log(x), logdata=np.log(data), pdf
    cdef double residual
    #Create a pdf
    pdf = np.zeros_like(x)
    #Iterate over the data and points
    for j in range(m):
        # The log-normal density is zero at and below the origin.
        if x[j] <= 0:
            pdf[j] = 0
            continue
        for i in range(n):
            # Standardised residual on the log scale
            residual = (logx[j] - logdata[i]) / sigma
            # Gaussian kernel in log space; the 1/x[j] Jacobian maps it back
            # to the original scale.
            pdf[j] += exp(-.5 * residual * residual) / x[j]
        pdf[j] /= n
    # Common Gaussian normalisation applied once to the whole array.
    return pdf / (sqrt(2 * M_PI) * sigma)
@cython.boundscheck(False)
@cython.cdivision(True)
def cython_lscv(np.ndarray[np.float_t, ndim=1] data, double sigma, str output='sum'):
    """
    cython_lscv(np.ndarray[np.float_t, ndim=1] data, double sigma, str output='sum')

    Computes the least-squares cross-validation (LSCV) score of the log-normal
    kernel density estimate of `data` with bandwidth `sigma`; the score
    estimates the mean integrated squared error (MISE) up to a constant.

    Parameters
    ----------
    data : positive samples the KDE is built from.
    sigma : kernel bandwidth on the log scale; non-positive values return inf.
    output : 'sum' returns mise1 + mise2; 'tuple' returns (mise1, mise2).

    Raises
    ------
    ValueError if `output` is neither 'sum' nor 'tuple'.
    """
    cdef int i, j, n = data.shape[0]
    cdef double mise1 = 0, mise2 = 0, term
    cdef np.ndarray[np.float_t, ndim=1] log_data
    cdef double residual
    if sigma <= 0:
        return np.inf
    #Rescale the data because it only ever appears in that form
    log_data = np.log(data) / sigma
    data = data * sigma
    for i in range(n):
        #Add the diagonal part of the squared-integral term
        mise1 += 1.0 / data[i]
        for j in range(i+1, n):
            #Add the off-diagonal part; the factor 2 accounts for symmetry
            residual = log_data[i] - log_data[j]
            term = exp(-0.25 * residual * residual) \
                / sqrt(data[i] * data[j])
            mise1 += 2 * term
            #Add to the second (leave-one-out) term. Note: no diagonal guard
            #is needed here because j starts at i + 1, so i == j is impossible.
            #(A dead `if i == j: print "What?"` branch was removed — it was
            #unreachable and its Python-2 print statement does not compile
            #under language_level=3.)
            term = exp(-.5 * residual * residual) \
                * (1 / data[i] + 1 / data[j])
            mise2 += term
    #Apply the correct normalisation
    mise1 *= exp(.25 * sigma * sigma) / (2 * sqrt(M_PI) * n * n)
    mise2 *= -2 / (n * (n - 1) * sqrt(2 * M_PI))
    if output=='sum':
        return mise1 + mise2
    elif output=='tuple':
        return mise1, mise2
    else:
        raise ValueError('`{}` is not a valid output format.'.format(output))
@cython.boundscheck(False)
@cython.cdivision(True)
def cython_lcv(np.ndarray[np.float_t, ndim=1] data, double sigma):
    """
    cython_lcv(np.ndarray[np.float_t, ndim=1] data, double sigma)

    Computes the (negated) leave-one-out likelihood cross-validation score of
    the log-normal KDE of `data` with bandwidth `sigma`.

    Parameters
    ----------
    data : positive samples the KDE is built from.
    sigma : kernel bandwidth on the log scale; non-positive values return inf.

    Returns
    -------
    Negative mean leave-one-out log-likelihood (lower is better).
    """
    # The boundscheck/cdivision directives were added for consistency with the
    # sibling functions `cython_lognormal_kde` and `cython_lscv`; the loops
    # below only use in-range indices, so behaviour is unchanged.
    cdef int i, j, n = data.shape[0]
    cdef np.ndarray[np.float_t, ndim=1] log_data, ll = np.zeros_like(data)
    cdef double residual
    if sigma <= 0:
        return np.inf
    #Rescale the data because it only ever appears in that form
    log_data = np.log(data) / sigma
    data = data * sigma
    for i in range(n):
        for j in range(i+1, n):
            #Exploit symmetry: each pair contributes to both leave-one-out sums
            residual = log_data[i] - log_data[j]
            residual = exp(-.5 * residual * residual)
            ll[i] += residual / data[i]
            ll[j] += residual / data[j]
    #Take logs of the leave-one-out density estimates
    ll = np.log(ll / (sqrt(2 * M_PI) * (n-1)))
    return -np.mean(ll)
In [16]:
%%cython -f
"""
Implementation of methods in C with cython wrapper.
"""
import numpy as np
cimport numpy as np
# NOTE(review): absolute path to the author's machine — this cell will not
# compile anywhere else. TODO: ship the header with the project and use a
# relative path (plus the -I include flag of %%cython) instead.
cdef extern from "/Users/tillhoffmann/old_git/lognormal_kde/lognormal_kde.h":
    #Evaluation of the KDE: writes the density at the m points `x` into `result`
    void ln_evaluate_kde(const double* data, const double* weight, const int n,
                         double* result, const double* x, const int m,
                         const double bandwidth, const double threshold);
    #Evaluation of risk functions
    double ln_evaluate_lscv(const double* data, const double* weight, const int n,
                            const double bandwidth, const double threshold);
    double ln_evaluate_lcv(const double* data, const double* weight, const int n,
                           const double bandwidth, const double threshold)
    double ln_evaluate_exact(const double* data, const double* weight, const int n,
                             const double bandwidth)
    double ln_evaluate_asymptotic(const double* data, const double* weight, const int n,
                                  const double bandwidth)
    #Analytic minimisation of the risk function
    double ln_argmin_asymptotic_parametrised(const double Sigma, const double neff)
    double ln_argmin_asymptotic(const double* data, const double* weight,
                                const int n)
def evaluate_kde(np.ndarray[np.float_t, ndim=1] x, np.ndarray[np.float_t, ndim=1] data,
                 double bandwidth, np.ndarray[np.float_t, ndim=1] weight=None, double threshold=0):
    """
    Evaluates the weighted log-normal KDE of `data` at the points `x` via the
    C implementation. Uses unit weights when `weight` is None.
    """
    cdef int n = data.shape[0]
    cdef int m = x.shape[0]
    #Output buffer the C routine fills in
    cdef np.ndarray[np.float_t, ndim=1] result = np.zeros(m, np.float64)
    #Fall back to uniform weights
    if weight is None:
        weight = np.ones_like(data)
    #Delegate the actual computation to C
    ln_evaluate_kde(<double*>data.data, <double*>weight.data, n,
                    <double*>result.data, <double*>x.data, m,
                    bandwidth, threshold)
    return result
def evaluate_lscv(np.ndarray[np.float_t, ndim=1] data, double bandwidth,
                  np.ndarray[np.float_t, ndim=1] weight=None, double threshold=0):
    """
    Computes the least-squares cross-validation score of the weighted
    log-normal KDE via the C implementation. Uses unit weights when
    `weight` is None.
    """
    cdef int n = data.shape[0]
    #Fall back to uniform weights
    if weight is None:
        weight = np.ones_like(data)
    #Delegate the actual computation to C
    return ln_evaluate_lscv(<double*>data.data, <double*>weight.data, n,
                            bandwidth, threshold)
def evaluate_lcv(np.ndarray[np.float_t, ndim=1] data, double bandwidth,
                 np.ndarray[np.float_t, ndim=1] weight=None, double threshold=0):
    """
    Computes the likelihood cross-validation score of the weighted log-normal
    KDE via the C implementation. Uses unit weights when `weight` is None.
    """
    cdef int n = data.shape[0]
    #Fall back to uniform weights
    if weight is None:
        weight = np.ones_like(data)
    #Delegate the actual computation to C
    return ln_evaluate_lcv(<double*>data.data, <double*>weight.data, n,
                           bandwidth, threshold)
def evaluate_exact(np.ndarray[np.float_t, ndim=1] data, double bandwidth,
                   np.ndarray[np.float_t, ndim=1] weight=None):
    """
    Evaluates the exact risk function for the weighted log-normal KDE via the
    C implementation. Uses unit weights when `weight` is None.
    """
    cdef int n = data.shape[0]
    #Fall back to uniform weights
    if weight is None:
        weight = np.ones_like(data)
    #Delegate the actual computation to C
    return ln_evaluate_exact(<double*>data.data, <double*>weight.data, n,
                             bandwidth)
def evaluate_asymptotic(np.ndarray[np.float_t, ndim=1] data, double bandwidth,
                        np.ndarray[np.float_t, ndim=1] weight=None):
    """
    Evaluates the asymptotic risk function for the weighted log-normal KDE via
    the C implementation. Uses unit weights when `weight` is None.
    """
    cdef int n = data.shape[0]
    #Fall back to uniform weights
    if weight is None:
        weight = np.ones_like(data)
    #Delegate the actual computation to C
    return ln_evaluate_asymptotic(<double*>data.data, <double*>weight.data, n,
                                  bandwidth)
def argmin_asymptotic(np.ndarray[np.float_t, ndim=1] data, np.ndarray[np.float_t, ndim=1] weight=None):
    """
    Computes the bandwidth minimising the asymptotic risk via the C
    implementation. Uses unit weights when `weight` is None.
    """
    cdef int n = data.shape[0]
    #Fall back to uniform weights
    if weight is None:
        weight = np.ones_like(data)
    #Delegate the actual computation to C
    return ln_argmin_asymptotic(<double*>data.data, <double*>weight.data, n)
In [5]:
def from_data(func, data, *args):
    """
    Estimates the log-scale `Sigma` of `data` and forwards it, together with
    the sample size `n` and any extra `*args`, to `func`.

    Parameters
    ----------
    func : callable invoked as ``func(Sigma, n, *args)``.
    data : one-dimensional array of positive samples.
    *args : additional positional arguments passed through to `func`.

    Returns
    -------
    Whatever `func` returns.
    """
    # Sigma is the standard deviation of the data on the log scale.
    Sigma = np.std(np.log(data))
    n = data.shape[0]
    return func(Sigma, n, *args)
def mise_exact(Sigma, n, sigma):
"""
Computes the mean integrated squared error (MISE) for an `n`-point log-normal kernel
density estimate with log-scale `sigma` assuming an underlying log-normal
distribution with log-scale `Sigma`.
"""
return np.exp((sigma**2 + 2*Sigma**2)/4)/(n*sigma) + np.exp(Sigma**2/4)/Sigma + \
(np.exp((sigma**2 + Sigma**2)/4)*(-1 + n))/(n*np.sqrt(sigma**2 + Sigma**2)) - \
(2*np.sqrt(2)*np.exp((sigma**2*Sigma**2 + Sigma**4)/(2*sigma**2 + 4*Sigma**2)))/ \
np.sqrt(sigma**2 + 2*Sigma**2)
def mise_exact_max(Sigma, n):
    """
    Computes the bandwidth that optimises (minimises) the exact mean
    integrated squared error (MISE) for an `n`-point log-normal kernel
    density estimate when the underlying distribution has log-scale `Sigma`.

    Returns the bandwidth found by Brent's method.
    """
    # Fix: `optimize` was never imported anywhere in the notebook (only
    # `from scipy import stats`), so this function always raised NameError.
    # Import locally to keep the function self-contained.
    from scipy import optimize
    # NOTE(review): despite the `_max` suffix this minimises the MISE —
    # `optimize.brent` is a minimiser.
    return optimize.brent(lambda sigma: mise_exact(Sigma, n, sigma))
def mise_asymptotic(Sigma, n, sigma):
"""
Computes the asymptotic MISE assuming an underlying log-normal
distribution with log-scale `Sigma`.
"""
return (64 * np.exp(Sigma * Sigma / 4) * Sigma ** 5 / sigma + \
n * sigma ** 4 * (12 + 4 * Sigma ** 2 + Sigma ** 4)) / \
(128 * n * np.sqrt(np.pi) * Sigma ** 5) * np.exp(Sigma * Sigma / 4)
def mise_asymptotic_max(Sigma, n):
    """
    Computes the bandwidth minimising the asymptotic MISE — the log-normal
    analogue of Silverman's rule of thumb — where `Sigma` is the log-scale of
    the data and `n` is the number of data points.
    """
    numerator = 2.0 ** (.8) * Sigma * np.exp(Sigma * Sigma / 20)
    # The usual n**(-1/5) rate, with a Sigma-dependent constant
    denominator = (n * (12 + 4 * Sigma * Sigma + Sigma ** 4)) ** .2
    return numerator / denominator
In [12]:
mu = 1
sigma = 1
n = 500
np.random.seed(1)
dist = stats.lognorm(sigma, 0, np.exp(mu))
data = dist.rvs(n)
# The synthetic sample above is immediately replaced by the Old Faithful data;
# the draw is kept so the RNG state matches earlier runs of this notebook.
data = np.loadtxt('data/old_faithful.txt')
# Fix: `n` previously still referred to the discarded 500-point synthetic
# sample, so the weight arrays and the bandwidth rule below used the wrong
# sample size. Use the size of the data actually analysed.
n = data.shape[0]
Sigma = np.std(np.log(data))
bandwidth = mise_asymptotic_max(Sigma, n)
x = np.linspace(0, 10, 1000)
# Compare the pure-cython and the C implementation of the KDE
y1 = cython_lognormal_kde(data, x, bandwidth)
y2 = evaluate_kde(x, data, bandwidth)
bandwidths = np.logspace(-3, 0)
# Compare the cython and C implementations of each risk criterion.
# NOTE(review): the C variants are called with constant weights of 10 rather
# than 1 — presumably to check weight normalisation; confirm intended.
lscv1 = [cython_lscv(data, bandwidth) for bandwidth in bandwidths]
lscv2 = [evaluate_lscv(data, bandwidth, 10 * np.ones(n)) for bandwidth in bandwidths]
lcv1 = [cython_lcv(data, bandwidth) for bandwidth in bandwidths]
lcv2 = [evaluate_lcv(data, bandwidth, 10 * np.ones(n), 0) for bandwidth in bandwidths]
asymptotic1 = [from_data(mise_asymptotic, data, bandwidth) for bandwidth in bandwidths]
asymptotic2 = [evaluate_asymptotic(data, bandwidth) for bandwidth in bandwidths]
exact1 = [from_data(mise_exact, data, bandwidth) for bandwidth in bandwidths]
exact2 = [evaluate_exact(data, bandwidth) for bandwidth in bandwidths]
In [8]:
# Overlay the cython (y1) and C (y2) KDEs — the two curves should coincide.
plt.plot(x, y1)
plt.plot(x, y2)
# Rug plot of the raw observations along the x-axis
plt.scatter(data, np.zeros_like(data), marker='|')
plt.xlim(min(x), max(x))
plt.ylim(-.1, .5)
Out[8]:
In [9]:
# LSCV curves: cython (solid blue) vs C implementation (dashed red)
plt.plot(bandwidths, lscv1, color='b', ls='-')
plt.plot(bandwidths, lscv2, color='r', ls='--')
plt.xscale('log')
In [13]:
# LCV curves: cython (solid blue) vs C implementation (dashed red)
plt.plot(bandwidths, lcv1, color='b', ls='-')
plt.plot(bandwidths, lcv2, color='r', ls='--')
plt.xscale('log')
In [105]:
# Asymptotic MISE: python formula (solid blue) vs C implementation (dashed red)
plt.plot(bandwidths, asymptotic1, color='b', ls='-')
plt.plot(bandwidths, asymptotic2, color='r', ls='--')
plt.xscale('log')
In [106]:
# Exact MISE: python formula (solid blue) vs C implementation (dashed red)
plt.plot(bandwidths, exact1, color='b', ls='-')
plt.plot(bandwidths, exact2, color='r', ls='--')
plt.xscale('log')
In [138]:
%%cython
cimport numpy as np
# Declare the C standard library memset so it can be called without Python overhead.
cdef extern from "string.h":
    void * memset ( void * ptr, int value, size_t num )
def cython_memset(np.ndarray[np.float_t, ndim=1] array):
    """
    Zeroes `array` in place via C memset and returns it.

    NOTE(review): writes sizeof(double) * len bytes starting at `array.data`,
    which assumes the buffer is contiguous — confirm callers never pass a
    strided view.
    """
    memset(<double*>array.data, 0, sizeof(double)*array.shape[0])
    return array
In [140]:
# Demonstrate the in-place zeroing: the returned array should be all zeros.
a = np.ones(200)
cython_memset(a)
Out[140]:
In [ ]: