In [1]:
%load_ext Cython
%pylab qt4 --no-import-all


Populating the interactive namespace from numpy and matplotlib

In [2]:
from scipy import stats

In [11]:
%%cython

"""
Implementation of the methods in cython.
"""

cimport numpy as np
import numpy as np
from libc.math cimport exp, sqrt, M_PI, log
cimport cython

        
@cython.boundscheck(False)
@cython.cdivision(True)
def cython_lognormal_kde(np.ndarray[np.float_t, ndim=1] data, np.ndarray[np.float_t, ndim=1] x, double sigma):
    """
    cython_lognormal_kde(np.ndarray[np.float_t, ndim=1] data, np.ndarray[np.float_t, ndim=1] x, double sigma)
    
    Evaluates the lognormal kernel density estimate of `data` with bandwidth `sigma` at `x`.
    """
    cdef int i, j, n=data.shape[0], m = x.shape[0]
    cdef np.ndarray[np.float_t, ndim=1] logx = np.log(x), logdata=np.log(data), pdf
    cdef double residual

    #Create a pdf
    pdf = np.zeros_like(x)
    #Iterate over the data and points
    for j in range(m):
        if x[j] <= 0:
            pdf[j] = 0
            continue
        for i in range(n):
            residual = (logx[j] - logdata[i]) / sigma
            pdf[j] += exp(-.5 * residual * residual) / x[j]
        pdf[j] /= n

    return pdf / (sqrt(2 * M_PI) * sigma)


@cython.boundscheck(False)
@cython.cdivision(True)
def cython_lscv(np.ndarray[np.float_t, ndim=1] data, double sigma, str output='sum'):
    """
    cython_lscv(np.ndarray[np.float_t, ndim=1] data, double sigma, str output='sum')
    
    Computes the mean integrated squared error (MISE) of the log-normal kernel density estimate
    of `data` with bandwidth `sigma`.
    """
    cdef int i, j, n = data.shape[0]
    cdef double mise1 = 0, mise2 = 0, term
    cdef np.ndarray[np.float_t, ndim=1] log_data
    cdef double residual
    
    if sigma <= 0:
        return np.inf
    
    #Rescale the data because it only ever appears in that form
    log_data = np.log(data) / sigma
    data = data * sigma
    
    for i in range(n):
        #Add the diagonal part
        mise1 += 1.0 / data[i] 
        for j in range(i+1, n):
            #Add the off-diagonal part
            residual = log_data[i] - log_data[j]
            term = exp(-0.25 * residual * residual) \
                / sqrt(data[i] * data[j])
            mise1 += 2 * term
            
            #Add to the second term
            if i == j: 
                print "What?"
                continue
            term = exp(-.5 * residual * residual) \
                * (1 / data[i] + 1 / data[j])
            mise2 += term
    
    #Apply the correct normalisation
    mise1 *= exp(.25 * sigma * sigma) / (2 * sqrt(M_PI) * n * n)
    mise2 *= -2  / (n * (n - 1) * sqrt(2 * M_PI))
    
    if output=='sum':
        return mise1 + mise2
    elif output=='tuple':
        return mise1, mise2
    else:
        raise ValueError('`{}` is not a valid output format.'.format(output))
        
def cython_lcv(np.ndarray[np.float_t, ndim=1] data, double sigma):
    """
    cython_lcv(np.ndarray[np.float_t, ndim=1] data, double sigma)
    
    Computes the leave-one-out likelihood cross-validation score.
    """
    cdef int i, j, n = data.shape[0]
    cdef np.ndarray[np.float_t, ndim=1] log_data, ll = np.zeros_like(data)
    cdef double residual
    
    if sigma <= 0:
        return np.inf
    
    #Rescale the data because it only ever appears in that form
    log_data = np.log(data) / sigma
    data = data * sigma
    
    for i in range(n):
        for j in range(i+1, n):
            residual = log_data[i] - log_data[j]
            residual = exp(-.5 * residual * residual)
            ll[i] += residual / data[i]
            ll[j] += residual / data[j]
        
    #Take logs
    ll = np.log(ll / (sqrt(2 * M_PI) * (n-1)))
        
    return -np.mean(ll)

In [16]:
%%cython -f

"""
Implementation of methods in C with cython wrapper.
"""

import numpy as np
cimport numpy as np

cdef extern from "/Users/tillhoffmann/old_git/lognormal_kde/lognormal_kde.h":
    #Evaluation of the KDE
    void ln_evaluate_kde(const double* data, const double* weight, const int n,
                         double* result, const double* x, const int m,
                         const double bandwidth, const double threshold);
    #Evaluation of risk functions
    double ln_evaluate_lscv(const double* data, const double* weight, const int n,
                            const double bandwidth, const double threshold);
    double ln_evaluate_lcv(const double* data, const double* weight, const int n,
                           const double bandwidth, const double threshold)
    double ln_evaluate_exact(const double* data, const double* weight, const int n,
                             const double bandwidth)
    double ln_evaluate_asymptotic(const double* data, const double* weight, const int n,
                                  const double bandwidth)
    #Analytic minimisation of the risk function
    double ln_argmin_asymptotic_parametrised(const double Sigma, const double neff)
    double ln_argmin_asymptotic(const double* data, const double* weight,
                                const int n)
    
def evaluate_kde(np.ndarray[np.float_t, ndim=1] x, np.ndarray[np.float_t, ndim=1] data,
             double bandwidth, np.ndarray[np.float_t, ndim=1] weight=None, double threshold=0):
    #Create an empty array
    cdef int m = x.shape[0]
    cdef np.ndarray[np.float_t, ndim=1] result = np.zeros(m, np.float64)
    #Set default weights
    if weight is None:
        weight = np.ones_like(data)
    #Evaluate the result
    ln_evaluate_kde(<double*>data.data, <double*>weight.data, data.shape[0],
                <double*>result.data, <double*>x.data, m, 
                bandwidth, threshold)
    return result

def evaluate_lscv(np.ndarray[np.float_t, ndim=1] data, double bandwidth, 
                  np.ndarray[np.float_t, ndim=1] weight=None, double threshold=0):
    #Set default weights
    if weight is None:
        weight = np.ones_like(data)
    #Compute the result
    return ln_evaluate_lscv(<double*>data.data, <double*>weight.data, data.shape[0], 
                   bandwidth, threshold)

def evaluate_lcv(np.ndarray[np.float_t, ndim=1] data, double bandwidth, 
                 np.ndarray[np.float_t, ndim=1] weight=None, double threshold=0):
    #Set default weights
    if weight is None:
        weight = np.ones_like(data)
    #Compute the result
    return ln_evaluate_lcv(<double*>data.data, <double*>weight.data, data.shape[0], 
                  bandwidth, threshold)

def evaluate_exact(np.ndarray[np.float_t, ndim=1] data, double bandwidth, 
                   np.ndarray[np.float_t, ndim=1] weight=None):
    #Set default weights
    if weight is None:
        weight = np.ones_like(data)
    #Compute the result
    return ln_evaluate_exact(<double*>data.data, <double*>weight.data, data.shape[0], 
                    bandwidth)

def evaluate_asymptotic(np.ndarray[np.float_t, ndim=1] data, double bandwidth, 
                        np.ndarray[np.float_t, ndim=1] weight=None):
    #Set default weights
    if weight is None:
        weight = np.ones_like(data)
    #Compute the result
    return ln_evaluate_asymptotic(<double*>data.data, <double*>weight.data, data.shape[0], 
                         bandwidth)

def argmin_asymptotic(np.ndarray[np.float_t, ndim=1] data, np.ndarray[np.float_t, ndim=1] weight=None):
    #Set default weights
    if weight is None:
        weight = np.ones_like(data)
    #Compute the result
    return ln_argmin_asymptotic(<double*>data.data, <double*>weight.data, data.shape[0])

In [5]:
def from_data(func, data, *args):
    """
    Estimates the log-scale `Sigma` from the data and evaluates `func`
    by passing `Sigma` and the number of data points `n` together with
    the `*args` as arguments.
    """
    #Compute statistics
    log_data = np.log(data)
    Sigma = np.std(log_data)
    n = data.shape[0]
    #Evaluate the function
    return func(Sigma, n, *args)

def mise_exact(Sigma, n, sigma):
    """
    Computes the mean integrated squared error (MISE) for an `n`-point log-normal kernel 
    density estimate with log-scale `sigma` assuming an underlying log-normal 
    distribution with log-scale `Sigma`.
    """
    return np.exp((sigma**2 + 2*Sigma**2)/4)/(n*sigma) + np.exp(Sigma**2/4)/Sigma + \
        (np.exp((sigma**2 + Sigma**2)/4)*(-1 + n))/(n*np.sqrt(sigma**2 + Sigma**2)) - \
        (2*np.sqrt(2)*np.exp((sigma**2*Sigma**2 + Sigma**4)/(2*sigma**2 + 4*Sigma**2)))/ \
        np.sqrt(sigma**2 + 2*Sigma**2)
        
def mise_exact_max(Sigma, n):
    """
    Computes the log-scale parameter of the kernel density estimate that optimises
    the mean integrated squared error (MISE).
    """
    return optimize.brent(lambda sigma: mise_exact(Sigma, n, sigma))

def mise_asymptotic(Sigma, n, sigma):
    """
    Computes the asymptotic MISE assuming an underlying log-normal 
    distribution with log-scale `Sigma`.
    """
    return (64 * np.exp(Sigma * Sigma / 4) * Sigma ** 5 / sigma + \
        n * sigma ** 4 * (12 + 4 * Sigma ** 2 + Sigma ** 4)) / \
        (128 * n * np.sqrt(np.pi) * Sigma ** 5) * np.exp(Sigma * Sigma / 4)

def mise_asymptotic_max(Sigma, n):
    """
    Computes the optimal kernel width using an analogue of Silverman's rule, 
    where `Sigma` is the log-scale of the dat and `n` is the number of data points.
    """
    return 2.0 ** (.8) * Sigma * np.exp(Sigma * Sigma / 20) \
        / (n * (12 + 4 * Sigma * Sigma + Sigma ** 4)) ** .2

In [12]:
mu = 1
sigma = 1
n = 500
np.random.seed(1)
dist = stats.lognorm(sigma, 0, np.exp(mu))

data = dist.rvs(n)
data = np.loadtxt('data/old_faithful.txt')
Sigma = np.std(np.log(data))
bandwidth = mise_asymptotic_max(Sigma, n)

x = np.linspace(0, 10, 1000)
y1 = cython_lognormal_kde(data, x, bandwidth)
y2 = evaluate_kde(x, data, bandwidth)

bandwidths = np.logspace(-3, 0)
lscv1 = [cython_lscv(data, bandwidth) for bandwidth in bandwidths]
lscv2 = [evaluate_lscv(data, bandwidth, 10 * np.ones(n)) for bandwidth in bandwidths]

lcv1 = [cython_lcv(data, bandwidth) for bandwidth in bandwidths]
lcv2 = [evaluate_lcv(data, bandwidth, 10 * np.ones(n), 0) for bandwidth in bandwidths]

asymptotic1 = [from_data(mise_asymptotic, data, bandwidth) for bandwidth in bandwidths]
asymptotic2 = [evaluate_asymptotic(data, bandwidth) for bandwidth in bandwidths]

exact1 = [from_data(mise_exact, data, bandwidth) for bandwidth in bandwidths]
exact2 = [evaluate_exact(data, bandwidth) for bandwidth in bandwidths]


/Users/tillhoffmann/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/IPython/kernel/__main__.py:13: RuntimeWarning: divide by zero encountered in log
/Users/tillhoffmann/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/IPython/kernel/__main__.py:20: RuntimeWarning: divide by zero encountered in log

In [8]:
plt.plot(x, y1)
plt.plot(x, y2)
plt.scatter(data, np.zeros_like(data), marker='|')
plt.xlim(min(x), max(x))
plt.ylim(-.1, .5)


Out[8]:
(-0.1, 0.5)

In [9]:
plt.plot(bandwidths, lscv1, color='b', ls='-')
plt.plot(bandwidths, lscv2, color='r', ls='--')

plt.xscale('log')

In [13]:
plt.plot(bandwidths, lcv1, color='b', ls='-')
plt.plot(bandwidths, lcv2, color='r', ls='--')

plt.xscale('log')

In [105]:
plt.plot(bandwidths, asymptotic1, color='b', ls='-')
plt.plot(bandwidths, asymptotic2, color='r', ls='--')

plt.xscale('log')

In [106]:
plt.plot(bandwidths, exact1, color='b', ls='-')
plt.plot(bandwidths, exact2, color='r', ls='--')

plt.xscale('log')

In [138]:
%%cython

cimport numpy as np

cdef extern from "string.h":
    void * memset ( void * ptr, int value, size_t num )
    
def cython_memset(np.ndarray[np.float_t, ndim=1] array):
    memset(<double*>array.data, 0, sizeof(double)*array.shape[0])
    return array

In [140]:
a = np.ones(200)
cython_memset(a)


Out[140]:
array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.])

In [ ]: