We want to calculate $\sum_{i=1}^N \log x_i$. As calculating the logarithm is expensive we can make use of the identity $\sum_{i=1}^N \log x_i = \log \prod_{i=1}^N x_i$. We have to be careful though to avoid loss of precision in the product. In practice we thus apply the identity on chunks of data whose size is dyamically determined.
In [1]:
import numpy as np
%load_ext cython
In [2]:
def meanlog(data):
return np.mean(np.log(data))
In [3]:
%%cython
import cython
import numpy as np
cimport numpy as np
from libc.math cimport log
@cython.boundscheck(False)
@cython.wraparound(False)
def meanlogc(np.ndarray[np.double_t, ndim=1] data):
# speed up calculation of sum of log by calculating log of product
cdef double res = 0.0
cdef double tmp = 1.0
for i in range(data.shape[0]):
tmp *= data[i]
if not (1e-14 < tmp < 1e14):
res += log(tmp)
tmp = 1.0
res += log(tmp)
return res/data.shape[0]
In [4]:
N = int(1e6)
n = int(1e3)
data = np.random.lognormal(size=N)
print meanlog(data), meanlogc(data)
%timeit meanlog(data)
%timeit meanlogc(data)