In [1]:
import numpy as np

In [2]:
from scipy import fftpack

In [3]:
n = 2**15

In [4]:
a = np.random.randn(n)

In [5]:
a = a.astype(np.complex128)

NumPy and SciPy


In [6]:
res_numpy = np.fft.fft(a)

In [7]:
res_numpy


Out[7]:
array([ 348.43580732  +0.j        , -140.34766665+146.86379751j,
        -20.95179516-270.26881706j, ...,   54.40852271 +18.25801042j,
        -20.95179516+270.26881706j, -140.34766665-146.86379751j])

In [8]:
res_scipy = fftpack.fft(a)

In [9]:
np.allclose(res_numpy, res_scipy)


Out[9]:
True

In [10]:
%timeit np.fft.fft(a)


100 loops, best of 3: 3.29 ms per loop

In [11]:
%timeit fftpack.fft(a)


100 loops, best of 3: 3.35 ms per loop

FFTW3


In [12]:
import fftw3

In [13]:
import multiprocessing

In [14]:
ncores = multiprocessing.cpu_count()

In [15]:
ncores


Out[15]:
4

In [16]:
afftw3 = a.astype(np.complex128)

In [17]:
output_fftw3 = afftw3.copy()

In [18]:
fftw3plan = fftw3.Plan(afftw3, output_fftw3, nthreads=ncores)

In [19]:
fftw3plan()

In [20]:
output_fftw3


Out[20]:
array([ 348.43580732  +0.j        , -140.34766665+146.86379751j,
        -20.95179516-270.26881706j, ...,   54.40852271 +18.25801042j,
        -20.95179516+270.26881706j, -140.34766665-146.86379751j])

In [21]:
np.allclose(res_numpy, output_fftw3)


Out[21]:
True

In [22]:
%timeit fftw3plan()


1000 loops, best of 3: 610 µs per loop

PyFFTW


In [23]:
import pyfftw

In [24]:
apyfftw = pyfftw.n_byte_align_empty(n, 16, np.complex128)
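
The 16-byte alignment requested above matches SSE. As a side note (pyfftw.simd_alignment is a real module attribute, but whether a wider alignment changes the timings below is an assumption), pyfftw reports the best SIMD alignment for the current CPU, which could be used in place of the hard-coded 16:


In [ ]:
# Optimal byte alignment for this CPU's SIMD instructions (e.g. 32 with AVX);
# it could replace the hard-coded 16 passed to n_byte_align_empty above.
pyfftw.simd_alignment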

In [25]:
apyfftw[:] = a[:]

In [26]:
np.allclose(apyfftw, a)


Out[26]:
True

PyFFTW with the NumPy interface

Even after the first transform of a given specification has been performed, subsequent transforms are never as fast as using pyfftw.FFTW objects directly, and in many cases are substantially slower. This is because of the internal overhead of creating a new pyfftw.FFTW object on every call. For this reason, a cache is provided and is recommended whenever pyfftw.interfaces is used. Turn it on with pyfftw.interfaces.cache.enable().


In [27]:
pyfftw.interfaces.cache.enable()
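
The cache discards objects that have not been reused for a short time; a sketch of extending that window (set_keepalive_time is part of pyfftw.interfaces.cache, and the 30-second value is only an illustrative choice):


In [ ]:
# Keep cached pyfftw.FFTW objects alive for 30 s of inactivity, so that
# repeated interface calls keep hitting the cache.
pyfftw.interfaces.cache.set_keepalive_time(30)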

In [28]:
pyfftw_output = pyfftw.interfaces.numpy_fft.fft(apyfftw)

In [29]:
np.allclose(res_numpy, pyfftw_output)


Out[29]:
True

In [30]:
%timeit pyfftw.interfaces.numpy_fft.fft(apyfftw)


1000 loops, best of 3: 767 µs per loop

PyFFTW using FFTW objects


In [31]:
pyfftw_output2 = pyfftw.n_byte_align_empty(n, 16, np.complex128)

In [32]:
fft_object = pyfftw.FFTW(apyfftw.copy(), pyfftw_output2)

In general, the creation of the pyfftw.FFTW object clears the contents of the arrays, so the arrays should be filled or updated after creation.
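
A minimal sketch of that recommended order, using only the calls already shown above (the names ain and aout are hypothetical):


In [ ]:
# Allocate aligned buffers, create the plan first, then load the data:
# planning may overwrite the buffers, so the input is filled afterwards.
ain = pyfftw.n_byte_align_empty(n, 16, np.complex128)
aout = pyfftw.n_byte_align_empty(n, 16, np.complex128)
plan = pyfftw.FFTW(ain, aout)
ain[:] = a      # fill the input only after the plan exists
plan()          # the result is written into aout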


In [33]:
fft_object()


Out[33]:
array([ 348.43580732  +0.j        , -140.34766665+146.86379751j,
        -20.95179516-270.26881706j, ...,   54.40852271 +18.25801042j,
        -20.95179516+270.26881706j, -140.34766665-146.86379751j])

In [34]:
pyfftw_output2


Out[34]:
array([ 348.43580732  +0.j        , -140.34766665+146.86379751j,
        -20.95179516-270.26881706j, ...,   54.40852271 +18.25801042j,
        -20.95179516+270.26881706j, -140.34766665-146.86379751j])

In [35]:
np.allclose(res_numpy, pyfftw_output2)


Out[35]:
True

In [36]:
%timeit fft_object()


1000 loops, best of 3: 720 µs per loop

Builders

If you need the flexibility of dealing with pyfftw.FFTW objects directly, an easier option than constructing the valid arrays and plans yourself is to use the convenient pyfftw.builders package. These functions take care of much of the difficulty in specifying the exact size and dtype requirements needed to produce a valid scheme.


In [37]:
fft_object = pyfftw.builders.fft(apyfftw, threads=4)

In [38]:
res_pyfftw = fft_object()

In [39]:
np.allclose(res_numpy, res_pyfftw)


Out[39]:
True

In [40]:
%timeit fft_object()


1000 loops, best of 3: 369 µs per loop
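
The builders functions also expose FFTW's planning flags; as a sketch (planner_effort is a real keyword of pyfftw.builders.fft, but any speed-up here is an assumption, not a measurement), a more patient plan trades longer setup for a potentially faster repeated transform:


In [ ]:
# FFTW_PATIENT makes planning slower but can yield a faster transform
# when the returned object is called many times.
fft_patient = pyfftw.builders.fft(apyfftw, threads=4,
                                  planner_effort='FFTW_PATIENT')
# %timeit fft_patient()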

scikits.cuda


In [41]:
import pycuda.autoinit

In [42]:
import pycuda.gpuarray as gpuarray

In [43]:
from scikits.cuda.fft import Plan

In [44]:
from scikits.cuda.fft import fft as cuda_fft

In [ ]:
# c = pycuda.autoinit.make_default_context()

In [45]:
x_gpu = gpuarray.to_gpu(a.astype(np.complex64))

In [46]:
x_gpu.dtype


Out[46]:
dtype('complex64')

In [47]:
xf_gpu = x_gpu.copy()

In [48]:
cuda_plan = Plan(shape=a.shape, in_dtype=np.complex64, out_dtype=np.complex64, batch=1, stream=None, mode=1)

In [49]:
cuda_fft(x_gpu.copy(), xf_gpu, cuda_plan)

In [50]:
res_cuda = xf_gpu.get(pagelocked=True)

In [51]:
np.allclose(res_numpy, res_cuda)


Out[51]:
False

In [52]:
res_numpy, res_cuda


Out[52]:
(array([ 348.43580732  +0.j        , -140.34766665+146.86379751j,
         -20.95179516-270.26881706j, ...,   54.40852271 +18.25801042j,
         -20.95179516+270.26881706j, -140.34766665-146.86379751j]),
 array([ 348.43579102  +0.j        , -140.34764099+146.86378479j,
         -20.95178223-270.26879883j, ...,   54.40847778 +18.25805283j,
         -20.95183182+270.26873779j, -140.34759521-146.8637085j ], dtype=complex64))
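
The False above is a precision effect: the GPU transform ran in single precision (complex64), so the values differ from the double-precision NumPy reference around the fifth significant figure. A check with tolerances loose enough for single precision (the exact tolerance values are an arbitrary but reasonable choice) should confirm the agreement:


In [ ]:
# Compare the complex64 GPU result against the double-precision reference
# using tolerances appropriate for single precision.
np.allclose(res_numpy, res_cuda, rtol=1e-4, atol=1e-3)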

In [53]:
%timeit cuda_fft(x_gpu.copy(), xf_gpu, cuda_plan)


1000 loops, best of 3: 299 µs per loop

In [61]:
del cuda_plan

FFTW compatibility support


In [54]:
cuda_plan_fftw = Plan(shape=a.shape, in_dtype=np.complex64, out_dtype=np.complex64, batch=1, stream=None, mode=3)

In [55]:
xf_gpu_fftw = x_gpu.copy()

In [56]:
cuda_fft(x_gpu.copy(), xf_gpu_fftw, cuda_plan_fftw)

In [57]:
res_cuda_fftw = xf_gpu_fftw.get(pagelocked=True)

In [58]:
np.allclose(res_numpy, res_cuda_fftw)


Out[58]:
False

In [59]:
res_numpy, res_cuda_fftw


Out[59]:
(array([ 348.43580732  +0.j        , -140.34766665+146.86379751j,
         -20.95179516-270.26881706j, ...,   54.40852271 +18.25801042j,
         -20.95179516+270.26881706j, -140.34766665-146.86379751j]),
 array([ 348.43579102  +0.j        , -140.34764099+146.86378479j,
         -20.95178223-270.26879883j, ...,   54.40847778 +18.25805283j,
         -20.95183182+270.26873779j, -140.34759521-146.8637085j ], dtype=complex64))

In [60]:
%timeit cuda_fft(x_gpu.copy(), xf_gpu_fftw, cuda_plan_fftw)


1000 loops, best of 3: 303 µs per loop

In [62]:
del cuda_plan_fftw