In [1]:
import numpy as np
In [2]:
from scipy import fftpack
In [3]:
n = 2**15
In [4]:
a = np.random.randn(n)
In [5]:
a = a.astype(np.complex128)
In [6]:
res_numpy = np.fft.fft(a)
In [7]:
res_numpy
Out[7]:
In [8]:
res_scipy = fftpack.fft(a)
In [9]:
np.allclose(res_numpy, res_scipy)
Out[9]:
In [10]:
%timeit np.fft.fft(a)
In [11]:
%timeit fftpack.fft(a)
In [12]:
import fftw3
In [13]:
import multiprocessing
In [14]:
ncores = multiprocessing.cpu_count()
In [15]:
ncores
Out[15]:
In [16]:
afftw3 = a.astype(np.complex128)
In [17]:
output_fftw3 = afftw3.copy()
In [18]:
fftw3plan = fftw3.Plan(afftw3, output_fftw3, nthreads=ncores)  # multi-threaded FFTW plan transforming afftw3 into output_fftw3
In [19]:
fftw3plan()
In [20]:
output_fftw3
Out[20]:
In [21]:
np.allclose(res_numpy, output_fftw3)
Out[21]:
In [22]:
%timeit fftw3plan()
In [23]:
import pyfftw
In [24]:
apyfftw = pyfftw.n_byte_align_empty(n, 16, np.complex128)  # empty 16-byte-aligned array (SIMD-friendly)
In [25]:
apyfftw[:] = a[:]
In [26]:
np.allclose(apyfftw, a)
Out[26]:
Even after the first transform of a given specification has been performed, subsequent transforms are never as fast as using pyfftw.FFTW objects directly, and in many cases are substantially slower. This is because of the internal overhead of creating a new pyfftw.FFTW object on every call. For this reason a cache is provided, which it is recommended to enable whenever pyfftw.interfaces is used. Turn the cache on with pyfftw.interfaces.cache.enable().
In [27]:
pyfftw.interfaces.cache.enable()
In [28]:
pyfftw_output = pyfftw.interfaces.numpy_fft.fft(apyfftw)
In [29]:
np.allclose(res_numpy, pyfftw_output)
Out[29]:
In [30]:
%timeit pyfftw.interfaces.numpy_fft.fft(apyfftw)
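The cache only keeps the interim pyfftw.FFTW objects alive for a limited period after their last use, so transforms repeated at long intervals may still pay the planning cost again. A minimal sketch of how the cache can be tuned or switched off, using pyfftw.interfaces.cache.set_keepalive_time and pyfftw.interfaces.cache.disable; the 30-second value is an arbitrary choice for illustration:

# keep cached pyfftw.FFTW objects alive for 30 seconds after their last use
pyfftw.interfaces.cache.set_keepalive_time(30)
# the cache can be switched off again if its memory overhead matters
# pyfftw.interfaces.cache.disable()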
In [31]:
pyfftw_output2 = pyfftw.n_byte_align_empty(n, 16, np.complex128)
In [32]:
fft_object = pyfftw.FFTW(apyfftw.copy(), pyfftw_output2)
In general, the creation of the pyfftw.FFTW object clears the contents of the arrays, so the arrays should be filled or updated after creation.
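A minimal sketch of that fill-after-creation pattern, reusing n and a from above (the names ain and aout are only for illustration):

ain = pyfftw.n_byte_align_empty(n, 16, np.complex128)
aout = pyfftw.n_byte_align_empty(n, 16, np.complex128)
plan = pyfftw.FFTW(ain, aout)   # planning may overwrite both arrays
ain[:] = a                      # fill the input only after the plan exists
plan()                          # transform the current contents of ain into aout
np.allclose(np.fft.fft(a), aout)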
In [33]:
fft_object()
Out[33]:
In [34]:
pyfftw_output2
Out[34]:
In [35]:
np.allclose(res_numpy, pyfftw_output2)
Out[35]:
In [36]:
%timeit fft_object()
If you absolutely need the flexibility of dealing with pyfftw.FFTW directly, an easier option than constructing correctly sized and aligned arrays by hand is to use the convenience functions in the pyfftw.builders package. These take care of much of the difficulty in specifying the exact size and dtype requirements needed to produce a valid scheme.
In [37]:
fft_object = pyfftw.builders.fft(apyfftw, threads=4)
In [38]:
res_pyfftw = fft_object()
In [39]:
np.allclose(res_numpy, res_pyfftw)
Out[39]:
In [40]:
%timeit fft_object()
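The builders functions also pass planning options through to FFTW. A hedged sketch, reusing the ncores value from earlier; 'FFTW_PATIENT' is just one of the accepted planner_effort values and asks FFTW to spend more time searching for a fast plan, which may or may not pay off at this transform size:

# spend extra planning time in exchange for (potentially) faster execution
fft_patient = pyfftw.builders.fft(apyfftw, threads=ncores,
                                  planner_effort='FFTW_PATIENT')
res_patient = fft_patient()
np.allclose(res_numpy, res_patient)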
In [41]:
import pycuda.autoinit
In [42]:
import pycuda.gpuarray as gpuarray
In [43]:
from scikits.cuda.fft import Plan
In [44]:
from scikits.cuda.fft import fft as cuda_fft
In [ ]:
# c = pycuda.autoinit.make_default_context()
In [45]:
x_gpu = gpuarray.to_gpu(a.astype(np.complex64))  # copy a to the GPU as single-precision complex
In [46]:
x_gpu.dtype
Out[46]:
In [47]:
xf_gpu = x_gpu.copy()
In [48]:
cuda_plan = Plan(shape=a.shape, in_dtype=np.complex64, out_dtype=np.complex64, batch=1, stream=None, mode=1)  # mode sets the cuFFT FFTW-compatibility flag
In [49]:
cuda_fft(x_gpu.copy(), xf_gpu, cuda_plan)
In [50]:
res_cuda = xf_gpu.get(pagelocked=True)
In [51]:
np.allclose(res_numpy, res_cuda)
Out[51]:
In [52]:
res_numpy, res_cuda
Out[52]:
In [53]:
%timeit cuda_fft(x_gpu.copy(), xf_gpu, cuda_plan)
In [61]:
del cuda_plan
In [54]:
cuda_plan_fftw = Plan(shape=a.shape, in_dtype=np.complex64, out_dtype=np.complex64, batch=1, stream=None, mode=3)  # same transform with a different cuFFT compatibility mode
In [55]:
xf_gpu_fftw = x_gpu.copy()
In [56]:
cuda_fft(x_gpu.copy(), xf_gpu_fftw, cuda_plan_fftw)
In [57]:
res_cuda_fftw = xf_gpu_fftw.get(pagelocked=True)
In [58]:
np.allclose(res_numpy, res_cuda_fftw)
Out[58]:
In [59]:
res_numpy, res_cuda_fftw
Out[59]:
In [60]:
%timeit cuda_fft(x_gpu.copy(), xf_gpu_fftw, cuda_plan_fftw)
In [62]:
del cuda_plan_fftw