In [1]:
import numpy as np

In [2]:
from scipy import fftpack

In [3]:
n = 2**15

In [4]:
a = np.random.randn(n)

In [5]:
a = a.astype(np.complex128)

NumPy and SciPy


In [6]:
res_numpy = np.fft.fft(a)

In [7]:
res_numpy


Out[7]:
array([ 348.43580732  +0.j        , -140.34766665+146.86379751j,
        -20.95179516-270.26881706j, ...,   54.40852271 +18.25801042j,
        -20.95179516+270.26881706j, -140.34766665-146.86379751j])

In [8]:
res_scipy = fftpack.fft(a)

In [9]:
np.allclose(res_numpy, res_scipy)


Out[9]:
True

In [10]:
%timeit np.fft.fft(a)


100 loops, best of 3: 3.29 ms per loop

In [11]:
%timeit fftpack.fft(a)


100 loops, best of 3: 3.35 ms per loop

FFTW3


In [12]:
import fftw3

In [13]:
import multiprocessing

In [14]:
ncores = multiprocessing.cpu_count()

In [15]:
ncores


Out[15]:
4

In [16]:
afftw3 = a.astype(np.complex128)

In [17]:
output_fftw3 = afftw3.copy()

In [18]:
fftw3plan = fftw3.Plan(afftw3, output_fftw3, nthreads=ncores)

In [19]:
fftw3plan()

In [20]:
output_fftw3


Out[20]:
array([ 348.43580732  +0.j        , -140.34766665+146.86379751j,
        -20.95179516-270.26881706j, ...,   54.40852271 +18.25801042j,
        -20.95179516+270.26881706j, -140.34766665-146.86379751j])

In [21]:
np.allclose(res_numpy, output_fftw3)


Out[21]:
True

In [22]:
%timeit fftw3plan()


1000 loops, best of 3: 610 µs per loop

PyFFTW


In [23]:
import pyfftw

In [24]:
apyfftw = pyfftw.n_byte_align_empty(n, 16, np.complex128)
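
The 16-byte alignment requested above matches SSE. As a side note (pyfftw.simd_alignment is a real module attribute, but whether a wider alignment changes the timings below is an assumption), pyfftw reports the best SIMD alignment for the current CPU, which could be used in place of the hard-coded 16:


In [ ]:
# Optimal byte alignment for this CPU's SIMD instructions (e.g. 32 with AVX);
# it could replace the hard-coded 16 passed to n_byte_align_empty above.
pyfftw.simd_alignment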

In [25]:
apyfftw[:] = a[:]

In [26]:
np.allclose(apyfftw, a)


Out[26]:
True

PyFFTW with the NumPy interface

Even after the first transform of a given specification has been performed, subsequent transforms are never as fast as using pyfftw.FFTW objects directly, and in many cases are substantially slower. This is because of the internal overhead of creating a new pyfftw.FFTW object on every call. For this reason, a cache is provided and is recommended whenever pyfftw.interfaces is used. Turn it on with pyfftw.interfaces.cache.enable().


In [27]:
pyfftw.interfaces.cache.enable()
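
The cache discards objects that have not been reused for a short time; a sketch of extending that window (set_keepalive_time is part of pyfftw.interfaces.cache, and the 30-second value is only an illustrative choice):


In [ ]:
# Keep cached pyfftw.FFTW objects alive for 30 s of inactivity, so that
# repeated interface calls keep hitting the cache.
pyfftw.interfaces.cache.set_keepalive_time(30)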

In [28]:
pyfftw_output = pyfftw.interfaces.numpy_fft.fft(apyfftw)

In [29]:
np.allclose(res_numpy, pyfftw_output)


Out[29]:
True

In [30]:
%timeit pyfftw.interfaces.numpy_fft.fft(apyfftw)


1000 loops, best of 3: 767 µs per loop

PyFFTW using FFTW objects


In [31]:
pyfftw_output2 = pyfftw.n_byte_align_empty(n, 16, np.complex128)

In [32]:
fft_object = pyfftw.FFTW(apyfftw.copy(), pyfftw_output2)

In general, the creation of the pyfftw.FFTW object clears the contents of the arrays, so the arrays should be filled or updated after creation.
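
A minimal sketch of that recommended order, using only the calls already shown above (the names ain and aout are hypothetical):


In [ ]:
# Allocate aligned buffers, create the plan first, then load the data:
# planning may overwrite the buffers, so the input is filled afterwards.
ain = pyfftw.n_byte_align_empty(n, 16, np.complex128)
aout = pyfftw.n_byte_align_empty(n, 16, np.complex128)
plan = pyfftw.FFTW(ain, aout)
ain[:] = a      # fill the input only after the plan exists
plan()          # the result is written into aout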


In [33]:
fft_object()


Out[33]:
array([ 348.43580732  +0.j        , -140.34766665+146.86379751j,
        -20.95179516-270.26881706j, ...,   54.40852271 +18.25801042j,
        -20.95179516+270.26881706j, -140.34766665-146.86379751j])

In [34]:
pyfftw_output2


Out[34]:
array([ 348.43580732  +0.j        , -140.34766665+146.86379751j,
        -20.95179516-270.26881706j, ...,   54.40852271 +18.25801042j,
        -20.95179516+270.26881706j, -140.34766665-146.86379751j])

In [35]:
np.allclose(res_numpy, pyfftw_output2)


Out[35]:
True

In [36]:
%timeit fft_object()


1000 loops, best of 3: 720 µs per loop

Builders

If you need the flexibility of dealing with pyfftw.FFTW objects directly, an easier option than constructing the valid arrays and plans yourself is to use the convenient pyfftw.builders package. These functions take care of much of the difficulty in specifying the exact size and dtype requirements needed to produce a valid scheme.


In [37]:
fft_object = pyfftw.builders.fft(apyfftw, threads=4)

In [38]:
res_pyfftw = fft_object()

In [39]:
np.allclose(res_numpy, res_pyfftw)


Out[39]:
True

In [40]:
%timeit fft_object()


1000 loops, best of 3: 369 µs per loop
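
The builders functions also expose FFTW's planning flags; as a sketch (planner_effort is a real keyword of pyfftw.builders.fft, but any speed-up here is an assumption, not a measurement), a more patient plan trades longer setup for a potentially faster repeated transform:


In [ ]:
# FFTW_PATIENT makes planning slower but can yield a faster transform
# when the returned object is called many times.
fft_patient = pyfftw.builders.fft(apyfftw, threads=4,
                                  planner_effort='FFTW_PATIENT')
# %timeit fft_patient()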

scikits.cuda


In [41]:
import pycuda.autoinit

In [42]:
import pycuda.gpuarray as gpuarray

In [43]:
from scikits.cuda.fft import Plan

In [44]:
from scikits.cuda.fft import fft as cuda_fft

In [ ]:
# c = pycuda.autoinit.make_default_context()

In [45]:
x_gpu = gpuarray.to_gpu(a.astype(np.complex64))

In [46]:
x_gpu.dtype


Out[46]:
dtype('complex64')

In [47]:
xf_gpu = x_gpu.copy()

In [48]:
cuda_plan = Plan(shape=a.shape, in_dtype=np.complex64, out_dtype=np.complex64, batch=1, stream=None, mode=1)

In [49]:
cuda_fft(x_gpu.copy(), xf_gpu, cuda_plan)

In [50]:
res_cuda = xf_gpu.get(pagelocked=True)

In [51]:
np.allclose(res_numpy, res_cuda)


Out[51]:
False

In [52]:
res_numpy, res_cuda


Out[52]:
(array([ 348.43580732  +0.j        , -140.34766665+146.86379751j,
         -20.95179516-270.26881706j, ...,   54.40852271 +18.25801042j,
         -20.95179516+270.26881706j, -140.34766665-146.86379751j]),
 array([ 348.43579102  +0.j        , -140.34764099+146.86378479j,
         -20.95178223-270.26879883j, ...,   54.40847778 +18.25805283j,
         -20.95183182+270.26873779j, -140.34759521-146.8637085j ], dtype=complex64))
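
The False above is a precision effect: the GPU transform ran in single precision (complex64), so the values differ from the double-precision NumPy reference around the fifth significant figure. A check with tolerances loose enough for single precision (the exact tolerance values are an arbitrary but reasonable choice) should confirm the agreement:


In [ ]:
# Compare the complex64 GPU result against the double-precision reference
# using tolerances appropriate for single precision.
np.allclose(res_numpy, res_cuda, rtol=1e-4, atol=1e-3)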

In [53]:
%timeit cuda_fft(x_gpu.copy(), xf_gpu, cuda_plan)


1000 loops, best of 3: 299 µs per loop

In [61]:
del cuda_plan

FFTW compatibility support


In [54]:
cuda_plan_fftw = Plan(shape=a.shape, in_dtype=np.complex64, out_dtype=np.complex64, batch=1, stream=None, mode=3)

In [55]:
xf_gpu_fftw = x_gpu.copy()

In [56]:
cuda_fft(x_gpu.copy(), xf_gpu_fftw, cuda_plan_fftw)

In [57]:
res_cuda_fftw = xf_gpu_fftw.get(pagelocked=True)

In [58]:
np.allclose(res_numpy, res_cuda_fftw)


Out[58]:
False

In [59]:
res_numpy, res_cuda_fftw


Out[59]:
(array([ 348.43580732  +0.j        , -140.34766665+146.86379751j,
         -20.95179516-270.26881706j, ...,   54.40852271 +18.25801042j,
         -20.95179516+270.26881706j, -140.34766665-146.86379751j]),
 array([ 348.43579102  +0.j        , -140.34764099+146.86378479j,
         -20.95178223-270.26879883j, ...,   54.40847778 +18.25805283j,
         -20.95183182+270.26873779j, -140.34759521-146.8637085j ], dtype=complex64))

In [60]:
%timeit cuda_fft(x_gpu.copy(), xf_gpu_fftw, cuda_plan_fftw)


1000 loops, best of 3: 303 µs per loop

In [62]:
del cuda_plan_fftw