author: Diogo Silva

This notebook was created to develop and test the convergence part of the K-Means class. The initial work evaluates summing all the distances on the GPU, so that only a single number, instead of N numbers, has to be extracted from the device.


In [3]:
# Read the HOME environment variable through the IPython %env magic.
home = %env HOME

In [5]:
# Switch the working directory to QCThesis/CUDA under $home
# (presumably so the local K_Means3.py module can be imported -- TODO confirm).
cd $home/QCThesis/CUDA/


/home/diogoaos/QCThesis/CUDA

In [6]:
import numpy as np
import cProfile

In [12]:
# Import the local K_Means3 module, then reload it so that the current
# contents of K_Means3.py are used without restarting the kernel.
import K_Means3
reload(K_Means3)


Out[12]:
<module 'K_Means3' from 'K_Means3.py'>

In [9]:
from numbapro import vectorize
from numpy import arange

# Element-wise float32 addition built with vectorize for the 'cpu' target;
# it is used further down through its .reduce() method to sum an array.
@vectorize(['float32(float32, float32)'], target='cpu') # default to 'cpu'
def add2(a, b):
    """Return the sum a + b of two float32 values."""
    return a + b

# Same element-wise float32 addition as add2, but built for the 'gpu' target.
@vectorize(['float32(float32, float32)'], target='gpu') # explicit 'gpu' target (the default would be 'cpu')
def add2_gpu(a, b):
    """Return the sum a + b of two float32 values."""
    return a + b

In [10]:
n=4e6
X = np.arange(np.int(n), dtype='float32')

print 'Results:',add2.reduce(X),add2_gpu.reduce(X)
%timeit add2.reduce(X)
%timeit add2_gpu.reduce(X)


Results: 7.99429e+12 8e+12
100 loops, best of 3: 9.31 ms per loop
100 loops, best of 3: 11.4 ms per loop

In [11]:
# Profile a single GPU reduction to see how the time splits between kernel
# launches and host<->device copies. cProfile.run() prints the report itself
# and returns None, so its result is not bound to a variable.
cProfile.run("add2_gpu.reduce(X)")


         12128 function calls (12021 primitive calls) in 0.016 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.016    0.016 <string>:1(<module>)
       82    0.000    0.000    0.000    0.000 <string>:8(__new__)
      164    0.000    0.000    0.002    0.000 _methods.py:34(_prod)
        1    0.000    0.000    0.000    0.000 api.py:156(stream)
        1    0.000    0.000    0.003    0.003 api.py:23(to_device)
       27    0.000    0.000    0.000    0.000 autotune.py:61(by_occupancy)
       27    0.000    0.000    0.000    0.000 autotune.py:71(best)
       27    0.000    0.000    0.000    0.000 autotune.py:74(max_occupancy_min_blocks)
       27    0.000    0.000    0.001    0.000 compiler.py:173(copy)
       27    0.000    0.000    0.001    0.000 compiler.py:176(configure)
       27    0.000    0.000    0.000    0.000 compiler.py:251(get)
       27    0.000    0.000    0.004    0.000 compiler.py:304(__call__)
       27    0.000    0.000    0.004    0.000 compiler.py:347(_kernel_call)
       81    0.000    0.000    0.001    0.000 compiler.py:400(_prepare_args)
        1    0.000    0.000    0.000    0.000 contextlib.py:12(__init__)
        1    0.000    0.000    0.000    0.000 contextlib.py:15(__enter__)
        1    0.000    0.000    0.000    0.000 contextlib.py:21(__exit__)
        1    0.000    0.000    0.000    0.000 contextlib.py:82(helper)
       54    0.000    0.000    0.000    0.000 copy.py:306(_reconstruct)
       54    0.000    0.000    0.001    0.000 copy.py:66(copy)
       54    0.000    0.000    0.000    0.000 copy_reg.py:92(__newobj__)
       29    0.000    0.000    0.000    0.000 devicearray.py:113(_default_stream)
       83    0.000    0.000    0.000    0.000 devicearray.py:125(device_ctypes_pointer)
        1    0.000    0.000    0.003    0.003 devicearray.py:134(copy_to_device)
        1    0.000    0.000    0.000    0.000 devicearray.py:151(copy_to_host)
       81    0.000    0.000    0.004    0.000 devicearray.py:198(split)
      136    0.000    0.000    0.000    0.000 devicearray.py:22(is_cuda_ndarray)
       27    0.000    0.000    0.002    0.000 devicearray.py:248(reshape)
        1    0.000    0.000    0.000    0.000 devicearray.py:325(from_array_like)
        1    0.000    0.000    0.000    0.000 devicearray.py:344(sentry_contiguous)
       82    0.000    0.000    0.003    0.000 devicearray.py:354(auto_device)
       82    0.001    0.000    0.006    0.000 devicearray.py:55(__init__)
       31    0.000    0.000    0.000    0.000 devices.py:108(current_context)
       31    0.000    0.000    0.000    0.000 devices.py:183(get_or_create_context)
       31    0.000    0.000    0.000    0.000 devices.py:224(get_context)
        2    0.000    0.000    0.003    0.002 devices.py:236(_require_cuda_context)
       27    0.000    0.000    0.000    0.000 deviceufunc.py:109(_get_actual_args)
       27    0.000    0.000    0.000    0.000 deviceufunc.py:118(_broadcast)
       54    0.000    0.000    0.000    0.000 deviceufunc.py:123(<lambda>)
       27    0.000    0.000    0.001    0.000 deviceufunc.py:147(get_arguments)
       27    0.000    0.000    0.000    0.000 deviceufunc.py:157(get_function)
       27    0.000    0.000    0.008    0.000 deviceufunc.py:182(call)
       27    0.000    0.000    0.000    0.000 deviceufunc.py:34(__init__)
       27    0.000    0.000    0.000    0.000 deviceufunc.py:45(_fill_arrays)
       27    0.000    0.000    0.000    0.000 deviceufunc.py:60(_fill_argtypes)
       27    0.000    0.000    0.000    0.000 deviceufunc.py:68(_resolve_signature)
       81    0.000    0.000    0.000    0.000 deviceufunc.py:92(<genexpr>)
       27    0.000    0.000    0.005    0.000 dispatch.py:136(launch)
      135    0.000    0.000    0.000    0.000 dispatch.py:139(is_device_array)
       27    0.000    0.000    0.008    0.000 dispatch.py:26(__call__)
        1    0.000    0.000    0.016    0.016 dispatch.py:39(reduce)
     27/1    0.000    0.000    0.013    0.013 dispatch.py:68(__reduce)
       27    0.000    0.000    0.001    0.000 driver.py:1019(configure)
       27    0.000    0.000    0.002    0.000 driver.py:1036(__call__)
       27    0.001    0.000    0.001    0.000 driver.py:1071(launch_kernel)
       81    0.000    0.000    0.000    0.000 driver.py:1234(device_memory_size)
        2    0.000    0.000    0.000    0.000 driver.py:1248(host_pointer)
        1    0.000    0.000    0.000    0.000 driver.py:1261(host_memory_extents)
        1    0.000    0.000    0.000    0.000 driver.py:1266(memory_size_from_info)
        1    0.000    0.000    0.000    0.000 driver.py:1276(host_memory_size)
       83    0.000    0.000    0.000    0.000 driver.py:1283(device_pointer)
       83    0.000    0.000    0.000    0.000 driver.py:1288(device_ctypes_pointer)
      652    0.000    0.000    0.000    0.000 driver.py:1296(is_device_memory)
       83    0.000    0.000    0.000    0.000 driver.py:1307(require_device_memory)
        1    0.000    0.000    0.003    0.003 driver.py:1324(host_to_device)
        1    0.000    0.000    0.000    0.000 driver.py:1342(device_to_host)
       65    0.004    0.000    0.004    0.000 driver.py:213(safe_cuda_api_call)
       65    0.000    0.000    0.000    0.000 driver.py:242(_check_error)
       31    0.000    0.000    0.000    0.000 driver.py:271(get_context)
        2    0.000    0.000    0.000    0.000 driver.py:292(add_trash)
        2    0.000    0.000    0.000    0.000 driver.py:295(process)
        1    0.000    0.000    0.000    0.000 driver.py:499(memalloc)
        1    0.000    0.000    0.000    0.000 driver.py:597(create_stream)
        1    0.000    0.000    0.000    0.000 driver.py:665(_make_mem_finalizer)
        1    0.000    0.000    0.000    0.000 driver.py:666(mem_finalize)
        1    0.000    0.000    0.000    0.000 driver.py:670(core)
        1    0.000    0.000    0.000    0.000 driver.py:671(cleanup)
        1    0.000    0.000    0.000    0.000 driver.py:704(_stream_finalizer)
        1    0.000    0.000    0.000    0.000 driver.py:705(core)
        1    0.000    0.000    0.000    0.000 driver.py:706(<lambda>)
       55    0.000    0.000    0.000    0.000 driver.py:729(__init__)
       55    0.000    0.000    0.000    0.000 driver.py:740(__del__)
        1    0.000    0.000    0.000    0.000 driver.py:747(own)
       26    0.000    0.000    0.000    0.000 driver.py:750(free)
       54    0.000    0.000    0.000    0.000 driver.py:768(view)
       83    0.000    0.000    0.000    0.000 driver.py:779(device_ctypes_pointer)
       55    0.000    0.000    0.000    0.000 driver.py:839(__init__)
       55    0.000    0.000    0.000    0.000 driver.py:848(__del__)
      218    0.000    0.000    0.000    0.000 driver.py:859(__getattr__)
        1    0.000    0.000    0.000    0.000 driver.py:870(__init__)
        1    0.000    0.000    0.000    0.000 driver.py:876(__del__)
        1    0.000    0.000    0.000    0.000 driver.py:889(synchronize)
        2    0.000    0.000    0.000    0.000 driver.py:892(auto_synchronize)
      164    0.000    0.000    0.000    0.000 dummyarray.py:104(is_contiguous)
      164    0.000    0.000    0.000    0.000 dummyarray.py:108(compute_index)
      328    0.000    0.000    0.000    0.000 dummyarray.py:109(<genexpr>)
       82    0.000    0.000    0.004    0.000 dummyarray.py:148(from_desc)
       82    0.001    0.000    0.003    0.000 dummyarray.py:157(__init__)
      164    0.000    0.000    0.000    0.000 dummyarray.py:160(<genexpr>)
      164    0.000    0.000    0.000    0.000 dummyarray.py:161(<genexpr>)
       82    0.000    0.000    0.000    0.000 dummyarray.py:167(_compute_layout)
      164    0.000    0.000    0.000    0.000 dummyarray.py:179(is_contig)
       82    0.000    0.000    0.001    0.000 dummyarray.py:191(_compute_extent)
       82    0.000    0.000    0.000    0.000 dummyarray.py:27(__init__)
      164    0.000    0.000    0.000    0.000 dummyarray.py:80(get_offset)
      164    0.001    0.000    0.002    0.000 fromnumeric.py:2259(prod)
       27    0.000    0.000    0.000    0.000 plugins.py:21(autotune)
       27    0.000    0.000    0.000    0.000 plugins.py:50(forall)
       27    0.000    0.000    0.000    0.000 plugins.py:63(_compute_thread_per_block)
       27    0.000    0.000    0.000    0.000 plugins.py:77(__init__)
       27    0.000    0.000    0.005    0.000 plugins.py:84(__call__)
        2    0.000    0.000    0.000    0.000 service.py:18(service)
       62    0.000    0.000    0.000    0.000 threadlocal.py:13(stack)
       31    0.000    0.000    0.000    0.000 threadlocal.py:29(top)
       31    0.000    0.000    0.000    0.000 threadlocal.py:33(is_empty)
       31    0.000    0.000    0.000    0.000 threadlocal.py:37(__bool__)
       31    0.000    0.000    0.000    0.000 threadlocal.py:40(__nonzero__)
        1    0.000    0.000    0.000    0.000 utils.py:140(__setitem__)
      486    0.000    0.000    0.000    0.000 {_ctypes.addressof}
       33    0.000    0.000    0.000    0.000 {_ctypes.byref}
       57    0.000    0.000    0.000    0.000 {_weakref.proxy}
       27    0.000    0.000    0.000    0.000 {all}
      136    0.000    0.000    0.000    0.000 {built-in method __new__ of type object at 0x7f6f5b3ccd00}
1195/1114    0.001    0.000    0.001    0.000 {getattr}
       81    0.000    0.000    0.000    0.000 {hasattr}
       54    0.000    0.000    0.000    0.000 {id}
      576    0.000    0.000    0.000    0.000 {isinstance}
       82    0.000    0.000    0.000    0.000 {iter}
      682    0.000    0.000    0.000    0.000 {len}
       27    0.000    0.000    0.000    0.000 {math.ceil}
       27    0.000    0.000    0.000    0.000 {max}
       54    0.000    0.000    0.000    0.000 {method '__reduce_ex__' of 'object' objects}
     1307    0.000    0.000    0.000    0.000 {method 'append' of 'list' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
       27    0.000    0.000    0.000    0.000 {method 'extend' of 'list' objects}
      135    0.000    0.000    0.000    0.000 {method 'get' of 'dict' objects}
        1    0.000    0.000    0.000    0.000 {method 'keys' of 'dict' objects}
       54    0.000    0.000    0.000    0.000 {method 'pop' of 'dict' objects}
        2    0.000    0.000    0.000    0.000 {method 'pop' of 'list' objects}
      164    0.001    0.000    0.001    0.000 {method 'reduce' of 'numpy.ufunc' objects}
       54    0.000    0.000    0.000    0.000 {method 'update' of 'dict' objects}
       55    0.000    0.000    0.000    0.000 {min}
      166    0.000    0.000    0.000    0.000 {next}
        2    0.000    0.000    0.000    0.000 {numba.mviewbuf.memoryview_get_buffer}
        1    0.000    0.000    0.000    0.000 {numba.mviewbuf.memoryview_get_extents_info}
        1    0.000    0.000    0.000    0.000 {numba.mviewbuf.memoryview_get_extents}
        1    0.000    0.000    0.000    0.000 {numpy.core.multiarray.array}
      189    0.000    0.000    0.000    0.000 {range}
      164    0.000    0.000    0.000    0.000 {sum}
      300    0.000    0.000    0.000    0.000 {zip}



In [ ]: