author: Diogo Silva
This notebook was created to develop and test the convergence part of the K-Means class. The initial work evaluates summing all the distances on the GPU and extracting a single number, instead of N numbers, from the device.
In [3]:
# Read the HOME environment variable through the %env line magic so the
# project path used in the next cell is not hard-coded.
home = %env HOME
In [5]:
# Switch the working directory to the project's CUDA folder. The bare `cd`
# relies on IPython automagic, and `$home` is expanded from the Python variable.
# NOTE(review): presumably needed so that K_Means3.py is importable from the
# current directory -- confirm.
cd $home/QCThesis/CUDA/
/home/diogoaos/QCThesis/CUDA
In [6]:
import numpy as np
import cProfile
In [12]:
# Import the module under development, then reload it so that edits made to
# K_Means3.py since the first import are picked up without restarting the kernel.
import K_Means3
reload(K_Means3)
Out[12]:
<module 'K_Means3' from 'K_Means3.py'>
In [9]:
from numbapro import vectorize
from numpy import arange
# Element-wise float32 addition compiled by numbapro for the CPU target.
# It is used further down through its .reduce() method to sum a whole array.
@vectorize(['float32(float32, float32)'], target='cpu') # 'cpu' is also the default target
def add2(a, b):
return a + b
# Same float32 addition as add2, but compiled for the GPU target so that
# .reduce() performs the summation with device kernels.
@vectorize(['float32(float32, float32)'], target='gpu') # GPU target (the default would be 'cpu')
def add2_gpu(a, b):
return a + b
In [10]:
n=4e6
X = np.arange(np.int(n), dtype='float32')
print 'Results:',add2.reduce(X),add2_gpu.reduce(X)
%timeit add2.reduce(X)
%timeit add2_gpu.reduce(X)
Results: 7.99429e+12 8e+12
100 loops, best of 3: 9.31 ms per loop
100 loops, best of 3: 11.4 ms per loop
In [11]:
# Profile a single GPU reduction. cProfile.run() prints the report itself and
# returns None, so there is no result worth binding to a name.
cProfile.run("add2_gpu.reduce(X)")
12128 function calls (12021 primitive calls) in 0.016 seconds
Ordered by: standard name
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.000 0.000 0.016 0.016 <string>:1(<module>)
82 0.000 0.000 0.000 0.000 <string>:8(__new__)
164 0.000 0.000 0.002 0.000 _methods.py:34(_prod)
1 0.000 0.000 0.000 0.000 api.py:156(stream)
1 0.000 0.000 0.003 0.003 api.py:23(to_device)
27 0.000 0.000 0.000 0.000 autotune.py:61(by_occupancy)
27 0.000 0.000 0.000 0.000 autotune.py:71(best)
27 0.000 0.000 0.000 0.000 autotune.py:74(max_occupancy_min_blocks)
27 0.000 0.000 0.001 0.000 compiler.py:173(copy)
27 0.000 0.000 0.001 0.000 compiler.py:176(configure)
27 0.000 0.000 0.000 0.000 compiler.py:251(get)
27 0.000 0.000 0.004 0.000 compiler.py:304(__call__)
27 0.000 0.000 0.004 0.000 compiler.py:347(_kernel_call)
81 0.000 0.000 0.001 0.000 compiler.py:400(_prepare_args)
1 0.000 0.000 0.000 0.000 contextlib.py:12(__init__)
1 0.000 0.000 0.000 0.000 contextlib.py:15(__enter__)
1 0.000 0.000 0.000 0.000 contextlib.py:21(__exit__)
1 0.000 0.000 0.000 0.000 contextlib.py:82(helper)
54 0.000 0.000 0.000 0.000 copy.py:306(_reconstruct)
54 0.000 0.000 0.001 0.000 copy.py:66(copy)
54 0.000 0.000 0.000 0.000 copy_reg.py:92(__newobj__)
29 0.000 0.000 0.000 0.000 devicearray.py:113(_default_stream)
83 0.000 0.000 0.000 0.000 devicearray.py:125(device_ctypes_pointer)
1 0.000 0.000 0.003 0.003 devicearray.py:134(copy_to_device)
1 0.000 0.000 0.000 0.000 devicearray.py:151(copy_to_host)
81 0.000 0.000 0.004 0.000 devicearray.py:198(split)
136 0.000 0.000 0.000 0.000 devicearray.py:22(is_cuda_ndarray)
27 0.000 0.000 0.002 0.000 devicearray.py:248(reshape)
1 0.000 0.000 0.000 0.000 devicearray.py:325(from_array_like)
1 0.000 0.000 0.000 0.000 devicearray.py:344(sentry_contiguous)
82 0.000 0.000 0.003 0.000 devicearray.py:354(auto_device)
82 0.001 0.000 0.006 0.000 devicearray.py:55(__init__)
31 0.000 0.000 0.000 0.000 devices.py:108(current_context)
31 0.000 0.000 0.000 0.000 devices.py:183(get_or_create_context)
31 0.000 0.000 0.000 0.000 devices.py:224(get_context)
2 0.000 0.000 0.003 0.002 devices.py:236(_require_cuda_context)
27 0.000 0.000 0.000 0.000 deviceufunc.py:109(_get_actual_args)
27 0.000 0.000 0.000 0.000 deviceufunc.py:118(_broadcast)
54 0.000 0.000 0.000 0.000 deviceufunc.py:123(<lambda>)
27 0.000 0.000 0.001 0.000 deviceufunc.py:147(get_arguments)
27 0.000 0.000 0.000 0.000 deviceufunc.py:157(get_function)
27 0.000 0.000 0.008 0.000 deviceufunc.py:182(call)
27 0.000 0.000 0.000 0.000 deviceufunc.py:34(__init__)
27 0.000 0.000 0.000 0.000 deviceufunc.py:45(_fill_arrays)
27 0.000 0.000 0.000 0.000 deviceufunc.py:60(_fill_argtypes)
27 0.000 0.000 0.000 0.000 deviceufunc.py:68(_resolve_signature)
81 0.000 0.000 0.000 0.000 deviceufunc.py:92(<genexpr>)
27 0.000 0.000 0.005 0.000 dispatch.py:136(launch)
135 0.000 0.000 0.000 0.000 dispatch.py:139(is_device_array)
27 0.000 0.000 0.008 0.000 dispatch.py:26(__call__)
1 0.000 0.000 0.016 0.016 dispatch.py:39(reduce)
27/1 0.000 0.000 0.013 0.013 dispatch.py:68(__reduce)
27 0.000 0.000 0.001 0.000 driver.py:1019(configure)
27 0.000 0.000 0.002 0.000 driver.py:1036(__call__)
27 0.001 0.000 0.001 0.000 driver.py:1071(launch_kernel)
81 0.000 0.000 0.000 0.000 driver.py:1234(device_memory_size)
2 0.000 0.000 0.000 0.000 driver.py:1248(host_pointer)
1 0.000 0.000 0.000 0.000 driver.py:1261(host_memory_extents)
1 0.000 0.000 0.000 0.000 driver.py:1266(memory_size_from_info)
1 0.000 0.000 0.000 0.000 driver.py:1276(host_memory_size)
83 0.000 0.000 0.000 0.000 driver.py:1283(device_pointer)
83 0.000 0.000 0.000 0.000 driver.py:1288(device_ctypes_pointer)
652 0.000 0.000 0.000 0.000 driver.py:1296(is_device_memory)
83 0.000 0.000 0.000 0.000 driver.py:1307(require_device_memory)
1 0.000 0.000 0.003 0.003 driver.py:1324(host_to_device)
1 0.000 0.000 0.000 0.000 driver.py:1342(device_to_host)
65 0.004 0.000 0.004 0.000 driver.py:213(safe_cuda_api_call)
65 0.000 0.000 0.000 0.000 driver.py:242(_check_error)
31 0.000 0.000 0.000 0.000 driver.py:271(get_context)
2 0.000 0.000 0.000 0.000 driver.py:292(add_trash)
2 0.000 0.000 0.000 0.000 driver.py:295(process)
1 0.000 0.000 0.000 0.000 driver.py:499(memalloc)
1 0.000 0.000 0.000 0.000 driver.py:597(create_stream)
1 0.000 0.000 0.000 0.000 driver.py:665(_make_mem_finalizer)
1 0.000 0.000 0.000 0.000 driver.py:666(mem_finalize)
1 0.000 0.000 0.000 0.000 driver.py:670(core)
1 0.000 0.000 0.000 0.000 driver.py:671(cleanup)
1 0.000 0.000 0.000 0.000 driver.py:704(_stream_finalizer)
1 0.000 0.000 0.000 0.000 driver.py:705(core)
1 0.000 0.000 0.000 0.000 driver.py:706(<lambda>)
55 0.000 0.000 0.000 0.000 driver.py:729(__init__)
55 0.000 0.000 0.000 0.000 driver.py:740(__del__)
1 0.000 0.000 0.000 0.000 driver.py:747(own)
26 0.000 0.000 0.000 0.000 driver.py:750(free)
54 0.000 0.000 0.000 0.000 driver.py:768(view)
83 0.000 0.000 0.000 0.000 driver.py:779(device_ctypes_pointer)
55 0.000 0.000 0.000 0.000 driver.py:839(__init__)
55 0.000 0.000 0.000 0.000 driver.py:848(__del__)
218 0.000 0.000 0.000 0.000 driver.py:859(__getattr__)
1 0.000 0.000 0.000 0.000 driver.py:870(__init__)
1 0.000 0.000 0.000 0.000 driver.py:876(__del__)
1 0.000 0.000 0.000 0.000 driver.py:889(synchronize)
2 0.000 0.000 0.000 0.000 driver.py:892(auto_synchronize)
164 0.000 0.000 0.000 0.000 dummyarray.py:104(is_contiguous)
164 0.000 0.000 0.000 0.000 dummyarray.py:108(compute_index)
328 0.000 0.000 0.000 0.000 dummyarray.py:109(<genexpr>)
82 0.000 0.000 0.004 0.000 dummyarray.py:148(from_desc)
82 0.001 0.000 0.003 0.000 dummyarray.py:157(__init__)
164 0.000 0.000 0.000 0.000 dummyarray.py:160(<genexpr>)
164 0.000 0.000 0.000 0.000 dummyarray.py:161(<genexpr>)
82 0.000 0.000 0.000 0.000 dummyarray.py:167(_compute_layout)
164 0.000 0.000 0.000 0.000 dummyarray.py:179(is_contig)
82 0.000 0.000 0.001 0.000 dummyarray.py:191(_compute_extent)
82 0.000 0.000 0.000 0.000 dummyarray.py:27(__init__)
164 0.000 0.000 0.000 0.000 dummyarray.py:80(get_offset)
164 0.001 0.000 0.002 0.000 fromnumeric.py:2259(prod)
27 0.000 0.000 0.000 0.000 plugins.py:21(autotune)
27 0.000 0.000 0.000 0.000 plugins.py:50(forall)
27 0.000 0.000 0.000 0.000 plugins.py:63(_compute_thread_per_block)
27 0.000 0.000 0.000 0.000 plugins.py:77(__init__)
27 0.000 0.000 0.005 0.000 plugins.py:84(__call__)
2 0.000 0.000 0.000 0.000 service.py:18(service)
62 0.000 0.000 0.000 0.000 threadlocal.py:13(stack)
31 0.000 0.000 0.000 0.000 threadlocal.py:29(top)
31 0.000 0.000 0.000 0.000 threadlocal.py:33(is_empty)
31 0.000 0.000 0.000 0.000 threadlocal.py:37(__bool__)
31 0.000 0.000 0.000 0.000 threadlocal.py:40(__nonzero__)
1 0.000 0.000 0.000 0.000 utils.py:140(__setitem__)
486 0.000 0.000 0.000 0.000 {_ctypes.addressof}
33 0.000 0.000 0.000 0.000 {_ctypes.byref}
57 0.000 0.000 0.000 0.000 {_weakref.proxy}
27 0.000 0.000 0.000 0.000 {all}
136 0.000 0.000 0.000 0.000 {built-in method __new__ of type object at 0x7f6f5b3ccd00}
1195/1114 0.001 0.000 0.001 0.000 {getattr}
81 0.000 0.000 0.000 0.000 {hasattr}
54 0.000 0.000 0.000 0.000 {id}
576 0.000 0.000 0.000 0.000 {isinstance}
82 0.000 0.000 0.000 0.000 {iter}
682 0.000 0.000 0.000 0.000 {len}
27 0.000 0.000 0.000 0.000 {math.ceil}
27 0.000 0.000 0.000 0.000 {max}
54 0.000 0.000 0.000 0.000 {method '__reduce_ex__' of 'object' objects}
1307 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
27 0.000 0.000 0.000 0.000 {method 'extend' of 'list' objects}
135 0.000 0.000 0.000 0.000 {method 'get' of 'dict' objects}
1 0.000 0.000 0.000 0.000 {method 'keys' of 'dict' objects}
54 0.000 0.000 0.000 0.000 {method 'pop' of 'dict' objects}
2 0.000 0.000 0.000 0.000 {method 'pop' of 'list' objects}
164 0.001 0.000 0.001 0.000 {method 'reduce' of 'numpy.ufunc' objects}
54 0.000 0.000 0.000 0.000 {method 'update' of 'dict' objects}
55 0.000 0.000 0.000 0.000 {min}
166 0.000 0.000 0.000 0.000 {next}
2 0.000 0.000 0.000 0.000 {numba.mviewbuf.memoryview_get_buffer}
1 0.000 0.000 0.000 0.000 {numba.mviewbuf.memoryview_get_extents_info}
1 0.000 0.000 0.000 0.000 {numba.mviewbuf.memoryview_get_extents}
1 0.000 0.000 0.000 0.000 {numpy.core.multiarray.array}
189 0.000 0.000 0.000 0.000 {range}
164 0.000 0.000 0.000 0.000 {sum}
300 0.000 0.000 0.000 0.000 {zip}
In [ ]:
Content source: Chiroptera/QCThesis
Similar notebooks: