1 #define _CUDA_NDARRAY_C
2
3 #include <Python.h>
4 #include <structmember.h>
5 #include "theano_mod_helper.h"
6
7 #include <numpy/arrayobject.h>
8 #include <iostream>
9
10 #include "cuda_ndarray.cuh"
11
12 #ifndef CNMEM_DLLEXPORT
13 #define CNMEM_DLLEXPORT
14 #endif
15
16 #include "cnmem.h"
17 #include "cnmem.cpp"
18
19 //If true, when there is a gpu malloc or free error, we print the size of allocated memory on the device.
20 #define COMPUTE_GPU_MEM_USED 0
21
22 //If true, we fill allocated device memory with NaN.
23 #define ALLOC_MEMSET 0
24
25 //If true, we print out when we free a device pointer, uninitialize a
26 //CudaNdarray, or allocate a device pointer
27 #define PRINT_FREE_MALLOC 0
28
29 //If true, we do error checking at the start of functions, to make sure there
30 //is not a pre-existing error when the function is called.
31 //You probably need to set the environment variable
32 //CUDA_LAUNCH_BLOCKING=1, and/or modify the CNDA_THREAD_SYNC
33 //preprocessor macro in cuda_ndarray.cuh
34 //if you want this to work.
35 #define PRECHECK_ERROR 0
36
37 cublasHandle_t handle = NULL;
38 int* err_var = NULL;
39
40 /////////////////////////
41 // Alloc and Free
42 /////////////////////////
43
44 static int g_gpu_context_active = 0;
45
46
47 PyObject *
48 CudaNdarray_Dimshuffle(PyObject* _unused, PyObject* args);
49 static PyObject *CudaNdarray_get_shape(CudaNdarray *self, void *closure);
50
51
52 /**
53 *
54 * In the test program I'm using, the _outstanding_mallocs decreases with every call.
55 * This suggests there are more free() calls being made than alloc(), but I can't figure out why.
56 *
57 */
58 int _outstanding_mallocs[] = {0,0};
59
60 #if COMPUTE_GPU_MEM_USED
61 size_t _allocated_size = 0;
62 size_t _max_allocated_size = 0;
63
64 const int TABLE_SIZE = 10000;
65 struct table_struct{
66 void* ptr;
67 size_t size;
68 };
69 table_struct _alloc_size_table[TABLE_SIZE];
70 #endif
71
72 void * device_malloc(size_t size)
73 {
74 return device_malloc(size, VERBOSE_DEVICE_MALLOC);
75 }
76
77 static bool g_use_cnmem = false;
78 static const int g_max_devices = 8;
79 int initCnmem(int card_number_provided, int card_nb, size_t mem) {
80 static bool cnmemInitialized = false;
81 if(cnmemInitialized) {
82 return 0;
83 }
84 // On stderr to be at the same place as "Using gpu device..."
85 int numDevices = 0;
86 cnmemDevice_t devices[g_max_devices];
87 if(cudaGetDeviceCount(&numDevices) != cudaSuccess) {
88 PyErr_Format(PyExc_RuntimeError,
89 "initCnmem: 'cudaGetDeviceCount' failed! Reason=%s\n",
90 cudaGetErrorString(cudaGetLastError()));
91 return -1;
92 }
93 if(card_number_provided){
94 numDevices = 1;
95 int i = 0;
96 devices[i].device = card_nb;
97 devices[i].size = mem;
98 ///@TODO: thejaswi: add support for multiple streams
99 devices[i].numStreams = 0;
100 devices[i].streams = NULL;
101 devices[i].streamSizes = NULL;
102 }else{
103 for(int i=0;i<numDevices;++i) {
104 devices[i].device = i;
105 devices[i].size = mem;
106 ///@TODO: thejaswi: add support for multiple streams
107 devices[i].numStreams = 0;
108 devices[i].streams = NULL; devices[i].streamSizes = NULL; // keep the struct fully initialized, as in the single-device branch
109 }
110 }
111
112 ///@TODO: thejaswi: passing custom cnmem flags?
113 cnmemStatus_t status = cnmemInit(numDevices, devices, CNMEM_FLAGS_DEFAULT);
114 if(status != CNMEM_STATUS_SUCCESS) {
115 PyErr_Format(PyExc_RuntimeError,
116 "initCnmem: cnmemInit call failed! Reason=%s. numdev=%d\n",
117 cnmemGetErrorString(status), numDevices);
118 return -1;
119 }
120 cnmemInitialized = true;
121 return 0;
122 }
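// Illustrative only: how initCnmem above would be driven for one explicitly
// chosen device. The device number (0) and pool size (~1 GiB) are assumed
// example values, not Theano defaults.
static int example_initCnmem_single_device()
{
    // card_number_provided != 0 selects the single-device branch above;
    // returns 0 on success, or -1 with a Python error set on failure.
    return initCnmem(1, 0, (size_t)1 << 30);
}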
123
124 void * device_malloc(size_t size, int verbose)
125 {
126 #if PRECHECK_ERROR
127 cudaThreadSynchronize();
128 cudaError_t prevError = cudaGetLastError();
129 if (cudaSuccess != prevError)
130 {
131 fprintf(stderr,
132 "Error existed before calling device_malloc. %s\n",
133 cudaGetErrorString(prevError)
134 );
135 }
136 #endif
137 void * rval=NULL;
138 ///@TODO: thejaswi: support for multiple-streams?
139 if(g_use_cnmem) {
140 cnmemStatus_t status = CNMEM_STATUS_SUCCESS;
141 status = cnmemMalloc(&rval, size, NULL);
142 if(status != CNMEM_STATUS_SUCCESS) {
143 PyErr_Format(PyExc_MemoryError,
144 "Error allocating %llu bytes of device memory (%s).",
145 (unsigned long long)size, cnmemGetErrorString(status));
146 return NULL;
147 }
148 }
149 else {
150 cudaError_t err = cudaMalloc(&rval, size);
151 if (cudaSuccess != err)
152 {
153             // Clear the error flag; cudaMalloc doesn't do it.
154             // Currently this returns the same thing as err, but if in the future
155             // it returns something else, I still don't see why we should ignore
156             // it. All we want to do here is reset the flag.
157 cudaGetLastError();
158 if (verbose)
159 {
160 size_t free = 0, total = 0;
161 cudaError_t err2 = cudaMemGetInfo(&free, &total);
162 if (err2 != cudaSuccess){
163 cudaGetLastError();
164 fprintf(stderr,
165 "Error when trying to find the memory information"
166 " on the GPU: %s\n", cudaGetErrorString(err2));
167 }
168 #if COMPUTE_GPU_MEM_USED
169 fprintf(stderr,
170 "Error allocating %llu bytes of device memory (%s)."
171 " new total bytes allocated: %llu."
172 " Driver report %llu bytes free and %llu bytes total \n",
173 (unsigned long long)size, cudaGetErrorString(err), (unsigned long long)_allocated_size,
174 (unsigned long long)free, (unsigned long long)total);
175 #else
176 fprintf(stderr,
177 "Error allocating %llu bytes of device memory (%s)."
178 " Driver report %llu bytes free and %llu bytes total \n",
179 (unsigned long long)size, cudaGetErrorString(err), (unsigned long long)free, (unsigned long long)total);
180 #endif
181 }
182 PyErr_Format(PyExc_MemoryError,
183 "Error allocating %llu bytes of device memory (%s).",
184 (unsigned long long)size, cudaGetErrorString(err));
185 return NULL;
186 }
187 }
188 if (rval != NULL){
189 // Can it happen that cudaMalloc returns cudaSuccess, but returns a NULL ptr?
190 // Could this be what happens if size is 0?
191 _outstanding_mallocs[0] += 1;
192
193 #if COMPUTE_GPU_MEM_USED
194 _allocated_size += size;
195 _max_allocated_size = std::max(_max_allocated_size, _allocated_size);
196 int i = 0;
197 for(;i<TABLE_SIZE;i++){
198 if(NULL==_alloc_size_table[i].ptr){
199 _alloc_size_table[i].ptr=rval;
200 _alloc_size_table[i].size=size;
201 break;
202 }
203 }
204 if (i == TABLE_SIZE){
205 fprintf(stderr,
206 "When tracking GPU malloc, our table size wasn't big enough."
207 " So we loose some tracking. Raise the value of TABLE_SIZE in the file cuda_ndarra.cu");
208 }
209 #endif
210 }
211 //fprintf(stderr,
212 //"allocated %li bytes of device memory (%s). new total bytes allocated: %d. ptr: %p\n",
213 //(long)size, cudaGetErrorString(err),_allocated_size,rval);
214
215 if(ALLOC_MEMSET){
216 //We initialize the memory to NaN (byte pattern 0xFF) to help catch more bugs while debugging.
217 cudaMemset(rval, 0xFF, size);
218 //printf("MEMSET\n");
219 }
220 #if PRINT_FREE_MALLOC
221 fprintf(stderr, "device malloc %p of size %llu\n", rval, (unsigned long long)size);
222 #endif
223 return rval;
224 }
225
226 int device_free(void *ptr)
227 {
228 #if PRECHECK_ERROR
229 cudaThreadSynchronize();
230 cudaError_t prevError = cudaGetLastError();
231 if (cudaSuccess != prevError)
232 {
233 fprintf(stderr,
234 "Error existed before calling device_free. %s\n",
235 cudaGetErrorString(prevError)
236 );
237 }
238 #endif
239 #if PRINT_FREE_MALLOC
240 size_t free = 0, total = 0;
241 cudaError_t err2 = cudaMemGetInfo(&free, &total);
242 if (err2 != cudaSuccess){
243 cudaGetLastError();
244 fprintf(stderr,
245 "Error when tring to find the memory information"
246 " on the GPU: %s\n", cudaGetErrorString(err2));
247 }
248 #if COMPUTE_GPU_MEM_USED
249 {
250 int i = 0;
251 for(;i<TABLE_SIZE;i++)
252 if(_alloc_size_table[i].ptr==ptr){
253 break;
254 }
255 assert(i<TABLE_SIZE);
256 fprintf(stderr, "device_free %p of size %llu."
257 " Driver reports %llu bytes free and %llu bytes total \n",
258 ptr, (unsigned long long)_alloc_size_table[i].size, (unsigned long long)free, (unsigned long long)total);
259 }
260 #else
261 fprintf(stderr, "device_free %p."
262 " Driver report %d bytes free and %d bytes total \n",
263 ptr, free, total);
264 #endif
265 #endif
266
267 // if there is no gpu context, the call to cudaFree will fail; skip it entirely
268 if(!g_gpu_context_active) {
269 return 0;
270 }
271
272 ///@TODO: thejaswi: multi-stream support
273 if(g_use_cnmem) {
274 cnmemStatus_t status = cnmemFree(ptr, NULL);
275 if(status != CNMEM_STATUS_SUCCESS) {
276 fprintf(stderr, "device_free: cnmemFree call failed! Reason=%s\n",
277 cnmemGetErrorString(status));
278 }
279 }
280 else {
281 // We need to sync because Theano's GC could free intermediate variables that
282 // are still needed while GPU kernels are running or queued.
283 CNDA_BEGIN_ALLOW_THREADS
284 cudaThreadSynchronize();
285 CNDA_END_ALLOW_THREADS
286
287 cudaError_t err = cudaFree(ptr);
288 if (cudaSuccess != err)
289 {
290             // Clear the error flag; cudaFree doesn't do it.
291             // Currently this returns the same thing as err, but if in the future
292             // it returns something else, I still don't see why we should ignore
293             // it. All we want to do here is reset the flag.
294 cudaGetLastError();
295 size_t free = 0, total = 0;
296 cudaError_t err2 = cudaMemGetInfo(&free, &total);
297 if (err2 != cudaSuccess){
298 cudaGetLastError();
299 fprintf(stderr,
300 "Error when tring to find the memory information"
301 " on the GPU: %s\n", cudaGetErrorString(err2));
302 }
303 #if COMPUTE_GPU_MEM_USED
304 {
305 int i = 0;
306 for(;i<TABLE_SIZE;i++)
307 if(_alloc_size_table[i].ptr==ptr){
308 break;
309 }
310 assert(i<TABLE_SIZE);
311 fprintf(stderr,
312 "Error freeing device pointer %p (%s) of size %llu. %llu byte already allocated."
313 " Driver report %llu bytes free and %llu bytes total \n",
314 ptr, cudaGetErrorString(err),
315 (unsigned long long)_alloc_size_table[i].size, (unsigned long long)_allocated_size, (unsigned long long)free, (unsigned long long)total);
316 }
317 #else
318 fprintf(stderr,
319 "Error freeing device pointer %p (%s)."
320 " Driver report %llu bytes free and %llu bytes total \n",
321 ptr,
322 cudaGetErrorString(err), (unsigned long long)free, (unsigned long long)total);
323 #endif
324 if (NULL != PyErr_Occurred()){
325 fprintf(stderr,
326 "device_free: cudaFree() returned an error, but there is already an"
327 " Python error set. This happen during the clean up when there is a"
328 " first error and the CUDA driver is in a so bad state that it don't"
329 " work anymore. We keep the previous error set to help debugging it.");
330 return -1;
331 }
332 PyErr_Format(PyExc_MemoryError,
333 "error freeing device pointer %p (%s)",
334 ptr,
335 cudaGetErrorString(err));
336 return -1;
337 }
338 }
339 _outstanding_mallocs[0] -= (ptr != NULL);
340 #if COMPUTE_GPU_MEM_USED
341 int i=0;
342 size_t total_freed = 0;
343 for(;i<TABLE_SIZE;i++)
344 if(_alloc_size_table[i].ptr==ptr){
345 _allocated_size -= _alloc_size_table[i].size;
346 total_freed += _alloc_size_table[i].size;
347 _alloc_size_table[i].ptr=0;
348 _alloc_size_table[i].size=0;
349
350 break;
351 }
352 //if(i==TABLE_SIZE)
353 //    printf("Unallocated unknown size!\n");
354 //fprintf(stderr, "freed %li bytes of device memory (%s). %d already allocated, ptr=%p\n", (long)total_freed, cudaGetErrorString(err),_allocated_size,ptr);
355 #endif
356 return 0;
357 }
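// Usage sketch for the pair above (hypothetical caller):
//
//   void *p = device_malloc(n);    // NULL means a Python MemoryError is set
//   if (!p) return -1;
//   ... use p on the device ...
//   if (device_free(p)) return -1; // non-zero means a Python error is set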
358
359 static PyObject *
360 outstanding_mallocs(PyObject* self, PyObject * args)
361 {
362 return PyInt_FromLong(_outstanding_mallocs[0]);
363 }
364
365
366 static void *work_mem = NULL;
367 static size_t work_size = 0;
368
369 /*
370 * Returns a chunk of memory for temporary work inside of an op. You can only
371 * request a single chunk of memory at a time since it is reused.
372 */
373 void *get_work_mem(size_t sz) {
374 if (sz <= work_size)
375 return work_mem;
376 device_free(work_mem);
377 work_mem = device_malloc(sz);
378 work_size = sz;
379 if (work_mem == NULL)
380 work_size = 0;
381 return work_mem;
382 }
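// Usage sketch (hypothetical op): the buffer is shared and reused, so hold
// the pointer only for the duration of one op and never free it yourself.
//
//   float *tmp = (float*)get_work_mem(n * sizeof(float));
//   if (!tmp) return -1; // a Python MemoryError is already set
//   ... launch kernels that use tmp; the next caller may recycle it ...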
383
384 /////////////////////////
385 // Static helper methods
386 /////////////////////////
387
388 static void
389 CudaNdarray_null_init(CudaNdarray*self)
390 {
391 self->base = NULL;
392 self->nd = -1;
393 self->host_structure = NULL;
394 self->data_allocated = 0;
395 self->dev_structure_fresh = 1;
396 self->dev_structure = NULL;
397 self->devdata = NULL;
398 }
399
400 static int
401 CudaNdarray_uninit(CudaNdarray*self)
402 {
403 #if PRINT_FREE_MALLOC
404 fprintf(stderr, "CudaNdarray_uninit %p\n", self);
405 #endif
406 int rval = 0;
407 if (self->data_allocated) {
408 assert(self->devdata);
409 if (device_free(self->devdata))
410 {
411 fprintf(stderr,
412 "CudaNdarray_uninit: error freeing self->devdata. (self=%p, self->devata=%p)\n",
413 self, self->devdata);
414 rval = -1;
415 }
416 self->devdata = NULL;
417 self->data_allocated = 0;
418 }
419 if (self->dev_structure)
420 {
421 if (device_free(self->dev_structure))
422 {
423 fprintf(stderr,
424 "CudaNdarray_uninit: error freeing dev_structure memory %p (self=%p)\n",
425 self->dev_structure, self);
426 rval = -1;
427 }
428 self->dev_structure = NULL;
429 }
430 if (self->host_structure)
431 {
432 free(self->host_structure);
433 self->host_structure = NULL;
434 }
435 self->nd = -1;
436 Py_XDECREF(self->base);
437 self->base = NULL;
438 return rval;
439 }
440
441
442 //make the rightmost coords change fastest
443 //TODO: why does a downward for-loop not work????
444 //TODO: use the log2_dims and driver code to remove / and %
445 //TODO: skip the last division (when d == 0)
446 #define decl_k_elemwise_unary_rowmajor(name, F) \
447 __global__ void name (unsigned int numEls, \
448 unsigned int nd, \
449 const int * dim, \
450 const float * a_data, const int * a_str, \
451 float * z_data, const int * z_str) \
452 { \
453 const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; \
454 const unsigned int numThreads = blockDim.x * gridDim.x; \
455 \
456 for (unsigned int i = idx; i < numEls; i += numThreads) \
457 { \
458 unsigned int ii = i; \
459 const float * a_i = a_data; \
460 float * z_i = z_data; \
461 for (unsigned int _d = 0; _d < nd; ++_d) \
462 { \
463 unsigned int d = nd - _d-1; \
464 int i_d = ii % dim[d]; /* i_d is our position in the d'th dimension */ \
465 ii = ii / dim[d]; \
466 a_i += i_d * a_str[d]; /* increment our a and z pointers by i_d elements */ \
467 z_i += i_d * z_str[d]; \
468 } \
469 z_i[0] = F(a_i[0]); \
470 } \
471 }
472
473 template<typename T> __device__ T unary_copy(T a) { return a; }
474 decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy, unary_copy<float>)
475
476 template<typename T> __device__ T unary_exp(T a) { return exp(a); }
477 decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_exp, unary_exp<float>)
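// For reference: the kernel template above converts a flat element index into
// strided offsets by peeling coordinates off the rightmost (fastest-varying)
// dimension. A host-side sketch of the same arithmetic (hypothetical helper,
// not used by this module):
static int strided_offset(unsigned int i, unsigned int nd,
                          const int * dim, const int * str)
{
    int offset = 0;
    for (unsigned int _d = 0; _d < nd; ++_d)
    {
        unsigned int d = nd - _d - 1;
        offset += (i % dim[d]) * str[d]; // coordinate in dim d times its stride
        i = i / dim[d];
    }
    return offset;
}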
478
479 /////////////////////////////
480 // Satisfying reqs to be Type
481 /////////////////////////////
482
483 //DON'T use directly (if other CudaNdarrays point to it, it will cause problems)! Use Py_DECREF() instead.
484 static void
485 CudaNdarray_dealloc(CudaNdarray* self)
486 {
487 if (0) std::cerr << "CudaNdarray dealloc " << self << " " << self->devdata << '\n';
488 if(Py_REFCNT(self) > 1)
489 printf("WARNING:CudaNdarray_dealloc called when there is still active reference to it.\n");
490 CudaNdarray_uninit(self);
491 Py_TYPE(self)->tp_free((PyObject*)self);
492 --_outstanding_mallocs[1];
493 if (0)
494 {
495 fprintf(stderr, "device_malloc_counts: (device) %i (obj) %i\n",
496 _outstanding_mallocs[0],
497 _outstanding_mallocs[1]);
498 }
499 }
500
501 static PyObject *
502 CudaNdarray_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
503 {
504 CudaNdarray *self;
505
506 self = (CudaNdarray *)type->tp_alloc(type, 0);
507 if (self != NULL)
508 {
509 CudaNdarray_null_init(self);
510 ++_outstanding_mallocs[1];
511 }
512 return (PyObject *)self;
513 }
514 static int
515 CudaNdarray_init(CudaNdarray *self, PyObject *args, PyObject *kwds)
516 {
517 PyObject *arr=NULL;
518
519 if (! PyArg_ParseTuple(args, "O", &arr))
520 return -1;
521 if (! PyArray_Check(arr))
522 {
523 PyErr_SetString(PyExc_TypeError, "PyArray arg required");
524 return -1;
525 }
526 int rval = CudaNdarray_CopyFromArray(self, (PyArrayObject*)arr);
527 return rval;
528 }
529 static PyMemberDef CudaNdarray_members[] =
530 {
531 /*
532 {"first", T_OBJECT_EX, offsetof(CudaNdarray, first), 0,
533 "first name"},
534 {"last", T_OBJECT_EX, offsetof(CudaNdarray, last), 0,
535 "last name"},
536 {"number", T_INT, offsetof(CudaNdarray, number), 0,
537 "noddy number"},
538 */
539 {NULL} /* Sentinel */
540 };
541
542 PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self, PyObject *args)
543 {
544 PyObject * dtype = NULL;
545 if (args && !PyArg_ParseTuple(args, "|O", &dtype))
546 return NULL;
547 if (dtype) {
548 PyArray_Descr* dtype2;
549 // PyArray_DescrConverter tries to convert anything to a PyArray_Descr.
550 if(!PyArray_DescrConverter(dtype, &dtype2))
551 {
552 PyObject * str = PyObject_Repr(dtype);
553 PyErr_Format(PyExc_TypeError,
554 "CudaNdarray dtype parameter not understood: %s",
555 PyString_AsString(str)
556 );
557 Py_CLEAR(str);
558 return NULL;
559 }
560 int typeNum = dtype2->type_num;
561 Py_DECREF(dtype2);
562 if (typeNum != NPY_FLOAT32)
563 {
564 PyObject * str = PyObject_Repr(dtype);
565 PyErr_Format(PyExc_TypeError,
566 "CudaNdarray support only support float32 dtype, provided: %d",
567 typeNum
568 );
569 Py_CLEAR(str);
570 return NULL;
571 }
572 }
573
574 int verbose = 0;
575 if(self->nd>=0 && CudaNdarray_SIZE(self)==0){
576 npy_intp * npydims = (npy_intp*)malloc(self->nd * sizeof(npy_intp));
577 assert (npydims);
578 for (int i = 0; i < self->nd; ++i) npydims[i] = (npy_intp)(CudaNdarray_HOST_DIMS(self)[i]);
579 PyObject * rval = PyArray_SimpleNew(self->nd, npydims, REAL_TYPENUM);
580 free(npydims);
581 if (!rval){
582 return NULL;
583 }
584 assert (PyArray_ITEMSIZE((PyArrayObject *)rval) == sizeof(real));
585 return rval;
586 }
587 if ((self->nd < 0) || (self->devdata == 0))
588 {
589 PyErr_SetString(PyExc_ValueError, "can't copy from un-initialized CudaNdarray");
590 return NULL;
591 }
592 CudaNdarray * contiguous_self = NULL;
593 if (CudaNdarray_is_c_contiguous(self))
594 {
595 contiguous_self = self;
596 Py_INCREF(contiguous_self);
597 if (verbose) std::cerr << "CreateArrayObj already contiguous" << contiguous_self << '\n';
598 }
599 else
600 {
601 contiguous_self = (CudaNdarray*)CudaNdarray_Copy(self);
602 if (verbose) std::cerr << "CreateArrayObj created contiguous" << contiguous_self << '\n';
603 }
604 if (!contiguous_self)
605 {
606 return NULL;
607 }
608
609 npy_intp * npydims = (npy_intp*)malloc(self->nd * sizeof(npy_intp));
610 assert (npydims);
611 for (int i = 0; i < self->nd; ++i)
612 npydims[i] = (npy_intp)(CudaNdarray_HOST_DIMS(self)[i]);
613 PyArrayObject * rval = (PyArrayObject *) PyArray_SimpleNew(self->nd,
614 npydims,
615 REAL_TYPENUM);
616 free(npydims);
617 if (!rval)
618 {
619 Py_DECREF(contiguous_self);
620 return NULL;
621 }
622
623 assert (PyArray_ITEMSIZE(rval) == sizeof(real));
624
625 npy_intp rval_size = PyArray_SIZE(rval);
626 void *rval_data = PyArray_DATA(rval);
627 cudaError_t err;
628 CNDA_BEGIN_ALLOW_THREADS;
629
630 err = cudaMemcpy(rval_data, contiguous_self->devdata,
631 rval_size * sizeof(real),
632 cudaMemcpyDeviceToHost
633 );
634 //CNDA_THREAD_SYNC; // unneeded because cudaMemcpy is blocking anyway
635 CNDA_END_ALLOW_THREADS;
636
637 if (cudaSuccess != err)
638 {
639 PyErr_Format(PyExc_RuntimeError, "error (%s) copying data to host",
640 cudaGetErrorString(err));
641 Py_DECREF(rval);
642 rval = NULL;
643 }
644
645 Py_DECREF(contiguous_self);
646 return (PyObject *)rval;
647 }
648
649 // TODO-- we have two functions here, ZEROS and Zeros.
650 // ZEROS is meant to be called just from C code (you don't need to pass it PyObject * s)
651 // but this naming is very weird, makes it look like a macro
652 // we should figure out the correct convention and change to that
653 PyObject* CudaNdarray_ZEROS(int n, int * dims)
654 {
655
656 size_t total_elements = 1;
657
658 for(int i=0;i<n;i++){
659 // Detect overflow on unsigned integer
660 if (dims[i] != 0 && total_elements > (SIZE_MAX / dims[i])) {
661 PyErr_Format(PyExc_RuntimeError,
662 "Can't store in size_t for the bytes requested %llu * %llu",
663 (unsigned long long)total_elements,
664 (unsigned long long)dims[i]);
665 return NULL;
666 }
667 total_elements*=dims[i];
668 }
669
670 // total_elements now contains the size of the array, in reals
671 if (total_elements > (SIZE_MAX / sizeof(real))){
672 PyErr_Format(PyExc_RuntimeError,
673 "Can't store in size_t for the bytes requested %llu * 4",
674 (unsigned long long)total_elements);
675 return NULL;
676 }
677 size_t total_size = total_elements * sizeof(real);
678
679 CudaNdarray* rval = (CudaNdarray*)CudaNdarray_New();
680 if (!rval)
681 {
682 PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: call to New failed");
683 return NULL;
684 }
685
686 if (CudaNdarray_alloc_contiguous(rval, n, dims))
687 {
688 PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: allocation failed.");
689 Py_DECREF(rval);
690 return NULL;
691 }
692
693 // Fill with zeros
694 //fprintf(stdout, "Sizeof: %d\n", total_size);
695 if (cudaSuccess != cudaMemset(rval->devdata, 0, total_size))
696 {
697 PyErr_Format(PyExc_MemoryError,
698 "CudaNdarray_ZEROS: Error memsetting %llu bytes of device memory.",
699 (unsigned long long)total_size);
700 Py_DECREF(rval);
701 return NULL;
702 }
703
704 if (cnda_copy_structure_to_device(rval))
705 {
706 PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: syncing structure to device failed");
707 Py_DECREF(rval);
708 return NULL;
709 }
710 return (PyObject*) rval;
711 }
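// The two overflow guards above follow one pattern; as a standalone sketch
// (hypothetical helper, not used elsewhere in this file):
static int checked_mul_size_t(size_t a, size_t b, size_t * out)
{
    if (a != 0 && b > SIZE_MAX / a)
        return -1; // a * b would not fit in a size_t
    *out = a * b;
    return 0;
}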
712
713 // declared as a static method (hence 1st parameter is not used)
714 // Based on _Copy and _dimshuffle
715 PyObject* CudaNdarray_Zeros(PyObject* _unused, PyObject* shape)
716 {
717 if(!shape)
718 {
719 PyErr_SetString(PyExc_TypeError, "CudaNdarray_Zeros: function takes at least 1 argument (0 given)");
720 return NULL;
721 }
722 if(!PySequence_Check(shape))
723 {
724 PyErr_SetString(PyExc_TypeError, "shape argument must be a sequence");
725 return NULL;
726 }
727
728 int shplen = PySequence_Length(shape);
729
730 if (shplen == 0)
731 {
732 return CudaNdarray_ZEROS(0, NULL);
733 }
734
735 int* newdims = (int *)malloc(sizeof(int) * shplen);
736
737 if (!newdims)
738 {
739 PyErr_SetString(PyExc_MemoryError,
740 "CudaNdarray_Zeros: Failed to allocate temporary space");
741 return NULL;
742 }
743
744 // start from the end to compute strides
745 for (int i = shplen-1; i >= 0; --i)
746 {
747 PyObject* shp_el_obj = PySequence_GetItem(shape, i);
748 if(shp_el_obj == NULL)
749 {
750 // shouldn't happen since we checked length before...
751 PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_Zeros: Index out of bound in sequence");
752 free(newdims);
753 return NULL;
754 }
755
756 int shp_el = PyInt_AsLong(shp_el_obj);
757 Py_DECREF(shp_el_obj);
758
759 if (shp_el < 0)
760 {
761 PyErr_SetString(PyExc_ValueError, "CudaNdarray_Zeros: shape must contain only non-negative values for size of a dimension");
762 free(newdims);
763 return NULL;
764 }
765
766 newdims[i] = shp_el;
767 }
768
769 PyObject* rval = CudaNdarray_ZEROS(shplen,newdims);
770
771 free(newdims);
772
773 return (PyObject*)rval;
774 }
775
776
777
778
779
780 PyObject * CudaNdarray_Copy(const CudaNdarray * self)
781 {
782 PyObject * rval = CudaNdarray_New();
783 if ((!rval) || (-1 == self->nd))
784 {
785 return rval;
786 }
787 if (CudaNdarray_alloc_contiguous((CudaNdarray*)rval, self->nd, CudaNdarray_HOST_DIMS(self)))
788 {
789 Py_DECREF(rval);
790 return NULL;
791 }
792 if (CudaNdarray_CopyFromCudaNdarray((CudaNdarray*)rval, self))
793 {
794 Py_DECREF(rval);
795 return NULL;
796 }
797 return rval;
798 }
799 PyObject * CudaNdarray_DeepCopy(CudaNdarray * self, PyObject * memo)
800 {
801 assert(PyDict_Check(memo));
802 PyObject * selfkey = PyInt_FromLong((long)self);
803 assert(selfkey);
804 if (PyDict_Contains(memo, selfkey))
805 {
806 PyObject * rval = PyDict_GetItem(memo, selfkey);
807 Py_DECREF(selfkey);
808 Py_XINCREF(rval);
809 return rval;
810 }
811 else
812 {
813 PyObject * rval = CudaNdarray_Copy(self);
814 if (0) std::cerr << "DeepCopy created " << rval << " devdata " << ((CudaNdarray*)rval)->devdata << "\n";
815 if (NULL == rval)
816 {
817 Py_DECREF(selfkey);
818 return NULL;
819 }
820 if (PyDict_SetItem(memo, selfkey, rval))
821 {
822 Py_DECREF(rval);
823 Py_DECREF(selfkey);
824 return NULL;
825 }
826 Py_DECREF(selfkey);
827 return rval;
828 }
829 }
830 PyObject * CudaNdarray_ReduceSum(CudaNdarray * self, PyObject * py_reduce_mask)
831 {
832 if (!PySequence_Check(py_reduce_mask))
833 {
834 PyErr_SetString(PyExc_TypeError, "reduce_mask must be sequence of ints");
835 return NULL;
836 }
837 int len = PySequence_Length(py_reduce_mask);
838 if (len != self->nd)
839 {
840 PyErr_SetString(PyExc_TypeError, "length of reduce_mask must match self->nd");
841 return NULL;
842 }
843 CudaNdarray * self_sum = (CudaNdarray*)CudaNdarray_New();
844 if (!self_sum)
845 {
846 return NULL;
847 }
848 //TODO: allocate a fixed size dimshuffle_pattern_cache on the stack,
849 // and use it if it is big enough.
850 int * dimshuffle_pattern = (int*)malloc(len * 2 * sizeof(int));
851 int * sum_dims = dimshuffle_pattern + len;
852 int n_remaining_dims = 0;
853 if (!dimshuffle_pattern)
854 {
855 Py_DECREF(self_sum);
856 PyErr_SetString(PyExc_MemoryError, "failed to alloc internal storage");
857 return NULL;
858 }
859 for (int i = 0; i < len; ++i)
860 {
861 PyObject *o_i = PySequence_GetItem(py_reduce_mask, i);
862 int o_i_int = PyInt_AsLong(o_i);
863 Py_XDECREF(o_i);
864 if (PyErr_Occurred())
865 {
866 Py_DECREF(self_sum);
867 free(dimshuffle_pattern);
868 return NULL;
869 }
870 if (o_i_int) // this is a dimension over which we are reducing
871 {
872 sum_dims[i] = 1;
873 }
874 else
875 {
876 sum_dims[i] = CudaNdarray_HOST_DIMS(self)[i];
877 dimshuffle_pattern[n_remaining_dims++] = i;
878 }
879 }
880 if (0 || CudaNdarray_alloc_contiguous(self_sum, len, sum_dims)
881 || CudaNdarray_reduce_sum(self_sum, self)
882 || CudaNdarray_dimshuffle(self_sum, n_remaining_dims, dimshuffle_pattern))
883 {
884 Py_DECREF(self_sum);
885 free(dimshuffle_pattern);
886 return NULL;
887 }
888 free(dimshuffle_pattern);
889 return (PyObject*)self_sum;
890 }
891
892 // Reshape self to the new shape given by the tuple shape.
893 //
894 // If self is c contiguous, it returns a view. Otherwise it always does a copy.
895 // TODO: make it return a view when the strides allow it even if it is not
896 // c contiguous
897 PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape)
898 {
899 if(!CudaNdarray_is_c_contiguous(self))
900 {
901 // allocate new space
902 //TODO: test to see if we can re-use old one and take a new param to
903 // use this
904 CudaNdarray* rval = (CudaNdarray*) CudaNdarray_Copy(self);
905 if (!rval)
906 {
907 return NULL;
908 }
909
910 CudaNdarray* ret = (CudaNdarray*) CudaNdarray_Reshape(rval, shape);
911 Py_XDECREF(rval);
912 return (PyObject*)ret;
913 }
914
915 // check shape tuple
916 unsigned int rval_nd;
917 unsigned int * rval_dims;
918 size_t rval_size = 1;
919
920 if (PyTuple_Check(shape)){
921 // copy shape to integer array
922 rval_nd = PyTuple_Size(shape);
923 }else if (PyInt_Check(shape)){
924 rval_nd = 1;
925 }else{
926 PyErr_SetString(PyExc_TypeError, "shape must be tuple of integers or an integer");
927 return NULL;
928 }
929 rval_dims = (unsigned int*)malloc(rval_nd * sizeof(int));
930
931 if(PyTuple_Check(shape)){
932 for (int i = 0; i < rval_nd; ++i)
933 {
934 long dim_i = PyInt_AsLong(PyTuple_GetItem(shape, i)); //GetItem returns a borrowed reference
935 if (PyErr_Occurred()) //error in AsLong
936 {
937 free(rval_dims);
938 return NULL;
939 }
940 if(dim_i<0){ // rval_dims is unsigned, so the check must happen before the store
941 PyErr_Format(PyExc_ValueError, "Reshape has invalid dimension %ld (must be >=0)",dim_i);
942 free(rval_dims);
943 return NULL;
944 }
945 rval_dims[i] = (unsigned int)dim_i; rval_size = rval_size * rval_dims[i];
946 }
947 }else{
948 rval_size = PyInt_AsLong(shape);
949 rval_dims[0] = rval_size;
950 }
951 // calculate new size, assert same as old size
952 if (rval_size != CudaNdarray_SIZE(self))
953 {
954 PyErr_Format(PyExc_ValueError, "size must remain unchanged, changed from %lld to %lld", CudaNdarray_SIZE(self), rval_size);
955 free(rval_dims);
956 return NULL;
957 }
958 if (rval_size==0)
959 {
960 PyObject * rval = CudaNdarray_NewDims(rval_nd, rval_dims);
961 free(rval_dims);
962 return rval;
963 }
964
965 //return a view, not a copy
966 //we can do this as we checked self is c_contiguous
967 CudaNdarray * rval = (CudaNdarray * )CudaNdarray_New(rval_nd);
968
969 if (!rval || 0 != rval->data_allocated
970 ||CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self), self))
971 {
972 Py_XDECREF(rval);
973 free(rval_dims);
974 return NULL;
975 }
976 //set dim and stride
977 int size = 1;
978 for (int i = rval_nd-1; i >= 0; --i)
979 {
980 CudaNdarray_set_stride(rval, i, (rval_dims[i] == 1) ? 0 : size);
981 CudaNdarray_set_dim(rval, i, rval_dims[i]);
982 size = size * rval_dims[i];
983 }
984 free(rval_dims);
985 return (PyObject*)rval;
986 }
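// For reference: the loop above computes C-contiguous strides in elements,
// right to left, giving size-1 dimensions a stride of 0. Isolated sketch
// (hypothetical helper): dims {2,3,4} produce strides {12,4,1}.
static void c_contiguous_strides(int nd, const int * dims, int * strides)
{
    int size = 1;
    for (int i = nd - 1; i >= 0; --i)
    {
        strides[i] = (dims[i] == 1) ? 0 : size;
        size = size * dims[i];
    }
}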
987
988 PyObject * CudaNdarray_View(const CudaNdarray * self)
989 {
990 CudaNdarray * rval = (CudaNdarray*)CudaNdarray_New(self->nd);
991 if (!rval || CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self), self))
992 {
993 Py_XDECREF(rval);
994 rval = NULL;
995 }
996 else
997 {
998 for (int i = 0; i < self->nd; ++i)
999 {
1000 CudaNdarray_set_dim(rval, i, CudaNdarray_HOST_DIMS(self)[i]);
1001 CudaNdarray_set_stride(rval, i, CudaNdarray_HOST_STRIDES(self)[i]);
1002 }
1003 }
1004 return (PyObject*)rval;
1005 }
1006
1007 /*
1008 * d0,... are the output dims
1009 * indices are a list of index to operate on
1010 * They are int32 viewed as float32.
1011 * a is the output
1012 * b is the input
1013 * dB0, the source leading dimensions size
1014 */
1015 template <int operator_num>
1016 __global__ void k_take_3(const int d0, const int d1, const int d2,
1017 const npy_int64* indices,
1018 float* a,
1019 const int sA0, const int sA1, const int sA2,
1020 const float* b, const int dB0,
1021 const int sB0, const int sB1, const int sB2,
1022 int* err){
1023 for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
1024 npy_int64 idx = indices[i0];
1025 if (idx<0)
1026 idx += dB0; // To allow negative indexing.
1027 if ((idx < 0) || (idx >= dB0)){
1028 // Any value other than 0 would probably work. But to be safer, I want
1029 // to change all bits, to prevent problems with a concurrent write that
1030 // could cross a cache line. But this should not happen with the
1031 // current code and driver.
1032 *err = 0xFFFF;
1033 continue;
1034 }
1035 for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x){
1036 for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y){
1037 int a_idx = i0*sA0 + i1*sA1 + i2*sA2;
1038 int b_idx = idx*sB0 + i1*sB1 + i2*sB2;
1039 a[a_idx] = b[b_idx];
1040 }
1041 }
1042 }
1043 }
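// Index normalization used by k_take_3, as a host-side sketch (illustrative
// only): negative indices are shifted once by the source's leading dimension
// dB0, and anything still outside [0, dB0) must be reported as an error.
static npy_int64 normalize_take_index(npy_int64 idx, int dB0)
{
    if (idx < 0)
        idx += dB0; // numpy-style negative indexing
    return (idx >= 0 && idx < dB0) ? idx : -1; // -1 signals "raise"
}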
1044
1045 // We try to be similar to the PyArray_TakeFrom function
1046 //http://docs.scipy.org/doc/numpy/reference/c-api.array.html
1047 //TODO: support clip modes other than raise (clip, wrap)
1048 //self is the input that we copy data from.
1049 //The indices that we receive MUST be a CudaNdarray(float32)
1050 // that is in fact a view of int64 indices
1051 PyObject*
1052 CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
1053 int verbose = 0;
1054 PyObject * indices_obj = NULL;
1055 //int axis; Default None, that mean the flattened array.
1056 PyObject * axis_obj = Py_None;
1057 PyObject * out_obj = Py_None;
1058 PyObject * clipmode_obj = NULL;
1059 int max_threads = 1; // max threads per blocks
1060
1061 if (! PyArg_ParseTuple(args, "O|OOOi", &indices_obj, &axis_obj,
1062 &out_obj, &clipmode_obj, &max_threads))
1063 return NULL;
1064
1065 //Check argument indices
1066 //TODO: if not a numpy.ndarray, convert to numpy.ndarray
1067 //TODO: If a CudaNdarray, accept it and assume the data is int32? or a float32 view of ints?
1068 //TODO: Support ndarrays of dtypes other than int32
1069 //TODO: support lists of indices that are not c_contiguous
1070 CudaNdarray * indices = NULL;
1071 if (CudaNdarray_Check(indices_obj)) {
1072 if (verbose) printf("cudandarray indices\n");
1073 indices = (CudaNdarray*) indices_obj;
1074 Py_INCREF(indices);
1075 } else if (PyArray_Check(indices_obj)) {
1076 if (verbose) printf("ndarray indices\n");
1077 if (PyArray_TYPE((PyArrayObject *)indices_obj) != NPY_INT64) {
1078 PyErr_SetString(PyExc_TypeError,
1079 "CudaNdarray_TakeFrom: need a ndarray for indices"
1080 " with dtype int64");
1081 return NULL;
1082 }
1083 if (PyArray_NDIM(((PyArrayObject*)indices_obj)) != 1) {
1084 PyErr_SetString(PyExc_TypeError,
1085 "CudaNdarray_TakeFrom: need a CudaNdarray of"
1086 " indices with only 1 dimensions");
1087 return NULL;
1088 }
1089 // We need indices_obj to be contiguous, in order to take a view
1090 // with a different dtype.
1091 if (!PyArray_IS_C_CONTIGUOUS((PyArrayObject*) indices_obj)) {
1092 PyObject* indices_obj_contig = PyArray_NewCopy((PyArrayObject*) indices_obj, NPY_CORDER);
1093 if (!indices_obj_contig)
1094 return NULL;
1095 indices_obj = indices_obj_contig;
1096 } else {
1097 // Keep the refcount consistent
1098 Py_INCREF(indices_obj);
1099 }
1100 PyArray_Descr* float32_descr = PyArray_DescrFromType(NPY_FLOAT32);
1101 PyObject * indices_float32 = NULL;
1102 indices_float32 = PyArray_View((PyArrayObject*)indices_obj,
1103 float32_descr, NULL);
1104 if (verbose) printf("ndarray indices\n");
1105 if (!indices_float32) {
1106 Py_DECREF(indices_obj);
1107 return NULL;
1108 }
1109
1110 indices = (CudaNdarray*) CudaNdarray_New();
1111 if (verbose) printf("\nndarray after new\n");
1112 if (! indices){
1113 Py_DECREF(indices_obj);
1114 Py_DECREF(indices_float32);
1115 return NULL;
1116 }
1117 if (CudaNdarray_CopyFromArray(indices,
1118 (PyArrayObject *)indices_float32)){
1119 Py_DECREF(indices_obj);
1120 Py_DECREF(indices_float32);
1121 return NULL;
1122 }
1123 Py_DECREF(indices_obj);
1124 Py_DECREF(indices_float32);
1125 } else {
1126 PyObject* py_s = PyObject_Str(indices_obj);
1127 const char* s = PyString_AsString(py_s);
1128 Py_DECREF(py_s);
1129 PyErr_Format(PyExc_TypeError,
1130 "CudaNdarray_TakeFrom: need an ndarray of int64 or a"
1131 " CudaNdarray(float32) that is a view from int64 data"
1132 " for indices. Got %s", s);
1133 return NULL;
1134 }
1135
1136 if (verbose) {
1137 printf("indices used on the gpu\n");
1138 fprint_CudaNdarray(stdout, indices);
1139 PyObject * used_indices = CudaNdarray_CreateArrayObj(indices);
1140 PyObject_Print(used_indices, stdout, 0);
1141 Py_DECREF(used_indices);
1142 }
1143 if (verbose) printf("after print of object\n");
1144 if(!CudaNdarray_is_c_contiguous(indices)) {
1145 PyErr_SetString(PyExc_NotImplementedError,
1146 "CudaNdarray_TakeFrom: The indices must be contiguous in memory.");
1147 Py_DECREF(indices);
1148 return NULL;
1149 }
1150 int nb_indices = CudaNdarray_SIZE((CudaNdarray *)indices) / 2;// int64 are 8 bytes, float32 are 4 bytes
1151
1152 //Check argument axis
1153 //TODO: implement the default and other axis
1154 long axis = PyInt_AsLong(axis_obj);
1155
1156 if (axis != 0) {
1157 PyErr_Format(PyExc_NotImplementedError,
1158 "CudaNdarray_TakeFrom: only axis=0 is currently supported."
1159 " Got %ld.", axis);
1160 Py_DECREF(indices);
1161 return NULL;
1162 }
1163
1164 //Check argument out_obj
1165 CudaNdarray * out = NULL;
1166 if (out_obj && CudaNdarray_Check(out_obj))
1167 out = (CudaNdarray*) out_obj;
1168 if (out && (out->nd != self->nd ||
1169 CudaNdarray_HOST_DIMS(out)[0] != nb_indices))
1170 out = NULL;
1171 int * dims = (int *)malloc(sizeof(int) * self->nd);
1172 dims[0] = nb_indices;
1173
1174 for (int i=1 ; i<self->nd ; i++) {
1175 dims[i] = CudaNdarray_HOST_DIMS(self)[i];
1176 if (out && CudaNdarray_HOST_DIMS(out)[i] != dims[i]) {
1177 out = NULL;
1178 }
1179 }
1180 if (!out) {
1181 out = (CudaNdarray*)CudaNdarray_New();
1182 if (!out){
1183 Py_DECREF(indices);
1184 free(dims);
1185 return NULL;
1186 }
1187 if (CudaNdarray_alloc_contiguous(out, self->nd, dims)) {
1188 Py_DECREF(out);
1189 Py_DECREF(indices);
1190 free(dims);
1191 return NULL;
1192 }
1193 }else {
1194 Py_INCREF(out);
1195 }
1196
1197 //Check argument clipmode
1198 if (clipmode_obj) {
1199 char * clipmode = PyString_AsString(clipmode_obj);
1200 if (! clipmode){
1201 Py_DECREF(indices);
1202 Py_DECREF(out);
1203 free(dims);
1204 return NULL;
1205 }
1206 if (strcmp(clipmode, "raise") != 0) {
1207 PyErr_Format(PyExc_NotImplementedError,
1208 "CudaNdarray_TakeFrom: only the raise mode is currently supported. Got '%s'",
1209 clipmode);
1210 Py_DECREF(indices);
1211 Py_DECREF(out);
1212 free(dims);
1213 return NULL;
1214 }
1215 }
1216 void (*k3)(const int, const int, const int,
1217 const npy_int64*,
1218 float*, const int, const int, const int,
1219 const float*, const int,
1220 const int, const int, const int,
1221 int*);
1222 k3 = k_take_3<CPY>;
1223
1224 // Create the memory place that will store the error information.
1225 if(init_err_var() != 0) { Py_DECREF(indices); Py_DECREF(out); free(dims); return NULL; } // don't leak on failure
1226
1227 dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(out)[0],65535),1,1);
1228 if(CudaNdarray_HOST_DIMS(out)[0] == 0){
1229 // We take 0 elements, so no need for the rest of the code.
1230 // This speed up that case AND fix crash otherwise.
1231 free(dims);
1232 Py_DECREF(indices);
1233 return (PyObject *)out;
1234 }
1235
1236 switch (self->nd) {
1237 case 1:
1238 {
1239 dim3 n_threads(1, 1, 1);
1240 if (verbose)
1241 printf("cudaGetLastError=%d, nd=%d"
1242 " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
1243 " n_threads.x=%i, n_threads.y=%i)\n",
1244 cudaGetLastError(), self->nd,
1245 n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
1246 k3<<<n_blocks, n_threads>>>(
1247 dims[0],
1248 1,
1249 1,
1250 (npy_int64*) CudaNdarray_DEV_DATA(indices),
1251 CudaNdarray_DEV_DATA(out),
1252 CudaNdarray_HOST_STRIDES(out)[0], //strides
1253 1,
1254 1,
1255 CudaNdarray_DEV_DATA(self),
1256 CudaNdarray_HOST_DIMS(self)[0], //For indices check
1257 CudaNdarray_HOST_STRIDES(self)[0], //strides
1258 1,
1259 1,
1260 err_var);
1261 }
1262 break;
1263 case 2:
1264 {
1265 dim3 n_threads(std::min(CudaNdarray_HOST_DIMS(out)[1], max_threads), 1, 1);
1266
1267 if (verbose)
1268 printf("cudaGetLastError=%d, nd=%d"
1269 " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
1270 " n_threads.x=%i, n_threads.y=%i)\n",
1271 cudaGetLastError(), self->nd,
1272 n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
1273
1274 k3<<<n_blocks, n_threads>>>(
1275 dims[0], //dimensions
1276 dims[1],
1277 1,
1278 (npy_int64*) CudaNdarray_DEV_DATA(indices),
1279 CudaNdarray_DEV_DATA(out),
1280 CudaNdarray_HOST_STRIDES(out)[0], //strides
1281 CudaNdarray_HOST_STRIDES(out)[1],
1282 1,
1283 CudaNdarray_DEV_DATA(self),
1284 CudaNdarray_HOST_DIMS(self)[0], //For indices check
1285 CudaNdarray_HOST_STRIDES(self)[0], //strides
1286 CudaNdarray_HOST_STRIDES(self)[1],
1287 1,
1288 err_var);
1289 }
1290 break;
1291 case 3:
1292 {
1293 int ty = std::min(CudaNdarray_HOST_DIMS(out)[2], max_threads);
1294 int tx = std::min(CudaNdarray_HOST_DIMS(out)[1], max_threads / ty);
1295 dim3 n_threads(tx, ty, 1);
1296 if (verbose)
1297 printf("cudaGetLastError=%d, nd=%d"
1298 " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
1299 " n_threads.x=%i, n_threads.y=%i)\n",
1300 cudaGetLastError(), self->nd,
1301 n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
1302 k3<<<n_blocks, n_threads>>>(
1303 dims[0], //dimensions
1304 dims[1],
1305 dims[2],
1306 (npy_int64*) CudaNdarray_DEV_DATA(indices),
1307 CudaNdarray_DEV_DATA(out),
1308 CudaNdarray_HOST_STRIDES(out)[0], //strides
1309 CudaNdarray_HOST_STRIDES(out)[1],
1310 CudaNdarray_HOST_STRIDES(out)[2],
1311 CudaNdarray_DEV_DATA(self),
1312 CudaNdarray_HOST_DIMS(self)[0], //For indices check
1313 CudaNdarray_HOST_STRIDES(self)[0], //strides
1314 CudaNdarray_HOST_STRIDES(self)[1],
1315 CudaNdarray_HOST_STRIDES(self)[2],
1316 err_var);
1317 }
1318 break;
1319 default:
1320 PyErr_SetString(PyExc_NotImplementedError,
1321 "CudaNdarray_TakeFrom: only input with 1, 2 or 3"
1322 " dimensions are currently supported");
1323 free(dims); Py_DECREF(indices); Py_DECREF(out); return NULL; // don't fall through with a Python error set
1324 }
1325 free(dims);
1326 CNDA_THREAD_SYNC;
1327 cudaError_t err = cudaGetLastError();
1328 if (cudaSuccess != err) {
1329 PyErr_Format(PyExc_RuntimeError,
1330 "Cuda error: %s: %s.\n",
1331 "CudaNdarray_TakeFrom",
1332 cudaGetErrorString(err));
1333 Py_DECREF(indices);
1334 Py_DECREF(out);
1335 return NULL;
1336 }
1337
1338 int index_err = check_err_var();
1339 Py_DECREF(indices);
1340 if (index_err != 0) {
1341 Py_DECREF(out);
1342 return NULL;
1343 }
1344
1345 if (verbose) printf("TAKE SUCCEDED\n");
1346 return (PyObject *)out;
1347 }
1348
1349
1350 PyObject * CudaNdarray_SetStride(CudaNdarray * self, PyObject *args)
1351 {
1352 int pos, stride;
1353 if (! PyArg_ParseTuple(args, "ii", &pos, &stride))
1354 return NULL;
1355 if ((pos < 0) || (pos >= self->nd))
1356 {
1357 PyErr_Format(PyExc_ValueError, "position argument out of legal range [0, %i)", self->nd);
1358 return NULL;
1359 }
1360 CudaNdarray_set_stride(self, pos, stride);
1361 if (cnda_copy_structure_to_device(self))
1362 {
1363 return NULL;
1364 }
1365 Py_INCREF(Py_None);
1366 return Py_None;
1367 }
1368 PyObject * CudaNdarray_SetShapeI(CudaNdarray * self, PyObject *args)
1369 {
1370 int pos, dim;
1371 if (! PyArg_ParseTuple(args, "ii", &pos, &dim))
1372 return NULL;
1373 if ((pos < 0) || (pos >= self->nd))
1374 {
1375 PyErr_Format(PyExc_ValueError, "position argument out of legal range [0, %i)", self->nd);
1376 return NULL;
1377 }
1378 CudaNdarray_set_dim(self, pos, dim);
1379 if (cnda_copy_structure_to_device(self))
1380 {
1381 return NULL;
1382 }
1383 Py_INCREF(Py_None);
1384 return Py_None;
1385 }
1386
1387 static PyObject *
1388 CudaNdarray_exp(CudaNdarray* self)
1389 {
1390 CudaNdarray * rval = (CudaNdarray *)CudaNdarray_New();
1391 if ((NULL == rval) || CudaNdarray_alloc_contiguous(rval, self->nd, CudaNdarray_HOST_DIMS(self)))
1392 {
1393 Py_XDECREF(rval);
1394 return NULL;
1395 }
1396 unsigned int size = 1;
1397 for (int i = 0; i < self->nd; i++)
1398 {
1399 size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
1400 }
1401 unsigned int threads_per_block = std::min(size, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
1402 unsigned int n_blocks = std::min(ceil_intdiv(size,threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
1403 k_elemwise_unary_rowmajor_exp<<<n_blocks,threads_per_block>>>(size, self->nd, CudaNdarray_DEV_DIMS(self),
1404 CudaNdarray_DEV_DATA(self), CudaNdarray_DEV_STRIDES(self),
1405 CudaNdarray_DEV_DATA(rval), CudaNdarray_DEV_STRIDES(rval));
1406
1407 //TODO: don't do this right away, do it when we need the result
1408 CNDA_THREAD_SYNC;
1409 cudaError_t err = cudaGetLastError();
1410 if( cudaSuccess != err)
1411 {
1412 Py_DECREF(rval);
1413 PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kExp", cudaGetErrorString(err));
1414 return NULL;
1415 }
1416
1417 return (PyObject*)rval;
1418 }
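// Launch configuration used by the elemwise code above, in isolation (sketch;
// NUM_VECTOR_OP_* and ceil_intdiv come from cuda_ndarray.cuh, and size must
// be > 0). Any (blocks, threads) pair of at least 1x1 is correct because each
// thread strides through the array by the total number of launched threads.
static void elemwise_launch_config(unsigned int size,
                                   unsigned int * threads, unsigned int * blocks)
{
    *threads = std::min(size, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
    *blocks = std::min(ceil_intdiv(size, *threads), (unsigned int)NUM_VECTOR_OP_BLOCKS);
}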
1419
1420 static PyMethodDef CudaNdarray_methods[] =
1421 {
1422 {"__array__",
1423 (PyCFunction)CudaNdarray_CreateArrayObj, METH_VARARGS,
1424 "Copy from the device to a numpy ndarray"},
1425 {"__copy__",
1426 (PyCFunction)CudaNdarray_View, METH_NOARGS,
1427 "Create a shallow copy of this object. used by module copy"},
1428 {"__deepcopy__",
1429 (PyCFunction)CudaNdarray_DeepCopy, METH_O,
1430 "Create a copy of this object"},
1431 {"zeros",
1432 (PyCFunction)CudaNdarray_Zeros, METH_STATIC | METH_O,
1433 "Create a new CudaNdarray with specified shape, filled with zeros."},
1434 {"copy",
1435 (PyCFunction)CudaNdarray_Copy, METH_NOARGS,
1436 "Create a copy of this object"},
1437 {"is_c_contiguous",
1438 (PyCFunction)CudaNdarray_IS_C_Contiguous, METH_NOARGS,
1439 "Return True is the object is c contiguous. False otherwise."},
1440 {"reduce_sum",
1441 (PyCFunction)CudaNdarray_ReduceSum, METH_O,
1442 "Reduce over the given dimensions by summation"},
1443 {"exp",
1444 (PyCFunction)CudaNdarray_exp, METH_NOARGS,
1445 "Return the exponential of all elements"},
1446 {"reshape",
1447 (PyCFunction)CudaNdarray_Reshape, METH_O,
1448 "Return a reshaped view (or copy) of this ndarray\n\
1449 The required argument is a tuple of integers specifying the shape of the new ndarray."},
1450 {"view",
1451 (PyCFunction)CudaNdarray_View, METH_NOARGS,
1452 "Return an alias of this ndarray"},
1453 {"_set_stride",
1454 (PyCFunction)CudaNdarray_SetStride, METH_VARARGS,
1455 "For integer arguments (i, s), set the 'i'th stride to 's'"},
1456 {"take",
1457 (PyCFunction)CudaNdarray_TakeFrom, METH_VARARGS,
1458 "Equivalent of numpy.take"},
1459 {"_set_shape_i",
1460 (PyCFunction)CudaNdarray_SetShapeI, METH_VARARGS,
1461 "For integer arguments (i, s), set the 'i'th shape to 's'"},
1462 {NULL, NULL, 0, NULL} /* Sentinel */
1463 };
1464
1465
1466 ////////////////////
1467 // Number protocol
1468 ////////////////////
1469
1470 __global__ void kAdd_contiguous(float* a, float* b, float* dest, unsigned int numEls) {
1471 const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
1472 const unsigned int numThreads = blockDim.x * gridDim.x;
1473
1474 for (unsigned int i = idx; i < numEls; i += numThreads) {
1475 dest[i] = a[i] + b[i];
1476 }
1477 }
1478
1479 // Will be called by __add__ in Python
1480 static PyObject *
1481 CudaNdarray_add(PyObject* py_self, PyObject * py_other)
1482 {
1483 if (! CudaNdarray_Check(py_self)) {
1484 PyErr_SetString(PyExc_TypeError, "need a CudaNdarray on left");
1485 return NULL;
1486 }
1487 if (! CudaNdarray_Check(py_other)) {
1488 PyErr_SetString(PyExc_TypeError, "need a CudaNdarray on right");
1489 return NULL;
1490 }
1491 CudaNdarray * self = (CudaNdarray *)py_self;
1492 CudaNdarray * other = (CudaNdarray *)py_other;
1493 if(!CudaNdarray_is_c_contiguous(self) || !CudaNdarray_is_c_contiguous(other)){
1494 PyErr_SetString(PyExc_TypeError, "We have implementet only the c_contiguous version for now.");
1495 return NULL;
1496 }
1497
1498 //standard elemwise size checks
1499 if (self->nd != other->nd)
1500 {
1501 PyErr_SetString(PyExc_TypeError, "CudaNdarray_add: need same number of dims");
1502 return NULL;
1503 }
1504 //standard elemwise dim checks
1505 unsigned int size = 1;
1506 for (int i = 0; i< self->nd; ++i)
1507 {
1508 if (CudaNdarray_HOST_DIMS(self)[i] != CudaNdarray_HOST_DIMS(other)[i])
1509 {
1510 PyErr_SetString(PyExc_TypeError, "need same dimensions");
1511 return NULL;
1512 }
1513 size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
1514 }
1515 CudaNdarray * rval = (CudaNdarray *)CudaNdarray_New();
1516 if (!rval || CudaNdarray_alloc_contiguous(rval, self->nd, CudaNdarray_HOST_DIMS(self)))
1517 {
1518 Py_XDECREF(rval);
1519 return NULL;
1520 }
1521
1522 if(CudaNdarray_SIZE((CudaNdarray *)py_self)==0 && CudaNdarray_SIZE((CudaNdarray *)py_other)==0){
1523 return (PyObject *) rval;
1524 }
1525
1526 int threads_per_block = std::min(size, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
1527 int n_blocks = std::min(ceil_intdiv(size,(unsigned int)threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
1528 kAdd_contiguous<<<n_blocks,threads_per_block>>>(
1529 self->devdata, other->devdata, rval->devdata, size);
1530 CNDA_THREAD_SYNC;
1531 cudaError_t err = cudaGetLastError();
1532 if( cudaSuccess != err)
1533 {
1534 PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kAdd", cudaGetErrorString(err));
1535 Py_DECREF(rval);
1536 return NULL;
1537 }
1538 return (PyObject *) rval;
1539 }
1540
1541 template <int operator_num>
1542 __global__ void k_ielem_3(const int d0, const int d1, const int d2,
1543 float* a, const int sA0, const int sA1, const int sA2,
1544 const float* b, const int sB0, const int sB1, const int sB2){
1545 for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
1546 for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){
1547 for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x){
1548 switch (operator_num)
1549 {
1550 case IADD:
1551 a[i0*sA0 + i1*sA1 + i2*sA2] += b[i0*sB0 + i1*sB1 + i2*sB2];
1552 break;
1553 case IDIV:
1554 a[i0*sA0 + i1*sA1 + i2*sA2] /= b[i0*sB0 + i1*sB1 + i2*sB2];
1555 break;
1556 case CPY:
1557 a[i0*sA0 + i1*sA1 + i2*sA2] = b[i0*sB0 + i1*sB1 + i2*sB2];
1558 break;
1559 }
1560 }
1561 }
1562 }
1563 }
1564
1565 template <int operator_num>
1566 __global__ void k_ielem_4(const int d0, const int d1, const int d2, const int d3,
1567 float* a, const int sA0, const int sA1,
1568 const int sA2, const int sA3,
1569 const float* b, const int sB0, const int sB1,
1570 const int sB2, const int sB3){
1571 for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
1572 for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){
1573 for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x){
1574 for (int i3 = threadIdx.y; i3 < d3; i3 += blockDim.y){
1575 switch (operator_num) {
1576 case IADD:
1577 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3]
1578 += b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3];
1579 break;
1580 case IDIV:
1581 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3]
1582 /= b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3];
1583 break;
1584 case CPY:
1585 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3]
1586 = b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3];
1587 break;
1588 }
1589 }
1590 }
1591 }
1592 }
1593 }
1594
1595 template <int operator_num>
1596 __global__ void k_ielem_6(const int d0, const int d1,
1597 const int d2, const int d3,
1598 const int d4, const int d5,
1599 float* a, const int sA0, const int sA1,
1600 const int sA2, const int sA3,
1601 const int sA4, const int sA5,
1602 const float* b, const int sB0, const int sB1,
1603 const int sB2, const int sB3,
1604 const int sB4, const int sB5
1605 ){
1606 for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
1607 for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){
1608 for (int i2 = blockIdx.z; i2 < d2; i2 += gridDim.z){
1609 for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x){
1610 for (int i4 = threadIdx.y; i4 < d4; i4 += blockDim.y){
1611 for (int i5 = threadIdx.z; i5 < d5; i5 += blockDim.z){
1612 switch (operator_num) {
1613 case IADD:
1614 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5]
1615 += b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3 + i4*sB4 + i5*sB5];
1616 break;
1617 case IDIV:
1618 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5]
1619 /= b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3 + i4*sB4 + i5*sB5];
1620 break;
1621 case CPY:
1622 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5]
1623 = b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3 + i4*sB4 + i5*sB5];
1624 break;
1625 }
1626 }
1627 }
1628 }
1629 }
1630 }
1631 }
1632 }
1633
1634 /*
1635 CudaNdarray_inplace_elemwise
1636 Compute elemwise, working inplace on A.
1637 Currently implemented A / B, A + B and A = B
1638 (the last is not tested and not used!)
1639
1640 py_self - the CudaNdarray that we'll modify (A)
1641 py_other - the other argument (B)
1642 fct_nb - which operation to perform (operator_t)
1643
1644 Returns 0 on success.
1645 Returns -1 on failure, and sets Python exception.
1646
1647 */
1648 int
1649 CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t fct_nb)
1650 {
1651 int verbose = 0;
1652 void (*k3)(const int, const int, const int,
1653 float*, const int, const int, const int,
1654 const float*, const int, const int, const int);
1655 void (*k4)(const int, const int, const int, const int,
1656 float*, const int, const int,
1657 const int, const int,
1658 const float*, const int, const int,
1659 const int, const int);
1660 void (*k6)(const int, const int,
1661 const int, const int,
1662 const int, const int,
1663 float*, const int, const int,
1664 const int, const int,
1665 const int, const int,
1666 const float*, const int, const int,
1667 const int, const int,
1668 const int, const int);
1669 switch (fct_nb)
1670 {
1671 case IADD:
1672 k3 = k_ielem_3<IADD>;
1673 k4 = k_ielem_4<IADD>;
1674 k6 = k_ielem_6<IADD>;
1675 break;
1676 case IDIV:
1677 k3 = k_ielem_3<IDIV>;
1678 k4 = k_ielem_4<IDIV>;
1679 k6 = k_ielem_6<IDIV>;
1680 break;
1681 case CPY:
1682 k3 = k_ielem_3<CPY>;
1683 k4 = k_ielem_4<CPY>;
1684 k6 = k_ielem_6<CPY>;
1685 break;
1686 default:
1687 assert (0);
1688 PyErr_Format(
1689 PyExc_TypeError,
1690 "CudaNdarray_inplace_elemwise invalid fct_nb (%i).",
1691 (int)fct_nb);
1692 return -1;
1693 }
1694 if (!CudaNdarray_Check(py_self)) {
1695 PyErr_SetString(
1696 PyExc_TypeError,
1697 "CudaNdarray_inplace_elemwise need a CudaNdarray on left");
1698 return -1;
1699 }
1700 CudaNdarray * new_other = NULL;
1701 if (!CudaNdarray_Check(py_other)) {
1702 new_other = (CudaNdarray*) CudaNdarray_New();
1703 if(!new_other)
1704 {
1705 return -1;
1706 }
1707 if(CudaNdarray_CopyFromArray(new_other, (PyArrayObject *) py_other))
1708 {
1709 Py_XDECREF(new_other);
1710 return -1;
1711 }
1712 py_other = (PyObject *) new_other;
1713 }
1714
1715 CudaNdarray * self = (CudaNdarray *)py_self;
1716 CudaNdarray * other = (CudaNdarray *)py_other;
1717
1718 if (verbose)
1719 {
1720 fprintf(stderr,
1721 "INPLACE ADD/DIV for self->nd=%d other->nd=%d\n",
1722 self->nd, other->nd);
1723 }
1724
1725 //standard elemwise nb dim checks
1726 if (self->nd < other->nd)
1727 {
1728 PyErr_Format(
1729 PyExc_TypeError,
1730 "CudaNdarray_inplace_elemwise: The destination need more or the"
1731 " same number of dimensions then the source. Got %d and %d.",
1732 self->nd, other->nd);
1733 Py_XDECREF(new_other);
1734 return -1;
1735 }
1736
1737 //broadcast to the same number of dimensions.
1738 int* other_dims = (int*) alloca(self->nd * sizeof(int));
1739 int* other_strides = (int*) alloca(self->nd * sizeof(int));
1740 int added_dims = self->nd - other->nd;
1741 // Add the added broadcasted dimensions
1742 for (int i = 0; i< added_dims; ++i)
1743 {
1744 other_dims[i] = 1;
1745 other_strides[i] = 0;
1746 }
1747 // Copy the existing dimensions
1748 for (int i = 0; i< other->nd; ++i)
1749 {
1750 other_dims[i+added_dims] = CudaNdarray_HOST_DIMS(other)[i];
1751 other_strides[i+added_dims] = CudaNdarray_HOST_STRIDES(other)[i];
1752 }
1753
1754 //standard elemwise dim checks
1755 unsigned int size = 1;
1756 for (int i = 0; i< self->nd; ++i)
1757 {
1758 if ((CudaNdarray_HOST_DIMS(self)[i] != other_dims[i])
1759 && (other_dims[i] != 1))
1760 {
1761 PyErr_SetString(
1762 PyExc_ValueError,
1763 "CudaNdarray_inplace_elemwise need same dimensions (or broadcastable dimension)");
1764 Py_XDECREF(new_other);
1765 return -1;
1766 }
1767 // if we're broadcasting other, then make sure it has stride 0
1768 assert ((CudaNdarray_HOST_DIMS(self)[i] == other_dims[i])
1769 || (other_strides[i] == 0));
1770 size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
1771 }
1772
1773 if (size==0)
1774 {
1775 int other_size = CudaNdarray_SIZE((CudaNdarray *)py_other);
1776 if (!(other_size == 0 || other_size == 1))
1777 {
1778 PyErr_SetString(
1779 PyExc_ValueError,
1780 "CudaNdarray_inplace_elemwise cannot work inplace on"
1781 " un-initialized array when the new value have more than"
1782 " 0 or 1 broadcastable dimensions");
1783 Py_XDECREF(new_other);
1784 return 0;
1785 }
1786 Py_XDECREF(new_other);
1787 return 0;
1788 }
1789
1790 switch(self->nd)
1791 {
1792 case 0:
1793 {
1794 dim3 n_blocks(1, 1, 1);
1795 dim3 n_threads(1);
1796 k3<<<n_blocks, n_threads>>>(
1797 1, //d0
1798 1, //d1
1799 1, //d2
1800 CudaNdarray_DEV_DATA(self),
1801 1, //strides
1802 1,
1803 1,
1804 CudaNdarray_DEV_DATA(other),
1805 1, //strides
1806 1,
1807 1);
1808 CNDA_THREAD_SYNC;
1809 cudaError_t err = cudaGetLastError();
1810 if (cudaSuccess != err)
1811 {
1812 PyErr_Format(
1813 PyExc_RuntimeError,
1814 "CudaNdarray_inplace_elemwise case0: Cuda error: %s: %s.\n",
1815 "k3",
1816 cudaGetErrorString(err));
1817 Py_XDECREF(new_other);
1818 return -1;
1819 }
1820 }
1821 break;
1822 case 1:
1823 {
1824 dim3 n_blocks(1, 1, 1);
1825 dim3 n_threads(
1826 std::min(
1827 CudaNdarray_HOST_DIMS(self)[0],
1828 NUM_VECTOR_OP_THREADS_PER_BLOCK));
1829 k3<<<n_blocks, n_threads>>>(
1830 1, //dimensions
1831 1,
1832 CudaNdarray_HOST_DIMS(self)[0],
1833 CudaNdarray_DEV_DATA(self),
1834 1, //strides
1835 1,
1836 CudaNdarray_HOST_STRIDES(self)[0],
1837 CudaNdarray_DEV_DATA(other),
1838 1, //strides
1839 1,
1840 other_strides[0]);
1841 CNDA_THREAD_SYNC;
1842 cudaError_t err = cudaGetLastError();
1843 if (cudaSuccess != err)
1844 {
1845 PyErr_Format(
1846 PyExc_RuntimeError,
1847 "CudaNdarray_inplace_elemwise case1: Cuda error: %s: %s.\n",
1848 "k3",
1849 cudaGetErrorString(err));
1850 Py_XDECREF(new_other);
1851 return -1;
1852 }
1853 }
1854 break;
1855 case 2:
1856 {
1857 //TODO: if both self and other are f-contiguous
1858 // Then flip the block and thread dimensions
1859 // to make contiguous reads & writes
1860 dim3 n_blocks(1,
1861 std::min(
1862 CudaNdarray_HOST_DIMS(self)[0],
1863 NUM_VECTOR_OP_BLOCKS));
1864 dim3 n_threads(
1865 std::min(
1866 CudaNdarray_HOST_DIMS(self)[1],
1867 NUM_VECTOR_OP_THREADS_PER_BLOCK));
1868 k3<<<n_blocks, n_threads>>>(1,
1869 CudaNdarray_HOST_DIMS(self)[0],
1870 CudaNdarray_HOST_DIMS(self)[1],
1871 CudaNdarray_DEV_DATA(self),
1872 1,
1873 CudaNdarray_HOST_STRIDES(self)[0],
1874 CudaNdarray_HOST_STRIDES(self)[1],
1875 CudaNdarray_DEV_DATA(other),
1876 1,
1877 other_strides[0],
1878 other_strides[1]);
1879 CNDA_THREAD_SYNC;
1880 cudaError_t err = cudaGetLastError();
1881 if (cudaSuccess != err)
1882 {
1883 PyErr_Format(
1884 PyExc_RuntimeError,
1885 "CudaNdarray_inplace_elemwise case2: Cuda error: %s: %s.\n",
1886 "k3",
1887 cudaGetErrorString(err));
1888 Py_XDECREF(new_other);
1889 return -1;
1890 }
1891 }
1892 break;
1893 case 3:
1894 {
1895 //TODO: Dimshuffle so that at least one of the arrays
1896 // has a contiguous dimension on the thread idx.
1897 dim3 n_blocks(
1898 std::min(
1899 CudaNdarray_HOST_DIMS(self)[0],
1900 NUM_VECTOR_OP_BLOCKS),
1901 CudaNdarray_HOST_DIMS(self)[1]);
1902 while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
1903 n_blocks.y /= 2;
1904 dim3 n_threads(
1905 std::min(
1906 CudaNdarray_HOST_DIMS(self)[2],
1907 NUM_VECTOR_OP_THREADS_PER_BLOCK));
1908 k3<<<n_blocks, n_threads>>>(
1909 CudaNdarray_HOST_DIMS(self)[0],
1910 CudaNdarray_HOST_DIMS(self)[1],
1911 CudaNdarray_HOST_DIMS(self)[2],
1912 CudaNdarray_DEV_DATA(self),
1913 CudaNdarray_HOST_STRIDES(self)[0],
1914 CudaNdarray_HOST_STRIDES(self)[1],
1915 CudaNdarray_HOST_STRIDES(self)[2],
1916 CudaNdarray_DEV_DATA(other),
1917 other_strides[0],
1918 other_strides[1],
1919 other_strides[2]);
1920 CNDA_THREAD_SYNC;
1921 cudaError_t err = cudaGetLastError();
1922 if (cudaSuccess != err)
1923 {
1924 PyErr_Format(
1925 PyExc_RuntimeError,
1926 "CudaNdarray_inplace_elemwise case3: Cuda error: %s: %s.\n",
1927 "k3",
1928 cudaGetErrorString(err));
1929 Py_XDECREF(new_other);
1930 return -1;
1931 }
1932 }
1933 break;
1934 case 4:
1935 {
1936 dim3 n_blocks(
1937 std::min(
1938 CudaNdarray_HOST_DIMS(self)[0],
1939 NUM_VECTOR_OP_BLOCKS),
1940 CudaNdarray_HOST_DIMS(self)[1]
1941 );
1942 while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
1943 n_blocks.y /= 2;
1944 dim3 n_threads(
1945 std::min(
1946 CudaNdarray_HOST_DIMS(self)[2],
1947 NUM_VECTOR_OP_THREADS_PER_BLOCK)
1948                 //TODO: DON'T YOU NEED TO PUT DIMS[3] in here???
1949 );
1950 k4<<<n_blocks, n_threads>>>(
1951 CudaNdarray_HOST_DIMS(self)[0],
1952 CudaNdarray_HOST_DIMS(self)[1],
1953 CudaNdarray_HOST_DIMS(self)[2],
1954 CudaNdarray_HOST_DIMS(self)[3],
1955 CudaNdarray_DEV_DATA(self),
1956 CudaNdarray_HOST_STRIDES(self)[0],
1957 CudaNdarray_HOST_STRIDES(self)[1],
1958 CudaNdarray_HOST_STRIDES(self)[2],
1959 CudaNdarray_HOST_STRIDES(self)[3],
1960 CudaNdarray_DEV_DATA(other),
1961 other_strides[0],
1962 other_strides[1],
1963 other_strides[2],
1964 other_strides[3]);
1965 CNDA_THREAD_SYNC;
1966 cudaError_t err = cudaGetLastError();
1967 if (cudaSuccess != err)
1968 {
1969 PyErr_Format(
1970 PyExc_RuntimeError,
1971 "CudaNdarray_inplace_elemwise case4: Cuda error: %s: %s.\n",
1972 "k4",
1973 cudaGetErrorString(err));
1974 Py_XDECREF(new_other);
1975 return -1;
1976 }
1977 }
1978 break;
1979 case 5:
1980 {
1981 dim3 n_blocks(
1982 std::min(
1983 CudaNdarray_HOST_DIMS(self)[1],
1984 NUM_VECTOR_OP_BLOCKS),
1985 CudaNdarray_HOST_DIMS(self)[2]);
1986 while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
1987 n_blocks.y /= 2;
1988 dim3 n_threads(
1989 std::min(
1990 CudaNdarray_HOST_DIMS(self)[3],
1991 NUM_VECTOR_OP_THREADS_PER_BLOCK)
1992                 //TODO: DON'T YOU NEED TO PUT DIMS[4] in here???
1993 );
1994 for (int i = 0; i < CudaNdarray_HOST_DIMS(self)[0]; ++i)
1995 {
1996 k4<<<n_blocks, n_threads>>>(
1997 CudaNdarray_HOST_DIMS(self)[1],
1998 CudaNdarray_HOST_DIMS(self)[2],
1999 CudaNdarray_HOST_DIMS(self)[3],
2000 CudaNdarray_HOST_DIMS(self)[4],
2001 CudaNdarray_DEV_DATA(self) + i * CudaNdarray_HOST_STRIDES(self)[0],
2002 CudaNdarray_HOST_STRIDES(self)[1],
2003 CudaNdarray_HOST_STRIDES(self)[2],
2004 CudaNdarray_HOST_STRIDES(self)[3],
2005 CudaNdarray_HOST_STRIDES(self)[4],
2006 CudaNdarray_DEV_DATA(other) + i * other_strides[0],
2007 other_strides[1],
2008 other_strides[2],
2009 other_strides[3],
2010 other_strides[4]);
2011 CNDA_THREAD_SYNC;
2012 cudaError_t err = cudaGetLastError();
2013 if( cudaSuccess != err)
2014 {
2015 PyErr_Format(
2016 PyExc_RuntimeError,
2017 "CudaNdarray_inplace_elemwise case5: Cuda error: %s: %s. n_block=(%ld,%ld) n_threads=%ld\n",
2018 "k5 with loop over k4",
2019 cudaGetErrorString(err),
2020 (long) n_blocks.x, (long) n_blocks.y, (long) n_threads.x);
2021 Py_XDECREF(new_other);
2022 return -1;
2023 }
2024 }
2025 }
2026 break;
2027 case 6:
2028 {
2029 dim3 n_blocks(
2030 std::min(
2031 CudaNdarray_HOST_DIMS(self)[0],
2032 NUM_VECTOR_OP_BLOCKS),
2033 CudaNdarray_HOST_DIMS(self)[1],
2034 CudaNdarray_HOST_DIMS(self)[2]
2035 );
2036 while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
2037 n_blocks.y /= 2;
2038             // The GTX 285 (compute capability 1.3) doesn't support n_blocks.z > 1;
2039             // devices of compute capability 2.0 support up to 65535 for n_blocks.z.
2040 //while (n_blocks.x * n_blocks.y * n_blocks.z > NUM_VECTOR_OP_BLOCKS)
2041 // n_blocks.z /= 2;
2042 n_blocks.z = 1;
2043 dim3 n_threads(
2044 std::min(
2045 CudaNdarray_HOST_DIMS(self)[3],
2046 NUM_VECTOR_OP_THREADS_PER_BLOCK)
2047 //TODO: DON'T YOU NEED TO PUT DIMS[4] in here???
2048 //TODO: DON'T YOU NEED TO PUT DIMS[5] in here???
2049 );
2050 k6<<<n_blocks, n_threads>>>(
2051 CudaNdarray_HOST_DIMS(self)[0],
2052 CudaNdarray_HOST_DIMS(self)[1],
2053 CudaNdarray_HOST_DIMS(self)[2],
2054 CudaNdarray_HOST_DIMS(self)[3],
2055 CudaNdarray_HOST_DIMS(self)[4],
2056 CudaNdarray_HOST_DIMS(self)[5],
2057 CudaNdarray_DEV_DATA(self),
2058 CudaNdarray_HOST_STRIDES(self)[0],
2059 CudaNdarray_HOST_STRIDES(self)[1],
2060 CudaNdarray_HOST_STRIDES(self)[2],
2061 CudaNdarray_HOST_STRIDES(self)[3],
2062 CudaNdarray_HOST_STRIDES(self)[4],
2063 CudaNdarray_HOST_STRIDES(self)[5],
2064 CudaNdarray_DEV_DATA(other),
2065 other_strides[0],
2066 other_strides[1],
2067 other_strides[2],
2068 other_strides[3],
2069 other_strides[4],
2070 other_strides[5]);
2071 CNDA_THREAD_SYNC;
2072 cudaError_t err = cudaGetLastError();
2073 if (cudaSuccess != err)
2074 {
2075 PyErr_Format(
2076 PyExc_RuntimeError,
2077 "CudaNdarray_inplace_elemwise case6: Cuda error: %s: %s. n_blocks=(%ld, %ld, %ld) n_threads=(%ld)\n",
2078 "k6",
2079 cudaGetErrorString(err),
2080 (long) n_blocks.x, (long) n_blocks.y, (long) n_blocks.z,
2081 (long) n_threads.x);
2082 Py_XDECREF(new_other);
2083 return -1;
2084 }
2085 }
2086 break;
2087 default:
2088 {
2089 PyErr_Format(
2090 PyExc_NotImplementedError,
2091                 "inplace_elemwise with nd=%i\n",
2092 self->nd);
2093 Py_XDECREF(new_other);
2094 return -1;
2095 }
2096 }
2097 if (verbose)
2098 fprintf(stderr, "INPLACE ADD/DIV end\n");
2099 Py_XDECREF(new_other);
2100 return 0;
2101 }
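
// A minimal standalone sketch of the stride-0 broadcast trick the cases
// above depend on: a broadcasted dimension of `other` gets stride 0, so
// every index along it reads the same element. The kernel below is
// illustrative only (its name and launch shape are not part of this
// file's API) and assumes the includes at the top of this file.
static __global__ void example_bcast_iadd(int n, float *dst,
                                          const float *src, int src_stride)
{
    // With src_stride == 0 every thread reads src[0] (a scalar broadcast);
    // with src_stride == 1 this is a plain elementwise add.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < n;
         i += blockDim.x * gridDim.x)
        dst[i] += src[i * src_stride];
}
// Usage sketch: example_bcast_iadd<<<32, 256>>>(n, dst, src, 0);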
2102
2103 /*
2104  * We need this inplace add to support IncSubTensor.
2105  * On success it returns py_self with an additional reference; otherwise NULL.
2106 */
2107 // Will be called by __iadd__ in Python
2108 PyObject *
2109 CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other)
2110 {
2111 if (CudaNdarray_inplace_elemwise(py_self, py_other, IADD))
2112 {
2113 return NULL;
2114 }
2115 Py_INCREF(py_self);
2116 return py_self;
2117 }
2118
2119 /*
2120  * We need this inplace div for cuda/tests/test_basic_ops.py:test_shared_options.
2121  * On success it returns py_self with an additional reference; otherwise NULL.
2122 */
2123 // Will be called by __idiv__ in Python
2124 static PyObject *
2125 CudaNdarray_inplace_div(PyObject* py_self, PyObject * py_other)
2126 {
2127 if (CudaNdarray_inplace_elemwise(py_self, py_other, IDIV))
2128 {
2129 return NULL;
2130 }
2131 Py_INCREF(py_self);
2132 return py_self;
2133 }
2134
2135 // The PyNumberMethods struct layout changed in a non-trivial way from Python 2 to Python 3.
2136 #if PY_MAJOR_VERSION == 3
2137 static PyNumberMethods CudaNdarrayNumberMethods =
2138 {
2139 (binaryfunc)CudaNdarray_add, //binaryfunc nb_add; __add__
2140 0, //binaryfunc nb_subtract;
2141 0, //binaryfunc nb_multiply;
2142 0, //binaryfunc nb_remainder;
2143 0, //binaryfunc nb_divmod;
2144 0, //ternaryfunc nb_power;
2145 0, //unaryfunc nb_negative;
2146 0, //unaryfunc nb_positive;
2147 0, //unaryfunc nb_absolute;
2148 0, //inquiry nb_bool;
2149 0, //unaryfunc nb_invert;
2150 0, //binaryfunc nb_lshift;
2151 0, //binaryfunc nb_rshift;
2152 0, //binaryfunc nb_and;
2153 0, //binaryfunc nb_xor;
2154 0, //binaryfunc nb_or;
2155 0, //unaryfunc nb_int;
2156 0, //void *nb_reserved;
2157 0, //unaryfunc nb_float;
2158
2159 (binaryfunc)CudaNdarray_inplace_add, //binaryfunc nb_inplace_add; __iadd__
2160 0, //binaryfunc nb_inplace_subtract;
2161 0, //binaryfunc nb_inplace_multiply;
2162 0, //binaryfunc nb_inplace_remainder;
2163 0, //ternaryfunc nb_inplace_power;
2164 0, //binaryfunc nb_inplace_lshift;
2165 0, //binaryfunc nb_inplace_rshift;
2166 0, //binaryfunc nb_inplace_and;
2167 0, //binaryfunc nb_inplace_xor;
2168 0, //binaryfunc nb_inplace_or;
2169
2170 0, //binaryfunc nb_floor_divide;
2171 0, //binaryfunc nb_true_divide;
2172 0, //binaryfunc nb_inplace_floor_divide;
2173 (binaryfunc)CudaNdarray_inplace_div, //binaryfunc nb_inplace_true_divide; __idiv__
2174
2175 0, //unaryfunc nb_index
2176 };
2177 #else
2178 static PyNumberMethods CudaNdarrayNumberMethods =
2179 {
2180 (binaryfunc)CudaNdarray_add, //binaryfunc nb_add; __add__
2181 0, //binaryfunc nb_subtract; __sub__
2182 0, //binaryfunc nb_multiply; __mul__
2183 0, //binaryfunc nb_divide; __div__
2184 0, //binaryfunc nb_remainder; __mod__
2185 0, //binaryfunc nb_divmod; __divmod__
2186 0, //ternaryfunc nb_power; __pow__
2187 0, //unaryfunc nb_negative; __neg__
2188 0, //unaryfunc nb_positive; __pos__
2189 0, //unaryfunc nb_absolute; __abs__
2190 0, //inquiry nb_nonzero; __nonzero__ /* Used by PyObject_IsTrue */
2191 0, //unaryfunc nb_invert; __invert__
2192 0, //binaryfunc nb_lshift; __lshift__
2193 0, //binaryfunc nb_rshift; __rshift__
2194 0, //binaryfunc nb_and; __and__
2195 0, //binaryfunc nb_xor; __xor__
2196 0, //binaryfunc nb_or; __or__
2197 0, //coercion nb_coerce; __coerce__ /* Used by the coerce() function */
2198 0, //unaryfunc nb_int; __int__
2199 0, //unaryfunc nb_long; __long__
2200 0, //unaryfunc nb_float; __float__
2201 0, //unaryfunc nb_oct; __oct__
2202 0, //unaryfunc nb_hex; __hex__
2203
2204 /* Added in release 2.0 */
2205 (binaryfunc)CudaNdarray_inplace_add, //binaryfunc nb_inplace_add; __iadd__
2206 0, //binaryfunc nb_inplace_subtract; __isub__
2207 0, //binaryfunc nb_inplace_multiply; __imul__
2208 (binaryfunc)CudaNdarray_inplace_div, //binaryfunc nb_inplace_divide; __idiv__
2209 0, //binaryfunc nb_inplace_remainder; __imod__
2210 0, //ternaryfunc nb_inplace_power; __ipow__
2211 0, //binaryfunc nb_inplace_lshift; __ilshift__
2212 0, //binaryfunc nb_inplace_rshift; __irshift__
2213 0, //binaryfunc nb_inplace_and; __iand__
2214 0, //binaryfunc nb_inplace_xor; __ixor__
2215 0, //binaryfunc nb_inplace_or; __ior__
2216
2217 /* Added in release 2.2 */
2218 0, //binaryfunc nb_floor_divide; __floordiv__
2219 0, //binaryfunc nb_true_divide; __truediv__
2220 0, //binaryfunc nb_inplace_floor_divide; __ifloordiv__
2221 (binaryfunc)CudaNdarray_inplace_div, //binaryfunc nb_inplace_true_divide; __itruediv__
2222
2223 #if PY_MINOR_VERSION > 4
2224 /* Added in release 2.5 */
2225 0 //unaryfunc nb_index; __index__
2226 #endif
2227 };
2228 #endif
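
// Because the tables above are wired into tp_as_number, Python's `a += b`
// and `a /= b` reach CudaNdarray_inplace_add/div through the abstract
// number API. A hedged sketch of driving the same slot from C (the helper
// name is illustrative; `a` and `b` are assumed to be CudaNdarray objects
// obtained elsewhere):
static int example_inplace_add_from_c(PyObject *a, PyObject *b)
{
    // PyNumber_InPlaceAdd returns a new reference -- here `a` itself with
    // its refcount bumped, as CudaNdarray_inplace_add promises -- or NULL.
    PyObject *r = PyNumber_InPlaceAdd(a, b);
    if (r == NULL)
        return -1;
    Py_DECREF(r);
    return 0;
}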
2229
2230
2231 /////////////////////
2232 // Mapping protocol
2233 /////////////////////
2234
2235 // Will be called by __len__ in Python
2236 static Py_ssize_t
2237 CudaNdarray_len(PyObject * py_self)
2238 {
2239 CudaNdarray * self = (CudaNdarray*) py_self;
2240 if (self->nd <= 0)
2241 {
2242 return (Py_ssize_t) 0;
2243 }
2244 else
2245 {
2246 return (Py_ssize_t) CudaNdarray_HOST_DIMS(self)[0];
2247 }
2248 }
2249
2250 // Will be called by __getitem__ in Python
2251 PyObject *
2252 CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
2253 {
2254 int verbose = 0;
2255 if (verbose) fprintf(stderr, "Subscript .... \n");
2256 CudaNdarray * self = (CudaNdarray*) py_self;
2257 PyObject * py_rval = NULL;
2258 CudaNdarray * rval = NULL;
2259 PyObject * intobj = NULL;
2260
2261 //PyObject_Print(key, stderr, 0);
2262
2263 if (key == Py_Ellipsis)
2264 {
2265 Py_INCREF(py_self);
2266 return py_self;
2267 }
2268 if ((intobj=PyNumber_Int(key))) //INDEXING BY INTEGER
2269 //else if (PyInt_Check(key)) //INDEXING BY INTEGER
2270 {
2271 int d_idx = PyInt_AsLong(intobj);
2272 Py_DECREF(intobj); intobj=NULL;
2273 //int d_idx = PyInt_AsLong(key);
2274 if (self->nd == 0)
2275 {
2276 PyErr_SetString(PyExc_IndexError, "0-d arrays can't be indexed");
2277 return NULL;
2278 }
2279 int d_dim = CudaNdarray_HOST_DIMS(self)[0];
2280 int offset = 0;
2281
2282 if ((d_idx >= 0) && (d_idx < d_dim))
2283 {
2284 //normal indexing
2285 offset += d_idx * CudaNdarray_HOST_STRIDES(self)[0];
2286 }
2287 else if ((d_idx < 0) && (d_idx >= -d_dim))
2288 {
2289 //end-based indexing
2290 // d_idx is negative
2291 offset += (d_dim + d_idx) * CudaNdarray_HOST_STRIDES(self)[0];
2292 }
2293 else
2294 {
2295 PyErr_Format(PyExc_IndexError,
2296                          "index out of bounds. Asked for %d, but the size is %d",
2297 d_idx, d_dim);
2298 return NULL;
2299 }
2300
2301 //allocate our subtensor view
2302 py_rval = CudaNdarray_new_nd(self->nd - 1);
2303 rval = (CudaNdarray*) py_rval;
2304 if (!rval) return NULL;
2305 assert (0 == rval->data_allocated);
2306
2307 //initialize the view's data pointer to our own.
2308 if (CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self) + offset, self))
2309 {
2310 Py_DECREF(rval);
2311 return NULL;
2312 }
2313 for (int d = 1; d < self->nd; ++d)
2314 {
2315 CudaNdarray_set_stride(rval, d-1, CudaNdarray_HOST_STRIDES(self)[d]);
2316 CudaNdarray_set_dim(rval, d-1, CudaNdarray_HOST_DIMS(self)[d]);
2317 }
2318 }
2319 else
2320 {
2321 PyErr_Clear();
2322 }
2323 if (PySlice_Check(key)) //INDEXING BY SLICE
2324 {
2325 if (verbose) fprintf(stderr, "by slice\n");
2326 if (self->nd == 0)
2327 {
2328 PyErr_SetString(PyExc_ValueError, "cannot slice a 0-d array");
2329 return NULL;
2330 }
2331
2332 int d_dim = CudaNdarray_HOST_DIMS(self)[0];
2333 Py_ssize_t start, stop, step, slen;
2334 if (PySlice_GetIndicesEx(SLICE_CAST(key), d_dim, &start, &stop, &step, &slen))
2335 {
2336 if (verbose)
2337 fprintf(stderr, "PySlice_GetIndicesEx failed\n");
2338 return NULL;
2339 }
2340 if (verbose)
2341 {
2342 std::cerr << "start " << start << "\n";
2343 std::cerr << "stop " << stop << "\n";
2344 std::cerr << "step " << step << "\n";
2345 std::cerr << "slen " << slen << "\n";
2346 }
2347
2348 //allocate our subtensor view
2349 py_rval = CudaNdarray_new_nd(self->nd);
2350 rval = (CudaNdarray*) py_rval;
2351 if (!rval) return NULL;
2352 assert (0 == rval->data_allocated);
2353
2354
2355 //initialize the view's data pointer to our own.
2356 if (CudaNdarray_set_device_data(rval,
2357 CudaNdarray_DEV_DATA(self) + start * CudaNdarray_HOST_STRIDES(self)[0],
2358 self))
2359 {
2360 Py_DECREF(rval);
2361 return NULL;
2362 }
2363 //initialize dimension 0 of rval
2364 CudaNdarray_set_stride(rval, 0,
2365 (slen == 1) ? 0 : step * CudaNdarray_HOST_STRIDES(self)[0]);
2366 CudaNdarray_set_dim(rval, 0, slen);
2367 if (verbose) std::cerr << "rval stride " << CudaNdarray_HOST_STRIDES(rval)[0] << "\n";
2368 // initialize dimensions > 0 of rval
2369 for (int d = 1; d < self->nd; ++d)
2370 {
2371 CudaNdarray_set_stride(rval, d, CudaNdarray_HOST_STRIDES(self)[d]);
2372 CudaNdarray_set_dim(rval, d, CudaNdarray_HOST_DIMS(self)[d]);
2373 }
2374 }
2375 if (PyTuple_Check(key)) //INDEXING BY TUPLE
2376 {
2377 if (verbose) fprintf(stderr, "by tuple\n");
2378 //elements of the tuple can be either integers or slices
2379 //the dimensionality of the view we will return is diminished for each slice in the tuple
2380
2381 if (PyTuple_Size(key) > self->nd)
2382 {
2383             PyErr_SetString(PyExc_IndexError, "too many indices for this CudaNdarray");
2384 return NULL;
2385 }
2386
2387 //calculate the number of dimensions in the return value
2388 int rval_nd = self->nd;
2389 for (int d = 0; d < PyTuple_Size(key); ++d)
2390 {
2391             //On some platforms PyInt_Check(<type 'numpy.int64'>) returns true; on others it returns false.
2392             //So we use PyArray_IsAnyScalar, which should cover everything.
2393 rval_nd -= PyArray_IsAnyScalar(PyTuple_GetItem(key, d));
2394 }
2395
2396 //allocate our subtensor view
2397 py_rval = CudaNdarray_new_nd(rval_nd);
2398 rval = (CudaNdarray*) py_rval;
2399 if (!rval) return NULL;
2400 assert (0 == rval->data_allocated);
2401
2402 //initialize the view's data pointer to our own.
2403 if (CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self), self))
2404 {
2405 Py_DECREF(rval);
2406 return NULL;
2407 }
2408
2409 // rval_d will refer to the current dimension in the rval.
2410 // It will not be incremented for integer keys, but will be incremented for slice
2411 // keys
2412 int rval_d = 0;
2413
2414 for (int d = 0; d < self->nd; ++d)
2415 {
2416 // keys can be shorter than self->nd.
2417 // when that happens, it means that the remaining dimensions are "full slices"
2418 if (d >=PyTuple_Size(key))
2419 {
2420 CudaNdarray_set_stride(rval, rval_d, CudaNdarray_HOST_STRIDES(self)[d]);
2421 CudaNdarray_set_dim(rval, rval_d, CudaNdarray_HOST_DIMS(self)[d]);
2422 ++rval_d;
2423 }
2424 else
2425 {
2426 PyObject * key_d = PyTuple_GetItem(key, d);
2427
2428 if (PySlice_Check(key_d))
2429 {
2430 Py_ssize_t start, stop, step, slen;
2431 if (PySlice_GetIndicesEx(SLICE_CAST(key_d), CudaNdarray_HOST_DIMS(self)[d], &start, &stop, &step, &slen))
2432 {
2433 Py_DECREF(rval);
2434 return NULL;
2435 }
2436 rval->devdata += start * CudaNdarray_HOST_STRIDES(self)[d];
2437 CudaNdarray_set_stride(rval, rval_d,
2438 (slen == 1) ? 0 : step * CudaNdarray_HOST_STRIDES(self)[d]);
2439 CudaNdarray_set_dim(rval, rval_d, slen);
2440 if (0)
2441 {
2442 std::cerr << "start " << start << "\n";
2443 std::cerr << "stop " << stop << "\n";
2444 std::cerr << "step " << step << "\n";
2445 std::cerr << "slen " << slen << "\n";
2446 }
2447 ++rval_d;
2448 }
2449 else if ((intobj=PyNumber_Int(key_d)))
2450 {
2451 assert(PyArray_IsAnyScalar(key_d));
2452 int d_idx = PyInt_AsLong(intobj);
2453 Py_DECREF(intobj);
2454 intobj = NULL;
2455 int d_dim = CudaNdarray_HOST_DIMS(self)[d];
2456
2457 if ((d_idx >= 0) && (d_idx < d_dim))
2458 {
2459 //normal indexing
2460 rval->devdata += d_idx * CudaNdarray_HOST_STRIDES(self)[d];
2461 }
2462 else if ((d_idx < 0) && (d_idx >= -d_dim))
2463 {
2464 //end-based indexing
2465 rval->devdata += (d_dim + d_idx) * CudaNdarray_HOST_STRIDES(self)[d];
2466 }
2467 else
2468 {
2469 PyErr_Format(PyExc_IndexError,
2470                                          "index out of bounds. Asked for %d in dimension %d, but the size is %d",
2471 d_idx, d, d_dim);
2472 Py_DECREF(rval);
2473 return NULL;
2474 }
2475 }
2476 else
2477 {
2478 PyErr_Clear(); // clear the error set by PyNumber_Int
2479 PyErr_SetString(PyExc_IndexError, "index must be either int or slice");
2480 Py_DECREF(rval);
2481 return NULL;
2482 }
2483 }
2484 }
2485 }
2486 if (py_rval)
2487 {
2488 if (verbose) fprint_CudaNdarray(stderr, self);
2489 if (verbose) fprint_CudaNdarray(stderr, rval);
2490 }
2491 else
2492 {
2493 PyErr_SetString(PyExc_NotImplementedError, "Unknown key type");
2494 return NULL;
2495 }
2496 return py_rval;
2497 }
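
// The slice handling above always reduces to three assignments: offset the
// data pointer by start*stride, scale the stride by step, and take slen as
// the new dimension. A minimal host-side sketch of that rule for one
// dimension (names are illustrative; strides are in elements, as in this file):
struct example_slice_view { float *data; int dim; int stride; };
static example_slice_view
example_slice_1d(float *data, int stride,
                 Py_ssize_t start, Py_ssize_t step, Py_ssize_t slen)
{
    example_slice_view v;
    v.data = data + start * stride;
    // Length-1 slices get stride 0, matching the convention used above
    // (broadcastable dimensions always carry stride 0).
    v.stride = (slen == 1) ? 0 : (int)(step * stride);
    v.dim = (int)slen;
    return v;
}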
2498
2499 // Will be called by __setitem__ in Python
2500 // See http://docs.python.org/dev/py3k/c-api/object.html#PyObject_SetItem
2501 // Doesn't handle broadcasting, e.g. a[:] = 5
2502 // The right side can only be a CudaNdarray,
2503 // an ndarray,
2504 // or a python scalar with value 0 when the left side is c contiguous.
2505 static int
2506 CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
2507 {
2508 int verbose = 0;
2509 if (verbose) fprintf(stderr, "CudaNdarray_setitem start\n");
2510 // We try to copy directly into this CudaNdarray from the ndarray
2511 CudaNdarray* rval = (CudaNdarray*)CudaNdarray_Subscript(o, key);
2512 CudaNdarray* new_value = NULL;
2513
2514 if(!rval){
2515 // CudaNdarray_Subscript failed and set the error msg.
2516 Py_XDECREF(rval);
2517 return -1;
2518 }
2519
2520 if(rval != (CudaNdarray*)o &&
2521 (rval->data_allocated ||
2522 // The new array should have a base
2523 !(((CudaNdarray*)rval)->base) ||
2524 // If the original array has no base, the base of the new
2525 // array should be the original one
2526 (!((CudaNdarray*)o)->base && ((CudaNdarray*)rval)->base != o) ||
2527 // Else, the two arrays should have the same base
2528 (((CudaNdarray*)o)->base && ((CudaNdarray*)rval)->base != ((CudaNdarray*)o)->base)))
2529 {
2530 // This case shouldn't happen, based on what I see in Subscript
2531 // but just in case it happens sometime in the future
2532
2533 PyErr_Format(PyExc_RuntimeError,
2534 "__getitem__ must return a CudaNdarray that refers to"
2535 " the original CudaNdarray, not a copy. rval.base=%p"
2536 " o.base=%p o=%p",
2537 (((CudaNdarray*)rval)->base), ((CudaNdarray*)o)->base, o);
2538 Py_DECREF(rval);
2539 return -1;
2540 }
2541
2542 PyObject * intobj = NULL;
2543 if (CudaNdarray_Check(o) && PyArray_Check(value)){
2544 if (verbose)
2545 fprintf(stderr,
2546 "CudaNdarray_setitem dest is a CudaNdarray and"
2547 " value is a ndarray\n");
2548 new_value = (CudaNdarray*) CudaNdarray_New();
2549 if(!new_value)
2550 {
2551 return -1;
2552 }
2553 if (CudaNdarray_CopyFromArray(new_value, (PyArrayObject *) value))
2554 {
2555 Py_XDECREF(new_value);
2556 Py_XDECREF(rval);
2557 return -1;
2558 }
2559 value = (PyObject *) new_value;
2560 }
2561 else if ((intobj=PyNumber_Int(value)))
2562 {
2563 if (verbose)
2564 fprintf(stderr,
2565 "CudaNdarray_setitem dest and value is a python number\n");
2566 if(! CudaNdarray_is_c_contiguous(rval)){
2567 PyErr_SetString(PyExc_NotImplementedError,
2568 "CudaNdarray.__setitem__: When the new value is a scalar"
2569                 " of value 0, the destination must be c contiguous.");
2570 Py_XDECREF(rval);
2571 return -1;
2572 }
2573
2574 long val = PyInt_AsLong(intobj);
2575 Py_DECREF(intobj); intobj=NULL;
2576 if (val == 0)
2577 {
2578 cudaError_t err = cudaMemset(rval->devdata, 0,
2579 CudaNdarray_SIZE(rval) * sizeof(real));
2580 Py_XDECREF(rval);
2581 if (err)
2582 {
2583 // Clear the error flag, cudaMemset doesn't do it.
2584 // Currently this returns the same thing as err, but if in future
2585 // it returns something else I still don't see why we should ignore
2586 // it. All we want to do here is reset the flag.
2587 cudaGetLastError();
2588 PyErr_SetString(PyExc_RuntimeError,
2589 "CudaNdarray.__setitem__: cudaMemset failed");
2590 return -1;
2591 }
2592 return 0;
2593 } else {
2594 Py_XDECREF(rval);
2595 PyErr_SetString(PyExc_NotImplementedError,
2596 "CudaNdarray.__setitem__: we support setting only python"
2597 " scalar of value 0, numpy nd array and CudaNdarray.");
2598 return -1;
2599 }
2600 }
2601
2602 PyErr_Clear(); // clear PyNumber_Int error.
2603
2604 if(!CudaNdarray_Check(o) || !CudaNdarray_Check(value))
2605 {
2606 PyErr_SetString(PyExc_TypeError,
2607                         "CudaNdarray.__setitem__: left must be a CudaNdarray and right"
2608                         " must be a CudaNdarray, an ndarray or a python scalar of value 0.");
2609 Py_XDECREF(new_value);
2610 return -1;
2611 }
2612
2613 if (verbose)
2614 fprintf(stderr, "CudaNdarray_setitem dest and value are CudaNdarray\n");
2615
2616 if (cnda_copy_structure_to_device(rval))
2617 {
2618 PyErr_SetString(PyExc_RuntimeError,
2619 "CudaNdarray.__setitem__: syncing structure to device failed");
2620 Py_DECREF(rval);
2621 Py_XDECREF(new_value);
2622
2623 if (verbose)
2624 fprintf(stderr, "CudaNdarray_setitem error end\n");
2625 return -1;
2626 }
2627
2628 PyObject *baseSavedForComparison = rval->base;
2629
2630 if (CudaNdarray_CopyFromCudaNdarray(rval, (CudaNdarray*)value, true))
2631 {
2632 Py_DECREF((PyObject*)rval);
2633 Py_XDECREF(new_value);
2634
2635 if (verbose)
2636 fprintf(stderr, "CudaNdarray_setitem error end\n");
2637 return -1;
2638 }
2639
2640 assert (rval->base == baseSavedForComparison);
2641 assert (rval->dev_structure_fresh);
2642
2643 // Clean up locally-created references
2644 Py_DECREF(rval);
2645 Py_XDECREF(new_value);
2646
2647 return 0;
2648 }
2649
2650
2651 PyMappingMethods CudaNdarrayMappingMethods = {
2652 CudaNdarray_len, //lenfunc mp_length; __len__
2653 CudaNdarray_Subscript, //binaryfunc mp_subscript; __getitem__
2654 CudaNdarray_setitem //objobjargproc mp_ass_subscript; __setitem__
2655 };
2656
2657 ////////////////////
2658 //
2659 ////////////////////
2660
2661 static PyObject *
2662 CudaNdarray_get_shape(CudaNdarray *self, void *closure)
2663 {
2664 if (self->nd < 0)
2665 {
2666 PyErr_SetString(PyExc_ValueError, "CudaNdarray not initialized");
2667 return NULL;
2668 }
2669 PyObject * rval = PyTuple_New(self->nd);
2670 for (int i = 0; i < self->nd; ++i)
2671 {
2672 if (!rval || PyTuple_SetItem(rval, i, PyInt_FromLong(CudaNdarray_HOST_DIMS(self)[i])))
2673 {
2674 Py_XDECREF(rval);
2675 return NULL;
2676 }
2677
2678 }
2679 return rval;
2680 }
2681
2682 static int
2683 CudaNdarray_set_shape(CudaNdarray *self, PyObject *value, void *closure)
2684 {
2685 PyErr_SetString(PyExc_NotImplementedError, "TODO: call reshape");
2686 return -1;
2687 }
2688
2689 static PyObject *
2690 CudaNdarray_get_strides(CudaNdarray *self, void *closure)
2691 {
2692 if (self->nd < 0)
2693 {
2694 PyErr_SetString(PyExc_ValueError, "CudaNdarray not initialized");
2695 return NULL;
2696 }
2697 PyObject * rval = PyTuple_New(self->nd);
2698 for (int i = 0; i < self->nd; ++i)
2699 {
2700 if (!rval || PyTuple_SetItem(rval, i, PyInt_FromLong(CudaNdarray_HOST_STRIDES(self)[i])))
2701 {
2702 Py_XDECREF(rval);
2703 return NULL;
2704 }
2705
2706 }
2707 return rval;
2708 }
2709
2710 static int
2711 CudaNdarray_set_strides(CudaNdarray *self, PyObject *value, void *closure)
2712 {
2713 //npy_intp newstrides_bytes[PyTuple_Size(value)];
2714 if (PyTuple_Check(value)){
2715 if (PyTuple_Size(value) != CudaNdarray_NDIM(self)){
2716 PyErr_SetString(PyExc_ValueError,
2717 "The new strides tuple must have the same length"
2718 " as the number of dimensions");
2719 return -1;
2720 }
2721 }else if (PyList_Check(value)){
2722 if (PyList_Size(value) != CudaNdarray_NDIM(self)){
2723 PyErr_SetString(PyExc_ValueError,
2724 "The new strides list must have the same length"
2725 " as the number of dimensions");
2726 return -1;
2727 }
2728 }else{
2729 PyErr_SetString(PyExc_ValueError,
2730 "The new strides need to be encoded in a tuple or list");
2731 return -1;
2732 }
2733 npy_intp* newstrides = (npy_intp*) alloca(CudaNdarray_NDIM(self) * sizeof(npy_intp));
2734 if (PyTuple_Check(value)){
2735 for(int i=0; i < CudaNdarray_NDIM(self); i++){
2736 newstrides[i] = PyInt_AsLong(PyTuple_GetItem(value, Py_ssize_t(i)));
2737 //newstrides_bytes[i] = newstrides[i] * 4;
2738 }
2739 }else if (PyList_Check(value)){
2740 for(int i=0; i < CudaNdarray_NDIM(self); i++){
2741 newstrides[i] = PyInt_AsLong(PyList_GetItem(value, Py_ssize_t(i)));
2742 //newstrides_bytes[i] = newstrides[i] * 4;
2743 }
2744 }
2745 /*
2746 // Do not do this check, as ExtractDiag needs that, and NumPy does not seem
2747 // to do it.
2748 npy_intp dims[PyTuple_Size(value)];
2749 for(int i=0; i < CudaNdarray_NDIM(self); i++){
2750 dims[i] = CudaNdarray_HOST_DIMS(self)[i];
2751 }
2752 if (!PyArray_CheckStrides(4,
2753 CudaNdarray_NDIM(self),
2754 0, 0,
2755 dims,
2756 newstrides_bytes)){
2757 PyErr_SetString(PyExc_ValueError, "bad new strides");
2758 return -1;
2759 }
2760 */
2761 for(int i=0; i < CudaNdarray_NDIM(self); i++){
2762 CudaNdarray_set_stride(self, i, newstrides[i]);
2763 }
2764 return 0;
2765 }
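
// The reason the PyArray_CheckStrides test above stays disabled: ExtractDiag
// builds views whose strides NumPy's checker would reject. For instance the
// main diagonal of a C-contiguous n x n matrix (element strides (n, 1)) is
// the length-n vector with element stride n + 1. A sketch of that view:
static void example_diag_view(int n, int *out_dim, int *out_stride)
{
    *out_dim = n;        // the diagonal has n elements
    *out_stride = n + 1; // each step moves one row down and one column right
}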
2766
2767 static PyObject *
2768 CudaNdarray_get_dev_data(CudaNdarray *self, void *closure)
2769 {
2770 float * p = CudaNdarray_DEV_DATA(self);
2771 //printf("get_dev_data %p %li \n", p, (long int)p );
2772 return PyInt_FromSize_t((size_t) CudaNdarray_DEV_DATA(self));
2773 }
2774
2775 static int
2776 CudaNdarray_set_dev_data(CudaNdarray *self, PyObject *value, void *closure)
2777 {
2778 Py_ssize_t newdevdata = PyInt_AsSsize_t(value);
2779 //printf("set_dev_data %p %li \n",(float*)newdevdata ,newdevdata);
2780 if (PyErr_Occurred())
2781 {
2782 return -1;
2783 }
2784 return CudaNdarray_set_device_data(self, (float*)newdevdata, (CudaNdarray*)self->base);
2785 }
2786
2787 static PyObject *
2788 CudaNdarray_get_dtype(CudaNdarray *self, void *closure)
2789 {
2790 return PyString_FromString("float32");
2791 }
2792
2793 static PyObject *
2794 CudaNdarray_get_ndim(CudaNdarray *self, void *closure)
2795 {
2796 return PyInt_FromLong(self->nd);
2797 }
2798
2799 static PyObject *
2800 CudaNdarray_get_base(CudaNdarray *self, void *closure)
2801 {
2802 PyObject * base = self->base;
2803 if (!base)
2804 {
2805 // We cannot return a NULL pointer, use None instead
2806 base = Py_None;
2807 }
2808 Py_INCREF(base);
2809 return base;
2810 }
2811
2812 void put_in_dict(PyObject * dict, const char * key, int val)
2813 {
2814 PyObject * k = PyString_FromString(key);
2815 PyObject * v = PyInt_FromLong(val);
2816 PyDict_SetItem(dict, k, v);
2817 Py_DECREF(k);
2818 Py_DECREF(v);
2819 }
2820
2821 PyObject *
2822 GetDeviceProperties(PyObject* _unused, PyObject* args)
2823 {
2824 int dev_id = -1;
2825 if (! PyArg_ParseTuple(args, "i", &dev_id))
2826 return NULL;
2827 cudaDeviceProp deviceProp;
2828 cudaGetDeviceProperties(&deviceProp, dev_id);
2829
2830 PyObject * dict = PyDict_New();
2831 PyObject * str= PyString_FromString("name");
2832 PyObject * i = PyString_FromString(deviceProp.name);
2833 PyDict_SetItem(dict, str, i);
2834 Py_DECREF(str);
2835 Py_DECREF(i);
2836
2837 put_in_dict(dict, "major", deviceProp.major);
2838 put_in_dict(dict, "minor", deviceProp.minor);
2839 #if CUDART_VERSION >= 2020
2840 int driverVersion = 0, runtimeVersion = 0;
2841 cudaDriverGetVersion(&driverVersion);
2842 cudaRuntimeGetVersion(&runtimeVersion);
2843 put_in_dict(dict, "driverVersion", driverVersion);
2844 put_in_dict(dict, "runtimeVersion", runtimeVersion);
2845 #endif
2846 #if CUDART_VERSION >= 2000
2847
2848 put_in_dict(dict, "multiProcessorCount", deviceProp.multiProcessorCount);
2849     //if ConvertSMVer2Cores is not defined in cuda_runtime_api.h, the runtime is too old.
2850 int sm_cores = -1;
2851 if(deviceProp.major==1)
2852 sm_cores = 32;
2853 else if(deviceProp.major==2 && deviceProp.minor==0)
2854 sm_cores = 32;
2855 else if(deviceProp.major==2 && deviceProp.minor==1)
2856 sm_cores = 48;
2857 put_in_dict(dict, "coresCount", sm_cores * deviceProp.multiProcessorCount);
2858 #endif
2859 put_in_dict(dict, "totalConstMem", deviceProp.totalConstMem);
2860 put_in_dict(dict, "sharedMemPerBlock", deviceProp.sharedMemPerBlock);
2861 put_in_dict(dict, "regsPerBlock", deviceProp.regsPerBlock);
2862 put_in_dict(dict, "warpSize", deviceProp.warpSize);
2863 put_in_dict(dict, "maxThreadsPerBlock", deviceProp.maxThreadsPerBlock);
2864 put_in_dict(dict, "maxThreadsDim0", deviceProp.maxThreadsDim[0]);
2865 put_in_dict(dict, "maxThreadsDim1", deviceProp.maxThreadsDim[1]);
2866 put_in_dict(dict, "maxThreadsDim2", deviceProp.maxThreadsDim[2]);
2867 put_in_dict(dict, "maxGridSize0", deviceProp.maxGridSize[0]);
2868 put_in_dict(dict, "maxGridSize1", deviceProp.maxGridSize[1]);
2869 put_in_dict(dict, "maxGridSize2", deviceProp.maxGridSize[2]);
2870 put_in_dict(dict, "memPitch", deviceProp.memPitch);
2871 put_in_dict(dict, "textureAlignment", deviceProp.textureAlignment);
2872 put_in_dict(dict, "clockRate", deviceProp.clockRate);
2873 #if CUDART_VERSION >= 2000
2874 put_in_dict(dict, "deviceOverlap", deviceProp.deviceOverlap);
2875 #endif
2876 #if CUDART_VERSION >= 2020
2877 put_in_dict(dict, "kernelExecTimeoutEnabled", deviceProp.kernelExecTimeoutEnabled);
2878 put_in_dict(dict, "integrated", deviceProp.integrated);
2879 put_in_dict(dict, "canMapHostMemory", deviceProp.canMapHostMemory);
2880 put_in_dict(dict, "computeMode", deviceProp.computeMode);
2881     //the docs for computeMode say: 0 - normal mode, 1 - only 1 context, 2 - no context
2882 #endif
2883 #if CUDART_VERSION >= 3000
2884 put_in_dict(dict, "concurrentKernels", deviceProp.concurrentKernels);
2885 #endif
2886 #if CUDART_VERSION >= 3010
2887 put_in_dict(dict, "ECCEnabled", deviceProp.ECCEnabled);
2888 #endif
2889 #if CUDART_VERSION >= 3020
2890 put_in_dict(dict, "tccDriver", deviceProp.tccDriver);
2891 #endif
2892
2893 return dict;
2894 }
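
// The sm_cores table above only covers the compute capabilities this code
// was written for (1.x and 2.x). A hedged sketch extending the same mapping;
// the 1.x/2.x values mirror the code above, while the newer entries are
// taken from public CUDA documentation and should be treated as assumptions:
static int example_cores_per_sm(int major, int minor)
{
    if (major == 1) return 32;                       // as assumed above
    if (major == 2) return (minor == 0) ? 32 : 48;   // Fermi, as above
    if (major == 3) return 192;                      // Kepler
    if (major == 5) return 128;                      // Maxwell
    if (major == 6) return (minor == 0) ? 64 : 128;  // Pascal
    if (major == 7) return 64;                       // Volta/Turing
    return -1;                                       // unknown architecture
}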
2895
2896 /*
2897  * Returns a tuple (free, total): the free and total amount of memory available for allocation on the device, in bytes.
2898 */
2899 PyObject *
2900 GetDeviceMemInfo(PyObject* _unused, PyObject* dummy)
2901 {
2902 size_t free = 0, total = 0;
2903 if(g_gpu_context_active == 0){
2904 PyErr_Format(PyExc_RuntimeError, "No gpu device selected yet. Please make sure the gpu device was initialized by Theano before.");
2905 return NULL;
2906 }
2907
2908 cudaError_t err = cudaMemGetInfo(&free, &total);
2909 if (err != cudaSuccess){
2910 // Clear the error flag, cudaMemGetInfo doesn't do it.
2911 // Currently this returns the same thing as err, but if in future
2912 // it returns something else I still don't see why we should ignore
2913 // it. All we want to do here is reset the flag.
2914 cudaGetLastError();
2915 PyErr_Format(PyExc_RuntimeError,
2916 "Error while getting memory info about the gpu: %s",
2917 cudaGetErrorString(err));
2918 return NULL;
2919 }
2920 return PyTuple_Pack(2, PyLong_FromSize_t(free), PyLong_FromSize_t(total));
2921 }
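
// A standalone equivalent of mem_info without the Python layering -- a
// sketch for checking memory from plain CUDA code (it assumes a CUDA
// context already exists, just as the function above does):
static void example_print_mem_info()
{
    size_t free_b = 0, total_b = 0;
    if (cudaMemGetInfo(&free_b, &total_b) == cudaSuccess)
        fprintf(stderr, "gpu mem: %lu free / %lu total bytes\n",
                (unsigned long)free_b, (unsigned long)total_b);
    else
        cudaGetLastError(); // reset the sticky error flag, as above
}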
2922
2923 /*
2924  * Synchronize with all streams on the gpu device.
2925 */
2926 PyObject *
2927 CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy)
2928 {
2929 CNDA_BEGIN_ALLOW_THREADS
2930 cudaThreadSynchronize();
2931 CNDA_END_ALLOW_THREADS
2932 Py_INCREF(Py_None);
2933 return Py_None;
2934 }
2935
2936 /*
2937  * Exists and returns true if we are linked with cublas v2.
2938 */
2939 PyObject *
2940 CudaNdarray_cublasv2(PyObject* _unused, PyObject* dummy)
2941 {
2942 Py_INCREF(Py_True);
2943 return Py_True;
2944 }
2945
2946 PyObject *
2947 CudaNdarray_select_a_gpu(PyObject* _unused, PyObject* dummy)
2948 {
2949 void * rval = NULL;
2950 cudaError_t err;
2951 int num_gpus = 0;
2952
2953 err = cudaGetDeviceCount(&num_gpus);
2954 if (cudaSuccess != err){
2955         printf("ERR!\n");
2956 PyErr_Format(PyExc_RuntimeError,
2957 "Not able to get number of GPUs (%s).",
2958 cudaGetErrorString(err));
2959 return NULL;
2960 }
2961
2962 for (int device = 0; device < num_gpus; device++) {
2963 cudaSetDevice(device);
2964 err = cudaDeviceSynchronize(); // << CUDA context gets created here.
2965 cudaGetLastError(); // reset the error state
2966 if (cudaSuccess == err)
2967 break;
2968 }
2969
2970 if (cudaSuccess != err){
2971         printf("ERR!\n");
2972 PyErr_Format(PyExc_RuntimeError,
2973 "Not able to select available GPU from %d cards (%s).",
2974 num_gpus, cudaGetErrorString(err));
2975 return NULL;
2976 }
2977
2978 Py_INCREF(Py_None);
2979 return Py_None;
2980 }
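
// The probing loop above works because cudaSetDevice alone does not create
// a context; the cudaDeviceSynchronize call forces one, so a busy or
// unusable card fails here rather than later. A condensed sketch of the
// same pattern (the helper name is illustrative):
static int example_pick_first_usable_gpu(int num_gpus)
{
    for (int device = 0; device < num_gpus; ++device) {
        cudaSetDevice(device);
        if (cudaDeviceSynchronize() == cudaSuccess) // context created here
            return device;
        cudaGetLastError(); // clear the error before trying the next card
    }
    return -1; // no usable device found
}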
2981
2982 #if COMPUTE_GPU_MEM_USED
2983 /*
2984 * Return the size in bytes that Theano currently have allocated on the gpu.
2985 */
2986 PyObject *
2987 GetTheanoAllocInfo(PyObject* _unused, PyObject* dummy)
2988 {
2989 PyObject* a = PyLong_FromSize_t(_allocated_size);
2990 PyObject* b = PyLong_FromSize_t(_max_allocated_size);
2991
2992 PyObject* tuple = PyTuple_New(2);
2993 PyTuple_SetItem(tuple, 0, a);
2994 PyTuple_SetItem(tuple, 1, b);
2995 return tuple;
2996 }
2997 #endif
2998
2999 static PyGetSetDef CudaNdarray_getset[] = {
3000 {"shape",
3001 (getter)CudaNdarray_get_shape,
3002 (setter)CudaNdarray_set_shape,
3003 "shape of this ndarray (tuple)",
3004 NULL},
3005 {"_strides",
3006 (getter)CudaNdarray_get_strides,
3007 (setter)CudaNdarray_set_strides,
3008 "data pointer strides (in elements)",
3009 NULL},
3010 {"strides",
3011 (getter)CudaNdarray_get_strides,
3012 (setter)CudaNdarray_set_strides,
3013 "data pointer strides (in elements)",
3014 NULL},
3015     //gpudata is needed to allow calling pycuda functions with CudaNdarray input.
3016 {"gpudata",
3017 (getter)CudaNdarray_get_dev_data,
3018 NULL,
3019 "device data pointer",
3020 NULL},
3021 {"_dev_data",
3022 (getter)CudaNdarray_get_dev_data,
3023 (setter)CudaNdarray_set_dev_data,
3024 "device data pointer",
3025 NULL},
3026 {"dtype",
3027 (getter)CudaNdarray_get_dtype,
3028 NULL,
3029 "The dtype of the element. Now always float32",
3030 NULL},
3031 {"size",
3032 (getter)CudaNdarray_SIZE_Object,
3033 NULL,
3034 "The number of elements in this object.",
3035 NULL},
3036     //mem_size is needed for pycuda.elementwise.ElementwiseKernel. Why do they use size and mem_size with the same value?
3037 {"mem_size",
3038 (getter)CudaNdarray_SIZE_Object,
3039 NULL,
3040 "The number of elements in this object.",
3041 NULL},
3042 {"ndim",
3043 (getter)CudaNdarray_get_ndim,
3044 NULL,
3045 "The number of dimensions in this object.",
3046 NULL},
3047 {"base",
3048 (getter)CudaNdarray_get_base,
3049 NULL,
3050 "If this ndarray is a view, base is the original ndarray.",
3051 NULL},
3052
3053 {NULL, NULL, NULL, NULL} /* Sentinel */
3054 };
3055
3056 PyObject *CudaNdarray_repr(PyObject *self)
3057 {
3058 CudaNdarray *object = (CudaNdarray *)self;
3059 PyObject * np_object = CudaNdarray_CreateArrayObj(object);
3060 PyObject * str = PyObject_Str((PyObject *) np_object);
3061 char * cstr = PyString_AsString(str);
3062 PyObject * out = PyString_FromFormat("%s%s%s",
3063 "CudaNdarray(",
3064 cstr,
3065 ")");
3066 Py_DECREF(str);
3067 Py_DECREF(np_object);
3068 #if PY_MAJOR_VERSION >= 3
3069 // In Python 3 PyString_FromFormat return a Bytes object
3070 PyObject* out2 = PyObject_Str(out);
3071 Py_DECREF(out);
3072 return out2;
3073 #endif
3074 return out;
3075 }
3076
3077 static PyTypeObject CudaNdarrayType =
3078 {
3079 #if PY_MAJOR_VERSION >= 3
3080 PyVarObject_HEAD_INIT(NULL, 0)
3081 #else
3082 PyObject_HEAD_INIT(NULL)
3083 0, /*ob_size*/
3084 #endif
3085 "CudaNdarray", /*tp_name*/
3086 sizeof(CudaNdarray), /*tp_basicsize*/
3087 0, /*tp_itemsize*/
3088 (destructor)CudaNdarray_dealloc, /*tp_dealloc*/
3089 0, /*tp_print*/
3090 0, /*tp_getattr*/
3091 0, /*tp_setattr*/
3092 0, /*tp_compare*/
3093 CudaNdarray_repr, /*tp_repr*/
3094 &CudaNdarrayNumberMethods, /*tp_as_number*/
3095 0, /*tp_as_sequence*/
3096 &CudaNdarrayMappingMethods,/*tp_as_mapping*/
3097 0, /*tp_hash */
3098 0, /*tp_call*/
3099 0, /*tp_str*/
3100 0, /*tp_getattro*/
3101 0, /*tp_setattro*/
3102 0, /*tp_as_buffer*/
3103 #if PY_MAJOR_VERSION >= 3
3104 // Py_TPFLAGS_CHECKTYPES is always true and was removed in Python 3.
3105 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/
3106 #else
3107 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_CHECKTYPES, /*tp_flags*/
3108 #endif
3109 "CudaNdarray objects", /* tp_doc */
3110 0, /* tp_traverse */
3111 0, /* tp_clear */
3112 0, /* tp_richcompare */
3113 0, /* tp_weaklistoffset */
3114 0, /* tp_iter */
3115 0, /* tp_iternext */
3116 CudaNdarray_methods, /* tp_methods */
3117 CudaNdarray_members, /* tp_members */
3118 CudaNdarray_getset, /* tp_getset */
3119 0, /* tp_base */
3120 0, /* tp_dict */
3121 0, /* tp_descr_get */
3122 0, /* tp_descr_set */
3123 0, /* tp_dictoffset */
3124 (initproc)CudaNdarray_init,/* tp_init */
3125 0, /* tp_alloc */
3126 CudaNdarray_new, /* tp_new */
3127 };
3128
3129 static __global__ void get_gpu_ptr_size(int* dst)
3130 {
3131 dst[0] = sizeof(float*);
3132 dst[1] = sizeof(int);
3133 }
3134
3135 PyObject *
3136 CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
3137 {
3138 int *gpu_data = (int*)device_malloc(sizeof(int)*2);
3139 if(gpu_data == NULL){
3140 return NULL;
3141 }
3142 get_gpu_ptr_size<<<1,1>>>(gpu_data);
3143
3144 cudaError_t cudaErr = cudaGetLastError();
3145 if (cudaSuccess != cudaErr){
3146
3147 device_free(gpu_data);
3148 return PyErr_Format(PyExc_RuntimeError,
3149 "CudaNdarray_ptr_int_size: error when calling the gpu code. (%s)",
3150 cudaGetErrorString(cudaErr));
3151 }
3152
3153 // Transfer the result to cpu
3154 int gpu_sizes[] = {-1,-1};
3155 cublasStatus_t err;
3156 err = cublasGetVector(2, sizeof(int), gpu_data, 1, gpu_sizes, 1);
3157 device_free(gpu_data);
3158
3159 if (CUBLAS_STATUS_SUCCESS != err){
3160         PyErr_SetString(PyExc_RuntimeError, "error copying data from device memory");
3161 return NULL;
3162 }
3163 return Py_BuildValue("iiii", (int) gpu_sizes[0], (int)sizeof(float*),
3164 (int)sizeof(int), (int) gpu_sizes[1]);
3165 }
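
// cublasGetVector is used above mostly for convenience; for contiguous
// data a plain cudaMemcpy performs the same two-int round trip without
// touching cublas. A sketch under that assumption:
static cudaError_t example_fetch_two_ints(const int *gpu_data, int host_out[2])
{
    // Copies 2 contiguous ints device -> host; equivalent to the
    // stride-1/stride-1 cublasGetVector call above.
    return cudaMemcpy(host_out, gpu_data, 2 * sizeof(int),
                      cudaMemcpyDeviceToHost);
}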
3166
3167 static int cublas_init();
3168 static void cublas_shutdown();
3169 // Initialize the gpu.
3170 // Takes two optional parameters, the device number and if we should use cnmem.
3171 // If the device number is provided, it sets that device to be the active device.
3172 // If not provided (usually just to test whether the gpu is available at all),
3173 // it does not set an active device.
3174 // Raises EnvironmentError or ValueError (as appropriate) if the initialization failed.
3175 // cnmem is treated like a bool: if it converts to 0, don't use cnmem; otherwise, use it.
3176 PyObject *
3177 CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
3178 {
3179 int card_nb = 0;
3180 int card_number_provided = 1;
3181 float cnmem = 0; // Theano flag lib.cnmem
3182 // if we're given something wildly invalid, this will throw a TypeError
3183 if(!PyArg_ParseTuple(args, "|if", &card_nb, &cnmem))
3184 return NULL;
3185 if(cnmem)
3186 g_use_cnmem = true;
3187
3188 if(PyTuple_Size(args) == 0) {
3189 card_number_provided = 0;
3190 card_nb = 0;
3191 }
3192
3193 int deviceCount;
3194 cudaError err = cudaGetDeviceCount(&deviceCount);
3195 if(cudaSuccess != err) {
3196 return PyErr_Format(PyExc_EnvironmentError,
3197 "Unable to get the number of gpus available: %s",
3198 cudaGetErrorString(cudaGetLastError()));
3199 }
3200
3201 // as soon as the first successful call to a cuda* function is made, a
3202 // gpu context has been created
3203 g_gpu_context_active = 1;
3204
3205 if(deviceCount <= 0) {
3206 return PyErr_Format(PyExc_EnvironmentError,
3207 "Can't use the GPU, no devices support CUDA");
3208 }
3209 if(card_number_provided && (card_nb < 0 || card_nb > (deviceCount - 1))) {
3210 return PyErr_Format(PyExc_ValueError,
3211 "Bad device number %d. Only %d devices available.",
3212 card_nb,
3213 deviceCount);
3214 }
3215
3216 cudaDeviceProp deviceProp;
3217 err = cudaGetDeviceProperties(&deviceProp, card_nb);
3218 if(cudaSuccess != err) {
3219 return PyErr_Format(PyExc_EnvironmentError,
3220 "Unable to get properties of gpu %i: %s",
3221 card_nb,
3222 cudaGetErrorString(cudaGetLastError()));
3223 }
3224
3225 if(deviceProp.major == 9999 && deviceProp.minor == 9999 ){
3226 return PyErr_Format(PyExc_EnvironmentError,
3227 "There is no device that supports CUDA");
3228 }
3229
3230 if(card_number_provided) {
3231 err = cudaSetDevice(card_nb);
3232 if(cudaSuccess != err) {
3233 return PyErr_Format(PyExc_EnvironmentError,
3234 "Unable to set device %i: %s",
3235 card_nb,
3236 cudaGetErrorString(cudaGetLastError()));
3237 }
3238 if (cublas_init() == -1)
3239 return NULL;
3240 }
3241 if(card_number_provided && g_use_cnmem) {
3242 size_t mem = 0;
3243 if (cnmem > 1)
3244 mem = cnmem * 1024 * 1024;
3245 else{
3246             // Clip to 95% to leave memory for the driver.
3247             // 98% didn't work in some cases.
3248 if (cnmem > .95){
3249 cnmem = .95;
3250 }
3251 size_t free = 0, total = 0;
3252 cudaError_t err = cudaMemGetInfo(&free, &total);
3253 if (err != cudaSuccess){
3254 // Clear the error flag, cudaMemGetInfo doesn't do it.
3255 // Currently this returns the same thing as err, but if in future
3256 // it returns something else I still don't see why we should ignore
3257 // it. All we want to do here is reset the flag.
3258 cudaGetLastError();
3259 PyErr_Format(PyExc_RuntimeError,
3260 "Error while getting memory info about the gpu: %s",
3261 cudaGetErrorString(err));
3262 return NULL;
3263 }
3264 mem = total * cnmem;
3265 }
3266 if(initCnmem(card_number_provided, card_nb, mem) == -1){
3267 return NULL;
3268 }
3269 }
3270
3271 Py_INCREF(Py_None);
3272 return Py_None;
3273 }
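
// The cnmem argument above is overloaded: values > 1 are read as megabytes,
// values in (0, 1] as a fraction of total device memory, clipped to .95 to
// leave headroom for the driver. A pure-function sketch of just that policy
// (the helper name is illustrative):
static size_t example_cnmem_pool_bytes(float cnmem, size_t total_bytes)
{
    if (cnmem > 1)
        return (size_t)(cnmem * 1024 * 1024); // interpreted as MB
    if (cnmem > .95f)
        cnmem = .95f;                         // clip, as above
    return (size_t)(total_bytes * cnmem);     // fraction of total memory
}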
3274
3275 PyObject *
3276 CudaNdarray_active_device_number(PyObject* _unused, PyObject* _unused_args) {
3277 // NB: No cuda error checking here; keeps things simple, and it's not
3278 // really necessary.
3279 int currentDevice;
3280     cudaGetDevice(&currentDevice);
3281 return PyInt_FromLong(currentDevice);
3282 }
3283
3284 PyObject *
3285 CudaNdarray_active_device_name(PyObject* _unused, PyObject* _unused_args) {
3286 // NB: No cuda error checking here; keeps things simple, and it's not
3287 // really necessary.
3288 int currentDevice;
3289     cudaGetDevice(&currentDevice);
3290
3291 cudaDeviceProp deviceProp;
3292 cudaGetDeviceProperties(&deviceProp, currentDevice);
3293 return PyString_FromString(deviceProp.name);
3294 }
3295
3296 PyObject *
3297 CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
3298 // Don't handle errors here
3299 cublas_shutdown();
3300 g_gpu_context_active = 0; // context has now been closed down
3301 if(g_use_cnmem) {
3302 cnmemStatus_t status = cnmemFinalize();
3303 if(status != CNMEM_STATUS_SUCCESS) {
3304 fprintf(stderr, "CudaNdarray_gpu_shutdown: cnmemFinalize failed! Reason=%s\n",
3305 cnmemGetErrorString(status));
3306 if(status == CNMEM_STATUS_CUDA_ERROR) {
3307 fprintf(stderr, " Cuda-Reason=%s\n",
3308 cudaGetErrorString(cudaGetLastError()));
3309 }
3310 }
3311 }
3312
3313 Py_INCREF(Py_None);
3314 return Py_None;
3315 }
3316
3317 /*
3318 * This function is tested in theano/misc/test_pycuda_theano_simple.py
3319 */
3320 PyObject *
3321 CudaNdarray_from_gpu_pointer(PyObject* _unused, PyObject* args)
3322 {
3323 int verbose = 0;
3324 PyObject *gpu_ptr = NULL;
3325 PyObject *shapes = NULL;
3326 PyObject *strides = NULL;
3327 PyObject *base = NULL;
3328 PyObject *rval = NULL;
3329
3330     //args should consist of 4 python objects:
3331     //The first is the gpu ptr
3332     //The second is the shape, the third is the strides
3333     //and the fourth is the base
3334 if (! PyArg_ParseTuple(args, "OOOO", &gpu_ptr, &shapes, &strides, &base))
3335 return NULL;
3336
3337 if (verbose) printf("In CudaNdarray_from_gpu_pointer\n");
3338 if (!PyLong_Check(gpu_ptr))
3339 {
3340         PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: The gpu pointer is not a long");
3341 return NULL;
3342 }
3343
3344 Py_ssize_t nd = PyObject_Length(shapes);
3345 if (nd < 0)
3346 {
3347 PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Couldn't get length of second argument");
3348 return NULL;
3349 }
3350 Py_ssize_t nd_stride = PyObject_Length(strides);
3351 if (nd_stride < 0)
3352 {
3353 PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Couldn't get length of third argument");
3354 return NULL;
3355 }
3356
3357 if (nd != nd_stride)
3358 {
3359 PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: We need the same number of shapes and strides");
3360 return NULL;
3361 }
3362
3363 rval = CudaNdarray_New();
3364
3365 if (CudaNdarray_set_nd((CudaNdarray *)rval, nd))
3366 {
3367 //CudaNdarray_set_nd set the error msg
3368 return NULL;
3369 }
3370     // set gpu pointer
3371 assert(((CudaNdarray *)rval)->data_allocated == 0);
3372 if (CudaNdarray_set_device_data((CudaNdarray *)rval, (float *)PyInt_AsLong(gpu_ptr), base))
3373 {
3374         PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Error while setting the gpu pointer");
3375 return NULL;
3376
3377 }
3378
3379 // Set dims and strides
3380 for (int i = nd-1; i >= 0; --i)
3381 {
3382 PyObject * idx = PyLong_FromLong(i);
3383 if (idx == NULL)
3384 {
3385 PyErr_SetString(PyExc_Exception, "CudaNdarray_from_gpu_pointer: Couldn't make long object to loop over list/tuple");
3386 return NULL;
3387 }
3388         PyObject* dim_ = PyObject_GetItem(shapes, idx);
3389         PyObject* strd_ = PyObject_GetItem(strides, idx);
3390         if (!dim_ || !PyInt_Check(dim_))
3391         {
3392             PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: shapes[%d] is not an int", i);
3393             Py_DECREF(idx); Py_XDECREF(dim_); Py_XDECREF(strd_); return NULL;
3394         }
3395         if (!strd_ || !PyInt_Check(strd_))
3396         {
3397             PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: strides[%d] is not an int", i);
3398             Py_DECREF(idx); Py_DECREF(dim_); Py_XDECREF(strd_); return NULL;
3399         }
3400 int dim = PyInt_AsLong(dim_);
3401 int strd = PyInt_AsLong(strd_);
3402 CudaNdarray_set_stride((CudaNdarray *)rval, i, strd);
3403 CudaNdarray_set_dim((CudaNdarray *)rval, i, dim);
3404 Py_DECREF(idx);
3405 Py_DECREF(dim_);
3406 Py_DECREF(strd_);
3407 }
3408 if (verbose) printf("CudaNdarray_from_gpu_pointer normal return\n");
3409 return rval;
3410 }
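
// Note that the strides handed to from_gpu_pointer are in *elements*, the
// convention used throughout this file, while pycuda reports strides in
// *bytes*. A sketch of the conversion a caller would presumably perform
// (assuming float32 data, the only dtype CudaNdarray supports):
static int example_byte_to_elem_stride(int byte_stride)
{
    // 4 == sizeof(float); a non-multiple would indicate a misaligned view.
    return byte_stride / 4;
}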
3411
3412 PyObject *
3413 CudaNdarray_Dot(PyObject* _unused, PyObject* args)
3414 {
3415 PyObject *l=NULL;
3416 PyObject *r=NULL;
3417 PyObject * rval = NULL;
3418
3419 //args should consist of two python objects ("OO")
3420 if (! PyArg_ParseTuple(args, "OO", &l, &r))
3421 return NULL;
3422
3423 if (!CudaNdarray_Check(l) || !CudaNdarray_Check(r))
3424 {
3425 PyErr_SetString(PyExc_TypeError, "CudaNdarray arguments required ");
3426 goto CudaNdarray_dot_fail;
3427 }
3428 if (((CudaNdarray*)l)->nd != 2)
3429 {
3430 PyErr_SetString(PyExc_TypeError, "need 2d CudaNdarray arg for now");
3431 goto CudaNdarray_dot_fail;
3432 }
3433 if (((CudaNdarray*)r)->nd != 2)
3434 {
3435 PyErr_SetString(PyExc_TypeError, "need 2d CudaNdarray arg for now");
3436 goto CudaNdarray_dot_fail;
3437 }
3438 rval = CudaNdarray_New();
3439 if (!rval)
3440 {
3441 goto CudaNdarray_dot_fail;
3442 }
3443 int dims[2];
3444 dims[0] = CudaNdarray_HOST_DIMS((CudaNdarray*)l)[0];
3445 dims[1] = CudaNdarray_HOST_DIMS((CudaNdarray*)r)[1];
3446 if (CudaNdarray_alloc_contiguous((CudaNdarray*)rval, 2, dims))
3447 {
3448 goto CudaNdarray_dot_fail;
3449 }
3450 if (CudaNdarray_gemm(1.0, (CudaNdarray*)l, (CudaNdarray*)r, 0.0, (CudaNdarray*)rval))
3451 {
3452 goto CudaNdarray_dot_fail;
3453 }
3454
3455 return rval;
3456
3457 CudaNdarray_dot_fail:
3458 Py_XDECREF(rval);
3459 return NULL;
3460 }
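
// dot() above only shapes the output; checking that the inner dimensions
// agree is presumably left to CudaNdarray_gemm. The shape rule being
// implemented is the usual (m, k) x (k, n) -> (m, n). A host-side sketch
// of that rule (the helper name is illustrative):
static int example_gemm_out_shape(const int l_dims[2], const int r_dims[2],
                                  int out_dims[2])
{
    if (l_dims[1] != r_dims[0])
        return -1;             // inner dimensions must agree
    out_dims[0] = l_dims[0];   // rows of the left operand
    out_dims[1] = r_dims[1];   // columns of the right operand
    return 0;
}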
3461
3462 static PyObject *
3463 filter(PyObject* __unused_self, PyObject *args) // args = (data, broadcastable, strict, storage)
3464 {
3465 /*
3466      * TODO: DOC what this function should do in the various cases.
3467 * What is 'strict' supposed to mean in the context of this function?
3468 * What do we do with input that could be interpreted as matching the broadcastable pattern in strict vs. non-strict cases?
3469 *
3470 */
3471 PyObject *py_data=NULL;
3472 PyArrayObject * data = NULL;
3473 int strict = 0;
3474 PyObject * broadcastable=NULL;
3475 PyObject * storage=NULL;
3476 CudaNdarray * rval=NULL;
3477
3478 //Python object references which are provided to the caller are borrowed references
3479 if (!PyArg_ParseTuple(args, "OOiO", &py_data, &broadcastable, &strict, &storage)) return NULL;
3480
3481 if (!PyTuple_Check(broadcastable)){
3482 PyErr_SetString(PyExc_TypeError, "broadcastable arg should be a tuple of int.");
3483 return NULL;
3484 }
3485 Py_INCREF(py_data);
3486 Py_INCREF(broadcastable);
3487
3488 CudaNdarray * cnda = (CudaNdarray*)py_data;
3489
3490 if (strict || CudaNdarray_Check(py_data))
3491 {
3492 //TODO: support non-strict "casting" from a vt to the broadcastable/type/size that we need.
3493 if (!CudaNdarray_Check(py_data))
3494 {
3495 Py_DECREF(py_data);
3496 Py_DECREF(broadcastable);
3497 PyErr_SetString(PyExc_TypeError, "strict mode requires CudaNdarray");
3498 return NULL;
3499 }
3500 if (cnda->nd != PyTuple_Size(broadcastable))
3501 {
3502 Py_DECREF(py_data);
3503 Py_DECREF(broadcastable);
3504 PyErr_Format(PyExc_TypeError, "Wrong rank: %i vs %li", cnda->nd, (long)PyTuple_Size(broadcastable));
3505 return NULL;
3506 }
3507 for (int i = 0; i < cnda->nd; ++i)
3508 {
3509 if ((CudaNdarray_HOST_DIMS(cnda)[i] > 1) && PyInt_AsLong(PyTuple_GetItem(broadcastable, Py_ssize_t(i))))
3510 {
3511 PyErr_Format(PyExc_TypeError, "Non-unit size in broadcastable vt dimension %i", i);
3512 Py_DECREF(py_data);
3513 Py_DECREF(broadcastable);
3514 return NULL;
3515 }else if (CudaNdarray_HOST_DIMS(cnda)[i] == 1 && CudaNdarray_HOST_STRIDES(cnda)[i] != 0){
3516                 PyErr_Format(PyExc_TypeError, "Non-zero stride (%d) on dimension %d of size 1",
3517 CudaNdarray_HOST_STRIDES(cnda)[i], i);
3518 Py_DECREF(py_data);
3519 Py_DECREF(broadcastable);
3520 return NULL;
3521 }
3522 }
3523 Py_DECREF(broadcastable);
3524 return py_data;
3525 }
3526 else
3527 {
3528 data = (PyArrayObject*)PyArray_FromObject(py_data, REAL_TYPENUM, PyTuple_Size(broadcastable), PyTuple_Size(broadcastable));
3529 if (!data)
3530 {
3531 //err message already defined
3532 Py_DECREF(py_data);
3533 Py_DECREF(broadcastable);
3534 return NULL;
3535 }
3536 for (int i = 0; i < PyArray_NDIM(data); ++i)
3537 {
3538 if ((PyArray_DIMS(data)[i] > 1) && PyInt_AsLong(PyTuple_GetItem(broadcastable, Py_ssize_t(i))))
3539 {
3540 PyErr_Format(PyExc_TypeError, "Non-unit size in broadcastable dimension %i", i);
3541 Py_DECREF(data);
3542 Py_DECREF(py_data);
3543 Py_DECREF(broadcastable);
3544 return NULL;
3545 }
3546 }
3547 if (storage && CudaNdarray_Check(storage))
3548 {
3549 rval = (CudaNdarray*) storage;
3550 Py_INCREF(rval);
3551 }
3552 else
3553 {
3554 rval = (CudaNdarray*) CudaNdarray_New();
3555 }
3556 if (rval)
3557 {
3558 if (CudaNdarray_CopyFromArray(rval, data))
3559 {
3560 Py_DECREF(rval);
3561 rval = NULL;
3562 }
3563 }
3564 Py_DECREF(data);
3565 Py_DECREF(py_data);
3566 Py_DECREF(broadcastable);
3567 return (PyObject*)rval;
3568 }
3569 }
3570
3571 //TODO-- CudaNdarray_Dot and CudaNdarray_active_device_name follow different capitalization conventions.
3572 //       Pick one and standardize it; this file is already annoying enough to grep through.
3573 static PyMethodDef module_methods[] = {
3574 {"dimshuffle", CudaNdarray_Dimshuffle, METH_VARARGS, "Returns the dimshuffle of a CudaNdarray."},
3575 {"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."},
3576 {"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Select the gpu card to use; also usable to test whether CUDA is available."},
3577     {"select_a_gpu", CudaNdarray_select_a_gpu, METH_NOARGS, "Call this method if you want to select a GPU before calling gpu_init and let the driver choose the GPU."},
3578 {"active_device_name", CudaNdarray_active_device_name, METH_VARARGS, "Get the name of the active device."},
3579 {"active_device_number", CudaNdarray_active_device_number, METH_VARARGS, "Get the number of the active device."},
3580 {"gpu_shutdown", CudaNdarray_gpu_shutdown, METH_VARARGS, "Shut down the gpu."},
3581 {"device_properties", GetDeviceProperties, METH_VARARGS, "Return a dictionary with the device properties."},
3582 {"mem_info", GetDeviceMemInfo, METH_NOARGS, "Return a tuple with the free and total memory on the gpu in bytes."},
3583 #if COMPUTE_GPU_MEM_USED
3584 {"theano_allocated", GetTheanoAllocInfo, METH_NOARGS, "Return the size in bytes of memory Theano currently have allocated on the gpu."},
3585 #endif
3586 {"ptr_int_size", CudaNdarray_ptr_int_size, METH_VARARGS, "Return a tuple with the size of gpu pointer, cpu pointer and int in bytes."},
3587 {"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable. strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."},
3588     {"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than frees"},
3589     {"from_gpu_pointer", CudaNdarray_from_gpu_pointer, METH_VARARGS, "Used to create a CudaNdarray from memory already allocated on the gpu (e.g. by pycuda)."},
3590 {"synchronize", CudaNdarray_synchronize, METH_NOARGS, "Used to synchronize the device"},
3591 {"cublas_v2", CudaNdarray_cublasv2, METH_NOARGS,
3592 "Used to know if this version of cuda_ndarray is linked with cublas v2."},
3593     {NULL, NULL, 0, NULL}  /* Sentinel */
3594 };
3595
3596 #define CNDA_MOD_NAME "cuda_ndarray"
3597 #define CNDA_DOCSTRING "CUDA implementation of a numpy ndarray-like object."
3598
3599 #if PY_MAJOR_VERSION == 3
3600 static struct PyModuleDef cuda_ndarray_moduledef =
3601 {
3602 PyModuleDef_HEAD_INIT,
3603 CNDA_MOD_NAME,
3604 CNDA_DOCSTRING,
3605 -1, /* size of per-interpreter state of the module,
3606 or -1 if the module keeps state in global variables. */
3607 module_methods
3608 };
3609
3610 PyMODINIT_FUNC
3611 PyInit_cuda_ndarray(void)
3612 #else
3613 PyMODINIT_FUNC
3614 initcuda_ndarray(void)
3615 #endif
3616 {
3617 import_array();
3618
3619 PyObject* m;
3620
3621 if (PyType_Ready(&CudaNdarrayType) < 0) {
3622 #if PY_MAJOR_VERSION == 3
3623 return NULL;
3624 #else
3625 return;
3626 #endif
3627 }
3628
3629 #if PY_MAJOR_VERSION == 3
3630 m = PyModule_Create(&cuda_ndarray_moduledef);
3631 #else
3632 m = Py_InitModule3(CNDA_MOD_NAME, module_methods, CNDA_DOCSTRING);
3633 #endif
3634
3635 if (m == NULL) {
3636 #if PY_MAJOR_VERSION == 3
3637 return NULL;
3638 #else
3639 return;
3640 #endif
3641 }
3642
3643 Py_INCREF(&CudaNdarrayType);
3644 PyModule_AddObject(m, "CudaNdarray", (PyObject *)&CudaNdarrayType);
3645 #if COMPUTE_GPU_MEM_USED
3646 for(int i=0;i<TABLE_SIZE;i++){
3647 _alloc_size_table[i].ptr=NULL;
3648 _alloc_size_table[i].size=0;
3649 }
3650 #endif
3651 // cublasInit();
3652 //if (0&&CUBLAS_STATUS_SUCCESS != cublasGetError())
3653 //{
3654 //std::cerr << "WARNING: initcuda_ndarray: error initializing device\n";
3655 //}
3656 if (0) //TODO: is this necessary?
3657 {
3658 int deviceId = 0; // TODO: what number goes here?
3659 cudaSetDevice(deviceId);
3660 cudaError_t err = cudaGetLastError();
3661 if( cudaSuccess != err)
3662 {
3663 std::cerr << "Error in SetDevice:" << cudaGetErrorString(err) << "\n";
3664 }
3665 }
3666
3667 #if PY_MAJOR_VERSION == 3
3668 return m;
3669 #endif
3670 }
3671
3672
3673 //////////////////////////////////////
3674 //
3675 // C API FOR CudaNdarray
3676 //
3677 //////////////////////////////////////
3678
3679 int
3680 CudaNdarray_Check(const PyObject * ob)
3681 {
3682 //TODO: doesn't work with inheritance
3683 return CudaNdarray_CheckExact(ob);
3684 }
3685 int
3686 CudaNdarray_CheckExact(const PyObject * ob)
3687 {
3688 return ((Py_TYPE(ob) == &CudaNdarrayType) ? 1 : 0);
3689 }
3690
3691 PyObject *
3692 CudaNdarray_New(int nd)
3693 {
3694 CudaNdarray *self = (CudaNdarray *)CudaNdarrayType.tp_alloc(&CudaNdarrayType, 0);
3695 if (self == NULL)
3696 {
3697 PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_New failed to allocate self");
3698 return NULL;
3699 }
3700 CudaNdarray_null_init(self);
3701
3702 if (nd == 0)
3703 {
3704 self->nd = 0;
3705 }
3706 else if (nd > 0)
3707 {
3708 if (CudaNdarray_set_nd(self, nd))
3709 {
3710 Py_DECREF(self);
3711 return NULL;
3712 }
3713 }
3714 ++_outstanding_mallocs[1];
3715 return (PyObject *)self;
3716 }
3717
3718
3719
3720 //////////////////////////////
3721 //
3722 // Published helper functions
3723 //
3724 //////////////////////////////
3725
3726 static int
3727 cublas_init()
3728 {
3729 cublasStatus_t err;
3730 err = cublasCreate(&handle);
3731 if (CUBLAS_STATUS_SUCCESS != err)
3732 {
3733 if(CUBLAS_STATUS_NOT_INITIALIZED == err)
3734 PyErr_SetString(PyExc_RuntimeError,
3735 "cublasCreate() returned this error "
3736 "'the CUDA Runtime initialization failed'");
3737 else if(CUBLAS_STATUS_ALLOC_FAILED == err)
3738 PyErr_SetString(PyExc_RuntimeError,
3739 "cublasCreate() returned this error "
3740 "'the resources could not be allocated'");
3741 else
3742 PyErr_SetString(PyExc_RuntimeError,
3743                             "unknown error returned by cublasCreate()");
3744 return -1;
3745 }
3746 // Set the default stream as the one to execute on (default)
3747 cublasSetStream(handle, NULL);
3748 // Pointer to scalars are on the host (also default)
3749 cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
3750 #if CUDA_VERSION >= 5000
3751 // atomics can be used in kernels to speed up operations (not default)
3752 // This may lead to a slight variance from run to run in some operations
3753 cublasSetAtomicsMode(handle, CUBLAS_ATOMICS_ALLOWED);
3754 #endif
3755 return 0;
3756 }
3757
3758 static void
3759 cublas_shutdown()
3760 {
3761 if (handle != NULL)
3762 cublasDestroy(handle);
3763 // No point in handling any errors here
3764 handle = NULL;
3765 }
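// Pairing sketch (illustration only, not part of the original module): the
// BLAS wrappers below assume a live handle, so a standalone caller would
// bracket them with the two helpers above.
static int example_with_cublas_handle()
{
    if (cublas_init())
        return -1; // Python exception already set by cublas_init
    // ... CudaNdarray_gemm / CudaNdarray_sgemv / CudaNdarray_sger here ...
    cublas_shutdown();
    return 0;
}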
3766
3767 int
3768 CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
3769 {
3770 int err = CudaNdarray_alloc_contiguous(self, PyArray_NDIM(obj),
3771 PyArray_DIMS(obj));
3772 if (err) {
3773 return err;
3774 }
3775
3776 int typenum = PyArray_TYPE(obj);
3777 if (typenum != REAL_TYPENUM)
3778 {
3779 PyErr_SetString(PyExc_TypeError, "can only copy from float arrays");
3780 return -1;
3781 }
3782 assert( 4 == PyArray_ITEMSIZE(obj));
3783 PyArrayObject * py_src = (PyArrayObject *)PyArray_ContiguousFromAny(
3784 (PyObject*)obj, typenum, self->nd, self->nd);
3785 if (!py_src) {
3786 return -1;
3787 }
3788 npy_intp py_src_size = PyArray_SIZE(py_src);
3789 void *py_src_data = PyArray_DATA(py_src);
3790 cudaError_t cerr;
3791 CNDA_BEGIN_ALLOW_THREADS;
3792 cerr = cudaMemcpy(self->devdata, py_src_data,
3793 py_src_size * sizeof(real),
3794 cudaMemcpyHostToDevice);
3795 //CNDA_THREAD_SYNC; // unneeded because cudaMemcpy is blocking anyway
3796 CNDA_END_ALLOW_THREADS;
3797 if (cudaSuccess != cerr)
3798 {
3799 PyErr_Format(PyExc_RuntimeError,
3800                      "Cuda error '%s' while copying %lli data elements"
3801                      " to device memory. src ptr=%p. dst ptr=%p",
3802 cudaGetErrorString(cerr),
3803 (long long)py_src_size,
3804 py_src_data,
3805 self->devdata);
3806 Py_DECREF(py_src);
3807 return -1;
3808 }
3809 Py_DECREF(py_src);
3810 return 0;
3811 }
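// Combined sketch (hypothetical helper): the usual pairing of
// CudaNdarray_New and CudaNdarray_CopyFromArray to upload a float32 numpy
// array to the device.
static PyObject * example_from_numpy(PyArrayObject * arr)
{
    CudaNdarray * rval = (CudaNdarray *) CudaNdarray_New(PyArray_NDIM(arr));
    if (!rval)
        return NULL;
    if (CudaNdarray_CopyFromArray(rval, arr))
    {
        Py_DECREF(rval); // also frees the device allocation
        return NULL;
    }
    return (PyObject *) rval;
}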
3812
3813 PyObject *
3814 CudaNdarray_new_nd(int nd)
3815 {
3816 CudaNdarray * rval = (CudaNdarray*) CudaNdarray_New();
3817 if (!rval || CudaNdarray_set_nd(rval, nd))
3818 {
3819 Py_XDECREF(rval);
3820 rval = NULL;
3821 }
3822 return (PyObject *) rval;
3823 }
3824
3825
3826 /**
3827 * Initialize 'self' as a view of 'base', with memory storage 'data'
3828 */
3829
3830 int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base)
3831 {
3832 if (self->data_allocated)
3833 {
3834 assert(self->devdata);
3835 if (device_free(self->devdata))
3836 {
3837 self->devdata = NULL;
3838 self->data_allocated = 0;
3839 return -1;
3840 }
3841 }
3842 // Get the original base object (base.base.base...)
3843 PyObject * orig_base = base;
3844 // base is not always a CudaNdarray. It can be a GpuArray from pycuda, ...
3845 while (orig_base && CudaNdarray_Check(orig_base) && ((CudaNdarray*) orig_base)->base)
3846 {
3847 // base_base is itself a view
3848 orig_base = ((CudaNdarray*) orig_base)->base;
3849 }
3850 //N.B. XDECREF and XINCREF are no-ops for NULL pointers
3851 if (self->base != orig_base)
3852 {
3853 Py_XDECREF(self->base);
3854 self->base = orig_base;
3855 Py_XINCREF(self->base);
3856 }
3857 self->data_allocated = 0;
3858 self->devdata = data;
3859 return 0;
3860 }
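// Aliasing sketch (hypothetical helper): after this call 'view' shares
// 'owner's device memory without owning it. 'owner' (or its root base) is
// kept alive through view->base, and view->data_allocated stays 0 so the
// memory is never freed twice.
static int example_make_alias(CudaNdarray * view, CudaNdarray * owner)
{
    return CudaNdarray_set_device_data(view,
                                       CudaNdarray_DEV_DATA(owner),
                                       (PyObject *) owner);
}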
3861
3862 static __global__ void k_copy_1d(const int N, const float * x, const int sx, float * y, const int sy)
3863 {
3864 for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += gridDim.x*blockDim.x)
3865 {
3866 y[i*sy] = x[i*sx];
3867 }
3868 }
3869
3870 // N1 through N4 are the size of y
3871 static __global__ void k_copy_4d(const int N1,
3872 const int N2, const int N3, const int N4,
3873 const float * x, const int sx1, const int sx2, const int sx3,
3874 const int sx4, float * y, const int sy1, const int sy2,
3875 const int sy3, const int sy4)
3876 {
3877 // These must be made int instead of unsigned int due to a bug in nvcc
3878 int bx = blockIdx.x;
3879 int by = blockIdx.y;
3880
3881 for (int i = bx; i < N1; i += gridDim.x)
3882 {
3883 for (int j = by; j < N2; j += gridDim.y)
3884 {
3885 for (int k = threadIdx.x; k < N3; k += (int) blockDim.x)
3886 {
3887 for (int l = threadIdx.y; l < N4; l += (int) blockDim.y)
3888 {
3889 y[i * sy1 + j * sy2 + k * sy3 + l * sy4] =
3890 x[i * sx1 + j * sx2 + k * sx3 + l * sx4];
3891 }
3892 }
3893 }
3894 }
3895 }
3896
3897 //copy from other into self
3898 int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
3899 const CudaNdarray * other,
3900 bool unbroadcast)
3901 {
3902 int verbose = 0;
3903 if (verbose>1) fprintf(stderr, "CudaNdarray_CopyFromCudaNdarray\n");
3904
3905 //standard elemwise size checks
3906 if (self->nd == -1)
3907 {
3908 PyErr_SetString(PyExc_TypeError,
3909 "can't copy into un-initialized CudaNdarray");
3910 return -1;
3911 }
3912 CudaNdarray * new_other = NULL;
3913
3914 if (self->nd < other->nd)
3915 {
3916 PyErr_Format(PyExc_NotImplementedError,
3917 "CudaNdarray_CopyFromCudaNdarray: The number of dimensions of the "
3918 "destination needs to be >= the number of dimensions of the "
3919 "source. Got %d and %d.", self->nd, other->nd);
3920 return -1;
3921 }
3922 else if (self->nd != other->nd)
3923 {
3924         new_other = (CudaNdarray *) CudaNdarray_View(other);
             if (!new_other)
                 return -1;
3925 int added_dims = self->nd - other->nd;
3926 int* pattern = (int*) alloca(self->nd * sizeof(int));
3927 for(int i = 0; i < added_dims; i++)
3928 pattern[i] = -1;
3929 for(int i = 0; i < other->nd; i++)
3930 pattern[i + added_dims] = i;
3931 CudaNdarray_dimshuffle(new_other, self->nd, pattern);
3932 other = new_other;
3933 }
3934 assert(self->nd == other->nd);
3935 //standard elemwise dim checks (also compute total size)
3936 unsigned int size = 1;
3937 unsigned int size_source = 1;
3938 for (int i = 0; i< self->nd; ++i)
3939 {
3940 if ((CudaNdarray_HOST_DIMS(self)[i] != CudaNdarray_HOST_DIMS(other)[i])
3941 && (1!=CudaNdarray_HOST_DIMS(other)[i] || !unbroadcast) )
3942 {
3943 PyErr_Format(PyExc_ValueError,
3944 "CudaNdarray_CopyFromCudaNdarray:"
3945 " need same dimensions for dim %d,"
3946 " destination=%d, source=%d",
3947 i, CudaNdarray_HOST_DIMS(self)[i],
3948 CudaNdarray_HOST_DIMS(other)[i]);
3949 Py_XDECREF(new_other);
3950 return -1;
3951 }
3952 size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
3953 size_source *= (unsigned int) CudaNdarray_HOST_DIMS(other)[i];
3954 }
3955 if (0 == size)
3956 {
3957 Py_XDECREF(new_other);
3958 return 0; //nothing to copy, we're done.
3959 }
3960 if (CudaNdarray_is_c_contiguous(self) &&
3961 CudaNdarray_is_c_contiguous(other) &&
3962 size == size_source)
3963 {
3964 if (verbose)
3965 fprintf(stderr, "Copying contiguous vector with cublasScopy\n");
3966
3967 cublasStatus_t err;
3968 err = cublasScopy(handle, size, CudaNdarray_DEV_DATA(other), 1,
3969 CudaNdarray_DEV_DATA(self), 1);
3970 CNDA_THREAD_SYNC;
3971 Py_XDECREF(new_other);
3972 if (CUBLAS_STATUS_SUCCESS != err)
3973 {
3974 PyErr_SetString(PyExc_RuntimeError, "Error copying memory");
3975 return -1;
3976 }
3977 return 0;
3978 }
3979 //TODO: rewrite these copy operations to be more efficient
3980 // See, for example the transpose example in the cuda_sdk.
3981 switch (self->nd)
3982 {
3983 case 0: // scalar
3984 {
3985 // THIS CASE SHOULD NEVER HAPPEN BECAUSE SCALARS ARE ALWAYS C CONTIGUOUS
3986 assert(0);
3987 }; break;
3988 case 1: // vector
3989 {
3990 if (verbose) fprintf(stderr, "Copying non-contiguous vector\n");
3991 if (verbose) fprint_CudaNdarray(stderr, other);
3992 unsigned int n_blocks = std::min(size,
3993 (unsigned int)NUM_VECTOR_OP_BLOCKS);
3994 unsigned int n_threads = std::min(ceil_intdiv(size, n_blocks),
3995 (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
3996 k_copy_1d<<<n_blocks, n_threads>>>(size,
3997 CudaNdarray_DEV_DATA(other),
3998 CudaNdarray_HOST_STRIDES(other)[0],
3999 CudaNdarray_DEV_DATA(self),
4000 CudaNdarray_HOST_STRIDES(self)[0]);
4001 CNDA_THREAD_SYNC;
4002 cudaError_t err = cudaGetLastError();
4003 if( cudaSuccess != err)
4004 {
4005 PyErr_Format(PyExc_RuntimeError,
4006 "Cuda error: %s: %s. (n_blocks=%i,"
4007 " n_threads_per_block=%i)\n", "k_copy_1d",
4008 cudaGetErrorString(err), n_blocks, n_threads);
4009 Py_XDECREF(new_other);
4010 return -1;
4011 }
4012 }; break;
4013 case 4: // 4-tensor
4014 {
4015 if (verbose)
4016 {
4017 if (0 != fprint_CudaNdarray(stderr, other))
4018 {
4019 Py_XDECREF(new_other);
4020 return -1;
4021 }
4022 }
4023
4024 // The blocks implement the looping over the first two axes so
4025 // this needs to be (N1, N2)
4026 dim3 n_blocks( std::min(CudaNdarray_HOST_DIMS(self)[0],
4027 NUM_VECTOR_OP_BLOCKS),
4028 std::min(CudaNdarray_HOST_DIMS(self)[1],
4029 NUM_VECTOR_OP_BLOCKS));
4030 // For the threads, just make as many as possible
4031 dim3 n_threads( std::min( (unsigned int) CudaNdarray_HOST_DIMS(self)[2],
4032 (unsigned int) NUM_VECTOR_OP_THREADS_PER_BLOCK),
4033 std::min( (unsigned int) CudaNdarray_HOST_DIMS(self)[3],
4034 (unsigned int) NUM_VECTOR_OP_THREADS_PER_BLOCK));
4035
4036 n_threads.x = std::min( (unsigned int) 32, (unsigned int) n_threads.x);
4037 n_threads.y = std::min( n_threads.y, NUM_VECTOR_OP_THREADS_PER_BLOCK / n_threads.x);
4038
4039 k_copy_4d<<<n_blocks, n_threads>>>(
4040 // size of y
4041 (unsigned int) CudaNdarray_HOST_DIMS(self)[0], // N1
4042 (unsigned int) CudaNdarray_HOST_DIMS(self)[1], // N2
4043 (unsigned int) CudaNdarray_HOST_DIMS(self)[2], // N3
4044 (unsigned int) CudaNdarray_HOST_DIMS(self)[3], // N4
4045 CudaNdarray_DEV_DATA(other), // x
4046 // x strides
4047 CudaNdarray_HOST_STRIDES(other)[0],
4048 CudaNdarray_HOST_STRIDES(other)[1],
4049 CudaNdarray_HOST_STRIDES(other)[2],
4050 CudaNdarray_HOST_STRIDES(other)[3],
4051 CudaNdarray_DEV_DATA(self), // y
4052 // y strides
4053 CudaNdarray_HOST_STRIDES(self)[0],
4054 CudaNdarray_HOST_STRIDES(self)[1],
4055 CudaNdarray_HOST_STRIDES(self)[2],
4056 CudaNdarray_HOST_STRIDES(self)[3]
4057 );
4058 CNDA_THREAD_SYNC;
4059 cudaError_t err = cudaGetLastError();
4060 if( cudaSuccess != err)
4061 {
4062 PyErr_Format(PyExc_RuntimeError,
4063 "Cuda error: %s: %s.",
4064 "k_copy_4d",
4065 cudaGetErrorString(err));
4066 Py_XDECREF(new_other);
4067 return -1;
4068 }
4069 }; break;
4070 default:
4071 {
4072 cudaError_t err = cudaGetLastError();
4073 if(cudaSuccess != err){
4074 PyErr_Format(PyExc_RuntimeError,
4075 "Unexpected Cuda error: %s: %s\n",
4076 "CudaNdarray_CopyFromCudaNdarray",
4077 cudaGetErrorString(err));
4078 Py_XDECREF(new_other);
4079 return -1;
4080 }
4081
4082 if (verbose)
4083 fprintf(stderr,
4084 "Copying with default version unbroadcast=%d\n",
4085 unbroadcast);
4086 // call worker routine
4087 unsigned int threads_per_block = std::min(size,
4088 (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
4089 unsigned int n_blocks = std::min(ceil_intdiv(size, threads_per_block),
4090 (unsigned int)NUM_VECTOR_OP_BLOCKS);
4091 const CudaNdarray * cuda_dims = other;
4092 if(unbroadcast)
4093 cuda_dims = self;
4094 //copy from other into self
4095 k_elemwise_unary_rowmajor_copy<<<n_blocks, threads_per_block>>>(
4096 size,
4097 (unsigned int)other->nd,
4098 (const int *)CudaNdarray_DEV_DIMS(cuda_dims),
4099 (const float*)CudaNdarray_DEV_DATA(other),
4100 (const int *)CudaNdarray_DEV_STRIDES(other),
4101 CudaNdarray_DEV_DATA(self),
4102 (const int *)CudaNdarray_DEV_STRIDES(self));
4103 CNDA_THREAD_SYNC;
4104 err = cudaGetLastError();
4105 if(verbose>1)
4106 fprintf(stderr,
4107 "INFO k_elemwise_unary_rowmaj (n_blocks=%i,"
4108 " n_threads_per_block=%i)\n",
4109 n_blocks, threads_per_block);
4110 if( cudaSuccess != err)
4111 {
4112 //fprint_CudaNdarray(stderr, self);
4113 //fprint_CudaNdarray(stderr, other);
4114 PyErr_Format(PyExc_RuntimeError,
4115 "Cuda error: %s: %s. (n_blocks=%i,"
4116 " n_threads_per_block=%i)\n",
4117 "k_elemwise_unary_rowmajor_copy",
4118 cudaGetErrorString(err), n_blocks,
4119 threads_per_block);
4120 Py_XDECREF(new_other);
4121 return -1;
4122 }
4123 }
4124 };
4125 Py_XDECREF(new_other);
4126 return 0;
4127 }
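// Usage sketch (hypothetical helper): with unbroadcast=true a source
// dimension of size 1 is repeated to fill the destination, e.g. a (1, 5)
// source can be copied into a (4, 5) destination; with unbroadcast=false
// the shapes must match exactly.
static int example_broadcast_copy(CudaNdarray * dst, const CudaNdarray * src)
{
    return CudaNdarray_CopyFromCudaNdarray(dst, src, true);
}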
4128
4129 int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C)
4130 {
4131 if (A->nd != 2)
4132 {
4133 PyErr_SetString(PyExc_ValueError, "non-matrix arg A to gemm");
4134 return -1;
4135 }
4136 if (B->nd != 2)
4137 {
4138 PyErr_SetString(PyExc_ValueError, "non-matrix arg B to gemm");
4139 return -1;
4140 }
4141 if (C->nd != 2)
4142 {
4143 PyErr_SetString(PyExc_ValueError, "non-matrix arg C to gemm");
4144 return -1;
4145 }
4146
4147 // We must allow dimensions to be zeros.
4148 if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0])
4149 || (CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(C)[0])
4150 || (CudaNdarray_HOST_DIMS(B)[1] != CudaNdarray_HOST_DIMS(C)[1]))
4151 {
4152 PyErr_Format(PyExc_ValueError, "dimension mismatch in args to gemm (%i,%i)x(%i,%i)->(%i,%i)",
4153 CudaNdarray_HOST_DIMS(A)[0],
4154 CudaNdarray_HOST_DIMS(A)[1],
4155 CudaNdarray_HOST_DIMS(B)[0],
4156 CudaNdarray_HOST_DIMS(B)[1],
4157 CudaNdarray_HOST_DIMS(C)[0],
4158 CudaNdarray_HOST_DIMS(C)[1]);
4159 return -1;
4160 }
4161
4162     // If matrix A or B has non-unit size and non-unit stride in both
4163     // dimensions, or a negative stride, we cannot use it directly, but we can make a copy.
4164 CudaNdarray * A_new = NULL;
4165 CudaNdarray * B_new = NULL;
4166 if (((CudaNdarray_HOST_DIMS(A)[0] > 1)
4167 && (CudaNdarray_HOST_STRIDES(A)[0] != 1)
4168 && (CudaNdarray_HOST_DIMS(A)[1] > 1)
4169 && (CudaNdarray_HOST_STRIDES(A)[1] != 1))
4170 || (CudaNdarray_HOST_STRIDES(A)[0] < 0)
4171 || (CudaNdarray_HOST_STRIDES(A)[1] < 0))
4172 {
4173 A_new = (CudaNdarray*) CudaNdarray_Copy(A);
4174 if (!A_new)
4175 return -1;
4176 A = A_new;
4177 }
4178
4179 if (((CudaNdarray_HOST_DIMS(B)[0] > 1)
4180 && (CudaNdarray_HOST_STRIDES(B)[0] != 1)
4181 && (CudaNdarray_HOST_DIMS(B)[1] > 1)
4182 && (CudaNdarray_HOST_STRIDES(B)[1] != 1))
4183 || (CudaNdarray_HOST_STRIDES(B)[0] < 0)
4184 || (CudaNdarray_HOST_STRIDES(B)[1] < 0))
4185 {
4186 B_new = (CudaNdarray*) CudaNdarray_Copy(B);
4187 if (!B_new)
4188 {
4189             // If A_new is NULL (A was not copied), Py_XDECREF does nothing
4190 Py_XDECREF(A_new);
4191 return -1;
4192 }
4193 B = B_new;
4194 }
4195
4196 // If matrix C has non-unit size and non-unit stride in both
4197 // dimensions, or negative strides, we can't operate. We cannot copy
4198 // C either, because the calling code will expect the result to be
4199 // in the original C container.
4200 if (((CudaNdarray_HOST_DIMS(C)[0] > 1)
4201 && (CudaNdarray_HOST_STRIDES(C)[0] != 1)
4202 && (CudaNdarray_HOST_DIMS(C)[1] > 1)
4203 && (CudaNdarray_HOST_STRIDES(C)[1] != 1))
4204 || (CudaNdarray_HOST_STRIDES(C)[0] < 0)
4205 || (CudaNdarray_HOST_STRIDES(C)[1] < 0))
4206 {
4207 PyErr_Format(PyExc_AssertionError,
4208 "non-unit or negative stride in gemm arg C (%i,%i) of shape (%i,%i)",
4209 CudaNdarray_HOST_STRIDES(C)[0],
4210 CudaNdarray_HOST_STRIDES(C)[1],
4211 CudaNdarray_HOST_DIMS(C)[0],
4212 CudaNdarray_HOST_DIMS(C)[1]);
4213 Py_XDECREF(A_new);
4214 Py_XDECREF(B_new);
4215 return -1;
4216 }
4217
4218 // the unit integer is divided logically into three fields of 4 bits
4219     // the lowermost 4 bits encode the stride pattern of the output (C)
4220     // the next higher 4 bits encode the B variable (or y)
4221     // the next higher 4 bits encode the A variable (or x)
4222 //
4223 // the stride pattern for each input is encoded as 0 for unit stride from col to col (Row major)
4224 // 1 for unit stride from row to row (Col major)
4225
4226 // a stride of 0 implies a dimension of 1 - so we can actually define
4227 // a stride of 0 as a 'unit' stride because gemm will never use it.
4228 // If a dimension is 0, its stride will not be used either, so we can
4229 // consider it a 'unit' stride too.
4230 int unit = 0;
4231 if (CudaNdarray_HOST_STRIDES(A)[1] == 1 || CudaNdarray_HOST_DIMS(A)[1] <= 1) {
4232 unit |= (0x0 << 8);
4233 } else if (CudaNdarray_HOST_STRIDES(A)[0] == 1 || CudaNdarray_HOST_DIMS(A)[0] <= 1) {
4234 unit |= (0x1 << 8);
4235 } else {
4236 unit |= (0x2 << 8);
4237 }
4238 if (CudaNdarray_HOST_STRIDES(B)[1] == 1 || CudaNdarray_HOST_DIMS(B)[1] <= 1) {
4239 unit |= (0x0 << 4);
4240 } else if (CudaNdarray_HOST_STRIDES(B)[0] == 1 || CudaNdarray_HOST_DIMS(B)[0] <= 1) {
4241 unit |= (0x1 << 4);
4242 } else {
4243 unit |= (0x2 << 4);
4244 }
4245 if (CudaNdarray_HOST_STRIDES(C)[1] == 1 || CudaNdarray_HOST_DIMS(C)[1] <= 1) {
4246 unit |= (0x0 << 0);
4247 } else if (CudaNdarray_HOST_STRIDES(C)[0] == 1 || CudaNdarray_HOST_DIMS(C)[0] <= 1) {
4248 unit |= (0x1 << 0);
4249 } else {
4250 unit |= (0x2 << 0);
4251 }
4252
4253 /* create appropriate strides for malformed matrices that are row or column
4254 * vectors
4255 */
4256 int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0] : CudaNdarray_HOST_DIMS(A)[1];
4257 int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1] : CudaNdarray_HOST_DIMS(A)[0];
4258 int sb_0 = (CudaNdarray_HOST_DIMS(B)[0] > 1) ? CudaNdarray_HOST_STRIDES(B)[0] : CudaNdarray_HOST_DIMS(B)[1];
4259 int sb_1 = (CudaNdarray_HOST_DIMS(B)[1] > 1) ? CudaNdarray_HOST_STRIDES(B)[1] : CudaNdarray_HOST_DIMS(B)[0];
4260 int sc_0 = (CudaNdarray_HOST_DIMS(C)[0] > 1) ? CudaNdarray_HOST_STRIDES(C)[0] : CudaNdarray_HOST_DIMS(C)[1];
4261 int sc_1 = (CudaNdarray_HOST_DIMS(C)[1] > 1) ? CudaNdarray_HOST_STRIDES(C)[1] : CudaNdarray_HOST_DIMS(C)[0];
4262
4263 float* a = CudaNdarray_DEV_DATA(A);
4264 float* b = CudaNdarray_DEV_DATA(B);
4265 float* c = CudaNdarray_DEV_DATA(C);
4266 cublasOperation_t N = CUBLAS_OP_N;
4267 cublasOperation_t T = CUBLAS_OP_T;
4268     //std::cerr << (unit/256) % 16 << (unit / 16) % 16 << unit % 16 << '\n';
4269 // There should be no negative stride at that point
4270 #define CHK_STRIDE_SGEMM(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz) \
4271 if (sx == 0){sx = 1;}\
4272 if (sy == 0){sy = 1;}\
4273 if (sz == 0){sz = 1;}\
4274 if ((sx > 0) && (sy > 0) && (sz > 0)) { \
4275 err = cublasSgemm(handle, T0, T1, D0, D1, D2, &a, x, sx, y, sy, &b, z, sz); \
4276 } else { \
4277 PyErr_SetString(PyExc_AssertionError, "negative stride to sGemm");\
4278 Py_XDECREF(A_new);\
4279 Py_XDECREF(B_new);\
4280 return -1; \
4281 }
4282
4283 cublasStatus_t err;
4284 switch(unit)
4285 {
4286 case 0x000: CHK_STRIDE_SGEMM(N, N, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_0, a, sa_0, beta, c, sc_0); break;
4287 case 0x100: CHK_STRIDE_SGEMM(N, T, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_0, a, sa_1, beta, c, sc_0); break;
4288 case 0x010: CHK_STRIDE_SGEMM(T, N, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_1, a, sa_0, beta, c, sc_0); break;
4289 case 0x110: CHK_STRIDE_SGEMM(T, T, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_1, a, sa_1, beta, c, sc_0); break;
4290 case 0x001: CHK_STRIDE_SGEMM(T, T, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_0, b, sb_0, beta, c, sc_1); break;
4291 case 0x101: CHK_STRIDE_SGEMM(N, T, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_1, b, sb_0, beta, c, sc_1); break;
4292 case 0x011: CHK_STRIDE_SGEMM(T, N, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_0, b, sb_1, beta, c, sc_1); break;
4293 case 0x111: CHK_STRIDE_SGEMM(N, N, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_1, b, sb_1, beta, c, sc_1); break;
4294 default: PyErr_Format(PyExc_ValueError, "some matrix has no unit stride (unit=%x)", unit);
4295 return -1;
4296 };
4297 CNDA_THREAD_SYNC;
4298 Py_XDECREF(A_new);
4299 Py_XDECREF(B_new);
4300
4301 if (CUBLAS_STATUS_SUCCESS != err)
4302 {
4303 PyErr_Format(PyExc_RuntimeError,
4304 "cublasSgemm failed (%i) %s\n"
4305 " unit=%x N=%d, c.dims=[%d %d], a.dim=[%d %d], alpha=%f, beta=%f, a=%p, b=%p, c=%p"
4306 " sa_0=%d, sa_1=%d, sb_0=%d, sb_1=%d, sc_0=%d, sc_1=%d",
4307 err, cublasGetErrorString(err),
4308 unit, N,
4309 CudaNdarray_HOST_DIMS(C)[0],
4310 CudaNdarray_HOST_DIMS(C)[1],
4311 CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
4312 alpha, beta, a, b, c, sa_0, sa_1, sb_0, sb_1, sc_0, sc_1);
4313
4314 return -1;
4315 }
4316 return 0;
4317 }
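// Usage sketch (hypothetical helper): plain matrix multiplication
// C = dot(A, B) via the wrapper above, with alpha=1 and beta=0.
// As a worked example of the 'unit' encoding: a row-major A (pattern 0x0),
// a col-major B (pattern 0x1) and a row-major C (pattern 0x0) give
// unit == (0x0 << 8) | (0x1 << 4) | (0x0 << 0) == 0x010, which dispatches
// to the CHK_STRIDE_SGEMM(T, N, ...) case.
static int example_matmul(const CudaNdarray * A, const CudaNdarray * B,
                          CudaNdarray * C)
{
    return CudaNdarray_gemm(1.0f, A, B, 0.0f, C);
}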
4318
4319 int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C)
4320 {
4321 /**
4322 * C <- alpha A B + beta C
4323 * A : matrix
4324 * B, C: vector
4325 * alpha, beta: scalars
4326 */
4327 if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg to gemv"); return -1; }
4328 if (B->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg to gemv"); return -1; }
4329 if (C->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg to gemv"); return -1; }
4330
4331 // We must allow dimensions to be zeros.
4332 if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0])
4333 || (CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(C)[0]))
4334 {
4335 PyErr_Format(PyExc_ValueError, "dimension mismatch in args to gemv (%i,%i)x(%i)->(%i)",
4336 CudaNdarray_HOST_DIMS(A)[0],
4337 CudaNdarray_HOST_DIMS(A)[1],
4338 CudaNdarray_HOST_DIMS(B)[0],
4339 CudaNdarray_HOST_DIMS(C)[0]);
4340 return -1;
4341 }
4342
4343 // If matrix A has non-unit size and non-unit stride in both
4344 // dimensions, or negative strides, we cannot operate, but we can
4345 // make a copy.
4346 CudaNdarray * A_new = NULL;
4347 CudaNdarray * B_new = NULL;
4348 if (((CudaNdarray_HOST_DIMS(A)[0] > 1)
4349 && (CudaNdarray_HOST_STRIDES(A)[0] != 1)
4350 && (CudaNdarray_HOST_DIMS(A)[1] > 1)
4351 && (CudaNdarray_HOST_STRIDES(A)[1] != 1))
4352 || (CudaNdarray_HOST_STRIDES(A)[0] < 0)
4353 || (CudaNdarray_HOST_STRIDES(A)[1] < 0))
4354 {
4355 A_new = (CudaNdarray*) CudaNdarray_Copy(A);
4356 if (!A_new)
4357 return -1;
4358 A = A_new;
4359 }
4360
4361     // If vector B has a negative stride, we also have to make a copy.
4362 if (CudaNdarray_HOST_STRIDES(B)[0] < 0)
4363 {
4364 B_new = (CudaNdarray*) CudaNdarray_Copy(B);
4365 if (!B_new)
4366 {
4367 // If A was not copied, A_new is NULL, and Py_XDECREF does not
4368 // do anything
4369 Py_XDECREF(A_new);
4370 return -1;
4371 }
4372 B = B_new;
4373 }
4374
4375     // cublas does not handle negative strides as expected
4376 if ( (CudaNdarray_HOST_STRIDES(A)[0] < 0)
4377 || (CudaNdarray_HOST_STRIDES(A)[1] < 0))
4378 {
4379 PyErr_Format(PyExc_ValueError, "illegal strides in args to gemv (%i,%i)",
4380 CudaNdarray_HOST_STRIDES(A)[0],
4381 CudaNdarray_HOST_STRIDES(A)[1]);
4382 Py_XDECREF(A_new);
4383 Py_XDECREF(B_new);
4384 return -1;
4385 }
4386
4387 /* create appropriate strides for malformed matrices that are row or column
4388 * vectors
4389 */
4390 int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0] : CudaNdarray_HOST_DIMS(A)[1];
4391 int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1] : CudaNdarray_HOST_DIMS(A)[0];
4392 int sb_0 = (CudaNdarray_HOST_DIMS(B)[0] > 1) ? CudaNdarray_HOST_STRIDES(B)[0] : 1;
4393 int sc_0 = (CudaNdarray_HOST_DIMS(C)[0] > 1) ? CudaNdarray_HOST_STRIDES(C)[0] : 1;
4394
4395 if (sa_0 == 0) sa_0 = 1;
4396 if (sa_1 == 0) sa_1 = 1;
4397
4398 int used_dot = 0;
4399
4400 // This is important because we can end up not calling Sgemv at all
4401 cublasStatus_t err = CUBLAS_STATUS_SUCCESS;
4402 if (CudaNdarray_SIZE(C)) {
4403 // A is row vector & alpha==1 & beta==0 -> use cublasSdot
4404 if (CudaNdarray_HOST_DIMS(A)[0] == 1 && alpha==1.f && beta==0.f) {
4405 //replace this with custom inner product kernel with alpha and beta parameter?
4406 cublasPointerMode_t pmode;
4407             //set pointer mode so cublas writes the scalar result to the device pointer
4408 cublasGetPointerMode(handle, &pmode);
4409 cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
4410 err = cublasSdot(
4411 handle, CudaNdarray_HOST_DIMS(A)[1],
4412 CudaNdarray_DEV_DATA(A), sa_1,
4413 CudaNdarray_DEV_DATA(B), sb_0,
4414 CudaNdarray_DEV_DATA(C));
4415 cublasSetPointerMode(handle, pmode);
4416 used_dot = 1;
4417 }
4418         // A is col-major (f-contiguous) | row vector
4419 else if ((CudaNdarray_HOST_DIMS(A)[0] <= 1)
4420 || ((CudaNdarray_HOST_STRIDES(A)[0] == 1)
4421 && (CudaNdarray_HOST_STRIDES(A)[1] > 0)))
4422 {
4423 err = cublasSgemv(handle, CUBLAS_OP_N,
4424 CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
4425 &alpha,
4426 CudaNdarray_DEV_DATA(A), sa_1,
4427 CudaNdarray_DEV_DATA(B), sb_0,
4428 &beta,
4429 CudaNdarray_DEV_DATA(C), sc_0);
4430 }
4431         // A is row-major (c-contiguous) | column vector
4432 else if ((CudaNdarray_HOST_DIMS(A)[1] <= 1)
4433 || ((CudaNdarray_HOST_STRIDES(A)[1] == 1)
4434 && (CudaNdarray_HOST_STRIDES(A)[0] > 0)))
4435 {
4436 err = cublasSgemv(handle, CUBLAS_OP_T,
4437 CudaNdarray_HOST_DIMS(A)[1], CudaNdarray_HOST_DIMS(A)[0],
4438 &alpha,
4439 CudaNdarray_DEV_DATA(A), sa_0,
4440 CudaNdarray_DEV_DATA(B), sb_0,
4441 &beta,
4442 CudaNdarray_DEV_DATA(C), sc_0);
4443 }
4444         // A is not a vector and has malformed strides
4445 else
4446 {
4447 PyErr_Format(PyExc_AssertionError,
4448 "Unexpected stride pattern in gemv: (%i, %i) x %i -> %i.\n"
4449 "Shapes are: (%i, %i) x %i -> %i\n",
4450 CudaNdarray_HOST_STRIDES(A)[0],
4451 CudaNdarray_HOST_STRIDES(A)[1],
4452 CudaNdarray_HOST_STRIDES(B)[0],
4453 CudaNdarray_HOST_STRIDES(C)[0],
4454 CudaNdarray_HOST_DIMS(A)[0],
4455 CudaNdarray_HOST_DIMS(A)[1],
4456 CudaNdarray_HOST_DIMS(B)[0],
4457 CudaNdarray_HOST_DIMS(C)[0]);
4458 Py_XDECREF(A_new);
4459 Py_XDECREF(B_new);
4460 return -1;
4461 }
4462 }
4463
4464 CNDA_THREAD_SYNC;
4465 Py_XDECREF(A_new);
4466 Py_XDECREF(B_new);
4467
4468 if (CUBLAS_STATUS_SUCCESS != err)
4469 {
4470 if (!used_dot)
4471 {
4472 PyErr_Format(PyExc_RuntimeError,
4473 "cublasSgemv failed (%i)",
4474 err);
4475 } else {
4476 PyErr_Format(PyExc_RuntimeError,
4477 "cublasSdot failed (%i)",
4478 err);
4479 }
4480 return -1;
4481 }
4482 return 0;
4483 }
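// Usage sketch (hypothetical helper): y = dot(A, x) through the wrapper
// above. Note that the wrapper itself falls back to cublasSdot when A is a
// single row and alpha==1, beta==0, and otherwise picks the OP_N or OP_T
// form of cublasSgemv depending on which stride of A is unit.
static int example_matvec(const CudaNdarray * A, const CudaNdarray * x,
                          CudaNdarray * y)
{
    return CudaNdarray_sgemv(1.0f, A, x, 0.0f, y);
}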
4484
4485 int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y, CudaNdarray * A) {
4486 if (x->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg x to sger"); return -1; }
4487 if (y->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg y to sger"); return -1; }
4488 if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg A to sger"); return -1; }
4489
4490 if ((CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(x)[0])
4491 || (CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(y)[0])) {
4492 PyErr_Format(PyExc_ValueError,
4493 "dimension mismatch in args to sger (%i)x(%i)->(%i,%i)",
4494 CudaNdarray_HOST_DIMS(x)[0],
4495 CudaNdarray_HOST_DIMS(y)[0],
4496 CudaNdarray_HOST_DIMS(A)[0],
4497 CudaNdarray_HOST_DIMS(A)[1]);
4498 return -1;
4499 }
4500
4501 int x_strides = CudaNdarray_HOST_STRIDES(x)[0];
4502 CudaNdarray * x_new = NULL;
4503 if(x_strides == 0){
4504 if(CudaNdarray_HOST_DIMS(x)[0] != 1){
4505 PyErr_Format(PyExc_RuntimeError,
4506 "CudaNdarray_sger: Invalid input x (should not happen)."
4507 " We received a CudaNdarray vector with a stride of 0"
4508 " that has more than 1 element!");
4509 return -1;
4510 }
4511 x_strides = 1;
4512 } else if(x_strides < 0){
4513         x_new = (CudaNdarray*) CudaNdarray_Copy(x);
             if (!x_new)
                 return -1;
4514 x = x_new;
4515 x_strides = CudaNdarray_HOST_STRIDES(x)[0];
4516 }
4517
4518 int y_strides = CudaNdarray_HOST_STRIDES(y)[0];
4519 CudaNdarray * y_new = NULL;
4520 if(y_strides == 0){
4521 if(CudaNdarray_HOST_DIMS(y)[0] != 1){
4522 PyErr_Format(PyExc_RuntimeError,
4523 "CudaNdarray_sger: Invalid input y (should not happen)."
4524 " We received a CudaNdarray vector with a stride of 0"
4525                          " that has more than 1 element!");
4526 Py_XDECREF(x_new);
4527 return -1;
4528 }
4529 y_strides = 1;
4530 } else if(y_strides < 0){
4531         y_new = (CudaNdarray*) CudaNdarray_Copy(y);
             if (!y_new)
             {
                 Py_XDECREF(x_new);
                 return -1;
             }
4532 y = y_new;
4533 y_strides = CudaNdarray_HOST_STRIDES(y)[0];
4534 }
4535
4536 // Create appropriate strides if A is a row or column vector
4537 int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0]
4538 : CudaNdarray_HOST_DIMS(A)[1];
4539 int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1]
4540 : CudaNdarray_HOST_DIMS(A)[0];
4541
4542 // This is important because we can end up not calling Sger at all
4543 cublasStatus_t err = CUBLAS_STATUS_SUCCESS;
4544 if(CudaNdarray_SIZE(A)){
4545 // If A is in col-major
4546 if ((CudaNdarray_HOST_DIMS(A)[0] <= 1)
4547 || ((CudaNdarray_HOST_STRIDES(A)[0] == 1)
4548 && (CudaNdarray_HOST_STRIDES(A)[1] > 0)))
4549 {
4550 err = cublasSger(handle, CudaNdarray_HOST_DIMS(x)[0], CudaNdarray_HOST_DIMS(y)[0], &alpha,
4551 CudaNdarray_DEV_DATA(x), x_strides,
4552 CudaNdarray_DEV_DATA(y), y_strides,
4553 CudaNdarray_DEV_DATA(A), sa_1);
4554 }
4555 // Since Sger expects A in col-major, we invert x and y to fake this.
4556 else if ((CudaNdarray_HOST_DIMS(A)[1] <= 1)
4557 || ((CudaNdarray_HOST_STRIDES(A)[1] == 1)
4558 && (CudaNdarray_HOST_STRIDES(A)[0] > 0)))
4559 {
4560 err = cublasSger(handle, CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], &alpha,
4561 CudaNdarray_DEV_DATA(y), y_strides,
4562 CudaNdarray_DEV_DATA(x), x_strides,
4563 CudaNdarray_DEV_DATA(A), sa_0);
4564 }
4565 // A has to be either c- or f-contiguous, with no negative strides
4566 else
4567 {
4568 PyErr_SetString(PyExc_NotImplementedError,
4569 "non-contiguous A, or negative strides, in sger");
4570 Py_XDECREF(x_new);
4571 Py_XDECREF(y_new);
4572 return -1;
4573 }
4574 }
4575 CNDA_THREAD_SYNC;
4576 Py_XDECREF(x_new);
4577 Py_XDECREF(y_new);
4578
4579 if (CUBLAS_STATUS_SUCCESS != err)
4580 {
4581 PyErr_Format(PyExc_RuntimeError,
4582 "cublasSger failed (%i)",
4583 err);
4584 return -1;
4585 }
4586
4587 return 0;
4588 }
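// Usage sketch (hypothetical helper): rank-1 update A <- A + alpha*outer(x, y)
// through the wrapper above. As enforced there, A must be c- or f-contiguous
// with non-negative strides; x and y are swapped internally when A is
// row-major because cublasSger expects column-major storage.
static int example_outer_update(float alpha, const CudaNdarray * x,
                                const CudaNdarray * y, CudaNdarray * A)
{
    return CudaNdarray_sger(alpha, x, y, A);
}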
4589
4590 /**
4591 *
4592 * Precondition:
4593 * a->dim[d] == (dims_a[d]==0) ? (1 << log2_dims_a[d]) : dims_a[d]
4594 * z->dim[d] == (z_str[d]==0) ? 1 : dims_a[d];
4595 *
4596 * TODO: templatize this function to support other reductions.
4597 * All that needs to change is the initial value for sum, and the reduction operator.
4598 */
4599
4600 static __global__ void kernel_reduce_sum(const unsigned int size_z,
4601 const unsigned int nd,
4602 const int * dims_a,
4603 const int * log2_dims_a,
4604 const int * a_str,
4605 const float * a_data,
4606 const int * z_str,
4607 float * z_data)
4608 {
4609 const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
4610 const unsigned int numThreads = blockDim.x * gridDim.x;
4611
4612 //structure data contains the strides and dimensions of both a and z
4613 // a_dim[0], a_dim[1], ... a_dim[nd-1],
4614 // a_log2dim[0], a_log2dim[1], ... a_log2dim[nd-1],
4615 // a_str[0], ... a_str[nd-1],
4616 // z_str[0], ... z_str[nd-1]
4617 extern __shared__ int structure_data[];
4618 for (unsigned int i = threadIdx.x; i < nd; i += blockDim.x)
4619 {
4620 structure_data[i+0*nd] = dims_a[i];
4621 structure_data[i+1*nd] = log2_dims_a[i];
4622 structure_data[i+2*nd] = a_str[i];
4623 structure_data[i+3*nd] = z_str[i];
4624 }
4625 dims_a = structure_data;
4626 log2_dims_a = structure_data + nd;
4627 a_str = structure_data + 2*nd;
4628 z_str = structure_data + 3*nd;
4629
4630 __syncthreads(); //wait for all the shared structure to be loaded
4631
4632 for (unsigned int i = idx; i < size_z; i += numThreads)
4633 {
4634 unsigned int ii = i;
4635 const float * a_data_i = a_data;
4636 float * z_data_i = z_data;
4637 unsigned int n_reduce_elements = 1;
4638 unsigned int n_reduce_dims = 0;
4639 unsigned int reduce_dim0 = nd-1;
4640
4641
4642 //In this loop, we locate the initial element of the slice that we'd like to reduce with this thread
4643 // At the same time, we [re]calculate the size of that slice (n_reduce_elements)
4644 for (unsigned int d = 0; d < nd; ++d)
4645 {
4646 if (a_str[d] && (!z_str[d])) // this means 'd' is a dimension we are reducing over
4647 {
4648 n_reduce_elements *= dims_a[d];
4649 n_reduce_dims += 1;
4650 reduce_dim0 = (d < reduce_dim0) ? d : reduce_dim0;
4651 }
4652 else //'d' is not a dimension that we are reducing over
4653 {
4654 unsigned int pos_d;
4655 if (log2_dims_a[d]==-1) //TODO: when things are working, use this switch
4656 {
4657 // this branch is not preferred,
4658 // because the manual said that integer mod and div operations are slow on gpu
4659 pos_d = (ii % dims_a[d]);
4660 ii = (ii / dims_a[d]);
4661 }
4662 else
4663 {
4664 pos_d = (ii & ((1 << log2_dims_a[d])-1)); //take the lower log2_dims bits
4665 ii = (ii >> log2_dims_a[d]); //shift those lower log2_dims bits off of ii
4666 }
4667 a_data_i += pos_d * a_str[d];
4668 z_data_i += pos_d * z_str[d];
4669 }
4670 }
4671 // now we've got pointers a_data_i and z_data_i into element 0 of the slice over which we are reducing
4672 // do a similar loop
4673
4674 float sum = 0.0f;
4675 switch(n_reduce_dims)
4676 {
4677 case 0:
4678 {
4679 sum = a_data_i[0];
4680 }
4681 break;
4682 case 1:
4683 {
4684 const int stride = a_str[reduce_dim0];
4685 const float * a_data_i_max = a_data_i + dims_a[reduce_dim0] * stride;
4686 while (a_data_i != a_data_i_max)
4687 {
4688 sum += a_data_i[0];
4689 a_data_i += stride;
4690 }
4691 }
4692 break;
4693 case 2:
4694 {
4695 int rd = reduce_dim0+1;
4696 for (; rd < nd; ++rd)
4697 {
4698 if (a_str[rd] && (!z_str[rd])) // this means 'rd' is a dimension we are reducing over
4699 break;
4700 }
4701 const int stride0 = a_str[reduce_dim0];
4702 const int stride1 = a_str[rd];
4703 for (int ii = 0; ii < dims_a[rd]; ++ii)
4704 {
4705 const float * a_data_ri = a_data_i + ii * stride1;
4706 const float * a_data_ri_max = a_data_ri + dims_a[reduce_dim0] * stride0;
4707 while (a_data_ri != a_data_ri_max)
4708 {
4709 sum += a_data_ri[0];
4710 a_data_ri += stride0;
4711 }
4712 }
4713 };
4714 break;
4715 default:
4716 {
4717 for (unsigned int reduce_i = 0; reduce_i < n_reduce_elements; ++reduce_i)
4718 {
4719 //TODO: optimize this loop to work more like theano's Elemwise. It's serial code.
4720 unsigned int reduce_ii = reduce_i;
4721 const float * a_data_ri = a_data_i;
4722
4723 //This loop finds the element in the a slice to add.
4724 for (unsigned int rd = reduce_dim0; rd < nd; ++rd)
4725 {
4726 unsigned int pos_d;
4727 if (a_str[rd] && (!z_str[rd])) // this means 'd' is a dimension we are reducing over
4728 {
4729 if (log2_dims_a[rd]==-1)
4730 {
4731 // this branch is not preferred,
4732 // because the manual said that integer mod and div operations are slow on gpu
4733 pos_d = (reduce_ii % dims_a[rd]);
4734 reduce_ii = (reduce_ii / dims_a[rd]);
4735 }
4736 else
4737 {
4738 pos_d = (reduce_ii & ((1 << log2_dims_a[rd])-1)); //take the lower log2_dims bits
4739 reduce_ii = (reduce_ii >> log2_dims_a[rd]); //shift those lower log2_dims bits off of ii
4740 }
4741 a_data_ri += pos_d * a_str[rd];
4742 }
4743 }
4744 sum += a_data_ri[0];
4745 }
4746 }
4747 }
4748 z_data_i[0] = sum;
4749 }
4750 }
4751
4752 static __global__ void kernel_reduce_sum_1011(
4753 const unsigned int d0,
4754 const unsigned int d1,
4755 const unsigned int d2,
4756 const unsigned int d3,
4757 const float *A, const int sA0, const int sA1, const int sA2, const int sA3,
4758 float * Z, const int sZ0)
4759 {
4760 const int threadCount = blockDim.x * blockDim.y * blockDim.z;
4761 const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
4762 extern __shared__ float buf[];
4763 float mysum = 0.0f;
4764
4765 if (warpSize != 32)
4766 {
4767 return; //TODO: set error code
4768 }
4769
4770 for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
4771 {
4772 float Ai = A[i0 * sA0 + blockIdx.x * sA1 + threadIdx.y * sA2 + threadIdx.x * sA3];
4773 mysum += Ai;
4774 }
4775 buf[threadNum] = mysum;
4776 __syncthreads();
4777
4778 // rest of function is handled by one warp
4779 if (threadNum < warpSize)
4780 {
4781 for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
4782 {
4783 mysum += buf[i];
4784 }
4785 buf[threadNum] = mysum;
4786 if (threadNum < 16)
4787 {
4788 //reduce so that threadNum 0 has the sum of everything
4789 if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
4790 if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
4791 if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
4792 if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
4793 if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
4794 if (threadNum == 0)
4795 {
4796 Z[blockIdx.x*sZ0] = buf[0];
4797 }
4798 }
4799 }
4800 }
4801 /**
4802  * Dimensions in which self has size 1 and A has size > 1 are considered summing dimensions.
4803 * Dimensions in which self has size > 1 and A has size > 1 are considered non-summing dimensions, and in this case their sizes must be equal.
4804 */
4805 int
4806 CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A)
4807 {
4808 int verbose = 0;
4809 //check input rank
4810 if (self->nd != A->nd)
4811 {
4812 PyErr_Format(PyExc_TypeError, "Rank mismatch in CudaNdarray_sum: %i vs %i", self->nd, A->nd);
4813 return -1;
4814 }
4815 for (int i = 0; i < self->nd; ++i)
4816 {
4817 if ((CudaNdarray_HOST_DIMS(self)[i] > 1) && (CudaNdarray_HOST_DIMS(self)[i] != CudaNdarray_HOST_DIMS(A)[i]))
4818 {
4819 PyErr_Format(PyExc_TypeError, "Dimension mismatch in CudaNdarray_sum: self->dim[%i] == %i , A->dim[%i] = %i",
4820 i, CudaNdarray_HOST_DIMS(self)[i], i, CudaNdarray_HOST_DIMS(A)[i]);
4821 return -1;
4822 }
4823 }
4824
4825 int n_summations = (unsigned int)CudaNdarray_SIZE(self);
4826 if (verbose)
4827 {
4828 std::cerr << "reduce_sum n_summations " << n_summations << '\n';
4829 std::cerr << "reduce_sum nd " << self->nd << '\n';
4830 fprint_CudaNdarray(stderr, A);
4831 fprint_CudaNdarray(stderr, self);
4832 }
4833 if (0 && (A->nd == 4) //check to see if kernel_reduce_sum_1011 applies
4834 && (CudaNdarray_HOST_DIMS(self)[0] == 1)
4835 && (CudaNdarray_HOST_DIMS(self)[2] == 1)
4836 && (CudaNdarray_HOST_DIMS(self)[3] == 1)
4837 )
4838 {
4839 dim3 n_threads(CudaNdarray_HOST_DIMS(A)[3], CudaNdarray_HOST_DIMS(A)[2]);
4840 dim3 n_blocks(CudaNdarray_HOST_DIMS(A)[1]);
4841 while (n_threads.x * n_threads.y * n_threads.z < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.z;
4842 n_threads.z -= 1;
4843 if (n_threads.z > 64) n_threads.z = 64;
4844 if (n_threads.z)
4845 {
4846 if (verbose) printf("trying kernel_reduce_sum_1011\n");
4847 int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
4848 kernel_reduce_sum_1011<<<n_blocks, n_threads, n_shared>>>(
4849 CudaNdarray_HOST_DIMS(A)[0],
4850 CudaNdarray_HOST_DIMS(A)[1],
4851 CudaNdarray_HOST_DIMS(A)[2],
4852 CudaNdarray_HOST_DIMS(A)[3],
4853 CudaNdarray_DEV_DATA(A),
4854 CudaNdarray_HOST_STRIDES(A)[0],
4855 CudaNdarray_HOST_STRIDES(A)[1],
4856 CudaNdarray_HOST_STRIDES(A)[2],
4857 CudaNdarray_HOST_STRIDES(A)[3],
4858 CudaNdarray_DEV_DATA(self),
4859 CudaNdarray_HOST_STRIDES(self)[1]);
4860 CNDA_THREAD_SYNC;
4861 if (cudaSuccess == cudaGetLastError()) return 0;
4862 if (verbose) printf("failed, falling back to kernel_reduce_sum\n");
4863 }
4864 }
4865
4866 int n_threads_per_block = std::min(n_summations,
4867 NUM_VECTOR_OP_THREADS_PER_BLOCK);
4868 int n_blocks = std::min(ceil_intdiv(n_summations,n_threads_per_block),
4869 NUM_VECTOR_OP_BLOCKS);
4870 int n_structure_cache = self->nd * 4 * sizeof(int);
4871
4872 if (verbose)
4873 {
4874 std::cerr << "n_blocks, n_threads_per_block " << n_blocks << ' ' << n_threads_per_block << '\n';
4875 }
4876 assert (self->nd > 0);
4877 assert (self->nd == A->nd);
4878 kernel_reduce_sum<<<n_blocks, n_threads_per_block, n_structure_cache>>>(
4879 n_summations,
4880 self->nd,
4881 CudaNdarray_DEV_DIMS(A),
4882 CudaNdarray_DEV_LOG2DIMS(A),
4883 CudaNdarray_DEV_STRIDES(A),
4884 CudaNdarray_DEV_DATA(A),
4885 CudaNdarray_DEV_STRIDES(self),
4886 CudaNdarray_DEV_DATA(self));
4887 CNDA_THREAD_SYNC;
4888 cudaError_t err = cudaGetLastError();
4889 if (cudaSuccess != err)
4890 {
4891 PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kernel_reduce_sum", cudaGetErrorString(err));
4892 return -1;
4893 }
4894 return 0;
4895 }
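// Sketch of the reduction convention above (hypothetical helper): with 'out'
// of shape (1, n) and 'A' of shape (m, n), dimension 0 is summed away and
// dimension 1 is carried through, i.e. out[0, j] = sum_i A[i, j]. Both
// arrays must already be allocated.
static int example_sum_over_rows(CudaNdarray * out, CudaNdarray * A)
{
    return CudaNdarray_reduce_sum(out, A);
}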
4896 int
4897 CudaNdarray_reduce_prod(CudaNdarray * self, const CudaNdarray * A)
4898 {
4899 PyErr_SetString(PyExc_NotImplementedError, "");
4900 return -1;
4901 }
4902 int
4903 CudaNdarray_reduce_min(CudaNdarray * self, const CudaNdarray * A)
4904 {
4905 PyErr_SetString(PyExc_NotImplementedError, "");
4906 return -1;
4907 }
4908 int
4909 CudaNdarray_reduce_max(CudaNdarray * self, const CudaNdarray * A)
4910 {
4911 PyErr_SetString(PyExc_NotImplementedError, "");
4912 return -1;
4913 }
4914
4915
4916 /**
4917 *
4918 * pattern is a permutation of [0, 1, ... self->nd-1] with the following twists:
4919 * - an element 'd' of the permutation can be dropped if CudaNdarray_HOST_DIMS(self)[d] == 1
4920 * - any number of '-1' elements can be in the pattern, and they will cause new ranks (with dim==1) to be inserted.
4921 *
4922 * For example, if CudaNdarray_HOST_DIMS(self) == [4, 5, 1, 6], and pattern = [0,3,-1,-1, 1], then CudaNdarray_HOST_DIMS(self) would be modified to become:
4923  *   [4, 6, 1, 1, 5] (we dropped the original dim[2]==1, and inserted two singleton dimensions with the -1s).
4924 */
4925 int
4926 CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern)
4927 {
4928 //TODO: pass a workspace pointer to avoid the internal malloc
4929     int * newdims = (int *)malloc(sizeof(int) * (len + len + self->nd)); //we tack the extra buffers on here so we only have to malloc once.
4930 int * newstrides = newdims + len;
4931 int * dims_taken = newstrides + len;
4932 if (!newdims)
4933 {
4934 PyErr_SetString(PyExc_MemoryError, "CudaNdarray_dimshuffle: Failed to allocate temporary space");
4935 return -1;
4936 }
4937 for (int i = 0; i < self->nd; ++i)
4938 {
4939 dims_taken[i] = 0;
4940 }
4941 for (int i = 0; i < len; ++i)
4942 {
4943 if (pattern[i] < 0)
4944 {
4945 newdims[i] = 1;
4946 newstrides[i] = 0;
4947 }
4948 else if(dims_taken[pattern[i]])
4949 {
4950             PyErr_Format(PyExc_ValueError, "CudaNdarray_dimshuffle: invalid pattern. You used dimension %d multiple times",
4951 pattern[i]);
4952 free(newdims);
4953 return -1;
4954 }
4955 else if (pattern[i]>= self->nd)
4956 {
4957 PyErr_Format(PyExc_ValueError, "Cudandarray_dimshuffle: invalid pattern for Cudandarray_dimshuffle. You asked for a dimensions that don't exist %d for a %d dims CudaNdarray",
4958             PyErr_Format(PyExc_ValueError, "CudaNdarray_dimshuffle: invalid pattern. You asked for dimension %d, but this CudaNdarray has only %d dimensions",
4959 free(newdims);
4960 return -1;
4961 }
4962 else
4963 {
4964 newdims[i] = CudaNdarray_HOST_DIMS(self)[pattern[i]];
4965 newstrides[i] = CudaNdarray_HOST_STRIDES(self)[pattern[i]];
4966 dims_taken[pattern[i]] = 1;
4967 }
4968 }
4969     //Check that we did not drop any non-broadcastable dims
4970 for (int i = 0; i < self->nd; ++i)
4971 {
4972 if (dims_taken[i]==0 && CudaNdarray_HOST_DIMS(self)[i]!=1)
4973 {
4974             PyErr_SetString(PyExc_ValueError, "CudaNdarray_dimshuffle: You cannot drop a non-broadcastable dimension.");
4975 free(newdims);
4976 return -1;
4977 }
4978 }
4979 //swap this structure in for the one in self, and sync to the card
4980 if (CudaNdarray_set_nd(self, len))
4981 {
4982 free(newdims);
4983 return -1;
4984 }
4985 for (int i = 0; i < len; ++i)
4986 {
4987 CudaNdarray_set_dim(self, i, newdims[i]);
4988 CudaNdarray_set_stride(self, i, newstrides[i]);
4989 }
4990 if (cnda_copy_structure_to_device(self))
4991 {
4992 free(newdims);
4993 return -1;
4994 }
4995 free(newdims);
4996 return 0;
4997 }
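// Code sketch of the docstring example above (hypothetical helper): given an
// array of dims [4, 5, 1, 6], pattern [0, 3, -1, -1, 1] drops the size-1
// dim 2, inserts two singleton dims, and yields dims [4, 6, 1, 1, 5].
static int example_dimshuffle(CudaNdarray * self)
{
    const int pattern[5] = {0, 3, -1, -1, 1};
    return CudaNdarray_dimshuffle(self, 5, pattern);
}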
4998
4999
5000
5001 /**
5002 *
5003 * This is the function that bind to python.
5004  * This is the function that is bound to Python.
5005  * See CudaNdarray_dimshuffle to call it from C.
5006 */
5007 PyObject *
5008 CudaNdarray_Dimshuffle(PyObject* _unused, PyObject* args)
5009 {
5010 PyObject * self = NULL;
5011 PyObject * pattern_object = NULL;
5012 int * pattern = NULL;
5013 PyObject * rval = NULL;
5014 int success = -1;
5015 //const int * dims = NULL;
5016
5017 //args should consist of two python objects ("OO")
5018 if (! PyArg_ParseTuple(args, "OO", &self, &pattern_object))
5019 return NULL;
5020
5021 if (!CudaNdarray_Check(self) )
5022 {
5023 PyErr_SetString(PyExc_TypeError, "First argument to cuda_ndarray.dimshuffle must be a CudaNdarray");
5024 return NULL;
5025 }
5026
5027 //parse pattern_object into int * pattern
5028
5029 Py_ssize_t pattern_dim = PyObject_Length(pattern_object);
5030
5031 if (pattern_dim < 0)
5032 {
5033         PyErr_SetString(PyExc_TypeError, "Couldn't get length of second argument to cuda_ndarray.dimshuffle");
5034 return NULL;
5035 }
5036
5037     pattern = (int *) malloc( pattern_dim * sizeof(int));
         if (pattern == NULL)
         {
             PyErr_NoMemory();
             return NULL;
         }
5038
5039 for (Py_ssize_t i = 0; i < pattern_dim; i++)
5040 {
5041 PyObject * idx = PyLong_FromSsize_t(i);
5042
5043 if (idx == NULL)
5044 {
5045 PyErr_SetString(PyExc_Exception, "Couldn't make long object to loop over list/tuple");
5046 goto CudaNdarray_dimshuffle_fail;
5047 }
5048
5049 long elem_value = 0;
5050
5051 PyObject * elem = PyObject_GetItem(pattern_object, idx);
5052
5053 if (elem == NULL)
5054 {
5055 Py_XDECREF( elem);
5056             PyErr_SetString(PyExc_ValueError, "Second argument to dimshuffle must be a list or tuple of integers");
5057 goto CudaNdarray_dimshuffle_fail;
5058 }
5059
5060 elem_value = PyInt_AsLong(elem);
5061
5062 if (elem_value == -1 && PyErr_Occurred() )
5063 {
5064 Py_XDECREF(elem);
5065             PyErr_SetString(PyExc_ValueError, "Second argument to dimshuffle must be a list or tuple of integers");
5066 goto CudaNdarray_dimshuffle_fail;
5067 }
5068
5069 pattern[i] = elem_value;
5070
5071 Py_XDECREF( elem );
5072 Py_XDECREF( idx );
5073 }
5074
5075 //allocate rval
5076 rval = (PyObject *) CudaNdarray_View((CudaNdarray *) self);
5077
5078 if (rval == NULL)
5079 {
5080         //CudaNdarray_View should have set the exception string
5081 goto CudaNdarray_dimshuffle_fail;
5082 }
5083
5084
5085 //printf("pattern_dim: %d\n",pattern_dim);
5086 //printf("pattern: %d %d\n",pattern[0],pattern[1]);
5087 //dims = CudaNdarray_HOST_DIMS( (CudaNdarray *) self);
5088 //printf("dims before: %d %d\n",dims[0],dims[1]);
5089
5090 success = CudaNdarray_dimshuffle((CudaNdarray *) rval, pattern_dim, pattern);
5091
5092 if (success != 0)
5093 {
5094 //Exception string should already be set by CudaNdarray_dimshuffle
5095 goto CudaNdarray_dimshuffle_fail;
5096 }
5097
5098 free(pattern);
5099
5100 return rval;
5101
5102 CudaNdarray_dimshuffle_fail:
5103
5104 if (pattern != NULL)
5105 free(pattern);
5106
5107 Py_XDECREF(rval);
5108 return NULL;
5109 }
5110
5111
5112 int
5113 cnda_structure_size(int nd)
5114 {
5115 // dim0, dim1, ...
5116 // str0, str1, ...
5117 // log2(dim0), log2(dim1), ...
5118 return nd + nd + nd;
5119 }
5120
5121 const int *
5122 CudaNdarray_HOST_DIMS(const CudaNdarray * self)
5123 {
5124 return self->host_structure;
5125 }
5126
5127 const int *
5128 CudaNdarray_HOST_STRIDES(const CudaNdarray * self)
5129 {
5130 return self->host_structure + self->nd;
5131 }
5132 const int *
5133 CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self)
5134 {
5135 return self->host_structure + 2*self->nd;
5136 }
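// Layout sketch implied by the three accessors above (illustration only):
// for an nd-dimensional array, host_structure holds
//   [ dim0..dim(nd-1) | str0..str(nd-1) | log2dim0..log2dim(nd-1) ]
// which is why cnda_structure_size(nd) is 3*nd. E.g. the stride of axis i
// can be read either way:
static int example_stride_of_axis(const CudaNdarray * self, int i)
{
    // same value as self->host_structure[self->nd + i]
    return CudaNdarray_HOST_STRIDES(self)[i];
}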
5137
5138 int
5139 CudaNdarray_EqualAndIgnore(CudaNdarray *cnda1, CudaNdarray *cnda2, int ignoreSync, int ignoreBase)
5140 {
5141 int verbose = 0;
5142
5143 if (!ignoreSync && cnda1->dev_structure_fresh != cnda2->dev_structure_fresh)
5144 {
5145 if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 1\n");
5146 return 0;
5147 }
5148
5149 if (cnda1->nd != cnda2->nd)
5150 {
5151 if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 2\n");
5152 return 0;
5153 }
5154
5155 for (int i=0; i < 2*cnda1->nd; i++)
5156 {
5157 if (cnda1->host_structure[i] != cnda2->host_structure[i])
5158 {
5159 if(verbose)
5160 fprintf(stdout, "CUDANDARRAY_EQUAL : host_structure : %d, %d, %d\n", i, cnda1->host_structure[i], cnda2->host_structure[i]);
5161 return 0;
5162 }
5163 }
5164
5165 if (!ignoreBase && cnda1->base != cnda2->base)
5166 {
5167         if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 4\n");
5168 return 0;
5169 }
5170 else if (cnda1->data_allocated != cnda2->data_allocated)
5171 {
5172         if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 5\n");
5173 return 0;
5174 }
5175 else if (cnda1->data_allocated && cnda1->devdata != cnda2->devdata)
5176 {
5177         if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 6\n");
5178 // no need to check devdata if data is not allocated
5179 return 0;
5180 }
5181
5182 return 1;
5183 }
5184
5185
5186 int
5187 CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2)
5188 {
5189 return CudaNdarray_EqualAndIgnore(cnda1, cnda2, 0, 0);
5190 }
5191
5192 int
5193 cnda_copy_structure_to_device(const CudaNdarray * self)
5194 {
5195     //If the device structure does not exist, create it.
5196     //We allocate it lazily, as it is not needed often.
5197     //In fact, it is needed so infrequently that we expect
5198     //most objects will never need it. Not allocating it
5199     //saves significant time when creating an object.
5200     //This sped up a benchmark by 8% with the gc.
5201 if (!self->dev_structure)
5202 {
5203 int struct_size = cnda_structure_size(self->nd);
5204 if (struct_size)
5205 {
5206 self->dev_structure = (int*)device_malloc(struct_size* sizeof(int));
5207 if (NULL == self->dev_structure)
5208 {
5209 return -1;
5210 }
5211 }
5212 }
5213 if (cublasSetVector(cnda_structure_size(self->nd),
5214 sizeof(int),
5215 self->host_structure,
5216 1,
5217 self->dev_structure,
5218 1) != CUBLAS_STATUS_SUCCESS)
5219 {
5220 PyErr_SetString(PyExc_RuntimeError, "error copying structure to device memory");
5221 return -1;
5222 }
5223 self->dev_structure_fresh = 1;
5224 return 0;
5225 }
5226
5227 const int *
5228 CudaNdarray_DEV_DIMS(const CudaNdarray * self)
5229 {
5230 if (!self->dev_structure_fresh)
5231 {
5232 if (cnda_copy_structure_to_device(self))
5233 return NULL;
5234 }
5235 return self->dev_structure;
5236 }
5237 const int *
5238 CudaNdarray_DEV_STRIDES(const CudaNdarray * self)
5239 {
5240 if (!self->dev_structure_fresh)
5241 {
5242 if (cnda_copy_structure_to_device(self))
5243 return NULL;
5244 }
5245 return self->dev_structure + self->nd;
5246 }
5247 const int *
5248 CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self)
5249 {
5250 if (!self->dev_structure_fresh)
5251 {
5252 if (cnda_copy_structure_to_device(self))
5253 return NULL;
5254 }
5255 return self->dev_structure + 2*self->nd;
5256 }
5257 float *
5258 CudaNdarray_DEV_DATA(const CudaNdarray * self)
5259 {
5260 return self->devdata;
5261 }
5262
5263 /**
5264 * Return the number of elements in the ndarray (product of the dimensions)
5265 */
5266 size_t
5267 CudaNdarray_SIZE(const CudaNdarray *self)
5268 {
5269 if (self->nd == -1) return 0;
5270 size_t size = 1;
5271 for (int i = 0; i < self->nd; ++i)
5272 {
5273 size *= CudaNdarray_HOST_DIMS(self)[i];
5274 }
5275 return size;
5276 }
5277
5278 PyObject *
5279 CudaNdarray_SIZE_Object(const CudaNdarray *self, void *closure)
5280 {
5281 return PyInt_FromLong(CudaNdarray_SIZE(self));
5282 }
5283
5284 int CudaNdarray_set_device_data(CudaNdarray * self, float * data, const CudaNdarray * base)
5285 {
5286 return CudaNdarray_set_device_data(self, data, (PyObject *) base);
5287 }
5288
5289 PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self)
5290 {
5291 return PyBool_FromLong(CudaNdarray_is_c_contiguous(self));
5292 }
5293
5294 int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
5295 {
5296 cudaError_t err = cudaGetLastError();
5297 if( cudaSuccess != err)
5298 {
5299 PyErr_Format(PyExc_RuntimeError,
5300 "Cuda error: %s: %s.",
5301 "fprint_CudaNdarray was called with an uncleared error",
5302 cudaGetErrorString(err));
5303 return -1;
5304 }
5305 fprintf(fd, "CudaNdarray <%p, %p> nd=%i dev_structure_fresh=%d data_allocated=%d\n",
5306 self, self->devdata, self->nd, self->dev_structure_fresh, self->data_allocated);
5307 fprintf(fd, "\tHOST_DIMS: ");
5308 for (int i = 0; i < self->nd; ++i)
5309 {
5310 fprintf(fd, "%i\t", CudaNdarray_HOST_DIMS(self)[i]);
5311 }
5312 fprintf(fd, "\n\tHOST_STRIDES: ");
5313 for (int i = 0; i < self->nd; ++i)
5314 {
5315 fprintf(fd, "%i\t", CudaNdarray_HOST_STRIDES(self)[i]);
5316 }
5317
5318 if (self->dev_structure)
5319 {
5320 int data=0;
5321 fprintf(fd, "\n\tDEV_DIMS: ");
5322 for (int i = 0; i < self->nd; ++i)
5323 {
5324 cublasGetVector(1, sizeof(int),
5325 self->dev_structure+i, 1,
5326 &data, 1);
5327 fprintf(fd, "%i\t", data);
5328 }
5329 fprintf(fd, "\n\tDEV_STRIDES: ");
5330 for (int i = 0; i < self->nd; ++i)
5331 {
5332 cublasGetVector(1, sizeof(int),
5333 self->dev_structure + self->nd+i, 1,
5334 &data, 1);
5335 fprintf(fd, "%i \t", data);
5336 }
5337 fprintf(fd, "\n");
5338 }
5339 else
5340 {
5341 fprintf(fd, "\n\tdev_structure not allocated\n");
5342 }
5343
5344 err = cudaGetLastError();
5345 if( cudaSuccess != err)
5346 {
5347 PyErr_Format(PyExc_RuntimeError,
5348 "Cuda error: %s: %s.",
5349 "fprint_CudaNdarray",
5350 cudaGetErrorString(err));
5351 return -1;
5352 }
5353 return 0;
5354 }
5355
5356
5357 int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
5358 const int * dims, int fortran)
5359 {
5360 bool allocated = false;
5361 if (*arr == NULL)
5362 {
5363 // This allocates the metadata but not the data
5364 *arr = (CudaNdarray *) CudaNdarray_new_nd(nd);
5365 if (*arr == NULL)
5366 return -1;
5367 allocated = true;
5368 }
5369
5370 if (CudaNdarray_alloc_contiguous(*arr, nd, dims, fortran))
5371 {
5372 if (allocated)
5373 {
5374 Py_DECREF(*arr);
5375 *arr = NULL;
5376 }
5377 return -1;
5378 }
5379 return 0;
5380 }
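// Usage sketch (hypothetical helper, assumed shape): prep_output either
// reuses a caller-provided *arr or allocates a fresh c-contiguous one, so a
// typical op wrapper looks like this.
static CudaNdarray * example_make_output_2x3()
{
    CudaNdarray * out = NULL;
    const int dims[2] = {2, 3};
    if (CudaNdarray_prep_output(&out, 2, dims, 0))
        return NULL; // Python exception already set
    return out;
}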
5381
5382
5383 /*
5384 Local Variables:
5385 mode:c++
5386 c-basic-offset:4
5387 c-file-style:"stroustrup"
5388 indent-tabs-mode:nil
5389 fill-column:79
5390 End:
5391 */
5392 // vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
5393
===============================
In file included from mod.cu:10:0:
/home/vlaand/.local/lib/python3.6/site-packages/theano/sandbox/cuda/cuda_ndarray.cuh:17:0: warning: "PyString_Check" redefined
#define PyString_Check PyUnicode_Check
^
In file included from /home/vlaand/.local/lib/python3.6/site-packages/theano/sandbox/cuda/cuda_ndarray.cuh:11:0,
from mod.cu:10:
/home/vlaand/anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/npy_3kcompat.h:71:0: note: this is the location of the previous definition
#define PyString_Check PyBytes_Check
^
In file included from mod.cu:10:0:
/home/vlaand/.local/lib/python3.6/site-packages/theano/sandbox/cuda/cuda_ndarray.cuh:18:0: warning: "PyString_FromString" redefined
#define PyString_FromString PyUnicode_FromString
^
In file included from /home/vlaand/.local/lib/python3.6/site-packages/theano/sandbox/cuda/cuda_ndarray.cuh:11:0,
from mod.cu:10:
/home/vlaand/anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/npy_3kcompat.h:73:0: note: this is the location of the previous definition
#define PyString_FromString PyBytes_FromString
^
In file included from mod.cu:10:0:
/home/vlaand/.local/lib/python3.6/site-packages/theano/sandbox/cuda/cuda_ndarray.cuh:19:0: warning: "PyString_AsString" redefined
#define PyString_AsString PyUnicode_AsUTF8
^
In file included from /home/vlaand/.local/lib/python3.6/site-packages/theano/sandbox/cuda/cuda_ndarray.cuh:11:0,
from mod.cu:10:
/home/vlaand/anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/npy_3kcompat.h:80:0: note: this is the location of the previous definition
#define PyString_AsString PyBytes_AsString
^
In file included from mod.cu:10:0:
/home/vlaand/.local/lib/python3.6/site-packages/theano/sandbox/cuda/cuda_ndarray.cuh:20:0: warning: "PyString_FromStringAndSize" redefined
#define PyString_FromStringAndSize PyUnicode_FromStringAndSize
^
In file included from /home/vlaand/.local/lib/python3.6/site-packages/theano/sandbox/cuda/cuda_ndarray.cuh:11:0,
from mod.cu:10:
/home/vlaand/anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/npy_3kcompat.h:74:0: note: this is the location of the previous definition
#define PyString_FromStringAndSize PyBytes_FromStringAndSize
^
In file included from mod.cu:10:0:
/home/vlaand/.local/lib/python3.6/site-packages/theano/sandbox/cuda/cuda_ndarray.cuh:21:0: warning: "PyString_Size" redefined
#define PyString_Size PyUnicode_GET_SIZE
^
In file included from /home/vlaand/.local/lib/python3.6/site-packages/theano/sandbox/cuda/cuda_ndarray.cuh:11:0,
from mod.cu:10:
/home/vlaand/anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/npy_3kcompat.h:82:0: note: this is the location of the previous definition
#define PyString_Size PyBytes_Size
^
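The "PyString_*" redefinition warnings above are benign: cuda_ndarray.cuh installs its own Python-3 string shims on top of the ones numpy's npy_3kcompat.h already defined. A sketch of how a shim can replace an existing definition without triggering the warning (illustrative only, not the actual header):

    /* Retract numpy's compatibility macro before installing ours. */
    #ifdef PyString_Check
    #undef PyString_Check
    #endif
    #define PyString_Check PyUnicode_Check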
mod.cu(940): warning: pointless comparison of unsigned integer with zero
mod.cu(3000): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3003): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3005): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3008): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3010): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3013): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3016): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3019): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3021): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3024): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3026): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3029): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3031): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3034): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3037): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3040): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3042): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3045): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3047): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3050): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3074): warning: statement is unreachable
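The run of "conversion from a string literal to "char *"" warnings is a C++ deprecation: string literals are const char arrays, so passing them where an old CPython-style API declares plain char * (keyword lists and the like) is flagged. A minimal illustration (hypothetical code, not taken from mod.cu):

    void legacy_api(char *name);        // old-style signature taking char *
    static char name_buf[] = "alpha";   // writable copy of the literal
    // legacy_api("alpha");             -> deprecated conversion warning
    // legacy_api(name_buf);            -> clean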
/usr/include/string.h: In function ‘void* __mempcpy_inline(void*, const void*, size_t)’:
/usr/include/string.h:652:42: error: ‘memcpy’ was not declared in this scope
return (char *) memcpy (__dest, __src, __n) + __n;
^
ERROR (theano.sandbox.cuda): Failed to compile cuda_ndarray.cu: ('nvcc return status', 1, 'for cmd', 'nvcc -shared -O3 -m64 -Xcompiler -DCUDA_NDARRAY_CUH=mc72d035fdf91890f3b36710688069b2e,-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION,-fPIC,-fvisibility=hidden -Xlinker -rpath,/home/vlaand/.theano/compiledir_Linux-4.4--generic-x86_64-with-debian-stretch-sid-x86_64-3.6.1-64/cuda_ndarray -I/home/vlaand/.local/lib/python3.6/site-packages/theano/sandbox/cuda -I/home/vlaand/anaconda3/lib/python3.6/site-packages/numpy/core/include -I/home/vlaand/anaconda3/include/python3.6m -I/home/vlaand/.local/lib/python3.6/site-packages/theano/gof -L/home/vlaand/anaconda3/lib -o /home/vlaand/.theano/compiledir_Linux-4.4--generic-x86_64-with-debian-stretch-sid-x86_64-3.6.1-64/cuda_ndarray/cuda_ndarray.so mod.cu -lcublas -lpython3.6m -lcudart')
['nvcc', '-shared', '-O3', '-m64', '-Xcompiler', '-DCUDA_NDARRAY_CUH=mc72d035fdf91890f3b36710688069b2e,-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION,-fPIC,-fvisibility=hidden', '-Xlinker', '-rpath,/home/vlaand/.theano/compiledir_Linux-4.4--generic-x86_64-with-debian-stretch-sid-x86_64-3.6.1-64/cuda_ndarray', '-I/home/vlaand/.local/lib/python3.6/site-packages/theano/sandbox/cuda', '-I/home/vlaand/anaconda3/lib/python3.6/site-packages/numpy/core/include', '-I/home/vlaand/anaconda3/include/python3.6m', '-I/home/vlaand/.local/lib/python3.6/site-packages/theano/gof', '-L/home/vlaand/anaconda3/lib', '-o', '/home/vlaand/.theano/compiledir_Linux-4.4--generic-x86_64-with-debian-stretch-sid-x86_64-3.6.1-64/cuda_ndarray/cuda_ndarray.so', 'mod.cu', '-lcublas', '-lpython3.6m', '-lcudart']
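The actual failure is the string.h error above: with glibc >= 2.23 (e.g. Ubuntu 16.04), string.h defines __mempcpy_inline in a way that clashes with how nvcc preprocesses the header, so the build dies on 'memcpy' was not declared in this scope regardless of the warnings before it. The commonly reported workaround is to compile with -D_FORCE_INLINES, which makes string.h skip the offending inline; with Theano this can be set in ~/.theanorc:

    [nvcc]
    flags=-D_FORCE_INLINES

or per run: THEANO_FLAGS='nvcc.flags=-D_FORCE_INLINES'.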
/home/vlaand/.local/lib/python3.6/site-packages/theano/gpuarray/dnn.py:135: UserWarning: Your cuDNN version is more recent than Theano. If you encounter problems, try updating Theano or downgrading cuDNN to version 5.1.
warnings.warn("Your cuDNN version is more recent than "
Using cuDNN version 7001 on context None
Mapped name None to device cuda3: GeForce GTX 1080 Ti (0000:83:00.0)
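Note that after the sandbox.cuda compilation fails, the newer gpuarray backend still initializes (cuDNN 7001, device cuda3), so only the deprecated backend is affected. One way to sidestep the failing code path entirely is to select the new backend explicitly in ~/.theanorc (device names of the form cudaN select gpuarray; floatX is shown only as a typical companion setting):

    [global]
    device = cuda3
    floatX = float32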