Training Regression on 5-point CrowdFlower data


In [1]:
# Core scientific stack and general utilities
import numpy as np
import pandas as pd
import csv
import math, itertools
import os
import subprocess

import matplotlib.pyplot as plt

from multiprocessing import Pool, Manager

from collections import Counter
from stop_words import get_stop_words
import natsort
from natsort import natsorted

from scipy import spatial
from scipy.stats import pearsonr, spearmanr

# scikit-learn: SVR baselines, cross-validation utilities, metrics, and text features
from sklearn.svm import SVR, LinearSVR
from sklearn.externals import joblib
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import r2_score, f1_score
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.feature_extraction.text import CountVectorizer

# imbalanced-learn: random oversampling
from imblearn.over_sampling import RandomOverSampler

# Keras (Theano backend): sequence padding and BiLSTM model components
from keras.preprocessing import sequence
from keras.utils.np_utils import to_categorical
from keras.models import Sequential, load_model, model_from_json
from keras.layers import Dense, Activation, Embedding, Bidirectional, Dropout, LSTM
from keras.regularizers import l2
import keras.backend as K

from theano import function

import warnings

Using Theano backend.
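
The imports above support two modelling routes: a scikit-learn SVR baseline over bag-of-words features, and a Keras BiLSTM regressor, both evaluated with cross-validated correlation against the 5-point ratings. What follows is a minimal sketch of the baseline route; the file name ("crowdflower_5point.csv"), the column names ("text", "score"), and the hyperparameters are illustrative assumptions, not values taken from this notebook.

In [ ]:
# Minimal sketch of the SVR baseline pipeline (assumptions noted above).
import pandas as pd
from scipy.stats import pearsonr, spearmanr
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVR
from sklearn.model_selection import cross_val_predict

df = pd.read_csv("crowdflower_5point.csv")    # hypothetical file and columns
X = CountVectorizer(ngram_range=(1, 2), min_df=2).fit_transform(df["text"])
y = df["score"].values                        # mean 5-point annotator rating

# 10-fold cross-validated predictions from a linear SVR baseline.
preds = cross_val_predict(LinearSVR(C=0.1), X, y, cv=10)

print("Pearson r:  %.3f" % pearsonr(y, preds)[0])
print("Spearman r: %.3f" % spearmanr(y, preds)[0])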
1 #define _CUDA_NDARRAY_C
2 
3 #include <Python.h>
4 #include <structmember.h>
5 #include "theano_mod_helper.h"
6 
7 #include <numpy/arrayobject.h>
8 #include <iostream>
9 
10 #include "cuda_ndarray.cuh"
11 
12 #ifndef CNMEM_DLLEXPORT
13 #define CNMEM_DLLEXPORT
14 #endif
15 
16 #include "cnmem.h"
17 #include "cnmem.cpp"
18 
19 //If true, when there is a gpu malloc or free error, we print the size of allocated memory on the device.
20 #define COMPUTE_GPU_MEM_USED 0
21 
22 //If true, we fill with NAN allocated device memory.
23 #define ALLOC_MEMSET 0
24 
25 //If true, we print out when we free a device pointer, uninitialize a
26 //CudaNdarray, or allocate a device pointer
27 #define PRINT_FREE_MALLOC 0
28 
29 //If true, we do error checking at the start of functions, to make sure there
30 //is not a pre-existing error when the function is called.
31 //You probably need to set the environment variable
32 //CUDA_LAUNCH_BLOCKING=1, and/or modify the CNDA_THREAD_SYNC
33 //preprocessor macro in cuda_ndarray.cuh
34 //if you want this to work.
35 #define PRECHECK_ERROR 0
36 
37 cublasHandle_t handle = NULL;
38 int* err_var = NULL;
39 
40 /////////////////////////
41 // Alloc and Free
42 /////////////////////////
43 
44 static int g_gpu_context_active = 0;
45 
46 
47 PyObject *
48 CudaNdarray_Dimshuffle(PyObject* _unused, PyObject* args);
49 static PyObject *CudaNdarray_get_shape(CudaNdarray *self, void *closure);
50 
51 
52 /**
53  *
54  * In the test program I'm using, the _outstanding_mallocs decreases with every call.
55  * This suggests there are more free() calls being made than alloc(), but I can't figure out why.
56  *
57  */
58 int _outstanding_mallocs[] = {0,0};
59 
60 #if COMPUTE_GPU_MEM_USED
61 size_t _allocated_size = 0;
62 size_t _max_allocated_size = 0;
63 
64 const int TABLE_SIZE = 10000;
65 struct table_struct{
66     void* ptr;
67     size_t size;
68 };
69 table_struct _alloc_size_table[TABLE_SIZE];
70 #endif
71 
72 void * device_malloc(size_t size)
73 {
74     return device_malloc(size, VERBOSE_DEVICE_MALLOC);
75 }
76 
77 static bool g_use_cnmem = false;
78 static const int g_max_devices = 8;
79 int initCnmem(int card_number_provided, int card_nb, size_t mem) {
80     static bool cnmemInitialized = false;
81     if(cnmemInitialized) {
82         return 0;
83     }
84     // On stderr to be at the same place as "Using gpu device..."
85     int numDevices = 0;
86     cnmemDevice_t devices[g_max_devices];
87     if(cudaGetDeviceCount(&numDevices) != cudaSuccess) {
88         PyErr_Format(PyExc_RuntimeError,
89                      "initCnmem: 'cudaGetDeviceCount' failed! Reason=%s\n",
90                      cudaGetErrorString(cudaGetLastError()));
91         return -1;
92     }
93     if(card_number_provided){
94         numDevices = 1;
95         int i = 0;
96         devices[i].device = card_nb;
97         devices[i].size = mem;
98         ///@TODO: thejaswi: add support for multiple streams
99         devices[i].numStreams = 0;
100         devices[i].streams = NULL;
101         devices[i].streamSizes = NULL;
102     }else{
103         for(int i=0;i<numDevices;++i) {
104             devices[i].device = i;
105             devices[i].size = mem;
106             ///@TODO: thejaswi: add support for multiple streams
107             devices[i].numStreams = 0;
108             devices[i].streams = NULL;
109         }
110     }
111 
112     ///@TODO: thejaswi: passing custom cnmem flags?
113     cnmemStatus_t status = cnmemInit(numDevices, devices, CNMEM_FLAGS_DEFAULT);
114     if(status != CNMEM_STATUS_SUCCESS) {
115         PyErr_Format(PyExc_RuntimeError,
116                      "initCnmem: cnmemInit call failed! Reason=%s. numdev=%d\n",
117                      cnmemGetErrorString(status), numDevices);
118         return -1;
119     }
120     cnmemInitialized = true;
121     return 0;
122 }
123 
124 void * device_malloc(size_t size, int verbose)
125 {
126     #if PRECHECK_ERROR
127         cudaThreadSynchronize();
128         cudaError_t prevError = cudaGetLastError();
129         if (cudaSuccess != prevError)
130         {
131             fprintf(stderr,
132                     "Error existed before calling device_malloc. %s\n",
133                     cudaGetErrorString(prevError)
134                     );
135         }
136     #endif
137     void * rval=NULL;
138     ///@TODO: thejaswi: support for multiple-streams?
139     if(g_use_cnmem) {
140         cnmemStatus_t status = CNMEM_STATUS_SUCCESS;
141         status = cnmemMalloc(&rval, size, NULL);
142         if(status != CNMEM_STATUS_SUCCESS) {
143             PyErr_Format(PyExc_MemoryError,
144                          "Error allocating %llu bytes of device memory (%s).",
145                          (unsigned long long)size, cnmemGetErrorString(status));
146             return NULL;
147         }
148     }
149     else {
150         cudaError_t err = cudaMalloc(&rval, size);
151         if (cudaSuccess != err)
152         {
153             // Clear the error flag, cudaMalloc doesn't do it.
154             // Currently this returns the same thing as err, but if in future
155             // it returns something else I still don't see why we should ignore
156             // it.  All we want to do here is reset the flag.
157             cudaGetLastError();
158             if (verbose)
159             {
160                 size_t free = 0, total = 0;
161                 cudaError_t err2 = cudaMemGetInfo(&free, &total);
162                 if (err2 != cudaSuccess){
163                     cudaGetLastError();
164                     fprintf(stderr,
165                             "Error when trying to find the memory information"
166                             " on the GPU: %s\n", cudaGetErrorString(err2));
167                 }
168                 #if COMPUTE_GPU_MEM_USED
169                     fprintf(stderr,
170                             "Error allocating %llu bytes of device memory (%s)."
171                             " new total bytes allocated: %llu."
172                             " Driver report %llu bytes free and %llu bytes total \n",
173                             (unsigned long long)size, cudaGetErrorString(err), (unsigned long long)_allocated_size,
174                             (unsigned long long)free, (unsigned long long)total);
175                 #else
176                     fprintf(stderr,
177                             "Error allocating %llu bytes of device memory (%s)."
178                             " Driver report %llu bytes free and %llu bytes total \n",
179                             (unsigned long long)size, cudaGetErrorString(err), (unsigned long long)free, (unsigned long long)total);
180                 #endif
181             }
182             PyErr_Format(PyExc_MemoryError,
183                          "Error allocating %llu bytes of device memory (%s).",
184                          (unsigned long long)size, cudaGetErrorString(err));
185             return NULL;
186         }
187     }
188     if (rval != NULL){
189         // Can it happen that cudaMalloc return cudaSuccess, but return a NULL ptr?
190         // Could this be what happen if size is 0?
191         _outstanding_mallocs[0] += 1;
192 
193 #if COMPUTE_GPU_MEM_USED
194         _allocated_size += size;
195         _max_allocated_size = std::max(_max_allocated_size, _allocated_size);
196         int i = 0;
197         for(;i<TABLE_SIZE;i++){
198             if(NULL==_alloc_size_table[i].ptr){
199                 _alloc_size_table[i].ptr=rval;
200                 _alloc_size_table[i].size=size;
201                 break;
202             }
203         }
204         if (i == TABLE_SIZE){
205             fprintf(stderr,
206                     "When tracking GPU malloc, our table size wasn't big enough."
207                     " So we loose some tracking. Raise the value of TABLE_SIZE in the file cuda_ndarra.cu");
208         }
209 #endif
210     }
211     //fprintf(stderr,
212     //"allocated %li bytes of device memory (%s). new total bytes allocated: %d. ptr: %p\n",
213     //(long)size, cudaGetErrorString(err),_allocated_size,rval);
214 
215     if(ALLOC_MEMSET){
216         //We init them to nan to make sure we catch more debug case.
217         cudaMemset(rval, 0xFF, size);
218         //printf("MEMSET\n");
219     }
220     #if PRINT_FREE_MALLOC
221         fprintf(stderr, "device malloc %p of size %d\n", rval, size);
222     #endif
223     return rval;
224 }
225 
226 int device_free(void *ptr)
227 {
228     #if PRECHECK_ERROR
229         cudaThreadSynchronize();
230         cudaError_t prevError = cudaGetLastError();
231         if (cudaSuccess != prevError)
232         {
233             fprintf(stderr,
234                     "Error existed before calling device_free. %s\n",
235                     cudaGetErrorString(prevError)
236                     );
237         }
238     #endif
239     #if PRINT_FREE_MALLOC
240         size_t free = 0, total = 0;
241         cudaError_t err2 = cudaMemGetInfo(&free, &total);
242         if (err2 != cudaSuccess){
243             cudaGetLastError();
244             fprintf(stderr,
245                     "Error when tring to find the memory information"
246                     " on the GPU: %s\n", cudaGetErrorString(err2));
247         }
248         #if COMPUTE_GPU_MEM_USED
249         {
250             int i = 0;
251             for(;i<TABLE_SIZE;i++)
252                 if(_alloc_size_table[i].ptr==ptr){
253                     break;
254                 }
255             assert(i<TABLE_SIZE);
256             fprintf(stderr, "device_free %p of size %d."
257                     " Driver report %d bytes free and %d bytes total \n",
258                     ptr, _alloc_size_table[i].size, free, total);
259         }
260         #else
261             fprintf(stderr, "device_free %p."
262                     " Driver report %d bytes free and %d bytes total \n",
263                     ptr, free, total);
264         #endif
265     #endif
266 
267     // if there is no gpu context, the call to cudaFree will fail; skip it entirely
268     if(!g_gpu_context_active) {
269         return 0;
270     }
271 
272     ///@TODO: thejaswi: multi-stream support
273     if(g_use_cnmem) {
274         cnmemStatus_t status = cnmemFree(ptr, NULL);
275         if(status != CNMEM_STATUS_SUCCESS) {
276             fprintf(stderr, "device_free: cnmemFree call failed! Reason=%s\n",
277                     cnmemGetErrorString(status));
278         }
279     }
280     else {
281         // We need sync as the Theano's GC could remove intermediate variable that
282         // are still needed as the gpu kernel are running or in the queue.
283         CNDA_BEGIN_ALLOW_THREADS
284         cudaThreadSynchronize();
285         CNDA_END_ALLOW_THREADS
286 
287         cudaError_t err =  cudaFree(ptr);
288         if (cudaSuccess != err)
289         {
290             // Clear the error flag, cudaFree doesn't do it.
291             // Currently this returns the same thing as err, but if in future
292             // it returns something else I still don't see why we should ignore
293             // it.  All we want to do here is reset the flag.
294             cudaGetLastError();
295             size_t free = 0, total = 0;
296             cudaError_t err2 = cudaMemGetInfo(&free, &total);
297             if (err2 != cudaSuccess){
298                 cudaGetLastError();
299                 fprintf(stderr,
300                         "Error when tring to find the memory information"
301                         " on the GPU: %s\n", cudaGetErrorString(err2));
302             }
303             #if COMPUTE_GPU_MEM_USED
304             {
305                 int i = 0;
306                 for(;i<TABLE_SIZE;i++)
307                     if(_alloc_size_table[i].ptr==ptr){
308                         break;
309                     }
310                 assert(i<TABLE_SIZE);
311                 fprintf(stderr,
312                         "Error freeing device pointer %p (%s) of size %llu. %llu byte already allocated."
313                         " Driver report %llu bytes free and %llu bytes total \n",
314                         ptr, cudaGetErrorString(err),
315                         (unsigned long long)_alloc_size_table[i].size, (unsigned long long)_allocated_size, (unsigned long long)free, (unsigned long long)total);
316             }
317             #else
318                 fprintf(stderr,
319                         "Error freeing device pointer %p (%s)."
320                         " Driver report %llu bytes free and %llu bytes total \n",
321                         ptr,
322                         cudaGetErrorString(err), (unsigned long long)free, (unsigned long long)total);
323             #endif
324             if (NULL != PyErr_Occurred()){
325                 fprintf(stderr,
326                         "device_free: cudaFree() returned an error, but there is already an"
327                         " Python error set. This happen during the clean up when there is a"
328                         " first error and the CUDA driver is in a so bad state that it don't"
329                         " work anymore. We keep the previous error set to help debugging it.");
330                 return -1;
331             }
332             PyErr_Format(PyExc_MemoryError,
333                     "error freeing device pointer %p (%s)",
334                     ptr,
335                     cudaGetErrorString(err));
336             return -1;
337         }
338     }
339     _outstanding_mallocs[0] -= (ptr != NULL);
340     #if COMPUTE_GPU_MEM_USED
341         int i=0;
342         size_t total_freed = 0;
343         for(;i<TABLE_SIZE;i++)
344             if(_alloc_size_table[i].ptr==ptr){
345                 _allocated_size -= _alloc_size_table[i].size;
346                 total_freed += _alloc_size_table[i].size;
347                 _alloc_size_table[i].ptr=0;
348                 _alloc_size_table[i].size=0;
349 
350                 break;
351             }
352         //if(i==TABLE_SIZE)
353         //    printf("Unallocated unknow size!\n");
354         //fprintf(stderr, "freed %li bytes of device memory (%s). %d already allocated, ptr=%p\n", (long)total_freed, cudaGetErrorString(err),_allocated_size,ptr);
355     #endif
356     return 0;
357 }
358 
359 static PyObject *
360 outstanding_mallocs(PyObject* self, PyObject * args)
361 {
362     return PyInt_FromLong(_outstanding_mallocs[0]);
363 }
364 
365 
366 static void *work_mem = NULL;
367 static size_t work_size = 0;
368 
369 /*
370  * Returns a chunk of memory for temporary work inside of an op. You can only
371  * request a single chunk of memory at a time since it is reused.
372  */
373 void *get_work_mem(size_t sz) {
374     if (sz <= work_size)
375         return work_mem;
376     device_free(work_mem);
377     work_mem = device_malloc(sz);
378     work_size = sz;
379     if (work_mem == NULL)
380         work_size = 0;
381     return work_mem;
382 }
383 
384 /////////////////////////
385 // Static helper methods
386 /////////////////////////
387 
388 static void
389 CudaNdarray_null_init(CudaNdarray*self)
390 {
391     self->base = NULL;
392     self->nd = -1;
393     self->host_structure = NULL;
394     self->data_allocated = 0;
395     self->dev_structure_fresh = 1;
396     self->dev_structure = NULL;
397     self->devdata = NULL;
398 }
399 
400 static int
401 CudaNdarray_uninit(CudaNdarray*self)
402 {
403     #if PRINT_FREE_MALLOC
404         fprintf(stderr, "CudaNdarray_uninit %p\n", self);
405     #endif
406     int rval = 0;
407     if (self->data_allocated) {
408         assert(self->devdata);
409         if (device_free(self->devdata))
410         {
411             fprintf(stderr,
412                     "CudaNdarray_uninit: error freeing self->devdata. (self=%p, self->devata=%p)\n",
413                     self, self->devdata);
414             rval = -1;
415         }
416         self->devdata = NULL;
417         self->data_allocated = 0;
418     }
419     if (self->dev_structure)
420     {
421         if (device_free(self->dev_structure))
422         {
423             fprintf(stderr,
424                     "CudaNdarray_uninit: error freeing dev_structure memory %p (self=%p)\n",
425                     self->dev_structure, self);
426             rval = -1;
427         }
428         self->dev_structure = NULL;
429     }
430     if (self->host_structure)
431     {
432         free(self->host_structure);
433         self->host_structure = NULL;
434     }
435     self->nd = -1;
436     Py_XDECREF(self->base);
437     self->base = NULL;
438     return rval;
439 }
440 
441 
442 //make the rightmost coords change fastest
443 //TODO: why does a downward for-loop not work????
444 //TODO: use the log2_dims and driver code to remove / and %
445 //TODO: skip the last division (when d == 0)
446 #define decl_k_elemwise_unary_rowmajor(name, F) \
447 __global__ void name (unsigned int numEls,  \
448         unsigned int nd, \
449         const int * dim,  \
450         const float * a_data, const int * a_str, \
451         float * z_data, const int * z_str) \
452 { \
453     const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; \
454     const unsigned int numThreads = blockDim.x * gridDim.x; \
455  \
456     for (unsigned int i = idx; i < numEls; i += numThreads) \
457     { \
458         unsigned int ii = i; \
459         const float * a_i = a_data; \
460         float * z_i = z_data; \
461         for (unsigned int _d = 0; _d < nd; ++_d) \
462         { \
463             unsigned int d = nd - _d-1;  \
464             int i_d = ii % dim[d]; /* i_d is our position in the d'th dimension   */ \
465             ii = ii / dim[d]; \
466             a_i += i_d * a_str[d]; /* increment our a and z pointers by i_d elements */ \
467             z_i += i_d * z_str[d]; \
468         } \
469         z_i[0] = F(a_i[0]); \
470     } \
471 }
472 
473 template<typename T> __device__ T unary_copy(T a) { return a; }
474 decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy, unary_copy<float>)
475 
476 template<typename T> __device__ T unary_exp(T a) { return exp(a); }
477 decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_exp, unary_exp<float>)
478 
479 /////////////////////////////
480 // Satisfying reqs to be Type
481 /////////////////////////////
482 
483 //DON'T use directly(if their is other CudaNdarray that point to it, it will cause problem)! use Py_DECREF() instead
484 static void
485 CudaNdarray_dealloc(CudaNdarray* self)
486 {
487     if (0) std::cerr << "CudaNdarray dealloc " << self << " " << self->devdata << '\n';
488     if(Py_REFCNT(self) > 1)
489       printf("WARNING:CudaNdarray_dealloc called when there is still active reference to it.\n");
490     CudaNdarray_uninit(self);
491     Py_TYPE(self)->tp_free((PyObject*)self);
492     --_outstanding_mallocs[1];
493     if (0)
494     {
495         fprintf(stderr, "device_malloc_counts: (device) %i (obj) %i\n",
496                 _outstanding_mallocs[0],
497                 _outstanding_mallocs[1]);
498     }
499 }
500 
501 static PyObject *
502 CudaNdarray_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
503 {
504     CudaNdarray *self;
505 
506     self = (CudaNdarray *)type->tp_alloc(type, 0);
507     if (self != NULL)
508     {
509         CudaNdarray_null_init(self);
510         ++_outstanding_mallocs[1];
511     }
512     return (PyObject *)self;
513 }
514 static int
515 CudaNdarray_init(CudaNdarray *self, PyObject *args, PyObject *kwds)
516 {
517     PyObject *arr=NULL;
518 
519     if (! PyArg_ParseTuple(args, "O", &arr))
520         return -1;
521     if (! PyArray_Check(arr))
522     {
523         PyErr_SetString(PyExc_TypeError, "PyArray arg required");
524         return -1;
525     }
526     int rval = CudaNdarray_CopyFromArray(self, (PyArrayObject*)arr);
527     return rval;
528 }
529 static PyMemberDef CudaNdarray_members[] =
530 {
531     /*
532     {"first", T_OBJECT_EX, offsetof(CudaNdarray, first), 0,
533      "first name"},
534     {"last", T_OBJECT_EX, offsetof(CudaNdarray, last), 0,
535      "last name"},
536     {"number", T_INT, offsetof(CudaNdarray, number), 0,
537      "noddy number"},
538      */
539     {NULL}  /* Sentinel */
540 };
541 
542 PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self, PyObject *args)
543 {
544     PyObject * dtype = NULL;
545     if (args && !PyArg_ParseTuple(args, "|O", &dtype))
546         return NULL;
547     if (dtype) {
548         PyArray_Descr* dtype2;
549         // PyArray_DescrConverter try to convert anything to a PyArray_Descr.
550         if(!PyArray_DescrConverter(dtype, &dtype2))
551         {
552             PyObject * str = PyObject_Repr(dtype);
553             PyErr_Format(PyExc_TypeError,
554                          "CudaNdarray dtype parameter not understood: %s",
555                          PyString_AsString(str)
556                          );
557             Py_CLEAR(str);
558             return NULL;
559         }
560         int typeNum = dtype2->type_num;
561         Py_DECREF(dtype2);
562         if (typeNum != NPY_FLOAT32)
563         {
564             PyObject * str = PyObject_Repr(dtype);
565             PyErr_Format(PyExc_TypeError,
566                          "CudaNdarray support only support float32 dtype, provided: %d",
567                          typeNum
568                          );
569             Py_CLEAR(str);
570             return NULL;
571         }
572     }
573 
574     int verbose = 0;
575     if(self->nd>=0 && CudaNdarray_SIZE(self)==0){
576         npy_intp * npydims = (npy_intp*)malloc(self->nd * sizeof(npy_intp));
577         assert (npydims);
578         for (int i = 0; i < self->nd; ++i) npydims[i] = (npy_intp)(CudaNdarray_HOST_DIMS(self)[i]);
579         PyObject * rval = PyArray_SimpleNew(self->nd, npydims, REAL_TYPENUM);
580         free(npydims);
581         if (!rval){
582             return NULL;
583         }
584         assert (PyArray_ITEMSIZE((PyArrayObject *)rval) == sizeof(real));
585         return rval;
586     }
587     if ((self->nd < 0) || (self->devdata == 0))
588     {
589         PyErr_SetString(PyExc_ValueError, "can't copy from un-initialized CudaNdarray");
590         return NULL;
591     }
592     CudaNdarray * contiguous_self = NULL;
593     if (CudaNdarray_is_c_contiguous(self))
594     {
595         contiguous_self = self;
596         Py_INCREF(contiguous_self);
597         if (verbose) std::cerr << "CreateArrayObj already contiguous" << contiguous_self << '\n';
598     }
599     else
600     {
601         contiguous_self = (CudaNdarray*)CudaNdarray_Copy(self);
602         if (verbose) std::cerr << "CreateArrayObj created contiguous" << contiguous_self << '\n';
603     }
604     if (!contiguous_self)
605     {
606         return NULL;
607     }
608 
609     npy_intp * npydims = (npy_intp*)malloc(self->nd * sizeof(npy_intp));
610     assert (npydims);
611     for (int i = 0; i < self->nd; ++i)
612         npydims[i] = (npy_intp)(CudaNdarray_HOST_DIMS(self)[i]);
613     PyArrayObject * rval = (PyArrayObject *) PyArray_SimpleNew(self->nd,
614                                                                npydims,
615                                                                REAL_TYPENUM);
616     free(npydims);
617     if (!rval)
618     {
619         Py_DECREF(contiguous_self);
620         return NULL;
621     }
622 
623     assert (PyArray_ITEMSIZE(rval) == sizeof(real));
624 
625     npy_intp rval_size = PyArray_SIZE(rval);
626     void *rval_data = PyArray_DATA(rval);
627     cudaError_t err;
628     CNDA_BEGIN_ALLOW_THREADS;
629 
630     err = cudaMemcpy(rval_data, contiguous_self->devdata,
631                      rval_size * sizeof(real),
632                      cudaMemcpyDeviceToHost
633                      );
634     //CNDA_THREAD_SYNC;  // unneeded because cudaMemcpy is blocking anyway
635     CNDA_END_ALLOW_THREADS;
636 
637     if (cudaSuccess != err)
638     {
639         PyErr_Format(PyExc_RuntimeError, "error (%s)copying data to host",
640                      cudaGetErrorString(err));
641         Py_DECREF(rval);
642         rval = NULL;
643     }
644 
645     Py_DECREF(contiguous_self);
646     return (PyObject *)rval;
647 }
648 
649 // TODO-- we have two functions here, ZEROS and Zeros.
650 // ZEROS is meant to be called just from C code (you don't need to pass it PyObject * s)
651 // but this naming is very weird, makes it look like a macro
652 // we should figure out the correct convention and change to that
653 PyObject* CudaNdarray_ZEROS(int n, int * dims)
654 {
655 
656     size_t total_elements = 1;
657 
658     for(size_t i=0;i<n;i++){
659         // Detect overflow on unsigned integer
660         if (dims[i] != 0 && total_elements > (SIZE_MAX / dims[i])) {
661             PyErr_Format(PyExc_RuntimeError,
662                          "Can't store in size_t for the bytes requested %llu * %llu",
663                          (unsigned long long)total_elements,
664                          (unsigned long long)dims[i]);
665             return NULL;
666         }
667         total_elements*=dims[i];
668     }
669 
670     // total_elements now contains the size of the array, in reals
671     if (total_elements > (SIZE_MAX / sizeof(real))){
672         PyErr_Format(PyExc_RuntimeError,
673                      "Can't store in size_t for the bytes requested %llu * 4",
674                      (unsigned long long)total_elements);
675         return NULL;
676     }
677     size_t total_size = total_elements * sizeof(real);
678 
679     CudaNdarray* rval = (CudaNdarray*)CudaNdarray_New();
680     if (!rval)
681     {
682         PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: call to New failed");
683         return NULL;
684     }
685 
686     if (CudaNdarray_alloc_contiguous(rval, n, dims))
687     {
688         PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: allocation failed.");
689         Py_DECREF(rval);
690         return NULL;
691     }
692 
693     // Fill with zeros
694     //fprintf(stdout, "Sizeof: %d\n", total_size);
695     if (cudaSuccess != cudaMemset(rval->devdata, 0, total_size))
696     {
697         PyErr_Format(PyExc_MemoryError,
698                      "CudaNdarray_ZEROS: Error memsetting %llu bytes of device memory.",
699                      (unsigned long long)total_size);
700         Py_DECREF(rval);
701         return NULL;
702     }
703 
704     if (cnda_copy_structure_to_device(rval))
705     {
706         PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: syncing structure to device failed");
707         Py_DECREF(rval);
708         return NULL;
709     }
710     return (PyObject*) rval;
711 }
712 
713 // declared as a static method (hence 1st parameter is not used)
714 // Based on _Copy and _dimshuffle
715 PyObject* CudaNdarray_Zeros(PyObject* _unused, PyObject* shape)
716 {
717     if(!shape)
718     {
719         PyErr_SetString(PyExc_TypeError, "CudaNdarray_Zeros: function takes at least 1 argument (0 given)");
720         return NULL;
721     }
722     if(!PySequence_Check(shape))
723     {
724         PyErr_SetString(PyExc_TypeError, "shape argument must be a sequence");
725         return NULL;
726     }
727 
728     int shplen = PySequence_Length(shape);
729 
730     if (shplen == 0)
731     {
732         return CudaNdarray_ZEROS(0, NULL);
733     }
734 
735     int* newdims = (int *)malloc(sizeof(int) * shplen);
736 
737     if (!newdims)
738     {
739         PyErr_SetString(PyExc_MemoryError,
740             "CudaNdarray_Zeros: Failed to allocate temporary space");
741         return NULL;
742     }
743 
744     // start from the end to compute strides
745     for (int i = shplen-1; i >= 0; --i)
746     {
747         PyObject* shp_el_obj = PySequence_GetItem(shape, i);
748         if(shp_el_obj == NULL)
749         {
750             // shouldn't happen since we checked length before...
751             PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_Zeros: Index out of bound in sequence");
752             free(newdims);
753             return NULL;
754         }
755 
756         int shp_el = PyInt_AsLong(shp_el_obj);
757         Py_DECREF(shp_el_obj);
758 
759         if (shp_el < 0)
760         {
761             PyErr_SetString(PyExc_ValueError, "CudaNdarray_Zeros: shape must contain only non-negative values for size of a dimension");
762             free(newdims);
763             return NULL;
764         }
765 
766         newdims[i] = shp_el;
767     }
768 
769     PyObject* rval = CudaNdarray_ZEROS(shplen,newdims);
770 
771     free(newdims);
772 
773     return (PyObject*)rval;
774 }
775 
776 
777 
778 
779 
780 PyObject * CudaNdarray_Copy(const CudaNdarray * self)
781 {
782     PyObject * rval = CudaNdarray_New();
783     if ((!rval) || (-1 == self->nd))
784     {
785         return rval;
786     }
787     if (CudaNdarray_alloc_contiguous((CudaNdarray*)rval, self->nd, CudaNdarray_HOST_DIMS(self)))
788     {
789         Py_DECREF(rval);
790         return NULL;
791     }
792     if (CudaNdarray_CopyFromCudaNdarray((CudaNdarray*)rval, self))
793     {
794         Py_DECREF(rval);
795         return NULL;
796     }
797     return rval;
798 }
799 PyObject * CudaNdarray_DeepCopy(CudaNdarray * self, PyObject * memo)
800 {
801     assert(PyDict_Check(memo));
802     PyObject * selfkey = PyInt_FromLong((long)self);
803     assert(selfkey);
804     if (PyDict_Contains(memo, selfkey))
805     {
806         PyObject * rval = PyDict_GetItem(memo, selfkey);
807         Py_DECREF(selfkey);
808         Py_XINCREF(rval);
809         return rval;
810     }
811     else
812     {
813         PyObject * rval = CudaNdarray_Copy(self);
814         if (0) std::cerr << "DeepCopy created " << rval << " devdata " << ((CudaNdarray*)rval)->devdata << "\n";
815         if (NULL == rval)
816         {
817             Py_DECREF(selfkey);
818             return NULL;
819         }
820         if (PyDict_SetItem(memo, selfkey, rval))
821         {
822             Py_DECREF(rval);
823             Py_DECREF(selfkey);
824             return NULL;
825         }
826         Py_DECREF(selfkey);
827         return rval;
828     }
829 }
830 PyObject * CudaNdarray_ReduceSum(CudaNdarray * self, PyObject * py_reduce_mask)
831 {
832     if (!PySequence_Check(py_reduce_mask))
833     {
834         PyErr_SetString(PyExc_TypeError, "reduce_mask must be sequence of ints");
835         return NULL;
836     }
837     int len = PySequence_Length(py_reduce_mask);
838     if (len != self->nd)
839     {
840         PyErr_SetString(PyExc_TypeError, "length of reduce_mask must match self->nd");
841         return NULL;
842     }
843     CudaNdarray * self_sum = (CudaNdarray*)CudaNdarray_New();
844     if (!self_sum)
845     {
846         return NULL;
847     }
848     //TODO: allocate a fixed size dimshuffle_pattern_cache on the stack,
849     //      and use it if it is big enough.
850     int * dimshuffle_pattern = (int*)malloc(len * 2 * sizeof(int));
851     int * sum_dims = dimshuffle_pattern + len;
852     int n_remaining_dims = 0;
853     if (!dimshuffle_pattern)
854     {
855         Py_DECREF(self_sum);
856         PyErr_SetString(PyExc_MemoryError, "failed to alloc internal storage");
857         return NULL;
858     }
859     for (int i = 0; i < len; ++i)
860     {
861         PyObject *o_i = PySequence_GetItem(py_reduce_mask, i);
862         int o_i_int = PyInt_AsLong(o_i);
863         Py_XDECREF(o_i);
864         if (PyErr_Occurred())
865         {
866             Py_DECREF(self_sum);
867             free(dimshuffle_pattern);
868             return NULL;
869         }
870         if (o_i_int) // this is a dimension over which we are reducing
871         {
872             sum_dims[i] = 1;
873         }
874         else
875         {
876             sum_dims[i] = CudaNdarray_HOST_DIMS(self)[i];
877             dimshuffle_pattern[n_remaining_dims++] = i;
878         }
879     }
880     if (0   || CudaNdarray_alloc_contiguous(self_sum, len, sum_dims)
881             || CudaNdarray_reduce_sum(self_sum, self)
882             || CudaNdarray_dimshuffle(self_sum, n_remaining_dims, dimshuffle_pattern))
883     {
884         Py_DECREF(self_sum);
885         free(dimshuffle_pattern);
886         return NULL;
887     }
888     free(dimshuffle_pattern);
889     return (PyObject*)self_sum;
890 }
891 
892 // Reshape self to the new shape gived by the tuple shape.
893 //
894 // If self is c contiguous, it return a view. Otherwise it always do a copy.
895 // TODO: make it return a view when the strides allow it even if it is not
896 //       c contiguous
897 PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape)
898 {
899     if(!CudaNdarray_is_c_contiguous(self))
900     {
901         // allocate new space
902         //TODO: test to see if we can re-use old one and take a new param to
903         //  use this
904         CudaNdarray* rval = (CudaNdarray*) CudaNdarray_Copy(self);
905         if (!rval)
906         {
907             return NULL;
908         }
909 
910         CudaNdarray* ret = (CudaNdarray*) CudaNdarray_Reshape(rval, shape);
911         Py_XDECREF(rval);
912         return (PyObject*)ret;
913     }
914 
915     // check shape tuple
916     unsigned int rval_nd;
917     unsigned int * rval_dims;
918     size_t rval_size = 1;
919 
920     if (PyTuple_Check(shape)){
921         // copy shape to integer array
922         rval_nd = PyTuple_Size(shape);
923     }else if (PyInt_Check(shape)){
924         rval_nd = 1;
925     }else{
926         PyErr_SetString(PyExc_TypeError, "shape must be tuple of integers or an integer");
927         return NULL;
928     }
929     rval_dims = (unsigned int*)malloc(rval_nd * sizeof(int));
930 
931     if(PyTuple_Check(shape)){
932         for (int i = 0; i < rval_nd; ++i)
933         {
934             rval_dims[i] = PyInt_AsLong(PyTuple_GetItem(shape, i)); //GetItem returns borrowed reference
935             if (PyErr_Occurred()) //error in AsLong
936             {
937                 free(rval_dims);
938                 return NULL;
939             }
940             if(rval_dims[i]<0){
941                 PyErr_Format(PyExc_ValueError, "Reshape has invalid dimension %i (must be >=0)",rval_dims[i]);
942                 free(rval_dims);
943                 return NULL;
944             }
945             rval_size = rval_size * rval_dims[i];
946         }
947     }else{
948         rval_size = PyInt_AsLong(shape);
949         rval_dims[0] = rval_size;
950     }
951     // calculate new size, assert same as old size
952     if (rval_size != CudaNdarray_SIZE(self))
953     {
954         PyErr_Format(PyExc_ValueError, "size must remain unchanged, changed from %lld to %lld", CudaNdarray_SIZE(self), rval_size);
955         free(rval_dims);
956         return NULL;
957     }
958     if (rval_size==0)
959     {
960         PyObject * rval = CudaNdarray_NewDims(rval_nd, rval_dims);
961         free(rval_dims);
962         return rval;
963     }
964 
965     //return a view, not a copy
966     //we can do this as we checked self is c_contiguous
967     CudaNdarray * rval = (CudaNdarray * )CudaNdarray_New(rval_nd);
968 
969     if (!rval || 0 != rval->data_allocated
970         ||CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self), self))
971     {
972         Py_XDECREF(rval);
973         free(rval_dims);
974         return NULL;
975     }
976     //set dim and stride
977     int size = 1;
978     for (int i = rval_nd-1; i >= 0; --i)
979     {
980         CudaNdarray_set_stride(rval, i, (rval_dims[i] == 1) ? 0 : size);
981         CudaNdarray_set_dim(rval, i, rval_dims[i]);
982         size = size * rval_dims[i];
983     }
984     free(rval_dims);
985     return (PyObject*)rval;
986 }
987 
988 PyObject * CudaNdarray_View(const CudaNdarray * self)
989 {
990     CudaNdarray * rval = (CudaNdarray*)CudaNdarray_New(self->nd);
991     if (!rval || CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self), self))
992     {
993         Py_XDECREF(rval);
994         rval = NULL;
995     }
996     else
997     {
998         for (int i = 0; i < self->nd; ++i)
999         {
1000             CudaNdarray_set_dim(rval, i, CudaNdarray_HOST_DIMS(self)[i]);
1001             CudaNdarray_set_stride(rval, i, CudaNdarray_HOST_STRIDES(self)[i]);
1002         }
1003     }
1004     return (PyObject*)rval;
1005 }
1006 
1007 /*
1008  * d0,... are the output dims
1009  * indices are a list of index to operate on
1010  *         They are int32 viewed as float32.
1011  * a is the output
1012  * b is the input
1013  * dB0, the source leading dimensions size
1014  */
1015 template <int operator_num>
1016 __global__ void k_take_3(const int d0, const int d1, const int d2,
1017                          const npy_int64* indices,
1018                          float* a,
1019                          const int sA0, const int sA1, const int sA2,
1020                          const float* b, const int dB0,
1021                          const int sB0, const int sB1, const int sB2,
1022                          int* err){
1023     for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
1024         npy_int64 idx = indices[i0];
1025         if (idx<0)
1026             idx += dB0; // To allow negative indexing.
1027         if ((idx < 0) || (idx >= dB0)){
1028             // Any value other the 0 probably work. But to be more safe, I want
1029             // to change all bits to prevent problem with concurrent write that
1030             // could cross cache line. But this should not happen with the
1031             // current code and driver.
1032             *err = 0xFFFF;
1033             continue;
1034         }
1035         for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x){
1036             for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y){
1037                 int a_idx = i0*sA0 + i1*sA1 + i2*sA2;
1038                 int b_idx = idx*sB0 + i1*sB1 + i2*sB2;
1039                 a[a_idx] = b[b_idx];
1040             }
1041         }
1042     }
1043 }
1044 
1045 // We try to be similar to the PyArray_TakeFrom function
1046 //http://docs.scipy.org/doc/numpy/reference/c-api.array.html
1047 //TODO: support other clip mode then raise(clip, wrap)
1048 //self is the input that we copy data from.
1049 //The indices that we receive MUST be an CudaNdarray(float32)
1050 //    that is in fact a view to int64 indices
1051 PyObject*
1052 CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
1053     int verbose = 0;
1054     PyObject * indices_obj = NULL;
1055     //int axis; Default None, that mean the flattened array.
1056     PyObject * axis_obj = Py_None;
1057     PyObject * out_obj = Py_None;
1058     PyObject * clipmode_obj = NULL;
1059     int max_threads = 1; // max threads per blocks
1060 
1061     if (! PyArg_ParseTuple(args, "O|OOOi", &indices_obj, &axis_obj,
1062                            &out_obj, &clipmode_obj, &max_threads))
1063         return NULL;
1064 
1065     //Check argument indices
1066     //TODO: if not a numpy.ndarray, convert to numpy.ndarray
1067     //TODO: If a CudaNdarray, accept it and suppose the data is int32? is float32 number of int?
1068     //TODO: Support ndarray of other dtype then int32
1069     //TODO: support list of indices that are not c_contiguous
1070     CudaNdarray * indices = NULL;
1071     if (CudaNdarray_Check(indices_obj)) {
1072         if (verbose) printf("cudandarray indices\n");
1073         indices = (CudaNdarray*) indices_obj;
1074         Py_INCREF(indices);
1075     } else if (PyArray_Check(indices_obj)) {
1076         if (verbose) printf("ndarray indices\n");
1077         if (PyArray_TYPE((PyArrayObject *)indices_obj) != NPY_INT64) {
1078             PyErr_SetString(PyExc_TypeError,
1079                             "CudaNdarray_TakeFrom: need a ndarray for indices"
1080                             " with dtype int64");
1081             return NULL;
1082         }
1083         if (PyArray_NDIM(((PyArrayObject*)indices_obj)) != 1) {
1084             PyErr_SetString(PyExc_TypeError,
1085                             "CudaNdarray_TakeFrom: need a CudaNdarray of"
1086                             " indices with only 1 dimensions");
1087             return NULL;
1088         }
1089         // We need indices_obj to be contiguous, in order to take a view
1090         // with a different dtype.
1091         if (!PyArray_IS_C_CONTIGUOUS((PyArrayObject*) indices_obj)) {
1092             PyObject* indices_obj_contig = PyArray_NewCopy((PyArrayObject*) indices_obj, NPY_CORDER);
1093             if (!indices_obj_contig)
1094                 return NULL;
1095             indices_obj = indices_obj_contig;
1096         } else {
1097             // Keep the refcount consistent
1098             Py_INCREF(indices_obj);
1099         }
1100         PyArray_Descr* float32_descr = PyArray_DescrFromType(NPY_FLOAT32);
1101         PyObject * indices_float32 = NULL;
1102         indices_float32 = PyArray_View((PyArrayObject*)indices_obj,
1103                                                   float32_descr, NULL);
1104         if (verbose) printf("ndarray indices\n");
1105         if (!indices_float32) {
1106             Py_DECREF(indices_obj);
1107             return NULL;
1108         }
1109 
1110         indices = (CudaNdarray*) CudaNdarray_New();
1111         if (verbose) printf("\nndarray after new\n");
1112         if (! indices){
1113             Py_DECREF(indices_obj);
1114             Py_DECREF(indices_float32);
1115             return NULL;
1116         }
1117         if (CudaNdarray_CopyFromArray(indices,
1118                                       (PyArrayObject *)indices_float32)){
1119             Py_DECREF(indices_obj);
1120             Py_DECREF(indices_float32);
1121             return NULL;
1122         }
1123         Py_DECREF(indices_obj);
1124         Py_DECREF(indices_float32);
1125     } else {
1126         PyObject* py_s = PyObject_Str(indices_obj);
1127         const char* s = PyString_AsString(py_s);
1128         Py_DECREF(py_s);
1129         PyErr_Format(PyExc_TypeError,
1130                      "CudaNdarray_TakeFrom: need an ndarray of int64 or a"
1131                      " CudaNdarray(float32) that is a view from int64 data"
1132                      " for indices. Got %s", s);
1133         return NULL;
1134     }
1135 
1136     if (verbose) {
1137         printf("indices used on the gpu\n");
1138         fprint_CudaNdarray(stdout, indices);
1139         PyObject * used_indices = CudaNdarray_CreateArrayObj(indices);
1140         PyObject_Print(used_indices, stdout, 0);
1141         Py_DECREF(used_indices);
1142     }
1143     if (verbose) printf("after print of object\n");
1144     if(!CudaNdarray_is_c_contiguous(indices) != 0) {
1145         PyErr_SetString(PyExc_NotImplementedError,
1146                         "CudaNdarray_TakeFrom: The indices must be contiguous in memory.");
1147         Py_DECREF(indices);
1148         return NULL;
1149     }
1150     int nb_indices = CudaNdarray_SIZE((CudaNdarray *)indices) / 2;// int64 are 8 bytes, float32 are 4 bytes
1151 
1152     //Check argument axis
1153     //TODO: implement the default and other axis
1154     long axis = PyInt_AsLong(axis_obj);
1155 
1156     if (axis != 0) {
1157         PyErr_Format(PyExc_NotImplementedError,
1158                      "CudaNdarray_TakeFrom: only axis=0 is currently supported."
1159                      " Got %ld.", axis);
1160         Py_DECREF(indices);
1161         return NULL;
1162     }
1163 
1164     //Check argument out_obj
1165     CudaNdarray * out = NULL;
1166     if (out_obj && CudaNdarray_Check(out_obj))
1167         out = (CudaNdarray*) out_obj;
1168     if (out && (out->nd != self->nd ||
1169                 CudaNdarray_HOST_DIMS(out)[0] != nb_indices))
1170         out = NULL;
1171     int * dims = (int *)malloc(sizeof(int) * self->nd);
1172     dims[0] = nb_indices;
1173 
1174     for (int i=1 ; i<self->nd ; i++) {
1175         dims[i] = CudaNdarray_HOST_DIMS(self)[i];
1176         if (out && CudaNdarray_HOST_DIMS(out)[i] != dims[i]) {
1177             out = NULL;
1178         }
1179     }
1180     if (!out) {
1181         out = (CudaNdarray*)CudaNdarray_New();
1182         if (!out){
1183             Py_DECREF(indices);
1184             free(dims);
1185             return NULL;
1186         }
1187         if (CudaNdarray_alloc_contiguous(out, self->nd, dims)) {
1188             Py_DECREF(out);
1189             Py_DECREF(indices);
1190             free(dims);
1191             return NULL;
1192         }
1193     }else {
1194         Py_INCREF(out);
1195     }
1196 
1197     //Check argument clipmode
1198     if (clipmode_obj) {
1199         char * clipmode = PyString_AsString(clipmode_obj);
1200         if (! clipmode){
1201             Py_DECREF(indices);
1202             Py_DECREF(out);
1203             free(dims);
1204             return NULL;
1205         }
1206         if (strcmp(clipmode, "raise") != 0) {
1207             PyErr_Format(PyExc_NotImplementedError,
1208                          "CudaNdarray_TakeFrom: only the raise mode is currently supported. Got '%s'",
1209                          clipmode);
1210             Py_DECREF(indices);
1211             Py_DECREF(out);
1212             free(dims);
1213             return NULL;
1214         }
1215     }
1216     void (*k3)(const int, const int, const int,
1217                const npy_int64*,
1218                float*, const int, const int, const int,
1219                const float*, const int,
1220                const int, const int, const int,
1221                int*);
1222     k3 = k_take_3<CPY>;
1223 
1224     // Create the memory place that will store the error information.
1225     if(init_err_var() != 0) return NULL;
1226 
1227     dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(out)[0],65535),1,1);
1228     if(CudaNdarray_HOST_DIMS(out)[0] == 0){
1229         // We take 0 elements, so no need for the rest of the code.
1230         // This speed up that case AND fix crash otherwise.
1231         free(dims);
1232         Py_DECREF(indices);
1233         return (PyObject *)out;
1234     }
1235 
1236     switch (self->nd) {
1237         case 1:
1238             {
1239                 dim3 n_threads(1, 1, 1);
1240                 if (verbose)
1241                     printf("cudaGetLastError=%d, nd=%d"
1242                            " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
1243                            " n_threads.x=%i, n_threads.y=%i)\n",
1244                            cudaGetLastError(), self->nd,
1245                            n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
1246                 k3<<<n_blocks, n_threads>>>(
1247                         dims[0],
1248                         1,
1249                         1,
1250                         (npy_int64*) CudaNdarray_DEV_DATA(indices),
1251                         CudaNdarray_DEV_DATA(out),
1252                         CudaNdarray_HOST_STRIDES(out)[0], //strides
1253                         1,
1254                         1,
1255                         CudaNdarray_DEV_DATA(self),
1256                         CudaNdarray_HOST_DIMS(self)[0], //For indices check
1257                         CudaNdarray_HOST_STRIDES(self)[0], //strides
1258                         1,
1259                         1,
1260                         err_var);
1261             }
1262             break;
1263         case 2:
1264             {
1265                 dim3 n_threads(std::min(CudaNdarray_HOST_DIMS(out)[1], max_threads), 1, 1);
1266 
1267                 if (verbose)
1268                     printf("cudaGetLastError=%d, nd=%d"
1269                            " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
1270                            " n_threads.x=%i, n_threads.y=%i)\n",
1271                            cudaGetLastError(), self->nd,
1272                            n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
1273 
1274                 k3<<<n_blocks, n_threads>>>(
1275                         dims[0], //dimensions
1276                         dims[1],
1277                         1,
1278                         (npy_int64*) CudaNdarray_DEV_DATA(indices),
1279                         CudaNdarray_DEV_DATA(out),
1280                         CudaNdarray_HOST_STRIDES(out)[0], //strides
1281                         CudaNdarray_HOST_STRIDES(out)[1],
1282                         1,
1283                         CudaNdarray_DEV_DATA(self),
1284                         CudaNdarray_HOST_DIMS(self)[0], //For indices check
1285                         CudaNdarray_HOST_STRIDES(self)[0], //strides
1286                         CudaNdarray_HOST_STRIDES(self)[1],
1287                         1,
1288                         err_var);
1289             }
1290             break;
1291         case 3:
1292             {
1293                 int ty = std::min(CudaNdarray_HOST_DIMS(out)[2], max_threads);
1294                 int tx = std::min(CudaNdarray_HOST_DIMS(out)[1], max_threads / ty);
1295                 dim3 n_threads(tx, ty, 1);
1296                 if (verbose)
1297                     printf("cudaGetLastError=%d, nd=%d"
1298                            " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
1299                            " n_threads.x=%i, n_threads.y=%i)\n",
1300                            cudaGetLastError(), self->nd,
1301                            n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
1302                 k3<<<n_blocks, n_threads>>>(
1303                         dims[0], //dimensions
1304                         dims[1],
1305                         dims[2],
1306                         (npy_int64*) CudaNdarray_DEV_DATA(indices),
1307                         CudaNdarray_DEV_DATA(out),
1308                         CudaNdarray_HOST_STRIDES(out)[0], //strides
1309                         CudaNdarray_HOST_STRIDES(out)[1],
1310                         CudaNdarray_HOST_STRIDES(out)[2],
1311                         CudaNdarray_DEV_DATA(self),
1312                         CudaNdarray_HOST_DIMS(self)[0], //For indices check
1313                         CudaNdarray_HOST_STRIDES(self)[0], //strides
1314                         CudaNdarray_HOST_STRIDES(self)[1],
1315                         CudaNdarray_HOST_STRIDES(self)[2],
1316                         err_var);
1317             }
1318             break;
1319     default:
1320         PyErr_SetString(PyExc_NotImplementedError,
1321                         "CudaNdarray_TakeFrom: only input with 1, 2 or 3"
1322                         " dimensions are currently supported");
1323 
1324     }
1325     free(dims);
1326     CNDA_THREAD_SYNC;
1327     cudaError_t err = cudaGetLastError();
1328     if (cudaSuccess != err) {
1329         PyErr_Format(PyExc_RuntimeError,
1330                      "Cuda error: %s: %s.\n",
1331                      "CudaNdarray_TakeFrom",
1332                      cudaGetErrorString(err));
1333         Py_DECREF(indices);
1334         Py_DECREF(out);
1335         return NULL;
1336     }
1337 
1338     int index_err = check_err_var();
1339     Py_DECREF(indices);
1340     if (index_err != 0) {
1341         Py_DECREF(out);
1342         return NULL;
1343     }
1344 
1345     if (verbose) printf("TAKE SUCCEDED\n");
1346     return (PyObject *)out;
1347 }
1348 
1349 
1350 PyObject * CudaNdarray_SetStride(CudaNdarray * self, PyObject *args)
1351 {
1352     int pos, stride;
1353     if (! PyArg_ParseTuple(args, "ii", &pos, &stride))
1354         return NULL;
1355     if ((pos < 0) || (pos >= self->nd))
1356     {
1357         PyErr_Format(PyExc_ValueError, "position argument out of legal range [0, %i)", self->nd);
1358         return NULL;
1359     }
1360     CudaNdarray_set_stride(self, pos, stride);
1361     if (cnda_copy_structure_to_device(self))
1362     {
1363         return NULL;
1364     }
1365     Py_INCREF(Py_None);
1366     return Py_None;
1367 }
1368 PyObject * CudaNdarray_SetShapeI(CudaNdarray * self, PyObject *args)
1369 {
1370     int pos, dim;
1371     if (! PyArg_ParseTuple(args, "ii", &pos, &dim))
1372         return NULL;
1373     if ((pos < 0) || (pos >= self->nd))
1374     {
1375         PyErr_Format(PyExc_ValueError, "position argument out of legal range [0, %i)", self->nd);
1376         return NULL;
1377     }
1378     CudaNdarray_set_dim(self, pos, dim);
1379     if (cnda_copy_structure_to_device(self))
1380     {
1381         return NULL;
1382     }
1383     Py_INCREF(Py_None);
1384     return Py_None;
1385 }
1386 
1387 static PyObject *
1388 CudaNdarray_exp(CudaNdarray* self)
1389 {
1390     CudaNdarray * rval = (CudaNdarray *)CudaNdarray_New();
1391     if ((NULL == rval) || CudaNdarray_alloc_contiguous(rval, self->nd, CudaNdarray_HOST_DIMS(self)))
1392     {
1393         Py_XDECREF(rval);
1394         return NULL;
1395     }
1396     unsigned int size = 1;
1397     for (int i = 0; i < self->nd; i++)
1398     {
1399         size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
1400     }
1401     unsigned int threads_per_block = std::min(size, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
1402     unsigned int n_blocks = std::min(ceil_intdiv(size,threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
1403     k_elemwise_unary_rowmajor_exp<<<n_blocks,threads_per_block>>>(size, self->nd, CudaNdarray_DEV_DIMS(self),
1404             CudaNdarray_DEV_DATA(self), CudaNdarray_DEV_STRIDES(self),
1405             CudaNdarray_DEV_DATA(rval), CudaNdarray_DEV_STRIDES(rval));
1406 
1407     //TODO: don't do this right away, do it when we need the result
1408     CNDA_THREAD_SYNC;
1409     cudaError_t err = cudaGetLastError();
1410     if( cudaSuccess != err)
1411     {
1412         Py_DECREF(rval);
1413         PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kExp", cudaGetErrorString(err));
1414         return NULL;
1415     }
1416 
1417     return (PyObject*)rval;
1418 }
1419 
1420 static PyMethodDef CudaNdarray_methods[] =
1421 {
1422     {"__array__",
1423         (PyCFunction)CudaNdarray_CreateArrayObj, METH_VARARGS,
1424         "Copy from the device to a numpy ndarray"},
1425     {"__copy__",
1426         (PyCFunction)CudaNdarray_View, METH_NOARGS,
1427         "Create a shallow copy of this object. used by module copy"},
1428     {"__deepcopy__",
1429         (PyCFunction)CudaNdarray_DeepCopy, METH_O,
1430         "Create a copy of this object"},
1431     {"zeros",
1432         (PyCFunction)CudaNdarray_Zeros, METH_STATIC | METH_O,
1433         "Create a new CudaNdarray with specified shape, filled with zeros."},
1434     {"copy",
1435         (PyCFunction)CudaNdarray_Copy, METH_NOARGS,
1436         "Create a copy of this object"},
1437     {"is_c_contiguous",
1438         (PyCFunction)CudaNdarray_IS_C_Contiguous, METH_NOARGS,
1439         "Return True is the object is c contiguous. False otherwise."},
1440     {"reduce_sum",
1441         (PyCFunction)CudaNdarray_ReduceSum, METH_O,
1442         "Reduce over the given dimensions by summation"},
1443     {"exp",
1444         (PyCFunction)CudaNdarray_exp, METH_NOARGS,
1445         "Return the exponential of all elements"},
1446     {"reshape",
1447         (PyCFunction)CudaNdarray_Reshape, METH_O,
1448         "Return a reshaped view (or copy) of this ndarray\n\
1449             The required argument is a tuple of integers specifying the shape of the new ndarray."},
1450     {"view",
1451         (PyCFunction)CudaNdarray_View, METH_NOARGS,
1452         "Return an alias of this ndarray"},
1453     {"_set_stride",
1454         (PyCFunction)CudaNdarray_SetStride, METH_VARARGS,
1455         "For integer arguments (i, s), set the 'i'th stride to 's'"},
1456     {"take",
1457         (PyCFunction)CudaNdarray_TakeFrom, METH_VARARGS,
1458         "Equivalent of numpy.take"},
1459     {"_set_shape_i",
1460         (PyCFunction)CudaNdarray_SetShapeI, METH_VARARGS,
1461         "For integer arguments (i, s), set the 'i'th shape to 's'"},
1462     {NULL, NULL, 0, NULL}  /* Sentinel */
1463 };
1464 
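// [Editorial sketch] The table above is what exposes these C functions as
// ordinary Python methods. From C they can be exercised through the
// generic call protocol; `arr` is a hypothetical CudaNdarray reference:
//
//     PyObject *host = PyObject_CallMethod(arr, "__array__", NULL);
//     PyObject *flat = PyObject_CallMethod(arr, "reshape", "((i))", 100);
//     Py_XDECREF(host);
//     Py_XDECREF(flat);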
1465 
1466 ////////////////////
1467 // Number protocol
1468 ////////////////////
1469 
1470 __global__ void kAdd_contiguous(float* a, float* b, float* dest, unsigned int numEls) {
1471     const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
1472     const unsigned int numThreads = blockDim.x * gridDim.x;
1473 
1474     for (unsigned int i = idx; i < numEls; i += numThreads) {
1475         dest[i] = a[i] + b[i];
1476     }
1477 }
1478 
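// [Editorial note] kAdd_contiguous uses the grid-stride loop idiom: each
// thread starts at its global index and advances by the total number of
// launched threads, so the kernel stays correct for any launch geometry,
// even one with fewer threads than numEls. A minimal launch, assuming
// hypothetical device pointers d_a, d_b, d_dest:
//
//     kAdd_contiguous<<<256, 256>>>(d_a, d_b, d_dest, 1 << 20);
//     // 65536 threads, each covering 16 of the 2^20 elements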
1479 // Will be called by __add__ in Python
1480 static PyObject *
1481 CudaNdarray_add(PyObject* py_self, PyObject * py_other)
1482 {
1483     if (! CudaNdarray_Check(py_self)) {
1484         PyErr_SetString(PyExc_TypeError, "need a CudaNdarray on left");
1485         return NULL;
1486     }
1487     if (! CudaNdarray_Check(py_other)) {
1488         PyErr_SetString(PyExc_TypeError, "need a CudaNdarray on right");
1489         return NULL;
1490     }
1491     CudaNdarray * self = (CudaNdarray *)py_self;
1492     CudaNdarray * other = (CudaNdarray *)py_other;
1493     if(!CudaNdarray_is_c_contiguous(self) || !CudaNdarray_is_c_contiguous(other)){
1494         PyErr_SetString(PyExc_TypeError, "We have implemented only the c_contiguous version for now.");
1495         return NULL;
1496     }
1497 
1498     //standard elemwise size checks
1499     if (self->nd != other->nd)
1500     {
1501         PyErr_SetString(PyExc_TypeError, "CudaNdarray_add: need same number of dims");
1502         return NULL;
1503     }
1504     //standard elemwise dim checks
1505     unsigned int size = 1;
1506     for (int i = 0; i< self->nd; ++i)
1507     {
1508         if (CudaNdarray_HOST_DIMS(self)[i] != CudaNdarray_HOST_DIMS(other)[i])
1509         {
1510             PyErr_SetString(PyExc_TypeError, "need same dimensions");
1511             return NULL;
1512         }
1513         size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
1514     }
1515     CudaNdarray * rval = (CudaNdarray *)CudaNdarray_New();
1516     if (!rval || CudaNdarray_alloc_contiguous(rval, self->nd, CudaNdarray_HOST_DIMS(self)))
1517     {
1518         Py_XDECREF(rval);
1519         return NULL;
1520     }
1521 
1522     if(CudaNdarray_SIZE((CudaNdarray *)py_self)==0 && CudaNdarray_SIZE((CudaNdarray *)py_other)==0){
1523       return (PyObject *) rval;
1524     }
1525 
1526     int threads_per_block = std::min(size, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
1527     int n_blocks = std::min(ceil_intdiv(size,(unsigned int)threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
1528     kAdd_contiguous<<<n_blocks,threads_per_block>>>(
1529             self->devdata, other->devdata, rval->devdata, size);
1530     CNDA_THREAD_SYNC;
1531     cudaError_t err = cudaGetLastError();
1532     if( cudaSuccess != err)
1533     {
1534         PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kAdd", cudaGetErrorString(err));
1535         Py_DECREF(rval);
1536         return NULL;
1537     }
1538     return (PyObject *) rval;
1539 }
1540 
1541 template <int operator_num>
1542 __global__ void k_ielem_3(const int d0, const int d1, const int d2,
1543         float* a, const int sA0, const int sA1, const int sA2,
1544         const float* b, const int sB0, const int sB1, const int sB2){
1545     for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
1546         for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){
1547             for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x){
1548                 switch (operator_num)
1549                 {
1550                   case IADD:
1551                     a[i0*sA0 + i1*sA1 + i2*sA2] += b[i0*sB0 + i1*sB1 + i2*sB2];
1552                     break;
1553                   case IDIV:
1554                     a[i0*sA0 + i1*sA1 + i2*sA2] /= b[i0*sB0 + i1*sB1 + i2*sB2];
1555                     break;
1556                   case CPY:
1557                     a[i0*sA0 + i1*sA1 + i2*sA2] = b[i0*sB0 + i1*sB1 + i2*sB2];
1558                     break;
1559                 }
1560             }
1561         }
1562     }
1563 }
1564 
1565 template <int operator_num>
1566 __global__ void k_ielem_4(const int d0, const int d1, const int d2, const int d3,
1567                          float* a, const int sA0, const int sA1,
1568                          const int sA2, const int sA3,
1569                          const float* b, const int sB0, const int sB1,
1570                          const int sB2, const int sB3){
1571     for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
1572         for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){
1573             for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x){
1574                 for (int i3 = threadIdx.y; i3 < d3; i3 += blockDim.y){
1575                     switch (operator_num) {
1576                         case IADD:
1577                             a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3]
1578                             += b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3];
1579                             break;
1580                         case IDIV:
1581                             a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3]
1582                             /= b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3];
1583                             break;
1584                         case CPY:
1585                             a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3]
1586                             = b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3];
1587                             break;
1588                     }
1589                 }
1590             }
1591         }
1592     }
1593 }
1594 
1595 template <int operator_num>
1596 __global__ void k_ielem_6(const int d0, const int d1,
1597                           const int d2, const int d3,
1598                           const int d4, const int d5,
1599                           float* a, const int sA0, const int sA1,
1600                           const int sA2, const int sA3,
1601                           const int sA4, const int sA5,
1602                           const float* b, const int sB0, const int sB1,
1603                           const int sB2, const int sB3,
1604                           const int sB4, const int sB5
1605                           ){
1606     for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
1607         for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){
1608             for (int i2 = blockIdx.z; i2 < d2; i2 += gridDim.z){
1609                 for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x){
1610                     for (int i4 = threadIdx.y; i4 < d4; i4 += blockDim.y){
1611                         for (int i5 = threadIdx.z; i5 < d5; i5 += blockDim.z){
1612                             switch (operator_num) {
1613                             case IADD:
1614                                 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5]
1615                                     += b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3 + i4*sB4 + i5*sB5];
1616                                 break;
1617                             case IDIV:
1618                                 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5]
1619                                     /= b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3 + i4*sB4 + i5*sB5];
1620                                 break;
1621                             case CPY:
1622                                 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5]
1623                                     = b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3 + i4*sB4 + i5*sB5];
1624                                 break;
1625                             }
1626                         }
1627                     }
1628                 }
1629             }
1630         }
1631     }
1632 }
1633 
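// [Editorial note] The three kernels above share one addressing scheme:
// the outer loops map onto grid dimensions and the inner loops onto block
// dimensions, and each element is located by a flat, stride-weighted
// offset. For k_ielem_6 that offset is
//
//     offset(a, i0..i5) = i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5
//
// which is why broadcasting can later be expressed by simply passing a
// stride of 0 for a broadcast dimension.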
1634 /*
1635 CudaNdarray_inplace_elemwise
1636 Compute elemwise, working inplace on A.
1637 Currently implemented A / B, A + B and A = B
1638 (the last is not tested and not used!)
1639 
1640 py_self - the CudaNdarray that we'll modify (A)
1641 py_other - the other argument (B)
1642 fct_nb - which operation to perform (operator_t)
1643 
1644 Returns 0 on success.
1645 Returns -1 on failure, and sets Python exception.
1646 
1647 */
1648 int
1649 CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t fct_nb)
1650 {
1651     int verbose = 0;
1652     void (*k3)(const int, const int, const int,
1653                     float*, const int, const int, const int,
1654                     const float*, const int, const int, const int);
1655     void (*k4)(const int, const int, const int, const int,
1656                     float*, const int, const int,
1657                     const int, const int,
1658                     const float*, const int, const int,
1659                     const int, const int);
1660     void (*k6)(const int, const int,
1661                const int, const int,
1662                const int, const int,
1663                float*, const int, const int,
1664                const int, const int,
1665                const int, const int,
1666                const float*, const int, const int,
1667                const int, const int,
1668                const int, const int);
1669     switch (fct_nb)
1670     {
1671         case IADD:
1672             k3 = k_ielem_3<IADD>;
1673             k4 = k_ielem_4<IADD>;
1674             k6 = k_ielem_6<IADD>;
1675             break;
1676         case IDIV:
1677             k3 = k_ielem_3<IDIV>;
1678             k4 = k_ielem_4<IDIV>;
1679             k6 = k_ielem_6<IDIV>;
1680             break;
1681         case CPY:
1682             k3 = k_ielem_3<CPY>;
1683             k4 = k_ielem_4<CPY>;
1684             k6 = k_ielem_6<CPY>;
1685             break;
1686         default:
1687             assert (0);
1688             PyErr_Format(
1689                 PyExc_TypeError,
1690                 "CudaNdarray_inplace_elemwise invalid fct_nb (%i).",
1691                 (int)fct_nb);
1692             return -1;
1693     }
1694     if (!CudaNdarray_Check(py_self)) {
1695         PyErr_SetString(
1696             PyExc_TypeError,
1697             "CudaNdarray_inplace_elemwise need a CudaNdarray on left");
1698         return -1;
1699     }
1700     CudaNdarray * new_other = NULL;
1701     if (!CudaNdarray_Check(py_other)) {
1702         new_other = (CudaNdarray*) CudaNdarray_New();
1703         if(!new_other)
1704         {
1705             return -1;
1706         }
1707         if(CudaNdarray_CopyFromArray(new_other, (PyArrayObject *) py_other))
1708         {
1709             Py_XDECREF(new_other);
1710             return -1;
1711         }
1712         py_other = (PyObject *) new_other;
1713     }
1714 
1715     CudaNdarray * self = (CudaNdarray *)py_self;
1716     CudaNdarray * other = (CudaNdarray *)py_other;
1717 
1718     if (verbose)
1719     {
1720         fprintf(stderr,
1721             "INPLACE ADD/DIV for self->nd=%d other->nd=%d\n",
1722             self->nd, other->nd);
1723     }
1724 
1725     //standard elemwise nb dim checks
1726     if (self->nd < other->nd)
1727     {
1728         PyErr_Format(
1729             PyExc_TypeError,
1730             "CudaNdarray_inplace_elemwise: The destination needs the same"
1731             " or a greater number of dimensions than the source. Got %d and %d.",
1732             self->nd, other->nd);
1733         Py_XDECREF(new_other);
1734         return -1;
1735     }
1736 
1737     //broadcast to the same number of dimensions.
1738     int* other_dims = (int*) alloca(self->nd * sizeof(int));
1739     int* other_strides = (int*) alloca(self->nd * sizeof(int));
1740     int added_dims = self->nd - other->nd;
1741     // Add the added broadcasted dimensions
1742     for (int i = 0; i< added_dims; ++i)
1743     {
1744         other_dims[i] = 1;
1745         other_strides[i] = 0;
1746     }
1747     // Copy the existing dimensions
1748     for (int i = 0; i< other->nd; ++i)
1749     {
1750         other_dims[i+added_dims] = CudaNdarray_HOST_DIMS(other)[i];
1751         other_strides[i+added_dims] = CudaNdarray_HOST_STRIDES(other)[i];
1752     }
1753 
1754     //standard elemwise dim checks
1755     unsigned int size = 1;
1756     for (int i = 0; i< self->nd; ++i)
1757     {
1758         if ((CudaNdarray_HOST_DIMS(self)[i] != other_dims[i])
1759             && (other_dims[i] != 1))
1760         {
1761             PyErr_SetString(
1762                 PyExc_ValueError,
1763                 "CudaNdarray_inplace_elemwise need same dimensions (or broadcastable dimension)");
1764             Py_XDECREF(new_other);
1765             return -1;
1766         }
1767         // if we're broadcasting other, then make sure it has stride 0
1768         assert ((CudaNdarray_HOST_DIMS(self)[i] == other_dims[i])
1769             || (other_strides[i] == 0));
1770         size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
1771     }
1772 
1773     if (size==0)
1774     {
1775         int other_size = CudaNdarray_SIZE((CudaNdarray *)py_other);
1776         if (!(other_size == 0 || other_size == 1))
1777         {
1778             PyErr_SetString(
1779                 PyExc_ValueError,
1780                 "CudaNdarray_inplace_elemwise cannot work in place on an"
1781                 " uninitialized array when the new value has more than"
1782                 " 0 or 1 broadcastable dimensions");
1783             Py_XDECREF(new_other);
1784             return -1; // an exception was set above, so report failure
1785         }
1786         Py_XDECREF(new_other);
1787         return 0;
1788     }
1789 
1790     switch(self->nd)
1791     {
1792         case 0:
1793             {
1794                 dim3 n_blocks(1, 1, 1);
1795                 dim3 n_threads(1);
1796                 k3<<<n_blocks, n_threads>>>(
1797                         1, //d0
1798                         1, //d1
1799                         1, //d2
1800                         CudaNdarray_DEV_DATA(self),
1801                         1, //strides
1802                         1,
1803                         1,
1804                         CudaNdarray_DEV_DATA(other),
1805                         1, //strides
1806                         1,
1807                         1);
1808                 CNDA_THREAD_SYNC;
1809                 cudaError_t err = cudaGetLastError();
1810                 if (cudaSuccess != err)
1811                 {
1812                     PyErr_Format(
1813                         PyExc_RuntimeError,
1814                         "CudaNdarray_inplace_elemwise case0: Cuda error: %s: %s.\n",
1815                         "k3",
1816                         cudaGetErrorString(err));
1817                     Py_XDECREF(new_other);
1818                     return -1;
1819                 }
1820             }
1821             break;
1822         case 1:
1823             {
1824                 dim3 n_blocks(1, 1, 1);
1825                 dim3 n_threads(
1826                         std::min(
1827                             CudaNdarray_HOST_DIMS(self)[0],
1828                             NUM_VECTOR_OP_THREADS_PER_BLOCK));
1829                 k3<<<n_blocks, n_threads>>>(
1830                         1, //dimensions
1831                         1,
1832                         CudaNdarray_HOST_DIMS(self)[0],
1833                         CudaNdarray_DEV_DATA(self),
1834                         1, //strides
1835                         1,
1836                         CudaNdarray_HOST_STRIDES(self)[0],
1837                         CudaNdarray_DEV_DATA(other),
1838                         1, //strides
1839                         1,
1840                         other_strides[0]);
1841                 CNDA_THREAD_SYNC;
1842                 cudaError_t err = cudaGetLastError();
1843                 if (cudaSuccess != err)
1844                 {
1845                     PyErr_Format(
1846                         PyExc_RuntimeError,
1847                         "CudaNdarray_inplace_elemwise case1: Cuda error: %s: %s.\n",
1848                         "k3",
1849                         cudaGetErrorString(err));
1850                     Py_XDECREF(new_other);
1851                     return -1;
1852                 }
1853             }
1854             break;
1855         case 2:
1856             {
1857                 //TODO:  if both self and other are f-contiguous
1858                 //       Then flip the block and thread dimensions
1859                 //       to make contiguous reads & writes
1860                 dim3 n_blocks(1,
1861                         std::min(
1862                             CudaNdarray_HOST_DIMS(self)[0],
1863                             NUM_VECTOR_OP_BLOCKS));
1864                 dim3 n_threads(
1865                         std::min(
1866                             CudaNdarray_HOST_DIMS(self)[1],
1867                             NUM_VECTOR_OP_THREADS_PER_BLOCK));
1868                 k3<<<n_blocks, n_threads>>>(1,
1869                         CudaNdarray_HOST_DIMS(self)[0],
1870                         CudaNdarray_HOST_DIMS(self)[1],
1871                         CudaNdarray_DEV_DATA(self),
1872                         1,
1873                         CudaNdarray_HOST_STRIDES(self)[0],
1874                         CudaNdarray_HOST_STRIDES(self)[1],
1875                         CudaNdarray_DEV_DATA(other),
1876                         1,
1877                         other_strides[0],
1878                         other_strides[1]);
1879                 CNDA_THREAD_SYNC;
1880                 cudaError_t err = cudaGetLastError();
1881                 if (cudaSuccess != err)
1882                 {
1883                     PyErr_Format(
1884                         PyExc_RuntimeError,
1885                         "CudaNdarray_inplace_elemwise case2: Cuda error: %s: %s.\n",
1886                         "k3",
1887                         cudaGetErrorString(err));
1888                     Py_XDECREF(new_other);
1889                     return -1;
1890                 }
1891             }
1892             break;
1893         case 3:
1894             {
1895                 //TODO:  Dimshuffle so that at least one of the arrays
1896                 //       has a contiguous dimension on the thread idx.
1897                 dim3 n_blocks(
1898                         std::min(
1899                             CudaNdarray_HOST_DIMS(self)[0],
1900                             NUM_VECTOR_OP_BLOCKS),
1901                         CudaNdarray_HOST_DIMS(self)[1]);
1902                 while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
1903                     n_blocks.y /= 2;
1904                 dim3 n_threads(
1905                         std::min(
1906                             CudaNdarray_HOST_DIMS(self)[2],
1907                             NUM_VECTOR_OP_THREADS_PER_BLOCK));
1908                 k3<<<n_blocks, n_threads>>>(
1909                         CudaNdarray_HOST_DIMS(self)[0],
1910                         CudaNdarray_HOST_DIMS(self)[1],
1911                         CudaNdarray_HOST_DIMS(self)[2],
1912                         CudaNdarray_DEV_DATA(self),
1913                         CudaNdarray_HOST_STRIDES(self)[0],
1914                         CudaNdarray_HOST_STRIDES(self)[1],
1915                         CudaNdarray_HOST_STRIDES(self)[2],
1916                         CudaNdarray_DEV_DATA(other),
1917                         other_strides[0],
1918                         other_strides[1],
1919                         other_strides[2]);
1920                 CNDA_THREAD_SYNC;
1921                 cudaError_t err = cudaGetLastError();
1922                 if (cudaSuccess != err)
1923                 {
1924                     PyErr_Format(
1925                         PyExc_RuntimeError,
1926                         "CudaNdarray_inplace_elemwise case3: Cuda error: %s: %s.\n",
1927                         "k3",
1928                         cudaGetErrorString(err));
1929                     Py_XDECREF(new_other);
1930                     return -1;
1931                 }
1932             }
1933             break;
1934         case 4:
1935             {
1936                 dim3 n_blocks(
1937                         std::min(
1938                             CudaNdarray_HOST_DIMS(self)[0],
1939                             NUM_VECTOR_OP_BLOCKS),
1940                         CudaNdarray_HOST_DIMS(self)[1]
1941                         );
1942                 while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
1943                     n_blocks.y /= 2;
1944                 dim3 n_threads(
1945                         std::min(
1946                             CudaNdarray_HOST_DIMS(self)[2],
1947                             NUM_VECTOR_OP_THREADS_PER_BLOCK)
1948                     //TODO: DON'T YOU NEED TO PUT DIMS[3] in here???
1949                             );
1950                 k4<<<n_blocks, n_threads>>>(
1951                         CudaNdarray_HOST_DIMS(self)[0],
1952                         CudaNdarray_HOST_DIMS(self)[1],
1953                         CudaNdarray_HOST_DIMS(self)[2],
1954                         CudaNdarray_HOST_DIMS(self)[3],
1955                         CudaNdarray_DEV_DATA(self),
1956                         CudaNdarray_HOST_STRIDES(self)[0],
1957                         CudaNdarray_HOST_STRIDES(self)[1],
1958                         CudaNdarray_HOST_STRIDES(self)[2],
1959                         CudaNdarray_HOST_STRIDES(self)[3],
1960                         CudaNdarray_DEV_DATA(other),
1961                         other_strides[0],
1962                         other_strides[1],
1963                         other_strides[2],
1964                         other_strides[3]);
1965                 CNDA_THREAD_SYNC;
1966                 cudaError_t err = cudaGetLastError();
1967                 if (cudaSuccess != err)
1968                 {
1969                     PyErr_Format(
1970                         PyExc_RuntimeError,
1971                         "CudaNdarray_inplace_elemwise case4: Cuda error: %s: %s.\n",
1972                         "k4",
1973                         cudaGetErrorString(err));
1974                     Py_XDECREF(new_other);
1975                     return -1;
1976                 }
1977             }
1978             break;
1979         case 5:
1980             {
1981                 dim3 n_blocks(
1982                         std::min(
1983                             CudaNdarray_HOST_DIMS(self)[1],
1984                             NUM_VECTOR_OP_BLOCKS),
1985                         CudaNdarray_HOST_DIMS(self)[2]);
1986                 while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
1987                     n_blocks.y /= 2;
1988                 dim3 n_threads(
1989                         std::min(
1990                             CudaNdarray_HOST_DIMS(self)[3],
1991                             NUM_VECTOR_OP_THREADS_PER_BLOCK)
1992                     //TODO: DON'T YOU NEED TO PUT DIMS[4] in here???
1993                     );
1994                 for (int i = 0; i < CudaNdarray_HOST_DIMS(self)[0]; ++i)
1995                 {
1996                      k4<<<n_blocks, n_threads>>>(
1997                             CudaNdarray_HOST_DIMS(self)[1],
1998                             CudaNdarray_HOST_DIMS(self)[2],
1999                             CudaNdarray_HOST_DIMS(self)[3],
2000                             CudaNdarray_HOST_DIMS(self)[4],
2001                             CudaNdarray_DEV_DATA(self) + i * CudaNdarray_HOST_STRIDES(self)[0],
2002                             CudaNdarray_HOST_STRIDES(self)[1],
2003                             CudaNdarray_HOST_STRIDES(self)[2],
2004                             CudaNdarray_HOST_STRIDES(self)[3],
2005                             CudaNdarray_HOST_STRIDES(self)[4],
2006                             CudaNdarray_DEV_DATA(other) + i * other_strides[0],
2007                             other_strides[1],
2008                             other_strides[2],
2009                             other_strides[3],
2010                             other_strides[4]);
2011                     CNDA_THREAD_SYNC;
2012                     cudaError_t err = cudaGetLastError();
2013                     if( cudaSuccess != err)
2014                     {
2015                         PyErr_Format(
2016                             PyExc_RuntimeError,
2017                             "CudaNdarray_inplace_elemwise case5: Cuda error: %s: %s. n_blocks=(%ld,%ld) n_threads=%ld\n",
2018                             "k5 with loop over k4",
2019                             cudaGetErrorString(err),
2020                             (long) n_blocks.x, (long) n_blocks.y, (long) n_threads.x);
2021                         Py_XDECREF(new_other);
2022                         return -1;
2023                     }
2024                 }
2025             }
2026             break;
2027         case 6:
2028             {
2029                 dim3 n_blocks(
2030                         std::min(
2031                             CudaNdarray_HOST_DIMS(self)[0],
2032                             NUM_VECTOR_OP_BLOCKS),
2033                         CudaNdarray_HOST_DIMS(self)[1],
2034                         CudaNdarray_HOST_DIMS(self)[2]
2035                         );
2036                 while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
2037                     n_blocks.y /= 2;
2038                 // The GTX 285 (compute capability 1.3) doesn't support n_blocks.z > 1;
2039                 // compute capability 2.0 supports up to 65535 for n_blocks.z
2040                 //while (n_blocks.x * n_blocks.y * n_blocks.z > NUM_VECTOR_OP_BLOCKS)
2041                 //    n_blocks.z /= 2;
2042                 n_blocks.z = 1;
2043                 dim3 n_threads(
2044                         std::min(
2045                             CudaNdarray_HOST_DIMS(self)[3],
2046                             NUM_VECTOR_OP_THREADS_PER_BLOCK)
2047                     //TODO: DON'T YOU NEED TO PUT DIMS[4] in here???
2048                     //TODO: DON'T YOU NEED TO PUT DIMS[5] in here???
2049                             );
2050                 k6<<<n_blocks, n_threads>>>(
2051                         CudaNdarray_HOST_DIMS(self)[0],
2052                         CudaNdarray_HOST_DIMS(self)[1],
2053                         CudaNdarray_HOST_DIMS(self)[2],
2054                         CudaNdarray_HOST_DIMS(self)[3],
2055                         CudaNdarray_HOST_DIMS(self)[4],
2056                         CudaNdarray_HOST_DIMS(self)[5],
2057                         CudaNdarray_DEV_DATA(self),
2058                         CudaNdarray_HOST_STRIDES(self)[0],
2059                         CudaNdarray_HOST_STRIDES(self)[1],
2060                         CudaNdarray_HOST_STRIDES(self)[2],
2061                         CudaNdarray_HOST_STRIDES(self)[3],
2062                         CudaNdarray_HOST_STRIDES(self)[4],
2063                         CudaNdarray_HOST_STRIDES(self)[5],
2064                         CudaNdarray_DEV_DATA(other),
2065                         other_strides[0],
2066                         other_strides[1],
2067                         other_strides[2],
2068                         other_strides[3],
2069                         other_strides[4],
2070                         other_strides[5]);
2071                 CNDA_THREAD_SYNC;
2072                 cudaError_t err = cudaGetLastError();
2073                 if (cudaSuccess != err)
2074                 {
2075                     PyErr_Format(
2076                         PyExc_RuntimeError,
2077                         "CudaNdarray_inplace_elemwise case6: Cuda error: %s: %s. n_blocks=(%ld, %ld, %ld) n_threads=(%ld)\n",
2078                         "k6",
2079                         cudaGetErrorString(err),
2080                         (long) n_blocks.x, (long) n_blocks.y, (long) n_blocks.z,
2081                         (long) n_threads.x);
2082                     Py_XDECREF(new_other);
2083                     return -1;
2084                 }
2085             }
2086             break;
2087         default:
2088         {
2089             PyErr_Format(
2090                 PyExc_NotImplementedError,
2091                 "inplace_elemwise with nd=%i\n",
2092                 self->nd);
2093             Py_XDECREF(new_other);
2094             return -1;
2095         }
2096     }
2097     if (verbose)
2098         fprintf(stderr, "INPLACE ADD/DIV end\n");
2099     Py_XDECREF(new_other);
2100     return 0;
2101 }
2102 
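// [Editorial sketch] The broadcasting step above, in isolation: the right
// operand is left-padded with size-1 dimensions, and every broadcast
// dimension gets stride 0, so the kernels re-read the same element across
// that axis. A worked example with hypothetical shapes:
//
//     // self: shape (4, 5); other: shape (5,) with strides (1,)
//     // padded other_dims    = {1, 5}
//     // padded other_strides = {0, 1}   // stride 0 broadcasts axis 0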
2103 /*
2104  * We need this inplace Add to support IncSubTensor
2105  * It returns py_self on success with an additional reference. Else NULL.
2106  */
2107 // Will be called by __iadd__ in Python
2108 PyObject *
2109 CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other)
2110 {
2111     if (CudaNdarray_inplace_elemwise(py_self, py_other, IADD))
2112     {
2113         return NULL;
2114     }
2115     Py_INCREF(py_self);
2116     return py_self;
2117 }
2118 
2119 /*
2120  * We need this inplace div for cuda/tests/test_basic_ops.py:test_shared_options
2121  * It returns py_self on success with an additional reference. Else NULL.
2122  */
2123 // Will be called by __idiv__ in Python
2124 static PyObject *
2125 CudaNdarray_inplace_div(PyObject* py_self, PyObject * py_other)
2126 {
2127     if (CudaNdarray_inplace_elemwise(py_self, py_other, IDIV))
2128     {
2129         return NULL;
2130     }
2131     Py_INCREF(py_self);
2132     return py_self;
2133 }
2134 
2135 // The PyNumberMethods struct layout changed in a non-trivial way from Python 2 to Python 3.
2136 #if PY_MAJOR_VERSION == 3
2137 static PyNumberMethods CudaNdarrayNumberMethods =
2138 {
2139     (binaryfunc)CudaNdarray_add,  //binaryfunc nb_add;  __add__
2140     0,  //binaryfunc nb_subtract;
2141     0,  //binaryfunc nb_multiply;
2142     0,  //binaryfunc nb_remainder;
2143     0,  //binaryfunc nb_divmod;
2144     0,  //ternaryfunc nb_power;
2145     0,  //unaryfunc nb_negative;
2146     0,  //unaryfunc nb_positive;
2147     0,  //unaryfunc nb_absolute;
2148     0,  //inquiry nb_bool;
2149     0,  //unaryfunc nb_invert;
2150     0,  //binaryfunc nb_lshift;
2151     0,  //binaryfunc nb_rshift;
2152     0,  //binaryfunc nb_and;
2153     0,  //binaryfunc nb_xor;
2154     0,  //binaryfunc nb_or;
2155     0,  //unaryfunc nb_int;
2156     0,  //void *nb_reserved;
2157     0,  //unaryfunc nb_float;
2158 
2159     (binaryfunc)CudaNdarray_inplace_add,  //binaryfunc nb_inplace_add;  __iadd__
2160     0,  //binaryfunc nb_inplace_subtract;
2161     0,  //binaryfunc nb_inplace_multiply;
2162     0,  //binaryfunc nb_inplace_remainder;
2163     0,  //ternaryfunc nb_inplace_power;
2164     0,  //binaryfunc nb_inplace_lshift;
2165     0,  //binaryfunc nb_inplace_rshift;
2166     0,  //binaryfunc nb_inplace_and;
2167     0,  //binaryfunc nb_inplace_xor;
2168     0,  //binaryfunc nb_inplace_or;
2169 
2170     0,  //binaryfunc nb_floor_divide;
2171     0,  //binaryfunc nb_true_divide;
2172     0,  //binaryfunc nb_inplace_floor_divide;
2173     (binaryfunc)CudaNdarray_inplace_div,  //binaryfunc nb_inplace_true_divide;        __itruediv__
2174 
2175     0,  //unaryfunc nb_index
2176 };
2177 #else
2178 static PyNumberMethods CudaNdarrayNumberMethods =
2179 {
2180     (binaryfunc)CudaNdarray_add,  //binaryfunc nb_add;  __add__
2181     0,  //binaryfunc nb_subtract;      __sub__
2182     0,  //binaryfunc nb_multiply;      __mul__
2183     0,  //binaryfunc nb_divide;        __div__
2184     0,  //binaryfunc nb_remainder;     __mod__
2185     0,  //binaryfunc nb_divmod;        __divmod__
2186     0,  //ternaryfunc nb_power;        __pow__
2187     0,  //unaryfunc nb_negative;       __neg__
2188     0,  //unaryfunc nb_positive;       __pos__
2189     0,  //unaryfunc nb_absolute;       __abs__
2190     0,  //inquiry nb_nonzero;          __nonzero__     /* Used by PyObject_IsTrue */
2191     0,  //unaryfunc nb_invert;         __invert__
2192     0,  //binaryfunc nb_lshift;        __lshift__
2193     0,  //binaryfunc nb_rshift;        __rshift__
2194     0,  //binaryfunc nb_and;           __and__
2195     0,  //binaryfunc nb_xor;           __xor__
2196     0,  //binaryfunc nb_or;            __or__
2197     0,  //coercion nb_coerce;          __coerce__     /* Used by the coerce() function */
2198     0,  //unaryfunc nb_int;            __int__
2199     0,  //unaryfunc nb_long;           __long__
2200     0,  //unaryfunc nb_float;          __float__
2201     0,  //unaryfunc nb_oct;            __oct__
2202     0,  //unaryfunc nb_hex;            __hex__
2203 
2204     /* Added in release 2.0 */
2205     (binaryfunc)CudaNdarray_inplace_add,  //binaryfunc nb_inplace_add;  __iadd__
2206     0,  //binaryfunc nb_inplace_subtract;      __isub__
2207     0,  //binaryfunc nb_inplace_multiply;      __imul__
2208     (binaryfunc)CudaNdarray_inplace_div,  //binaryfunc nb_inplace_divide;        __idiv__
2209     0,  //binaryfunc nb_inplace_remainder;     __imod__
2210     0,  //ternaryfunc nb_inplace_power;        __ipow__
2211     0,  //binaryfunc nb_inplace_lshift;        __ilshift__
2212     0,  //binaryfunc nb_inplace_rshift;        __irshift__
2213     0,  //binaryfunc nb_inplace_and;           __iand__
2214     0,  //binaryfunc nb_inplace_xor;           __ixor__
2215     0,  //binaryfunc nb_inplace_or;            __ior__
2216 
2217     /* Added in release 2.2 */
2218     0,  //binaryfunc nb_floor_divide;          __floordiv__
2219     0,  //binaryfunc nb_true_divide;           __truediv__
2220     0,  //binaryfunc nb_inplace_floor_divide;  __ifloordiv__
2221     (binaryfunc)CudaNdarray_inplace_div,  //binaryfunc nb_inplace_true_divide;   __itruediv__
2222 
2223 #if PY_MINOR_VERSION > 4
2224     /* Added in release 2.5 */
2225     0  //unaryfunc nb_index;  __index__
2226 #endif
2227 };
2228 #endif
2229 
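// [Editorial sketch] Assuming this struct is installed as the type's
// tp_as_number slot (the type object sits outside this excerpt), the
// abstract number API routes straight to the CUDA implementations; for
// two CudaNdarray references a and b:
//
//     PyObject *sum = PyNumber_Add(a, b);         // -> CudaNdarray_add
//     PyObject *acc = PyNumber_InPlaceAdd(a, b);  // -> CudaNdarray_inplace_add
//     Py_XDECREF(sum);
//     Py_XDECREF(acc);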
2230 
2231 /////////////////////
2232 // Mapping protocol
2233 /////////////////////
2234 
2235 // Will be called by __len__ in Python
2236 static Py_ssize_t
2237 CudaNdarray_len(PyObject * py_self)
2238 {
2239     CudaNdarray * self = (CudaNdarray*) py_self;
2240     if (self->nd <= 0)
2241     {
2242         return (Py_ssize_t) 0;
2243     }
2244     else
2245     {
2246         return (Py_ssize_t) CudaNdarray_HOST_DIMS(self)[0];
2247     }
2248 }
2249 
2250 // Will be called by __getitem__ in Python
2251 PyObject *
2252 CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
2253 {
2254     int verbose = 0;
2255     if (verbose) fprintf(stderr, "Subscript .... \n");
2256     CudaNdarray * self = (CudaNdarray*) py_self;
2257     PyObject * py_rval = NULL;
2258     CudaNdarray * rval = NULL;
2259     PyObject * intobj = NULL;
2260 
2261     //PyObject_Print(key, stderr, 0);
2262 
2263     if (key == Py_Ellipsis)
2264     {
2265         Py_INCREF(py_self);
2266         return py_self;
2267     }
2268     if ((intobj=PyNumber_Int(key))) //INDEXING BY INTEGER
2269     //else if (PyInt_Check(key)) //INDEXING BY INTEGER
2270     {
2271         int d_idx = PyInt_AsLong(intobj);
2272         Py_DECREF(intobj); intobj=NULL;
2273         //int d_idx = PyInt_AsLong(key);
2274         if (self->nd == 0)
2275         {
2276             PyErr_SetString(PyExc_IndexError, "0-d arrays can't be indexed");
2277             return NULL;
2278         }
2279         int d_dim = CudaNdarray_HOST_DIMS(self)[0];
2280         int offset = 0;
2281 
2282         if ((d_idx >= 0) && (d_idx < d_dim))
2283         {
2284             //normal indexing
2285             offset += d_idx * CudaNdarray_HOST_STRIDES(self)[0];
2286         }
2287         else if ((d_idx < 0) && (d_idx >= -d_dim))
2288         {
2289             //end-based indexing
2290             // d_idx is negative
2291             offset += (d_dim + d_idx) * CudaNdarray_HOST_STRIDES(self)[0];
2292         }
2293         else
2294         {
2295             PyErr_Format(PyExc_IndexError,
2296                          "index out of bounds. Asked for %d, but size is %d",
2297                          d_idx, d_dim);
2298             return NULL;
2299         }
2300 
2301         //allocate our subtensor view
2302         py_rval = CudaNdarray_new_nd(self->nd - 1);
2303         rval = (CudaNdarray*) py_rval;
2304         if (!rval) return NULL;
2305         assert (0 == rval->data_allocated);
2306 
2307         //initialize the view's data pointer to our own.
2308         if (CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self) + offset, self))
2309         {
2310             Py_DECREF(rval);
2311             return NULL;
2312         }
2313         for (int d = 1; d < self->nd; ++d)
2314         {
2315             CudaNdarray_set_stride(rval, d-1, CudaNdarray_HOST_STRIDES(self)[d]);
2316             CudaNdarray_set_dim(rval, d-1, CudaNdarray_HOST_DIMS(self)[d]);
2317         }
2318     }
2319     else
2320     {
2321         PyErr_Clear();
2322     }
2323     if (PySlice_Check(key)) //INDEXING BY SLICE
2324     {
2325         if (verbose) fprintf(stderr, "by slice\n");
2326         if (self->nd == 0)
2327         {
2328             PyErr_SetString(PyExc_ValueError, "cannot slice a 0-d array");
2329             return NULL;
2330         }
2331 
2332         int d_dim = CudaNdarray_HOST_DIMS(self)[0];
2333         Py_ssize_t start, stop, step, slen;
2334         if (PySlice_GetIndicesEx(SLICE_CAST(key), d_dim, &start, &stop, &step, &slen))
2335         {
2336             if (verbose)
2337                 fprintf(stderr, "PySlice_GetIndicesEx failed\n");
2338             return NULL;
2339         }
2340         if (verbose)
2341         {
2342             std::cerr << "start " << start << "\n";
2343             std::cerr << "stop " << stop << "\n";
2344             std::cerr << "step " << step << "\n";
2345             std::cerr << "slen " << slen << "\n";
2346         }
2347 
2348         //allocate our subtensor view
2349         py_rval = CudaNdarray_new_nd(self->nd);
2350         rval = (CudaNdarray*) py_rval;
2351         if (!rval) return NULL;
2352         assert (0 == rval->data_allocated);
2353 
2354 
2355         //initialize the view's data pointer to our own.
2356         if (CudaNdarray_set_device_data(rval,
2357                     CudaNdarray_DEV_DATA(self) + start * CudaNdarray_HOST_STRIDES(self)[0],
2358                     self))
2359         {
2360             Py_DECREF(rval);
2361             return NULL;
2362         }
2363         //initialize dimension 0 of rval
2364         CudaNdarray_set_stride(rval, 0,
2365                 (slen == 1) ? 0 : step * CudaNdarray_HOST_STRIDES(self)[0]);
2366         CudaNdarray_set_dim(rval, 0, slen);
2367         if (verbose) std::cerr << "rval stride " << CudaNdarray_HOST_STRIDES(rval)[0] << "\n";
2368         // initialize dimensions > 0 of rval
2369         for (int d = 1; d < self->nd; ++d)
2370         {
2371             CudaNdarray_set_stride(rval, d, CudaNdarray_HOST_STRIDES(self)[d]);
2372             CudaNdarray_set_dim(rval, d, CudaNdarray_HOST_DIMS(self)[d]);
2373         }
2374     }
2375     if (PyTuple_Check(key)) //INDEXING BY TUPLE
2376     {
2377         if (verbose) fprintf(stderr, "by tuple\n");
2378         //elements of the tuple can be either integers or slices
2379         //the dimensionality of the view we will return is diminished for each slice in the tuple
2380 
2381         if (PyTuple_Size(key) > self->nd)
2382         {
2383             PyErr_SetString(PyExc_IndexError, "too many indices for this CudaNdarray");
2384             return NULL;
2385         }
2386 
2387         //calculate the number of dimensions in the return value
2388         int rval_nd = self->nd;
2389         for (int d = 0; d < PyTuple_Size(key); ++d)
2390         {
2391             //On some platforms PyInt_Check(<type 'numpy.int64'>) returns true, on others it returns false.
2392             //So we use PyArray_IsAnyScalar, which should cover everything.
2393             rval_nd -= PyArray_IsAnyScalar(PyTuple_GetItem(key, d));
2394         }
2395 
2396         //allocate our subtensor view
2397         py_rval = CudaNdarray_new_nd(rval_nd);
2398         rval = (CudaNdarray*) py_rval;
2399         if (!rval) return NULL;
2400         assert (0 == rval->data_allocated);
2401 
2402         //initialize the view's data pointer to our own.
2403         if (CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self), self))
2404         {
2405             Py_DECREF(rval);
2406             return NULL;
2407         }
2408 
2409         // rval_d will refer to the current dimension in the rval.
2410         // It will not be incremented for integer keys, but will be incremented for slice
2411         // keys
2412         int rval_d = 0;
2413 
2414         for (int d = 0; d < self->nd; ++d)
2415         {
2416             // The key tuple can be shorter than self->nd.
2417             // When that happens, the remaining dimensions are treated as full slices.
2418             if (d >=PyTuple_Size(key))
2419             {
2420                 CudaNdarray_set_stride(rval, rval_d, CudaNdarray_HOST_STRIDES(self)[d]);
2421                 CudaNdarray_set_dim(rval, rval_d, CudaNdarray_HOST_DIMS(self)[d]);
2422                 ++rval_d;
2423             }
2424             else
2425             {
2426                 PyObject * key_d = PyTuple_GetItem(key, d);
2427 
2428                 if (PySlice_Check(key_d))
2429                 {
2430                     Py_ssize_t start, stop, step, slen;
2431                     if (PySlice_GetIndicesEx(SLICE_CAST(key_d), CudaNdarray_HOST_DIMS(self)[d], &start, &stop, &step, &slen))
2432                     {
2433                         Py_DECREF(rval);
2434                         return NULL;
2435                     }
2436                     rval->devdata += start * CudaNdarray_HOST_STRIDES(self)[d];
2437                     CudaNdarray_set_stride(rval, rval_d,
2438                             (slen == 1) ? 0 : step * CudaNdarray_HOST_STRIDES(self)[d]);
2439                     CudaNdarray_set_dim(rval, rval_d, slen);
2440                     if (0)
2441                     {
2442                         std::cerr << "start " << start << "\n";
2443                         std::cerr << "stop " << stop << "\n";
2444                         std::cerr << "step " << step << "\n";
2445                         std::cerr << "slen " << slen << "\n";
2446                     }
2447                     ++rval_d;
2448                 }
2449                 else if ((intobj=PyNumber_Int(key_d)))
2450                 {
2451                     assert(PyArray_IsAnyScalar(key_d));
2452                     int d_idx = PyInt_AsLong(intobj);
2453                     Py_DECREF(intobj);
2454                     intobj = NULL;
2455                     int d_dim = CudaNdarray_HOST_DIMS(self)[d];
2456 
2457                     if ((d_idx >= 0) && (d_idx < d_dim))
2458                     {
2459                         //normal indexing
2460                         rval->devdata += d_idx * CudaNdarray_HOST_STRIDES(self)[d];
2461                     }
2462                     else if ((d_idx < 0) && (d_idx >= -d_dim))
2463                     {
2464                         //end-based indexing
2465                         rval->devdata += (d_dim + d_idx) * CudaNdarray_HOST_STRIDES(self)[d];
2466                     }
2467                     else
2468                     {
2469                         PyErr_Format(PyExc_IndexError,
2470                                      "index out of bounds. Asked for %d in dimension %d, but size is %d",
2471                                      d_idx, d, d_dim);
2472                         Py_DECREF(rval);
2473                         return NULL;
2474                     }
2475                 }
2476                 else
2477                 {
2478                     PyErr_Clear(); // clear the error set by PyNumber_Int
2479                     PyErr_SetString(PyExc_IndexError, "index must be either int or slice");
2480                     Py_DECREF(rval);
2481                     return NULL;
2482                 }
2483             }
2484         }
2485     }
2486     if (py_rval)
2487     {
2488         if (verbose) fprint_CudaNdarray(stderr, self);
2489         if (verbose) fprint_CudaNdarray(stderr, rval);
2490     }
2491     else
2492     {
2493         PyErr_SetString(PyExc_NotImplementedError, "Unknown key type");
2494         return NULL;
2495     }
2496     return py_rval;
2497 }
2498 
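// [Editorial note] Both the integer and the slice paths above return
// zero-copy views: only the device pointer, dims and strides change. For
// a 1-d array with dim d0 = 10 and stride s0, a[2:9:3] yields
//
//     devdata' = devdata + 2 * s0   // start offset
//     stride'  = 3 * s0             // step (0 if slen == 1)
//     dim'     = 3                  // elements 2, 5 and 8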
2499 // Will be called by __setitem__ in Python
2500 // See http://docs.python.org/dev/py3k/c-api/object.html#PyObject_SetItem
2501 // Doesn't handle broadcasting, e.g. a[:] = 5.
2502 // The right-hand side can only be a CudaNdarray, an ndarray, or a
2503 // python scalar of value 0 (the latter only when the left-hand side
2504 // is c contiguous).
2505 static int
2506 CudaNdarray_setitem(PyObject *o, PyObject  *key, PyObject  *value)
2507 {
2508     int verbose = 0;
2509     if (verbose) fprintf(stderr, "CudaNdarray_setitem start\n");
2510     // We try to copy directly into this CudaNdarray from the ndarray
2511     CudaNdarray* rval = (CudaNdarray*)CudaNdarray_Subscript(o, key);
2512     CudaNdarray* new_value = NULL;
2513 
2514     if(!rval){
2515         // CudaNdarray_Subscript failed and set the error msg.
2516         Py_XDECREF(rval);
2517         return -1;
2518     }
2519 
2520     if(rval != (CudaNdarray*)o &&
2521                 (rval->data_allocated ||
2522                  // The new array should have a base
2523                  !(((CudaNdarray*)rval)->base) ||
2524                  // If the original array has no base, the base of the new
2525                  // array should be the original one
2526                  (!((CudaNdarray*)o)->base && ((CudaNdarray*)rval)->base != o) ||
2527                  // Else, the two arrays should have the same base
2528                  (((CudaNdarray*)o)->base && ((CudaNdarray*)rval)->base != ((CudaNdarray*)o)->base)))
2529     {
2530         // This case shouldn't happen, based on what I see in Subscript
2531         // but just in case it happens sometime in the future
2532 
2533         PyErr_Format(PyExc_RuntimeError,
2534                      "__getitem__ must return a CudaNdarray that refers to"
2535                      " the original CudaNdarray, not a copy. rval.base=%p"
2536                      " o.base=%p o=%p",
2537                      (((CudaNdarray*)rval)->base), ((CudaNdarray*)o)->base, o);
2538         Py_DECREF(rval);
2539         return -1;
2540     }
2541 
2542     PyObject * intobj = NULL;
2543     if (CudaNdarray_Check(o)  && PyArray_Check(value)){
2544         if (verbose)
2545             fprintf(stderr,
2546                     "CudaNdarray_setitem dest is a CudaNdarray and"
2547                     " value is a ndarray\n");
2548         new_value = (CudaNdarray*) CudaNdarray_New();
2549         if(!new_value)
2550         {
2551             return -1;
2552         }
2553         if (CudaNdarray_CopyFromArray(new_value, (PyArrayObject *) value))
2554         {
2555             Py_XDECREF(new_value);
2556             Py_XDECREF(rval);
2557             return -1;
2558         }
2559         value = (PyObject *) new_value;
2560     }
2561     else if ((intobj=PyNumber_Int(value)))
2562     {
2563         if (verbose)
2564             fprintf(stderr,
2565                     "CudaNdarray_setitem: value is a python number\n");
2566         if(! CudaNdarray_is_c_contiguous(rval)){
2567             PyErr_SetString(PyExc_NotImplementedError,
2568                  "CudaNdarray.__setitem__: when the new value is a scalar"
2569                  " of value 0, the destination must be c contiguous.");
2570             Py_XDECREF(rval);
2571             return -1;
2572         }
2573 
2574         long val = PyInt_AsLong(intobj);
2575         Py_DECREF(intobj); intobj=NULL;
2576         if (val == 0)
2577         {
2578             cudaError_t err = cudaMemset(rval->devdata, 0,
2579                                          CudaNdarray_SIZE(rval) * sizeof(real));
2580             Py_XDECREF(rval);
2581             if (err)
2582             {
2583                 // Clear the error flag, cudaMemset doesn't do it.
2584                 // Currently this returns the same thing as err, but if in future
2585                 // it returns something else I still don't see why we should ignore
2586                 // it.  All we want to do here is reset the flag.
2587                 cudaGetLastError();
2588                 PyErr_SetString(PyExc_RuntimeError,
2589                                 "CudaNdarray.__setitem__: cudaMemset failed");
2590                 return -1;
2591             }
2592             return 0;
2593         } else {
2594             Py_XDECREF(rval);
2595             PyErr_SetString(PyExc_NotImplementedError,
2596                   "CudaNdarray.__setitem__: we only support setting a python"
2597                   " scalar of value 0, a numpy ndarray or a CudaNdarray.");
2598             return -1;
2599         }
2600     }
2601 
2602     PyErr_Clear(); // clear PyNumber_Int error.
2603 
2604     if(!CudaNdarray_Check(o) || !CudaNdarray_Check(value))
2605     {
2606         PyErr_SetString(PyExc_TypeError,
2607           "CudaNdarray.__setitem__: left must be a CudaNdarray and right"
2608           " must be a CudaNdarray, an ndarray or a python scalar of value 0.");
2609         Py_XDECREF(new_value);
2610         return -1;
2611     }
2612 
2613     if (verbose)
2614         fprintf(stderr, "CudaNdarray_setitem dest and value are CudaNdarray\n");
2615 
2616     if (cnda_copy_structure_to_device(rval))
2617     {
2618         PyErr_SetString(PyExc_RuntimeError,
2619                 "CudaNdarray.__setitem__: syncing structure to device failed");
2620         Py_DECREF(rval);
2621         Py_XDECREF(new_value);
2622 
2623         if (verbose)
2624             fprintf(stderr, "CudaNdarray_setitem error end\n");
2625         return -1;
2626     }
2627 
2628     PyObject *baseSavedForComparison = rval->base;
2629 
2630     if (CudaNdarray_CopyFromCudaNdarray(rval, (CudaNdarray*)value, true))
2631     {
2632         Py_DECREF((PyObject*)rval);
2633         Py_XDECREF(new_value);
2634 
2635         if (verbose)
2636             fprintf(stderr, "CudaNdarray_setitem error end\n");
2637         return -1;
2638     }
2639 
2640     assert (rval->base == baseSavedForComparison);
2641     assert (rval->dev_structure_fresh);
2642 
2643     // Clean up locally-created references
2644     Py_DECREF(rval);
2645     Py_XDECREF(new_value);
2646 
2647     return 0;
2648 }
2649 
2650 
2651 PyMappingMethods CudaNdarrayMappingMethods = {
2652     CudaNdarray_len, //lenfunc mp_length;               __len__
2653     CudaNdarray_Subscript, //binaryfunc mp_subscript;   __getitem__
2654     CudaNdarray_setitem //objobjargproc mp_ass_subscript;                __setitem__
2655 };
2656 
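// [Editorial sketch] Assuming this struct is installed as the type's
// tp_as_mapping slot, the generic object protocol dispatches here; for a
// CudaNdarray reference a:
//
//     Py_ssize_t n  = PyObject_Length(a);          // -> CudaNdarray_len
//     PyObject *row = PyObject_GetItem(a, key);    // -> CudaNdarray_Subscript
//     int rc        = PyObject_SetItem(a, key, v); // -> CudaNdarray_setitem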
2657 ////////////////////
2658 // Attribute getters and setters
2659 ////////////////////
2660 
2661 static PyObject *
2662 CudaNdarray_get_shape(CudaNdarray *self, void *closure)
2663 {
2664     if (self->nd < 0)
2665     {
2666         PyErr_SetString(PyExc_ValueError, "CudaNdarray not initialized");
2667         return NULL;
2668     }
2669     PyObject * rval = PyTuple_New(self->nd);
2670     for (int i = 0; i < self->nd; ++i)
2671     {
2672         if (!rval || PyTuple_SetItem(rval, i, PyInt_FromLong(CudaNdarray_HOST_DIMS(self)[i])))
2673         {
2674             Py_XDECREF(rval);
2675             return NULL;
2676         }
2677 
2678     }
2679     return rval;
2680 }
2681 
2682 static int
2683 CudaNdarray_set_shape(CudaNdarray *self, PyObject *value, void *closure)
2684 {
2685     PyErr_SetString(PyExc_NotImplementedError, "TODO: call reshape");
2686     return -1;
2687 }
2688 
2689 static PyObject *
2690 CudaNdarray_get_strides(CudaNdarray *self, void *closure)
2691 {
2692     if (self->nd < 0)
2693     {
2694         PyErr_SetString(PyExc_ValueError, "CudaNdarray not initialized");
2695         return NULL;
2696     }
2697     PyObject * rval = PyTuple_New(self->nd);
2698     for (int i = 0; i < self->nd; ++i)
2699     {
2700         if (!rval || PyTuple_SetItem(rval, i, PyInt_FromLong(CudaNdarray_HOST_STRIDES(self)[i])))
2701         {
2702             Py_XDECREF(rval);
2703             return NULL;
2704         }
2705 
2706     }
2707     return rval;
2708 }
2709 
2710 static int
2711 CudaNdarray_set_strides(CudaNdarray *self, PyObject *value, void *closure)
2712 {
2713     //npy_intp newstrides_bytes[PyTuple_Size(value)];
2714     if (PyTuple_Check(value)){
2715         if (PyTuple_Size(value) != CudaNdarray_NDIM(self)){
2716             PyErr_SetString(PyExc_ValueError,
2717                             "The new strides tuple must have the same length"
2718                             " as the number of dimensions");
2719             return -1;
2720         }
2721     }else if (PyList_Check(value)){
2722         if (PyList_Size(value) != CudaNdarray_NDIM(self)){
2723             PyErr_SetString(PyExc_ValueError,
2724                             "The new strides list must have the same length"
2725                             " as the number of dimensions");
2726             return -1;
2727         }
2728     }else{
2729         PyErr_SetString(PyExc_ValueError,
2730                         "The new strides need to be encoded in a tuple or list");
2731         return -1;
2732     }
2733     npy_intp* newstrides = (npy_intp*) alloca(CudaNdarray_NDIM(self) * sizeof(npy_intp));
2734     if (PyTuple_Check(value)){
2735         for(int i=0; i < CudaNdarray_NDIM(self); i++){
2736             newstrides[i] = PyInt_AsLong(PyTuple_GetItem(value, Py_ssize_t(i)));
2737             //newstrides_bytes[i] = newstrides[i] * 4;
2738         }
2739     }else if (PyList_Check(value)){
2740         for(int i=0; i < CudaNdarray_NDIM(self); i++){
2741             newstrides[i] = PyInt_AsLong(PyList_GetItem(value, Py_ssize_t(i)));
2742             //newstrides_bytes[i] = newstrides[i] * 4;
2743         }
2744     }
2745     /*
2746     // Do not do this check, as ExtractDiag needs that, and NumPy does not seem
2747     // to do it.
2748     npy_intp dims[PyTuple_Size(value)];
2749     for(int i=0; i < CudaNdarray_NDIM(self); i++){
2750         dims[i] = CudaNdarray_HOST_DIMS(self)[i];
2751     }
2752     if (!PyArray_CheckStrides(4,
2753                               CudaNdarray_NDIM(self),
2754                               0, 0,
2755                               dims,
2756                               newstrides_bytes)){
2757         PyErr_SetString(PyExc_ValueError, "bad new strides");
2758         return -1;
2759         }
2760     */
2761     for(int i=0; i < CudaNdarray_NDIM(self); i++){
2762         CudaNdarray_set_stride(self, i, newstrides[i]);
2763     }
2764     return 0;
2765 }
2766 
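// [Editorial note] Judging by the commented-out "* 4" conversions above,
// strides here are counted in float32 elements, not bytes as in NumPy. A
// c-contiguous array of shape (3, 4) would therefore carry
//
//     strides = (4, 1)   // elements; NumPy would report (16, 4) bytes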
2767 static PyObject *
2768 CudaNdarray_get_dev_data(CudaNdarray *self, void *closure)
2769 {
2770     float * p =  CudaNdarray_DEV_DATA(self);
2771     //printf("get_dev_data %p %li \n", p, (long int)p );
2772     return PyInt_FromSize_t((size_t) CudaNdarray_DEV_DATA(self));
2773 }
2774 
2775 static int
2776 CudaNdarray_set_dev_data(CudaNdarray *self, PyObject *value, void *closure)
2777 {
2778     Py_ssize_t newdevdata = PyInt_AsSsize_t(value);
2779     //printf("set_dev_data %p %li \n",(float*)newdevdata ,newdevdata);
2780     if (PyErr_Occurred())
2781     {
2782         return -1;
2783     }
2784     return  CudaNdarray_set_device_data(self, (float*)newdevdata, (CudaNdarray*)self->base);
2785 }
2786 
2787 static PyObject *
2788 CudaNdarray_get_dtype(CudaNdarray *self, void *closure)
2789 {
2790     return PyString_FromString("float32");
2791 }
2792 
2793 static PyObject *
2794 CudaNdarray_get_ndim(CudaNdarray *self, void *closure)
2795 {
2796     return PyInt_FromLong(self->nd);
2797 }
2798 
2799 static PyObject *
2800 CudaNdarray_get_base(CudaNdarray *self, void *closure)
2801 {
2802     PyObject * base = self->base;
2803     if (!base)
2804     {
2805         // We cannot return a NULL pointer, use None instead
2806         base = Py_None;
2807     }
2808     Py_INCREF(base);
2809     return base;
2810 }
2811 
2812 void put_in_dict(PyObject * dict, const char * key, int val)
2813 {
2814   PyObject * k = PyString_FromString(key);
2815   PyObject * v = PyInt_FromLong(val);
2816   PyDict_SetItem(dict, k, v);
2817   Py_DECREF(k);
2818   Py_DECREF(v);
2819 }
2820 
2821 PyObject *
2822 GetDeviceProperties(PyObject* _unused, PyObject* args)
2823 {
2824   int dev_id = -1;
2825   if (! PyArg_ParseTuple(args, "i", &dev_id))
2826     return NULL;
2827   cudaDeviceProp deviceProp;
2828   cudaGetDeviceProperties(&deviceProp, dev_id);
2829 
2830   PyObject * dict = PyDict_New();
2831   PyObject * str= PyString_FromString("name");
2832   PyObject * i = PyString_FromString(deviceProp.name);
2833   PyDict_SetItem(dict, str, i);
2834   Py_DECREF(str);
2835   Py_DECREF(i);
2836 
2837   put_in_dict(dict, "major", deviceProp.major);
2838   put_in_dict(dict, "minor", deviceProp.minor);
2839 #if CUDART_VERSION >= 2020
2840   int driverVersion = 0, runtimeVersion = 0;
2841   cudaDriverGetVersion(&driverVersion);
2842   cudaRuntimeGetVersion(&runtimeVersion);
2843   put_in_dict(dict, "driverVersion", driverVersion);
2844   put_in_dict(dict, "runtimeVersion", runtimeVersion);
2845 #endif
2846 #if CUDART_VERSION >= 2000
2847 
2848   put_in_dict(dict, "multiProcessorCount", deviceProp.multiProcessorCount);
2849   // If ConvertSMVer2Cores is not defined in cuda_runtime_api.h, the runtime is too old.
2850   int sm_cores = -1;
2851   if(deviceProp.major==1)
2852     sm_cores = 32;
2853   else if(deviceProp.major==2 && deviceProp.minor==0)
2854     sm_cores = 32;
2855   else if(deviceProp.major==2 && deviceProp.minor==1)
2856     sm_cores = 48;
2857   put_in_dict(dict, "coresCount", sm_cores * deviceProp.multiProcessorCount);
2858 #endif
2859   put_in_dict(dict, "totalConstMem", deviceProp.totalConstMem);
2860   put_in_dict(dict, "sharedMemPerBlock", deviceProp.sharedMemPerBlock);
2861   put_in_dict(dict, "regsPerBlock", deviceProp.regsPerBlock);
2862   put_in_dict(dict, "warpSize", deviceProp.warpSize);
2863   put_in_dict(dict, "maxThreadsPerBlock", deviceProp.maxThreadsPerBlock);
2864   put_in_dict(dict, "maxThreadsDim0", deviceProp.maxThreadsDim[0]);
2865   put_in_dict(dict, "maxThreadsDim1", deviceProp.maxThreadsDim[1]);
2866   put_in_dict(dict, "maxThreadsDim2", deviceProp.maxThreadsDim[2]);
2867   put_in_dict(dict, "maxGridSize0", deviceProp.maxGridSize[0]);
2868   put_in_dict(dict, "maxGridSize1", deviceProp.maxGridSize[1]);
2869   put_in_dict(dict, "maxGridSize2", deviceProp.maxGridSize[2]);
2870   put_in_dict(dict, "memPitch", deviceProp.memPitch);
2871   put_in_dict(dict, "textureAlignment", deviceProp.textureAlignment);
2872   put_in_dict(dict, "clockRate", deviceProp.clockRate);
2873 #if CUDART_VERSION >= 2000
2874   put_in_dict(dict, "deviceOverlap", deviceProp.deviceOverlap);
2875 #endif
2876 #if CUDART_VERSION >= 2020
2877   put_in_dict(dict, "kernelExecTimeoutEnabled", deviceProp.kernelExecTimeoutEnabled);
2878   put_in_dict(dict, "integrated", deviceProp.integrated);
2879   put_in_dict(dict, "canMapHostMemory", deviceProp.canMapHostMemory);
2880   put_in_dict(dict, "computeMode", deviceProp.computeMode);
2881   // The documentation of this field says: 0 - normal mode, 1 - only 1 context allowed, 2 - no context allowed
2882 #endif
2883 #if CUDART_VERSION >= 3000
2884   put_in_dict(dict, "concurrentKernels", deviceProp.concurrentKernels);
2885 #endif
2886 #if CUDART_VERSION >= 3010
2887   put_in_dict(dict, "ECCEnabled", deviceProp.ECCEnabled);
2888 #endif
2889 #if CUDART_VERSION >= 3020
2890   put_in_dict(dict, "tccDriver", deviceProp.tccDriver);
2891 #endif
2892 
2893   return dict;
2894 }
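// A minimal sketch of calling the function above directly (it is normally
// reached from Python as cuda_ndarray.device_properties(dev_id), see
// module_methods below); assumes a CUDA runtime is available.
#if 0
PyObject *args = Py_BuildValue("(i)", 0);            // query device 0
PyObject *props = GetDeviceProperties(NULL, args);   // dict: "name", "major", "minor", ...
Py_DECREF(args);
Py_XDECREF(props);
#endif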
2895 
2896 /*
2897  * Return a (free, total) tuple with the amounts of memory, in bytes, available for allocation by the device.
2898  */
2899 PyObject *
2900 GetDeviceMemInfo(PyObject* _unused, PyObject* dummy)
2901 {
2902     size_t free = 0, total = 0;
2903     if(g_gpu_context_active == 0){
2904         PyErr_Format(PyExc_RuntimeError, "No gpu device selected yet. Please make sure the gpu device was initialized by Theano first.");
2905         return NULL;
2906     }
2907 
2908     cudaError_t err = cudaMemGetInfo(&free, &total);
2909     if (err != cudaSuccess){
2910         // Clear the error flag, cudaMemGetInfo doesn't do it.
2911         // Currently this returns the same thing as err, but if in the
2912         // future it returns something else, there is still no reason to
2913         // ignore it. All we want to do here is reset the flag.
2914         cudaGetLastError();
2915         PyErr_Format(PyExc_RuntimeError,
2916                      "Error while getting memory info about the gpu: %s",
2917                      cudaGetErrorString(err));
2918         return NULL;
2919     }
2920     return PyTuple_Pack(2, PyLong_FromSize_t(free), PyLong_FromSize_t(total));
2921 }
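// A minimal sketch, assuming an active device: the same cudaMemGetInfo
// numbers drive the cnmem pool sizing in CudaNdarray_gpu_init below.
#if 0
size_t free_b = 0, total_b = 0;
cudaMemGetInfo(&free_b, &total_b);
size_t pool = (size_t)(total_b * 0.95);  // e.g. 95% of total, mirroring the clip below
#endif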
2922 
2923 /*
2924  * Synchronize with all the streams of the gpu device.
2925  */
2926 PyObject *
2927 CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy)
2928 {
2929     CNDA_BEGIN_ALLOW_THREADS
2930     cudaThreadSynchronize();
2931     CNDA_END_ALLOW_THREADS
2932     Py_INCREF(Py_None);
2933     return Py_None;
2934 }
2935 
2936 /*
2937  * Exists and returns true if we are linked with cublas v2.
2938  */
2939 PyObject *
2940 CudaNdarray_cublasv2(PyObject* _unused, PyObject* dummy)
2941 {
2942     Py_INCREF(Py_True);
2943     return Py_True;
2944 }
2945 
2946 PyObject *
2947 CudaNdarray_select_a_gpu(PyObject* _unused, PyObject* dummy)
2948 {
2949
2950     cudaError_t err;
2951     int num_gpus = 0;
2952 
2953     err = cudaGetDeviceCount(&num_gpus);
2954     if (cudaSuccess != err){
2955         printf("ERR!\n");
2956         PyErr_Format(PyExc_RuntimeError,
2957                      "Not able to get the number of GPUs (%s).",
2958                      cudaGetErrorString(err));
2959         return NULL;
2960     }
2961 
2962     for (int device = 0; device < num_gpus; device++) {
2963         cudaSetDevice(device);
2964         err = cudaDeviceSynchronize(); // << CUDA context gets created here.
2965         cudaGetLastError(); // reset the error state
2966         if (cudaSuccess == err)
2967             break;
2968     }
2969 
2970     if (cudaSuccess != err){
2971         printf("ERR!\n");
2972         PyErr_Format(PyExc_RuntimeError,
2973                      "Not able to select an available GPU from %d cards (%s).",
2974                      num_gpus, cudaGetErrorString(err));
2975         return NULL;
2976     }
2977 
2978     Py_INCREF(Py_None);
2979     return Py_None;
2980 }
2981 
2982 #if COMPUTE_GPU_MEM_USED
2983 /*
2984  * Return a tuple (current, maximum) of the sizes in bytes that Theano has allocated on the gpu.
2985  */
2986 PyObject *
2987 GetTheanoAllocInfo(PyObject* _unused, PyObject* dummy)
2988 {
2989     PyObject* a = PyLong_FromSize_t(_allocated_size);
2990     PyObject* b = PyLong_FromSize_t(_max_allocated_size);
2991 
2992     PyObject* tuple = PyTuple_New(2);
2993     PyTuple_SetItem(tuple, 0, a);
2994     PyTuple_SetItem(tuple, 1, b);
2995     return tuple;
2996 }
2997 #endif
2998 
2999 static PyGetSetDef CudaNdarray_getset[] = {
3000     {"shape",
3001         (getter)CudaNdarray_get_shape,
3002         (setter)CudaNdarray_set_shape,
3003         "shape of this ndarray (tuple)",
3004         NULL},
3005     {"_strides",
3006         (getter)CudaNdarray_get_strides,
3007         (setter)CudaNdarray_set_strides,
3008         "data pointer strides (in elements)",
3009         NULL},
3010     {"strides",
3011         (getter)CudaNdarray_get_strides,
3012         (setter)CudaNdarray_set_strides,
3013         "data pointer strides (in elements)",
3014         NULL},
3015     // gpudata is needed to allow calling pycuda functions with CudaNdarray input.
3016     {"gpudata",
3017         (getter)CudaNdarray_get_dev_data,
3018         NULL,
3019         "device data pointer",
3020         NULL},
3021     {"_dev_data",
3022         (getter)CudaNdarray_get_dev_data,
3023         (setter)CudaNdarray_set_dev_data,
3024         "device data pointer",
3025         NULL},
3026     {"dtype",
3027         (getter)CudaNdarray_get_dtype,
3028         NULL,
3029         "The dtype of the element. Now always float32",
3030         NULL},
3031     {"size",
3032         (getter)CudaNdarray_SIZE_Object,
3033         NULL,
3034         "The number of elements in this object.",
3035         NULL},
3036     // mem_size is needed for pycuda.elementwise.ElementwiseKernel. Why do they use both size and mem_size for the same value?
3037     {"mem_size",
3038         (getter)CudaNdarray_SIZE_Object,
3039         NULL,
3040         "The number of elements in this object.",
3041         NULL},
3042     {"ndim",
3043         (getter)CudaNdarray_get_ndim,
3044         NULL,
3045         "The number of dimensions in this object.",
3046         NULL},
3047     {"base",
3048         (getter)CudaNdarray_get_base,
3049         NULL,
3050         "If this ndarray is a view, base is the original ndarray.",
3051         NULL},
3052 
3053     {NULL, NULL, NULL, NULL}  /* Sentinel */
3054 };
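// A minimal sketch, assuming `obj` is a CudaNdarray instance: the entries
// above are served through the generic attribute protocol.
#if 0
extern PyObject *obj;                                    // assumed to exist
PyObject *shape = PyObject_GetAttrString(obj, "shape");  // -> CudaNdarray_get_shape
PyObject *nd    = PyObject_GetAttrString(obj, "ndim");   // -> CudaNdarray_get_ndim
Py_XDECREF(shape);
Py_XDECREF(nd);
#endif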
3055 
3056 PyObject *CudaNdarray_repr(PyObject *self)
3057 {
3058     CudaNdarray *object = (CudaNdarray *)self;
3059     PyObject * np_object = CudaNdarray_CreateArrayObj(object);
3060     PyObject * str = PyObject_Str((PyObject *) np_object);
3061     char * cstr = PyString_AsString(str);
3062     PyObject * out = PyString_FromFormat("%s%s%s",
3063                         "CudaNdarray(",
3064                         cstr,
3065                         ")");
3066     Py_DECREF(str);
3067     Py_DECREF(np_object);
3068     #if PY_MAJOR_VERSION >= 3
3069     // In Python 3, PyString_FromFormat returns a bytes object
3070     PyObject* out2 = PyObject_Str(out);
3071     Py_DECREF(out);
3072     return out2;
3073     #endif
3074     return out;
3075 }
3076 
3077 static PyTypeObject CudaNdarrayType =
3078 {
3079 #if PY_MAJOR_VERSION >= 3
3080     PyVarObject_HEAD_INIT(NULL, 0)
3081 #else
3082     PyObject_HEAD_INIT(NULL)
3083     0,                         /*ob_size*/
3084 #endif
3085     "CudaNdarray",             /*tp_name*/
3086     sizeof(CudaNdarray),       /*tp_basicsize*/
3087     0,                         /*tp_itemsize*/
3088     (destructor)CudaNdarray_dealloc, /*tp_dealloc*/
3089     0,                         /*tp_print*/
3090     0,                         /*tp_getattr*/
3091     0,                         /*tp_setattr*/
3092     0,                         /*tp_compare*/
3093     CudaNdarray_repr,          /*tp_repr*/
3094     &CudaNdarrayNumberMethods, /*tp_as_number*/
3095     0,                         /*tp_as_sequence*/
3096     &CudaNdarrayMappingMethods,/*tp_as_mapping*/
3097     0,                         /*tp_hash */
3098     0,                         /*tp_call*/
3099     0,                         /*tp_str*/
3100     0,                         /*tp_getattro*/
3101     0,                         /*tp_setattro*/
3102     0,                         /*tp_as_buffer*/
3103 #if PY_MAJOR_VERSION >= 3
3104     // Py_TPFLAGS_CHECKTYPES is always true and was removed in Python 3.
3105     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/
3106 #else
3107     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_CHECKTYPES, /*tp_flags*/
3108 #endif
3109     "CudaNdarray objects",     /* tp_doc */
3110     0,                         /* tp_traverse */
3111     0,                         /* tp_clear */
3112     0,                         /* tp_richcompare */
3113     0,                         /* tp_weaklistoffset */
3114     0,                         /* tp_iter */
3115     0,                         /* tp_iternext */
3116     CudaNdarray_methods,       /* tp_methods */
3117     CudaNdarray_members,       /* tp_members */
3118     CudaNdarray_getset,        /* tp_getset */
3119     0,                         /* tp_base */
3120     0,                         /* tp_dict */
3121     0,                         /* tp_descr_get */
3122     0,                         /* tp_descr_set */
3123     0,                         /* tp_dictoffset */
3124     (initproc)CudaNdarray_init,/* tp_init */
3125     0,                         /* tp_alloc */
3126     CudaNdarray_new,           /* tp_new */
3127 };
3128 
3129 static __global__ void get_gpu_ptr_size(int* dst)
3130 {
3131     dst[0] = sizeof(float*);
3132     dst[1] = sizeof(int);
3133 }
3134 
3135 PyObject *
3136 CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
3137 {
3138     int *gpu_data = (int*)device_malloc(sizeof(int)*2);
3139     if(gpu_data == NULL){
3140         return NULL;
3141     }
3142     get_gpu_ptr_size<<<1,1>>>(gpu_data);
3143 
3144     cudaError_t cudaErr = cudaGetLastError();
3145     if (cudaSuccess != cudaErr){
3146 
3147         device_free(gpu_data);
3148         return PyErr_Format(PyExc_RuntimeError,
3149                             "CudaNdarray_ptr_int_size: error when calling the gpu code. (%s)",
3150                             cudaGetErrorString(cudaErr));
3151     }
3152 
3153     // Transfer the result to cpu
3154     int gpu_sizes[] = {-1,-1};
3155     cublasStatus_t err;
3156     err = cublasGetVector(2, sizeof(int), gpu_data, 1, gpu_sizes, 1);
3157     device_free(gpu_data);
3158 
3159     if (CUBLAS_STATUS_SUCCESS != err){
3160         PyErr_SetString(PyExc_RuntimeError, "error copying data from device to host memory");
3161         return NULL;
3162     }
3163     return Py_BuildValue("iiii", (int) gpu_sizes[0], (int)sizeof(float*),
3164                          (int)sizeof(int), (int) gpu_sizes[1]);
3165 }
3166 
3167 static int cublas_init();
3168 static void cublas_shutdown();
3169 // Initialize the gpu.
3170 // Takes two optional parameters: the device number and whether we should use cnmem.
3171 // If the device number is provided, it sets that device to be the active device.
3172 // If not provided (usually just to test whether the gpu is available at all),
3173 // it does not set an active device.
3174 // Raises EnvironmentError or ValueError (as appropriate) if the initialization failed.
3175 // cnmem is treated like a bool: if it converts to 0, don't use cnmem; otherwise, use it.
3176 PyObject *
3177 CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
3178 {
3179     int card_nb = 0;
3180     int card_number_provided = 1;
3181     float cnmem = 0; // Theano flag lib.cnmem
3182     // if we're given something wildly invalid, this will throw a TypeError
3183     if(!PyArg_ParseTuple(args, "|if", &card_nb, &cnmem))
3184         return NULL;
3185     if(cnmem)
3186         g_use_cnmem = true;
3187 
3188     if(PyTuple_Size(args) == 0) {
3189         card_number_provided = 0;
3190         card_nb = 0;
3191     }
3192 
3193     int deviceCount;
3194     cudaError err = cudaGetDeviceCount(&deviceCount);
3195     if(cudaSuccess != err) {
3196         return PyErr_Format(PyExc_EnvironmentError,
3197                             "Unable to get the number of gpus available: %s",
3198                             cudaGetErrorString(cudaGetLastError()));
3199     }
3200 
3201     // as soon as the first successful call to a cuda* function is made, a
3202     // gpu context has been created
3203     g_gpu_context_active = 1;
3204 
3205     if(deviceCount <= 0) {
3206         return PyErr_Format(PyExc_EnvironmentError,
3207                             "Can't use the GPU, no devices support CUDA");
3208     }
3209     if(card_number_provided && (card_nb < 0 || card_nb > (deviceCount - 1))) {
3210         return PyErr_Format(PyExc_ValueError,
3211                             "Bad device number %d. Only %d devices available.",
3212                             card_nb,
3213                             deviceCount);
3214     }
3215 
3216     cudaDeviceProp deviceProp;
3217     err = cudaGetDeviceProperties(&deviceProp, card_nb);
3218     if(cudaSuccess != err) {
3219         return PyErr_Format(PyExc_EnvironmentError,
3220                             "Unable to get properties of gpu %i: %s",
3221                             card_nb,
3222                             cudaGetErrorString(cudaGetLastError()));
3223     }
3224 
3225     if(deviceProp.major == 9999 && deviceProp.minor == 9999 ){
3226         return PyErr_Format(PyExc_EnvironmentError,
3227                             "There is no device that supports CUDA");
3228     }
3229 
3230     if(card_number_provided) {
3231         err = cudaSetDevice(card_nb);
3232         if(cudaSuccess != err) {
3233             return PyErr_Format(PyExc_EnvironmentError,
3234                                 "Unable to set device %i: %s",
3235                                 card_nb,
3236                                 cudaGetErrorString(cudaGetLastError()));
3237         }
3238         if (cublas_init() == -1)
3239             return NULL;
3240     }
3241     if(card_number_provided && g_use_cnmem) {
3242         size_t mem = 0;
3243         if (cnmem > 1)
3244             mem = cnmem * 1024 * 1024;
3245         else{
3246             // Clip to 95% to leave memory for the driver.
3247             // 98% didn't work in some cases.
3248             if (cnmem > .95){
3249                 cnmem = .95;
3250             }
3251             size_t free = 0, total = 0;
3252             cudaError_t err = cudaMemGetInfo(&free, &total);
3253             if (err != cudaSuccess){
3254                 // Clear the error flag, cudaMemGetInfo doesn't do it.
3255                 // Currently this returns the same thing as err, but if in the
3256                 // future it returns something else, there is still no reason to
3257                 // ignore it. All we want to do here is reset the flag.
3258                 cudaGetLastError();
3259                 PyErr_Format(PyExc_RuntimeError,
3260                              "Error while getting memory info about the gpu: %s",
3261                              cudaGetErrorString(err));
3262                 return NULL;
3263             }
3264             mem = total * cnmem;
3265         }
3266         if(initCnmem(card_number_provided, card_nb, mem) == -1){
3267             return NULL;
3268         }
3269     }
3270 
3271     Py_INCREF(Py_None);
3272     return Py_None;
3273 }
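// A minimal sketch of the two calling conventions parsed above, assuming
// the module is initialized and no device has been selected yet.
#if 0
PyObject *empty = PyTuple_New(0);
PyObject *probe = CudaNdarray_gpu_init(NULL, empty);  // probe only; no active device set
Py_DECREF(empty);
Py_XDECREF(probe);
PyObject *args = Py_BuildValue("(if)", 0, 0.9);       // device 0, cnmem pool = 90%
PyObject *res  = CudaNdarray_gpu_init(NULL, args);
Py_DECREF(args);
Py_XDECREF(res);
#endif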
3274 
3275 PyObject *
3276 CudaNdarray_active_device_number(PyObject* _unused, PyObject* _unused_args) {
3277     // NB: No cuda error checking here; keeps things simple, and it's not
3278     // really necessary.
3279     int currentDevice;
3280     cudaGetDevice(&currentDevice);
3281     return PyInt_FromLong(currentDevice);
3282 }
3283 
3284 PyObject *
3285 CudaNdarray_active_device_name(PyObject* _unused, PyObject* _unused_args) {
3286     // NB: No cuda error checking here; keeps things simple, and it's not
3287     // really necessary.
3288     int currentDevice;
3289     cudaGetDevice(&currentDevice);
3290 
3291     cudaDeviceProp deviceProp;
3292     cudaGetDeviceProperties(&deviceProp, currentDevice);
3293     return PyString_FromString(deviceProp.name);
3294 }
3295 
3296 PyObject *
3297 CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
3298     // Don't handle errors here
3299     cublas_shutdown();
3300     g_gpu_context_active = 0; // context has now been closed down
3301     if(g_use_cnmem) {
3302         cnmemStatus_t status = cnmemFinalize();
3303         if(status != CNMEM_STATUS_SUCCESS) {
3304             fprintf(stderr, "CudaNdarray_gpu_shutdown: cnmemFinalize failed! Reason=%s\n",
3305                     cnmemGetErrorString(status));
3306             if(status == CNMEM_STATUS_CUDA_ERROR) {
3307                 fprintf(stderr, "  Cuda-Reason=%s\n",
3308                         cudaGetErrorString(cudaGetLastError()));
3309             }
3310         }
3311     }
3312 
3313     Py_INCREF(Py_None);
3314     return Py_None;
3315 }
3316 
3317 /*
3318  * This function is tested in theano/misc/test_pycuda_theano_simple.py
3319  */
3320 PyObject *
3321 CudaNdarray_from_gpu_pointer(PyObject* _unused, PyObject* args)
3322 {
3323     int verbose = 0;
3324     PyObject *gpu_ptr = NULL;
3325     PyObject *shapes = NULL;
3326     PyObject *strides = NULL;
3327     PyObject *base = NULL;
3328     PyObject *rval = NULL;
3329 
3330     // args should consist of 4 python objects:
3331     // the first is the gpu ptr,
3332     // the second is the shape,
3333     // the third is the strides, and the fourth is the base object.
3334     if (! PyArg_ParseTuple(args, "OOOO", &gpu_ptr, &shapes, &strides, &base))
3335         return NULL;
3336 
3337     if (verbose) printf("In CudaNdarray_from_gpu_pointer\n");
3338     if (!PyLong_Check(gpu_ptr))
3339     {
3340         PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: The gpu pointer is not a long");
3341         return NULL;
3342     }
3343 
3344     Py_ssize_t nd =  PyObject_Length(shapes);
3345     if (nd < 0)
3346     {
3347         PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Couldn't get length of second argument");
3348         return NULL;
3349     }
3350     Py_ssize_t nd_stride =  PyObject_Length(strides);
3351     if (nd_stride < 0)
3352     {
3353         PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Couldn't get length of third argument");
3354         return NULL;
3355     }
3356 
3357     if (nd != nd_stride)
3358     {
3359         PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: We need the same number of shapes and strides");
3360         return NULL;
3361     }
3362 
3363     rval = CudaNdarray_New();
3364 
3365     if (CudaNdarray_set_nd((CudaNdarray *)rval, nd))
3366     {
3367         //CudaNdarray_set_nd set the error msg
3368         return NULL;
3369     }
3370     // set gpu pointer
3371     assert(((CudaNdarray *)rval)->data_allocated == 0);
3372     if (CudaNdarray_set_device_data((CudaNdarray *)rval, (float *)PyInt_AsLong(gpu_ptr), base))
3373     {
3374         PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Error while setting the gpu pointer");
3375         return NULL;
3376 
3377     }
3378 
3379     // Set dims and strides
3380     for (int i = nd-1; i >= 0; --i)
3381     {
3382         PyObject * idx = PyLong_FromLong(i);
3383         if (idx == NULL)
3384         {
3385             PyErr_SetString(PyExc_Exception, "CudaNdarray_from_gpu_pointer: Couldn't make long object to loop over list/tuple");
3386             return NULL;
3387         }
3388         PyObject* dim_ = PyObject_GetItem(shapes, idx);
3389         PyObject* strd_ = PyObject_GetItem(strides, idx);
3390         if (!PyInt_Check(dim_))
3391         {
3392             PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: shapes[%d] is not an int", i);
3393             return NULL;
3394         }
3395         if (!PyInt_Check(strd_))
3396         {
3397             PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: strides[%d] is not an int", i);
3398             return NULL;
3399         }
3400         int dim = PyInt_AsLong(dim_);
3401         int strd = PyInt_AsLong(strd_);
3402         CudaNdarray_set_stride((CudaNdarray *)rval, i, strd);
3403         CudaNdarray_set_dim((CudaNdarray *)rval, i, dim);
3404         Py_DECREF(idx);
3405         Py_DECREF(dim_);
3406         Py_DECREF(strd_);
3407     }
3408     if (verbose) printf("CudaNdarray_from_gpu_pointer normal return\n");
3409     return rval;
3410 }
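// A minimal sketch, assuming device_malloc succeeds: wrap an existing
// 2x3 float32 allocation as a CudaNdarray view (strides in elements).
#if 0
float *dev_ptr = (float *) device_malloc(6 * sizeof(float));
PyObject *args = Py_BuildValue("(N(ii)(ii)O)",
                               PyLong_FromSize_t((size_t) dev_ptr),
                               2, 3,        // shape
                               3, 1,        // strides, in elements
                               Py_None);    // no base object
PyObject *view = CudaNdarray_from_gpu_pointer(NULL, args);
Py_DECREF(args);
Py_XDECREF(view);
#endif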
3411 
3412 PyObject *
3413 CudaNdarray_Dot(PyObject* _unused, PyObject* args)
3414 {
3415     PyObject *l=NULL;
3416     PyObject *r=NULL;
3417     PyObject * rval = NULL;
3418 
3419     //args should consist of two python objects ("OO")
3420     if (! PyArg_ParseTuple(args, "OO", &l, &r))
3421         return NULL;
3422 
3423     if (!CudaNdarray_Check(l) || !CudaNdarray_Check(r))
3424     {
3425         PyErr_SetString(PyExc_TypeError, "CudaNdarray arguments required");
3426         goto CudaNdarray_dot_fail;
3427     }
3428     if (((CudaNdarray*)l)->nd != 2)
3429     {
3430         PyErr_SetString(PyExc_TypeError, "need 2d CudaNdarray arg for now");
3431         goto CudaNdarray_dot_fail;
3432     }
3433     if (((CudaNdarray*)r)->nd != 2)
3434     {
3435         PyErr_SetString(PyExc_TypeError, "need 2d CudaNdarray arg for now");
3436         goto CudaNdarray_dot_fail;
3437     }
3438     rval = CudaNdarray_New();
3439     if (!rval)
3440     {
3441         goto CudaNdarray_dot_fail;
3442     }
3443     int dims[2];
3444     dims[0] = CudaNdarray_HOST_DIMS((CudaNdarray*)l)[0];
3445     dims[1] = CudaNdarray_HOST_DIMS((CudaNdarray*)r)[1];
3446     if (CudaNdarray_alloc_contiguous((CudaNdarray*)rval, 2, dims))
3447     {
3448         goto CudaNdarray_dot_fail;
3449     }
3450     if (CudaNdarray_gemm(1.0, (CudaNdarray*)l, (CudaNdarray*)r, 0.0, (CudaNdarray*)rval))
3451     {
3452         goto CudaNdarray_dot_fail;
3453     }
3454 
3455     return rval;
3456 
3457     CudaNdarray_dot_fail:
3458     Py_XDECREF(rval);
3459     return NULL;
3460 }
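// A minimal sketch, assuming A and B are 2d CudaNdarrays with compatible
// shapes; this is the C side of cuda_ndarray.dot(A, B).
#if 0
extern PyObject *A, *B;                      // assumed to exist
PyObject *args = Py_BuildValue("(OO)", A, B);
PyObject *C = CudaNdarray_Dot(NULL, args);   // C = A dot B on the device
Py_DECREF(args);
Py_XDECREF(C);
#endif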
3461 
3462 static PyObject *
3463 filter(PyObject* __unused_self, PyObject *args) // args = (data, broadcastable, strict, storage)
3464 {
3465     /*
3466      * TODO: document what this function should do in the various cases.
3467      * What is 'strict' supposed to mean in the context of this function?
3468      * What do we do with input that could be interpreted as matching the broadcastable pattern in strict vs. non-strict cases?
3469      *
3470      */
3471     PyObject *py_data=NULL;
3472     PyArrayObject * data = NULL;
3473     int strict = 0;
3474     PyObject * broadcastable=NULL;
3475     PyObject * storage=NULL;
3476     CudaNdarray * rval=NULL;
3477 
3478     // Python object references returned by PyArg_ParseTuple are borrowed references
3479     if (!PyArg_ParseTuple(args, "OOiO", &py_data, &broadcastable, &strict, &storage)) return NULL;
3480 
3481     if (!PyTuple_Check(broadcastable)){
3482         PyErr_SetString(PyExc_TypeError, "broadcastable arg should be a tuple of int.");
3483         return NULL;
3484     }
3485     Py_INCREF(py_data);
3486     Py_INCREF(broadcastable);
3487 
3488     CudaNdarray * cnda = (CudaNdarray*)py_data;
3489 
3490     if (strict || CudaNdarray_Check(py_data))
3491     {
3492         //TODO: support non-strict "casting" from a vt to the broadcastable/type/size that we need.
3493         if (!CudaNdarray_Check(py_data))
3494         {
3495             Py_DECREF(py_data);
3496             Py_DECREF(broadcastable);
3497             PyErr_SetString(PyExc_TypeError, "strict mode requires CudaNdarray");
3498             return NULL;
3499         }
3500         if (cnda->nd != PyTuple_Size(broadcastable))
3501         {
3502             Py_DECREF(py_data);
3503             Py_DECREF(broadcastable);
3504             PyErr_Format(PyExc_TypeError, "Wrong rank: %i vs %li", cnda->nd, (long)PyTuple_Size(broadcastable));
3505             return NULL;
3506         }
3507         for (int i = 0; i < cnda->nd; ++i)
3508         {
3509             if ((CudaNdarray_HOST_DIMS(cnda)[i] > 1) && PyInt_AsLong(PyTuple_GetItem(broadcastable, Py_ssize_t(i))))
3510             {
3511                 PyErr_Format(PyExc_TypeError, "Non-unit size in broadcastable vt dimension %i", i);
3512                 Py_DECREF(py_data);
3513                 Py_DECREF(broadcastable);
3514                 return NULL;
3515             }else if (CudaNdarray_HOST_DIMS(cnda)[i] == 1 && CudaNdarray_HOST_STRIDES(cnda)[i] != 0){
3516                 PyErr_Format(PyExc_TypeError, "Non-zero stride (%d) on dimension %d of size 1",
3517                              CudaNdarray_HOST_STRIDES(cnda)[i], i);
3518                 Py_DECREF(py_data);
3519                 Py_DECREF(broadcastable);
3520                 return NULL;
3521             }
3522         }
3523         Py_DECREF(broadcastable);
3524         return py_data;
3525     }
3526     else
3527     {
3528         data = (PyArrayObject*)PyArray_FromObject(py_data, REAL_TYPENUM, PyTuple_Size(broadcastable), PyTuple_Size(broadcastable));
3529         if (!data)
3530         {
3531             // error message already set by PyArray_FromObject
3532             Py_DECREF(py_data);
3533             Py_DECREF(broadcastable);
3534             return NULL;
3535         }
3536         for (int i = 0; i < PyArray_NDIM(data); ++i)
3537         {
3538             if ((PyArray_DIMS(data)[i] > 1) && PyInt_AsLong(PyTuple_GetItem(broadcastable, Py_ssize_t(i))))
3539             {
3540                 PyErr_Format(PyExc_TypeError, "Non-unit size in broadcastable dimension %i", i);
3541                 Py_DECREF(data);
3542                 Py_DECREF(py_data);
3543                 Py_DECREF(broadcastable);
3544                 return NULL;
3545             }
3546         }
3547         if (storage && CudaNdarray_Check(storage))
3548         {
3549             rval = (CudaNdarray*) storage;
3550             Py_INCREF(rval);
3551         }
3552         else
3553         {
3554             rval = (CudaNdarray*) CudaNdarray_New();
3555         }
3556         if (rval)
3557         {
3558             if (CudaNdarray_CopyFromArray(rval, data))
3559             {
3560                 Py_DECREF(rval);
3561                 rval = NULL;
3562             }
3563         }
3564         Py_DECREF(data);
3565         Py_DECREF(py_data);
3566         Py_DECREF(broadcastable);
3567         return (PyObject*)rval;
3568     }
3569 }
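// A minimal non-strict sketch, assuming `np_arr` is a 2d float32 numpy
// array: broadcastable=(0, 0) requests a plain 2d tensor, strict=0 allows
// conversion, and storage=None forces a freshly allocated CudaNdarray.
#if 0
extern PyObject *np_arr;                     // assumed to exist
PyObject *args = Py_BuildValue("(O(ii)iO)", np_arr, 0, 0, 0, Py_None);
PyObject *cnda = filter(NULL, args);
Py_DECREF(args);
Py_XDECREF(cnda);
#endif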
3570 
3571 // TODO: CudaNdarray_Dot and CudaNdarray_active_device_name follow different capitalization conventions.
3572 //       Pick one and standardize it; this file is already annoying enough to grep through.
3573 static PyMethodDef module_methods[] = {
3574     {"dimshuffle", CudaNdarray_Dimshuffle, METH_VARARGS, "Returns the dimshuffle of a CudaNdarray."},
3575     {"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."},
3576     {"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Select the gpu card to use; also usable to test whether CUDA is available."},
3577     {"select_a_gpu", CudaNdarray_select_a_gpu, METH_NOARGS, "Call this method if you want to select a GPU before gpu_init call and let the driver choose the GPU."},
3578     {"active_device_name", CudaNdarray_active_device_name, METH_VARARGS, "Get the name of the active device."},
3579     {"active_device_number", CudaNdarray_active_device_number, METH_VARARGS, "Get the number of the active device."},
3580     {"gpu_shutdown", CudaNdarray_gpu_shutdown, METH_VARARGS, "Shut down the gpu."},
3581     {"device_properties", GetDeviceProperties, METH_VARARGS, "Return a dictionary with the device properties."},
3582     {"mem_info", GetDeviceMemInfo, METH_NOARGS, "Return a tuple with the free and total memory on the gpu in bytes."},
3583 #if COMPUTE_GPU_MEM_USED
3584     {"theano_allocated", GetTheanoAllocInfo, METH_NOARGS, "Return the size in bytes of memory Theano currently have allocated on the gpu."},
3585 #endif
3586     {"ptr_int_size", CudaNdarray_ptr_int_size, METH_VARARGS, "Return a tuple with the size of gpu pointer, cpu pointer and int in bytes."},
3587     {"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable.  strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."},
3588     {"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"},
3589     {"from_gpu_pointer", CudaNdarray_from_gpu_pointer, METH_VARARGS, "Used to create a CudaNdarray from already allocated memory on the gpu.(example by pycuda)"},
3590     {"synchronize", CudaNdarray_synchronize, METH_NOARGS, "Used to synchronize the device"},
3591     {"cublas_v2", CudaNdarray_cublasv2, METH_NOARGS,
3592      "Used to know if this version of cuda_ndarray is linked with cublas v2."},
3593     {NULL, NULL, 0, NULL}  /* Sentinel */
3594 };
3595 
3596 #define CNDA_MOD_NAME "cuda_ndarray"
3597 #define CNDA_DOCSTRING "CUDA implementation of a numpy ndarray-like object."
3598 
3599 #if PY_MAJOR_VERSION == 3
3600 static struct PyModuleDef cuda_ndarray_moduledef =
3601 {
3602     PyModuleDef_HEAD_INIT,
3603     CNDA_MOD_NAME,
3604     CNDA_DOCSTRING,
3605     -1,     /* size of per-interpreter state of the module,
3606                or -1 if the module keeps state in global variables. */
3607     module_methods
3608 };
3609 
3610 PyMODINIT_FUNC
3611 PyInit_cuda_ndarray(void)
3612 #else
3613 PyMODINIT_FUNC
3614 initcuda_ndarray(void)
3615 #endif
3616 {
3617     import_array();
3618 
3619     PyObject* m;
3620 
3621     if (PyType_Ready(&CudaNdarrayType) < 0) {
3622 #if PY_MAJOR_VERSION == 3
3623         return NULL;
3624 #else
3625         return;
3626 #endif
3627     }
3628 
3629 #if PY_MAJOR_VERSION == 3
3630     m = PyModule_Create(&cuda_ndarray_moduledef);
3631 #else
3632     m = Py_InitModule3(CNDA_MOD_NAME, module_methods, CNDA_DOCSTRING);
3633 #endif
3634 
3635     if (m == NULL) {
3636 #if PY_MAJOR_VERSION == 3
3637         return NULL;
3638 #else
3639         return;
3640 #endif
3641     }
3642 
3643     Py_INCREF(&CudaNdarrayType);
3644     PyModule_AddObject(m, "CudaNdarray", (PyObject *)&CudaNdarrayType);
3645 #if COMPUTE_GPU_MEM_USED
3646     for(int i=0;i<TABLE_SIZE;i++){
3647         _alloc_size_table[i].ptr=NULL;
3648         _alloc_size_table[i].size=0;
3649     }
3650 #endif
3651     //    cublasInit();
3652     //if (0&&CUBLAS_STATUS_SUCCESS != cublasGetError())
3653     //{
3654         //std::cerr << "WARNING: initcuda_ndarray: error initializing device\n";
3655     //}
3656     if (0) //TODO: is this necessary?
3657     {
3658         int deviceId = 0; // TODO: what number goes here?
3659         cudaSetDevice(deviceId);
3660         cudaError_t err = cudaGetLastError();
3661         if( cudaSuccess != err)
3662         {
3663             std::cerr << "Error in SetDevice:" << cudaGetErrorString(err) << "\n";
3664         }
3665     }
3666 
3667 #if PY_MAJOR_VERSION == 3
3668     return m;
3669 #endif
3670 }
3671 
3672 
3673 //////////////////////////////////////
3674 //
3675 // C API FOR CudaNdarray
3676 //
3677 //////////////////////////////////////
3678 
3679 int
3680 CudaNdarray_Check(const PyObject * ob)
3681 {
3682     //TODO: doesn't work with inheritance
3683     return CudaNdarray_CheckExact(ob);
3684 }
3685 int
3686 CudaNdarray_CheckExact(const PyObject * ob)
3687 {
3688     return ((Py_TYPE(ob) == &CudaNdarrayType) ? 1 : 0);
3689 }
3690 
3691 PyObject *
3692 CudaNdarray_New(int nd)
3693 {
3694     CudaNdarray *self = (CudaNdarray *)CudaNdarrayType.tp_alloc(&CudaNdarrayType, 0);
3695     if (self == NULL)
3696     {
3697         PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_New failed to allocate self");
3698         return NULL;
3699     }
3700     CudaNdarray_null_init(self);
3701 
3702     if (nd == 0)
3703     {
3704         self->nd = 0;
3705     }
3706     else if (nd > 0)
3707     {
3708         if (CudaNdarray_set_nd(self, nd))
3709         {
3710             Py_DECREF(self);
3711             return NULL;
3712         }
3713     }
3714     ++_outstanding_mallocs[1];
3715     return (PyObject *)self;
3716 }
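// A minimal sketch of the C API above: allocate a fresh 2d CudaNdarray
// and give it a contiguous device buffer (CudaNdarray_alloc_contiguous is
// defined elsewhere in this file).
#if 0
CudaNdarray *x = (CudaNdarray *) CudaNdarray_New(2);
int dims[2] = {16, 32};
if (x && CudaNdarray_alloc_contiguous(x, 2, dims))
{
    Py_DECREF(x);   // allocation failed; the error is already set
    x = NULL;
}
#endif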
3717 
3718 
3719 
3720 //////////////////////////////
3721 //
3722 // Published helper functions
3723 //
3724 //////////////////////////////
3725 
3726 static int
3727 cublas_init()
3728 {
3729     cublasStatus_t err;
3730     err = cublasCreate(&handle);
3731     if (CUBLAS_STATUS_SUCCESS != err)
3732     {
3733         if(CUBLAS_STATUS_NOT_INITIALIZED == err)
3734             PyErr_SetString(PyExc_RuntimeError,
3735                             "cublasCreate() returned this error "
3736                             "'the CUDA Runtime initialization failed'");
3737         else if(CUBLAS_STATUS_ALLOC_FAILED == err)
3738             PyErr_SetString(PyExc_RuntimeError,
3739                             "cublasCreate() returned this error "
3740                             "'the resources could not be allocated'");
3741         else
3742             PyErr_SetString(PyExc_RuntimeError,
3743                             "unknow error during returned by cublasCreate()");
3744         return -1;
3745     }
3746     // Set the default stream as the one to execute on (default)
3747     cublasSetStream(handle, NULL);
3748     // Pointer to scalars are on the host (also default)
3749     cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
3750 #if CUDA_VERSION >= 5000
3751     // atomics can be used in kernels to speed up operations (not default)
3752     // This may lead to a slight variance from run to run in some operations
3753     cublasSetAtomicsMode(handle, CUBLAS_ATOMICS_ALLOWED);
3754 #endif
3755     return 0;
3756 }
3757 
3758 static void
3759 cublas_shutdown()
3760 {
3761     if (handle != NULL)
3762         cublasDestroy(handle);
3763     // No point in handling any errors here
3764     handle = NULL;
3765 }
3766 
3767 int
3768 CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
3769 {
3770     int err = CudaNdarray_alloc_contiguous(self, PyArray_NDIM(obj),
3771                                            PyArray_DIMS(obj));
3772     if (err) {
3773         return err;
3774     }
3775 
3776     int typenum = PyArray_TYPE(obj);
3777     if (typenum != REAL_TYPENUM)
3778     {
3779         PyErr_SetString(PyExc_TypeError, "can only copy from float arrays");
3780         return -1;
3781     }
3782     assert( 4 ==  PyArray_ITEMSIZE(obj));
3783     PyArrayObject * py_src = (PyArrayObject *)PyArray_ContiguousFromAny(
3784         (PyObject*)obj, typenum, self->nd, self->nd);
3785     if (!py_src) {
3786         return -1;
3787     }
3788     npy_intp py_src_size = PyArray_SIZE(py_src);
3789     void *py_src_data = PyArray_DATA(py_src);
3790     cudaError_t cerr;
3791     CNDA_BEGIN_ALLOW_THREADS;
3792     cerr = cudaMemcpy(self->devdata, py_src_data,
3793                       py_src_size * sizeof(real),
3794                       cudaMemcpyHostToDevice);
3795     //CNDA_THREAD_SYNC;  // unneeded because cudaMemcpy is blocking anyway
3796     CNDA_END_ALLOW_THREADS;
3797     if (cudaSuccess != cerr)
3798     {
3799         PyErr_Format(PyExc_RuntimeError,
3800                      "Cuda error '%s' while copying %lli data element"
3801                      " to device memory. str ptr=%p. dst ptr=%p",
3802                      cudaGetErrorString(cerr),
3803                      (long long)py_src_size,
3804                      py_src_data,
3805                      self->devdata);
3806         Py_DECREF(py_src);
3807         return -1;
3808     }
3809     Py_DECREF(py_src);
3810     return 0;
3811 }
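// A minimal sketch of a host-to-device round trip through the function
// above, assuming numpy has been initialized via import_array().
#if 0
npy_intp hdims[2] = {4, 4};
PyArrayObject *arr = (PyArrayObject *) PyArray_ZEROS(2, hdims, NPY_FLOAT32, 0);
CudaNdarray *dst = (CudaNdarray *) CudaNdarray_New(2);
if (arr && dst && CudaNdarray_CopyFromArray(dst, arr) == 0)
{
    // dst now holds a contiguous device copy of arr
}
Py_XDECREF(arr);
Py_XDECREF(dst);
#endif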
3812 
3813 PyObject *
3814 CudaNdarray_new_nd(int nd)
3815 {
3816     CudaNdarray * rval = (CudaNdarray*) CudaNdarray_New();
3817     if (!rval || CudaNdarray_set_nd(rval, nd))
3818     {
3819         Py_XDECREF(rval);
3820         rval = NULL;
3821     }
3822     return (PyObject *) rval;
3823 }
3824 
3825 
3826 /**
3827  * Initialize 'self' as a view of 'base', with memory storage 'data'
3828  */
3829 
3830 int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base)
3831 {
3832     if (self->data_allocated)
3833     {
3834         assert(self->devdata);
3835         if (device_free(self->devdata))
3836         {
3837             self->devdata = NULL;
3838             self->data_allocated = 0;
3839             return -1;
3840         }
3841     }
3842     // Get the original base object (base.base.base...)
3843     PyObject * orig_base = base;
3844     // base is not always a CudaNdarray. It can be a GpuArray from pycuda, ...
3845     while (orig_base && CudaNdarray_Check(orig_base) && ((CudaNdarray*) orig_base)->base)
3846     {
3847         // orig_base is itself a view; walk up to its base
3848         orig_base = ((CudaNdarray*) orig_base)->base;
3849     }
3850     //N.B. XDECREF and XINCREF are no-ops for NULL pointers
3851     if (self->base != orig_base)
3852     {
3853         Py_XDECREF(self->base);
3854         self->base = orig_base;
3855         Py_XINCREF(self->base);
3856     }
3857     self->data_allocated = 0;
3858     self->devdata = data;
3859     return 0;
3860 }
3861 
3862 static __global__ void k_copy_1d(const int N, const float * x, const int sx, float * y, const int sy)
3863 {
3864     for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += gridDim.x*blockDim.x)
3865     {
3866         y[i*sy] = x[i*sx];
3867     }
3868 }
3869 
3870 // N1 through N4 are the size of y
3871 static __global__ void k_copy_4d(const int N1,
3872         const int N2, const int N3, const int N4,
3873         const float * x, const int sx1, const int sx2, const int sx3,
3874         const int sx4,  float * y, const int sy1, const int sy2,
3875         const int sy3, const int sy4)
3876 {
3877     // These must be made int instead of unsigned int due to a bug in nvcc
3878     int bx = blockIdx.x;
3879     int by = blockIdx.y;
3880 
3881     for (int i = bx; i < N1; i += gridDim.x)
3882     {
3883         for (int j = by; j < N2; j += gridDim.y)
3884         {
3885             for (int k = threadIdx.x; k < N3; k += (int) blockDim.x)
3886             {
3887                 for (int l = threadIdx.y; l < N4; l += (int) blockDim.y)
3888                 {
3889                     y[i * sy1 + j * sy2 + k * sy3 + l * sy4] =
3890                         x[i * sx1 + j * sx2 + k * sx3 + l * sx4];
3891                 }
3892             }
3893         }
3894     }
3895 }
3896 
3897 //copy from other into self
3898 int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
3899                                     const CudaNdarray * other,
3900                                     bool unbroadcast)
3901 {
3902     int verbose = 0;
3903     if (verbose>1) fprintf(stderr, "CudaNdarray_CopyFromCudaNdarray\n");
3904 
3905     //standard elemwise size checks
3906     if (self->nd == -1)
3907     {
3908         PyErr_SetString(PyExc_TypeError,
3909                         "can't copy into un-initialized CudaNdarray");
3910         return -1;
3911     }
3912     CudaNdarray * new_other = NULL;
3913 
3914     if (self->nd < other->nd)
3915     {
3916         PyErr_Format(PyExc_NotImplementedError,
3917             "CudaNdarray_CopyFromCudaNdarray: The number of dimensions of the "
3918             "destination needs to be >= the number of dimensions of the "
3919             "source. Got %d and %d.", self->nd, other->nd);
3920         return -1;
3921     }
3922     else if (self->nd != other->nd)
3923     {
3924         new_other = (CudaNdarray *) CudaNdarray_View(other);
3925         int added_dims = self->nd - other->nd;
3926         int* pattern = (int*) alloca(self->nd * sizeof(int));
3927         for(int i = 0; i < added_dims; i++)
3928             pattern[i] = -1;
3929         for(int i = 0; i < other->nd; i++)
3930             pattern[i + added_dims] = i;
3931         CudaNdarray_dimshuffle(new_other, self->nd, pattern);
3932         other = new_other;
3933     }
3934     assert(self->nd == other->nd);
3935     //standard elemwise dim checks (also compute total size)
3936     unsigned int size = 1;
3937     unsigned int size_source = 1;
3938     for (int i = 0; i< self->nd; ++i)
3939     {
3940         if ((CudaNdarray_HOST_DIMS(self)[i] != CudaNdarray_HOST_DIMS(other)[i])
3941             && (1!=CudaNdarray_HOST_DIMS(other)[i] || !unbroadcast) )
3942         {
3943           PyErr_Format(PyExc_ValueError,
3944                        "CudaNdarray_CopyFromCudaNdarray:"
3945                        " need same dimensions for dim %d,"
3946                        " destination=%d, source=%d",
3947                        i, CudaNdarray_HOST_DIMS(self)[i],
3948                        CudaNdarray_HOST_DIMS(other)[i]);
3949           Py_XDECREF(new_other);
3950           return -1;
3951         }
3952         size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
3953         size_source *= (unsigned int) CudaNdarray_HOST_DIMS(other)[i];
3954     }
3955     if (0 == size)
3956     {
3957         Py_XDECREF(new_other);
3958         return 0; //nothing to copy, we're done.
3959     }
3960     if (CudaNdarray_is_c_contiguous(self) &&
3961         CudaNdarray_is_c_contiguous(other) &&
3962         size == size_source)
3963     {
3964         if (verbose)
3965             fprintf(stderr, "Copying contiguous vector with cublasScopy\n");
3966 
3967         cublasStatus_t err;
3968         err = cublasScopy(handle, size, CudaNdarray_DEV_DATA(other), 1,
3969                           CudaNdarray_DEV_DATA(self), 1);
3970         CNDA_THREAD_SYNC;
3971         Py_XDECREF(new_other);
3972         if (CUBLAS_STATUS_SUCCESS != err)
3973         {
3974             PyErr_SetString(PyExc_RuntimeError, "Error copying memory");
3975             return -1;
3976         }
3977         return 0;
3978     }
3979     //TODO: rewrite these copy operations to be more efficient
3980     //      See, for example the transpose example in the cuda_sdk.
3981     switch (self->nd)
3982     {
3983         case 0: // scalar
3984             {
3985                 // THIS CASE SHOULD NEVER HAPPEN BECAUSE SCALARS ARE ALWAYS C CONTIGUOUS
3986                 assert(0);
3987             }; break;
3988         case 1: // vector
3989             {
3990                 if (verbose) fprintf(stderr, "Copying non-contiguous vector\n");
3991                 if (verbose) fprint_CudaNdarray(stderr, other);
3992                 unsigned int n_blocks = std::min(size,
3993                                                  (unsigned int)NUM_VECTOR_OP_BLOCKS);
3994                 unsigned int n_threads = std::min(ceil_intdiv(size, n_blocks),
3995                                                   (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
3996                 k_copy_1d<<<n_blocks, n_threads>>>(size,
3997                                             CudaNdarray_DEV_DATA(other),
3998                                             CudaNdarray_HOST_STRIDES(other)[0],
3999                                             CudaNdarray_DEV_DATA(self),
4000                                             CudaNdarray_HOST_STRIDES(self)[0]);
4001                 CNDA_THREAD_SYNC;
4002                 cudaError_t err = cudaGetLastError();
4003                 if( cudaSuccess != err)
4004                 {
4005                     PyErr_Format(PyExc_RuntimeError,
4006                                  "Cuda error: %s: %s. (n_blocks=%i,"
4007                                  " n_threads_per_block=%i)\n", "k_copy_1d",
4008                                  cudaGetErrorString(err), n_blocks, n_threads);
4009                     Py_XDECREF(new_other);
4010                     return -1;
4011                 }
4012             }; break;
4013         case 4: // 4-tensor
4014             {
4015                 if (verbose)
4016                 {
4017                     if (0 != fprint_CudaNdarray(stderr, other))
4018                     {
4019                         Py_XDECREF(new_other);
4020                         return -1;
4021                     }
4022                 }
4023 
4024                 // The blocks implement the looping over the first two axes so
4025                 // this needs to be (N1, N2)
4026                 dim3 n_blocks( std::min(CudaNdarray_HOST_DIMS(self)[0],
4027                                         NUM_VECTOR_OP_BLOCKS),
4028                                std::min(CudaNdarray_HOST_DIMS(self)[1],
4029                                         NUM_VECTOR_OP_BLOCKS));
4030                 // For the threads, just make as many as possible
4031                 dim3 n_threads( std::min( (unsigned int) CudaNdarray_HOST_DIMS(self)[2],
4032                                  (unsigned int) NUM_VECTOR_OP_THREADS_PER_BLOCK),
4033                                 std::min( (unsigned int) CudaNdarray_HOST_DIMS(self)[3],
4034                                     (unsigned int) NUM_VECTOR_OP_THREADS_PER_BLOCK));
4035 
4036                 n_threads.x = std::min( (unsigned int) 32, (unsigned int) n_threads.x);
4037                 n_threads.y = std::min( n_threads.y, NUM_VECTOR_OP_THREADS_PER_BLOCK / n_threads.x);
4038 
4039                 k_copy_4d<<<n_blocks, n_threads>>>(
4040                                             // size of y
4041                                             (unsigned int) CudaNdarray_HOST_DIMS(self)[0], // N1
4042                                             (unsigned int) CudaNdarray_HOST_DIMS(self)[1], // N2
4043                                             (unsigned int) CudaNdarray_HOST_DIMS(self)[2], // N3
4044                                             (unsigned int) CudaNdarray_HOST_DIMS(self)[3], // N4
4045                                             CudaNdarray_DEV_DATA(other), // x
4046                                             // x strides
4047                                             CudaNdarray_HOST_STRIDES(other)[0],
4048                                             CudaNdarray_HOST_STRIDES(other)[1],
4049                                             CudaNdarray_HOST_STRIDES(other)[2],
4050                                             CudaNdarray_HOST_STRIDES(other)[3],
4051                                             CudaNdarray_DEV_DATA(self), // y
4052                                             // y strides
4053                                             CudaNdarray_HOST_STRIDES(self)[0],
4054                                             CudaNdarray_HOST_STRIDES(self)[1],
4055                                             CudaNdarray_HOST_STRIDES(self)[2],
4056                                             CudaNdarray_HOST_STRIDES(self)[3]
4057                                             );
4058                 CNDA_THREAD_SYNC;
4059                 cudaError_t err = cudaGetLastError();
4060                 if( cudaSuccess != err)
4061                 {
4062                     PyErr_Format(PyExc_RuntimeError,
4063                                  "Cuda error: %s: %s.",
4064                                  "k_copy_4d",
4065                                  cudaGetErrorString(err));
4066                     Py_XDECREF(new_other);
4067                     return -1;
4068                 }
4069             }; break;
4070         default:
4071             {
4072                 cudaError_t err = cudaGetLastError();
4073                 if(cudaSuccess != err){
4074                     PyErr_Format(PyExc_RuntimeError,
4075                                  "Unexpected Cuda error: %s: %s\n",
4076                                  "CudaNdarray_CopyFromCudaNdarray",
4077                                  cudaGetErrorString(err));
4078                     Py_XDECREF(new_other);
4079                     return -1;
4080                 }
4081 
4082                 if (verbose)
4083                     fprintf(stderr,
4084                             "Copying with default version unbroadcast=%d\n",
4085                             unbroadcast);
4086                 // call worker routine
4087                 unsigned int threads_per_block = std::min(size,
4088                                                           (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
4089                 unsigned int n_blocks = std::min(ceil_intdiv(size, threads_per_block),
4090                                                  (unsigned int)NUM_VECTOR_OP_BLOCKS);
4091                 const CudaNdarray * cuda_dims = other;
4092                 if(unbroadcast)
4093                     cuda_dims = self;
4094                 //copy from other into self
4095                 k_elemwise_unary_rowmajor_copy<<<n_blocks, threads_per_block>>>(
4096                         size,
4097                         (unsigned int)other->nd,
4098                         (const int *)CudaNdarray_DEV_DIMS(cuda_dims),
4099                         (const float*)CudaNdarray_DEV_DATA(other),
4100                         (const int *)CudaNdarray_DEV_STRIDES(other),
4101                         CudaNdarray_DEV_DATA(self),
4102                         (const int *)CudaNdarray_DEV_STRIDES(self));
4103                 CNDA_THREAD_SYNC;
4104                 err = cudaGetLastError();
4105                 if(verbose>1)
4106                     fprintf(stderr,
4107                             "INFO k_elemwise_unary_rowmaj (n_blocks=%i,"
4108                             " n_threads_per_block=%i)\n",
4109                             n_blocks, threads_per_block);
4110                 if( cudaSuccess != err)
4111                 {
4112                     //fprint_CudaNdarray(stderr, self);
4113                     //fprint_CudaNdarray(stderr, other);
4114                     PyErr_Format(PyExc_RuntimeError,
4115                                  "Cuda error: %s: %s. (n_blocks=%i,"
4116                                  " n_threads_per_block=%i)\n",
4117                                  "k_elemwise_unary_rowmajor_copy",
4118                                  cudaGetErrorString(err), n_blocks,
4119                                  threads_per_block);
4120                     Py_XDECREF(new_other);
4121                     return -1;
4122                 }
4123             }
4124     };
4125     Py_XDECREF(new_other);
4126     return 0;
4127 }
4128 
4129 int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C)
4130 {
4131     if (A->nd != 2)
4132     {
4133         PyErr_SetString(PyExc_ValueError, "non-matrix arg A to gemm");
4134         return -1;
4135     }
4136     if (B->nd != 2)
4137     {
4138         PyErr_SetString(PyExc_ValueError, "non-matrix arg B to gemm");
4139         return -1;
4140     }
4141     if (C->nd != 2)
4142     {
4143         PyErr_SetString(PyExc_ValueError, "non-matrix arg C to gemm");
4144         return -1;
4145     }
4146 
4147     // We must allow dimensions to be zeros.
4148     if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0])
4149             || (CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(C)[0])
4150             || (CudaNdarray_HOST_DIMS(B)[1] != CudaNdarray_HOST_DIMS(C)[1]))
4151     {
4152         PyErr_Format(PyExc_ValueError, "dimension mismatch in args to gemm (%i,%i)x(%i,%i)->(%i,%i)",
4153                 CudaNdarray_HOST_DIMS(A)[0],
4154                 CudaNdarray_HOST_DIMS(A)[1],
4155                 CudaNdarray_HOST_DIMS(B)[0],
4156                 CudaNdarray_HOST_DIMS(B)[1],
4157                 CudaNdarray_HOST_DIMS(C)[0],
4158                 CudaNdarray_HOST_DIMS(C)[1]);
4159         return -1;
4160     }
4161 
4162     // If matrix A or B has non-unit strides in both dimensions (with
4163     // sizes > 1), or a negative stride, we make a contiguous copy.
4164     CudaNdarray * A_new = NULL;
4165     CudaNdarray * B_new = NULL;
4166     if (((CudaNdarray_HOST_DIMS(A)[0] > 1)
4167          && (CudaNdarray_HOST_STRIDES(A)[0] != 1)
4168          && (CudaNdarray_HOST_DIMS(A)[1] > 1)
4169          && (CudaNdarray_HOST_STRIDES(A)[1] != 1))
4170         || (CudaNdarray_HOST_STRIDES(A)[0] < 0)
4171         || (CudaNdarray_HOST_STRIDES(A)[1] < 0))
4172     {
4173         A_new = (CudaNdarray*) CudaNdarray_Copy(A);
4174         if (!A_new)
4175             return -1;
4176         A = A_new;
4177     }
4178 
4179     if (((CudaNdarray_HOST_DIMS(B)[0] > 1)
4180          && (CudaNdarray_HOST_STRIDES(B)[0] != 1)
4181          && (CudaNdarray_HOST_DIMS(B)[1] > 1)
4182          && (CudaNdarray_HOST_STRIDES(B)[1] != 1))
4183         || (CudaNdarray_HOST_STRIDES(B)[0] < 0)
4184         || (CudaNdarray_HOST_STRIDES(B)[1] < 0))
4185     {
4186         B_new = (CudaNdarray*) CudaNdarray_Copy(B);
4187         if (!B_new)
4188         {
4189             // If A_new is NULL (meaning A was not copied), this is a no-op
4190             Py_XDECREF(A_new);
4191             return -1;
4192         }
4193         B = B_new;
4194     }
4195 
4196     // If matrix C has non-unit size and non-unit stride in both
4197     // dimensions, or negative strides, we can't operate. We cannot copy
4198     // C either, because the calling code will expect the result to be
4199     // in the original C container.
4200     if (((CudaNdarray_HOST_DIMS(C)[0] > 1)
4201          && (CudaNdarray_HOST_STRIDES(C)[0] != 1)
4202          && (CudaNdarray_HOST_DIMS(C)[1] > 1)
4203          && (CudaNdarray_HOST_STRIDES(C)[1] != 1))
4204         || (CudaNdarray_HOST_STRIDES(C)[0] < 0)
4205         || (CudaNdarray_HOST_STRIDES(C)[1] < 0))
4206     {
4207         PyErr_Format(PyExc_AssertionError,
4208                      "non-unit or negative stride in gemm arg C (%i,%i) of shape (%i,%i)",
4209                      CudaNdarray_HOST_STRIDES(C)[0],
4210                      CudaNdarray_HOST_STRIDES(C)[1],
4211                      CudaNdarray_HOST_DIMS(C)[0],
4212                      CudaNdarray_HOST_DIMS(C)[1]);
4213         Py_XDECREF(A_new);
4214         Py_XDECREF(B_new);
4215         return -1;
4216     }
4217 
4218     // the unit integer is divided logically into three fields of 4 bits
4219     // the lowermost 4 bits encode the stride pattern of the output
4220     // the next higher 4 bits encode the B variable
4221     // the highest 4 bits encode the A variable
4222     //
4223     // the stride pattern for each input is encoded as 0 for unit stride from col to col (Row major)
4224     //                                                 1 for unit stride from row to row (Col major)
4225 
4226     // a stride of 0 implies a dimension of 1 - so we can actually define
4227     // a stride of 0 as a 'unit' stride because gemm will never use it.
4228     // If a dimension is 0, its stride will not be used either, so we can
4229     // consider it a 'unit' stride too.
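    // For example, a row-major A and C with a column-major B give
    // unit == (0x0 << 8) | (0x1 << 4) | 0x0 == 0x010, which is
    // dispatched to the (T, N) case in the switch below.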
4230     int unit = 0;
4231     if (CudaNdarray_HOST_STRIDES(A)[1] == 1 || CudaNdarray_HOST_DIMS(A)[1] <= 1) {
4232         unit |= (0x0 << 8);
4233     } else if (CudaNdarray_HOST_STRIDES(A)[0] == 1 || CudaNdarray_HOST_DIMS(A)[0] <= 1) {
4234         unit |= (0x1 << 8);
4235     } else {
4236         unit |= (0x2 << 8);
4237     }
4238     if (CudaNdarray_HOST_STRIDES(B)[1] == 1 || CudaNdarray_HOST_DIMS(B)[1] <= 1) {
4239         unit |= (0x0 << 4);
4240     } else if (CudaNdarray_HOST_STRIDES(B)[0] == 1 || CudaNdarray_HOST_DIMS(B)[0] <= 1) {
4241         unit |= (0x1 << 4);
4242     } else {
4243         unit |= (0x2 << 4);
4244     }
4245     if (CudaNdarray_HOST_STRIDES(C)[1] == 1 || CudaNdarray_HOST_DIMS(C)[1] <= 1) {
4246         unit |= (0x0 << 0);
4247     } else if (CudaNdarray_HOST_STRIDES(C)[0] == 1 || CudaNdarray_HOST_DIMS(C)[0] <= 1) {
4248         unit |= (0x1 << 0);
4249     } else {
4250         unit |= (0x2 << 0);
4251     }
4252 
4253     /* create appropriate strides for malformed matrices that are row or column
4254      * vectors
4255      */
4256     int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0] : CudaNdarray_HOST_DIMS(A)[1];
4257     int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1] : CudaNdarray_HOST_DIMS(A)[0];
4258     int sb_0 = (CudaNdarray_HOST_DIMS(B)[0] > 1) ? CudaNdarray_HOST_STRIDES(B)[0] : CudaNdarray_HOST_DIMS(B)[1];
4259     int sb_1 = (CudaNdarray_HOST_DIMS(B)[1] > 1) ? CudaNdarray_HOST_STRIDES(B)[1] : CudaNdarray_HOST_DIMS(B)[0];
4260     int sc_0 = (CudaNdarray_HOST_DIMS(C)[0] > 1) ? CudaNdarray_HOST_STRIDES(C)[0] : CudaNdarray_HOST_DIMS(C)[1];
4261     int sc_1 = (CudaNdarray_HOST_DIMS(C)[1] > 1) ? CudaNdarray_HOST_STRIDES(C)[1] : CudaNdarray_HOST_DIMS(C)[0];
4262 
4263     float* a = CudaNdarray_DEV_DATA(A);
4264     float* b = CudaNdarray_DEV_DATA(B);
4265     float* c = CudaNdarray_DEV_DATA(C);
4266     cublasOperation_t N = CUBLAS_OP_N;
4267     cublasOperation_t T = CUBLAS_OP_T;
4268     //std::cerr << (unit/256) MOD 16 << (unit / 16) MOD 16 << unit MOD 16<< '\\n';
4269     // There should be no negative stride at that point
4270 #define CHK_STRIDE_SGEMM(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz) \
4271     if (sx == 0){sx = 1;}\
4272     if (sy == 0){sy = 1;}\
4273     if (sz == 0){sz = 1;}\
4274     if ((sx > 0) && (sy > 0) && (sz > 0)) { \
4275         err = cublasSgemm(handle, T0, T1, D0, D1, D2, &a, x, sx, y, sy, &b, z, sz); \
4276     } else { \
4277         PyErr_SetString(PyExc_AssertionError, "negative stride to sGemm");\
4278         Py_XDECREF(A_new);\
4279         Py_XDECREF(B_new);\
4280         return -1; \
4281     }
4282 
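    // Note for the switch below: cuBLAS is column-major, so a row-major
    // C = A*B is computed as its transpose, C^T = B^T * A^T; this is why
    // the row-major-C cases pass b before a.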
4283     cublasStatus_t err;
4284     switch(unit)
4285     {
4286         case 0x000: CHK_STRIDE_SGEMM(N, N, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_0, a, sa_0, beta, c, sc_0); break;
4287         case 0x100: CHK_STRIDE_SGEMM(N, T, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_0, a, sa_1, beta, c, sc_0); break;
4288         case 0x010: CHK_STRIDE_SGEMM(T, N, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_1, a, sa_0, beta, c, sc_0); break;
4289         case 0x110: CHK_STRIDE_SGEMM(T, T, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_1, a, sa_1, beta, c, sc_0); break;
4290         case 0x001: CHK_STRIDE_SGEMM(T, T, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_0, b, sb_0, beta, c, sc_1); break;
4291         case 0x101: CHK_STRIDE_SGEMM(N, T, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_1, b, sb_0, beta, c, sc_1); break;
4292         case 0x011: CHK_STRIDE_SGEMM(T, N, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_0, b, sb_1, beta, c, sc_1); break;
4293         case 0x111: CHK_STRIDE_SGEMM(N, N, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_1, b, sb_1, beta, c, sc_1); break;
4294         default: PyErr_Format(PyExc_ValueError, "some matrix has no unit stride (unit=%x)", unit);
4295                  Py_XDECREF(A_new); Py_XDECREF(B_new); return -1;
4296     };
4297     CNDA_THREAD_SYNC;
4298     Py_XDECREF(A_new);
4299     Py_XDECREF(B_new);
4300 
4301     if (CUBLAS_STATUS_SUCCESS != err)
4302     {
4303         PyErr_Format(PyExc_RuntimeError,
4304                      "cublasSgemm failed (%i) %s\n"
4305                      " unit=%x N=%d, c.dims=[%d %d], a.dim=[%d %d], alpha=%f, beta=%f, a=%p, b=%p, c=%p"
4306                      " sa_0=%d, sa_1=%d, sb_0=%d, sb_1=%d, sc_0=%d, sc_1=%d",
4307                      err,  cublasGetErrorString(err),
4308                      unit, N,
4309                      CudaNdarray_HOST_DIMS(C)[0],
4310                      CudaNdarray_HOST_DIMS(C)[1],
4311                      CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
4312                      alpha, beta, a, b, c, sa_0, sa_1, sb_0, sb_1, sc_0, sc_1);
4313 
4314         return -1;
4315     }
4316     return 0;
4317 }
4318 
4319 int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C)
4320 {
4321     /**
4322     * C <- alpha A B + beta C
4323     *    A : matrix
4324     *    B, C: vector
4325     *    alpha, beta: scalars
4326     */
4327     if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg to gemv"); return -1; }
4328     if (B->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg to gemv"); return -1; }
4329     if (C->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg to gemv"); return -1; }
4330 
4331     // We must allow dimensions to be zero.
4332     if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0])
4333             || (CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(C)[0]))
4334     {
4335         PyErr_Format(PyExc_ValueError, "dimension mismatch in args to gemv (%i,%i)x(%i)->(%i)",
4336                 CudaNdarray_HOST_DIMS(A)[0],
4337                 CudaNdarray_HOST_DIMS(A)[1],
4338                 CudaNdarray_HOST_DIMS(B)[0],
4339                 CudaNdarray_HOST_DIMS(C)[0]);
4340         return -1;
4341     }
4342 
4343     // If matrix A has non-unit size and non-unit stride in both
4344     // dimensions, or negative strides, we cannot operate, but we can
4345     // make a copy.
4346     CudaNdarray * A_new = NULL;
4347     CudaNdarray * B_new = NULL;
4348     if (((CudaNdarray_HOST_DIMS(A)[0] > 1)
4349          && (CudaNdarray_HOST_STRIDES(A)[0] != 1)
4350          && (CudaNdarray_HOST_DIMS(A)[1] > 1)
4351          && (CudaNdarray_HOST_STRIDES(A)[1] != 1))
4352         || (CudaNdarray_HOST_STRIDES(A)[0] < 0)
4353         || (CudaNdarray_HOST_STRIDES(A)[1] < 0))
4354     {
4355         A_new = (CudaNdarray*) CudaNdarray_Copy(A);
4356         if (!A_new)
4357             return -1;
4358         A = A_new;
4359     }
4360 
4361     // If vector B has a negative stride, we also have to make a copy.
4362     if (CudaNdarray_HOST_STRIDES(B)[0] < 0)
4363     {
4364         B_new = (CudaNdarray*) CudaNdarray_Copy(B);
4365         if (!B_new)
4366         {
4367             // If A was not copied, A_new is NULL, and Py_XDECREF does not
4368             // do anything
4369             Py_XDECREF(A_new);
4370             return -1;
4371         }
4372         B = B_new;
4373     }
4374 
4375     // cuBLAS does not handle negative strides as expected
4376     if (   (CudaNdarray_HOST_STRIDES(A)[0] < 0)
4377         || (CudaNdarray_HOST_STRIDES(A)[1] < 0))
4378     {
4379         PyErr_Format(PyExc_ValueError, "illegal strides in args to gemv (%i,%i)",
4380                 CudaNdarray_HOST_STRIDES(A)[0],
4381                 CudaNdarray_HOST_STRIDES(A)[1]);
4382         Py_XDECREF(A_new);
4383         Py_XDECREF(B_new);
4384         return -1;
4385     }
4386 
4387     /* create appropriate strides for malformed matrices that are row or column
4388      * vectors
4389      */
4390     int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0] : CudaNdarray_HOST_DIMS(A)[1];
4391     int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1] : CudaNdarray_HOST_DIMS(A)[0];
4392     int sb_0 = (CudaNdarray_HOST_DIMS(B)[0] > 1) ? CudaNdarray_HOST_STRIDES(B)[0] : 1;
4393     int sc_0 = (CudaNdarray_HOST_DIMS(C)[0] > 1) ? CudaNdarray_HOST_STRIDES(C)[0] : 1;
4394 
4395     if (sa_0 == 0) sa_0 = 1;
4396     if (sa_1 == 0) sa_1 = 1;
4397 
4398     int used_dot = 0;
4399 
4400     // This is important because we can end up not calling Sgemv at all
4401     cublasStatus_t err = CUBLAS_STATUS_SUCCESS;
4402     if (CudaNdarray_SIZE(C)) {
4403         // A is row vector & alpha==1 & beta==0 -> use cublasSdot
4404         if (CudaNdarray_HOST_DIMS(A)[0] == 1 && alpha==1.f && beta==0.f) {
4405             //replace this with custom inner product kernel with alpha and beta parameter?
4406             cublasPointerMode_t pmode;
4407             // set pointer mode so cuBLAS stores the result on the device, not in a host pointer
4408             cublasGetPointerMode(handle, &pmode);
4409             cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
4410             err = cublasSdot(
4411                     handle, CudaNdarray_HOST_DIMS(A)[1],
4412                     CudaNdarray_DEV_DATA(A), sa_1,
4413                     CudaNdarray_DEV_DATA(B), sb_0,
4414                     CudaNdarray_DEV_DATA(C));
4415             cublasSetPointerMode(handle, pmode);
4416             used_dot = 1;
4417         }
4418         // A is row-contiguous | row vector
4419         else if ((CudaNdarray_HOST_DIMS(A)[0] <= 1)
4420             || ((CudaNdarray_HOST_STRIDES(A)[0] == 1)
4421                 && (CudaNdarray_HOST_STRIDES(A)[1] > 0)))
4422         {
4423             err = cublasSgemv(handle, CUBLAS_OP_N,
4424                     CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
4425                     &alpha,
4426                     CudaNdarray_DEV_DATA(A), sa_1,
4427                     CudaNdarray_DEV_DATA(B), sb_0,
4428                     &beta,
4429                     CudaNdarray_DEV_DATA(C), sc_0);
4430         }
4431         // A is column-contiguous | column vector
4432         else if ((CudaNdarray_HOST_DIMS(A)[1] <= 1)
4433                 || ((CudaNdarray_HOST_STRIDES(A)[1] == 1)
4434                     && (CudaNdarray_HOST_STRIDES(A)[0] > 0)))
4435         {
4436             err = cublasSgemv(handle, CUBLAS_OP_T,
4437                     CudaNdarray_HOST_DIMS(A)[1], CudaNdarray_HOST_DIMS(A)[0],
4438                     &alpha,
4439                     CudaNdarray_DEV_DATA(A), sa_0,
4440                     CudaNdarray_DEV_DATA(B), sb_0,
4441                     &beta,
4442                     CudaNdarray_DEV_DATA(C), sc_0);
4443         }
4444         // A is not a vector and has malformed strides
4445         else
4446         {
4447             PyErr_Format(PyExc_AssertionError,
4448                          "Unexpected stride pattern in gemv: (%i, %i) x %i -> %i.\n"
4449                          "Shapes are: (%i, %i) x %i -> %i\n",
4450                          CudaNdarray_HOST_STRIDES(A)[0],
4451                          CudaNdarray_HOST_STRIDES(A)[1],
4452                          CudaNdarray_HOST_STRIDES(B)[0],
4453                          CudaNdarray_HOST_STRIDES(C)[0],
4454                          CudaNdarray_HOST_DIMS(A)[0],
4455                          CudaNdarray_HOST_DIMS(A)[1],
4456                          CudaNdarray_HOST_DIMS(B)[0],
4457                          CudaNdarray_HOST_DIMS(C)[0]);
4458             Py_XDECREF(A_new);
4459             Py_XDECREF(B_new);
4460             return -1;
4461         }
4462     }
4463 
4464     CNDA_THREAD_SYNC;
4465     Py_XDECREF(A_new);
4466     Py_XDECREF(B_new);
4467 
4468     if (CUBLAS_STATUS_SUCCESS != err)
4469     {
4470         if (!used_dot)
4471         {
4472             PyErr_Format(PyExc_RuntimeError,
4473                          "cublasSgemv failed (%i)",
4474                          err);
4475         } else {
4476         PyErr_Format(PyExc_RuntimeError,
4477                      "cublasSdot failed (%i)",
4478                      err);
4479         }
4480         return -1;
4481     }
4482     return 0;
4483 }
4484 
4485 int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y, CudaNdarray * A) {
4486     if (x->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg x to sger"); return -1; }
4487     if (y->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg y to sger"); return -1; }
4488     if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg A to sger"); return -1; }
4489 
4490     if ((CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(x)[0])
4491         || (CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(y)[0])) {
4492         PyErr_Format(PyExc_ValueError,
4493                      "dimension mismatch in args to sger (%i)x(%i)->(%i,%i)",
4494                      CudaNdarray_HOST_DIMS(x)[0],
4495                      CudaNdarray_HOST_DIMS(y)[0],
4496                      CudaNdarray_HOST_DIMS(A)[0],
4497                      CudaNdarray_HOST_DIMS(A)[1]);
4498         return -1;
4499     }
4500 
4501     int x_strides = CudaNdarray_HOST_STRIDES(x)[0];
4502     CudaNdarray * x_new = NULL;
4503     if(x_strides == 0){
4504         if(CudaNdarray_HOST_DIMS(x)[0] != 1){
4505             PyErr_Format(PyExc_RuntimeError,
4506                          "CudaNdarray_sger: Invalid input x (should not happen)."
4507                          " We received a CudaNdarray vector with a stride of 0"
4508                          " that has more than 1 element!");
4509             return -1;
4510         }
4511         x_strides = 1;
4512     } else if(x_strides < 0){
4513         x_new = (CudaNdarray*) CudaNdarray_Copy(x);
4514         if (!x_new) return -1; x = x_new;
4515         x_strides = CudaNdarray_HOST_STRIDES(x)[0];
4516     }
4517 
4518     int y_strides = CudaNdarray_HOST_STRIDES(y)[0];
4519     CudaNdarray * y_new = NULL;
4520     if(y_strides == 0){
4521         if(CudaNdarray_HOST_DIMS(y)[0] != 1){
4522             PyErr_Format(PyExc_RuntimeError,
4523                          "CudaNdarray_sger: Invalid input y (should not happen)."
4524                          " We received a CudaNdarray vector with a stride of 0"
4525                          " that has more than 1 element!");
4526             Py_XDECREF(x_new);
4527             return -1;
4528         }
4529         y_strides = 1;
4530     } else if(y_strides < 0){
4531         y_new = (CudaNdarray*) CudaNdarray_Copy(y);
4532         if (!y_new) { Py_XDECREF(x_new); return -1; } y = y_new;
4533         y_strides = CudaNdarray_HOST_STRIDES(y)[0];
4534     }
4535 
4536     // Create appropriate strides if A is a row or column vector
4537     int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0]
4538                                                  : CudaNdarray_HOST_DIMS(A)[1];
4539     int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1]
4540                                                  : CudaNdarray_HOST_DIMS(A)[0];
4541 
4542     // This is important because we can end up not calling Sger at all
4543     cublasStatus_t err = CUBLAS_STATUS_SUCCESS;
4544     if(CudaNdarray_SIZE(A)){
4545         // If A is in col-major
4546         if ((CudaNdarray_HOST_DIMS(A)[0] <= 1)
4547             || ((CudaNdarray_HOST_STRIDES(A)[0] == 1)
4548                 && (CudaNdarray_HOST_STRIDES(A)[1] > 0)))
4549         {
4550             err = cublasSger(handle, CudaNdarray_HOST_DIMS(x)[0], CudaNdarray_HOST_DIMS(y)[0], &alpha,
4551                        CudaNdarray_DEV_DATA(x), x_strides,
4552                        CudaNdarray_DEV_DATA(y), y_strides,
4553                        CudaNdarray_DEV_DATA(A), sa_1);
4554         }
4555         // Since Sger expects A in col-major, we invert x and y to fake this.
4556         else if ((CudaNdarray_HOST_DIMS(A)[1] <= 1)
4557                 || ((CudaNdarray_HOST_STRIDES(A)[1] == 1)
4558                     && (CudaNdarray_HOST_STRIDES(A)[0] > 0)))
4559         {
4560             err = cublasSger(handle, CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], &alpha,
4561                        CudaNdarray_DEV_DATA(y), y_strides,
4562                        CudaNdarray_DEV_DATA(x), x_strides,
4563                        CudaNdarray_DEV_DATA(A), sa_0);
4564         }
4565         // A has to be either c- or f-contiguous, with no negative strides
4566         else
4567         {
4568             PyErr_SetString(PyExc_NotImplementedError,
4569                             "non-contiguous A, or negative strides, in sger");
4570             Py_XDECREF(x_new);
4571             Py_XDECREF(y_new);
4572             return -1;
4573         }
4574     }
4575     CNDA_THREAD_SYNC;
4576     Py_XDECREF(x_new);
4577     Py_XDECREF(y_new);
4578 
4579     if (CUBLAS_STATUS_SUCCESS != err)
4580     {
4581         PyErr_Format(PyExc_RuntimeError,
4582                      "cublasSger failed (%i)",
4583                      err);
4584         return -1;
4585     }
4586 
4587     return 0;
4588 }
4589 
4590 /**
4591  *
4592  * Precondition:
4593  *  a->dim[d] == (dims_a[d]==0) ? (1 << log2_dims_a[d]) : dims_a[d]
4594  *  z->dim[d] == (z_str[d]==0) ? 1 : dims_a[d];
4595  *
4596  *  TODO: templatize this function to support other reductions.
4597  *  All that needs to change is the initial value for sum, and the reduction operator.
4598  */
4599 
4600 static __global__ void kernel_reduce_sum(const unsigned int size_z,
4601         const unsigned int nd,
4602         const int * dims_a,
4603         const int * log2_dims_a,
4604         const int * a_str,
4605         const float * a_data,
4606         const int * z_str,
4607         float * z_data)
4608 {
4609     const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
4610     const unsigned int numThreads = blockDim.x * gridDim.x;
4611 
4612     //structure data contains the strides and dimensions of both a and z
4613     // a_dim[0], a_dim[1], ... a_dim[nd-1],
4614     // a_log2dim[0], a_log2dim[1], ... a_log2dim[nd-1],
4615     // a_str[0], ... a_str[nd-1],
4616     // z_str[0], ... z_str[nd-1]
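    // a z stride of 0 in dimension d marks d as reduced over (the output
    // has size 1 there); the loops below rely on this convention.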
4617     extern __shared__ int structure_data[];
4618     for (unsigned int i = threadIdx.x; i < nd; i += blockDim.x)
4619     {
4620         structure_data[i+0*nd] = dims_a[i];
4621         structure_data[i+1*nd] = log2_dims_a[i];
4622         structure_data[i+2*nd] = a_str[i];
4623         structure_data[i+3*nd] = z_str[i];
4624     }
4625     dims_a = structure_data;
4626     log2_dims_a = structure_data + nd;
4627     a_str = structure_data + 2*nd;
4628     z_str = structure_data + 3*nd;
4629 
4630     __syncthreads(); //wait for all the shared structure to be loaded
4631 
4632     for (unsigned int i = idx; i < size_z; i += numThreads)
4633     {
4634         unsigned int ii = i;
4635         const float * a_data_i = a_data;
4636         float * z_data_i = z_data;
4637         unsigned int n_reduce_elements = 1;
4638         unsigned int n_reduce_dims = 0;
4639         unsigned int reduce_dim0 = nd-1;
4640 
4641 
4642         //In this loop, we locate the initial element of the slice that we'd like to reduce with this thread
4643         //  At the same time, we [re]calculate the size of that slice (n_reduce_elements)
4644         for (unsigned int d = 0; d < nd; ++d)
4645         {
4646             if (a_str[d] && (!z_str[d])) // this means 'd' is a dimension we are reducing over
4647             {
4648                 n_reduce_elements *= dims_a[d];
4649                 n_reduce_dims += 1;
4650                 reduce_dim0 = (d < reduce_dim0) ? d : reduce_dim0;
4651             }
4652             else //'d' is not a dimension that we are reducing over
4653             {
4654                 unsigned int pos_d;
4655                 if (log2_dims_a[d]==-1) //TODO: when things are working, use this switch
4656                 {
4657                     // this branch is not preferred,
4658                     // because the manual says integer mod and div operations are slow on the GPU
4659                     pos_d = (ii % dims_a[d]);
4660                     ii = (ii / dims_a[d]);
4661                 }
4662                 else
4663                 {
4664                     pos_d = (ii & ((1 << log2_dims_a[d])-1)); //take the lower log2_dims bits
4665                     ii = (ii >> log2_dims_a[d]);  //shift those lower log2_dims bits off of ii
4666                 }
4667                 a_data_i += pos_d * a_str[d];
4668                 z_data_i += pos_d * z_str[d];
4669             }
4670         }
4671         // now we've got pointers a_data_i and z_data_i into element 0 of the slice over which we are reducing
4672         // do a similar loop
4673 
4674         float sum = 0.0f;
4675         switch(n_reduce_dims)
4676         {
4677             case 0:
4678                 {
4679                     sum = a_data_i[0];
4680                 }
4681                 break;
4682             case 1:
4683                 {
4684                     const int stride = a_str[reduce_dim0];
4685                     const float * a_data_i_max = a_data_i + dims_a[reduce_dim0] * stride;
4686                     while (a_data_i != a_data_i_max)
4687                     {
4688                         sum += a_data_i[0];
4689                         a_data_i += stride;
4690                     }
4691                 }
4692                 break;
4693             case 2:
4694                 {
4695                     int rd = reduce_dim0+1;
4696                     for (; rd < nd; ++rd)
4697                     {
4698                         if (a_str[rd] && (!z_str[rd])) // this means 'rd' is a dimension we are reducing over
4699                             break;
4700                     }
4701                     const int stride0 = a_str[reduce_dim0];
4702                     const int stride1 = a_str[rd];
4703                     for (int ii = 0; ii < dims_a[rd]; ++ii)
4704                     {
4705                         const float * a_data_ri = a_data_i + ii * stride1;
4706                         const float * a_data_ri_max = a_data_ri + dims_a[reduce_dim0] * stride0;
4707                         while (a_data_ri != a_data_ri_max)
4708                         {
4709                             sum += a_data_ri[0];
4710                             a_data_ri += stride0;
4711                         }
4712                     }
4713                 };
4714                 break;
4715             default:
4716                 {
4717                     for (unsigned int reduce_i = 0; reduce_i < n_reduce_elements; ++reduce_i)
4718                     {
4719                         //TODO: optimize this loop to work more like theano's Elemwise.  It's serial code.
4720                         unsigned int reduce_ii = reduce_i;
4721                         const float * a_data_ri = a_data_i;
4722 
4723                         //This loop finds the element in the a slice to add.
4724                         for (unsigned int rd = reduce_dim0; rd < nd; ++rd)
4725                         {
4726                             unsigned int pos_d;
4727                             if (a_str[rd] && (!z_str[rd])) // this means 'rd' is a dimension we are reducing over
4728                             {
4729                                 if (log2_dims_a[rd]==-1)
4730                                 {
4731                                     // this branch is not preferred,
4732                                     // because the manual says integer mod and div operations are slow on the GPU
4733                                     pos_d = (reduce_ii % dims_a[rd]);
4734                                     reduce_ii = (reduce_ii / dims_a[rd]);
4735                                 }
4736                                 else
4737                                 {
4738                                     pos_d = (reduce_ii & ((1 << log2_dims_a[rd])-1)); //take the lower log2_dims bits
4739                                     reduce_ii = (reduce_ii >> log2_dims_a[rd]);  //shift those lower log2_dims bits off of ii
4740                                 }
4741                                 a_data_ri += pos_d * a_str[rd];
4742                             }
4743                         }
4744                         sum += a_data_ri[0];
4745                     }
4746                 }
4747         }
4748         z_data_i[0] = sum;
4749     }
4750 }
4751 
4752 static __global__ void kernel_reduce_sum_1011(
4753         const unsigned int d0,
4754         const unsigned int d1,
4755         const unsigned int d2,
4756         const unsigned int d3,
4757         const float *A, const int sA0, const int sA1, const int sA2, const int sA3,
4758         float * Z, const int sZ0)
4759 {
4760     const int threadCount = blockDim.x * blockDim.y * blockDim.z;
4761     const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
4762     extern __shared__ float buf[];
4763     float mysum = 0.0f;
4764 
4765     if (warpSize != 32)
4766     {
4767         return;  //TODO: set error code
4768     }
4769 
4770     for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
4771     {
4772         float Ai = A[i0 * sA0 + blockIdx.x * sA1 + threadIdx.y * sA2 + threadIdx.x * sA3];
4773         mysum += Ai;
4774     }
4775     buf[threadNum] = mysum;
4776     __syncthreads();
4777 
4778     // rest of function is handled by one warp
4779     if (threadNum < warpSize)
4780     {
4781         for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
4782         {
4783             mysum += buf[i];
4784         }
4785         buf[threadNum] = mysum;
4786         if (threadNum < 16)
4787         {
4788             //reduce so that threadNum 0 has the sum of everything
4789             if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
4790             if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
4791             if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
4792             if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
4793             if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
4794             if (threadNum == 0)
4795             {
4796                 Z[blockIdx.x*sZ0] = buf[0];
4797             }
4798         }
4799     }
4800 }
4801 /**
4802  * Dimensions in which self has size 1 and A has size > 1 are considered summing dimensions.
4803  * Dimensions in which both self and A have size > 1 are considered non-summing dimensions; in this case their sizes must be equal.
4804  */
4805 int
4806 CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A)
4807 {
4808     int verbose = 0;
4809     //check input rank
4810     if (self->nd != A->nd)
4811     {
4812         PyErr_Format(PyExc_TypeError, "Rank mismatch in CudaNdarray_sum: %i vs %i", self->nd, A->nd);
4813         return -1;
4814     }
4815     for (int i = 0; i < self->nd; ++i)
4816     {
4817         if ((CudaNdarray_HOST_DIMS(self)[i] > 1) && (CudaNdarray_HOST_DIMS(self)[i] != CudaNdarray_HOST_DIMS(A)[i]))
4818         {
4819             PyErr_Format(PyExc_TypeError, "Dimension mismatch in CudaNdarray_sum: self->dim[%i] == %i , A->dim[%i] = %i",
4820                     i, CudaNdarray_HOST_DIMS(self)[i], i, CudaNdarray_HOST_DIMS(A)[i]);
4821             return -1;
4822         }
4823     }
4824 
4825     int n_summations = (unsigned int)CudaNdarray_SIZE(self);
4826     if (verbose)
4827     {
4828         std::cerr << "reduce_sum n_summations " << n_summations  << '\n';
4829         std::cerr << "reduce_sum nd " << self->nd  << '\n';
4830         fprint_CudaNdarray(stderr, A);
4831         fprint_CudaNdarray(stderr, self);
4832     }
4833     if (0 && (A->nd == 4) //check to see if kernel_reduce_sum_1011 applies
4834             && (CudaNdarray_HOST_DIMS(self)[0] == 1)
4835             && (CudaNdarray_HOST_DIMS(self)[2] == 1)
4836             && (CudaNdarray_HOST_DIMS(self)[3] == 1)
4837        )
4838     {
4839         dim3 n_threads(CudaNdarray_HOST_DIMS(A)[3], CudaNdarray_HOST_DIMS(A)[2]);
4840         dim3 n_blocks(CudaNdarray_HOST_DIMS(A)[1]);
4841         while (n_threads.x * n_threads.y * n_threads.z < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.z;
4842         n_threads.z -= 1;
4843         if (n_threads.z > 64) n_threads.z = 64;
4844         if (n_threads.z)
4845         {
4846             if (verbose) printf("trying kernel_reduce_sum_1011\n");
4847             int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
4848             kernel_reduce_sum_1011<<<n_blocks, n_threads, n_shared>>>(
4849                     CudaNdarray_HOST_DIMS(A)[0],
4850                     CudaNdarray_HOST_DIMS(A)[1],
4851                     CudaNdarray_HOST_DIMS(A)[2],
4852                     CudaNdarray_HOST_DIMS(A)[3],
4853                     CudaNdarray_DEV_DATA(A),
4854                     CudaNdarray_HOST_STRIDES(A)[0],
4855                     CudaNdarray_HOST_STRIDES(A)[1],
4856                     CudaNdarray_HOST_STRIDES(A)[2],
4857                     CudaNdarray_HOST_STRIDES(A)[3],
4858                     CudaNdarray_DEV_DATA(self),
4859                     CudaNdarray_HOST_STRIDES(self)[1]);
4860             CNDA_THREAD_SYNC;
4861             if (cudaSuccess == cudaGetLastError()) return 0;
4862             if (verbose) printf("failed, falling back to kernel_reduce_sum\n");
4863         }
4864     }
4865 
4866     int n_threads_per_block = std::min(n_summations,
4867             NUM_VECTOR_OP_THREADS_PER_BLOCK);
4868     int n_blocks = std::min(ceil_intdiv(n_summations,n_threads_per_block),
4869             NUM_VECTOR_OP_BLOCKS);
4870     int n_structure_cache = self->nd * 4 * sizeof(int);
4871 
4872     if (verbose)
4873     {
4874         std::cerr << "n_blocks, n_threads_per_block " << n_blocks << ' ' << n_threads_per_block  << '\n';
4875     }
4876     assert (self->nd > 0);
4877     assert (self->nd == A->nd);
4878     kernel_reduce_sum<<<n_blocks, n_threads_per_block, n_structure_cache>>>(
4879             n_summations,
4880             self->nd,
4881             CudaNdarray_DEV_DIMS(A),
4882             CudaNdarray_DEV_LOG2DIMS(A),
4883             CudaNdarray_DEV_STRIDES(A),
4884             CudaNdarray_DEV_DATA(A),
4885             CudaNdarray_DEV_STRIDES(self),
4886             CudaNdarray_DEV_DATA(self));
4887     CNDA_THREAD_SYNC;
4888     cudaError_t err = cudaGetLastError();
4889     if (cudaSuccess != err)
4890     {
4891         PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kernel_reduce_sum", cudaGetErrorString(err));
4892         return -1;
4893     }
4894     return 0;
4895 }
4896 int
4897 CudaNdarray_reduce_prod(CudaNdarray * self, const CudaNdarray * A)
4898 {
4899     PyErr_SetString(PyExc_NotImplementedError, "");
4900     return -1;
4901 }
4902 int
4903 CudaNdarray_reduce_min(CudaNdarray * self, const CudaNdarray * A)
4904 {
4905     PyErr_SetString(PyExc_NotImplementedError, "");
4906     return -1;
4907 }
4908 int
4909 CudaNdarray_reduce_max(CudaNdarray * self, const CudaNdarray * A)
4910 {
4911     PyErr_SetString(PyExc_NotImplementedError, "");
4912     return -1;
4913 }
4914 
4915 
4916 /**
4917  *
4918  *  pattern is a permutation of [0, 1, ... self->nd-1] with the following twists:
4919  *  - an element 'd' of the permutation can be dropped if CudaNdarray_HOST_DIMS(self)[d] == 1
4920  *  - any number of '-1' elements can be in the pattern, and they will cause new ranks (with dim==1) to be inserted.
4921  *
4922  *  For example, if CudaNdarray_HOST_DIMS(self) == [4, 5, 1, 6], and pattern = [0,3,-1,-1, 1], then CudaNdarray_HOST_DIMS(self) would be modified to become:
4923  *     [4, 6, 1, 1, 5] (we dropped the original dim[2]==1, and inserted two singleton dimensions with the -1s).
4924  */
4925 int
4926 CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern)
4927 {
4928     //TODO: pass a workspace pointer to avoid the internal malloc
4929     int * newdims = (int *)malloc(sizeof(int) * (len + len + self->nd)); // we tack the 'taken' buffer on here to avoid a second malloc
4930     int * newstrides = newdims + len;
4931     int * dims_taken = newstrides + len;
4932     if (!newdims)
4933     {
4934         PyErr_SetString(PyExc_MemoryError, "CudaNdarray_dimshuffle: Failed to allocate temporary space");
4935         return -1;
4936     }
4937     for (int i = 0; i < self->nd; ++i)
4938     {
4939         dims_taken[i] = 0;
4940     }
4941     for (int i = 0; i < len; ++i)
4942     {
4943         if (pattern[i] < 0)
4944         {
4945             newdims[i] = 1;
4946             newstrides[i] = 0;
4947         }
4948         else if(dims_taken[pattern[i]])
4949         {
4950             PyErr_Format(PyExc_ValueError, "CudaNdarray_dimshuffle: invalid pattern for CudaNdarray_dimshuffle. You used dimension %d multiple times",
4951                          pattern[i]);
4952             free(newdims);
4953             return -1;
4954         }
4955         else if (pattern[i] >= self->nd)
4956         {
4957             PyErr_Format(PyExc_ValueError, "CudaNdarray_dimshuffle: invalid pattern for CudaNdarray_dimshuffle. You asked for dimension %d, which does not exist in a %d-dim CudaNdarray",
4958                          pattern[i], self->nd);
4959             free(newdims);
4960             return -1;
4961         }
4962         else
4963         {
4964             newdims[i] = CudaNdarray_HOST_DIMS(self)[pattern[i]];
4965             newstrides[i] = CudaNdarray_HOST_STRIDES(self)[pattern[i]];
4966             dims_taken[pattern[i]] = 1;
4967         }
4968     }
4969     //Check that we did not drop any non-broadcastable dims
4970     for (int i = 0; i < self->nd; ++i)
4971     {
4972         if (dims_taken[i]==0 && CudaNdarray_HOST_DIMS(self)[i]!=1)
4973         {
4974             PyErr_SetString(PyExc_ValueError, "CudaNdarray_dimshuffle: You cannot drop a non-broadcastable dimension.");
4975             free(newdims);
4976             return -1;
4977         }
4978     }
4979     //swap this structure in for the one in self, and sync to the card
4980     if (CudaNdarray_set_nd(self, len))
4981     {
4982         free(newdims);
4983         return -1;
4984     }
4985     for (int i = 0; i < len; ++i)
4986     {
4987         CudaNdarray_set_dim(self, i, newdims[i]);
4988         CudaNdarray_set_stride(self, i, newstrides[i]);
4989     }
4990     if (cnda_copy_structure_to_device(self))
4991     {
4992         free(newdims);
4993         return -1;
4994     }
4995     free(newdims);
4996     return 0;
4997 }
4998 
4999 
5000 
5001 /**
5002  *
5003  *  This is the function that binds to Python.
5004  *  See CudaNdarray_dimshuffle to call from C.
5005  *  We use -1 to mean 'x' as in Tensor Dimshuffle.
5006  */
5007 PyObject *
5008 CudaNdarray_Dimshuffle(PyObject* _unused, PyObject* args)
5009 {
5010     PyObject * self = NULL;
5011     PyObject * pattern_object = NULL;
5012     int * pattern = NULL;
5013     PyObject * rval = NULL;
5014     int success = -1;
5015     //const int * dims = NULL;
5016 
5017     //args should consist of two python objects ("OO")
5018     if (! PyArg_ParseTuple(args, "OO", &self, &pattern_object))
5019         return NULL;
5020 
5021     if (!CudaNdarray_Check(self) )
5022     {
5023         PyErr_SetString(PyExc_TypeError, "First argument to cuda_ndarray.dimshuffle must be a CudaNdarray");
5024         return NULL;
5025     }
5026 
5027     //parse pattern_object into int * pattern
5028 
5029     Py_ssize_t pattern_dim =  PyObject_Length(pattern_object);
5030 
5031     if (pattern_dim < 0)
5032     {
5033         PyErr_SetString(PyExc_TypeError, "Couldn't get length of second argument to cuda_ndarray.dimshuffle");
5034         return NULL;
5035     }
5036 
5037     pattern = (int *) malloc( pattern_dim * sizeof(int));
5038     if (pattern == NULL) { PyErr_NoMemory(); return NULL; }
5039     for (Py_ssize_t i = 0; i < pattern_dim; i++)
5040     {
5041         PyObject * idx = PyLong_FromSsize_t(i);
5042 
5043         if (idx == NULL)
5044         {
5045             PyErr_SetString(PyExc_Exception, "Couldn't make long object to loop over list/tuple");
5046             goto CudaNdarray_dimshuffle_fail;
5047         }
5048 
5049         long elem_value = 0;
5050 
5051         PyObject * elem = PyObject_GetItem(pattern_object, idx);
5052 
5053         if (elem == NULL)
5054         {
5055             Py_XDECREF( elem);
5056             PyErr_SetString(PyExc_ValueError, "Second argument to dimshuffle must be a list or tuple of integers");
5057             goto CudaNdarray_dimshuffle_fail;
5058         }
5059 
5060         elem_value = PyInt_AsLong(elem);
5061 
5062         if (elem_value == -1 && PyErr_Occurred() )
5063         {
5064             Py_XDECREF(elem);
5065             PyErr_SetString(PyExc_ValueError, "Second argument to dimshuffle must be a list or tuple of integers");
5066             goto CudaNdarray_dimshuffle_fail;
5067         }
5068 
5069         pattern[i] = elem_value;
5070 
5071         Py_XDECREF( elem );
5072         Py_XDECREF( idx );
5073     }
5074 
5075     //allocate rval
5076     rval =  (PyObject *) CudaNdarray_View((CudaNdarray *) self);
5077 
5078     if (rval == NULL)
5079     {
5080         //CudaNdarray_View should have set the exception string
5081         goto CudaNdarray_dimshuffle_fail;
5082     }
5083 
5084 
5085     //printf("pattern_dim: %d\n",pattern_dim);
5086     //printf("pattern: %d %d\n",pattern[0],pattern[1]);
5087     //dims = CudaNdarray_HOST_DIMS( (CudaNdarray *) self);
5088     //printf("dims before: %d %d\n",dims[0],dims[1]);
5089 
5090     success = CudaNdarray_dimshuffle((CudaNdarray *) rval, pattern_dim, pattern);
5091 
5092     if (success != 0)
5093     {
5094         //Exception string should already be set by CudaNdarray_dimshuffle
5095         goto CudaNdarray_dimshuffle_fail;
5096     }
5097 
5098     free(pattern);
5099 
5100     return rval;
5101 
5102     CudaNdarray_dimshuffle_fail:
5103 
5104     if (pattern != NULL)
5105         free(pattern);
5106 
5107     Py_XDECREF(rval);
5108     return NULL;
5109 }
5110 
5111 
5112 int
5113 cnda_structure_size(int nd)
5114 {
5115     // dim0, dim1, ...
5116     // str0, str1, ...
5117     // log2(dim0), log2(dim1), ...
5118     return nd + nd + nd;
5119 }
5120 
5121 const int *
5122 CudaNdarray_HOST_DIMS(const CudaNdarray * self)
5123 {
5124     return self->host_structure;
5125 }
5126 
5127 const int *
5128 CudaNdarray_HOST_STRIDES(const CudaNdarray * self)
5129 {
5130     return self->host_structure + self->nd;
5131 }
5132 const int *
5133 CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self)
5134 {
5135     return self->host_structure + 2*self->nd;
5136 }
5137 
5138 int
5139 CudaNdarray_EqualAndIgnore(CudaNdarray *cnda1, CudaNdarray *cnda2, int ignoreSync, int ignoreBase)
5140 {
5141     int verbose = 0;
5142 
5143     if (!ignoreSync && cnda1->dev_structure_fresh != cnda2->dev_structure_fresh)
5144     {
5145         if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 1\n");
5146         return 0;
5147     }
5148 
5149     if (cnda1->nd != cnda2->nd)
5150     {
5151         if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 2\n");
5152         return 0;
5153     }
5154 
5155     for (int i=0; i < 2*cnda1->nd; i++)
5156     {
5157         if (cnda1->host_structure[i] != cnda2->host_structure[i])
5158         {
5159             if(verbose)
5160                 fprintf(stdout, "CUDANDARRAY_EQUAL : host_structure : %d, %d, %d\n", i, cnda1->host_structure[i], cnda2->host_structure[i]);
5161             return 0;
5162         }
5163     }
5164 
5165     if (!ignoreBase && cnda1->base != cnda2->base)
5166     {
5167         if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 4");
5168         return 0;
5169     }
5170     else if (cnda1->data_allocated != cnda2->data_allocated)
5171     {
5172         if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 5");
5173         return 0;
5174     }
5175     else if (cnda1->data_allocated && cnda1->devdata != cnda2->devdata)
5176     {
5177         if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 6");
5178         // no need to check devdata if data is not allocated
5179         return 0;
5180     }
5181 
5182     return 1;
5183 }
5184 
5185 
5186 int
5187 CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2)
5188 {
5189     return CudaNdarray_EqualAndIgnore(cnda1, cnda2, 0, 0);
5190 }
5191 
5192 int
5193 cnda_copy_structure_to_device(const CudaNdarray * self)
5194 {
5195     //If the device structure does not exist, create it.
5196     //We allocate it here because we do not need it often.
5197     //In fact, we need it so infrequently that we expect
5198     //most objects won't need it. Not allocating it saves
5199     //significant time when creating an object.
5200     //This sped up a benchmark by 8% with the gc.
5201     if (!self->dev_structure)
5202     {
5203         int struct_size = cnda_structure_size(self->nd);
5204         if (struct_size)
5205         {
5206             self->dev_structure = (int*)device_malloc(struct_size* sizeof(int));
5207             if (NULL == self->dev_structure)
5208             {
5209                 return -1;
5210             }
5211         }
5212     }
5213     if (cublasSetVector(cnda_structure_size(self->nd),
5214                         sizeof(int),
5215                         self->host_structure,
5216                         1,
5217                         self->dev_structure,
5218                         1) != CUBLAS_STATUS_SUCCESS)
5219     {
5220         PyErr_SetString(PyExc_RuntimeError, "error copying structure to device memory");
5221         return -1;
5222     }
5223     self->dev_structure_fresh = 1;
5224     return 0;
5225 }
5226 
5227 const int *
5228 CudaNdarray_DEV_DIMS(const CudaNdarray * self)
5229 {
5230     if (!self->dev_structure_fresh)
5231     {
5232         if (cnda_copy_structure_to_device(self))
5233             return NULL;
5234     }
5235     return self->dev_structure;
5236 }
5237 const int *
5238 CudaNdarray_DEV_STRIDES(const CudaNdarray * self)
5239 {
5240     if (!self->dev_structure_fresh)
5241     {
5242         if (cnda_copy_structure_to_device(self))
5243             return NULL;
5244     }
5245     return self->dev_structure + self->nd;
5246 }
5247 const int *
5248 CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self)
5249 {
5250     if (!self->dev_structure_fresh)
5251     {
5252         if (cnda_copy_structure_to_device(self))
5253             return NULL;
5254     }
5255     return self->dev_structure + 2*self->nd;
5256 }
5257 float *
5258 CudaNdarray_DEV_DATA(const CudaNdarray * self)
5259 {
5260     return self->devdata;
5261 }
5262 
5263 /**
5264  * Return the number of elements in the ndarray (product of the dimensions)
5265  */
5266 size_t
5267 CudaNdarray_SIZE(const CudaNdarray *self)
5268 {
5269     if (self->nd == -1) return 0;
5270     size_t size = 1;
5271     for (int i = 0; i < self->nd; ++i)
5272     {
5273         size *= CudaNdarray_HOST_DIMS(self)[i];
5274     }
5275     return size;
5276 }
5277 
5278 PyObject *
5279 CudaNdarray_SIZE_Object(const CudaNdarray *self, void *closure)
5280 {
5281     return PyInt_FromLong(CudaNdarray_SIZE(self));
5282 }
5283 
5284 int CudaNdarray_set_device_data(CudaNdarray * self, float * data, const CudaNdarray * base)
5285 {
5286     return CudaNdarray_set_device_data(self, data, (PyObject *) base);
5287 }
5288 
5289 PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self)
5290 {
5291     return PyBool_FromLong(CudaNdarray_is_c_contiguous(self));
5292 }
5293 
5294 int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
5295 {
5296     cudaError_t err = cudaGetLastError();
5297     if( cudaSuccess != err)
5298     {
5299         PyErr_Format(PyExc_RuntimeError,
5300                      "Cuda error: %s: %s.",
5301                      "fprint_CudaNdarray was called with an uncleared error",
5302                      cudaGetErrorString(err));
5303         return -1;
5304     }
5305     fprintf(fd, "CudaNdarray <%p, %p> nd=%i dev_structure_fresh=%d data_allocated=%d\n",
5306             self, self->devdata, self->nd, self->dev_structure_fresh, self->data_allocated);
5307     fprintf(fd, "\tHOST_DIMS:      ");
5308     for (int i = 0; i < self->nd; ++i)
5309     {
5310         fprintf(fd, "%i\t", CudaNdarray_HOST_DIMS(self)[i]);
5311     }
5312     fprintf(fd, "\n\tHOST_STRIDES: ");
5313     for (int i = 0; i < self->nd; ++i)
5314     {
5315         fprintf(fd, "%i\t", CudaNdarray_HOST_STRIDES(self)[i]);
5316     }
5317 
5318     if (self->dev_structure)
5319     {
5320         int data=0;
5321         fprintf(fd, "\n\tDEV_DIMS:      ");
5322         for (int i = 0; i < self->nd; ++i)
5323         {
5324             cublasGetVector(1, sizeof(int),
5325                             self->dev_structure+i, 1,
5326                             &data, 1);
5327             fprintf(fd, "%i\t", data);
5328         }
5329         fprintf(fd, "\n\tDEV_STRIDES: ");
5330         for (int i = 0; i < self->nd; ++i)
5331         {
5332             cublasGetVector(1, sizeof(int),
5333                             self->dev_structure + self->nd+i, 1,
5334                             &data, 1);
5335             fprintf(fd, "%i \t", data);
5336         }
5337         fprintf(fd, "\n");
5338     }
5339     else
5340     {
5341         fprintf(fd, "\n\tdev_structure not allocated\n");
5342     }
5343 
5344     err = cudaGetLastError();
5345     if( cudaSuccess != err)
5346     {
5347         PyErr_Format(PyExc_RuntimeError,
5348                      "Cuda error: %s: %s.",
5349                      "fprint_CudaNdarray",
5350                      cudaGetErrorString(err));
5351         return -1;
5352     }
5353     return 0;
5354 }
5355 
5356 
5357 int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
5358                             const int * dims, int fortran)
5359 {
5360     bool allocated = false;
5361     if (*arr == NULL)
5362     {
5363         // This allocates the metadata but not the data
5364         *arr = (CudaNdarray *) CudaNdarray_new_nd(nd);
5365         if (*arr == NULL)
5366             return -1;
5367         allocated = true;
5368     }
5369 
5370     if (CudaNdarray_alloc_contiguous(*arr, nd, dims, fortran))
5371     {
5372         if (allocated)
5373         {
5374             Py_DECREF(*arr);
5375             *arr = NULL;
5376         }
5377         return -1;
5378     }
5379     return 0;
5380 }
5381 
5382 
5383 /*
5384   Local Variables:
5385   mode:c++
5386   c-basic-offset:4
5387   c-file-style:"stroustrup"
5388   indent-tabs-mode:nil
5389   fill-column:79
5390   End:
5391 */
5392 // vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
5393 
===============================
In file included from mod.cu:10:0:
/home/vlaand/.local/lib/python3.6/site-packages/theano/sandbox/cuda/cuda_ndarray.cuh:17:0: warning: "PyString_Check" redefined
 #define PyString_Check PyUnicode_Check
 ^
In file included from /home/vlaand/.local/lib/python3.6/site-packages/theano/sandbox/cuda/cuda_ndarray.cuh:11:0,
                 from mod.cu:10:
/home/vlaand/anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/npy_3kcompat.h:71:0: note: this is the location of the previous definition
 #define PyString_Check PyBytes_Check
 ^
In file included from mod.cu:10:0:
/home/vlaand/.local/lib/python3.6/site-packages/theano/sandbox/cuda/cuda_ndarray.cuh:18:0: warning: "PyString_FromString" redefined
 #define PyString_FromString PyUnicode_FromString
 ^
In file included from /home/vlaand/.local/lib/python3.6/site-packages/theano/sandbox/cuda/cuda_ndarray.cuh:11:0,
                 from mod.cu:10:
/home/vlaand/anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/npy_3kcompat.h:73:0: note: this is the location of the previous definition
 #define PyString_FromString PyBytes_FromString
 ^
In file included from mod.cu:10:0:
/home/vlaand/.local/lib/python3.6/site-packages/theano/sandbox/cuda/cuda_ndarray.cuh:19:0: warning: "PyString_AsString" redefined
 #define PyString_AsString PyUnicode_AsUTF8
 ^
In file included from /home/vlaand/.local/lib/python3.6/site-packages/theano/sandbox/cuda/cuda_ndarray.cuh:11:0,
                 from mod.cu:10:
/home/vlaand/anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/npy_3kcompat.h:80:0: note: this is the location of the previous definition
 #define PyString_AsString PyBytes_AsString
 ^
In file included from mod.cu:10:0:
/home/vlaand/.local/lib/python3.6/site-packages/theano/sandbox/cuda/cuda_ndarray.cuh:20:0: warning: "PyString_FromStringAndSize" redefined
 #define PyString_FromStringAndSize PyUnicode_FromStringAndSize
 ^
In file included from /home/vlaand/.local/lib/python3.6/site-packages/theano/sandbox/cuda/cuda_ndarray.cuh:11:0,
                 from mod.cu:10:
/home/vlaand/anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/npy_3kcompat.h:74:0: note: this is the location of the previous definition
 #define PyString_FromStringAndSize PyBytes_FromStringAndSize
 ^
In file included from mod.cu:10:0:
/home/vlaand/.local/lib/python3.6/site-packages/theano/sandbox/cuda/cuda_ndarray.cuh:21:0: warning: "PyString_Size" redefined
 #define PyString_Size PyUnicode_GET_SIZE
 ^
In file included from /home/vlaand/.local/lib/python3.6/site-packages/theano/sandbox/cuda/cuda_ndarray.cuh:11:0,
                 from mod.cu:10:
/home/vlaand/anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/npy_3kcompat.h:82:0: note: this is the location of the previous definition
 #define PyString_Size PyBytes_Size
 ^
mod.cu(940): warning: pointless comparison of unsigned integer with zero
mod.cu(3000): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3003): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3005): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3008): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3010): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3013): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3016): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3019): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3021): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3024): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3026): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3029): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3031): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3034): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3037): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3040): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3042): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3045): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3047): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3050): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3074): warning: statement is unreachable
/usr/include/string.h: In function ‘void* __mempcpy_inline(void*, const void*, size_t)’:
/usr/include/string.h:652:42: error: ‘memcpy’ was not declared in this scope
   return (char *) memcpy (__dest, __src, __n) + __n;
                                          ^
ERROR (theano.sandbox.cuda): Failed to compile cuda_ndarray.cu: ('nvcc return status', 1, 'for cmd', 'nvcc -shared -O3 -m64 -Xcompiler -DCUDA_NDARRAY_CUH=mc72d035fdf91890f3b36710688069b2e,-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION,-fPIC,-fvisibility=hidden -Xlinker -rpath,/home/vlaand/.theano/compiledir_Linux-4.4--generic-x86_64-with-debian-stretch-sid-x86_64-3.6.1-64/cuda_ndarray -I/home/vlaand/.local/lib/python3.6/site-packages/theano/sandbox/cuda -I/home/vlaand/anaconda3/lib/python3.6/site-packages/numpy/core/include -I/home/vlaand/anaconda3/include/python3.6m -I/home/vlaand/.local/lib/python3.6/site-packages/theano/gof -L/home/vlaand/anaconda3/lib -o /home/vlaand/.theano/compiledir_Linux-4.4--generic-x86_64-with-debian-stretch-sid-x86_64-3.6.1-64/cuda_ndarray/cuda_ndarray.so mod.cu -lcublas -lpython3.6m -lcudart')
['nvcc', '-shared', '-O3', '-m64', '-Xcompiler', '-DCUDA_NDARRAY_CUH=mc72d035fdf91890f3b36710688069b2e,-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION,-fPIC,-fvisibility=hidden', '-Xlinker', '-rpath,/home/vlaand/.theano/compiledir_Linux-4.4--generic-x86_64-with-debian-stretch-sid-x86_64-3.6.1-64/cuda_ndarray', '-I/home/vlaand/.local/lib/python3.6/site-packages/theano/sandbox/cuda', '-I/home/vlaand/anaconda3/lib/python3.6/site-packages/numpy/core/include', '-I/home/vlaand/anaconda3/include/python3.6m', '-I/home/vlaand/.local/lib/python3.6/site-packages/theano/gof', '-L/home/vlaand/anaconda3/lib', '-o', '/home/vlaand/.theano/compiledir_Linux-4.4--generic-x86_64-with-debian-stretch-sid-x86_64-3.6.1-64/cuda_ndarray/cuda_ndarray.so', 'mod.cu', '-lcublas', '-lpython3.6m', '-lcudart']
/home/vlaand/.local/lib/python3.6/site-packages/theano/gpuarray/dnn.py:135: UserWarning: Your cuDNN version is more recent than Theano. If you encounter problems, try updating Theano or downgrading cuDNN to version 5.1.
  warnings.warn("Your cuDNN version is more recent than "
Using cuDNN version 7001 on context None
Mapped name None to device cuda3: GeForce GTX 1080 Ti (0000:83:00.0)

In [2]:
moduleName = "fivePointRegression"
repositoryPath = '/home/vlaand/IpythonNotebooks/05_emotion_fivepoint_nuig'

directory = '/home/vlaand/IpythonNotebooks/05_emotion_fivepoint_nuig'

emoNames = ['confident','excited','happy', 'surprised']
# emoNames = ['sadness', 'disgust', 'surprise', 'anger', 'fear', 'joy']

In [3]:
from nltk.tokenize import TweetTokenizer
import nltk.tokenize.casual as casual
from drevicko.twitter_regexes import cleanString, setupRegexes, tweetPreprocessor
import preprocess_twitter

def preprocess_tweet(text):    
    text = casual.reduce_lengthening(text)
    text = cleanString(setupRegexes('twitterProAna'),text)   
    text = ' '.join([span for notentity,span in tweetPreprocessor(text, ("urls", "users", "lists")) if notentity])
    text = text.replace('\t','')
    text = text.replace('< ','<').replace(' >','>')
    text = text.replace('):', '<sadface>').replace('(:', '<smile>')
    text = text.replace(" 't", "t")#.replace('#','')
    return text

def tokenise_tweet(text):
    text = preprocess_twitter.tokenize(text)
    text = preprocess_tweet(text)     
    return ' '.join(text.split())

tokenise_tweet.regexes = setupRegexes('twitterProAna')


imported regex as re
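
A quick smoke test of the preprocessing pipeline (the sample tweet is invented; the exact output depends on the drevicko regexes and the preprocess_twitter rules):

In [ ]:
print(tokenise_tweet("@bob I loooove this!!! http://t.co/xyz"))
# expect lowercased text with the user mention and URL replaced by placeholder
# tokens, and repeated letters reduced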

In [4]:
from collections import defaultdict
import numpy as np
import pandas as pd
import csv

def _load_tweets(filename = "tweet.txt"):

    tweets_text = []

    for line in open(filename, 'rb'):

        line = line.decode("utf-8")
#         tw_text = ' '.join(tokenise_tweet(line))
        tw_text = tokenise_tweet(' '.join(line.split()))  # pass a string, not a token list
        tweets_text.append(tw_text)
    
    return(tweets_text)

def _load_labels(filename = "labels.txt"):
    
    labels = []
    for line in open(filename, 'rb'):
        labels.append(line.decode('utf-8').replace('\n','').split(','))
    header, labels = labels[0], labels[1:]
    return(labels)

def _read_csv(filename = "data.csv", header=True):
    
    df = pd.read_csv(filepath_or_buffer=filename)    
    
    tweets_list = []
    labels_list = []
    conf_list = []
    
    for row in df.iterrows():
        tweets_list.append(tokenise_tweet(row[1]['tweet']))
        labels_list.append([row[1][emo] for emo in emoNames])         
        #conf_list.append([row[1][emo+'_conf']for emo in emoNames])
    return tweets_list,labels_list


# tweets = _load_tweets(filename = "/home/vlaand/IpythonNotebooks/cf-5point-data/tweet")
# labels = _load_labels(filename = "/home/vlaand/IpythonNotebooks/cf-5point-data/labels-lowConfidence.csv")

tweets,labels = _read_csv(filename = "/home/vlaand/IpythonNotebooks/cf-5point-data/data-full5.csv",header=True)
# print(len(tweets), 'tweets,',len(labels),'labels')

WORD FREQUENCIES


In [7]:
from collections import Counter
from stop_words import get_stop_words
WORD_FREQUENCY_TRESHOLD = 2

import os
from sklearn.externals import joblib


def checkFolder(filename):
    # create the parent directory of `filename` if it does not exist yet
    dir = os.path.dirname(filename)
    if not os.path.isdir(dir):
        os.makedirs(dir)

ifExists = checkFolder  # alias kept for older cells

def _get_unique_tokens(tweets):    
    return(Counter(token for tweet in tweets for token in tweet.split()))

def _save_unique_tokens(tokens, filename='wordFrequencies.dump'):
    
    checkFolder(filename)
    _ = joblib.dump(tokens, filename=filename, compress=9)
    

def _plot_word_frequencies(wordFrequencies, WORD_FREQUENCY_TRESHOLD = 3):
    
    freqs = []
    for t,c in wordFrequencies.items():
        freqs.append(c)
        
    q = 0
    for t,c in wordFrequencies.items():
        if(c >= WORD_FREQUENCY_TRESHOLD):
            q+=1
    print(q, len(wordFrequencies))
    %pylab inline
    semilogy(arange(len(freqs)),sorted(freqs))
    show()
    

def _reduce_text(tweets, LANGUAGE='en', WORD_FREQUENCY_TRESHOLD = 3):    

    stop_words = get_stop_words(LANGUAGE)

    tweets_reduced = []

    for tw in tweets:
        tweet_r = []
        for token in tw.split():
            # keep tokens that are frequent enough and are not stop words
            if wordFrequencies[token] >= WORD_FREQUENCY_TRESHOLD and token not in stop_words:
                tweet_r.append(token)

        tweets_reduced.append( ' '.join(tweet_r) )
        
    return(tweets_reduced)

wordFrequencies = _get_unique_tokens(tweets)
_save_unique_tokens(tokens=wordFrequencies,filename = '/home/vlaand/IpythonNotebooks/cf-5point-data/wordFrequencies.dump')
_plot_word_frequencies(wordFrequencies, WORD_FREQUENCY_TRESHOLD = WORD_FREQUENCY_TRESHOLD)
tweets_reduced = _reduce_text(tweets, WORD_FREQUENCY_TRESHOLD = WORD_FREQUENCY_TRESHOLD)


1854 5157
Populating the interactive namespace from numpy and matplotlib
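
The two numbers above are the count of tokens at or above the frequency threshold and the total vocabulary size; the same count can be recomputed directly:

In [ ]:
print(sum(1 for c in wordFrequencies.values() if c >= WORD_FREQUENCY_TRESHOLD), len(wordFrequencies))
# 1854 5157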

In [8]:
wordFrequencies2 = _get_unique_tokens(tweets_reduced)
_plot_word_frequencies(wordFrequencies2, WORD_FREQUENCY_TRESHOLD = WORD_FREQUENCY_TRESHOLD)


1744 1744
Populating the interactive namespace from numpy and matplotlib

N-GRAMS


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
NGRAM_VALUE = 4

print('NGRAM_VALUE =',NGRAM_VALUE)


NGRAM_VALUE = 4

Save ngramizer


In [10]:
def _save_ngramizer(filename = 'ngramizer.dump'):
    checkFolder(filename)
    _ = joblib.dump(ngramizer, filename=filename, compress=9)
    print(filename+' saved')
    
vectorizer = CountVectorizer(ngram_range = (1,NGRAM_VALUE),token_pattern=r'\b\w+\b', min_df=WORD_FREQUENCY_TRESHOLD)
ngramizer = vectorizer.fit(tweets_reduced)
vec = ngramizer.transform(tweets_reduced).toarray()
print(len(vec), len(vec[0]))
    
_save_ngramizer(filename = '/home/vlaand/IpythonNotebooks/cf-5point-data/5gramizer.dump')


2019 2429
/home/vlaand/IpythonNotebooks/cf-5point-data/5gramizer.dump saved

Load ngramizer


In [11]:
def _load_ngramizer(filename = 'ngramizer.dump'):
    checkFolder(filename)
    ngramizer = joblib.load(filename = filename)
    print(filename+' loaded')
    return ngramizer

# ngramizer = _load_ngramizer('/home/vlaand/IpythonNotebooks/senpy-plugins-NUIG/hashTagClassification/ngramizers/'+str(NGRAM_VALUE)+'gramizer.dump')
# ngramizer = _load_ngramizer('/home/vlaand/IpythonNotebooks/senpy-plugins-NUIG/hashTagClassification/ngramizers/ngramizer.dump')

vec = ngramizer.transform(tweets_reduced).toarray()
print(len(vec), len(vec[0]))


2019 2429
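
Illustrative use of the fitted ngramizer on unseen text (the sample sentence is made up; which counts are non-zero depends on the fitted vocabulary):

In [ ]:
sample_vec = ngramizer.transform(['so happy and excited today']).toarray()
print(sample_vec.shape, sample_vec.sum())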

NGRAM FREQUENCY


In [12]:
from natsort import natsorted
    
train_data_features = vec  # X_train_counts.toarray()
vocab = ngramizer.get_feature_names()
dist = np.sum(train_data_features, axis=0)
ngram_freq = {}

# For each, print the vocabulary word and the frequency
for tag, count in zip(vocab, dist):
    #print(tag, count)
    ngram_freq[tag]=count

semilogy(natsorted(list(ngram_freq.values()),reverse=True))
show()


WORD EMBEDDINGS


In [13]:
def _read_csv_we(filename = "data.csv"):
    
    embedding_index = {}

    for row in pd.read_csv(filepath_or_buffer=filename, sep = ' ', header=None).iterrows():
        word, coefs = row[1][0], np.asarray(row[1][1:])
        embedding_index[word] = coefs
        
    print('we vectors loaded from <'+filename+'>')
    print('\t'+str(len(embedding_index))+'\tentries')    
        
    return embedding_index


def _load_original_vectors(filename = 'wordvectors-glove.twitter.27B.100d.txt', sep = ' ', wordFrequencies = None):

    Dictionary, Indices  = {},{}
    i=1
    
    for line in open(filename, 'rb'): 
        values = line.decode('utf-8').split(sep)
        
        token = values[0]
        token_vector = np.array(values[1:], dtype = 'float32')   
        if(wordFrequencies):
            if(token in wordFrequencies):                
                Dictionary[token] = token_vector
                Indices.update({token:i})
                i+=1
        else:
            Dictionary[token] = token_vector
            Indices.update({token:i})
            i+=1
            
    print('we vectors loaded from <'+filename+'>')
    print('\t'+str(len(Dictionary))+' entries') 
    return(Dictionary, Indices)

def pretrainedEmbeddings(EmbeddingPath):
        embedding_index = {}
        
        with open(EmbeddingPath) as f:
            next(iter(f))
            embedding_wordsList = []
            for line in f:
                values = line.split(" ")
                word = values[0]
                coefs = np.asarray(values[1:])
                embedding_index[word] = coefs
                embedding_wordsList.append(word)
        
        return (embedding_index, embedding_wordsList)

def _texts_to_sequences(train_tweets):
    
    train_sequences = []
    for i,tweet in enumerate(train_tweets): 
        tw = []
        for token in tweet.split():
            # skip tokens that have no embedding index
            if token in Indices:
                tw.append(Indices[token])
        tw.extend([0]*( maxlen-len(tw)) )
        train_sequences.append(np.asarray(tw))
    return train_sequences

def _data_to_lists(dataTrain):    
    
    train_tweets, train_labels = [], []
    print('stacking data to lists')
    for i in dataTrain:
        scores = []
        for score in i[1:]:
            if np.isnan(score):
                scores.append( 0 )
                print('\tWarning: Nan value present in dataset')
            else:
                scores.append(score-2)
        train_labels.append(scores)
        train_tweets.append(i[0])
    print('data stacked to lists\n\t'+str(len(train_tweets))+' tweets\n\t'+str(len(train_labels))+' labels')
    return train_tweets, train_labels


def dataframe_to_lists(df):

    train_tweets, train_labels = [], []

    for row in df.iterrows():
        train_tweets.append(row[1][1])
        train_labels.append(row[1][3])
        
    return train_tweets, train_labels

def lists_to_vectors(train_tweets, train_labels, embedding_matrix=None):

    train_sequences = _texts_to_sequences(train_tweets)
    
    if embedding_matrix is None:
        embedding_matrix = np.zeros((len(Indices)+1, EMBEDDINGS_DIM))

    print('matrix created\n\t',embedding_matrix.shape)
    for (word, i) in Indices.items():
        embedding_vector = Dictionary.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector.astype(np.float)

    _X = sequence.pad_sequences(train_sequences, maxlen=maxlen)
    _y = np.array(train_labels)

    print(len(_X), 'train sequences loaded')
    print('\t',_X.shape,'\n\t', _y.shape)
    
    return _X, _y, embedding_matrix

def _get_maxlen(tweets):
    # length (in tokens) of the longest tweet
    return max(len(tw.split()) for tw in tweets)
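
A minimal, self-contained illustration of the indexing-and-padding scheme that _texts_to_sequences implements (toy values only; the real Indices and maxlen are set below):

In [ ]:
toy_indices = {'happy': 1, 'day': 2}
toy_maxlen = 5
seq = [toy_indices[t] for t in 'what a happy day'.split() if t in toy_indices]
seq = seq + [0] * (toy_maxlen - len(seq))
print(seq)  # [1, 2, 0, 0, 0] -- unknown tokens dropped, zero-padded to fixed length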

In [14]:
EMBEDDINGS_DIM = 100
WORD_FREQUENCY_TRESHOLD = 2

_path_wordembeddings = '/home/vlaand/data/Glove/glove.twitter.27B/glove.twitter.27B.'+str(EMBEDDINGS_DIM)+'d.txt'
# _path_wordembeddings = '/home/vlaand/data/Glove/glove.6B/glove.6B.100d.txt'

Dictionary, Indices = _load_original_vectors(
        filename = _path_wordembeddings, 
        sep = ' ',
        wordFrequencies = None)#wordFrequencies) # leave wordFrequencies=None for loading the entire WE file

Indices_reversed = {}
for key in Indices.keys():
    Indices_reversed.update({Indices[key]:key})


we vectors loaded from </home/vlaand/data/Glove/glove.twitter.27B/glove.twitter.27B.100d.txt>
	1193514 entries

In [15]:
meltTweets = []
meltTweets.extend(tweets) 

print('all tweets melted into list, ',len(meltTweets))
print("max tweet length: %d tokens" %(_get_maxlen(meltTweets)) )

def _get_unique_tokens(text):    
    return(Counter(token for sentence in text for token in sentence.split()) )

wordFrequencies = _get_unique_tokens(tweets) 
_plot_word_frequencies(wordFrequencies, WORD_FREQUENCY_TRESHOLD = WORD_FREQUENCY_TRESHOLD)


all tweets melted into list,  2019
max tweet length: 56 tokens
1854 5157
Populating the interactive namespace from numpy and matplotlib
/home/vlaand/anaconda3/lib/python3.6/site-packages/IPython/core/magics/pylab.py:161: UserWarning: pylab import has clobbered these variables: ['dist']
`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"

In [16]:
def plotSentenceLength(sentences):
    values = sorted([len(x.split()) for x in sentences],reverse=True)
    print(min(values), max(values), mean(values), median(values))
    line_0, = plt.plot(values,  label='Sentence length curve')
    plt.legend(handles=[line_0])
    plt.xlabel('sentences')
    plt.ylabel('sentence length')
    plt.title("Sentence length distribution")
    plt.grid(True)
    plt.show()    
    
plotSentenceLength(tweets)


1 56 14.4928182268 13.0

LSTM


In [17]:
def matthews_correlation(y_true, y_pred):
    y_pred_pos = K.round(K.clip(y_pred, 0, 1))
    y_pred_neg = 1 - y_pred_pos

    y_pos = K.round(K.clip(y_true, 0, 1))
    y_neg = 1 - y_pos

    tp = K.sum(y_pos * y_pred_pos)
    tn = K.sum(y_neg * y_pred_neg)

    fp = K.sum(y_neg * y_pred_pos)
    fn = K.sum(y_pos * y_pred_neg)

    numerator = (tp * tn - fp * fn)
    denominator = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    return numerator / (denominator + K.epsilon())

def pearson_score(y, y_pred):
    return pearsonr(y, y_pred)[0]

def spearman_score(y, y_pred):
    return spearmanr(y, y_pred)[0]


def regressionReport_lstm(trainedModel, X_test, y_test, print_architecture = False):  
    
    scores = []
    y_pred = trainedModel.predict(X_test)   # predict once, then score each emotion column
    
    for emo in range(len(emoNames)):
        y_pred1 = np.asarray([y_[emo] for y_ in y_pred])
        y_true1 = np.asarray([y_[emo] for y_ in y_test])
        r2,prs,spr = r2_score(y_true1, y_pred1), pearson_score(y_true1, y_pred1), spearman_score(y_true1, y_pred1)    

#         print("%s, %.4f, %.4f, %.4f" %  (emoNames[emo],r2,prs,spr))  
        scores.append([emoNames[emo],r2,prs,spr])
        
    if print_architecture:
        print(trainedModel.to_json())  
    
    return scores

def printScores(scores):
    print("emotion, R2, pearson, spearman")
    for row in scores:
        print("%s, %.4f, %.4f, %.4f" %  (row[0],row[1],row[2],row[3],))

In [18]:
EMOTION = 0
maxlen = 30

lstm_X, lstm_y, embedding_matrix = lists_to_vectors(tweets, labels)
lstm_y = np.asarray(labels)


matrix created
	 (1193515, 100)
/home/vlaand/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:106: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
2019 train sequences loaded
	 (2019, 30) 
	 (2019, 4)

In [19]:
train, test = train_test_split(list(range(len(lstm_y))), test_size=0.2, random_state=1337)

lstm_X_train, lstm_y_train = lstm_X[train], lstm_y[train]
lstm_X_test,  lstm_y_test  = lstm_X[test], lstm_y[test]

print('train data and label shape:', lstm_X_train.shape, lstm_y_train.shape)
print('test data and label shape:', lstm_X_test.shape, lstm_y_test.shape)


train data and label shape: (1615, 30) (1615, 4)
test data and label shape: (404, 30) (404, 4)

In [20]:
def oversampling(X, y):    
    ros = RandomOverSampler(ratio='auto')    
    y_fake = list(range(len(y)))
    X_resampled, y_resampled = ros.fit_sample(X, y_fake)
    y_resampled = np.asarray([y[_y] for _y in y_resampled])
    return X_resampled, y_resampled

# lstm_X_train_oversampled, lstm_y_train_oversampled = oversampling(X = lstm_X_train, y = lstm_y_train)#lstm_y_train)
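
For reference, this is what RandomOverSampler does when given real discrete labels (toy data; note that with one fake class per row, as above, every class already has a single sample, so nothing is duplicated):

In [ ]:
ros = RandomOverSampler(ratio='auto')
X_toy = np.array([[0.], [1.], [2.], [3.]])
y_toy = np.array([0, 0, 0, 1])
X_rs, y_rs = ros.fit_sample(X_toy, y_toy)
print(X_rs.shape, sorted(y_rs))  # 6 rows: the minority class is duplicated up to 3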

In [ ]:
hidden_dims1 = 50
hidden_dims2 = 25
hidden_dims3 = 4

model = Sequential()
model.add(Embedding(len(Indices)+1,  EMBEDDINGS_DIM, weights=[embedding_matrix],
                            input_length=maxlen, trainable=False))
model.add(Bidirectional(LSTM(EMBEDDINGS_DIM))) #dropout is same as regularisation

model.add(Dropout(0.2))
model.add(Dense(hidden_dims1, bias_regularizer=l2(0.01)))
model.add(Dense(hidden_dims2, bias_regularizer=l2(0.01))) 
model.add(Dense(hidden_dims3, activation='softsign'))

model.compile(loss='mean_absolute_error', optimizer='adam',metrics=['accuracy',matthews_correlation])

In [34]:
np.random.seed(1337)

batch_size = 32

EMBEDDINGS_DIM = 100
hidden_dims1 = 50
hidden_dims2 = 25
hidden_dims3 = 4
nb_epoch = 8

scores = []

lstmTrained = Sequential()
lstmTrained.add(Embedding(len(Indices)+1,  EMBEDDINGS_DIM, weights=[embedding_matrix],
                            input_length=maxlen, trainable=False))
lstmTrained.add(Bidirectional(LSTM(EMBEDDINGS_DIM, kernel_regularizer=l2(0.05))))
# lstmTrained.add(LSTM(EMBEDDINGS_DIM, kernel_regularizer=l2(0.05)))#, return_sequences=True, W_regularizer=l2(0.02))) 
lstmTrained.add(Dropout(0.2))
lstmTrained.add(Dense(hidden_dims1, bias_regularizer=l2(0.01)), )
lstmTrained.add(Dense(hidden_dims2, bias_regularizer=l2(0.01)), ) 
lstmTrained.add(Dense(hidden_dims3, activation='elu'))
lstmTrained.compile(loss='mae', optimizer='adam', metrics=['accuracy'])#, matthews_correlation])

lstmTrained.fit(lstm_X_train, lstm_y_train, batch_size=batch_size, epochs = nb_epoch, validation_split=None)


Epoch 1/8
1615/1615 [==============================] - 2s - loss: 4.8652 - acc: 0.4012     
Epoch 2/8
1615/1615 [==============================] - 2s - loss: 1.1876 - acc: 0.4204     
Epoch 3/8
1615/1615 [==============================] - 2s - loss: 0.5224 - acc: 0.4613     
Epoch 4/8
1615/1615 [==============================] - 2s - loss: 0.4198 - acc: 0.4873     
Epoch 5/8
1615/1615 [==============================] - 2s - loss: 0.4058 - acc: 0.4910     
Epoch 6/8
1615/1615 [==============================] - 2s - loss: 0.3897 - acc: 0.4935     
Epoch 7/8
1615/1615 [==============================] - 2s - loss: 0.3802 - acc: 0.5263     
Epoch 8/8
1615/1615 [==============================] - 2s - loss: 0.3773 - acc: 0.5115     
Out[34]:
<keras.callbacks.History at 0x7f69422f4400>

In [35]:
scores = regressionReport_lstm(trainedModel = lstmTrained,
                                        X_test=lstm_X_test, 
                                        y_test=lstm_y_test, 
                                        print_architecture = False)

printScores(scores)


emotion, R2, pearson, spearman
confident, 0.1748, 0.4283, 0.4163
excited, 0.3183, 0.5691, 0.5578
happy, 0.4485, 0.6698, 0.6327
surprised, 0.0549, 0.3024, 0.2975

In [36]:
# lstmTrained.to_json()


Out[36]:
'{"class_name": "Sequential", "config": [{"class_name": "Embedding", "config": {"name": "embedding_6", "trainable": false, "batch_input_shape": [null, 30], "dtype": "float32", "input_dim": 1193515, "output_dim": 100, "embeddings_initializer": {"class_name": "RandomUniform", "config": {"minval": -0.05, "maxval": 0.05, "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": 30}}, {"class_name": "Bidirectional", "config": {"name": "bidirectional_6", "trainable": true, "layer": {"class_name": "LSTM", "config": {"name": "lstm_6", "trainable": true, "return_sequences": false, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "implementation": 0, "units": 100, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": {"class_name": "L1L2", "config": {"l1": 0.0, "l2": 0.05000000074505806}}, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0}}, "merge_mode": "concat"}}, {"class_name": "Dropout", "config": {"name": "dropout_6", "trainable": true, "rate": 0.2}}, {"class_name": "Dense", "config": {"name": "dense_16", "trainable": true, "units": 50, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": {"class_name": "L1L2", "config": {"l1": 0.0, "l2": 0.009999999776482582}}, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dense", "config": {"name": "dense_17", "trainable": true, "units": 25, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": {"class_name": "L1L2", "config": {"l1": 0.0, "l2": 0.009999999776482582}}, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dense", "config": {"name": "dense_18", "trainable": true, "units": 4, "activation": "elu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}], "keras_version": "2.0.8", "backend": "theano"}'

In [301]:
np.random.seed(1337)

batch_size = 32

EMBEDDINGS_DIM = 100
hidden_dims1 = 50
hidden_dims2 = 25
hidden_dims3 = 1
nb_epoch = 8

def foo():
    # legacy single-output run: relies on the oversampled arrays from the
    # commented-out cell above and on the older sklearn.cross_validation
    # StratifiedKFold API (an iterable fold object)
    # print(emoNames[EMOTION])
    scores = []

    for fold in StratifiedKFold(n_folds = 5, shuffle=True, random_state=1337, y=lstm_y):
        train, test = fold

        lstmTrained = Sequential()
        lstmTrained.add(Embedding(len(Indices)+1,  EMBEDDINGS_DIM, weights=[embedding_matrix],
                                    input_length=maxlen, trainable=False))
        # lstmTrained.add(Bidirectional(LSTM(EMBEDDINGS_DIM)))
        lstmTrained.add(LSTM(EMBEDDINGS_DIM, activation='tanh', kernel_regularizer=l2(0.05)))#, return_sequences=True, W_regularizer=l2(0.02))) 
        # lstmTrained.add(Dropout(0.2))
        lstmTrained.add(Dense(hidden_dims1, bias_regularizer=l2(0.02)), )
        lstmTrained.add(Dense(hidden_dims2, bias_regularizer=l2(0.01)), ) 
        lstmTrained.add(Dense(hidden_dims3, activation='softsign'))
        # lstmTrained.add(Dense(hidden_dims3, activation='softmax'))
        lstmTrained.compile(loss='mean_absolute_error', optimizer='adam', metrics=['accuracy', matthews_correlation])

        lstmTrained.fit(lstm_X_train_oversampled[train], lstm_y_train_oversampled[train], batch_size=batch_size, epochs = nb_epoch, validation_split=None, verbose=False)

        scores.append(classificationReport_lstm(trainedModel = lstmTrained,  # earlier single-output report helper, superseded by regressionReport_lstm
                                                X_test=lstm_X_train_oversampled[test], 
                                                y_test=lstm_y_train_oversampled[test], 
                                                print_architecture = False))

    print("r2,pearson,spearman\n%.5f,%.5f,%.5f" % (np.mean([s[0] for s in scores]),np.mean([s[1] for s in scores]),np.mean([s[2] for s in scores]) ))


happy
0.7638, 0.8880, 0.8705
0.7916, 0.8929, 0.8770
0.7765, 0.8864, 0.8670
0.7736, 0.8828, 0.8660
0.7814, 0.8850, 0.8661
r2,pearson,spearman
0.77738,0.88701,0.86932

SVM

Vector transformations


In [83]:
def _vectors_similarity(v1 , v2):
    return( 1 - spatial.distance.cosine(v1,v2) )
def similarityVector(vector_, vectors_):
    resVector = np.asarray([_vectors_similarity(vector_ , v_) for v_ in vectors_])
    return np.asarray([np.max(resVector), np.mean(resVector), np.std(resVector), np.min(resVector)])
def compareTokenToSentence(leftToken, sentence):
    sentence_vectors = []
    for token in sentence:
        if token in Dictionary:
            sentence_vectors.append(Dictionary[token])
        else:
            token = token.replace('#','')
            if token in Dictionary:
                sentence_vectors.append(Dictionary[token])
    return similarityVector( Dictionary[leftToken], sentence_vectors)  
def capitalRatio(tweet):
        firstCap, allCap = 0, 0
        tokens = tweet.split()
        length = len(tokens)   # ratio over token count (the original divided by character count)
        if length==0:
            return np.array([0,0])
        for token in tokens:
            if( token.istitle() ):
                firstCap += 1
            elif( token.isupper() ):
                allCap += 1
        return np.asarray([firstCap/length,allCap/length]) 
def tweetToWordVectors(dictionary, tweet, fixedLength=False):
    output = []    
    if(fixedLength):
        for i in range(MAX_SEQUENCE_LENGTH):
            output.append(blankVector)
        for i,token in enumerate(tweet.split()):
            if token in Dictionary:
                output[i] = Dictionary[token]                
    else:
         for i,token in enumerate(tweet.lower().split()):
            if token in Dictionary:
                output.append(Dictionary[token])
            elif token.replace('#','') in Dictionary:
                output.append(Dictionary[token.replace('#','')])
    return output
def ModWordVectors(x, mod=True):
    if(len(x) == 0):       
        if(mod):
            return(np.zeros(EMBEDDINGS_DIM*3, dtype='float32'))
        else:
            return(np.zeros(EMBEDDINGS_DIM, dtype='float32'))        
    m =  np.matrix(x)
    if(mod):
        xMean = np.array(m.mean(0))[0]
        xMin = np.array(m.min(0))[0]
        xMax = np.array(m.max(0))[0]
        xX = np.concatenate((xMean,xMin,xMax))
        return xX
    else:
        return np.array(m.mean(0))[0]
def bindTwoVectors(x0,x1):
    return np.array(list(itertools.chain(x0,x1)),dtype='float32') 
def _bind_vectors(x):
    return np.concatenate(x)   
def myLog10(vector):
    # note: applies the natural log (np.log) to positive entries, despite the name
    for i,v in enumerate(vector):
        if v > 0:
            vector[i] = np.log(v)
    return vector            
def _convert_text_to_vector(tweets,  Dictionary, labels, ngramizer, lstmLayer=None, emotion = None):
    _X = []
    _y = []
    vec = ngramizer.transform(tweets).toarray()
    for i, t in enumerate(tweets):
        if lstmLayer is None:
            # fall back to aggregated word embeddings when no LSTM activations are given
            embeddingsVector_lstm = ModWordVectors(tweetToWordVectors(Dictionary,tweets[i]))
        else:
            embeddingsVector_lstm = lstmLayer[i]

#         capitalRatioVector = capitalRatio(dfs[st][emoNames[EMOTION]][i])
#         simVector = compareTokenToSentence(leftToken = emoNames[EMOTION], sentence = t)

        ngramVector = vec[i]
        _X.append( _bind_vectors((ngramVector, embeddingsVector_lstm)) )
        
        if emotion is None:
            _y.append( labels[i] )
        else:
            _y.append( labels[i][emotion] )
    return(np.asarray(_X), np.asarray(_y))


### Extract activations
# for j,i in enumerate(lstmTrained.layers):
#     print(j,i)

get_activations = function([lstmTrained.layers[0].input], lstmTrained.layers[1].output, allow_input_downcast=True)
# activations = get_activations(lstm_X_train) # same result as above
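
Illustrative check of ModWordVectors: the per-dimension mean, min and max are concatenated, so k-dim token vectors yield a 3k-dim tweet vector (toy vectors only):

In [ ]:
toy = [np.array([1., 2.]), np.array([3., 0.])]
print(ModWordVectors(toy, mod=True))  # mean|min|max -> [ 2.  1.  1.  0.  3.  2.]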

In [84]:
normalize_labels = True
svc_X, svc_y = [[],[],[],[]], [[],[],[],[]]

for j, emo in enumerate(emoNames):
    emonum = j
    svc_X[j], svc_y[j] = _convert_text_to_vector(
        tweets = tweets_reduced,
        labels = labels, Dictionary = Dictionary, ngramizer = ngramizer,  
        emotion = emonum,
        lstmLayer = get_activations(lstm_X)        
    )
    if normalize_labels:
        svc_y[j] /= 4
    
    print('emotion:', emoNames[emonum])
    print('\t', svc_X[j].shape, svc_y[j].shape)  
    
print("labels range: [%.1f : %.1f]" % (min(np.concatenate(svc_y)), max(np.concatenate(svc_y))))


emotion: confident
	 (2019, 2629) (2019,)
emotion: excited
	 (2019, 2629) (2019,)
emotion: happy
	 (2019, 2629) (2019,)
emotion: surprised
	 (2019, 2629) (2019,)
labels range: [0.0 : 1.0]

In [85]:
svc_X_train, svc_y_train = [emo[train] for emo in svc_X], [emo[train] for emo in svc_y]
svc_X_test,  svc_y_test  = [emo[test] for emo in svc_X], [emo[test] for emo in svc_y]

print('train data and label shape:', svc_X_train[0].shape, svc_y_train[0].shape)
print('test data and label shape:', svc_X_test[0].shape, svc_y_test[0].shape)


train data and label shape: (1615, 2629) (1615,)
test data and label shape: (404, 2629) (404,)

GridSearch


In [96]:
from sklearn.metrics import make_scorer

def pearson_score(y, y_pred):
    return pearsonr(y, y_pred)[0]

def spearman_score(y, y_pred):
    return spearmanr(y, y_pred)[0]

pearson_scorer = make_scorer(pearson_score)
spearman_scorer = make_scorer(spearman_score)

def getScores(estimator, x, y):
    y_pred = estimator.predict(x)
    return (r2_score(y, y_pred), 
            pearson_score(y, y_pred), 
            spearman_score(y, y_pred))

def my_scorer(estimator, x, y):
    r2, p, s = getScores(estimator, x, y)
#     print("%.4f, %.4f, %.4f" % (r2, p, s))
    return s
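
A small sanity check of the two correlation scorers (illustrative numbers): Spearman is rank-based, so a monotone but non-linear prediction still scores 1.0 while Pearson drops below it.

In [ ]:
y_true = np.array([0., 1., 2., 3., 4.])
y_hat  = y_true ** 2
print(pearson_score(y_true, y_hat), spearman_score(y_true, y_hat))  # ~0.96, 1.0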

In [100]:
warnings.simplefilter('ignore')

ESTIMATOR = 'LinearSVR'
cv_folds = 5

def _grid_search():     

    list_acc = []    
    list_val = []
                
    epsilon = 0.001    
    if(ESTIMATOR == 'LinearSVR'): 
        gamma_array = [1.0]
    elif(ESTIMATOR == 'SVR'):
        gamma_array = [0.001,0.01,0.1]
        
    c_array = [0.001,0.01,0.1]
    print("estimator, emotion, C, gamma, tol, score")

    for C in c_array:
        for gamma in gamma_array:              
            for tol in [1e-4]:
                cvs = cross_val_score(estimator = LinearSVR(C=C, tol=tol), X=svc_X[EMOTION], y=svc_y[EMOTION], cv=cv_folds, n_jobs=cv_folds, scoring=my_scorer) 
                meanScore = np.mean(np.asarray(cvs))
                
#                 if(ESTIMATOR == 'LinearSVR'): 
#                     svcTrained = LinearSVR(C=C, tol=tol) 
#                 elif(ESTIMATOR == 'SVR'): 
#                     svcTrained = SVR(C=C, tol=tol,gamma=gamma) 
                    
#                 svcTrained.fit(svc_X[EMOTION], svc_y[EMOTION])

#                 meanScore = r2_score(svc_y[EMOTION] , svcTrained.predict(svc_X[EMOTION]))
#                 prs = pearsonr(svc_y_test , svc_y_test_predict)[0]
#                 spr = spearmanr(svc_y_test , svc_y_test_predict)[0]

                list_val.append([moduleName,meanScore,ESTIMATOR, C, gamma,epsilon,tol,NGRAM_VALUE,EMBEDDINGS_DIM])
                list_acc.append(meanScore)
        
                print('%s, %s, %s, %s, %s, %.4f' %(ESTIMATOR, emoNames[EMOTION], str(C), str(gamma), str(tol), meanScore))     
        
    best = np.argmax(list_acc)    
#     print(list_val[best])
    out0 = {
        'C':list_val[best][3], 
        'gamma': list_val[best][4],
        'epsilon': list_val[best][5],
        'tol': list_val[best][6],
        'ngrams': list_val[best][7],
        'EMBEDDINGS_DIM': list_val[best][8],
        'score': list_val[best][1]
    }
    return {ESTIMATOR:{emoNames[EMOTION]:out0}}

    
def _combine_best_results(pool_output, ESTIMATOR):
    new_p = {}       
    for i in pool_output:
        new_p.update(i)   
        
    return new_p  

EMOTION = 0
pool_output = [_grid_search()]

print()
print(pool_output[0])


estimator, emotion, C, gamma, tol, score
LinearSVR, surprised, 0.001, 1.0, 0.0001, 0.3760
LinearSVR, surprised, 0.01, 1.0, 0.0001, 0.3231
LinearSVR, surprised, 0.1, 1.0, 0.0001, 0.1872

{'LinearSVR': {'surprised': {'C': 0.001, 'gamma': 1.0, 'epsilon': 0.001, 'tol': 0.0001, 'ngrams': 4, 'EMBEDDINGS_DIM': 100, 'score': 0.37597569469851272}}}
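
The same sweep could be expressed with scikit-learn's GridSearchCV; a minimal sketch (not run here), assuming the svc_X/svc_y features and my_scorer defined above:

In [ ]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.001, 0.01, 0.1], 'tol': [1e-4]}
gs = GridSearchCV(LinearSVR(), param_grid, scoring=my_scorer, cv=cv_folds, n_jobs=cv_folds)
gs.fit(svc_X[EMOTION], svc_y[EMOTION])
print(gs.best_params_, gs.best_score_)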

In [384]:
temp_params = _combine_best_results(pool_output, ESTIMATOR)

try:
    train_params[ESTIMATOR].update(temp_params[ESTIMATOR])
except (KeyError, NameError):
    # first run: train_params does not exist yet
    train_params = {}
    train_params.update(temp_params)
    
# train_params
# train_params = {'LinearSVR': {'confident': {'C': 0.01,   'EMBEDDINGS_DIM': 100,   'epsilon': 0.001,   'gamma': 1.0,   'ngrams': 4,   'score': 0.080144904381108911,   'tol': 0.0001},  'excited': {'C': 0.01,   'EMBEDDINGS_DIM': 100,   'epsilon': 0.001,   'gamma': 1.0,   'ngrams': 4,   'score': 0.20181175980742649,   'tol': 0.0001},  'happy': {'C': 0.01,   'EMBEDDINGS_DIM': 100,   'epsilon': 0.001,   'gamma': 1.0,   'ngrams': 4,   'score': 0.31076511419699682,   'tol': 0.0001},  'surprised': {'C': 0.001,   'EMBEDDINGS_DIM': 100,   'epsilon': 0.001,   'gamma': 1.0,   'ngrams': 4,   'score': -0.021849261405481914,   'tol': 0.0001}}}

SAVE CLASSIFIERS


In [177]:
# ESTIMATOR = 'SVR'
ESTIMATOR = 'LinearSVR'

if ESTIMATOR == 'SVR':
    svcTrained = SVR(C=train_params[ESTIMATOR][emoNames[EMOTION]]['C'], 
                 tol=train_params[ESTIMATOR][emoNames[EMOTION]]['tol'], 
                 gamma=train_params[ESTIMATOR][emoNames[EMOTION]]['gamma'],
                 verbose=True)
    
else:
    svcTrained = LinearSVR(C=train_params[ESTIMATOR][emoNames[EMOTION]]['C'], 
                 tol=train_params[ESTIMATOR][emoNames[EMOTION]]['tol'], 
                 verbose=True)
    
svcTrained.fit(svc_X[EMOTION], svc_y[EMOTION])
print(svcTrained)

       
# def saveModelFor(model, ESTIMATOR, path='/home/vlaand/IpythonNotebooks/wassa2017/'):
#     path = os.path.join(path,ESTIMATOR)
#     checkFolder(path)
#     filename = path+'.dump'
#     checkFolder(filename)
#     _ = joblib.dump(model, filename, compress=9)
#     print("model saved to <%s>" % filename)
    
# saveModelFor(svcTrained, ESTIMATOR=ESTIMATOR, path = os.path.join(repositoryPath, moduleName, 'classifiers'))


[LibLinear]LinearSVR(C=0.01, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=True)

In [178]:
from sklearn.svm import SVR, LinearSVR
from sklearn.externals import joblib
from multiprocessing import Pool
import os, sys

SEP = '/'
EXTENSION = '.dump'
ESTIMATOR = 'LinearSVR'
SAVE_DIR = '/home/vlaand/IpythonNotebooks/senpy-plugins-NUIG/fivePointRegression/classifiers/'

def ifExists(filename):
    # create the parent directory of `filename` if it does not exist yet
    dir = os.path.dirname(filename)
    if not os.path.isdir(dir):
        os.makedirs(dir)

def trainModelFor(EMOTION):
    
    y2 = []
    for ly in y:  
        if not np.isnan(ly[EMOTION]):        
            y2.append(float(ly[EMOTION])/4)
        else:
            y2.append(2.0)
        
    y2 = np.asarray(y2)    
    C = train_params[ESTIMATOR][ emoNames[EMOTION] ]['C'] 
    tol = train_params[ESTIMATOR][ emoNames[EMOTION] ]['tol'] 
                   
    if(ESTIMATOR == 'SVR'):
        gamma = train_params[ESTIMATOR][ emoNames[EMOTION] ]['gamma'] 
        svcTrained = SVR(C=C, gamma=gamma, tol=tol) 
        svcTrained.fit(X2, y2)
    elif(ESTIMATOR == 'LinearSVR'):
        svcTrained = LinearSVR(C=C, tol=tol)
        svcTrained.fit(X2, y2)
    else:
        print('Error: Classifier is not chosen')
        sys.exit(0)
        
    return(svcTrained)

def checkFolder(filename):
    dir = os.path.dirname(filename)
    if not os.path.isdir(dir):
        os.makedirs(dir)

def saveModelFor(model, EMOTION):
    path = SAVE_DIR +ESTIMATOR
    checkFolder(path)
    filename = path + SEP +str(emoNames[EMOTION]) + EXTENSION
    checkFolder(filename)
    _ = joblib.dump(model, filename, compress=9)
    print('model ' + filename + ' saved')
    #return {emoNames[EMOTION]:model}
    
def trainAndSave(emotion):
    model = trainModelFor(emotion)
    saveModelFor(model=model, EMOTION=emotion)
    return {emoNames[emotion]:model}

def _combine_train_results(pool_output, ESTIMATOR):
    new_p = {ESTIMATOR:{}}   
    for i in pool_output:
        new_p[ESTIMATOR].update(i)
        
    return new_p    

# X2 = np.asarray(X) 
# with Pool(processes = len(emoNames)) as p:
#     pool_output = p.map(trainAndSave, [i for i in  range(len(emoNames))])


# temp_models = _combine_train_results(pool_output, ESTIMATOR)

# try:
#     train_models.update(temp_models)
# except:
#     train_models = temp_models

In [227]:
l0,l1,l2 = [],[],[]
y0,y1,y2 = [],[],[]

for i,y_ in enumerate(y):
#     print(y_/4, y_pred[0][i], y_pred[1][i], y_pred[2][i])
    y_0,y_1,y_2 = y_pred[0][i],y_pred[1][i],y_pred[2][i]
    
    l0.append(y_0)    
    l1.append(y_1)
    l2.append(y_2)
        
    y0.append(y_[0]/4)
    y1.append(y_[1]/4)
    if not np.isnan(y_[2]/4):
        y2.append(y_[2]/4)
    else:
        print('!')
        y2.append(0.5)
#     print(i/4, j[0], j[1], j[2])
              
r2_score(y0,l0),r2_score(y1,l1),r2_score(y2,l2)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-227-c433b161cc70> in <module>()
      4 for i,y_ in enumerate(y):
      5 #     print(y_/4, y_pred[0][i], y_pred[1][i], y_pred[2][i])
----> 6     y_0,y_1,y_2 = y_pred[0][i],y_pred[1][i],y_pred[2][i]
      7 
      8     l0.append(y_0)

NameError: name 'y_pred' is not defined

Comparison HTC-VAD & FPR


In [40]:
models_path_fpr = '/home/vlaand/IpythonNotebooks/senpy-plugins-development/fivePointRegression/classifiers/SVR/'

models_fpr = {}
for emo in emoNames:
    models_fpr.update({emo:joblib.load(models_path_fpr+emo+'.dump')})
    
# models_fpr

In [9]:
models_path_htc = '/home/vlaand/IpythonNotebooks/senpy-plugins-NUIG/hashTagClassification/classifiers/LinearSVR/'

models_htc = {}
for emo in ['sadness', 'disgust', 'surprise', 'anger', 'fear', 'joy']:
    models_htc.update({emo:joblib.load(models_path_htc+emo+'.dump')})
    
# models_htc

In [215]:
models_path_htc_rbf = '/home/vlaand/IpythonNotebooks/senpy-plugins-NUIG/hashTagClassification/classifiers/SVC/'

models_htc_rbf = {}
for emo in ['sadness', 'disgust', 'surprise', 'anger', 'fear', 'joy']:
    models_htc_rbf.update({emo:joblib.load(models_path_htc_rbf+emo+'.dump')})
    
# models_htc_rbf

In [18]:
Dictionary
# wordFrequencies
# ngramizer

In [10]:
def _load_unique_tokens(filename = 'wordFrequencies.dump'):

#     filename = os.path.join(os.path.dirname(__file__),filename)
    return(joblib.load(filename))

wordFrequencies = _load_unique_tokens('/home/vlaand/IpythonNotebooks/senpy-plugins-NUIG/hashTagClassification/wordFrequencies.dump')

In [11]:
def _load_ngramizers():
    ngramizers = []
    for n_grams in [2,3,4]:        
        filename = os.path.join('/home/vlaand/IpythonNotebooks/senpy-plugins-NUIG/hashTagClassification/ngramizers/', str(n_grams)+'gramizer.dump')
        ngramizers.append( joblib.load(filename) )
        
    return ngramizers

ngramizers = _load_ngramizers()

In [121]:
# ngramizer = joblib.load('/home/vlaand/IpythonNotebooks/senpy-plugins-NUIG/hashTagClassification/ngramizers/4gramizer.dump')

In [14]:
import pandas as pd

dataTrain = _read_csv_data(filename = _path_dataset, header=True)    # helper from an earlier session (not defined in this notebook)
print('training data loaded from <'+_path_dataset+'>')

train_tweets = []
train_labels = []

for i in dataTrain:
    scores = []
    for score in i[1:]:
        if(np.isnan(score)):
            scores.append(2.0)
        else:
            scores.append(score)
    train_labels.append(scores)
    train_tweets.append(i[0])


training data loaded from </home/vlaand/IpythonNotebooks/cf-5point-data/data-full.csv>

In [15]:
_ngramVectors = []

for i in ngramizers:
    _ngramVectors.append(i.transform(train_tweets).toarray())

In [16]:
X_eval_2, y_eval = _convert_text_to_vector(tweets = train_tweets, tweet_original = train_tweets, Dictionary = Dictionary, labels = train_labels, ngramvec =_ngramVectors[0])
X_eval_3, y_eval = _convert_text_to_vector(tweets = train_tweets, tweet_original = train_tweets, Dictionary = Dictionary, labels = train_labels, ngramvec =_ngramVectors[1])
X_eval_4, y_eval = _convert_text_to_vector(tweets = train_tweets, tweet_original = train_tweets, Dictionary = Dictionary, labels = train_labels, ngramvec =_ngramVectors[2])

print(X_eval_2.shape,X_eval_3.shape,X_eval_4.shape)
print(y_eval.shape)


(2019, 16169) (2019, 18241) (2019, 19192)
(2019, 3)

In [ ]:
X = {'sadness':X_eval_4, 'disgust':X_eval_4, 'surprise':X_eval_4, 'anger':X_eval_2, 'fear':X_eval_4, 'joy':X_eval_3}

Predictions to VAD


In [24]:
y_pred_htc = {}

for emo in ['sadness', 'disgust', 'surprise', 'anger', 'fear', 'joy']:
    if emo == 'anger':
        y_pred_htc.update({emo:models_htc[emo].predict(X_eval_2)})
    elif emo == 'joy':
        y_pred_htc.update({emo:models_htc[emo].predict(X_eval_3)})
    else:
        y_pred_htc.update({emo:models_htc[emo].predict(X_eval_4)})
        
features_htc = {}
for dimension in ['V','A','D']:
    values = []
    for row in range(len(y_pred_htc['surprise'])):
        weights=[y_pred_htc[i][row] for i in y_pred_htc if (i != 'surprise')]
        if not all(v == 0 for v in weights):
            values.append(np.average([centroids[i][dimension] for i in y_pred_htc if (i != 'surprise')], weights=weights ))
        else:
            values.append(5.0)            
    features_htc.update({dimension:values})   
    
features_htc
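
A worked instance of the weighted mapping above (toy weights; centroids as defined in the cell further down):

In [ ]:
w = [0.1, 0.9]                                           # toy weights: sadness, joy
v = [centroids['sadness']['V'], centroids['joy']['V']]   # [2.21, 8.6]
print(np.average(v, weights=w))                          # (0.1*2.21 + 0.9*8.6) / 1.0 = 7.961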

In [ ]:
y_pred_htc_rbf = {}
for emo in ['sadness', 'disgust', 'surprise', 'anger', 'fear', 'joy']:
    print(emo)
    if emo == 'anger':
        y_pred_htc_rbf.update({emo:models_htc_rbf[emo].predict_proba(X_eval_2)})
    elif emo == 'joy':
        y_pred_htc_rbf.update({emo:models_htc_rbf[emo].predict_proba(X_eval_3)})
    else:
        y_pred_htc_rbf.update({emo:models_htc_rbf[emo].predict_proba(X_eval_4)})


1
sadness

In [232]:
features_htc_rbf = {}
for dimension in ['V','A','D']:
    print(dimension)
    values = []
    for row in range(len(y_pred_htc_rbf['surprise'])):
        weights=[y_pred_htc_rbf[i][row][1] for i in y_pred_htc_rbf if (i != 'surprise')]
#         if False in all(v == 0 for v in weights):
        values.append(np.average([centroids[i][dimension] for i in y_pred_htc_rbf if (i != 'surprise')], weights=weights ))
#         else:
#             values.append(5.0)            
    features_htc_rbf.update({dimension:values})   
    
# features_htc_rbf


V
A
D

In [119]:
y_pred_fpr = {}

for emo in emoNames:
    y_pred_fpr.update({emo:10*models_fpr[emo].predict(X_eval_4)})
    
y_pred_fpr


Out[119]:
{'confident': array([ 5.99784253,  6.03535934,  6.03450945, ...,  6.03278388,
         6.01940154,  6.04865325]),
 'excited': array([ 6.04721026,  6.13505352,  6.21766691, ...,  6.17532646,
         6.14233364,  6.31161089]),
 'happy': array([ 4.67336817,  4.32912562,  4.62225976, ...,  5.25173427,
         4.73891756,  5.04414936])}

In [155]:
y_annotated = {'V':y_eval.T[0]*10/4, 'A':y_eval.T[1]*10/4 ,'D':y_eval.T[2]*10/4}

In [ ]:
vad_mappings = {
     'confident': 'D',
     'excited': 'A',
     'happy': 'V',
     'D':'confident',
     'A':'excited',
     'V':'happy'
}

In [233]:
for i,dimension in enumerate(['V','A','D']):
    print( 
        r2_score(   features_htc[dimension],               y_annotated[dimension]               ),
        r2_score(   y_pred_fpr[vad_mappings[dimension]],   y_annotated[dimension]               ),
        r2_score(   features_htc_rbf[dimension],           y_annotated[dimension]               )
        )


-0.620680748396 -19.4727733044 -1.75307874052
-2.27833304462 -29.496527739 -14.8417019805
-1.56540913825 -22311.4709413 -9.50414297565

PLOT


In [280]:
%pylab inline

import numpy as np
import matplotlib.pyplot as plt
import natsort

tmp_range = 2018
dimension = 'A'

s = y_annotated[dimension]
order = sorted(range(len(s)), key=lambda k: s[k])

g0 = [i for i in range(2019)]
g1 = features_htc[dimension]
g2 = y_pred_fpr[vad_mappings[dimension]] 
g3 = features_htc_rbf[dimension]
g4 = y_annotated[dimension]
    
plt.grid(True)
# plt.plot(g1[0:tmp_range], 'r', g2[0:tmp_range], 'b', g3[0:tmp_range], 'g')
# plt.legend()
# plt.show()


line_0, = plt.plot(np.array(g1)[order], 'b.',  label='htc SVM')
line_1, = plt.plot(np.array(g2)[order], 'r.', label='fpr SVM')
line_2, = plt.plot(np.array(g3)[order], 'm.', label='htc SVM rbf')
line_3, = plt.plot(np.array(g4)[order], 'g.', label='annotated')
# plt.plot([0,l],[.3,.3])
# plt.plot([0,l],[-.3,-.3])
plt.legend(handles=[line_0, line_2, line_1, line_3],
           bbox_to_anchor=(1.02, .4, .65, .0), loc=3,
           ncol=2, mode="expand", borderaxespad=0.)
# plt.legend(bbox_to_anchor=(0.6, 0.05, 0.35, .102), loc=4,
#            ncol=1, mode="expand", borderaxespad=0.)

plt.ylabel('dimension: '+dimension)
plt.xlabel('tweets')
plt.title("Models Evaluation on 5point corpus")
plt.show()


Populating the interactive namespace from numpy and matplotlib

In [ ]:
# assumes `classifiers[estimator]` from a companion notebook; kept as a sketch
feature_set_ftr = {emo: clf.predict_proba(X[emo]) for emo,clf in zip(emoNames, classifiers[estimator])}

# feature_set_htc = {emo: clf.predict(X[emo]) for emo,clf in zip(['sadness', 'disgust', 'surprise', 'anger', 'fear', 'joy'], classifiers[estimator])}

In [22]:
centroids= {
                            "anger": {
                                "A": 6.95, 
                                "D": 5.1, 
                                "V": 2.7
                            }, 
                            "disgust": {
                                "A": 5.3, 
                                "D": 8.05, 
                                "V": 2.7
                            }, 
                            "fear": {
                                "A": 6.5, 
                                "D": 3.6, 
                                "V": 3.2
                            }, 
                            "joy": {
                                "A": 7.22, 
                                "D": 6.28, 
                                "V": 8.6
                            }, 
                            "sadness": {
                                "A": 5.21, 
                                "D": 2.82, 
                                "V": 2.21
                            },
                            "neutral": {
                                "A": 5.0, 
                                "D": 5.0, 
                                "V": 5.0
                            }
                        }

LSTM


In [250]:
import numpy as np
import pandas as pd
import csv

from keras.preprocessing import sequence
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Bidirectional, Dropout, LSTM
from keras.regularizers import l2

from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import StratifiedKFold

In [241]:
seed = 1337
np.random.seed(seed)

maxlen = 65
batch_size = 32
EMBEDDINGS_DIM = 100
epochs = 50

hidden_dims1 = 50
hidden_dims2 = 25
hidden_dims3 = 3

path = '/home/vlaand/IpythonNotebooks/cf-5point-data/'

_path_wordembeddings = '/home/vlaand/data/Glove/glove.twitter.27B/glove.twitter.27B.'+str(EMBEDDINGS_DIM)+'d.txt'
_path_dataset = path + "data-full5.csv"

Load training data and word embeddings


In [242]:
dataTrain = _read_csv_data(filename = _path_dataset, header=True)  

train_tweets, train_labels = _data_to_lists(dataTrain)


data loaded from </home/vlaand/IpythonNotebooks/cf-5point-data/data-full5.csv>
	2019 entries
stacking data to lists
data stacked to lists
	2019 tweets
	2019 labels

In [243]:
wordFrequencies = _get_unique_tokens(train_tweets)
# _plot_word_frequencies(wordFrequencies, WORD_FREQUENCY_TRESHOLD = WORD_FREQUENCY_TRESHOLD)

Dictionary, Indices = _load_original_vectors(
        filename = '/home/vlaand/data/Glove/glove.twitter.27B/glove.twitter.27B.'+str(EMBEDDINGS_DIM)+'d.txt', 
        sep = ' ', 
        wordFrequencies=None)

In [244]:
train_sequences = _texts_to_sequences(train_tweets)

embedding_matrix = np.zeros((len(Indices)+1, EMBEDDINGS_DIM))

print('matrix created\n\t',embedding_matrix.shape)
for (word, i) in Indices.items():
    embedding_vector = Dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector.astype(np.float)

X = sequence.pad_sequences(train_sequences, maxlen=maxlen)
y = np.array(train_labels)

print(len(X), 'train sequences loaded')
print('\t',X.shape,'\n\t', y.shape)


matrix created
	 (1193515, 100)
/home/vlaand/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:8: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
2019 train sequences loaded
	 (2019, 65) 
	 (2019, 4)

Evaluation / Cross validation

Kfold cross-validation


In [66]:
from multiprocessing import Pool
from sklearn.metrics import r2_score
import os

epochs = 20
n_splits = 5

def _cross_validation_parallel(_input):
    train, test = _input
    
    model = Sequential()
    model.add(Embedding(len(Indices)+1,  EMBEDDINGS_DIM, weights=[embedding_matrix],
                                input_length=maxlen, trainable=False))
    model.add(Bidirectional(LSTM(EMBEDDINGS_DIM)))
    #model.add(Dropout(0.2))
    model.add(Dense(hidden_dims1, kernel_regularizer=l2(0.01)), )
    model.add(Dense(hidden_dims2, kernel_regularizer=l2(0.01)), ) 
    model.add(Dense(hidden_dims3, activation='elu'))

    model.compile(loss='mae', optimizer='adam',metrics=['accuracy'])

    model.fit(X[train], y[train], batch_size=batch_size, epochs=epochs, validation_split=None, verbose=0)

    scores = []
    for i in [0,1,2]:
        score = r2_score( y[test][:, i] , model.predict(X[test])[:, i] )
        #score = r2_score([y_[i] for y_ in y[test]], [y_[i] for y_ in model.predict(X[test])])
        print("R2["+emoNames[i]+"]\t"+str(score))
        scores.append(score)        
    return scores

seed = 1337
np.random.seed(seed)

kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

with Pool(processes = n_splits) as p:
    cvscores = p.map(_cross_validation_parallel, [(train, test) for (train, test) in kfold.split(X, [yy[0] for yy in y])])


/home/vlaand/anaconda3/lib/python3.5/site-packages/sklearn/model_selection/_split.py:581: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of groups for any class cannot be less than n_splits=5.
  % (min_groups, self.n_splits)), Warning)
R2[confident]	0.12651473344
R2[excited]	0.183913593432
R2[happy]	0.325849621671
R2[confident]	0.114282643682
R2[excited]	0.280324333738
R2[happy]	0.33475915174
R2[confident]	0.0799940323227
R2[excited]	0.297428340739
R2[happy]	0.427131191301
R2[confident]	0.115380225651
R2[excited]	0.318680321749
R2[happy]	0.245129420541
R2[confident]	0.12899566559
R2[excited]	0.276562234678
R2[happy]	0.34378839584

In [68]:
print("%d-fold cross validation\n" % (n_splits))
for i in [0,1,2]:
    print("R2[%s] \t%.4f (+/- %.2f)" % (emoNames[i], np.mean(cvscores,axis=0)[i], np.std(cvscores,axis=0)[i]))


5-fold cross validation

R2[confident] 	0.1130 (+/- 0.02)
R2[excited] 	0.2714 (+/- 0.05)
R2[happy] 	0.3353 (+/- 0.06)
Earlier runs, for comparison:

5-fold cross validation
R2[confident] 	0.0758 (+/- 0.02)
R2[excited] 	0.2215 (+/- 0.03)
R2[happy] 	0.2808 (+/- 0.02)

3-fold cross validation
R2[confident] 	0.09 (+/- 0.14)
R2[excited] 	0.20 (+/- 0.11)
R2[happy] 	0.18 (+/- 0.08)

In [ ]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.cross_validation import StratifiedKFold  # legacy API (iterable fold object), used below

from sklearn.metrics import r2_score, f1_score, classification_report
# from skll.metrics import pearsonr, spearman
from scipy.stats import pearsonr, spearmanr

from multiprocessing import Pool, Manager
import os
import numpy as np

def _cross_validation_parallel(_input):
    train, test = _input
    
    model = Sequential()
    model.add(Embedding(len(Indices)+1,  EMBEDDINGS_DIM, weights=[embedding_matrix],
                                input_length=maxlen, trainable=False))
    model.add(Bidirectional(LSTM(EMBEDDINGS_DIM))) #dropout is same as regularisation
    model.add(Dropout(0.2))
    model.add(Dense(hidden_dims1, kernel_regularizer=l2(0.01)))
    model.add(Dense(hidden_dims2, kernel_regularizer=l2(0.01)))
    model.add(Dense(hidden_dims3, activation='softsign'))
    model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['accuracy', matthews_correlation])
        
    model.fit(X[train], y[train], batch_size=batch_size, epochs=nb_epoch, validation_split=None)
    
#     scores = model.evaluate(X[test], y[test], verbose=0,)
    y_test_predict = model.predict(X[test])
#     y_test_predict = np.reshape(y_test_predict, newshape=(len(y_test_predict),))
    
    scores =  [
                [r2_score(y_test_predict[:,emo], y[test][:,emo]) for emo in [0,1,2,3]], 
                [pearsonr(y_test_predict[:,emo], y[test][:,emo]) for emo in [0,1,2,3]], 
                [spearmanr(y_test_predict[:,emo], y[test][:,emo]) for emo in [0,1,2,3]]
              ]
#     try:
#         print("%s: %.2f" % (model.metrics_names[2], scores[1]))
#     except:
#         print('Error')
    return scores

nb_epoch = 80
n_splits = 5
hidden_dims1, hidden_dims2, hidden_dims3 = 50, 25, 4
np.random.seed(1337)

# with open('senpy-plugins-development/fivePointRegression/classifiers/LSTM/log.out', "w") as log_file:
#     log_file.write(str(cvscores)+'\n')
#     log_file.write("%.2f (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

In [ ]:
kfold = StratifiedKFold(n_folds=n_splits, shuffle=True, random_state=1337, y=y[:,0])

with Pool(processes = n_splits) as p:
    cvscores = p.map(_cross_validation_parallel, ((train, test) for (train, test) in kfold))


/home/vlaand/anaconda3/lib/python3.5/site-packages/sklearn/cross_validation.py:516: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=5.
  % (min_labels, self.n_folds)), Warning)
Epoch 1/80
 448/1618 [=======>......................] - ETA: 52s - loss: 1.5245 - acc: 0.4576 - matthews_correlation: 0.2409

In [196]:
print('\t\tR2\tsd\tpearson\tsd\tspearman')
for emo in [0,1,2,3]:

    r2 = [n[emo] for n in [fold[0] for fold in cvscores]]
    prs = [n[emo][0] for n in [fold[1] for fold in cvscores]]
    spr = [n[emo][0] for n in [fold[2] for fold in cvscores]]

    print("[%8s]\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f" % (emoNames[emo],
                                 np.mean(r2), np.std(r2)/np.sqrt(n_splits),
                                 np.mean(prs), np.std(prs)/np.sqrt(n_splits),
                                 np.mean(spr), np.std(spr)/np.sqrt(n_splits)
                                          ))


		R2	sd	pearson	sd	spearman
[confident]	-0.62	0.09	0.44	0.02	0.43	0.01
[ excited]	-0.65	0.04	0.52	0.01	0.52	0.00
[   happy]	-0.14	0.05	0.62	0.02	0.62	0.02
[surprised]	-1.68	0.09	0.29	0.01	0.28	0.02
An earlier run, for comparison:

		R2	sd	pearson	sd	spearman
[confident]	-0.60	0.10	0.46	0.01	0.44	0.01
[ excited]	-0.47	0.03	0.56	0.01	0.55	0.01
[   happy]	-0.10	0.02	0.63	0.01	0.63	0.01
[surprised]	-1.60	0.09	0.24	0.01	0.24	0.01

In [176]:
p._pool[0].is_alive()


Out[176]:
False

Training

Train / test split


In [336]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print('shapes of training data and label tensors:', X_train.shape, y_train.shape)
print('shapes of testing data and label tensors:', X_test.shape, y_test.shape)


shapes of training data and label tensors: (1615, 65) (1615, 4)
shapes of testing data and label tensors: (404, 65) (404, 4)

In [21]:
# set(y for y in y_train.flat)

In [26]:
# kfold = StratifiedKFold(n_folds=5, shuffle=True, random_state=seed,y=y)

Custom Metrics


In [337]:
import keras.backend as K

def matthews_correlation(y_true, y_pred):
    # Binarise predictions and targets, then compute the Matthews
    # correlation coefficient from the confusion-matrix counts.
    y_pred_pos = K.round(K.clip(y_pred, 0, 1))
    y_pred_neg = 1 - y_pred_pos

    y_pos = K.round(K.clip(y_true, 0, 1))
    y_neg = 1 - y_pos

    tp = K.sum(y_pos * y_pred_pos)
    tn = K.sum(y_neg * y_pred_neg)

    fp = K.sum(y_neg * y_pred_pos)
    fn = K.sum(y_pos * y_pred_neg)

    numerator = (tp * tn - fp * fn)
    denominator = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    # epsilon guards against division by zero when any count is empty
    return numerator / (denominator + K.epsilon())
activations = ['elu','softplus','softsign','relu','tanh','sigmoid','hard_sigmoid','linear','softmax']
losses = ['mse','mae']
optimizers = ['sgd', 'rmsprop', 'adagrad', 'adadelta', 'adam', 'nadam']
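As a quick sanity check that the backend metric above matches the textbook definition, it can be compared with scikit-learn's matthews_corrcoef on a handful of made-up binary labels (a sketch; the labels are illustrative only):

import numpy as np
from sklearn.metrics import matthews_corrcoef

y_true = np.array([1, 1, 0, 0, 1, 0], dtype='float32')
y_pred = np.array([1, 0, 0, 0, 1, 1], dtype='float32')

# evaluate the symbolic metric and compare with the reference implementation
mcc_backend = K.eval(matthews_correlation(K.variable(y_true), K.variable(y_pred)))
print(mcc_backend, matthews_corrcoef(y_true, y_pred))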

In [338]:
hidden_dims1 = 50
hidden_dims2 = 25
hidden_dims3 = 4

model = Sequential()
model.add(Embedding(len(Indices)+1, EMBEDDINGS_DIM, weights=[embedding_matrix],
                    input_length=maxlen, trainable=False))
model.add(Bidirectional(LSTM(EMBEDDINGS_DIM)))

model.add(Dropout(0.2))  # dropout acts as regularisation
# b_regularizer penalises the bias terms; the cross-validation model above
# used W_regularizer on the weights instead
model.add(Dense(hidden_dims1, b_regularizer=l2(0.01)))
model.add(Dense(hidden_dims2, b_regularizer=l2(0.01)))
model.add(Dense(hidden_dims3, activation='softsign'))

model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['accuracy', matthews_correlation])

In [339]:
nb_epoch = 20
model.fit(X, y, batch_size=batch_size, nb_epoch=nb_epoch, validation_split=None)


Epoch 1/20
2019/2019 [==============================] - 21s - loss: 0.1428 - acc: 0.4210 - matthews_correlation: 0.2000    
Epoch 2/20
2019/2019 [==============================] - 20s - loss: 0.1125 - acc: 0.4631 - matthews_correlation: 0.3472    
Epoch 3/20
2019/2019 [==============================] - 20s - loss: 0.1072 - acc: 0.4715 - matthews_correlation: 0.3852    
Epoch 4/20
2019/2019 [==============================] - 20s - loss: 0.1011 - acc: 0.5057 - matthews_correlation: 0.4427    
Epoch 5/20
2019/2019 [==============================] - 20s - loss: 0.0968 - acc: 0.4943 - matthews_correlation: 0.4517    
Epoch 6/20
2019/2019 [==============================] - 20s - loss: 0.0961 - acc: 0.5097 - matthews_correlation: 0.4445    
Epoch 7/20
2019/2019 [==============================] - 20s - loss: 0.0930 - acc: 0.5324 - matthews_correlation: 0.4795    
Epoch 8/20
2019/2019 [==============================] - 20s - loss: 0.0897 - acc: 0.5379 - matthews_correlation: 0.4953    
Epoch 9/20
2019/2019 [==============================] - 20s - loss: 0.0898 - acc: 0.5364 - matthews_correlation: 0.4915    
Epoch 10/20
2019/2019 [==============================] - 20s - loss: 0.0863 - acc: 0.5632 - matthews_correlation: 0.5215    
Epoch 11/20
2019/2019 [==============================] - 19s - loss: 0.0851 - acc: 0.5681 - matthews_correlation: 0.5196    
Epoch 12/20
2019/2019 [==============================] - 19s - loss: 0.0824 - acc: 0.5750 - matthews_correlation: 0.5518    
Epoch 13/20
2019/2019 [==============================] - 19s - loss: 0.0812 - acc: 0.5765 - matthews_correlation: 0.5500    
Epoch 14/20
2019/2019 [==============================] - 20s - loss: 0.0794 - acc: 0.5721 - matthews_correlation: 0.5466    
Epoch 15/20
2019/2019 [==============================] - 19s - loss: 0.0789 - acc: 0.5849 - matthews_correlation: 0.5683    
Epoch 16/20
2019/2019 [==============================] - 20s - loss: 0.0776 - acc: 0.5785 - matthews_correlation: 0.5768    
Epoch 17/20
2019/2019 [==============================] - 20s - loss: 0.0763 - acc: 0.5889 - matthews_correlation: 0.5701    
Epoch 18/20
2019/2019 [==============================] - 20s - loss: 0.0752 - acc: 0.5884 - matthews_correlation: 0.5849    
Epoch 19/20
2019/2019 [==============================] - 20s - loss: 0.0725 - acc: 0.5884 - matthews_correlation: 0.6061    
Epoch 20/20
2019/2019 [==============================] - 20s - loss: 0.0732 - acc: 0.5874 - matthews_correlation: 0.6050    
Out[339]:
<keras.callbacks.History at 0x7fa71c40a978>
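As Out[339] shows, fit returns a History object; a minimal sketch of keeping it to inspect the loss curve, assuming the model and data above:

history = model.fit(X, y, batch_size=batch_size, nb_epoch=nb_epoch, validation_split=None)

# history.history maps each metric name to its per-epoch values
plt.plot(history.history['loss'])
plt.xlabel('epoch')
plt.ylabel('training MAE')
plt.show()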

In [325]:
from sklearn.metrics import r2_score, f1_score, classification_report
# from skll.metrics import pearson, spearman
from scipy.stats import pearsonr, spearmanr


print('\t\tR2\tpearson\tspearman')
y_test_predicts = []
for i in range(20):
    if i > 0: 
        model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=1, validation_split=None)
    y_test_pred = np.array(model.predict(X_test))
    # per-tweet scores across the four emotion dimensions, averaged over the test set
    print("%8s\t%.2f\t%.2f\t%.2f" % (i,
                                 np.mean([r2_score(y1, y2) for y1, y2 in zip(y_test, y_test_pred)]),                                 
                                 np.mean([pearsonr(y1, y2)[0] for y1, y2 in zip(y_test, y_test_pred)]),
                                 np.mean([spearmanr(y1, y2)[0] for y1, y2 in zip(y_test, y_test_pred)])
                                    ))    
    y_test_predicts.append(y_test_pred)


		R2	pearson	spearman
       0	-3.19	0.02	0.01
Epoch 1/1
1413/1413 [==============================] - 14s - loss: 0.4674 - acc: 0.4529 - matthews_correlation: 0.2539    
       1	-1.39	0.56	0.50
Epoch 1/1
1413/1413 [==============================] - 14s - loss: 0.4290 - acc: 0.4883 - matthews_correlation: 0.3599    
       2	-1.05	0.57	0.50
Epoch 1/1
1413/1413 [==============================] - 13s - loss: 0.4050 - acc: 0.4890 - matthews_correlation: 0.4193    
       3	-1.05	0.59	0.52
Epoch 1/1
1413/1413 [==============================] - 14s - loss: 0.3923 - acc: 0.4862 - matthews_correlation: 0.4674    
       4	-0.92	0.59	0.52
Epoch 1/1
1413/1413 [==============================] - 14s - loss: 0.3811 - acc: 0.5053 - matthews_correlation: 0.4823    
       5	-0.71	0.59	0.53
Epoch 1/1
1413/1413 [==============================] - 14s - loss: 0.3751 - acc: 0.5159 - matthews_correlation: 0.5019    
       6	-0.66	0.60	0.54
Epoch 1/1
1413/1413 [==============================] - 14s - loss: 0.3594 - acc: 0.5074 - matthews_correlation: 0.5338    
       7	-0.91	0.62	0.53
Epoch 1/1
1413/1413 [==============================] - 13s - loss: 0.3549 - acc: 0.5308 - matthews_correlation: 0.5395    
       8	-0.87	0.60	0.53
Epoch 1/1
1413/1413 [==============================] - 14s - loss: 0.3462 - acc: 0.5506 - matthews_correlation: 0.5621    
       9	-0.86	0.62	0.55
Epoch 1/1
1413/1413 [==============================] - 14s - loss: 0.3393 - acc: 0.5322 - matthews_correlation: 0.5777    
      10	-0.97	0.61	0.53
Epoch 1/1
1413/1413 [==============================] - 13s - loss: 0.3302 - acc: 0.5350 - matthews_correlation: 0.5844    
      11	-0.95	0.59	0.52
Epoch 1/1
1413/1413 [==============================] - 14s - loss: 0.3240 - acc: 0.5499 - matthews_correlation: 0.6001    
      12	-0.85	0.60	0.54
Epoch 1/1
1413/1413 [==============================] - 13s - loss: 0.3213 - acc: 0.5506 - matthews_correlation: 0.5933    
      13	-0.80	0.59	0.53
Epoch 1/1
1413/1413 [==============================] - 13s - loss: 0.3170 - acc: 0.5471 - matthews_correlation: 0.5945    
      14	-0.97	0.61	0.54
Epoch 1/1
1413/1413 [==============================] - 13s - loss: 0.3055 - acc: 0.5704 - matthews_correlation: 0.6265    
      15	-0.81	0.60	0.55
Epoch 1/1
1413/1413 [==============================] - 13s - loss: 0.3011 - acc: 0.5648 - matthews_correlation: 0.6319    
      16	-0.87	0.58	0.52
Epoch 1/1
1413/1413 [==============================] - 14s - loss: 0.2980 - acc: 0.5740 - matthews_correlation: 0.6389    
      17	-0.88	0.59	0.53
Epoch 1/1
1413/1413 [==============================] - 14s - loss: 0.2899 - acc: 0.5563 - matthews_correlation: 0.6577    
      18	-0.93	0.60	0.55
Epoch 1/1
1413/1413 [==============================] - 13s - loss: 0.2842 - acc: 0.5718 - matthews_correlation: 0.6652    
      19	-0.89	0.61	0.54
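The loop above refits one epoch at a time to watch the test-set scores evolve. The same effect, with automatic stopping once validation loss stops improving, can be had from Keras' EarlyStopping callback (a sketch; the patience value is an assumption):

from keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=3)
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=20,
          validation_data=(X_test, y_test), callbacks=[early_stop])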

In [2]:
# EMOTION is not defined in this four-dimensional setting, and the loop
# averages over all dimensions, so the header is the same as above.
print('\t\tR2\tpearson\tspearman')
for i, y__ in enumerate(y_test_predicts): 
    print("%8s\t%.2f\t%.2f\t%.2f" % (i,
                                 np.mean([r2_score(y1, y2) for y1, y2 in zip(y_test, y__)]),                                 
                                 np.mean([pearsonr(y1, y2)[0] for y1, y2 in zip(y_test, y__)]),
                                 np.mean([spearmanr(y1, y2)[0] for y1, y2 in zip(y_test, y__)])))

In [350]:
from sklearn.metrics import r2_score

# Note: the most recent fit (In [339]) trained on the full X, y, so X_test was
# seen during training; these per-emotion scores are therefore optimistic
# compared with the cross-validation results above.
y_test_predict = model.predict(X_test)

for i in [0, 1, 2, 3]:
    print("[%9s]\t%.2f" % (emoNames[i],
                           r2_score(
                                y_test[:, i], 
                                y_test_predict[:, i]
                                )))


[confident]	0.96
[  excited]	0.94
[    happy]	0.96
[surprised]	0.93

In [349]:
%pylab inline
import numpy as np
import matplotlib.pyplot as plt

dimension = 0

# order the test tweets by their actual score so the two curves are comparable
s = y_test[:, dimension]
order = sorted(range(len(s)), key=lambda k: s[k])

g1 = y_test[:, dimension]
g2 = y_test_predict[:, dimension]   

line_1, = plt.plot(np.array(g2)[order], 'g.', label='Prediction')
line_0, = plt.plot(np.array(g1)[order], 'r.', label='Actual')

plt.grid(True)
plt.legend(handles=[line_0, line_1],
           bbox_to_anchor=(1.02, .4, .65, .0), loc=3, ncol=1, mode="expand", borderaxespad=1.0)
plt.ylabel('dimension: ' + emoNames[dimension])
plt.xlabel('tweets')
plt.title("Model evaluation on the 5-point corpus")
plt.show()


Populating the interactive namespace from numpy and matplotlib
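The same actual-vs-predicted view can be drawn for all four dimensions at once; a sketch using subplots, assuming y_test, y_test_predict and emoNames from above:

fig, axes = plt.subplots(2, 2, figsize=(10, 8))
for dim, ax in enumerate(axes.flat):
    order = np.argsort(y_test[:, dim])          # sort tweets by actual score
    ax.plot(y_test_predict[order, dim], 'g.', label='Prediction')
    ax.plot(y_test[order, dim], 'r.', label='Actual')
    ax.set_title(emoNames[dim])
    ax.grid(True)
    ax.legend()
plt.tight_layout()
plt.show()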

Save Model and Weights


In [351]:
def _save_model_wassa(model, savePath, emo, modelName):
    # Serialise the architecture to JSON and the weights to HDF5.
    # With emo=None the files go under <savePath>/<modelName>.{json,h5};
    # otherwise an emotion suffix is appended to savePath.
    if emo is None:
        savePath = os.path.join(savePath, modelName)
    else:
        savePath = savePath + "." + emo
        
    model_json = model.to_json()
    with open(savePath + ".json", "w") as json_file:
        json_file.write(model_json)
        print("<%s.json> " % (savePath))
        
    model.save_weights(savePath + ".h5", overwrite=True)
    print("<%s.h5> " % (savePath))
    
savePath = "/home/vlaand/IpythonNotebooks/senpy-plugins-development/fivePointRegression/classifiers/LSTM/"


_save_model_wassa(model=model, savePath=savePath, emo=None, modelName="fivePointRegression")
_save_model_wassa(model=model, savePath='/home/vlaand/IpythonNotebooks/cf-5point-data/classifiers/LSTM', emo=None, modelName="fivePointRegression")


</home/vlaand/IpythonNotebooks/senpy-plugins-development/fivePointRegression/classifiers/LSTM/fivePointRegression.json> 
</home/vlaand/IpythonNotebooks/senpy-plugins-development/fivePointRegression/classifiers/LSTM/fivePointRegression.h5> 
</home/vlaand/IpythonNotebooks/cf-5point-data/classifiers/LSTM/fivePointRegression.json> 
</home/vlaand/IpythonNotebooks/cf-5point-data/classifiers/LSTM/fivePointRegression.h5> 

Load


In [ ]:
with open('fivePointRegression.json', 'r') as json_file:
    loaded_model_json = json_file.read()
    
loaded_model = model_from_json(loaded_model_json)

# load the weights saved by _save_model_wassa into the rebuilt model
# (file names match those written in the Save section above)
loaded_model.load_weights("fivePointRegression.h5")
# print("Loaded model from disk")