1 #include <Python.h>
2 #include <iostream>
3 #include "theano_mod_helper.h"
4 #include "cuda_ndarray.cuh"
5 //////////////////////
6 //// Support Code
7 //////////////////////
8
9
// Generated thunk state for a single "GpuElemwise" node (see the "Op class"
// comment inside run()). The struct holds an error list and two one-element
// storage lists; run() reads the CudaNdarray in storage_V3[0], aliases the
// output V1 to it (the op works in place on input 0), calls the kernel
// launcher and publishes the result into storage_V1[0].
//
// NOTE(review): run() calls callkernel_node_3a084fd032db206a67ca8121a24aa32e_0
// (listing line 299), but no declaration or definition of that function
// appears anywhere in this listing -- the "Support Code" section above the
// namespace is empty. This is the identifier nvcc reports as undefined.
10 namespace {
11 struct __struct_compiled_op_3a084fd032db206a67ca8121a24aa32e {
// List whose items 0..2 receive (type, value, traceback) when run() fails.
// Stored by init() without taking a reference, unlike the storage lists.
12 PyObject* __ERROR;
13
// Storage lists; only item 0 of each is accessed. init() takes a reference
// on both and cleanup() releases it.
14 PyObject* storage_V3;
15 PyObject* storage_V1;
16
17
// Zero-fills the whole object so every pointer member starts out as NULL.
18 __struct_compiled_op_3a084fd032db206a67ca8121a24aa32e() {
19 // This is only somewhat safe because we:
20 // 1) Are not a virtual class
21 // 2) Do not use any virtual classes in the members
22 // 3) Deal with mostly POD and pointers
23
24 // If this changes, we would have to revise this, but for
25 // now I am tired of chasing segfaults because
26 // initialization code had an error and some pointer has
27 // a junk value.
28 memset(this, 0, sizeof(*this));
29 }
// Releases the storage references through cleanup().
30 ~__struct_compiled_op_3a084fd032db206a67ca8121a24aa32e(void) {
31 cleanup();
32 }
33
// Records the error list and the two storage lists, taking a new reference
// on each storage list (NULL is tolerated by Py_XINCREF). Always returns 0.
34 int init(PyObject* __ERROR, PyObject* storage_V3, PyObject* storage_V1) {
35 Py_XINCREF(storage_V3);
36 Py_XINCREF(storage_V1);
37 this->storage_V3 = storage_V3;
38 this->storage_V1 = storage_V1;
39
40
41
42
43 this->__ERROR = __ERROR;
44 return 0;
45 }
// Drops the references taken in init(). The labels and __DUMMY_* locals are
// never used. The member pointers are not reset afterwards, so a second call
// would release the same references again.
46 void cleanup(void) {
47 __label_1:
48
49 double __DUMMY_1;
50 __label_3:
51
52 double __DUMMY_3;
53 __label_6:
54
55 double __DUMMY_6;
56
57 Py_XDECREF(this->storage_V3);
58 Py_XDECREF(this->storage_V1);
59 }
// Executes the node once.
// Returns 0 on success, or a non-zero code naming the stage that failed:
//   2 = extracting the output V1, 4 = extracting the input V3,
//   5 = the op body (shape checks or kernel launch).
// On failure the pending Python exception is moved into __ERROR[0..2].
// Cleanup is done by falling through / jumping to __label_5, __label_4 and
// __label_2 in that order, so each stage releases only what it acquired.
60 int run(void) {
61 int __failure = 0;
62
63 PyObject* py_V1;
64 CudaNdarray * V1;
65 PyObject* py_V3;
66 CudaNdarray * V3;
67 {
68
// ---- Stage 2: extract the current output object from storage_V1[0] ----
// PyList_GET_ITEM yields a borrowed reference; the INCREF makes it a local one.
69 py_V1 = PyList_GET_ITEM(storage_V1, 0);
70 {Py_XINCREF(py_V1);}
71
// None in the output slot means there is no previous output object.
72 if (py_V1 == Py_None)
73 {
74 V1 = NULL;
75 }
76 else
77 {
78
79 assert(py_V1->ob_refcnt >= 2); // There should be at least one ref from the container object,
80 // and one ref from the local scope.
81
82 if (CudaNdarray_Check(py_V1))
83 {
84 //fprintf(stderr, "c_extract CNDA object w refcnt %p %i\n", py_V1, (py_V1->ob_refcnt));
85 V1 = (CudaNdarray*)py_V1;
86 //std::cerr << "c_extract " << V1 << '\n';
87
88
// The output must be 2-dimensional.
89 if (V1->nd != 2)
90 {
91 PyErr_Format(PyExc_RuntimeError,
92 "c_extract: Some CudaNdarray has rank %i, it was supposed to have rank 2",
93 V1->nd);
94 V1 = NULL;
95 {
96 __failure = 2;
97 if (!PyErr_Occurred()) {
98 PyErr_SetString(PyExc_RuntimeError,
99 "Unexpected error in an Op's C code. "
100 "No Python exception was set.");
101 }
102 goto __label_2;};
103 }
104 //std::cerr << "c_extract " << V1 << " nd check passed\n";
105
106
107 assert(V1);
// Second reference, owned by the V1 pointer (py_V1 keeps its own).
108 Py_INCREF(py_V1);
109 }
// NOTE(review): this branch looks unreachable -- the enclosing else is only
// entered when py_V1 != Py_None.
110 else if (py_V1 == Py_None)
111 {
112 PyErr_SetString(PyExc_TypeError,
113 "expected a CudaNdarray, not None");
114 V1 = NULL;
115 {
116 __failure = 2;
117 if (!PyErr_Occurred()) {
118 PyErr_SetString(PyExc_RuntimeError,
119 "Unexpected error in an Op's C code. "
120 "No Python exception was set.");
121 }
122 goto __label_2;};
123 }
124 else
125 {
126 //fprintf(stderr, "FAILING c_extract CNDA object w refcnt %p %i\n", py_V1, (py_V1->ob_refcnt));
127 PyErr_SetString(PyExc_TypeError, "Argument not a CudaNdarray");
128 V1 = NULL;
129 {
130 __failure = 2;
131 if (!PyErr_Occurred()) {
132 PyErr_SetString(PyExc_RuntimeError,
133 "Unexpected error in an Op's C code. "
134 "No Python exception was set.");
135 }
136 goto __label_2;};
137 }
138 //std::cerr << "c_extract done " << V1 << '\n';
139
140
141 }
142
143 {
144
// ---- Stage 4: extract the input from storage_V3[0] ----
// Unlike V1, None is not accepted here: it ends up in the TypeError branch.
145 py_V3 = PyList_GET_ITEM(storage_V3, 0);
146 {Py_XINCREF(py_V3);}
147
148 assert(py_V3->ob_refcnt >= 2); // There should be at least one ref from the container object,
149 // and one ref from the local scope.
150
151 if (CudaNdarray_Check(py_V3))
152 {
153 //fprintf(stderr, "c_extract CNDA object w refcnt %p %i\n", py_V3, (py_V3->ob_refcnt));
154 V3 = (CudaNdarray*)py_V3;
155 //std::cerr << "c_extract " << V3 << '\n';
156
157
// The input must be 2-dimensional.
158 if (V3->nd != 2)
159 {
160 PyErr_Format(PyExc_RuntimeError,
161 "c_extract: Some CudaNdarray has rank %i, it was supposed to have rank 2",
162 V3->nd);
163 V3 = NULL;
164 {
165 __failure = 4;
166 if (!PyErr_Occurred()) {
167 PyErr_SetString(PyExc_RuntimeError,
168 "Unexpected error in an Op's C code. "
169 "No Python exception was set.");
170 }
171 goto __label_4;};
172 }
173 //std::cerr << "c_extract " << V3 << " nd check passed\n";
174
175
176 assert(V3);
// Second reference, owned by the V3 pointer; released after __label_4.
177 Py_INCREF(py_V3);
178 }
179 else if (py_V3 == Py_None)
180 {
181 PyErr_SetString(PyExc_TypeError,
182 "expected a CudaNdarray, not None");
183 V3 = NULL;
184 {
185 __failure = 4;
186 if (!PyErr_Occurred()) {
187 PyErr_SetString(PyExc_RuntimeError,
188 "Unexpected error in an Op's C code. "
189 "No Python exception was set.");
190 }
191 goto __label_4;};
192 }
193 else
194 {
195 //fprintf(stderr, "FAILING c_extract CNDA object w refcnt %p %i\n", py_V3, (py_V3->ob_refcnt));
196 PyErr_SetString(PyExc_TypeError, "Argument not a CudaNdarray");
197 V3 = NULL;
198 {
199 __failure = 4;
200 if (!PyErr_Occurred()) {
201 PyErr_SetString(PyExc_RuntimeError,
202 "Unexpected error in an Op's C code. "
203 "No Python exception was set.");
204 }
205 goto __label_4;};
206 }
207 //std::cerr << "c_extract done " << V3 << '\n';
208
209
// ---- Stage 5: the op body ----
210 {
211 // Op class GpuElemwise
212
213 //std::cerr << "C_CODE RoundHalfToEven START\n";
214 //standard elemwise size checks
215
216
// Output shape accumulator; 1 acts as "not set yet" and is replaced by the
// input's extent in the loop below.
217 int dims[2] = {1,1};
218
219
// Per-axis broadcast flags for the input; both are 0 (no broadcasting).
220 int broadcasts_V3[2] = {0, 0};
221
222
223 //std::cerr << "C_CODE RoundHalfToEven checking input V3\n";
// NOTE(review): the same rank condition was already enforced at listing
// line 158, so this check appears redundant.
224 if (2 != V3->nd)
225 {
226 PyErr_Format(PyExc_TypeError,
227 "need 2 dims, not %i", V3->nd);
228 {
229 __failure = 5;
230 if (!PyErr_Occurred()) {
231 PyErr_SetString(PyExc_RuntimeError,
232 "Unexpected error in an Op's C code. "
233 "No Python exception was set.");
234 }
235 goto __label_5;};
236 }
// Derive dims from V3 and verify V3 agrees with dims on every axis.
237 for (int i = 0; i< 2; ++i)
238 {
239 dims[i] = (dims[i] == 1) ? CudaNdarray_HOST_DIMS(V3)[i] : dims[i];
240 if ((!(broadcasts_V3[i] &&
241 CudaNdarray_HOST_DIMS(V3)[i] == 1)) &&
242 (dims[i] != CudaNdarray_HOST_DIMS(V3)[i]))
243 {
244 //std::cerr << "C_CODE RoundHalfToEven checking input V3 failed\n";
245 PyErr_Format(PyExc_ValueError,
246 "GpuElemwise. Input dimension mis-match. Input"
247 " 0 (indices start at 0) has shape[%i] == %i"
248 ", but the output's size on that axis is %i.",
249 i,
250 CudaNdarray_HOST_DIMS(V3)[i],
251 dims[i]
252 );
253 {
254 __failure = 5;
255 if (!PyErr_Occurred()) {
256 PyErr_SetString(PyExc_RuntimeError,
257 "Unexpected error in an Op's C code. "
258 "No Python exception was set.");
259 }
260 goto __label_5;};
261 }
262 }
263
264
// In-place operation: release whatever V1 referenced before, make V1 an
// alias of the input object and take a reference for that alias.
265 Py_XDECREF(V1);
266 V1 = V3;
267 Py_INCREF(V1);
// Verify the aliased output has the expected shape on every axis.
268 for (int i = 0; (i< 2) && (V1); ++i) {
269 if (dims[i] != CudaNdarray_HOST_DIMS(V1)[i])
270 {
271 PyErr_Format(PyExc_ValueError,
272 "GpuElemwise. Output dimension mis-match. Output"
273 " 0 (indices start at 0), working inplace"
274 " on input 0, has shape[%i] == %i"
275 ", but the output's size on that axis is %i.",
276 i,
277 CudaNdarray_HOST_DIMS(V1)[i],
278 dims[i]
279 );
280 Py_DECREF(V1);
281 V1 = NULL;
282 {
283 __failure = 5;
284 if (!PyErr_Occurred()) {
285 PyErr_SetString(PyExc_RuntimeError,
286 "Unexpected error in an Op's C code. "
287 "No Python exception was set.");
288 }
289 goto __label_5;};
290 }
291 }
292 //std::cerr << "ELEMWISE NEW V1 nd" << V1->nd << "\n";
293 //std::cerr << "ELEMWISE NEW V1 data" << V1->devdata << "\n";
294
295
296 {
297 //new block so that failure gotos don't skip over variable initialization
298 //std::cerr << "calling callkernel\n";
// Launch helper: a non-zero result is treated as failure.
// NOTE(review): this function is not declared or defined anywhere in the
// listing; this call is what nvcc rejects as an undefined identifier.
299 if (callkernel_node_3a084fd032db206a67ca8121a24aa32e_0(1, 0, dims
300
301
302 , CudaNdarray_DEV_DATA(V3), CudaNdarray_HOST_STRIDES(V3)
303
304
305 , CudaNdarray_DEV_DATA(V1), CudaNdarray_HOST_STRIDES(V1)
306
307
308 ))
309 {
310 // error
311
312
313 Py_DECREF(V1);
314 V1 = NULL;
315
316
317 {
318 __failure = 5;
319 if (!PyErr_Occurred()) {
320 PyErr_SetString(PyExc_RuntimeError,
321 "Unexpected error in an Op's C code. "
322 "No Python exception was set.");
323 }
324 goto __label_5;};
325 }
326 else // no error
327 {
328 }
329 }
330 //std::cerr << "C_CODE RoundHalfToEven END\n";
331
// Stage 5 has nothing of its own to release.
332 __label_5:
333
334 double __DUMMY_5;
335
336 }
// ---- Cleanup for stage 4: release both references held for the input ----
337 __label_4:
338
339 //std::cerr << "cleanup " << py_V3 << " " << V3 << "\n";
340 //fprintf(stderr, "c_cleanup CNDA py_object w refcnt %p %i\n", py_V3, (py_V3->ob_refcnt));
341 if (V3)
342 {
343 //fprintf(stderr, "c_cleanup CNDA cn_object w refcnt %p %i\n", V3, (V3->ob_refcnt));
344 Py_XDECREF(V3);
345 }
346 //std::cerr << "cleanup done" << py_V3 << "\n";
347
348 {Py_XDECREF(py_V3);}
349
350 double __DUMMY_4;
351
352 }
// ---- Sync + cleanup for stage 2 ----
353 __label_2:
354
// Only publish the output when every stage succeeded.
355 if (!__failure) {
356
357 //std::cerr << "sync\n";
358 if (NULL == V1) {
359 // failure: sync None to storage
360 Py_XDECREF(py_V1);
361 py_V1 = Py_None;
362 Py_INCREF(py_V1);
363 }
364 else
365 {
// Re-point py_V1 at the object V1 now refers to, if it changed.
366 if (py_V1 != (PyObject*)V1)
367 {
368 Py_XDECREF(py_V1);
369 py_V1 = (PyObject*)V1;
370 Py_INCREF(py_V1);
371 }
372 assert(py_V1->ob_refcnt);
373 }
374
// PyList_SET_ITEM steals a reference and does not release the old item, so
// an extra reference is taken for the list and the old item is released by hand.
375 PyObject* old = PyList_GET_ITEM(storage_V1, 0);
376 {Py_XINCREF(py_V1);}
377 PyList_SET_ITEM(storage_V1, 0, py_V1);
378 {Py_XDECREF(old);}
379 }
380
381 //std::cerr << "cleanup " << py_V1 << " " << V1 << "\n";
382 //fprintf(stderr, "c_cleanup CNDA py_object w refcnt %p %i\n", py_V1, (py_V1->ob_refcnt));
// Release the reference owned by V1, then the local one owned by py_V1.
383 if (V1)
384 {
385 //fprintf(stderr, "c_cleanup CNDA cn_object w refcnt %p %i\n", V1, (V1->ob_refcnt));
386 Py_XDECREF(V1);
387 }
388 //std::cerr << "cleanup done" << py_V1 << "\n";
389
390 {Py_XDECREF(py_V1);}
391
392 double __DUMMY_2;
393
394 }
395
396
397 if (__failure) {
398 // When there is a failure, this code puts the exception
399 // in __ERROR.
// PyErr_Fetch hands over ownership of the three objects (any of which may be
// NULL); NULLs are replaced by new references to None before being stored.
400 PyObject* err_type = NULL;
401 PyObject* err_msg = NULL;
402 PyObject* err_traceback = NULL;
403 PyErr_Fetch(&err_type, &err_msg, &err_traceback);
404 if (!err_type) {err_type = Py_None;Py_INCREF(Py_None);}
405 if (!err_msg) {err_msg = Py_None; Py_INCREF(Py_None);}
406 if (!err_traceback) {err_traceback = Py_None; Py_INCREF(Py_None);}
// Swap the new values into slots 0..2 and release the previous contents.
407 PyObject* old_err_type = PyList_GET_ITEM(__ERROR, 0);
408 PyObject* old_err_msg = PyList_GET_ITEM(__ERROR, 1);
409 PyObject* old_err_traceback = PyList_GET_ITEM(__ERROR, 2);
410 PyList_SET_ITEM(__ERROR, 0, err_type);
411 PyList_SET_ITEM(__ERROR, 1, err_msg);
412 PyList_SET_ITEM(__ERROR, 2, err_traceback);
413 {Py_XDECREF(old_err_type);}
414 {Py_XDECREF(old_err_msg);}
415 {Py_XDECREF(old_err_traceback);}
416 }
417 // The failure code is returned to index what code block failed.
418 return __failure;
419
420 }
421 };
422 }
423
424
// Free-function entry point for the thunk: forwards to self->run() and
// returns its failure code unchanged (0 means success). instantiate() stores
// the address of this function in the object it returns.
425 static int __struct_compiled_op_3a084fd032db206a67ca8121a24aa32e_executor(__struct_compiled_op_3a084fd032db206a67ca8121a24aa32e* self) {
426 return self->run();
427 }
428
// Destructor callback registered with the PyCObject created in instantiate().
// `executor` is unused; `self` is the heap-allocated struct, which is deleted
// here (its C++ destructor in turn calls cleanup()).
429 static void __struct_compiled_op_3a084fd032db206a67ca8121a24aa32e_destructor(void* executor, void* self) {
430 delete ((__struct_compiled_op_3a084fd032db206a67ca8121a24aa32e*)self);
431 }
432
433 //////////////////////
434 //// Functions
435 //////////////////////
// Python-callable factory: instantiate(error_list, storage_V3, storage_V1).
// Expects exactly three positional arguments, which are passed to init() in
// that order. Returns a PyCObject wrapping the executor function pointer, with
// the newly allocated struct as its description and the destructor callback
// registered to free it. Returns NULL with TypeError set on a wrong argument
// count.
436 static PyObject * instantiate(PyObject * self, PyObject *argtuple) {
437 assert(PyTuple_Check(argtuple));
438 if (3 != PyTuple_Size(argtuple)){
439 PyErr_Format(PyExc_TypeError, "Wrong number of arguments, expected 3, got %i", (int)PyTuple_Size(argtuple));
440 return NULL;
441 }
442 __struct_compiled_op_3a084fd032db206a67ca8121a24aa32e* struct_ptr = new __struct_compiled_op_3a084fd032db206a67ca8121a24aa32e();
// init() as listed always returns 0, so this failure branch is defensive.
443 if (struct_ptr->init( PyTuple_GET_ITEM(argtuple, 0),PyTuple_GET_ITEM(argtuple, 1),PyTuple_GET_ITEM(argtuple, 2) ) != 0) {
444 delete struct_ptr;
445 return NULL;
446 }
// NOTE(review): if PyCObject_FromVoidPtrAndDesc fails and returns NULL,
// struct_ptr is not deleted on this path.
447 PyObject* thunk = PyCObject_FromVoidPtrAndDesc((void*)(&__struct_compiled_op_3a084fd032db206a67ca8121a24aa32e_executor), struct_ptr, __struct_compiled_op_3a084fd032db206a67ca8121a24aa32e_destructor);
448 return thunk; }
449
450 //////////////////////
451 //// Module init
452 //////////////////////
// Method table for the extension module: exposes the single function
// "instantiate" (positional-arguments calling convention), followed by the
// all-NULL sentinel entry that terminates the table.
453 static PyMethodDef MyMethods[] = {
454 {"instantiate", instantiate, METH_VARARGS, "undocumented"} ,
455 {NULL, NULL, 0, NULL}
456 };
// Python 2 module initialisation hook: registers a module named
// "3a084fd032db206a67ca8121a24aa32e" with the MyMethods table. The value
// returned by Py_InitModule is deliberately discarded.
457 PyMODINIT_FUNC init3a084fd032db206a67ca8121a24aa32e(void){
458 (void) Py_InitModule("3a084fd032db206a67ca8121a24aa32e", MyMethods);
459 }
460
===============================
In file included from /home/ubuntu/anaconda2/include/python2.7/Python.h:8:0,
from mod.cu:1:
/home/ubuntu/anaconda2/include/python2.7/pyconfig.h:1193:0: warning: "_POSIX_C_SOURCE" redefined
#define _POSIX_C_SOURCE 200112L
^
In file included from /usr/local/cuda/include/host_config.h:173:0,
from /usr/local/cuda/include/cuda_runtime.h:78,
from <command-line>:0:
/usr/include/features.h:228:0: note: this is the location of the previous definition
# define _POSIX_C_SOURCE 200809L
^
In file included from /home/ubuntu/anaconda2/include/python2.7/Python.h:8:0,
from mod.cu:1:
/home/ubuntu/anaconda2/include/python2.7/pyconfig.h:1215:0: warning: "_XOPEN_SOURCE" redefined
#define _XOPEN_SOURCE 600
^
In file included from /usr/local/cuda/include/host_config.h:173:0,
from /usr/local/cuda/include/cuda_runtime.h:78,
from <command-line>:0:
/usr/include/features.h:169:0: note: this is the location of the previous definition
# define _XOPEN_SOURCE 700
^
mod.cu(299): error: identifier "callkernel_node_3a084fd032db206a67ca8121a24aa32e_0" is undefined
1 error detected in the compilation of "/tmp/tmpxft_00000896_00000000-9_mod.cpp1.ii".
['nvcc', '-shared', '-O3', '--maxrregcount=32', '-arch=sm_37', '-m64', '-Xcompiler', '-fno-math-errno,-Wno-unused-label,-Wno-unused-variable,-Wno-write-strings,-DCUDA_NDARRAY_CUH=c72d035fdf91890f3b36710688069b2e,-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION,-fPIC,-fvisibility=hidden', '-Xlinker', '-rpath,/home/ubuntu/.theano/compiledir_Linux-4.4--generic-x86_64-with-debian-stretch-sid-x86_64-2.7.12-64/cuda_ndarray', '-I/home/ubuntu/.theano/compiledir_Linux-4.4--generic-x86_64-with-debian-stretch-sid-x86_64-2.7.12-64/cuda_ndarray', '-I/usr/local/cuda/include', '-I/home/ubuntu/anaconda2/lib/python2.7/site-packages/numpy/core/include', '-I/home/ubuntu/anaconda2/include/python2.7', '-I/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/gof', '-I/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/sandbox/cuda', '-o', '/home/ubuntu/.theano/compiledir_Linux-4.4--generic-x86_64-with-debian-stretch-sid-x86_64-2.7.12-64/tmpjyNShN/3a084fd032db206a67ca8121a24aa32e.so', 'mod.cu', '-L/home/ubuntu/.theano/compiledir_Linux-4.4--generic-x86_64-with-debian-stretch-sid-x86_64-2.7.12-64/cuda_ndarray', '-L/home/ubuntu/anaconda2/lib', '-lcudart', '-lcublas', '-lcuda_ndarray', '-lpython2.7']
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-19-a16097444c46> in <module>()
----> 1 model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=2, batch_size=64)
/home/ubuntu/anaconda2/lib/python2.7/site-packages/keras/models.pyc in fit(self, x, y, batch_size, nb_epoch, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, **kwargs)
670 class_weight=class_weight,
671 sample_weight=sample_weight,
--> 672 initial_epoch=initial_epoch)
673
674 def evaluate(self, x, y, batch_size=32, verbose=1,
/home/ubuntu/anaconda2/lib/python2.7/site-packages/keras/engine/training.pyc in fit(self, x, y, batch_size, nb_epoch, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch)
1135 check_batch_axis=False,
1136 batch_size=batch_size)
-> 1137 self._make_test_function()
1138 val_f = self.test_function
1139 if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
/home/ubuntu/anaconda2/lib/python2.7/site-packages/keras/engine/training.pyc in _make_test_function(self)
780 [self.total_loss] + self.metrics_tensors,
781 updates=self.state_updates,
--> 782 **self._function_kwargs)
783
784 def _make_predict_function(self):
/home/ubuntu/anaconda2/lib/python2.7/site-packages/keras/backend/theano_backend.pyc in function(inputs, outputs, updates, **kwargs)
967 msg = 'Invalid argument "%s" passed to K.function' % key
968 raise ValueError(msg)
--> 969 return Function(inputs, outputs, updates=updates, **kwargs)
970
971
/home/ubuntu/anaconda2/lib/python2.7/site-packages/keras/backend/theano_backend.pyc in __init__(self, inputs, outputs, updates, **kwargs)
953 allow_input_downcast=True,
954 on_unused_input='ignore',
--> 955 **kwargs)
956
957 def __call__(self, inputs):
/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/compile/function.pyc in function(inputs, outputs, mode, updates, givens, no_default_updates, accept_inplace, name, rebuild_strict, allow_input_downcast, profile, on_unused_input)
318 on_unused_input=on_unused_input,
319 profile=profile,
--> 320 output_keys=output_keys)
321 # We need to add the flag check_aliased inputs if we have any mutable or
322 # borrowed used defined inputs
/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/compile/pfunc.pyc in pfunc(params, outputs, mode, updates, givens, no_default_updates, accept_inplace, name, rebuild_strict, allow_input_downcast, profile, on_unused_input, output_keys)
477 accept_inplace=accept_inplace, name=name,
478 profile=profile, on_unused_input=on_unused_input,
--> 479 output_keys=output_keys)
480
481
/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/compile/function_module.pyc in orig_function(inputs, outputs, mode, accept_inplace, name, profile, on_unused_input, output_keys)
1775 on_unused_input=on_unused_input,
1776 output_keys=output_keys).create(
-> 1777 defaults)
1778
1779 t2 = time.time()
/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/compile/function_module.pyc in create(self, input_storage, trustme, storage_map)
1639 theano.config.traceback.limit = 0
1640 _fn, _i, _o = self.linker.make_thunk(
-> 1641 input_storage=input_storage_lists, storage_map=storage_map)
1642 finally:
1643 theano.config.traceback.limit = limit_orig
/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/gof/link.pyc in make_thunk(self, input_storage, output_storage, storage_map)
688 return self.make_all(input_storage=input_storage,
689 output_storage=output_storage,
--> 690 storage_map=storage_map)[:3]
691
692 def make_all(self, input_storage, output_storage):
/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/gof/vm.pyc in make_all(self, profiler, input_storage, output_storage, storage_map)
1001 storage_map,
1002 compute_map,
-> 1003 no_recycling))
1004 if not hasattr(thunks[-1], 'lazy'):
1005 # We don't want all ops maker to think about lazy Ops.
/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/sandbox/cuda/__init__.pyc in make_thunk(self, node, storage_map, compute_map, no_recycling)
254 enable_cuda=False)
255 return super(GpuOp, self).make_thunk(node, storage_map,
--> 256 compute_map, no_recycling)
257
258 theano.compile.debugmode.default_make_thunk.append(
/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/gof/op.pyc in make_thunk(self, node, storage_map, compute_map, no_recycling)
968 try:
969 return self.make_c_thunk(node, storage_map, compute_map,
--> 970 no_recycling)
971 except (NotImplementedError, utils.MethodNotDefined):
972 logger.debug('Falling back on perform')
/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/gof/op.pyc in make_c_thunk(self, node, storage_map, compute_map, no_recycling)
877 logger.debug('Trying CLinker.make_thunk')
878 outputs = cl.make_thunk(input_storage=node_input_storage,
--> 879 output_storage=node_output_storage)
880 fill_storage, node_input_filters, node_output_filters = outputs
881
/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/gof/cc.pyc in make_thunk(self, input_storage, output_storage, storage_map, keep_lock)
1198 cthunk, in_storage, out_storage, error_storage = self.__compile__(
1199 input_storage, output_storage, storage_map,
-> 1200 keep_lock=keep_lock)
1201
1202 res = _CThunk(cthunk, init_tasks, tasks, error_storage)
/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/gof/cc.pyc in __compile__(self, input_storage, output_storage, storage_map, keep_lock)
1141 output_storage,
1142 storage_map,
-> 1143 keep_lock=keep_lock)
1144 return (thunk,
1145 [link.Container(input, storage) for input, storage in
/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/gof/cc.pyc in cthunk_factory(self, error_storage, in_storage, out_storage, storage_map, keep_lock)
1593 else:
1594 module = get_module_cache().module_from_key(
-> 1595 key=key, lnk=self, keep_lock=keep_lock)
1596
1597 vars = self.inputs + self.outputs + self.orphans
/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/gof/cmodule.pyc in module_from_key(self, key, lnk, keep_lock)
1140 try:
1141 location = dlimport_workdir(self.dirname)
-> 1142 module = lnk.compile_cmodule(location)
1143 name = module.__file__
1144 assert name.startswith(location)
/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/gof/cc.pyc in compile_cmodule(self, location)
1504 lib_dirs=self.lib_dirs(),
1505 libs=libs,
-> 1506 preargs=preargs)
1507 except Exception as e:
1508 e.args += (str(self.fgraph),)
/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/sandbox/cuda/nvcc_compiler.pyc in compile_str(module_name, src_code, location, include_dirs, lib_dirs, libs, preargs, rpaths, py_module, hide_symbols)
397 print(cmd)
398 raise Exception('nvcc return status', p.returncode,
--> 399 'for cmd', ' '.join(cmd))
400 elif config.cmodule.compilation_warning and nvcc_stdout:
401 print(nvcc_stdout)
Exception: ('The following error happened while compiling the node', GpuElemwise{RoundHalfToEven}[(0, 0)](GpuElemwise{Composite{scalar_sigmoid((i0 + i1))}}[(0, 0)].0), '\n', 'nvcc return status', 2, 'for cmd', 'nvcc -shared -O3 --maxrregcount=32 -arch=sm_37 -m64 -Xcompiler -fno-math-errno,-Wno-unused-label,-Wno-unused-variable,-Wno-write-strings,-DCUDA_NDARRAY_CUH=c72d035fdf91890f3b36710688069b2e,-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION,-fPIC,-fvisibility=hidden -Xlinker -rpath,/home/ubuntu/.theano/compiledir_Linux-4.4--generic-x86_64-with-debian-stretch-sid-x86_64-2.7.12-64/cuda_ndarray -I/home/ubuntu/.theano/compiledir_Linux-4.4--generic-x86_64-with-debian-stretch-sid-x86_64-2.7.12-64/cuda_ndarray -I/usr/local/cuda/include -I/home/ubuntu/anaconda2/lib/python2.7/site-packages/numpy/core/include -I/home/ubuntu/anaconda2/include/python2.7 -I/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/gof -I/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/sandbox/cuda -o /home/ubuntu/.theano/compiledir_Linux-4.4--generic-x86_64-with-debian-stretch-sid-x86_64-2.7.12-64/tmpjyNShN/3a084fd032db206a67ca8121a24aa32e.so mod.cu -L/home/ubuntu/.theano/compiledir_Linux-4.4--generic-x86_64-with-debian-stretch-sid-x86_64-2.7.12-64/cuda_ndarray -L/home/ubuntu/anaconda2/lib -lcudart -lcublas -lcuda_ndarray -lpython2.7', '[GpuElemwise{RoundHalfToEven}[(0, 0)](<CudaNdarrayType(float32, matrix)>)]')