import numpy as np
import pandas as pd
import csv
import math, itertools
import os
import subprocess

import matplotlib.pyplot as plt

from multiprocessing import Pool, Manager

from collections import Counter
from stop_words import get_stop_words
import natsort
from natsort import natsorted

from scipy import spatial
from scipy.stats import pearsonr, spearmanr

from sklearn.svm import SVR, LinearSVR
from sklearn.externals import joblib
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import r2_score, f1_score 
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.feature_extraction.text import CountVectorizer

from imblearn.over_sampling import RandomOverSampler

from keras.preprocessing import sequence
from keras.utils.np_utils import to_categorical
from keras.models import Sequential,load_model, model_from_json
from keras.layers import Dense, Activation, Embedding, Bidirectional, Dropout, LSTM
from keras.regularizers import l2
import keras.backend as K

from theano import function

import warnings

2598                 return -1;
2599         }
2600     }
2602     PyErr_Clear(); // clear PyNumber_Int error.
2604     if(!CudaNdarray_Check(o) || !CudaNdarray_Check(value))
2605     {
2606         PyErr_SetString(PyExc_TypeError,
2607           "CudaNdarray.__setitem__: left must be a CudaNdarrays and right"
2608           " must be a CudaNdarrays, an ndarray or a python scalar of value 0.");
2609         Py_XDECREF(new_value);
2610         return -1;
2611     }
2613     if (verbose)
2614         fprintf(stderr, "CudaNdarray_setitem dest and value are CudaNdarray\n");
2616     if (cnda_copy_structure_to_device(rval))
2617     {
2618         PyErr_SetString(PyExc_RuntimeError,
2619                 "CudaNdarray.__setitem__: syncing structure to device failed");
2620         Py_DECREF(rval);
2621         Py_XDECREF(new_value);
2623         if (verbose)
2624             fprintf(stderr, "CudaNdarray_setitem error end\n");
2625         return -1;
2626     }
2628     PyObject *baseSavedForComparison = rval->base;
2630     if (CudaNdarray_CopyFromCudaNdarray(rval, (CudaNdarray*)value, true))
2631     {
2632         Py_DECREF((PyObject*)rval);
2633         Py_XDECREF(new_value);
2635         if (verbose)
2636             fprintf(stderr, "CudaNdarray_setitem error end\n");
2637         return -1;
2638     }
2640     assert (rval->base == baseSavedForComparison);
2641     assert (rval->dev_structure_fresh);
2643     // Clean up locally-created references
2644     Py_DECREF(rval);
2645     Py_XDECREF(new_value);
2647     return 0;
2648 }
2651 PyMappingMethods CudaNdarrayMappingMethods = {
2652     CudaNdarray_len, //lenfunc mp_length;               __len__
2653     CudaNdarray_Subscript, //binaryfunc mp_subscript;   __getitem__
2654     CudaNdarray_setitem //objobjargproc mp_ass_subscript;                __setitem__
2655 };
2657 ////////////////////
2658 //
2659 ////////////////////
2661 static PyObject *
2662 CudaNdarray_get_shape(CudaNdarray *self, void *closure)
2663 {
2664     if (self->nd < 0)
2665     {
2666         PyErr_SetString(PyExc_ValueError, "CudaNdarray not initialized");
2667         return NULL;
2668     }
2669     PyObject * rval = PyTuple_New(self->nd);
2670     for (int i = 0; i < self->nd; ++i)
2671     {
2672         if (!rval || PyTuple_SetItem(rval, i, PyInt_FromLong(CudaNdarray_HOST_DIMS(self)[i])))
2673         {
2674             Py_XDECREF(rval);
2675             return NULL;
2676         }
2678     }
2679     return rval;
2680 }
2682 static int
2683 CudaNdarray_set_shape(CudaNdarray *self, PyObject *value, void *closure)
2684 {
2685     PyErr_SetString(PyExc_NotImplementedError, "TODO: call reshape");
2686     return -1;
2687 }
2689 static PyObject *
2690 CudaNdarray_get_strides(CudaNdarray *self, void *closure)
2691 {
2692     if (self->nd < 0)
2693     {
2694         PyErr_SetString(PyExc_ValueError, "CudaNdarray not initialized");
2695         return NULL;
2696     }
2697     PyObject * rval = PyTuple_New(self->nd);
2698     for (int i = 0; i < self->nd; ++i)
2699     {
2700         if (!rval || PyTuple_SetItem(rval, i, PyInt_FromLong(CudaNdarray_HOST_STRIDES(self)[i])))
2701         {
2702             Py_XDECREF(rval);
2703             return NULL;
2704         }
2706     }
2707     return rval;
2708 }
2710 static int
2711 CudaNdarray_set_strides(CudaNdarray *self, PyObject *value, void *closure)
2712 {
2713     //npy_intp newstrides_bytes[PyTuple_Size(value)];
2714     if (PyTuple_Check(value)){
2715         if (PyTuple_Size(value) != CudaNdarray_NDIM(self)){
2716             PyErr_SetString(PyExc_ValueError,
2717                             "The new strides tuple must have the same length"
2718                             " as the number of dimensions");
2719             return -1;
2720         }
2721     }else if (PyList_Check(value)){
2722         if (PyList_Size(value) != CudaNdarray_NDIM(self)){
2723             PyErr_SetString(PyExc_ValueError,
2724                             "The new strides list must have the same length"
2725                             " as the number of dimensions");
2726             return -1;
2727         }
2728     }else{
2729         PyErr_SetString(PyExc_ValueError,
2730                         "The new strides need to be encoded in a tuple or list");
2731         return -1;
2732     }
2733     npy_intp* newstrides = (npy_intp*) alloca(CudaNdarray_NDIM(self) * sizeof(npy_intp));
2734     if (PyTuple_Check(value)){
2735         for(int i=0; i < CudaNdarray_NDIM(self); i++){
2736             newstrides[i] = PyInt_AsLong(PyTuple_GetItem(value, Py_ssize_t(i)));
2737             //newstrides_bytes[i] = newstrides[i] * 4;
2738         }
2739     }else if (PyList_Check(value)){
2740         for(int i=0; i < CudaNdarray_NDIM(self); i++){
2741             newstrides[i] = PyInt_AsLong(PyList_GetItem(value, Py_ssize_t(i)));
2742             //newstrides_bytes[i] = newstrides[i] * 4;
2743         }
2744     }
2745     /*
2746     // Do not do this check, as ExtractDiag needs that, and NumPy does not seem
2747     // to do it.
2748     npy_intp dims[PyTuple_Size(value)];
2749     for(int i=0; i < CudaNdarray_NDIM(self); i++){
2750         dims[i] = CudaNdarray_HOST_DIMS(self)[i];
2751     }
2752     if (!PyArray_CheckStrides(4,
2753                               CudaNdarray_NDIM(self),
2754                               0, 0,
2755                               dims,
2756                               newstrides_bytes)){
2757         PyErr_SetString(PyExc_ValueError, "bad new strides");
2758         return -1;
2759         }
2760     */
2761     for(int i=0; i < CudaNdarray_NDIM(self); i++){
2762         CudaNdarray_set_stride(self, i, newstrides[i]);
2763     }
2764     return 0;
2765 }
2767 static PyObject *
2768 CudaNdarray_get_dev_data(CudaNdarray *self, void *closure)
2769 {
2770     float * p =  CudaNdarray_DEV_DATA(self);
2771     //printf("get_dev_data %p %li \n", p, (long int)p );
2772     return PyInt_FromSize_t((size_t) CudaNdarray_DEV_DATA(self));
2773 }
2775 static int
2776 CudaNdarray_set_dev_data(CudaNdarray *self, PyObject *value, void *closure)
2777 {
2778     Py_ssize_t newdevdata = PyInt_AsSsize_t(value);
2779     //printf("set_dev_data %p %li \n",(float*)newdevdata ,newdevdata);
2780     if (PyErr_Occurred())
2781     {
2782         return -1;
2783     }
2784     return  CudaNdarray_set_device_data(self, (float*)newdevdata, (CudaNdarray*)self->base);
2785 }
2787 static PyObject *
2788 CudaNdarray_get_dtype(CudaNdarray *self, void *closure)
2789 {
2790     return PyString_FromString("float32");
2791 }
2793 static PyObject *
2794 CudaNdarray_get_ndim(CudaNdarray *self, void *closure)
2795 {
2796     return PyInt_FromLong(self->nd);
2797 }
2799 static PyObject *
2800 CudaNdarray_get_base(CudaNdarray *self, void *closure)
2801 {
2802     PyObject * base = self->base;
2803     if (!base)
2804     {
2805         // We cannot return a NULL pointer, use None instead
2806         base = Py_None;
2807     }
2808     Py_INCREF(base);
2809     return base;
2810 }
2812 void put_in_dict(PyObject * dict, const char * key, int val)
2813 {
2814   PyObject * k = PyString_FromString(key);
2815   PyObject * v = PyInt_FromLong(val);
2816   PyDict_SetItem(dict, k, v);
2817   Py_DECREF(k);
2818   Py_DECREF(v);
2819 }
2821 PyObject *
2822 GetDeviceProperties(PyObject* _unused, PyObject* args)
2823 {
2824   int dev_id = -1;
2825   if (! PyArg_ParseTuple(args, "i", &dev_id))
2826     return NULL;
2827   cudaDeviceProp deviceProp;
2828   cudaGetDeviceProperties(&deviceProp, dev_id);
2830   PyObject * dict = PyDict_New();
2831   PyObject * str= PyString_FromString("name");
2832   PyObject * i = PyString_FromString(;
2833   PyDict_SetItem(dict, str, i);
2834   Py_DECREF(str);
2835   Py_DECREF(i);
2837   put_in_dict(dict, "major", deviceProp.major);
2838   put_in_dict(dict, "minor", deviceProp.minor);
2839 #if CUDART_VERSION >= 2020
2840   int driverVersion = 0, runtimeVersion = 0;
2841   cudaDriverGetVersion(&driverVersion);
2842   cudaRuntimeGetVersion(&runtimeVersion);
2843   put_in_dict(dict, "driverVersion", driverVersion);
2844   put_in_dict(dict, "runtimeVersion", runtimeVersion);
2845 #endif
2846 #if CUDART_VERSION >= 2000
2848   put_in_dict(dict, "multiProcessorCount", deviceProp.multiProcessorCount);
2849   //if ConvertSMVer2Cores is not defined in cuda_runtime_api.h, the run time is too old.
2850   int sm_cores = -1;
2851   if(deviceProp.major==1)
2852     sm_cores = 32;
2853   else if(deviceProp.major==2 && deviceProp.minor==0)
2854     sm_cores = 32;
2855   else if(deviceProp.major==2 && deviceProp.minor==1)
2856     sm_cores = 48;
2857   put_in_dict(dict, "coresCount", sm_cores * deviceProp.multiProcessorCount);
2858 #endif
2859   put_in_dict(dict, "totalConstMem", deviceProp.totalConstMem);
2860   put_in_dict(dict, "sharedMemPerBlock", deviceProp.sharedMemPerBlock);
2861   put_in_dict(dict, "regsPerBlock", deviceProp.regsPerBlock);
2862   put_in_dict(dict, "warpSize", deviceProp.warpSize);
2863   put_in_dict(dict, "maxThreadsPerBlock", deviceProp.maxThreadsPerBlock);
2864   put_in_dict(dict, "maxThreadsDim0", deviceProp.maxThreadsDim[0]);
2865   put_in_dict(dict, "maxThreadsDim1", deviceProp.maxThreadsDim[1]);
2866   put_in_dict(dict, "maxThreadsDim2", deviceProp.maxThreadsDim[2]);
2867   put_in_dict(dict, "maxGridSize0", deviceProp.maxGridSize[0]);
2868   put_in_dict(dict, "maxGridSize1", deviceProp.maxGridSize[1]);
2869   put_in_dict(dict, "maxGridSize2", deviceProp.maxGridSize[2]);
2870   put_in_dict(dict, "memPitch", deviceProp.memPitch);
2871   put_in_dict(dict, "textureAlignment", deviceProp.textureAlignment);
2872   put_in_dict(dict, "clockRate", deviceProp.clockRate);
2873 #if CUDART_VERSION >= 2000
2874   put_in_dict(dict, "deviceOverlap", deviceProp.deviceOverlap);
2875 #endif
2876 #if CUDART_VERSION >= 2020
2877   put_in_dict(dict, "kernelExecTimeoutEnabled", deviceProp.kernelExecTimeoutEnabled);
2878   put_in_dict(dict, "integrated", deviceProp.integrated);
2879   put_in_dict(dict, "canMapHostMemory", deviceProp.canMapHostMemory);
2880   put_in_dict(dict, "computeMode", deviceProp.computeMode);
2881   //in the doc of this fct tell that 0 - Normal mode, 1 - only 1 context, 2 - no context
2882 #endif
2883 #if CUDART_VERSION >= 3000
2884   put_in_dict(dict, "concurrentKernels", deviceProp.concurrentKernels);
2885 #endif
2886 #if CUDART_VERSION >= 3010
2887   put_in_dict(dict, "ECCEnabled", deviceProp.ECCEnabled);
2888 #endif
2889 #if CUDART_VERSION >= 3020
2890   put_in_dict(dict, "tccDriver", deviceProp.tccDriver);
2891 #endif
2893   return dict;
2894 }
2896 /*
2897  * Returns in *free and *total respectively, the free and total amount of memory available for allocation by the device in bytes.
2898  */
2899 PyObject *
2900 GetDeviceMemInfo(PyObject* _unused, PyObject* dummy)
2901 {
2902     size_t free = 0, total = 0;
2903     if(g_gpu_context_active == 0){
2904         PyErr_Format(PyExc_RuntimeError, "No gpu device selected yet. Please make sure the gpu device was initialized by Theano before.");
2905         return NULL;
2906     }
2908     cudaError_t err = cudaMemGetInfo(&free, &total);
2909     if (err != cudaSuccess){
2910         // Clear the error flag, cudaMemGetInfo doesn't do it.
2911         // Currently this returns the same thing as err, but if in future
2912         // it returns something else I still don't see why we should ignore
2913         // it.  All we want to do here is reset the flag.
2914         cudaGetLastError();
2915         PyErr_Format(PyExc_RuntimeError,
2916                      "Error while getting memory info about the gpu: %s",
2917                      cudaGetErrorString(err));
2918         return NULL;
2919     }
2920     return PyTuple_Pack(2, PyLong_FromSize_t(free), PyLong_FromSize_t(total));
2921 }
2923 /*
2924  * Synchronize with all the gpu device stream.
2925  */
2926 PyObject *
2927 CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy)
2928 {
2930     cudaThreadSynchronize();
2932     Py_INCREF(Py_None);
2933     return Py_None;
2934 }
2936 /*
2937  * Exist and return true if we link with cublas v2.
2938  */
2939 PyObject *
2940 CudaNdarray_cublasv2(PyObject* _unused, PyObject* dummy)
2941 {
2942     Py_INCREF(Py_True);
2943     return Py_True;
2944 }
2946 PyObject *
2947 CudaNdarray_select_a_gpu(PyObject* _unused, PyObject* dummy)
2948 {
2949     void * rval = NULL;
2950     cudaError_t err;
2951     int num_gpus = 0;
2953     err = cudaGetDeviceCount(&num_gpus);
2954     if (cudaSuccess != err){
2955         printf("ERR!\\n");
2956             PyErr_Format(PyExc_RuntimeError,
2957                          "Not able to get number of GPUs (%s).",
2958                          cudaGetErrorString(err));
2959             return NULL;
2960     }
2962     for (int device = 0; device < num_gpus; device++) {
2963         cudaSetDevice(device);
2964         err = cudaDeviceSynchronize(); // << CUDA context gets created here.
2965         cudaGetLastError(); // reset the error state
2966         if (cudaSuccess == err)
2967             break;
2968     }
2970     if (cudaSuccess != err){
2971             printf("ERR!\\n");
2972                 PyErr_Format(PyExc_RuntimeError,
2973                              "Not able to select available GPU from %d cards (%s).",
2974                              num_gpus, cudaGetErrorString(err));
2975                 return NULL;
2976     }
2978     Py_INCREF(Py_None);
2979     return Py_None;
2980 }
2983 /*
2984  * Return the size in bytes that Theano currently have allocated on the gpu.
2985  */
2986 PyObject *
2987 GetTheanoAllocInfo(PyObject* _unused, PyObject* dummy)
2988 {
2989     PyObject* a = PyLong_FromSize_t(_allocated_size);
2990     PyObject* b = PyLong_FromSize_t(_max_allocated_size);
2992     PyObject* tuple = PyTuple_New(2);
2993     PyTuple_SetItem(tuple, 0, a);
2994     PyTuple_SetItem(tuple, 1, b);
2995     return tuple;
2996 }
2997 #endif
2999 static PyGetSetDef CudaNdarray_getset[] = {
3000     {"shape",
3001         (getter)CudaNdarray_get_shape,
3002         (setter)CudaNdarray_set_shape,
3003         "shape of this ndarray (tuple)",
3004         NULL},
3005     {"_strides",
3006         (getter)CudaNdarray_get_strides,
3007         (setter)CudaNdarray_set_strides,
3008         "data pointer strides (in elements)",
3009         NULL},
3010     {"strides",
3011         (getter)CudaNdarray_get_strides,
3012         (setter)CudaNdarray_set_strides,
3013         "data pointer strides (in elements)",
3014         NULL},
3015     //gpudata is needed to allow calling pycuda fct with CudaNdarray input.
3016     {"gpudata",
3017         (getter)CudaNdarray_get_dev_data,
3018         NULL,
3019         "device data pointer",
3020         NULL},
3021     {"_dev_data",
3022         (getter)CudaNdarray_get_dev_data,
3023         (setter)CudaNdarray_set_dev_data,
3024         "device data pointer",
3025         NULL},
3026     {"dtype",
3027         (getter)CudaNdarray_get_dtype,
3028         NULL,
3029         "The dtype of the element. Now always float32",
3030         NULL},
3031     {"size",
3032         (getter)CudaNdarray_SIZE_Object,
3033         NULL,
3034         "The number of elements in this object.",
3035         NULL},
3036     //mem_size is neede for pycuda.elementwise.ElementwiseKernel Why do they use size and mem_size of the same value?
3037     {"mem_size",
3038         (getter)CudaNdarray_SIZE_Object,
3039         NULL,
3040         "The number of elements in this object.",
3041         NULL},
3042     {"ndim",
3043         (getter)CudaNdarray_get_ndim,
3044         NULL,
3045         "The number of dimensions in this object.",
3046         NULL},
3047     {"base",
3048         (getter)CudaNdarray_get_base,
3049         NULL,
3050         "If this ndarray is a view, base is the original ndarray.",
3051         NULL},
3053     {NULL, NULL, NULL, NULL}  /* Sentinel */
3054 };
3056 PyObject *CudaNdarray_repr(PyObject *self)
3057 {
3058     CudaNdarray *object = (CudaNdarray *)self;
3059     PyObject * np_object = CudaNdarray_CreateArrayObj(object);
3060     PyObject * str = PyObject_Str((PyObject *) np_object);
3061     char * cstr = PyString_AsString(str);
3062     PyObject * out = PyString_FromFormat("%s%s%s",
3063                         "CudaNdarray(",
3064                         cstr,
3065                         ")");
3066     Py_DECREF(str);
3067     Py_DECREF(np_object);
3068     #if PY_MAJOR_VERSION >= 3
3069     // In Python 3 PyString_FromFormat return a Bytes object
3070     PyObject* out2 = PyObject_Str(out);
3071     Py_DECREF(out);
3072     return out2;
3073     #endif
3074     return out;
3075 }
3077 static PyTypeObject CudaNdarrayType =
3078 {
3079 #if PY_MAJOR_VERSION >= 3
3080     PyVarObject_HEAD_INIT(NULL, 0)
3081 #else
3082     PyObject_HEAD_INIT(NULL)
3083     0,                         /*ob_size*/
3084 #endif
3085     "CudaNdarray",             /*tp_name*/
3086     sizeof(CudaNdarray),       /*tp_basicsize*/
3087     0,                         /*tp_itemsize*/
3088     (destructor)CudaNdarray_dealloc, /*tp_dealloc*/
3089     0,                         /*tp_print*/
3090     0,                         /*tp_getattr*/
3091     0,                         /*tp_setattr*/
3092     0,                         /*tp_compare*/
3093     CudaNdarray_repr,          /*tp_repr*/
3094     &CudaNdarrayNumberMethods, /*tp_as_number*/
3095     0,                         /*tp_as_sequence*/
3096     &CudaNdarrayMappingMethods,/*tp_as_mapping*/
3097     0,                         /*tp_hash */
3098     0,                         /*tp_call*/
3099     0,                         /*tp_str*/
3100     0,                         /*tp_getattro*/
3101     0,                         /*tp_setattro*/
3102     0,                         /*tp_as_buffer*/
3103 #if PY_MAJOR_VERSION >= 3
3104     // Py_TPFLAGS_CHECKTYPES is always true and was removed in Python 3.
3105     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/
3106 #else
3108 #endif
3109     "CudaNdarray objects",     /* tp_doc */
3110     0,                         /* tp_traverse */
3111     0,                         /* tp_clear */
3112     0,                         /* tp_richcompare */
3113     0,                         /* tp_weaklistoffset */
3114     0,                         /* tp_iter */
3115     0,                         /* tp_iternext */
3116     CudaNdarray_methods,       /* tp_methods */
3117     CudaNdarray_members,       /* tp_members */
3118     CudaNdarray_getset,        /* tp_getset */
3119     0,                         /* tp_base */
3120     0,                         /* tp_dict */
3121     0,                         /* tp_descr_get */
3122     0,                         /* tp_descr_set */
3123     0,                         /* tp_dictoffset */
3124     (initproc)CudaNdarray_init,/* tp_init */
3125     0,                         /* tp_alloc */
3126     CudaNdarray_new,           /* tp_new */
3127 };
3129 static __global__ void get_gpu_ptr_size(int* dst)
3130 {
3131     dst[0] = sizeof(float*);
3132     dst[1] = sizeof(int);
3133 }
3135 PyObject *
3136 CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
3137 {
3138     int *gpu_data = (int*)device_malloc(sizeof(int)*2);
3139     if(gpu_data == NULL){
3140         return NULL;
3141     }
3142     get_gpu_ptr_size<<<1,1>>>(gpu_data);
3144     cudaError_t cudaErr = cudaGetLastError();
3145     if (cudaSuccess != cudaErr){
3147         device_free(gpu_data);
3148         return PyErr_Format(PyExc_RuntimeError,
3149                             "CudaNdarray_ptr_int_size: error when calling the gpu code. (%s)",
3150                             cudaGetErrorString(cudaErr));
3151     }
3153     // Transfer the result to cpu
3154     int gpu_sizes[] = {-1,-1};
3155     cublasStatus_t err;
3156     err = cublasGetVector(2, sizeof(int), gpu_data, 1, gpu_sizes, 1);
3157     device_free(gpu_data);
3159     if (CUBLAS_STATUS_SUCCESS != err){
3160         PyErr_SetString(PyExc_RuntimeError, "error copying data to from memory");
3161         return NULL;
3162     }
3163     return Py_BuildValue("iiii", (int) gpu_sizes[0], (int)sizeof(float*),
3164                          (int)sizeof(int), (int) gpu_sizes[1]);
3165 }
3167 static int cublas_init();
3168 static void cublas_shutdown();
3169 // Initialize the gpu.
3170 // Takes two optional parameters, the device number and if we should use cnmem.
3171 // If the device number is provided, it sets that device to be the active device.
3172 // If not provided (usually just to test whether the gpu is available at all),
3173 // it does not set an active device.
3174 // Raises EnvironmentError or ValueError (as appropriate) if the initialization failed.
3175 // cnmem is threaded like a bool. If converted to 0, don't use cnmem. Otherwise, use it.
3176 PyObject *
3177 CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
3178 {
3179     int card_nb = 0;
3180     int card_number_provided = 1;
3181     float cnmem = 0; // Theano flag lib.cnmem
3182     // if we're given something wildly invalid, this will throw a TypeError
3183     if(!PyArg_ParseTuple(args, "|if", &card_nb, &cnmem))
3184         return NULL;
3185     if(cnmem)
3186         g_use_cnmem = true;
3188     if(PyTuple_Size(args) == 0) {
3189         card_number_provided = 0;
3190         card_nb = 0;
3191     }
3193     int deviceCount;
3194     cudaError err = cudaGetDeviceCount(&deviceCount);
3195     if(cudaSuccess != err) {
3196         return PyErr_Format(PyExc_EnvironmentError,
3197                             "Unable to get the number of gpus available: %s",
3198                             cudaGetErrorString(cudaGetLastError()));
3199     }
3201     // as soon as the first successful call to a cuda* function is made, a
3202     // gpu context has been created
3203     g_gpu_context_active = 1;
3205     if(deviceCount <= 0) {
3206         return PyErr_Format(PyExc_EnvironmentError,
3207                             "Can't use the GPU, no devices support CUDA");
3208     }
3209     if(card_number_provided && (card_nb < 0 || card_nb > (deviceCount - 1))) {
3210         return PyErr_Format(PyExc_ValueError,
3211                             "Bad device number %d. Only %d devices available.",
3212                             card_nb,
3213                             deviceCount);
3214     }
3216     cudaDeviceProp deviceProp;
3217     err = cudaGetDeviceProperties(&deviceProp, card_nb);
3218     if(cudaSuccess != err) {
3219         return PyErr_Format(PyExc_EnvironmentError,
3220                             "Unable to get properties of gpu %i: %s",
3221                             card_nb,
3222                             cudaGetErrorString(cudaGetLastError()));
3223     }
3225     if(deviceProp.major == 9999 && deviceProp.minor == 9999 ){
3226         return PyErr_Format(PyExc_EnvironmentError,
3227                             "There is no device that supports CUDA");
3228     }
3230     if(card_number_provided) {
3231         err = cudaSetDevice(card_nb);
3232         if(cudaSuccess != err) {
3233             return PyErr_Format(PyExc_EnvironmentError,
3234                                 "Unable to set device %i: %s",
3235                                 card_nb,
3236                                 cudaGetErrorString(cudaGetLastError()));
3237         }
3238         if (cublas_init() == -1)
3239             return NULL;
3240     }
3241     if(card_number_provided && g_use_cnmem) {
3242         size_t mem = 0;
3243         if (cnmem > 1)
3244             mem = cnmem * 1024 * 1024;
3245         else{
3246             // Clip to 95% to let memory for the driver.
3247             // 98% didn't worked in some cases.
3248             if (cnmem > .95){
3249                 cnmem = .95;
3250             }
3251             size_t free = 0, total = 0;
3252             cudaError_t err = cudaMemGetInfo(&free, &total);
3253             if (err != cudaSuccess){
3254                 // Clear the error flag, cudaMemGetInfo doesn't do it.
3255                 // Currently this returns the same thing as err, but if in future
3256                 // it returns something else I still don't see why we should ignore
3257                 // it.  All we want to do here is reset the flag.
3258                 cudaGetLastError();
3259                 PyErr_Format(PyExc_RuntimeError,
3260                              "Error while getting memory info about the gpu: %s",
3261                              cudaGetErrorString(err));
3262                 return NULL;
3263             }
3264             mem = total * cnmem;
3265         }
3266         if(initCnmem(card_number_provided, card_nb, mem) == -1){
3267             return NULL;
3268         }
3269     }
3271     Py_INCREF(Py_None);
3272     return Py_None;
3273 }
3275 PyObject *
3276 CudaNdarray_active_device_number(PyObject* _unused, PyObject* _unused_args) {
3277     // NB: No cuda error checking here; keeps things simple, and it's not
3278     // really necessary.
3279     int currentDevice;
3280     cudaGetDevice(&currentDevice);
3281     return PyInt_FromLong(currentDevice);
3282 }
3284 PyObject *
3285 CudaNdarray_active_device_name(PyObject* _unused, PyObject* _unused_args) {
3286     // NB: No cuda error checking here; keeps things simple, and it's not
3287     // really necessary.
3288     int currentDevice;
3289     cudaGetDevice(&currentDevice);
3291     cudaDeviceProp deviceProp;
3292     cudaGetDeviceProperties(&deviceProp, currentDevice);
3293     return PyString_FromString(;
3294 }
3296 PyObject *
3297 CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
3298     // Don't handle errors here
3299     cublas_shutdown();
3300     g_gpu_context_active = 0; // context has now been closed down
3301     if(g_use_cnmem) {
3302         cnmemStatus_t status = cnmemFinalize();
3303         if(status != CNMEM_STATUS_SUCCESS) {
3304             fprintf(stderr, "CudaNdarray_gpu_shutdown: cnmemFinalize failed! Reason=%s\n",
3305                     cnmemGetErrorString(status));
3306             if(status == CNMEM_STATUS_CUDA_ERROR) {
3307                 fprintf(stderr, "  Cuda-Reason=%s\n",
3308                         cudaGetErrorString(cudaGetLastError()));
3309             }
3310         }
3311     }
3313     Py_INCREF(Py_None);
3314     return Py_None;
3315 }
3317 /*
3318  * This function is tested in theano/misc/
3319  */
3320 PyObject *
3321 CudaNdarray_from_gpu_pointer(PyObject* _unused, PyObject* args)
3322 {
3323     int verbose = 0;
3324     PyObject *gpu_ptr = NULL;
3325     PyObject *shapes = NULL;
3326     PyObject *strides = NULL;
3327     PyObject *base = NULL;
3328     PyObject *rval = NULL;
3330     //args should consist of 3 python objects
3331     //The first is the gpu ptr
3332     //The second if the shape
3333     //The third if the strides
3334     if (! PyArg_ParseTuple(args, "OOOO", &gpu_ptr, &shapes, &strides, &base))
3335         return NULL;
3337     if (verbose) printf("In CudaNdarray_from_gpu_pointer\n");
3338     if (!PyLong_Check(gpu_ptr))
3339     {
3340         PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: The gpu pointor is not an long");
3341         return NULL;
3342     }
3344     Py_ssize_t nd =  PyObject_Length(shapes);
3345     if (nd < 0)
3346     {
3347         PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Couldn't get length of second argument");
3348         return NULL;
3349     }
3350     Py_ssize_t nd_stride =  PyObject_Length(strides);
3351     if (nd_stride < 0)
3352     {
3353         PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Couldn't get length of third argument");
3354         return NULL;
3355     }
3357     if (nd != nd_stride)
3358     {
3359         PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: We need the same number of shapes and strides");
3360         return NULL;
3361     }
3363     rval = CudaNdarray_New();
3365     if (CudaNdarray_set_nd((CudaNdarray *)rval, nd))
3366     {
3367         //CudaNdarray_set_nd set the error msg
3368         return NULL;
3369     }
3370     // set gpu pointeur
3371     assert(((CudaNdarray *)rval)->data_allocated == 0);
3372     if (CudaNdarray_set_device_data((CudaNdarray *)rval, (float *)PyInt_AsLong(gpu_ptr), base))
3373     {
3374         PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Error while setting the gpu pointor");
3375         return NULL;
3377     }
3379     // Set dims and strides
3380     for (int i = nd-1; i >= 0; --i)
3381     {
3382         PyObject * idx = PyLong_FromLong(i);
3383         if (idx == NULL)
3384         {
3385             PyErr_SetString(PyExc_Exception, "CudaNdarray_from_gpu_pointer: Couldn't make long object to loop over list/tuple");
3386             return NULL;
3387         }
3388         PyObject* dim_ = PyObject_GetItem(shapes, idx);
3389         PyObject* strd_ = PyObject_GetItem(strides, idx);
3390         if (!PyInt_Check(dim_))
3391         {
3392             PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: shapes[%d] is not an int", i);
3393             return NULL;
3394         }
3395         if (!PyInt_Check(strd_))
3396         {
3397             PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: strides[%d] is not an int", i);
3398             return NULL;
3399         }
3400         int dim = PyInt_AsLong(dim_);
3401         int strd = PyInt_AsLong(strd_);
3402         CudaNdarray_set_stride((CudaNdarray *)rval, i, strd);
3403         CudaNdarray_set_dim((CudaNdarray *)rval, i, dim);
3404         Py_DECREF(idx);
3405         Py_DECREF(dim_);
3406         Py_DECREF(strd_);
3407     }
3408     if (verbose) printf("CudaNdarray_from_gpu_pointer normal return\n");
3409     return rval;
3410 }
3412 PyObject *
3413 CudaNdarray_Dot(PyObject* _unused, PyObject* args)
3414 {
3415     PyObject *l=NULL;
3416     PyObject *r=NULL;
3417     PyObject * rval = NULL;
3419     //args should consist of two python objects ("OO")
3420     if (! PyArg_ParseTuple(args, "OO", &l, &r))
3421         return NULL;
3423     if (!CudaNdarray_Check(l) || !CudaNdarray_Check(r))
3424     {
3425         PyErr_SetString(PyExc_TypeError, "CudaNdarray arguments required ");
3426         goto CudaNdarray_dot_fail;
3427     }
3428     if (((CudaNdarray*)l)->nd != 2)
3429     {
3430         PyErr_SetString(PyExc_TypeError, "need 2d CudaNdarray arg for now");
3431         goto CudaNdarray_dot_fail;
3432     }
3433     if (((CudaNdarray*)r)->nd != 2)
3434     {
3435         PyErr_SetString(PyExc_TypeError, "need 2d CudaNdarray arg for now");
3436         goto CudaNdarray_dot_fail;
3437     }
3438     rval = CudaNdarray_New();
3439     if (!rval)
3440     {
3441         goto CudaNdarray_dot_fail;
3442     }
3443     int dims[2];
3444     dims[0] = CudaNdarray_HOST_DIMS((CudaNdarray*)l)[0];
3445     dims[1] = CudaNdarray_HOST_DIMS((CudaNdarray*)r)[1];
3446     if (CudaNdarray_alloc_contiguous((CudaNdarray*)rval, 2, dims))
3447     {
3448         goto CudaNdarray_dot_fail;
3449     }
3450     if (CudaNdarray_gemm(1.0, (CudaNdarray*)l, (CudaNdarray*)r, 0.0, (CudaNdarray*)rval))
3451     {
3452         goto CudaNdarray_dot_fail;
3453     }
3455     return rval;
3457     CudaNdarray_dot_fail:
3458     Py_XDECREF(rval);
3459     return NULL;
3460 }
3462 static PyObject *
3463 filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, strict, storage)
3464 {
3465     /*
3466      * TODO: DOC what this function should do in the various cases of
3467      * What is 'strict' supposed to mean in the context of this function?
3468      * What do we do with input that could be interpreted as matching the broadcastable pattern in strict vs. non-strict cases?
3469      *
3470      */
3471     PyObject *py_data=NULL;
3472     PyArrayObject * data = NULL;
3473     int strict = 0;
3474     PyObject * broadcastable=NULL;
3475     PyObject * storage=NULL;
3476     CudaNdarray * rval=NULL;
3478     //Python object references which are provided to the caller are borrowed references
3479     if (!PyArg_ParseTuple(args, "OOiO", &py_data, &broadcastable, &strict, &storage)) return NULL;
3481     if (!PyTuple_Check(broadcastable)){
3482         PyErr_SetString(PyExc_TypeError, "broadcastable arg should be a tuple of int.");
3483         return NULL;
3484     }
3485     Py_INCREF(py_data);
3486     Py_INCREF(broadcastable);
3488     CudaNdarray * cnda = (CudaNdarray*)py_data;
3490     if (strict || CudaNdarray_Check(py_data))
3491     {
3492         //TODO: support non-strict "casting" from a vt to the broadcastable/type/size that we need.
3493         if (!CudaNdarray_Check(py_data))
3494         {
3495             Py_DECREF(py_data);
3496             Py_DECREF(broadcastable);
3497             PyErr_SetString(PyExc_TypeError, "strict mode requires CudaNdarray");
3498             return NULL;
3499         }
3500         if (cnda->nd != PyTuple_Size(broadcastable))
3501         {
3502             Py_DECREF(py_data);
3503             Py_DECREF(broadcastable);
3504             PyErr_Format(PyExc_TypeError, "Wrong rank: %i vs %li", cnda->nd, (long)PyTuple_Size(broadcastable));
3505             return NULL;
3506         }
3507         for (int i = 0; i < cnda->nd; ++i)
3508         {
3509             if ((CudaNdarray_HOST_DIMS(cnda)[i] > 1) && PyInt_AsLong(PyTuple_GetItem(broadcastable, Py_ssize_t(i))))
3510             {
3511                 PyErr_Format(PyExc_TypeError, "Non-unit size in broadcastable vt dimension %i", i);
3512                 Py_DECREF(py_data);
3513                 Py_DECREF(broadcastable);
3514                 return NULL;
3515             }else if (CudaNdarray_HOST_DIMS(cnda)[i] == 1 && CudaNdarray_HOST_STRIDES(cnda)[i] != 0){
3516                 PyErr_Format(PyExc_TypeError, "Non-zeros strides(%d) on dimension %d of size 1",
3517                              CudaNdarray_HOST_STRIDES(cnda)[i], i);
3518                 Py_DECREF(py_data);
3519                 Py_DECREF(broadcastable);
3520                 return NULL;
3521             }
3522         }
3523         Py_DECREF(broadcastable);
3524         return py_data;
3525     }
3526     else
3527     {
3528         data = (PyArrayObject*)PyArray_FromObject(py_data, REAL_TYPENUM, PyTuple_Size(broadcastable), PyTuple_Size(broadcastable));
3529         if (!data)
3530         {
3531             //err message already defined
3532             Py_DECREF(py_data);
3533             Py_DECREF(broadcastable);
3534             return NULL;
3535         }
3536         for (int i = 0; i < PyArray_NDIM(data); ++i)
3537         {
3538             if ((PyArray_DIMS(data)[i] > 1) && PyInt_AsLong(PyTuple_GetItem(broadcastable, Py_ssize_t(i))))
3539             {
3540                 PyErr_Format(PyExc_TypeError, "Non-unit size in broadcastable dimension %i", i);
3541                 Py_DECREF(data);
3542                 Py_DECREF(py_data);
3543                 Py_DECREF(broadcastable);
3544                 return NULL;
3545             }
3546         }
3547         if (storage && CudaNdarray_Check(storage))
3548         {
3549             rval = (CudaNdarray*) storage;
3550             Py_INCREF(rval);
3551         }
3552         else
3553         {
3554             rval = (CudaNdarray*) CudaNdarray_New();
3555         }
3556         if (rval)
3557         {
3558             if (CudaNdarray_CopyFromArray(rval, data))
3559             {
3560                 Py_DECREF(rval);
3561                 rval = NULL;
3562             }
3563         }
3564         Py_DECREF(data);
3565         Py_DECREF(py_data);
3566         Py_DECREF(broadcastable);
3567         return (PyObject*)rval;
3568     }
3569 }
3571 //TODO-- CudaNdarray_Dot and CudaNdarray_active_device_name are following different capitalization conventions.
3572 //       Pick one and standardize it, this file is already annoying enough to grep through
3573 static PyMethodDef module_methods[] = {
3574     {"dimshuffle", CudaNdarray_Dimshuffle, METH_VARARGS, "Returns the dimshuffle of a CudaNdarray."},
3575     {"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."},
3576     {"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Select the gpu card to use; also usable to test whether CUDA is available."},
3577     {"select_a_gpu", CudaNdarray_select_a_gpu, METH_NOARGS, "Call this method if you want to select a GPU before gpu_init call and let the driver choose the GPU."},
3578     {"active_device_name", CudaNdarray_active_device_name, METH_VARARGS, "Get the name of the active device."},
3579     {"active_device_number", CudaNdarray_active_device_number, METH_VARARGS, "Get the number of the active device."},
3580     {"gpu_shutdown", CudaNdarray_gpu_shutdown, METH_VARARGS, "Shut down the gpu."},
3581     {"device_properties", GetDeviceProperties, METH_VARARGS, "Return a dictionary with the device properties."},
3582     {"mem_info", GetDeviceMemInfo, METH_NOARGS, "Return a tuple with the free and total memory on the gpu in bytes."},
3584     {"theano_allocated", GetTheanoAllocInfo, METH_NOARGS, "Return the size in bytes of memory Theano currently have allocated on the gpu."},
3585 #endif
3586     {"ptr_int_size", CudaNdarray_ptr_int_size, METH_VARARGS, "Return a tuple with the size of gpu pointer, cpu pointer and int in bytes."},
3587     {"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable.  strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."},
3588     {"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"},
3589     {"from_gpu_pointer", CudaNdarray_from_gpu_pointer, METH_VARARGS, "Used to create a CudaNdarray from already allocated memory on the gpu.(example by pycuda)"},
3590     {"synchronize", CudaNdarray_synchronize, METH_NOARGS, "Used to synchronize the device"},
3591     {"cublas_v2", CudaNdarray_cublasv2, METH_NOARGS,
3592      "Used to know if this version of cuda_ndarray is linked with cublas v2."},
3593     {NULL, NULL, NULL, NULL}  /* Sentinel */
3594 };
3596 #define CNDA_MOD_NAME "cuda_ndarray"
3597 #define CNDA_DOCSTRING "CUDA implementation of a numpy ndarray-like object."
3599 #if PY_MAJOR_VERSION == 3
3600 static struct PyModuleDef cuda_ndarray_moduledef =
3601 {
3602     PyModuleDef_HEAD_INIT,
3603     CNDA_MOD_NAME,
3605     -1,     /* size of per-interpreter state of the module,
3606                or -1 if the module keeps state in global variables. */
3607     module_methods
3608 };
3611 PyInit_cuda_ndarray(void)
3612 #else
3614 initcuda_ndarray(void)
3615 #endif
3616 {
3617     import_array();
3619     PyObject* m;
3621     if (PyType_Ready(&CudaNdarrayType) < 0) {
3622 #if PY_MAJOR_VERSION == 3
3623         return NULL;
3624 #else
3625         return;
3626 #endif
3627     }
3629 #if PY_MAJOR_VERSION == 3
3630     m = PyModule_Create(&cuda_ndarray_moduledef);
3631 #else
3632     m = Py_InitModule3(CNDA_MOD_NAME, module_methods, CNDA_DOCSTRING);
3633 #endif
3635     if (m == NULL) {
3636 #if PY_MAJOR_VERSION == 3
3637         return NULL;
3638 #else
3639         return;
3640 #endif
3641     }
3643     Py_INCREF(&CudaNdarrayType);
3644     PyModule_AddObject(m, "CudaNdarray", (PyObject *)&CudaNdarrayType);
3646     for(int i=0;i<TABLE_SIZE;i++){
3647         _alloc_size_table[i].ptr=NULL;
3648         _alloc_size_table[i].size=0;
3649     }
3650 #endif
3651     //    cublasInit();
3652     //if (0&&CUBLAS_STATUS_SUCCESS != cublasGetError())
3653     //{
3654         //std::cerr << "WARNING: initcuda_ndarray: error initializing device\n";
3655     //}
3656     if (0) //TODO: is this necessary?
3657     {
3658         int deviceId = 0; // TODO: what number goes here?
3659         cudaSetDevice(deviceId);
3660         cudaError_t err = cudaGetLastError();
3661         if( cudaSuccess != err)
3662         {
3663             std::cerr << "Error in SetDevice:" << cudaGetErrorString(err) << "\n";
3664         }
3665     }
3667 #if PY_MAJOR_VERSION == 3
3668     return m;
3669 #endif
3670 }
3673 //////////////////////////////////////
3674 //
3675 // C API FOR CudaNdarray
3676 //
3677 //////////////////////////////////////
3679 int
3680 CudaNdarray_Check(const PyObject * ob)
3681 {
3682     //TODO: doesn't work with inheritance
3683     return CudaNdarray_CheckExact(ob);
3684 }
3685 int
3686 CudaNdarray_CheckExact(const PyObject * ob)
3687 {
3688     return ((Py_TYPE(ob) == &CudaNdarrayType) ? 1 : 0);
3689 }
3691 PyObject *
3692 CudaNdarray_New(int nd)
3693 {
3694     CudaNdarray *self = (CudaNdarray *)CudaNdarrayType.tp_alloc(&CudaNdarrayType, 0);
3695     if (self == NULL)
3696     {
3697         PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_New failed to allocate self");
3698         return NULL;
3699     }
3700     CudaNdarray_null_init(self);
3702     if (nd == 0)
3703     {
3704         self->nd = 0;
3705     }
3706     else if (nd > 0)
3707     {
3708         if (CudaNdarray_set_nd(self, nd))
3709         {
3710             Py_DECREF(self);
3711             return NULL;
3712         }
3713     }
3714     ++_outstanding_mallocs[1];
3715     return (PyObject *)self;
3716 }
3720 //////////////////////////////
3721 //
3722 // Published helper functions
3723 //
3724 //////////////////////////////
3726 static int
3727 cublas_init()
3728 {
3729     cublasStatus_t err;
3730     err = cublasCreate(&handle);
3731     if (CUBLAS_STATUS_SUCCESS != err)
3732     {
3733         if(CUBLAS_STATUS_NOT_INITIALIZED == err)
3734             PyErr_SetString(PyExc_RuntimeError,
3735                             "cublasCreate() returned this error "
3736                             "'the CUDA Runtime initialization failed'");
3737         else if(CUBLAS_STATUS_ALLOC_FAILED == err)
3738             PyErr_SetString(PyExc_RuntimeError,
3739                             "cublasCreate() returned this error "
3740                             "'the resources could not be allocated'");
3741         else
3742             PyErr_SetString(PyExc_RuntimeError,
3743                             "unknow error during returned by cublasCreate()");
3744         return -1;
3745     }
3746     // Set the default stream as the one to execute on (default)
3747     cublasSetStream(handle, NULL);
3748     // Pointer to scalars are on the host (also default)
3749     cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
3750 #if CUDA_VERSION >= 5000
3751     // atomics can be used in kernels to speed up operations (not default)
3752     // This may lead to a slight variance from run to run in some operations
3753     cublasSetAtomicsMode(handle, CUBLAS_ATOMICS_ALLOWED);
3754 #endif
3755     return 0;
3756 }
3758 static void
3759 cublas_shutdown()
3760 {
3761     if (handle != NULL)
3762         cublasDestroy(handle);
3763     // No point in handling any errors here
3764     handle = NULL;
3765 }
3767 int
3768 CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
3769 {
3770     int err = CudaNdarray_alloc_contiguous(self, PyArray_NDIM(obj),
3771                                            PyArray_DIMS(obj));
3772     if (err) {
3773         return err;
3774     }
3776     int typenum = PyArray_TYPE(obj);
3777     if (typenum != REAL_TYPENUM)
3778     {
3779         PyErr_SetString(PyExc_TypeError, "can only copy from float arrays");
3780         return -1;
3781     }
3782     assert( 4 ==  PyArray_ITEMSIZE(obj));
3783     PyArrayObject * py_src = (PyArrayObject *)PyArray_ContiguousFromAny(
3784         (PyObject*)obj, typenum, self->nd, self->nd);
3785     if (!py_src) {
3786         return -1;
3787     }
3788     npy_intp py_src_size = PyArray_SIZE(py_src);
3789     void *py_src_data = PyArray_DATA(py_src);
3790     cudaError_t cerr;
3792     cerr = cudaMemcpy(self->devdata, py_src_data,
3793                       py_src_size * sizeof(real),
3794                       cudaMemcpyHostToDevice);
3795     //CNDA_THREAD_SYNC;  // unneeded because cudaMemcpy is blocking anyway
3797     if (cudaSuccess != cerr)
3798     {
3799         PyErr_Format(PyExc_RuntimeError,
3800                      "Cuda error '%s' while copying %lli data element"
3801                      " to device memory. str ptr=%p. dst ptr=%p",
3802                      cudaGetErrorString(cerr),
3803                      (long long)py_src_size,
3804                      py_src_data,
3805                      self->devdata);
3806         Py_DECREF(py_src);
3807         return -1;
3808     }
3809     Py_DECREF(py_src);
3810     return 0;
3811 }
3813 PyObject *
3814 CudaNdarray_new_nd(int nd)
3815 {
3816     CudaNdarray * rval = (CudaNdarray*) CudaNdarray_New();
3817     if (!rval || CudaNdarray_set_nd(rval, nd))
3818     {
3819         Py_XDECREF(rval);
3820         rval = NULL;
3821     }
3822     return (PyObject *) rval;
3823 }
3826 /**
3827  * Initialize 'self' as a view of 'base', with memory storage 'data'
3828  */
3830 int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base)
3831 {
3832     if (self->data_allocated)
3833     {
3834         assert(self->devdata);
3835         if (device_free(self->devdata))
3836         {
3837             self->devdata = NULL;
3838             self->data_allocated = 0;
3839             return -1;
3840         }
3841     }
3842     // Get the original base object (base.base.base...)
3843     PyObject * orig_base = base;
3844     // base is not always a CudaNdarray. It can be a GpuArray from pycuda, ...
3845     while (orig_base && CudaNdarray_Check(orig_base) && ((CudaNdarray*) orig_base)->base)
3846     {
3847         // base_base is itself a view
3848         orig_base = ((CudaNdarray*) orig_base)->base;
3849     }
3850     //N.B. XDECREF and XINCREF are no-ops for NULL pointers
3851     if (self->base != orig_base)
3852     {
3853         Py_XDECREF(self->base);
3854         self->base = orig_base;
3855         Py_XINCREF(self->base);
3856     }
3857     self->data_allocated = 0;
3858     self->devdata = data;
3859     return 0;
3860 }
3862 static __global__ void k_copy_1d(const int N, const float * x, const int sx, float * y, const int sy)
3863 {
3864     for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += gridDim.x*blockDim.x)
3865     {
3866         y[i*sy] = x[i*sx];
3867     }
3868 }
3870 // N1 through N4 are the size of y
3871 static __global__ void k_copy_4d(const int N1,
3872         const int N2, const int N3, const int N4,
3873         const float * x, const int sx1, const int sx2, const int sx3,
3874         const int sx4,  float * y, const int sy1, const int sy2,
3875         const int sy3, const int sy4)
3876 {
3877     // These must be made int instead of unsigned int due to a bug in nvcc
3878     int bx = blockIdx.x;
3879     int by = blockIdx.y;
3881     for (int i = bx; i < N1; i += gridDim.x)
3882     {
3883         for (int j = by; j < N2; j += gridDim.y)
3884         {
3885             for (int k = threadIdx.x; k < N3; k += (int) blockDim.x)
3886             {
3887                 for (int l = threadIdx.y; l < N4; l += (int) blockDim.y)
3888                 {
3889                     y[i * sy1 + j * sy2 + k * sy3 + l * sy4] =
3890                         x[i * sx1 + j * sx2 + k * sx3 + l * sx4];
3891                 }
3892             }
3893         }
3894     }
3895 }
3897 //copy from other into self
3898 int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
3899                                     const CudaNdarray * other,
3900                                     bool unbroadcast)
3901 {
3902     int verbose = 0;
3903     if (verbose>1) fprintf(stderr, "CudaNdarray_CopyFromCudaNdarray\n");
3905     //standard elemwise size checks
3906     if (self->nd == -1)
3907     {
3908         PyErr_SetString(PyExc_TypeError,
3909                         "can't copy into un-initialized CudaNdarray");
3910         return -1;
3911     }
3912     CudaNdarray * new_other = NULL;
3914     if (self->nd < other->nd)
3915     {
3916         PyErr_Format(PyExc_NotImplementedError,
3917             "CudaNdarray_CopyFromCudaNdarray: The number of dimensions of the "
3918             "destination needs to be >= the number of dimensions of the "
3919             "source. Got %d and %d.", self->nd, other->nd);
3920         return -1;
3921     }
3922     else if (self->nd != other->nd)
3923     {
3924         new_other = (CudaNdarray *) CudaNdarray_View(other);
3925         int added_dims = self->nd - other->nd;
3926         int* pattern = (int*) alloca(self->nd * sizeof(int));
3927         for(int i = 0; i < added_dims; i++)
3928             pattern[i] = -1;
3929         for(int i = 0; i < other->nd; i++)
3930             pattern[i + added_dims] = i;
3931         CudaNdarray_dimshuffle(new_other, self->nd, pattern);
3932         other = new_other;
3933     }
3934     assert(self->nd == other->nd);
3935     //standard elemwise dim checks (also compute total size)
3936     unsigned int size = 1;
3937     unsigned int size_source = 1;
3938     for (int i = 0; i< self->nd; ++i)
3939     {
3940         if ((CudaNdarray_HOST_DIMS(self)[i] != CudaNdarray_HOST_DIMS(other)[i])
3941             && (1!=CudaNdarray_HOST_DIMS(other)[i] || !unbroadcast) )
3942         {
3943           PyErr_Format(PyExc_ValueError,
3944                        "CudaNdarray_CopyFromCudaNdarray:"
3945                        " need same dimensions for dim %d,"
3946                        " destination=%d, source=%d",
3947                        i, CudaNdarray_HOST_DIMS(self)[i],
3948                        CudaNdarray_HOST_DIMS(other)[i]);
3949           Py_XDECREF(new_other);
3950           return -1;
3951         }
3952         size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
3953         size_source *= (unsigned int) CudaNdarray_HOST_DIMS(other)[i];
3954     }
3955     if (0 == size)
3956     {
3957         Py_XDECREF(new_other);
3958         return 0; //nothing to copy, we're done.
3959     }
3960     if (CudaNdarray_is_c_contiguous(self) &&
3961         CudaNdarray_is_c_contiguous(other) &&
3962         size == size_source)
3963     {
3964         if (verbose)
3965             fprintf(stderr, "Copying contiguous vector with cublasScopy\n");
3967         cublasStatus_t err;
3968         err = cublasScopy(handle, size, CudaNdarray_DEV_DATA(other), 1,
3969                           CudaNdarray_DEV_DATA(self), 1);
3970         CNDA_THREAD_SYNC;
3971         Py_XDECREF(new_other);
3972         if (CUBLAS_STATUS_SUCCESS != err)
3973         {
3974             PyErr_SetString(PyExc_RuntimeError, "Error copying memory");
3975             return -1;
3976         }
3977         return 0;
3978     }
3979     //TODO: rewrite these copy operations to be more efficient
3980     //      See, for example the transpose example in the cuda_sdk.
3981     switch (self->nd)
3982     {
3983         case 0: // scalar
3984             {
3986                 assert(0);
3987             }; break;
3988         case 1: // vector
3989             {
3990                 if (verbose) fprintf(stderr, "Copying non-contiguous vector\n");
3991                 if (verbose) fprint_CudaNdarray(stderr, other);
3992                 unsigned int n_blocks = std::min(size,
3993                                                  (unsigned int)NUM_VECTOR_OP_BLOCKS);
3994                 unsigned int n_threads = std::min(ceil_intdiv(size, n_blocks),
3995                                                   (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
3996                 k_copy_1d<<<n_blocks, n_threads>>>(size,
3997                                             CudaNdarray_DEV_DATA(other),
3998                                             CudaNdarray_HOST_STRIDES(other)[0],
3999                                             CudaNdarray_DEV_DATA(self),
4000                                             CudaNdarray_HOST_STRIDES(self)[0]);
4001                 CNDA_THREAD_SYNC;
4002                 cudaError_t err = cudaGetLastError();
4003                 if( cudaSuccess != err)
4004                 {
4005                     PyErr_Format(PyExc_RuntimeError,
4006                                  "Cuda error: %s: %s. (n_blocks=%i,"
4007                                  " n_threads_per_block=%i)\n", "k_copy_1d",
4008                                  cudaGetErrorString(err), n_blocks, n_threads);
4009                     Py_XDECREF(new_other);
4010                     return -1;
4011                 }
4012             }; break;
4013         case 4: // 4-tensor
4014             {
4015                 if (verbose)
4016                 {
4017                     if (0 != fprint_CudaNdarray(stderr, other))
4018                     {
4019                         Py_XDECREF(new_other);
4020                         return -1;
4021                     }
4022                 }
4024                 // The blocks implement the looping over the first two axes so
4025                 // this needs to be (N1, N2)
4026                 dim3 n_blocks( std::min(CudaNdarray_HOST_DIMS(self)[0],
4027                                         NUM_VECTOR_OP_BLOCKS),
4028                                std::min(CudaNdarray_HOST_DIMS(self)[1],
4029                                         NUM_VECTOR_OP_BLOCKS));
4030                 // For the threads, just make as many as possible
4031                 dim3 n_threads( std::min( (unsigned int) CudaNdarray_HOST_DIMS(self)[2],
4032                                  (unsigned int) NUM_VECTOR_OP_THREADS_PER_BLOCK),
4033                                 std::min( (unsigned int) CudaNdarray_HOST_DIMS(self)[3],
4034                                     (unsigned int) NUM_VECTOR_OP_THREADS_PER_BLOCK));
4036                 n_threads.x = std::min( (unsigned int) 32, (unsigned int) n_threads.x);
4037                 n_threads.y = std::min( n_threads.y, NUM_VECTOR_OP_THREADS_PER_BLOCK / n_threads.x);
4039                 k_copy_4d<<<n_blocks, n_threads>>>(
4040                                             // size of y
4041                                             (unsigned int) CudaNdarray_HOST_DIMS(self)[0], // N1
4042                                             (unsigned int) CudaNdarray_HOST_DIMS(self)[1], // N2
4043                                             (unsigned int) CudaNdarray_HOST_DIMS(self)[2], // N3
4044                                             (unsigned int) CudaNdarray_HOST_DIMS(self)[3], // N4
4045                                             CudaNdarray_DEV_DATA(other), // x
4046                                             // x strides
4047                                             CudaNdarray_HOST_STRIDES(other)[0],
4048                                             CudaNdarray_HOST_STRIDES(other)[1],
4049                                             CudaNdarray_HOST_STRIDES(other)[2],
4050                                             CudaNdarray_HOST_STRIDES(other)[3],
4051                                             CudaNdarray_DEV_DATA(self), // y
4052                                             // y strides
4053                                             CudaNdarray_HOST_STRIDES(self)[0],
4054                                             CudaNdarray_HOST_STRIDES(self)[1],
4055                                             CudaNdarray_HOST_STRIDES(self)[2],
4056                                             CudaNdarray_HOST_STRIDES(self)[3]
4057                                             );
4058                 CNDA_THREAD_SYNC;
4059                 cudaError_t err = cudaGetLastError();
4060                 if( cudaSuccess != err)
4061                 {
4062                     PyErr_Format(PyExc_RuntimeError,
4063                                  "Cuda error: %s: %s.",
4064                                  "k_copy_4d",
4065                                  cudaGetErrorString(err));
4066                     Py_XDECREF(new_other);
4067                     return -1;
4068                 }
4069             }; break;
4070         default:
4071             {
4072                 cudaError_t err = cudaGetLastError();
4073                 if(cudaSuccess != err){
4074                     PyErr_Format(PyExc_RuntimeError,
4075                                  "Unexpected Cuda error: %s: %s\n",
4076                                  "CudaNdarray_CopyFromCudaNdarray",
4077                                  cudaGetErrorString(err));
4078                     Py_XDECREF(new_other);
4079                     return -1;
4080                 }
4082                 if (verbose)
4083                     fprintf(stderr,
4084                             "Copying with default version unbroadcast=%d\n",
4085                             unbroadcast);
4086                 // call worker routine
4087                 unsigned int threads_per_block = std::min(size,
4088                                                           (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
4089                 unsigned int n_blocks = std::min(ceil_intdiv(size, threads_per_block),
4090                                                  (unsigned int)NUM_VECTOR_OP_BLOCKS);
4091                 const CudaNdarray * cuda_dims = other;
4092                 if(unbroadcast)
4093                     cuda_dims = self;
4094                 //copy from other into self
4095                 k_elemwise_unary_rowmajor_copy<<<n_blocks, threads_per_block>>>(
4096                         size,
4097                         (unsigned int)other->nd,
In [2]:
moduleName = "fivePointRegression"
repositoryPath = '/home/vlaand/IpythonNotebooks/05_emotion_fivepoint_nuig'

directory = '/home/vlaand/IpythonNotebooks/05_emotion_fivepoint_nuig'

emoNames = ['confident','excited','happy', 'surprised']
# emoNames = ['sadness', 'disgust', 'surprise', 'anger', 'fear', 'joy']

In [3]:
from nltk.tokenize import TweetTokenizer
import nltk.tokenize.casual as casual
from drevicko.twitter_regexes import cleanString, setupRegexes, tweetPreprocessor
import preprocess_twitter

def preprocess_tweet(text):    
    text = casual.reduce_lengthening(text)
    text = cleanString(setupRegexes('twitterProAna'),text)   
    text = ' '.join([span for notentity,span in tweetPreprocessor(text, ("urls", "users", "lists")) if notentity])
    text = text.replace('\t','')
    text = text.replace('< ','<').replace(' >','>')
    text = text.replace('):', '<sadface>').replace('(:', '<smile>')
    text = text.replace(" 't", "t")#.replace('#','')
    return text

def tokenise_tweet(text):
    text = preprocess_twitter.tokenize(text)
    text = preprocess_tweet(text)     
    return ' '.join(text.split())

tokenise_tweet.regexes = setupRegexes('twitterProAna')

imported regex as re

In [4]:
from collections import defaultdict
import numpy as np
import pandas as pd
import csv

def _load_tweets(filename = "tweet.txt"):

    tweets_text = []

    for line in open(filename, 'rb'):

        line = line.decode("utf-8")
#         tw_text = ' '.join(tokenise_tweet(line))
        tw_text = tokenise_tweet(line.split())

def _load_labels(filename = "labels.txt"):
    labels = []
    for line in open(filename, 'rb'):
    header, labels = labels[0], labels[1:]

def _read_csv(filename = "data.csv", header=True):
    df = pd.read_csv(filepath_or_buffer=filename)    
    tweets_list = []
    labels_list = []
    conf_list = []
    for row in df.iterrows():
        labels_list.append([row[1][emo]for emo in emoNames])         
        #conf_list.append([row[1][emo+'_conf']for emo in emoNames])
    return tweets_list,labels_list

# tweets = _load_tweets(filename = "/home/vlaand/IpythonNotebooks/cf-5point-data/tweet")
# labels = _load_labels(filename = "/home/vlaand/IpythonNotebooks/cf-5point-data/labels-lowConfidence.csv")

tweets,labels = _read_csv(filename = "/home/vlaand/IpythonNotebooks/cf-5point-data/data-full5.csv",header=True)
# print(len(tweets), 'tweets,',len(labels),'labels')


In [7]:
from collections import Counter
from stop_words import get_stop_words

import os
from sklearn.externals import joblib

def ifExists(filename):
    dir = os.path.dirname(filename)
def checkFolder(filename):
    dir = os.path.dirname(filename)

def _get_unique_tokens(tweets):    
    return(Counter(token for tweet in tweets for token in tweet.split()))

def _save_unique_tokens(tokens, filename='wordFrequencies.dump'):
    _ = joblib.dump(tokens, filename=filename, compress=9)

def _plot_word_frequencies(wordFrequencies, WORD_FREQUENCY_TRESHOLD = 3):
    freqs = []
    for t,c in wordFrequencies.items():
    q = 0
    for t,c in wordFrequencies.items():
    print(q, len(wordFrequencies))
    %pylab inline

def _reduce_text(text, LANGUAGE='en', WORD_FREQUENCY_TRESHOLD = 3):    

    stop_words = get_stop_words(LANGUAGE)

    tweets_reduced = []

    for tw in tweets:
        tweet_r = []
        for token in tw.split():
            if(wordFrequencies[token] >= WORD_FREQUENCY_TRESHOLD):
                if(not token in stop_words):

        tweets_reduced.append( ' '.join(tweet_r)  )

wordFrequencies = _get_unique_tokens(tweets)
_save_unique_tokens(tokens=wordFrequencies,filename = '/home/vlaand/IpythonNotebooks/cf-5point-data/wordFrequencies.dump')
_plot_word_frequencies(wordFrequencies, WORD_FREQUENCY_TRESHOLD = WORD_FREQUENCY_TRESHOLD)
tweets_reduced = _reduce_text(tweets, WORD_FREQUENCY_TRESHOLD = WORD_FREQUENCY_TRESHOLD)

1854 5157
Populating the interactive namespace from numpy and matplotlib

In [8]:
wordFrequencies2 = _get_unique_tokens(tweets_reduced)
_plot_word_frequencies(wordFrequencies2, WORD_FREQUENCY_TRESHOLD = WORD_FREQUENCY_TRESHOLD)

1744 1744
Populating the interactive namespace from numpy and matplotlib


In [9]:
from sklearn.feature_extraction.text import CountVectorizer



Save ngramizer

In [10]:
def _save_ngramizer(filename = 'ngramizer.dump'):
    _ = joblib.dump(ngramizer, filename=filename, compress=9)
    print(filename+' saved')
vectorizer = CountVectorizer(ngram_range = (1,NGRAM_VALUE),token_pattern=r'\b\w+\b', min_df=WORD_FREQUENCY_TRESHOLD)
ngramizer =
vec = ngramizer.transform(tweets_reduced).toarray()
print(len(vec), len(vec[0]))
_save_ngramizer(filename = '/home/vlaand/IpythonNotebooks/cf-5point-data/5gramizer.dump')

2019 2429
/home/vlaand/IpythonNotebooks/cf-5point-data/5gramizer.dump saved

Load ngramizer

In [11]:
def _load_ngramizer(filename = 'ngramizer.dump'):
    ngramizer = joblib.load(filename = filename)
    print(filename+' loaded')
    return ngramizer

# ngramizer = _load_ngramizer('/home/vlaand/IpythonNotebooks/senpy-plugins-NUIG/hashTagClassification/ngramizers/'+str(NGRAM_VALUE)+'gramizer.dump')
# ngramizer = _load_ngramizer('/home/vlaand/IpythonNotebooks/senpy-plugins-NUIG/hashTagClassification/ngramizers/ngramizer.dump')

vec = ngramizer.transform(tweets_reduced).toarray()
print(len(vec), len(vec[0]))

2019 2429


In [12]:
from natsort import natsorted
train_data_features = vec#X_train_counts.toarray()
vocab = ngramizer.get_feature_names()
dist = np.sum(train_data_features, axis=0)
ngram_freq = {}

# For each, print the vocabulary word and the frequency
for tag, count in zip(vocab, dist):
    #print(tag, count)



In [13]:
def _read_csv_we(filename = "data.csv"):
    embedding_index = {}

    for row in pd.read_csv(filepath_or_buffer=filename, sep = ' ', header=None).iterrows():
        word, coefs = row[1][0], np.asarray(row[1][1:])
        embedding_index[word] = coefs
    print('we vectors loaded from <'+filename+'>')
    return embedding_index

def _load_original_vectors(filename = 'wordvectors-glove.twitter.27B.100d.txt', sep = ' ', wordFrequencies = None):

    Dictionary, Indices  = {},{}
    for line in open(filename, 'rb'): 
        values = line.decode('utf-8').split(sep)
        token = values[0]
        token_vector = np.array(values[1:], dtype = 'float32')   
            if(token in wordFrequencies):                
                Dictionary[token] = token_vector
            Dictionary[token] = token_vector
    print('we vectors loaded from <'+filename+'>')
    print('\t'+str(len(Dictionary))+' entries') 
    return(Dictionary, Indices)

def pretrainedEmbeddings(EmbeddingPath):
        embedding_index = {}
        with open(EmbeddingPath) as f:
            embedding_wordsList = []
            for line in f:
                values = line.split(" ")
                word = values[0]
                coefs = np.asarray(values[1:])
                embedding_index[word] = coefs
        return (embedding_index, embedding_wordsList)

def _texts_to_sequences(train_tweets):
    train_sequences = []
    for i,tweet in enumerate(train_tweets): 
        tw = []
        for token in tweet.split():
        tw.extend([0]*( maxlen-len(tw)) )
    return train_sequences

def _data_to_lists(dataTrain):    
    train_tweets, train_labels = [], []
    print('stacking data to lists')
    for i in dataTrain:
        scores = []
        for score in i[1:]:
            if np.isnan(score):
                scores.append( 0 )
                print('\tWarning: Nan value present in dataset')
    print('data stacked to lists\n\t'+str(len(train_tweets))+' tweets\n\t'+str(len(train_labels))+' labels')
    return train_tweets, train_labelsprint(len(Dictionary),'tokens in we')

def dataframe_to_lists(df):

    train_tweets, train_labels = [], []

    for row in df.iterrows():
    return train_tweets, train_labels

def lists_to_vectors(train_tweets, train_labels, embedding_matrix=None):

    train_sequences = _texts_to_sequences(train_tweets)
    if embedding_matrix==None:
        embedding_matrix = np.zeros((len(Indices)+1, EMBEDDINGS_DIM))

    print('matrix created\n\t',embedding_matrix.shape)
    for (word, i) in Indices.items():
        embedding_vector = Dictionary.get(word)
        if embedding_vector != None:
            embedding_matrix[i] = embedding_vector.astype(np.float)

    _X = sequence.pad_sequences(train_sequences, maxlen=maxlen)
    _y = np.array(train_labels)

    print(len(_X), 'train sequences loaded')
    print('\t',_X.shape,'\n\t', _y.shape)
    return _X, _y, embedding_matrix

def _get_maxlen(tweets):
    max = 0
    for tw in tweets:
        if len(tw.split()) > max:
            max = len(tw.split())
    return max

In [14]:

_path_wordembeddings = '/home/vlaand/data/Glove/glove.twitter.27B/glove.twitter.27B.'+str(EMBEDDINGS_DIM)+'d.txt'
# _path_wordembeddings = '/home/vlaand/data/Glove/glove.6B/glove.6B.100d.txt'

Dictionary, Indices = _load_original_vectors(
        filename = _path_wordembeddings, 
        sep = ' ',
        wordFrequencies = None)#wordFrequencies) # leave wordFrequencies=None for loading the entire WE file

Indices_reversed = {}
for key in Indices.keys():

we vectors loaded from </home/vlaand/data/Glove/glove.twitter.27B/glove.twitter.27B.100d.txt>
	1193514 entries

In [15]:
meltTweets = []

print('all tweets melted into list, ',len(meltTweets))
print("max tweet length: %d tokens" %(_get_maxlen(meltTweets)) )

def _get_unique_tokens(text):    
    return(Counter(token for sentence in text for token in sentence.split()) )

wordFrequencies = _get_unique_tokens(tweets) 
_plot_word_frequencies(wordFrequencies, WORD_FREQUENCY_TRESHOLD = WORD_FREQUENCY_TRESHOLD)

all tweets melted into list,  2019
max tweet length: 56 tokens
1854 5157
Populating the interactive namespace from numpy and matplotlib
/home/vlaand/anaconda3/lib/python3.6/site-packages/IPython/core/magics/ UserWarning: pylab import has clobbered these variables: ['dist']
`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"

In [16]:
def plotSentenceLength(sentences):
    values = sorted([len(x.split()) for x in sentences],reverse=True)
    print(min(values), max(values), mean(values), median(values))
    line_0, = plt.plot(values,  label='Sentence length curve')
    plt.ylabel('sentence length')
    plt.title("Sentence length distribution")

1 56 14.4928182268 13.0


In [17]:
def matthews_correlation(y_true, y_pred):
    y_pred_pos = K.round(K.clip(y_pred, 0, 1))
    y_pred_neg = 1 - y_pred_pos

    y_pos = K.round(K.clip(y_true, 0, 1))
    y_neg = 1 - y_pos

    tp = K.sum(y_pos * y_pred_pos)
    tn = K.sum(y_neg * y_pred_neg)

    fp = K.sum(y_neg * y_pred_pos)
    fn = K.sum(y_pos * y_pred_neg)

    numerator = (tp * tn - fp * fn)
    denominator = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    return numerator / (denominator + K.epsilon())

def pearson_score(y, y_pred):
    return pearsonr(y, y_pred)[0]

def spearman_score(y, y_pred):
    return spearmanr(y, y_pred)[0]

def regressionReport_lstm(trainedModel, X_test, y_test, print_architecture = False):  
    scores = []
    for emo in range(len(emoNames)):
        y_pred1 = trainedModel.predict(X_test) #np.asarray([y_[0] for y_ in trainedModel.predict(X_test)], dtype=float32)
        y_pred1 = np.asarray([y_[emo] for y_ in y_pred1])
        y_true1 = np.asarray([y_[emo] for y_ in y_test])
        r2,prs,spr = r2_score(y_true1, y_pred1), pearson_score(y_true1, y_pred1), spearman_score(y_true1, y_pred1)    

#         print("%s, %.4f, %.4f, %.4f" %  (emoNames[emo],r2,prs,spr))  
    if print_architecture:
    return scores

def printScores(scores):
    print("emotion, R2, pearson, spearman")
    for row in scores:
        print("%s, %.4f, %.4f, %.4f" %  (row[0],row[1],row[2],row[3],))

In [18]:
maxlen = 30

lstm_X, lstm_y, embedding_matrix = lists_to_vectors(tweets, labels)
lstm_y = np.asarray(labels)

matrix created
	 (1193515, 100)
/home/vlaand/anaconda3/lib/python3.6/site-packages/ FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
2019 train sequences loaded
	 (2019, 30) 
	 (2019, 4)

In [19]:
train, test, train, test = train_test_split(list(range(len(lstm_y))), list(range(len(lstm_y))), test_size=0.2, random_state=1337)

lstm_X_train, lstm_y_train = lstm_X[train], lstm_y[train]
lstm_X_test,  lstm_y_test  = lstm_X[test], lstm_y[test]

print('train data and label shape:', lstm_X_train.shape, lstm_y_train.shape)
print('test data and label shape:', lstm_X_test.shape, lstm_y_test.shape)

train data and label shape: (1615, 30) (1615, 4)
test data and label shape: (404, 30) (404, 4)

In [20]:
def oversampling(X, y):    
    ros = RandomOverSampler(ratio='auto')    
    y_fake = list(range(len(y)))
    X_resampled, y_resampled = ros.fit_sample(X, y_fake)
    y_resampled = np.asarray([y[_y] for _y in y_resampled])
    return X_resampled, y_resampled

# lstm_X_train_oversampled, lstm_y_train_oversampled = oversampling(X = lstm_X_train, y = lstm_y_train)#lstm_y_train)

In [ ]:
hidden_dims1 = 50
hidden_dims2 = 25
hidden_dims3 = 4

model = Sequential()
model.add(Embedding(len(Indices)+1,  EMBEDDINGS_DIM, weights=[embedding_matrix],
                            input_length=maxlen, trainable=False))
model.add(Bidirectional(LSTM(EMBEDDINGS_DIM))) #dropout is same as regularisation

model.add(Dense(hidden_dims1, b_regularizer=l2(0.01)),)
model.add(Dense(hidden_dims2, b_regularizer=l2(0.01)), ) 
model.add(Dense(hidden_dims3, activation='softsign'))

model.compile(loss='mean_absolute_error', optimizer='adam',metrics=['accuracy',matthews_correlation])

In [34]:

batch_size = 32

hidden_dims1 = 50
hidden_dims2 = 25
hidden_dims3 = 4
nb_epoch = 8

scores = []

lstmTrained = Sequential()
lstmTrained.add(Embedding(len(Indices)+1,  EMBEDDINGS_DIM, weights=[embedding_matrix],
                            input_length=maxlen, trainable=False))
lstmTrained.add(Bidirectional(LSTM(EMBEDDINGS_DIM, kernel_regularizer=l2(0.05))))
# lstmTrained.add(LSTM(EMBEDDINGS_DIM, kernel_regularizer=l2(0.05)))#, return_sequences=True, W_regularizer=l2(0.02))) 
lstmTrained.add(Dense(hidden_dims1, bias_regularizer=l2(0.01)), )
lstmTrained.add(Dense(hidden_dims2, bias_regularizer=l2(0.01)), ) 
lstmTrained.add(Dense(hidden_dims3, activation='elu'))
lstmTrained.compile(loss='mae', optimizer='adam', metrics=['accuracy'])#, matthews_correlation]), lstm_y_train, batch_size=batch_size, epochs = nb_epoch, validation_split=None)

Epoch 1/8
1615/1615 [==============================] - 2s - loss: 4.8652 - acc: 0.4012     
Epoch 2/8
1615/1615 [==============================] - 2s - loss: 1.1876 - acc: 0.4204     
Epoch 3/8
1615/1615 [==============================] - 2s - loss: 0.5224 - acc: 0.4613     
Epoch 4/8
1615/1615 [==============================] - 2s - loss: 0.4198 - acc: 0.4873     
Epoch 5/8
1615/1615 [==============================] - 2s - loss: 0.4058 - acc: 0.4910     
Epoch 6/8
1615/1615 [==============================] - 2s - loss: 0.3897 - acc: 0.4935     
Epoch 7/8
1615/1615 [==============================] - 2s - loss: 0.3802 - acc: 0.5263     
Epoch 8/8
1615/1615 [==============================] - 2s - loss: 0.3773 - acc: 0.5115     
<keras.callbacks.History at 0x7f69422f4400>

In [35]:
scores = regressionReport_lstm(trainedModel = lstmTrained,
                                        print_architecture = False)


emotion, R2, pearson, spearman
confident, 0.1748, 0.4283, 0.4163
excited, 0.3183, 0.5691, 0.5578
happy, 0.4485, 0.6698, 0.6327
surprised, 0.0549, 0.3024, 0.2975

In [36]:
# lstmTrained.to_json()

'{"class_name": "Sequential", "config": [{"class_name": "Embedding", "config": {"name": "embedding_6", "trainable": false, "batch_input_shape": [null, 30], "dtype": "float32", "input_dim": 1193515, "output_dim": 100, "embeddings_initializer": {"class_name": "RandomUniform", "config": {"minval": -0.05, "maxval": 0.05, "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": 30}}, {"class_name": "Bidirectional", "config": {"name": "bidirectional_6", "trainable": true, "layer": {"class_name": "LSTM", "config": {"name": "lstm_6", "trainable": true, "return_sequences": false, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "implementation": 0, "units": 100, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": {"class_name": "L1L2", "config": {"l1": 0.0, "l2": 0.05000000074505806}}, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0}}, "merge_mode": "concat"}}, {"class_name": "Dropout", "config": {"name": "dropout_6", "trainable": true, "rate": 0.2}}, {"class_name": "Dense", "config": {"name": "dense_16", "trainable": true, "units": 50, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": {"class_name": "L1L2", "config": {"l1": 0.0, "l2": 0.009999999776482582}}, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dense", "config": {"name": "dense_17", "trainable": true, "units": 25, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": {"class_name": "L1L2", "config": {"l1": 0.0, "l2": 0.009999999776482582}}, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dense", "config": {"name": "dense_18", "trainable": true, "units": 4, "activation": "elu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}], "keras_version": "2.0.8", "backend": "theano"}'

In [301]:

batch_size = 32

hidden_dims1 = 50
hidden_dims2 = 25
hidden_dims3 = 1
nb_epoch = 8

def foo():
    # print(emoNames[EMOTION])
    scores = []

    for fold in StratifiedKFold(n_folds = 5, shuffle=True, random_state=1337, y=lstm_y):
        train, test = fold

        lstmTrained = Sequential()
        lstmTrained.add(Embedding(len(Indices)+1,  EMBEDDINGS_DIM, weights=[embedding_matrix],
                                    input_length=maxlen, trainable=False))
        # lstmTrained.add(Bidirectional(LSTM(EMBEDDINGS_DIM)))
        lstmTrained.add(LSTM(EMBEDDINGS_DIM, activation='tanh', kernel_regularizer=l2(0.05)))#, return_sequences=True, W_regularizer=l2(0.02))) 
        # lstmTrained.add(Dropout(0.2))
        lstmTrained.add(Dense(hidden_dims1, bias_regularizer=l2(0.02)), )
        lstmTrained.add(Dense(hidden_dims2, bias_regularizer=l2(0.01)), ) 
        lstmTrained.add(Dense(hidden_dims3, activation='softsign'))
        # lstmTrained.add(Dense(hidden_dims3, activation='softmax'))
        lstmTrained.compile(loss='mean_absolute_error', optimizer='adam', metrics=['accuracy', matthews_correlation])[train], lstm_y_train_oversampled[train], batch_size=batch_size, epochs = nb_epoch, validation_split=None, verbose=False)

        scores.append(classificationReport_lstm(trainedModel = lstmTrained,
                                                print_architecture = False))

    print("r2,pearson,spearman\n%.5f,%.5f,%.5f" % (np.mean([s[0] for s in scores]),np.mean([s[1] for s in scores]),np.mean([s[2] for s in scores]) ))

0.7638, 0.8880, 0.8705
0.7916, 0.8929, 0.8770
0.7765, 0.8864, 0.8670
0.7736, 0.8828, 0.8660
0.7814, 0.8850, 0.8661


Vector transformations

In [83]:
def _vectors_similarity(v1 , v2):
    return( 1 - spatial.distance.cosine(v1,v2) )
def similarityVector(vector_, vectors_):
    resVector = np.asarray([_vectors_similarity(vector_ , v_) for v_ in vectors_])
    return np.asarray([np.max(resVector), np.mean(resVector), np.std(resVector), np.min(resVector)])
def compareTokenToSentence(leftToken, sentence):
    sentence_vectors = []
    for token in sentence:
        if token in Dictionary:
            token = token.replace('#','')
            if token in Dictionary:
    return similarityVector( Dictionary[leftToken], sentence_vectors)  
def capitalRatio(tweet):
        firstCap, allCap = 0, 0
        length = len(tweet)
        if length==0:
            return np.array([0,0])
        for i,token in enumerate(tweet.split()):
            if( token.istitle() ):
                firstCap += 1
            elif( token.isupper() ):
                allCap += 1
        return np.asarray([firstCap/length,allCap/length]) 
def tweetToWordVectors(dictionary, tweet, fixedLength=False):
    output = []    
        for i in range(MAX_SEQUENCE_LENGTH):
        for i,token in enumerate(tweet.split()):
            if token in Dictionary:
                output[i] = Dictionary[token]                
         for i,token in enumerate(tweet.lower().split()):
            if token in Dictionary:
            elif token.replace('#','') in Dictionary:
    return output
def ModWordVectors(x, mod=True):
    if(len(x) == 0):       
            return(np.zeros(EMBEDDINGS_DIM*3, dtype='float32'))
            return(np.zeros(EMBEDDINGS_DIM, dtype='float32'))        
    m =  np.matrix(x)
        xMean = np.array(m.mean(0))[0]
        xMin = np.array(m.min(0))[0]
        xMax = np.array(m.max(0))[0]
        xX = np.concatenate((xMean,xMin,xMax))
        return xX
        return np.array(m.mean(0))[0]
def bindTwoVectors(x0,x1):
    return np.array(list(itertools.chain(x0,x1)),dtype='float32') 
def _bind_vectors(x):
    return np.concatenate(x)   
def myLog10(vector):
    for i,v in enumerate(vector):
        if v > 0:
            vector[i] = np.log(v)
    return vector            
def _convert_text_to_vector(tweets,  Dictionary, labels, ngramizer, lstmLayer=None, emotion = None):
    _X = []
    _y = []
    vec = ngramizer.transform(tweets).toarray()
    for i, t in enumerate(tweets):
        if lstmLayer==None:
            embeddingsVector = ModWordVectors(tweetToWordVectors(Dictionary,tweets[i]))
            embeddingsVector_lstm = lstmLayer[i]
            embeddingsVector = ModWordVectors(tweetToWordVectors(Dictionary,tweets[i]))
#         capitalRatioVector = capitalRatio(dfs[st][emoNames[EMOTION]][i])
#         simVector = compareTokenToSentence(leftToken = emoNames[EMOTION], sentence = t)

        ngramVector = vec[i]
#         _X.append( embeddingsVector_lstm )
        _X.append(( _bind_vectors((ngramVector, embeddingsVector_lstm))  ))
#         _X.append( _bind_vectors((embeddingsVector,embeddingsVector_lstm))  )
        if emotion == None:
            _y.append( labels[i] )
            _y.append( labels[i][emotion] )
    return(np.asarray(_X), np.asarray(_y))

### Exctract activations
# for j,i in enumerate(lstmTrained.layers):
#     print(j,i)

get_activations = function([lstmTrained.layers[0].input], lstmTrained.layers[1].output, allow_input_downcast=True)
# activations = get_activations(lstm_X_train) # same result as above

In [84]:
normalize_labels = True
svc_X, svc_y = [[],[],[],[]], [[],[],[],[]]

for j, emo in enumerate(emoNames):
    emonum = j
    svc_X[j], svc_y[j] = _convert_text_to_vector(
        tweets = tweets_reduced,
        labels = labels, Dictionary = Dictionary, ngramizer = ngramizer,  
        emotion = emonum,
        lstmLayer = get_activations(lstm_X)        
    if normalize_labels:
        svc_y[j] /= 4
    print('emotion:', emoNames[emonum])
    print('\t', svc_X[j].shape, svc_y[j].shape)  
print("labels range: [%.1f : %.1f]" % (min(np.concatenate(svc_y)), max(np.concatenate(svc_y))))

emotion: confident
	 (2019, 2629) (2019,)
emotion: excited
	 (2019, 2629) (2019,)
emotion: happy
	 (2019, 2629) (2019,)
emotion: surprised
	 (2019, 2629) (2019,)
labels range: [0.0 : 1.0]

In [85]:
svc_X_train, svc_y_train = [emo[train] for emo in svc_X], [emo[train] for emo in svc_y]
svc_X_test,  svc_y_test  = [emo[test] for emo in svc_X], [emo[test] for emo in svc_y]

print('train data and label shape:', svc_X_train[0].shape, svc_y_train[0].shape)
print('test data and label shape:', svc_X_test[0].shape, svc_y_test[0].shape)

train data and label shape: (1615, 2629) (1615,)
test data and label shape: (404, 2629) (404,)


In [96]:
from sklearn.metrics import make_scorer

def pearson_score(y, y_pred):
    return pearsonr(y, y_pred)[0]

def spearman_score(y, y_pred):
    return spearmanr(y, y_pred)[0]

pearson_scorer = make_scorer(pearson_score)
spearman_scorer = make_scorer(spearman_score)

def getScores(estimator, x, y):
    y_pred = estimator.predict(x)
    return (r2_score(y, y_pred), 
            pearson_score(y, y_pred), 
            spearman_score(y, y_pred))

def my_scorer(estimator, x, y):
    r2, p, s = getScores(estimator, x, y)
#     print("%.4f, %.4f, %.4f" % (r2, p, s))
    return s

In [100]:

cv_folds = 5

def _greed_search():     

    list_acc = []    
    list_val = []
    epsilon = 0.001    
    if(ESTIMATOR == 'LinearSVR'): 
        gamma_array = [1.0]
    elif(ESTIMATOR == 'SVR'):
        gamma_array = [0.001,0.01,0.1]
    c_array = [0.001,0.01,0.1]
    print("estimator, emotion, C, gamma, tol, score")

    for C in c_array:
        for gamma in gamma_array:              
            for tol in [1e-4]:
                cvs = cross_val_score(estimator = LinearSVR(C=C, tol=tol), X=svc_X[EMOTION], y=svc_y[EMOTION], cv=cv_folds, n_jobs=cv_folds, scoring=my_scorer) 
                meanScore = np.mean(np.asarray(cvs))
#                 if(ESTIMATOR == 'LinearSVR'): 
#                     svcTrained = LinearSVR(C=C, tol=tol) 
#                 elif(ESTIMATOR == 'SVR'): 
#                     svcTrained = SVR(C=C, tol=tol,gamma=gamma) 
#       [EMOTION], svc_y[EMOTION])

#                 meanScore = r2_score(svc_y[EMOTION] , svcTrained.predict(svc_X[EMOTION]))
#                 prs = pearsonr(svc_y_test , svc_y_test_predict)[0]
#                 spr = spearmanr(svc_y_test , svc_y_test_predict)[0]

                list_val.append([moduleName,meanScore,ESTIMATOR, C, gamma,epsilon,tol,NGRAM_VALUE,EMBEDDINGS_DIM])
                print('%s, %s, %s, %s, %s, %.4f' %(ESTIMATOR, emoNames[EMOTION], str(C), str(gamma), str(tol), meanScore))     
    best = np.argmax(list_acc)    
#     print(list_val[best])
    out0 = {
        'gamma': list_val[best][4],
        'epsilon': list_val[best][5],
        'tol': list_val[best][6],
        'ngrams': list_val[best][7],
        'EMBEDDINGS_DIM': list_val[best][8],
        'score': list_val[best][1]
    return {ESTIMATOR:{emoNames[EMOTION]:out0}}

def _combine_best_results(pool_output, ESTIMATOR):
    new_p = {}       
    for i in pool_output:
    return new_p  

pool_output = [_greed_search()]


estimator, emotion, C, gamma, tol, score
LinearSVR, surprised, 0.001, 1.0, 0.0001, 0.3760
LinearSVR, surprised, 0.01, 1.0, 0.0001, 0.3231
LinearSVR, surprised, 0.1, 1.0, 0.0001, 0.1872

{'LinearSVR': {'surprised': {'C': 0.001, 'gamma': 1.0, 'epsilon': 0.001, 'tol': 0.0001, 'ngrams': 4, 'EMBEDDINGS_DIM': 100, 'score': 0.37597569469851272}}}

In [384]:
temp_params = _combine_best_results(pool_output, ESTIMATOR)

    train_params = {}
# train_params
# train_params = {'LinearSVR': {'confident': {'C': 0.01,   'EMBEDDINGS_DIM': 100,   'epsilon': 0.001,   'gamma': 1.0,   'ngrams': 4,   'score': 0.080144904381108911,   'tol': 0.0001},  'excited': {'C': 0.01,   'EMBEDDINGS_DIM': 100,   'epsilon': 0.001,   'gamma': 1.0,   'ngrams': 4,   'score': 0.20181175980742649,   'tol': 0.0001},  'happy': {'C': 0.01,   'EMBEDDINGS_DIM': 100,   'epsilon': 0.001,   'gamma': 1.0,   'ngrams': 4,   'score': 0.31076511419699682,   'tol': 0.0001},  'surprised': {'C': 0.001,   'EMBEDDINGS_DIM': 100,   'epsilon': 0.001,   'gamma': 1.0,   'ngrams': 4,   'score': -0.021849261405481914,   'tol': 0.0001}}}


In [177]:

    svcTrained = SVR(C=train_params[ESTIMATOR][emoNames[EMOTION]]['C'], 
    svcTrained = LinearSVR(C=train_params[ESTIMATOR][emoNames[EMOTION]]['C'], 
                 verbose=True)[EMOTION], svc_y[EMOTION])

# def saveModelFor(model, ESTIMATOR, path='/home/vlaand/IpythonNotebooks/wassa2017/'):
#     path = os.path.join(path,ESTIMATOR)
#     checkFolder(path)
#     filename = path+'.dump'
#     checkFolder(filename)
#     _ = joblib.dump(model, filename, compress=9)
#     print("model saved to <%s>" % filename)
# saveModelFor(svcTrained, ESTIMATOR=ESTIMATOR, path = os.path.join(repositoryPath, moduleName, 'classifiers'))

[LibLinear]LinearSVR(C=0.01, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=True)

In [178]:
from sklearn.svm import SVR, LinearSVR
from sklearn.externals import joblib
from multiprocessing import Pool
import os, sys

SEP = '/'
EXTENSION = '.dump'
SAVE_DIR = '/home/vlaand/IpythonNotebooks/senpy-plugins-NUIG/fivePointRegression/classifiers/'

def ifExists(filename):
    dir = os.path.dirname(filename)

def trainModelFor(EMOTION):
    y2 = []
    for ly in y:  
        if not np.isnan(ly[EMOTION]):        
    y2 = np.asarray(y2)    
    C = train_params[ESTIMATOR][ emoNames[EMOTION] ]['C'] 
    tol = train_params[ESTIMATOR][ emoNames[EMOTION] ]['tol'] 
    if(ESTIMATOR == 'SVR'):
        gamma = train_params[ESTIMATOR][ emoNames[EMOTION] ]['gamma'] 
        svcTrained = SVR(C=C, gamma=gamma, tol=tol), y2)
    elif(ESTIMATOR == 'LinearSVR'):
        svcTrained = LinearSVR(C=C, tol=tol), y2)
        print('Error: Classifier is not chosen')

def checkFolder(filename):
    dir = os.path.dirname(filename)

def saveModelFor(model, EMOTION):
    filename = path + SEP +str(emoNames[EMOTION]) + EXTENSION
    _ = joblib.dump(model, filename, compress=9)
    print('model ' + filename + ' saved')
    #return {emoNames[EMOTION]:model}
def trainAndSave(emotion):
    model = trainModelFor(emotion)
    saveModelFor(model=model, EMOTION=emotion)
    return {emoNames[emotion]:model}

def _combine_train_results(pool_output, ESTIMATOR):
    new_p = {ESTIMATOR:{}}   
    for i in pool_output:
    return new_p    

# X2 = np.asarray(X) 
# with Pool(processes = len(emoNames)) as p:
#     pool_output =, [i for i in  range(len(emoNames))])

# temp_models = _combine_train_results(pool_output, ESTIMATOR)

# try:
#     train_models.update(temp_models)
# except:
#     train_models = temp_models

In [227]:
l0,l1,l2 = [],[],[]
y0,y1,y2 = [],[],[]

for i,y_ in enumerate(y):
#     print(y_/4, y_pred[0][i], y_pred[1][i], y_pred[2][i])
    y_0,y_1,y_2 = y_pred[0][i],y_pred[1][i],y_pred[2][i]
    if not np.isnan(y_[2]/4):
#     print(i/4, j[0], j[1], j[2])

NameError                                 Traceback (most recent call last)
<ipython-input-227-c433b161cc70> in <module>()
      4 for i,y_ in enumerate(y):
      5 #     print(y_/4, y_pred[0][i], y_pred[1][i], y_pred[2][i])
----> 6     y_0,y_1,y_2 = y_pred[0][i],y_pred[1][i],y_pred[2][i]
      8     l0.append(y_0)

NameError: name 'y_pred' is not defined

Comparison HTC-VAD & FPR

In [40]:
models_path_fpr = '/home/vlaand/IpythonNotebooks/senpy-plugins-development/fivePointRegression/classifiers/SVR/'

models_fpr = {}
for emo in emoNames:
# models_fpr

In [9]:
models_path_htc = '/home/vlaand/IpythonNotebooks/senpy-plugins-NUIG/hashTagClassification/classifiers/LinearSVR/'

models_htc = {}
for emo in ['sadness', 'disgust', 'surprise', 'anger', 'fear', 'joy']:
# models_htc

In [215]:
models_path_htc_rbf = '/home/vlaand/IpythonNotebooks/senpy-plugins-NUIG/hashTagClassification/classifiers/SVC/'

models_htc_rbf = {}
for emo in ['sadness', 'disgust', 'surprise', 'anger', 'fear', 'joy']:
# models_htc_rbf

In [18]:
# wordFrequencies
# ngramizer

In [10]:
def _load_unique_tokens(filename = 'wordFrequencies.dump'):

#     filename = os.path.join(os.path.dirname(__file__),filename)

wordFrequencies = _load_unique_tokens('/home/vlaand/IpythonNotebooks/senpy-plugins-NUIG/hashTagClassification/wordFrequencies.dump')

In [11]:
def foo():
    ngramizers = []
    for n_grams in [2,3,4]:        
        filename = os.path.join('/home/vlaand/IpythonNotebooks/senpy-plugins-NUIG/hashTagClassification/ngramizers/', str(n_grams)+'gramizer.dump')
        ngramizers.append( joblib.load(filename) )
    return ngramizers

ngramizers = foo()

In [121]:
# ngramizer = joblib.load('/home/vlaand/IpythonNotebooks/senpy-plugins-NUIG/hashTagClassification/ngramizers/4gramizer.dump')

In [14]:
import pandas as pd

dataTrain = _read_csv_data(filename = _path_dataset, header=True)    
print('training data loaded from <'+_path_dataset+'>')

train_tweets = []
train_labels = []

for i in dataTrain:
    scores = []
    for score in i[1:]:

training data loaded from </home/vlaand/IpythonNotebooks/cf-5point-data/data-full.csv>

In [15]:
_ngramVectors = []

for i in ngramizers:

In [16]:
X_eval_2, y_eval = _convert_text_to_vector(tweets = train_tweets, tweet_original = train_tweets, Dictionary = Dictionary, labels = train_labels, ngramvec =_ngramVectors[0])
X_eval_3, y_eval = _convert_text_to_vector(tweets = train_tweets, tweet_original = train_tweets, Dictionary = Dictionary, labels = train_labels, ngramvec =_ngramVectors[1])
X_eval_4, y_eval = _convert_text_to_vector(tweets = train_tweets, tweet_original = train_tweets, Dictionary = Dictionary, labels = train_labels, ngramvec =_ngramVectors[2])


(2019, 16169) (2019, 18241) (2019, 19192)
(2019, 3)

In [ ]:
X = {'sadness':X4, 'disgust':X4, 'surprise':X4, 'anger':X2, 'fear':X4, 'joy':X3}

Predicts to VAD

In [24]:
y_pred_htc = {}

for emo in ['sadness', 'disgust', 'surprise', 'anger', 'fear', 'joy']:
    if emo == 'anger':
    elif emo == 'joy':
features_htc = {}
for dimension in ['V','A','D']:
    values = []
    for row in range(len(y_pred_htc['surprise'])):
        weights=[y_pred_htc[i][row] for i in y_pred_htc if (i != 'surprise')]
        if False in all(v == 0 for v in weights):
            values.append(np.average([centroids[i][dimension] for i in y_pred_htc if (i != 'surprise')], weights=weights ))

In [ ]:
y_pred_htc_rbf = {}
for emo in ['sadness', 'disgust', 'surprise', 'anger', 'fear', 'joy']:
    if emo == 'anger':
    elif emo == 'joy':


In [232]:
features_htc_rbf = {}
for dimension in ['V','A','D']:
    values = []
    for row in range(len(y_pred_htc_rbf['surprise'])):
        weights=[y_pred_htc_rbf[i][row][1] for i in y_pred_htc_rbf if (i != 'surprise')]
#         if False in all(v == 0 for v in weights):
        values.append(np.average([centroids[i][dimension] for i in y_pred_htc_rbf if (i != 'surprise')], weights=weights ))
#         else:
#             values.append(5.0)            
# features_htc_rbf


In [119]:
y_pred_fpr = {}

for emo in emoNames:

{'confident': array([ 5.99784253,  6.03535934,  6.03450945, ...,  6.03278388,
         6.01940154,  6.04865325]),
 'excited': array([ 6.04721026,  6.13505352,  6.21766691, ...,  6.17532646,
         6.14233364,  6.31161089]),
 'happy': array([ 4.67336817,  4.32912562,  4.62225976, ...,  5.25173427,
         4.73891756,  5.04414936])}

In [155]:
y_annotated = {'V':y_eval.T[0]*10/4, 'A':y_eval.T[1]*10/4 ,'D':y_eval.T[2]*10/4}

In [ ]:
vad_mappings = {
     'confident': 'D',
     'excited': 'A',
     'happy': 'V',

In [233]:
for i,dimension in enumerate(['V','A','D']):
        r2_score(   features_htc[dimension],               y_annotated[dimension]               ),
        r2_score(   y_pred_fpr[vad_mappings[dimension]],   y_annotated[dimension]               ),
        r2_score(   features_htc_rbf[dimension],           y_annotated[dimension]               )

-0.620680748396 -19.4727733044 -1.75307874052
-2.27833304462 -29.496527739 -14.8417019805
-1.56540913825 -22311.4709413 -9.50414297565


In [280]:
%pylab inline

import numpy as np
import matplotlib.pyplot as plt
import natsort

tmp_range = 2018
dimension = 'A'

s = y_annotated[dimension]
order = sorted(range(len(s)), key=lambda k: s[k])

g0 = [i for i in range(2019)]
g1 = features_htc[dimension]
g2 = y_pred_fpr[vad_mappings[dimension]] 
g3 = features_htc_rbf[dimension]
g4 = y_annotated[dimension]
# plt.plot(g1[0:tmp_range], 'r', g2[0:tmp_range], 'b', g3[0:tmp_range], 'g')
# plt.legend()

line_0, = plt.plot(np.array(g1)[], 'b.',  label='htc SVM')
line_1, = plt.plot(np.array(g2)[order], 'r.', label='fpr SVM')
line_2, = plt.plot(np.array(g3)[order], 'm.', label='htc SVM rbf')
line_3, = plt.plot(np.array(g4)[order], 'g.', label='annotated')
# plt.plot([0,l],[.3,.3])
# plt.plot([0,l],[-.3,-.3])
plt.legend(handles=[line_0, line_2, line_1, line_3])
plt.legend(bbox_to_anchor=(1.02, .4, .65, .0), loc=3,
           ncol=2, mode="expand", borderaxespad=0.)
# plt.legend(bbox_to_anchor=(0.6, 0.05, 0.35, .102), loc=4,
#            ncol=1, mode="expand", borderaxespad=0.)

plt.ylabel('dimension: '+dimension)
plt.title("Models Evaluation on 5point corpus")

Populating the interactive namespace from numpy and matplotlib

In [ ]:
feature_set_ftr = {emo: clf.predict_proba(X[emo]) for emo,clf in zip(emoNames, classifiers[estimator])}

# feature_set_htc = {emo: clf.predict(X[emo]) for emo,clf in zip(['sadness', 'disgust', 'surprise', 'anger', 'fear', 'joy'], classifiers[estimator])}

In [22]:
centroids= {
                            "anger": {
                                "A": 6.95, 
                                "D": 5.1, 
                                "V": 2.7
                            "disgust": {
                                "A": 5.3, 
                                "D": 8.05, 
                                "V": 2.7
                            "fear": {
                                "A": 6.5, 
                                "D": 3.6, 
                                "V": 3.2
                            "joy": {
                                "A": 7.22, 
                                "D": 6.28, 
                                "V": 8.6
                            "sadness": {
                                "A": 5.21, 
                                "D": 2.82, 
                                "V": 2.21
                            "neutral": {
                                "A": 5.0, 
                                "D": 5.0, 
                                "V": 5.0


In [250]:
import numpy as np
import pandas as pd
import csv

from keras.preprocessing import sequence
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Bidirectional, Dropout, LSTM
from keras.regularizers import l2

from imblearn.over_sampling import RandomOverSampler
from sklearn.cross_validation import StratifiedKFold

In [241]:
seed = 1337

maxlen = 65
batch_size = 32
epochs = 50

hidden_dims1 = 50
hidden_dims2 = 25
hidden_dims3 = 3

path = '/home/vlaand/IpythonNotebooks/cf-5point-data/'

_path_wordembeddings = '/home/vlaand/data/Glove/glove.twitter.27B/glove.twitter.27B.'+str(EMBEDDINGS_DIM)+'d.txt'
_path_dataset = path + "data-full5.csv"

Load training data and word embeddinbgs

In [242]:
dataTrain = _read_csv_data(filename = _path_dataset, header=True)  

train_tweets, train_labels = _data_to_lists(dataTrain)

data loaded from </home/vlaand/IpythonNotebooks/cf-5point-data/data-full5.csv>
	2019 entries
stacking data to lists
data stacked to lists
	2019 tweets
	2019 labels

In [243]:
wordFrequencies = _get_unique_tokens(train_tweets)
# _plot_word_frequencies(wordFrequencies, WORD_FREQUENCY_TRESHOLD = WORD_FREQUENCY_TRESHOLD)

Dictionary, Indices = _load_original_vectors(
        filename = '/home/vlaand/data/Glove/glove.twitter.27B/glove.twitter.27B.'+str(EMBEDDINGS_DIM)+'d.txt', 
        sep = ' ', 

In [244]:
train_sequences = _texts_to_sequences(train_tweets)

embedding_matrix = np.zeros((len(Indices)+1, EMBEDDINGS_DIM))

print('matrix created\n\t',embedding_matrix.shape)
for (word, i) in Indices.items():
    embedding_vector = Dictionary.get(word)
    if (embedding_vector != None):
        embedding_matrix[i] = embedding_vector.astype(np.float)

X = sequence.pad_sequences(train_sequences, maxlen=maxlen)
y = np.array(train_labels)

print(len(X), 'train sequences loaded')
print('\t',X.shape,'\n\t', y.shape)

matrix created
	 (1193515, 100)
/home/vlaand/anaconda3/lib/python3.5/site-packages/ipykernel/ FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
2019 train sequences loaded
	 (2019, 65) 
	 (2019, 4)

Evaluation / Cross validation

Kfold cross-validation

In [66]:
from multiprocessing import Pool
from sklearn.metrics import r2_score
import os

epochs = 20
n_splits = 5

def _cross_validation_parallel(_input):
    train, test = _input
    model = Sequential()
    model.add(Embedding(len(Indices)+1,  EMBEDDINGS_DIM, weights=[embedding_matrix],
                                input_length=maxlen, trainable=False))
    model.add(Dense(hidden_dims1, kernel_regularizer=l2(0.01)), )
    model.add(Dense(hidden_dims2, kernel_regularizer=l2(0.01)), ) 
    model.add(Dense(hidden_dims3, activation='elu'))

    model.compile(loss='mae', optimizer='adam',metrics=['accuracy'])[train], y[train], batch_size=batch_size, epochs=epochs, validation_split=None, verbose=0)

    scores = []
    for i in [0,1,2]:
        score = r2_score( y[test][:, i] , model.predict(X[test])[:, i] )
        #score = r2_score([y_[i] for y_ in y[test]], [y_[i] for y_ in model.predict(X[test])])
    return scores

seed = 1337

kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

with Pool(processes = n_splits) as p:
    cvscores =, [(train, test) for (train, test) in kfold.split(X, [yy[0] for yy in y])])

/home/vlaand/anaconda3/lib/python3.5/site-packages/sklearn/model_selection/ Warning: The least populated class in y has only 1 members, which is too few. The minimum number of groups for any class cannot be less than n_splits=5.
  % (min_groups, self.n_splits)), Warning)
R2[confident]	0.12651473344
R2[excited]	0.183913593432
R2[happy]	0.325849621671
R2[confident]	0.114282643682
R2[excited]	0.280324333738
R2[happy]	0.33475915174
R2[confident]	0.0799940323227
R2[excited]	0.297428340739
R2[happy]	0.427131191301
R2[confident]	0.115380225651
R2[excited]	0.318680321749
R2[happy]	0.245129420541
R2[confident]	0.12899566559
R2[excited]	0.276562234678
R2[happy]	0.34378839584

In [68]:
print("%d-fold cross validation\n" % (n_splits))
for i in [0,1,2]:
    print("R2[%s] \t%.4f (+/- %.2f)" % (emoNames[i], np.mean(cvscores,axis=0)[i], np.std(cvscores,axis=0)[i]))

5-fold cross validation

R2[confident] 	0.1130 (+/- 0.02)
R2[excited] 	0.2714 (+/- 0.05)
R2[happy] 	0.3353 (+/- 0.06)
5-fold cross validation R2[confident] 0.0758 (+/- 0.02) R2[excited] 0.2215 (+/- 0.03) R2[happy] 0.2808 (+/- 0.02) 3-fold cross validation R2[confident] 0.09 (+/- 0.14) R2[excited] 0.20 (+/- 0.11) R2[happy] 0.18 (+/- 0.08)

In [ ]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.cross_validation import StratifiedKFold

from sklearn.metrics import r2_score, f1_score, classification_report
# from skll.metrics import pearsonr, spearman
from scipy.stats import pearsonr, spearmanr

from multiprocessing import Pool, Manager
import os
import numpy as np

def _cross_validation_parallel(_input):
    train, test = _input
    model = Sequential()
    model.add(Embedding(len(Indices)+1,  EMBEDDINGS_DIM, weights=[embedding_matrix],
                                input_length=maxlen, trainable=False))
    model.add(Bidirectional(LSTM(EMBEDDINGS_DIM))) #dropout is same as regularisation
    model.add(Dense(hidden_dims1, W_regularizer=l2(0.01)), )
    model.add(Dense(hidden_dims2, W_regularizer=l2(0.01)), ) #!!!
    model.add(Dense(hidden_dims3, activation='softsign'))
    model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['accuracy', matthews_correlation])
   [train], y[train], batch_size=batch_size, nb_epoch=nb_epoch, validation_split=None)
#     scores = model.evaluate(X[test], y[test], verbose=0,)
    y_test_predict = model.predict(X[test])
#     y_test_predict = np.reshape(y_test_predict, newshape=(len(y_test_predict),))
    scores =  [
                [r2_score(y_test_predict[:,emo], y[test][:,emo]) for emo in [0,1,2,3]], 
                [pearsonr(y_test_predict[:,emo], y[test][:,emo]) for emo in [0,1,2,3]], 
                [spearmanr(y_test_predict[:,emo], y[test][:,emo]) for emo in [0,1,2,3]]
#     try:
#         print("%s: %.2f" % (model.metrics_names[2], scores[1]))
#     except:
#         print('Error')
    return scores

nb_epoch = 80
n_splits = 5
hidden_dims1, hidden_dims2, hidden_dims3 = 50, 25, 4

# with open('senpy-plugins-development/fivePointRegression/classifiers/LSTM/log.out', "w") as log_file:
#     log_file.write(str(cvscores)+'\n')
#     log_file.write("%.2f (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

In [ ]:
kfold = StratifiedKFold(n_folds=n_splits, shuffle=True, random_state=1337, y=y[:,0])

with Pool(processes = n_splits) as p:
    cvscores =, ((train, test) for (train, test) in kfold))

/home/vlaand/anaconda3/lib/python3.5/site-packages/sklearn/ Warning: The least populated class in y has only 1 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=5.
  % (min_labels, self.n_folds)), Warning)
Epoch 1/80
 448/1618 [=======>......................] - ETA: 52s - loss: 1.5245 - acc: 0.4576 - matthews_correlation: 0.2409

In [196]:
for emo in [0,1,2,3]:

    r2 = [n[emo] for n in [fold[0] for fold in cvscores]]
    prs = [n[emo][0] for n in [fold[1] for fold in cvscores]]
    spr = [n[emo][0] for n in [fold[2] for fold in cvscores]]

    print("[%8s]\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f" % (emoNames[emo],
                                 np.mean(r2), np.std(r2)/np.sqrt(n_splits),
                                 np.mean(prs), np.std(prs)/np.sqrt(n_splits),
                                 np.mean(spr), np.std(spr)/np.sqrt(n_splits)

		R2	sd	pearson	sd	spearman
[confident]	-0.62	0.09	0.44	0.02	0.43	0.01
[ excited]	-0.65	0.04	0.52	0.01	0.52	0.00
[   happy]	-0.14	0.05	0.62	0.02	0.62	0.02
[surprised]	-1.68	0.09	0.29	0.01	0.28	0.02
R2 sd pearson sd spearman [confident] -0.60 0.10 0.46 0.01 0.44 0.01 [ excited] -0.47 0.03 0.56 0.01 0.55 0.01 [ happy] -0.10 0.02 0.63 0.01 0.63 0.01 [surprised] -1.60 0.09 0.24 0.01 0.24 0.01 R2 sd pearson sd spearman [confident] -0.62 0.09 0.44 0.02 0.43 0.01 [ excited] -0.65 0.04 0.52 0.01 0.52 0.00 [ happy] -0.14 0.05 0.62 0.02 0.62 0.02 [surprised] -1.68 0.09 0.29 0.01 0.28 0.02

In [176]:



Train / test split

In [336]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print('shapes of training data and label tensors:', X_train.shape, y_train.shape)
print('shapes of testing data and label tensors:', X_test.shape, y_test.shape)

shapes of training data and label tensors: (1615, 65) (1615, 4)
shapes of testing data and label tensors: (404, 65) (404, 4)

In [21]:
# set(y for y in y_train.flat)

In [26]:
# kfold = StratifiedKFold(n_folds=5, shuffle=True, random_state=seed,y=y)

Custom Metrics

In [337]:
import keras.backend as K

def matthews_correlation(y_true, y_pred):
    y_pred_pos = K.round(K.clip(y_pred, 0, 1))
    y_pred_neg = 1 - y_pred_pos

    y_pos = K.round(K.clip(y_true, 0, 1))
    y_neg = 1 - y_pos

    tp = K.sum(y_pos * y_pred_pos)
    tn = K.sum(y_neg * y_pred_neg)

    fp = K.sum(y_neg * y_pred_pos)
    fn = K.sum(y_pos * y_pred_neg)

    numerator = (tp * tn - fp * fn)
    denominator = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    return numerator / (denominator + K.epsilon())
activations = ['elu','softplus','softsign','relu','tanh','sigmoid','hard_sigmoid','linear','softmax'] losses = ['mse','mae'] optimizers = ['sgd', 'rmsprop', 'adagrad', 'adadelta', 'adam', 'nadam']

In [338]:
hidden_dims1 = 50
hidden_dims2 = 25
hidden_dims3 = 4

model = Sequential()
model.add(Embedding(len(Indices)+1,  EMBEDDINGS_DIM, weights=[embedding_matrix],
                            input_length=maxlen, trainable=False))
model.add(Bidirectional(LSTM(EMBEDDINGS_DIM))) #dropout is same as regularisation

model.add(Dense(hidden_dims1, b_regularizer=l2(0.01)),)
model.add(Dense(hidden_dims2, b_regularizer=l2(0.01)), ) 
model.add(Dense(hidden_dims3, activation='softsign'))

model.compile(loss='mean_absolute_error', optimizer='adam',metrics=['accuracy',matthews_correlation])

In [339]:
nb_epoch = 20, y, batch_size=batch_size, nb_epoch=nb_epoch, validation_split=None)

Epoch 1/20
2019/2019 [==============================] - 21s - loss: 0.1428 - acc: 0.4210 - matthews_correlation: 0.2000    
Epoch 2/20
2019/2019 [==============================] - 20s - loss: 0.1125 - acc: 0.4631 - matthews_correlation: 0.3472    
Epoch 3/20
2019/2019 [==============================] - 20s - loss: 0.1072 - acc: 0.4715 - matthews_correlation: 0.3852    
Epoch 4/20
2019/2019 [==============================] - 20s - loss: 0.1011 - acc: 0.5057 - matthews_correlation: 0.4427    
Epoch 5/20
2019/2019 [==============================] - 20s - loss: 0.0968 - acc: 0.4943 - matthews_correlation: 0.4517    
Epoch 6/20
2019/2019 [==============================] - 20s - loss: 0.0961 - acc: 0.5097 - matthews_correlation: 0.4445    
Epoch 7/20
2019/2019 [==============================] - 20s - loss: 0.0930 - acc: 0.5324 - matthews_correlation: 0.4795    
Epoch 8/20
2019/2019 [==============================] - 20s - loss: 0.0897 - acc: 0.5379 - matthews_correlation: 0.4953    
Epoch 9/20
2019/2019 [==============================] - 20s - loss: 0.0898 - acc: 0.5364 - matthews_correlation: 0.4915    
Epoch 10/20
2019/2019 [==============================] - 20s - loss: 0.0863 - acc: 0.5632 - matthews_correlation: 0.5215    
Epoch 11/20
2019/2019 [==============================] - 19s - loss: 0.0851 - acc: 0.5681 - matthews_correlation: 0.5196    
Epoch 12/20
2019/2019 [==============================] - 19s - loss: 0.0824 - acc: 0.5750 - matthews_correlation: 0.5518    
Epoch 13/20
2019/2019 [==============================] - 19s - loss: 0.0812 - acc: 0.5765 - matthews_correlation: 0.5500    
Epoch 14/20
2019/2019 [==============================] - 20s - loss: 0.0794 - acc: 0.5721 - matthews_correlation: 0.5466    
Epoch 15/20
2019/2019 [==============================] - 19s - loss: 0.0789 - acc: 0.5849 - matthews_correlation: 0.5683    
Epoch 16/20
2019/2019 [==============================] - 20s - loss: 0.0776 - acc: 0.5785 - matthews_correlation: 0.5768    
Epoch 17/20
2019/2019 [==============================] - 20s - loss: 0.0763 - acc: 0.5889 - matthews_correlation: 0.5701    
Epoch 18/20
2019/2019 [==============================] - 20s - loss: 0.0752 - acc: 0.5884 - matthews_correlation: 0.5849    
Epoch 19/20
2019/2019 [==============================] - 20s - loss: 0.0725 - acc: 0.5884 - matthews_correlation: 0.6061    
Epoch 20/20
2019/2019 [==============================] - 20s - loss: 0.0732 - acc: 0.5874 - matthews_correlation: 0.6050    
<keras.callbacks.History at 0x7fa71c40a978>

In [325]:
from sklearn.metrics import r2_score, f1_score, classification_report
# from skll.metrics import pearson, spearman
from scipy.stats import pearsonr, spearmanr

y_test_predicts = []
for i in range(20):
    if i>0:, y_train, batch_size=batch_size, nb_epoch=1,validation_split=None,)
#, y, batch_size=batch_size, nb_epoch=1, validation_split=None)
    y_test_pred = np.array( model.predict(X_test))
    print("%8s\t%.2f\t%.2f\t%.2f" % (i,
                                 np.mean([r2_score(y1,y2)for y1,y2 in zip(y_test, y_test_pred)]) ,                                 
                                 np.mean([pearsonr(y1,y2)[0]  for y1,y2 in zip(y_test, y_test_pred)]) ,
                                 np.mean([spearmanr(y1,y2)[0]  for y1,y2 in zip(y_test, y_test_pred)])

#, y_train, batch_size=batch_size, nb_epoch=nb_epoch,validation_split=None,)

		R2	pearson	spearman
       0	-3.19	0.02	0.01
Epoch 1/1
1413/1413 [==============================] - 14s - loss: 0.4674 - acc: 0.4529 - matthews_correlation: 0.2539    
       1	-1.39	0.56	0.50
Epoch 1/1
1413/1413 [==============================] - 14s - loss: 0.4290 - acc: 0.4883 - matthews_correlation: 0.3599    
       2	-1.05	0.57	0.50
Epoch 1/1
1413/1413 [==============================] - 13s - loss: 0.4050 - acc: 0.4890 - matthews_correlation: 0.4193    
       3	-1.05	0.59	0.52
Epoch 1/1
1413/1413 [==============================] - 14s - loss: 0.3923 - acc: 0.4862 - matthews_correlation: 0.4674    
       4	-0.92	0.59	0.52
Epoch 1/1
1413/1413 [==============================] - 14s - loss: 0.3811 - acc: 0.5053 - matthews_correlation: 0.4823    
       5	-0.71	0.59	0.53
Epoch 1/1
1413/1413 [==============================] - 14s - loss: 0.3751 - acc: 0.5159 - matthews_correlation: 0.5019    
       6	-0.66	0.60	0.54
Epoch 1/1
1413/1413 [==============================] - 14s - loss: 0.3594 - acc: 0.5074 - matthews_correlation: 0.5338    
       7	-0.91	0.62	0.53
Epoch 1/1
1413/1413 [==============================] - 13s - loss: 0.3549 - acc: 0.5308 - matthews_correlation: 0.5395    
       8	-0.87	0.60	0.53
Epoch 1/1
1413/1413 [==============================] - 14s - loss: 0.3462 - acc: 0.5506 - matthews_correlation: 0.5621    
       9	-0.86	0.62	0.55
Epoch 1/1
1413/1413 [==============================] - 14s - loss: 0.3393 - acc: 0.5322 - matthews_correlation: 0.5777    
      10	-0.97	0.61	0.53
Epoch 1/1
1413/1413 [==============================] - 13s - loss: 0.3302 - acc: 0.5350 - matthews_correlation: 0.5844    
      11	-0.95	0.59	0.52
Epoch 1/1
1413/1413 [==============================] - 14s - loss: 0.3240 - acc: 0.5499 - matthews_correlation: 0.6001    
      12	-0.85	0.60	0.54
Epoch 1/1
1413/1413 [==============================] - 13s - loss: 0.3213 - acc: 0.5506 - matthews_correlation: 0.5933    
      13	-0.80	0.59	0.53
Epoch 1/1
1413/1413 [==============================] - 13s - loss: 0.3170 - acc: 0.5471 - matthews_correlation: 0.5945    
      14	-0.97	0.61	0.54
Epoch 1/1
1413/1413 [==============================] - 13s - loss: 0.3055 - acc: 0.5704 - matthews_correlation: 0.6265    
      15	-0.81	0.60	0.55
Epoch 1/1
1413/1413 [==============================] - 13s - loss: 0.3011 - acc: 0.5648 - matthews_correlation: 0.6319    
      16	-0.87	0.58	0.52
Epoch 1/1
1413/1413 [==============================] - 14s - loss: 0.2980 - acc: 0.5740 - matthews_correlation: 0.6389    
      17	-0.88	0.59	0.53
Epoch 1/1
1413/1413 [==============================] - 14s - loss: 0.2899 - acc: 0.5563 - matthews_correlation: 0.6577    
      18	-0.93	0.60	0.55
Epoch 1/1
1413/1413 [==============================] - 13s - loss: 0.2842 - acc: 0.5718 - matthews_correlation: 0.6652    
      19	-0.89	0.61	0.54

In [2]:
print('[%8s]\tR2\tpearson\tspearman' % emoNames[EMOTION])
for i,y__ in enumerate(y_test_predicts): 
#         print("%8s\t%.2f\t%.2f\t%.2f" % (i,
#                                  r2_score(y_dev , y_test_predict),                                 
#                                  pearsonr(y_dev , y_test_predict)[0],
#                                  spearmanr(y_dev , y_test_predict)[0]))
        print("%8s\t%.2f\t%.2f\t%.2f" % (i,
                                 np.mean([r2_score(y1,y2)for y1,y2 in zip(y_test, y__)]) ,                                 
                                 np.mean([pearsonr(y1,y2)[0]  for y1,y2 in zip(y_test, y__)]) ,
                                 np.mean([spearmanr(y1,y2)[0]  for y1,y2 in zip(y_test, y__)])))

In [350]:
from sklearn.metrics import r2_score, f1_score, classification_report

y_test_predict = model.predict(X_test)
for i in [0,1,2,3]:
    print("[%9s]\t%.2f" % (emoNames[i],
                                y_test[: , i], 
                                y_test_predict[: , i]

[confident]	0.96
[  excited]	0.94
[    happy]	0.96
[surprised]	0.93

In [349]:
%pylab inline
import numpy as np
import matplotlib.pyplot as plt
import natsort

dimension = 0

s = y_test[:, dimension] #y_annotated[dimension]
order = sorted(range(len(s)), key=lambda k: s[k])

g1 = y_test[:, dimension]
g2 = y_test_predict[:, dimension]   

#--------------          ----SCALING----
line_1, = plt.plot(np.array(g2)[order], 'g.', label='Prediction')
line_0, = plt.plot(np.array(g1)[order], 'r.',  label='Actual')

plt.legend(handles=[line_0, line_1])
plt.legend(bbox_to_anchor=(1.02, .4, .65, .0), loc=3,ncol=1, mode="expand", borderaxespad=1.0)
plt.ylabel('dimension: '+emoNames[dimension])
plt.title("Models Evaluation on 5point corpus")

Populating the interactive namespace from numpy and matplotlib

Save Model and Weights

In [351]:
def _save_model_wassa(model, savePath, emo, modelName):
    if emo == None:
        savePath = os.path.join(savePath,modelName)
        savePath = savePath+"."+emo
    model_json = model.to_json()
    with open(savePath + ".json", "w") as json_file:
        print("<%s.json> " % (savePath))
    model.save_weights(savePath +".h5", overwrite=True)
    print("<%s.h5> " % (savePath))
# + "_.h5")
#     print("<"+savePath + "_.h5>")
savePath = "/home/vlaand/IpythonNotebooks/senpy-plugins-development/fivePointRegression/classifiers/LSTM/"

_save_model_wassa(model=model, savePath=savePath, emo=None, modelName="fivePointRegression")
_save_model_wassa(model=model, savePath='/home/vlaand/IpythonNotebooks/cf-5point-data/classifiers/LSTM', emo=None, modelName="fivePointRegression")



In [ ]:
with open('model_fivePointRegression.json', 'r') as json_file:
    loaded_model_json =
loaded_model = model_from_json(loaded_model_json)

# load weights into new model
# print("Loaded model from disk")