In [2]:
""" Implementing decoder"""

"""Imports"""
import numpy as np
from nltk import sent_tokenize, word_tokenize

"""Global definitons"""
_start = 'S_START'
_end = 'S_END'
_unk = 'UNK'
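
In [ ]:
"""
Sketch (not from the original notebook): how a raw target sentence could be mapped to the
word-index sequence the decoder expects, using the tokens defined above. The vocabulary
dictionary here is a made-up toy example; a real one would be built from the training corpus.
"""
sentence = "the cat sat on the mat"
tokens = [_start] + word_tokenize(sentence.lower()) + [_end]

# Toy vocabulary; unknown words fall back to the UNK index
vocab = {_start: 0, _end: 1, _unk: 2, 'the': 3, 'cat': 4, 'sat': 5, 'on': 6, 'mat': 7}
expSent = [vocab.get(w, vocab[_unk]) for w in tokens]
print(expSent)   # [0, 3, 4, 5, 6, 3, 7, 1]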

In [ ]:
""" util definitions"""

def hyperbolic(net):
    return np.tanh(net)

def relu(net):
    return np.maximum(0,net)

def softmax(net):
    # Shift by the max for numerical stability before exponentiating
    _exp = np.exp(net - np.max(net))
    return _exp/np.sum(_exp)

def predict(scores):
    return np.argmax(scores)
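
In [ ]:
"""
Quick sanity check of the utility functions above (illustrative only): softmax should
return a probability distribution and predict should pick its argmax.
"""
net = np.array([2.0, 1.0, 0.1])
probs = softmax(net)
print(probs, np.sum(probs))    # probabilities summing to 1
print(predict(probs))          # index of the most likely entry, 0 here
print(hyperbolic(net), relu(np.array([-1.0, 3.0])))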

In [ ]:
class RNNlayer:
    
    """ 
    RNN nodes for decoder
    
    hidden state at time step t of decoder is conditioned on hidden state at time step t-1,
    output at time step t-1 and context C from the encoder
    """
    
    def __init__(self, inputSize, outputSize, bptt_truncate = 5, hiddenDim = 10):
        """
        inputSize = dimensions of the context from encoder
        outputSize = vocabulary size
        hiddenDim = size of the hidden unit in RNN
        bptt_truncate = truncate the number of time steps we calculate the gradient during backpropagation
        """
        self.inputSize = inputSize
        self.outputSize = outputSize
        self.hiddenDim = hiddenDim
        self.bptt_truncate = bptt_truncate
        
        self.w_in = np.random.uniform(-np.sqrt(1./inputSize), np.sqrt(1./inputSize),(hiddenDim, inputSize))
        self.w_hh = np.random.uniform(-np.sqrt(1./hiddenDim), np.sqrt(1./hiddenDim),(hiddenDim, hiddenDim))
        self.w_outH = np.random.uniform(-np.sqrt(1./outputSize), np.sqrt(1./outputSize),(hiddenDim, outputSize))
        self.w_out = np.random.uniform(-np.sqrt(1./hiddenDim), np.sqrt(1./hiddenDim),(outputSize, hiddenDim))
        
    def forwardProp(self, context, expSent):
        """
        context: context vector calculated by the encoder
        expSent: expected sentence as word indices in the target-language vocabulary
        """
        
        # Total number of time steps equals the number of words in the sentence
        T = len(expSent)
        
        # Saving all hidden states and outputs during forward propagation.
        # Row -1 holds the initial state: a zero hidden state and a one-hot start token,
        # so that _h[t-1] and _o[t-1] are well defined at t = 0.
        _h = np.zeros((T + 1, self.hiddenDim))
        _o = np.zeros((T + 1, self.outputSize))
        
        # Initializing the previous output as the start token
        # (assumes the start token is at index 0 of the target vocabulary)
        _o[-1, 0] = 1.0
        
        # For each time step, calculating the hidden state and output
        for t in np.arange(T):
            outIdx = predict(_o[t-1])
            _h[t] = hyperbolic(self.w_in.dot(context) + self.w_hh.dot(_h[t-1]) + self.w_outH[:, outIdx])
            _o[t] = softmax(self.w_out.dot(_h[t]))
            
        return _o, _h
    
    def calculateLoss(self, context, expSentence):
        
        #For each sentence
        o, h = self.forwardProp(context, expSentence)
        #TODO recheck this part
        correctPred = o[np.arange(len(expSentence)), expSentence]
        #Loss for each sentence
        l = -1 * np.sum(np.log(correctPred))
        return l
    
    def calculateTotalLoss(self, contexts, expSentences):
        
        L = 0.0
        for i in range(len(contexts)):
            L += self.calculateLoss(contexts[i], expSentences[i])
            
        return L
    
    def backPropTT(self, context, expSentence):
        
        # Total number of time steps equal to number of words in the sentence
        T = len(expSentence)
        
        # Performing forward propagation
        o, h = self.forwardProp(context, expSentence)
        
        # Defining gradient variables
        dLdin = np.zeros(self.w_in.shape)
        dLdhh = np.zeros(self.w_hh.shape)
        dLdoutH = np.zeros(self.w_outH.shape)
        dLdout = np.zeros(self.w_out.shape)
        
        # Calculating the difference between output and actual output
        delta_o = o.copy()
        delta_o[np.arange(T), expSentence] -= 1
        
        # Calculating gradients backwards through time
        for t in np.arange(T)[::-1]:
            #Output gradient is only dependent on time step t
            dLdout += np.outer(delta_o[t], h[t])
            # Initial delta calculation propagating gradients from output
            delta_t = self.w_out.T.dot(delta_o[t]) * (1 - (h[t] ** 2))
            # Backpropagation through time (for at most self.bptt_truncate steps)
            for bptt_step in np.arange(max(0, t-self.bptt_truncate), t+1)[::-1]:
                # print "Backpropagation step t=%d bptt step=%d " % (t, bptt_step)
                # Add to gradients at each previous step
                dLdhh += np.outer(delta_t, h[bptt_step-1])              
                dLdin += np.outer(delta_t, context)
                dLdoutH += np.outer(delta_t, o[bptt_step-1])
                # Update delta for next step dL/dz at t-1
                delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step-1] ** 2)
            """TODO review backprop implementation"""
            
        return dLdin, dLdhh, dLdoutH, dLdout
    
    def sgd_step(self, context, expSentence, learningRate):
        
        """ Performs a single stochastic gradient step"""
        
        # Calculating gradients
        dLdin, dLdhh, dLdoutH, dLdout = self.backPropTT(context, expSentence)
        
        # Updating parameters
        self.w_in -= learningRate * dLdin
        self.w_hh -= learningRate * dLdhh
        self.w_outH -= learningRate * dLdoutH
        self.w_out -= learningRate * dLdout
        
    def train_Decoder_With_SGD(self, X_train, Y_train, learningRate = 0.05, nepochs = 10):
        """TODO evaluate losses and update learning rate if required"""
        for epoch in range(nepochs):
            for i in range(len(Y_train)):
                self.sgd_step(X_train[i], Y_train[i], learningRate)
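
In [ ]:
"""
Usage sketch for the decoder above (toy sizes and a random context vector, both assumptions;
in the full model the context would come from the encoder). Word index 0 is assumed to be
the start token, matching forwardProp.
"""
np.random.seed(0)

toyVocabSize = 8      # assumed target vocabulary size
toyContextSize = 4    # assumed encoder context dimension

decoder = RNNlayer(inputSize=toyContextSize, outputSize=toyVocabSize, hiddenDim=6)

context = np.random.randn(toyContextSize)
expSent = [3, 4, 5, 1]            # hypothetical target indices ending with the end token

o, h = decoder.forwardProp(context, expSent)
print(o.shape, h.shape)           # (5, 8) and (5, 6): T+1 rows including the initial state
print(decoder.calculateLoss(context, expSent))

decoder.sgd_step(context, expSent, learningRate=0.05)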

In [ ]:
# Scratch: intended shapes of the decoder weights
# (placeholders; set vocabSize and embSize to real integers before running this cell)
vocabSize = None    # target vocabulary size
embSize = None      # embedding / context dimension

W_out = np.random.randn(vocabSize, embSize)
W_hh = np.random.randn(embSize, embSize)
W_in = np.random.randn(embSize,)

In [ ]:
# Scratch cell: checking the fancy-indexing and np.outer patterns used in backPropTT
a = np.array([[1, 2, 3], [4, 5, 6]])
a[np.arange(2), [1, 2]] -= 1
print(a[1].T)
print(np.outer(a[0], a[1].T))