# In [2]:
""" Implementing decoder"""
"""Imports"""
import numpy as np
from nltk import sent_tokenize, word_tokenize
"""Global definitions"""
# Special vocabulary tokens: sentence-start marker, sentence-end marker and
# (presumably) the stand-in for out-of-vocabulary words.
_start = "S_START"
_end = "S_END"
_unk = "UNK"
# In [ ]:
""" util definitions"""
def hyperbolic(net):
    """Return the element-wise hyperbolic tangent of ``net``."""
    activation = np.tanh(net)
    return activation
def relu(net):
    """Return ``net`` with every negative entry replaced by zero."""
    rectified = np.maximum(net, 0)
    return rectified
def softmax(net):
    """
    Return the softmax of ``net``, normalised over all of its elements.

    The maximum score is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps np.exp from overflowing to inf
    (and the quotient from becoming NaN) when the scores are large.

    net = scalar or array-like of scores
    returns an array of the same shape whose entries sum to 1
    """
    net = np.asarray(net, dtype=float)
    # np.max raises on an empty array; an empty input has an empty softmax.
    if net.size == 0:
        return net
    _exp = np.exp(net - np.max(net))
    return _exp / np.sum(_exp)
def predict(scores):
    """Return the index of the highest entry of ``scores`` (flattened)."""
    best = np.argmax(scores)
    return best
# In [ ]:
class RNNlayer:
    """
    RNN nodes for decoder
    hidden state at time step t of decoder is conditioned on hidden state at time step t-1,
    output at time step t-1 and context C from the encoder:

        h[t] = tanh(w_in . context + w_hh . h[t-1] + w_outH[:, prevWord])
        o[t] = softmax(w_out . h[t])

    prevWord is the argmax of o[t-1]; the first time step is fed the start token.
    """

    def __init__(self, inputSize, outputSize, bptt_truncate=5, hiddenDim=10, startIdx=0):
        """
        inputSize = dimensions of the context from encoder
        outputSize = vocabulary size
        bptt_truncate = truncate the number of time steps we calculate the gradient during backpropagation
        hiddenDim = size of the hidden unit in RNN
        startIdx = vocabulary index of the start token fed in at the first time step
                   (assumed to be 0 by default -- TODO confirm against the vocabulary builder)

        Raises ValueError when startIdx is not a valid vocabulary index.
        """
        if not 0 <= startIdx < outputSize:
            raise ValueError("startIdx %r is outside the vocabulary range [0, %d)"
                             % (startIdx, outputSize))
        self.inputSize = inputSize
        self.outputSize = outputSize
        self.hiddenDim = hiddenDim
        self.bptt_truncate = bptt_truncate
        self.startIdx = startIdx
        inBound = np.sqrt(1. / inputSize)
        hidBound = np.sqrt(1. / hiddenDim)
        self.w_in = np.random.uniform(-inBound, inBound, (hiddenDim, inputSize))
        self.w_hh = np.random.uniform(-hidBound, hidBound, (hiddenDim, hiddenDim))
        # One column per vocabulary word: the column of the previously
        # predicted word is added to the hidden pre-activation, so the matrix
        # must be (hiddenDim, outputSize).
        self.w_outH = np.random.uniform(-hidBound, hidBound, (hiddenDim, outputSize))
        self.w_out = np.random.uniform(-hidBound, hidBound, (outputSize, hiddenDim))

    def _forward(self, context, expSent):
        """
        Run the decoder and keep everything back-propagation needs.

        Returns (o, h, fedIdx):
        o = (T, outputSize) softmax outputs
        h = (T + 1, hiddenDim) hidden states; the extra last row is the zero
            initial state, so h[t-1] is valid for t = 0 as well
        fedIdx = length-T integer array, the word index fed back into each step
        """
        T = len(expSent)
        # Accept the context either as a flat vector or as a column vector.
        context = np.ravel(context)
        h = np.zeros((T + 1, self.hiddenDim))
        o = np.zeros((T, self.outputSize))
        fedIdx = np.zeros(T, dtype=int)
        # The context contribution is identical at every time step.
        contextTerm = self.w_in.dot(context)
        # The first step is conditioned on the start token.
        prevIdx = self.startIdx
        for t in range(T):
            fedIdx[t] = prevIdx
            h[t] = hyperbolic(contextTerm + self.w_hh.dot(h[t - 1]) + self.w_outH[:, prevIdx])
            o[t] = softmax(self.w_out.dot(h[t]))
            prevIdx = predict(o[t])
        return o, h, fedIdx

    def forwardProp(self, context, expSent):
        """
        context: calculated from encoder
        expSent: word indices in target language vocabulary (only its length,
                 the number of time steps, is used here)

        Returns (o, h): the outputs and hidden states of every time step, with
        shapes (T, outputSize) and (T, hiddenDim).
        """
        o, h, _ = self._forward(context, expSent)
        # Drop the extra initial-state row kept for back-propagation.
        return o, h[:len(expSent)]

    def calculateLoss(self, context, expSentence):
        """Return the summed cross-entropy loss of one expected sentence."""
        o, _ = self.forwardProp(context, expSentence)
        targets = np.asarray(expSentence, dtype=int)
        # Probability assigned to the expected word at every time step
        correctPred = o[np.arange(len(targets)), targets]
        return -1 * np.sum(np.log(correctPred))

    def calculateTotalLoss(self, contexts, expSentences):
        """Return the loss summed over all (context, expected sentence) pairs."""
        L = 0.0
        for context, expSentence in zip(contexts, expSentences):
            L += self.calculateLoss(context, expSentence)
        return L

    def backPropTT(self, context, expSentence):
        """
        Truncated back-propagation through time for one sentence.

        Returns (dLdin, dLdhh, dLdoutH, dLdout), the loss gradients with the
        same shapes as w_in, w_hh, w_outH and w_out. The argmax feedback is
        treated as a constant, so no gradient flows through the fed-back word.
        """
        targets = np.asarray(expSentence, dtype=int)
        # Total number of time steps equal to number of words in the sentence
        T = len(targets)
        context = np.ravel(context)
        # Performing forward propagation
        o, h, fedIdx = self._forward(context, targets)
        # Defining gradient variables
        dLdin = np.zeros(self.w_in.shape)
        dLdhh = np.zeros(self.w_hh.shape)
        dLdoutH = np.zeros(self.w_outH.shape)
        dLdout = np.zeros(self.w_out.shape)
        # Difference between output and actual output; computed on a copy so
        # the forward outputs are not modified.
        delta_o = o.copy()
        delta_o[np.arange(T), targets] -= 1
        # Calculating gradients backwards through time
        for t in range(T - 1, -1, -1):
            # Output gradient is only dependent on time step t
            dLdout += np.outer(delta_o[t], h[t])
            # Initial delta calculation propagating gradients from output
            delta_t = self.w_out.T.dot(delta_o[t]) * (1 - (h[t] ** 2))
            # Backpropagation through time (for at most self.bptt_truncate steps)
            for bptt_step in range(t, max(0, t - self.bptt_truncate) - 1, -1):
                # Add to gradients at each previous step; h[-1] is the zero
                # initial state when bptt_step is 0.
                dLdhh += np.outer(delta_t, h[bptt_step - 1])
                dLdin += np.outer(delta_t, context)
                # Only the column of the word fed into this step took part in
                # the forward pass, so only that column receives gradient.
                dLdoutH[:, fedIdx[bptt_step]] += delta_t
                # Update delta for next step dL/dz at t-1
                delta_t = self.w_hh.T.dot(delta_t) * (1 - h[bptt_step - 1] ** 2)
        return dLdin, dLdhh, dLdoutH, dLdout

    def sgd_step(self, context, expSentence, learningRate):
        """ Performs a single stochastic gradient step"""
        # Calculating gradients
        dLdin, dLdhh, dLdoutH, dLdout = self.backPropTT(context, expSentence)
        # Updating parameters
        self.w_in -= learningRate * dLdin
        self.w_hh -= learningRate * dLdhh
        self.w_outH -= learningRate * dLdoutH
        self.w_out -= learningRate * dLdout

    def train_Decoder_With_SGD(self, X_train, Y_train, learningRate=0.05, nepochs=10):
        """
        Run nepochs passes of per-sentence SGD over the training pairs.

        X_train = encoder contexts, one per sentence
        Y_train = expected sentences as word-index sequences
        TODO evaluate losses and update learning rate if required
        """
        for epoch in range(nepochs):
            for context, expSentence in zip(X_train, Y_train):
                self.sgd_step(context, expSentence, learningRate)
# In [ ]:
# Scratch cell: stand-alone random weight matrices for experimentation.
# vocabSize and embSize are placeholders that must be set to positive
# integers before the matrices can be sampled.
vocabSize = None
embSize = None
if vocabSize is None or embSize is None:
    # np.random.randn(None, None) raises TypeError, so sampling is skipped
    # until both sizes have been filled in.
    W_out = W_hh = W_in = None
else:
    W_out = np.random.randn(vocabSize, embSize)
    W_hh = np.random.randn(embSize, embSize)
    W_in = np.random.randn(embSize,)
# In [ ]:
# Scratch check of the indexing used in the loss and backprop code:
# integer-array indexing with in-place subtraction, and np.outer.
a = np.array([[1, 2, 3], [4, 5, 6]])
# Subtracts 1 from a[0, 1] and a[1, 2] only.
a[np.arange(2), [1, 2]] -= 1
# Single-argument print calls behave the same on Python 2 and Python 3.
print(a[1].T)
print(np.outer(a[0], a[1].T))