In [ ]:
""" Imports """
import re
from nltk.tokenize import word_tokenize, sent_tokenize
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

"""Global definitons"""
_start = 'S_START'
_end = 'S_END'

In [ ]:
""" util definitions"""

def hyperbolic(net):
    return np.tanh(net)

def relu(net):
    return np.maximum(0,net)

def softmax(net):
    _exp = np.exp(net)
    return _exp/np.sum(_exp)

def predict(scores):
    return np.argmax(scores)

In [ ]:
class WordItem:
    def __init__(self,word,count=0):
        self.word = word
        self.count = count

In [ ]:
""" Word preprocessing """
def dataset(_fi='/home/jazzycrazzy/PythonScripts/dataset.csv', _fo = 'testfile.txt'):
    file_in = open(_fi)

    words = [] #stores unique words encountered in the document as WordItem objects
    _dict = {} #temporary dictionary to maintain count of each word
    
    _dict['UNK'] = 0

    for l in file_in:
        #file_out.write(l+'\n')
        l = _start+' '+l+' '+_end
        split = word_tokenize(l.decode('utf-8'))
        for w in split:
            if len(w)==0:
                continue
            elif len(w) > 15: #if word's length is greater than 15 counting it as unknown
                _dict['UNK'] += 1
                continue
            if w not in _dict:
                _dict[w] = 1
            _dict[w] += 1
            
    _vocab = {} #dictionary with words as keys and values as indices of them in 'word' list
    _vocab['UNK'] = len(words)
    words.append(WordItem('UNK',_dict['UNK']))
    for k,v in _dict.iteritems():
        #if v > 9 and k != 'UNK':
        if k != 'UNK':
            _vocab[k] = len(words)
            words.append(WordItem(k,v))
        else:
            words[0].count += 1
    
    #cleaning up unnecessary memory
    del _dict
    file_in.close()
    #file_out.close()
    
    return _vocab, words

def UnigramTable(_vocab, words):
    """ Calculates probabilities based on count of each word present"""
    pow = 0.75
    totalFreqPow = 0.0
    unigramTable = {}
    
    l = [words[i].count**pow for i in range(len(_vocab))]
    totalFreqPow = np.sum(l)
    
    for i in range(len(_vocab)):
        unigramTable[i] = (words[i].count**pow)/totalFreqPow
    
    del l
    return unigramTable

def hotVector(wordIndex,vocabSize):
    """ Returns hot vector representation of a word """
    hVector = np.zeros(vocabSize)
    hVector[wordIndex-1] = 1
    return hVector

def softmax(net):
    """ calculates softmax score - target score normalized with noise scores and calculated as probability"""
    _exp = np.exp(net)
    return _exp/np.sum(_exp)

def sigmoid(net):
    """ Applies sigmoid logistic function on net """
    return 1.0/(1+np.exp(-net))

def randomIdx(k, vocabSize, current):
    """ Returns k indices from with unigram table randomly with respect to each word's probablity """
    global _unigramTable
    idxs = list(np.random.choice(vocabSize, k+1, False, p = _unigramTable.values()))
    if current in idxs:
        idxs.remove(current)
    else:
        del idxs[-1]
    return idxs
    
def softmaxCostGradient(net, target):
    prob = softmax(net)
    print(prob)
    
    
def negSamplingCostGradient(out, context, emb, vocabSize, learningRate, W_Output, k = 10):
    
    errorHidden = np.zeros(shape=(emb.size,1))
    
    actOut = sigmoid(out[context])
    negSamples = randomIdx(k, vocabSize, context)
    _negSamples = [-out[sample] for sample in negSamples]
    
    # error for context word
    e = -np.log(actOut) - np.sum(np.log(sigmoid(np.array(_negSamples))))
    
    """ calculating gradients for output vectors for both target and negative samples
    calculating hidden layer error for each context word """
    # Updating output weight vector for context word
    delta = actOut - 1
    errorHidden += delta * W_Output[:,context:context+1]
    W_Output[:,context:context+1] -= learningRate * np.reshape(delta * emb,(emb.size,1))
    
    # Updating output weight vectors for negative sampling
    for sample in negSamples:
        delta = sigmoid(out[sample])
        errorHidden += delta * W_Output[:,sample:sample+1]
        W_Output[:,sample:sample+1] -= learningRate * np.reshape(delta * emb,(emb.size,1))
    
    return errorHidden,e    
    
def skipgram(target,contextWords, vocabSize, learningRate, W_Embedding, W_Output):
    
    """
    will be called on each window with
    target: Target word index
    contextWords: Arrray of integers representing context words
    """
    loss = 0
    k = 10 #Number of negative samples
    emb = W_Embedding[target]
    out = np.matmul(emb,W_Output) # [1 x EmbSize].[EmbSize x VocabSize]
    #print out.shape
    _predicted = []
    EH = np.zeros(shape=(emb.size,1))
    for context in contextWords:
        #predicted = hotVector(context, vocabSize)
        #softmaxCostGradient(out,context)
        _EH,_e = negSamplingCostGradient(out, context, emb, vocabSize, learningRate, W_Output, k)
        EH += _EH
        loss += _e
        #EH += sof
        
    #updating hidden layer input vector embedding
    W_Embedding[target] -= learningRate * EH.T[0]
    return loss

In [ ]:
class RNNEncoder:
    
    """ 
    RNN nodes for encoder
    
    hidden state at time step t of encoder is conditioned on hidden state at time step t-1,
    and input at time step t or t word of a sentence
    """
    
    def __init__(self, inputSize, outputSize, bptt_truncate = 5, hiddenDim = 10):
        """
        inputSize = dimensions of the context from encoder
        outputSize = vocabulary size
        hiddenDim = size of the hidden unit in RNN
        bptt_truncate = truncate the number of time steps we calculate the gradient during backpropagation
        """
        self.inputSize = inputSize
        self.outputSize = outputSize
        self.hiddenDim = hiddenDim
        self. bptt_truncate = bptt_truncate
        
        self.w_in = np.random.uniform(-np.sqrt(1./inputSize), np.sqrt(1./inputSize),(hiddenDim, inputSize))
        self.w_hh = np.random.uniform(-np.sqrt(1./hiddenDim), np.sqrt(1./hiddenDim),(hiddenDim, hiddenDim))
        self.w_out = np.random.uniform(-np.sqrt(1./hiddenDim), np.sqrt(1./hiddenDim),(outputSize, hiddenDim))
        
    def forwardProp(self, inpSent, expSent):
        """
        inpSent: input word indices
        expSent: word indices in target language vocabulary
        """
        
        #Total number of time steps equal to number of words in the sentence
        T = len(inpSent)
        
        #Saving all hidden states and outputs during forward propagation
        _h = np.zeros((T,self.hiddenDim))
        _o = np.zeros((T,self.outputSize))
        
        #For each time step calculating hidden state and output
        for t in np.arange(T):
            outIdx = predict(_o[t-1])
            _h[t] = hyperbolic(self.w_in.dot(inpSent) + self.w_hh.dot(_h[t-1]))
            _o[t] = softmax(self.w_out.dot(_h[t]))
            
        return _o, _h
    
    def calculateLoss(self, inpSentence, expSentence):
        
        #For each sentence
        o, h = self.forwardProp(inpSentence, expSentence)
        #Loss for each sentence
        l = -1 * np.sum(np.log(correctPred))
        return l
    
    def calculateTotalLoss(self, inpSentences, expSentences):
        
        L = 0.0
        for i in len(inpSentences):
            L += self.calculateLoss(inpSentences[i], expSentences[i])
            
        return L
    
    def backPropTT(self, inpSentence, expSentence):
        
        # Total number of time steps equal to number of words in the sentence
        T = len(expSentence)
        
        # Performing forward propagation
        o, h = self.forwardProp(inpSentence, expSentence)
        
        # Defining gradient variables
        dLdin = np.zeros(self.w_in.shape)
        dLdhh = np.zeros(self.w_hh.shape)
        dLdout = np.zeros(self.w_out.shape)
        
        # Calculating the difference between output and actual output
        delta_o = o
        delta_o[np.arange(T), expSentence] -= 1
        
        # Calculating gradients backwards through time
        for t in np.arange(T)[::-1]:
            #Output gradient is only dependent on time step t
            dLdout += np.outer(delta_o[t], h[t])
            # Initial delta calculation propagating gradients from output
            delta_t = self.w_out.T.dot(delta_o[t]) * (1 - (h[t] ** 2))
            # Backpropagation through time (for at most self.bptt_truncate steps)
            for bptt_step in np.arange(max(0, t-self.bptt_truncate), t+1)[::-1]:
                # print "Backpropagation step t=%d bptt step=%d " % (t, bptt_step)
                # Add to gradients at each previous step
                dLdhh += np.outer(delta_t, h[bptt_step-1])              
                dLdin += np.outer(delta_t, context)
                # Update delta for next step dL/dz at t-1
                delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step-1] ** 2)
            """TODO review backprop implementation"""
            
        return dLdin, dLdhh, dLdout
    
    def sgd_step(self, inpSentence, expSentence, learningRate):
        
        """ Performs a single stochastic gradient step"""
        
        # Calculating gradients
        dLdin, dLdhh, dLdout = self.backPropTT(inpSentence, expSentence)
        
        # Updating parameters
        self.w_in -= learningRate * dLdin
        self.w_hh -= learningRate * dLdhh
        self.w_out -= learningRate * dLdout
        
    def train_Decoder_With_SGD(self, X_train, Y_train, learningRate = 0.05, nepochs = 10):
        """TODO evaluate losses and update learning rate if required"""
        for epoch in range(nepochs):
            for i in range(len(Y_train)):
                self.sgd_step(X_train[i], Y_train[i], learningRate)

In [ ]:
class RNNDecoder:
    
    """ 
    RNN nodes for decoder
    
    hidden state at time step t of decoder is conditioned on hidden state at time step t-1,
    output at time step t-1 and context C from the encoder
    """
    
    def __init__(self, inputSize, outputSize, embFr, startFrIdx, bptt_truncate = 5, hiddenDim = 10):
        """
        inputSize = dimensions of the context from encoder
        outputSize = vocabulary size
        embFr = French embeddings
        hiddenDim = size of the hidden unit in RNN
        bptt_truncate = truncate the number of time steps we calculate the gradient during backpropagation
        """
        self.inputSize = inputSize
        self.outputSize = outputSize
        self.hiddenDim = hiddenDim
        self.embFr = embFr
        self.startFrIdx = startFrIdx
        self. bptt_truncate = bptt_truncate
        
        self.w_in = np.random.uniform(-np.sqrt(1./inputSize), np.sqrt(1./inputSize),(hiddenDim, inputSize))
        self.w_hh = np.random.uniform(-np.sqrt(1./hiddenDim), np.sqrt(1./hiddenDim),(hiddenDim, hiddenDim))
        self.w_outH = np.random.uniform(-np.sqrt(1./hiddenDim), np.sqrt(1./hiddenDim),(inputSize, hiddenDim))
        self.w_out = np.random.uniform(-np.sqrt(1./hiddenDim), np.sqrt(1./hiddenDim),(outputSize, hiddenDim))
        
    def forwardProp(self, context, expSent):
        """
        context: calculated from encoder
        expSent: word indices in target language vocabulary
        """
        
        #Total number of time steps equal to number of words in the sentence
        T = len(expSent)
        
        #Saving all hidden states and outputs during forward propagation
        _h = np.zeros((T,self.hiddenDim))
        _o = np.zeros((T,self.outputSize))
        
        #Initializing initial output as the start token
        _o[-1,startFrIdx] = 1
        
        #For each time step calculating hidden state and output
        for t in np.arange(T):
            outIdx = predict(_o[t-1])
            _h[t] = hyperbolic(self.w_in.dot(context) + self.w_hh.dot(_h[t-1]) + self.w_outH.dot(embFr[outIdx]))
            _o[t] = softmax(self.w_out.dot(_h[t]))
            
        return _o, _h
    
    def calculateLoss(self, context, expSentence):
        
        #For each sentence
        o, h = self.forwardProp(context, expSentence)
        #TODO recheck this part
        correctPred = o[np.arange(len(expSentence)), expSentence]
        #Loss for each sentence
        l = -1 * np.sum(np.log(correctPred))
        return l
    
    def calculateTotalLoss(self, contexts, expSentences):
        
        L = 0.0
        for i in len(contexts):
            L += self.calculateLoss(context[i], expSentences[i])
            
        return L
    
    def backPropTT(self, context, expSentence):
        
        # Total number of time steps equal to number of words in the sentence
        T = len(expSentence)
        
        # Performing forward propagation
        o, h = self.forwardProp(context, expSentence)
        
        # Defining gradient variables
        dLdin = np.zeros(self.w_in.shape)
        dLdhh = np.zeros(self.w_hh.shape)
        dLdoutH = np.zeros(self.w_outH.shape)
        dLdout = np.zeros(self.w_out.shape)
        
        # Calculating the difference between output and actual output
        delta_o = o
        delta_o[np.arange(T), expSentence] -= 1
        
        # Calculating gradients backwards through time
        for t in np.arange(T)[::-1]:
            #Output gradient is only dependent on time step t
            dLdout += np.outer(delta_o[t], h[t])
            # Initial delta calculation propagating gradients from output
            delta_t = self.w_out.T.dot(delta_o[t]) * (1 - (h[t] ** 2))
            # Backpropagation through time (for at most self.bptt_truncate steps)
            for bptt_step in np.arange(max(0, t-self.bptt_truncate), t+1)[::-1]:
                # print "Backpropagation step t=%d bptt step=%d " % (t, bptt_step)
                # Add to gradients at each previous step
                dLdhh += np.outer(delta_t, h[bptt_step-1])              
                dLdin += np.outer(delta_t, context)
                dLdoutH += np.outer(delta_t, embFr[predict(_o[t-1])])
                # Update delta for next step dL/dz at t-1
                delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step-1] ** 2)
            """TODO review backprop implementation"""
            
        return dLdin, dLdhh, dLdoutH, dLdout
    
    def sgd_step(self, context, expSentence, learningRate):
        
        """ Performs a single stochastic gradient step"""
        
        # Calculating gradients
        dLdin, dLdhh, dLdoutH, dLdout = self.backPropTT(context, expSentence)
        
        # Updating parameters
        self.w_in -= learningRate * dLdin
        self.w_hh -= learningRate * dLdhh
        self.w_outH -= learningRate * dLdoutH
        self.w_out -= learningRate * dLdout
        
    def train_Decoder_With_SGD(self, X_train, Y_train, learningRate = 0.05, nepochs = 10):
        """TODO evaluate losses and update learning rate if required"""
        for epoch in range(nepochs):
            for i in range(len(Y_train)):
                self.sgd_step(X_train[i], Y_train[i], learningRate)

In [ ]: