In [102]:
""" Imports """
import re
from nltk.tokenize import word_tokenize, sent_tokenize
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
"""Global definitons"""
_start = 'S_START'
_end = 'S_END'
In [103]:
""" util definitions"""
def hyperbolic(net):
return np.tanh(net)
def relu(net):
return np.maximum(0,net)
def softmax(net):
_exp = np.exp(net)
return _exp/np.sum(_exp)
def predict(scores):
return np.argmax(scores)
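In [ ]:
""" A minimal sanity check of the utility functions above, on made-up values
(the numbers are arbitrary and only verify shapes and behaviour). """
_net = np.array([2.0, 1.0, 0.1])
print 'tanh   ', hyperbolic(_net)
print 'relu   ', relu(np.array([-1.0, 0.0, 3.0]))
_p = softmax(_net)
print 'softmax', _p, 'sums to', np.sum(_p)
print 'predict', predict(_p)  # index of the largest probability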
In [104]:
class WordItem:
    def __init__(self, word, count=0):
        self.word = word
        self.count = count
In [105]:
class RNNlayer:
    """
    RNN nodes for the decoder.
    The hidden state at time step t of the decoder is conditioned on the hidden state
    at time step t-1, the output at time step t-1 and the input at time step t
    (the output-at-t-1 term is currently disabled; see the commented-out w_outH).
    """
    def __init__(self, inputSize, outputSize, bptt_truncate=5, hiddenDim=10):
        """
        inputSize = dimension of the input embedding
        outputSize = vocabulary size
        hiddenDim = size of the hidden unit in the RNN
        bptt_truncate = maximum number of time steps the gradient is propagated back
        """
        self.inputSize = inputSize
        self.outputSize = outputSize
        self.hiddenDim = hiddenDim
        self.bptt_truncate = bptt_truncate
        self.w_in = np.random.uniform(-np.sqrt(1./inputSize), np.sqrt(1./inputSize), (hiddenDim, inputSize))
        self.w_hh = np.random.uniform(-np.sqrt(1./hiddenDim), np.sqrt(1./hiddenDim), (hiddenDim, hiddenDim))
        #self.w_outH = np.random.uniform(-np.sqrt(1./hiddenDim), np.sqrt(1./hiddenDim), (outputSize, hiddenDim))
        self.w_out = np.random.uniform(-np.sqrt(1./hiddenDim), np.sqrt(1./hiddenDim), (outputSize, hiddenDim))
    def forwardProp(self, inSentence, expSent):
        """
        inSentence: embedding vectors of the source sentence, one per time step
        expSent: word indices in the target language vocabulary
        Assumes inSentence has at least as many time steps as expSent.
        """
        # Total number of time steps equals the number of words in the target sentence
        T = len(expSent)
        # Saving all hidden states and outputs during forward propagation.
        # _h gets one extra row so that _h[-1] stays all-zero and acts as the initial hidden state.
        _h = np.zeros((T + 1, self.hiddenDim))
        _o = np.zeros((T, self.outputSize))
        # For each time step calculate the hidden state and output
        for t in np.arange(T):
            #outIdx = predict(_o[t-1])
            _h[t] = hyperbolic(self.w_in.dot(inSentence[t]) + self.w_hh.dot(_h[t-1]))  #+ self.w_outH[:,outIdx:outIdx+1])
            _o[t] = softmax(self.w_out.dot(_h[t]))
        return _o, _h
    def calculateLoss(self, inSentence, expSentence):
        # Cross-entropy loss for one sentence
        o, h = self.forwardProp(inSentence, expSentence)
        # Probability assigned to the correct word at each time step
        correctPred = o[np.arange(len(expSentence)), expSentence]
        # Loss for the sentence
        l = -1 * np.sum(np.log(correctPred))
        return l

    def calculateTotalLoss(self, inSentences, expSentences):
        L = 0.0
        for i in range(len(inSentences)):
            L += self.calculateLoss(inSentences[i], expSentences[i])
        return L
    def backPropTT(self, inSentence, expSentence):
        # Total number of time steps equals the number of words in the target sentence
        T = len(expSentence)
        # Performing forward propagation
        o, h = self.forwardProp(inSentence, expSentence)
        # Defining gradient accumulators
        dLdin = np.zeros(self.w_in.shape)
        dLdhh = np.zeros(self.w_hh.shape)
        #dLdoutH = np.zeros(self.w_outH.shape)
        dLdout = np.zeros(self.w_out.shape)
        # Difference between the predicted distribution and the one-hot target
        delta_o = o
        delta_o[np.arange(T), expSentence] -= 1
        # Calculating gradients backwards through time
        for t in np.arange(T)[::-1]:
            # The output gradient depends only on time step t
            dLdout += np.outer(delta_o[t], h[t])
            # Initial delta: propagate the output error through the tanh nonlinearity
            delta_t = self.w_out.T.dot(delta_o[t]) * (1 - (h[t] ** 2))
            # Backpropagation through time (for at most self.bptt_truncate steps)
            for bptt_step in np.arange(max(0, t - self.bptt_truncate), t + 1)[::-1]:
                # Add to the gradients at each previous step
                dLdhh += np.outer(delta_t, h[bptt_step - 1])
                dLdin += np.outer(delta_t, inSentence[bptt_step])
                #dLdoutH += np.outer(delta_t, o[bptt_step-1])
                # Update delta for the next step: dL/dz at bptt_step-1
                delta_t = self.w_hh.T.dot(delta_t) * (1 - h[bptt_step - 1] ** 2)
        #return dLdin, dLdhh, dLdoutH, dLdout
        return dLdin, dLdhh, dLdout
    def sgd_step(self, inSentence, expSentence, learningRate):
        """ Performs a single stochastic gradient step """
        # Calculating gradients
        #dLdin, dLdhh, dLdoutH, dLdout = self.backPropTT(inSentence, expSentence)
        dLdin, dLdhh, dLdout = self.backPropTT(inSentence, expSentence)
        # Updating parameters
        self.w_in -= learningRate * dLdin
        self.w_hh -= learningRate * dLdhh
        #self.w_outH -= learningRate * dLdoutH
        self.w_out -= learningRate * dLdout
    def train_Decoder_With_SGD(self, X_train, Y_train, learningRate=0.05, nepochs=20):
        """TODO evaluate losses and update the learning rate if required"""
        for epoch in range(nepochs):
            for i in range(len(Y_train)):
                print 'epoch', epoch, 'sentence', i
                self.sgd_step(X_train[i], Y_train[i], learningRate)
        print 'W_in ', self.w_in
        print 'W_hh ', self.w_hh
        print 'W_out ', self.w_out
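In [ ]:
""" A quick, hypothetical exercise of RNNlayer on toy data (3 time steps, 10-dimensional
input embeddings, a 5-word target vocabulary) just to check that forwardProp, the loss
and one SGD step run; the values have no meaning. """
np.random.seed(0)
toy_rnn = RNNlayer(inputSize=10, outputSize=5)
toy_in = np.random.randn(3, 10)   # one input embedding per time step
toy_exp = [1, 3, 0]               # target word indices
toy_o, toy_h = toy_rnn.forwardProp(toy_in, toy_exp)
print 'output shape', toy_o.shape                         # (3, 5): one distribution per step
print 'loss before ', toy_rnn.calculateLoss(toy_in, toy_exp)
toy_rnn.sgd_step(toy_in, toy_exp, learningRate=0.05)
print 'loss after  ', toy_rnn.calculateLoss(toy_in, toy_exp)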
In [ ]:
""" Word preprocessing """
def dataset(_fi='/home/jazzycrazzy/PythonScripts/dataset.csv', _fo = 'testfile.txt'):
file_in = open(_fi)
#file_out = open(_fo,'wb')
words = [] #stores unique words encountered in the document as WordItem objects
_dict = {} #temporary dictionary to maintain count of each word
_dict['UNK'] = 0
for l in file_in:
#file_out.write(l+'\n')
l = _start+' '+l+' '+_end
split = word_tokenize(l.decode('utf-8'))
for w in split:
if len(w)==0:
continue
elif len(w) > 15: #if word's length is greater than 15 counting it as unknown
_dict['UNK'] += 1
continue
if w not in _dict:
_dict[w] = 1
_dict[w] += 1
_vocab = {} #dictionary with words as keys and values as indices of them in 'word' list
_vocab['UNK'] = len(words)
words.append(WordItem('UNK',_dict['UNK']))
for k,v in _dict.iteritems():
if k != 'UNK':
_vocab[k] = len(words)
words.append(WordItem(k,v))
else:
words[0].count += 1
#cleaning up unnecessary memory
del _dict
file_in.close()
#file_out.close()
return _vocab, words
def UnigramTable(_vocab, words):
    """ Builds a sampling distribution from the count of each word, raised to the 3/4 power """
    _pow = 0.75
    unigramTable = {}
    l = [words[i].count ** _pow for i in range(len(_vocab))]
    totalFreqPow = np.sum(l)
    for i in range(len(_vocab)):
        unigramTable[i] = (words[i].count ** _pow) / totalFreqPow
    del l
    return unigramTable
def hotVector(wordIndex, vocabSize):
    """ Returns the one-hot vector representation of a word (vocabulary indices are 0-based) """
    hVector = np.zeros(vocabSize)
    hVector[wordIndex] = 1
    return hVector

def sigmoid(net):
    """ Applies the sigmoid logistic function to net """
    return 1.0 / (1 + np.exp(-net))
def randomIdx(k, vocabSize, current):
    """ Draws k indices from the unigram table at random, according to each word's probability """
    global _unigramTable
    # Build the probability vector in index order (dict.values() does not guarantee order)
    probs = [_unigramTable[i] for i in range(vocabSize)]
    idxs = list(np.random.choice(vocabSize, k + 1, False, p=probs))
    if current in idxs:
        idxs.remove(current)
    else:
        del idxs[-1]
    return idxs
def softmaxCostGradient(net, target):
    """ Unused placeholder for a full-softmax cost/gradient (negative sampling is used below instead) """
    prob = softmax(net)
    return prob
def negSamplingCostGradient(out, context, emb, vocabSize, learningRate, W_Output, k=10):
    errorHidden = np.zeros(shape=(emb.size, 1))
    actOut = sigmoid(out[context])
    negSamples = randomIdx(k, vocabSize, context)
    _negSamples = [out[sample] for sample in negSamples]
    # Negative-sampling cost for this context word (computed for reference, not returned)
    e = -np.log(actOut) - np.sum(np.log(sigmoid(np.negative(_negSamples))))
    """ Calculating gradients of the output vectors for both the context word and the negative samples,
    and accumulating the hidden-layer error for each context word """
    delta = actOut - 1
    errorHidden += delta * W_Output[:, context:context+1]
    W_Output[:, context:context+1] -= learningRate * np.reshape(delta * emb, (emb.size, 1))
    for sample in negSamples:
        delta = sigmoid(out[sample])
        errorHidden += delta * W_Output[:, sample:sample+1]
        W_Output[:, sample:sample+1] -= learningRate * np.reshape(delta * emb, (emb.size, 1))
    return errorHidden

def skipgram(target, contextWords, vocabSize, learningRate, W_Embedding, W_Output):
    """
    Called on each window with
    target: target word index
    contextWords: list of integers representing the context word indices
    """
    k = 10  # Number of negative samples
    emb = W_Embedding[target]
    out = np.matmul(emb, W_Output)  # [1 x EmbSize].[EmbSize x VocabSize]
    EH = np.zeros(shape=(emb.size, 1))
    for context in contextWords:
        EH += negSamplingCostGradient(out, int(context), emb, vocabSize, learningRate, W_Output, k)
    # Updating the hidden-layer (input) embedding of the target word
    W_Embedding[target] -= learningRate * EH.T[0]
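In [ ]:
""" A toy illustration (hypothetical three-word vocabulary) of how UnigramTable turns raw
counts into the count**0.75 sampling distribution used to draw negative samples. """
_toy_words = [WordItem('UNK', 1), WordItem('the', 20), WordItem('cat', 5)]
_toy_vocab = {'UNK': 0, 'the': 1, 'cat': 2}
print UnigramTable(_toy_vocab, _toy_words)   # probabilities proportional to count**0.75, summing to 1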
In [ ]:
""" Creates word embeddings in vector space representation """
""" Feedforward Neural Net Language model """
#Input layer
#Projection layer
#Hidden layer
#Output layer
#Initialization
fin='/Users/preethikapachaiyappa/Documents/MachineLearning/Data/English-small.txt'#/home/jazzycrazzy/PythonScripts/dataset.csv'
fin1='/Users/preethikapachaiyappa/Documents/MachineLearning/Data/French-small.txt'
fout = 'testfile.txt'
fout1 = 'testfile1.txt'
_vocab, words = dataset(fin, fout)
_vocab_f, words_f = dataset(fin1, fout)
_unigramTable = UnigramTable(_vocab, words)
learningRate = 0.2
vocabSize = len(words)
vocabSize_f = len(words_f)
emb_size = 10
win_size = 2
target = None
contextWords = []
print _vocab
print _vocab_f
# No need of hidden layer since when the embedding matrix is multiplied with hot vector
#it essentially gives that embedding row
W_Embedding = np.random.randn(vocabSize,emb_size) #Embedding matrix
W_Output = np.random.randn(emb_size,vocabSize) #Outputlayer weight matrix Emb_size x Vocab
fileIn = open(fin)
for l in fileIn:
l = _start+' '+l+' '+_end
tokens = word_tokenize(l)
print 'tokens',tokens
for token in tokens:
if token in _vocab:
target = _vocab[token]
trgtIdx = tokens.index(token)
cntxtIdxs = range(trgtIdx-win_size, trgtIdx+win_size+1)
cntxtIdxs.remove(trgtIdx)
for idx in cntxtIdxs:
if idx >-1 and idx < len(tokens) and tokens[idx] in _vocab:
contextWords = np.append(contextWords, _vocab[tokens[idx]])
else:
contextWords = np.append(contextWords, _vocab['UNK'])
skipgram(target, contextWords, vocabSize, learningRate, W_Embedding, W_Output)
print W_Embedding
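In [ ]:
""" A rough sanity check on the learned embeddings (not part of training): cosine similarity
between two arbitrary rows of W_Embedding. With enough data and epochs, words that share
contexts should score higher than unrelated ones. """
def cosineSim(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
print 'similarity of embedding rows 1 and 2:', cosineSim(W_Embedding[1], W_Embedding[2])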
In [ ]:
inSentence = []
expSentence = []
fileIn0 = open(fin)
for l in fileIn0:
    l = _start + ' ' + l + ' ' + _end
    tokens = word_tokenize(l)
    inSent = []
    for token in tokens:
        # Fall back to UNK for tokens that were not added to the vocabulary
        target = _vocab.get(token, _vocab['UNK'])
        vec = W_Embedding[target]
        inSent.append(vec.tolist())
    inSentence.append(inSent)
fileIn0.close()
fileIn1 = open(fin1)
for l in fileIn1:
    l = _start + ' ' + l + ' ' + _end
    tokens = word_tokenize(l.decode('utf-8'))
    expSent = []
    for token in tokens:
        target = _vocab_f.get(token, _vocab_f['UNK'])
        expSent.append(target)
    expSentence.append(expSent)
fileIn1.close()
#print inSentence
#print expSentence
a = RNNlayer(emb_size, vocabSize_f)
a.train_Decoder_With_SGD(inSentence, expSentence)
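In [ ]:
""" A minimal sketch of reading the decoder output back into French tokens: run forwardProp
on the first training pair and take the argmax word at each time step. idx2word_f is a
hypothetical reverse lookup built from _vocab_f; like the training loop itself, this assumes
the source sentence has at least as many tokens as the target one. """
idx2word_f = {v: k for k, v in _vocab_f.iteritems()}
o_dec, h_dec = a.forwardProp(inSentence[0], expSentence[0])
predicted = [idx2word_f[predict(o_dec[t])] for t in range(len(expSentence[0]))]
print 'predicted tokens:', predicted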