In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)


Out[1]:
<torch._C.Generator at 0x10c191d80>

In [40]:
lstm = nn.LSTM(3, 3)  # Input dim is 3, output dim is 3
inputs = [autograd.Variable(torch.randn((1, 3))) for _ in range(5)]  # make a sequence of length 5


# initialize the hidden state.
hidden = (autograd.Variable(torch.randn(1, 1, 3)),
          autograd.Variable(torch.randn((1, 1, 3))))
for i in inputs:
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1, 1, -1), hidden)

# Alternatively, we can do the entire sequence all at once.
# The first value returned by the LSTM is all of the hidden states throughout
# the sequence; the second is just the most recent hidden state
# (compare the last slice of "out" with "hidden" below: they are the same).
# The reason for this is that:
# "out" gives you access to all hidden states in the sequence, while
# "hidden" lets you continue the sequence and backpropagate later,
# by passing it as an argument to the lstm at a later time.
# Add the extra 2nd (batch) dimension.
inputs = torch.cat(inputs).view(len(inputs), 1, -1)

hidden = (autograd.Variable(torch.randn(1, 1, 3)), autograd.Variable(torch.randn((1, 1, 3))))  # clean out hidden state

out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)


Variable containing:
(0 ,.,.) = 
  0.2410  0.0926  0.0145

(1 ,.,.) = 
  0.1168 -0.0040  0.2024

(2 ,.,.) = 
  0.0572 -0.0214  0.2348

(3 ,.,.) = 
  0.0182 -0.1506  0.2259

(4 ,.,.) = 
  0.0220  0.0112  0.3491
[torch.FloatTensor of size 5x1x3]

(Variable containing:
(0 ,.,.) = 
  0.0220  0.0112  0.3491
[torch.FloatTensor of size 1x1x3]
, Variable containing:
(0 ,.,.) = 
  0.0431  0.0247  0.7800
[torch.FloatTensor of size 1x1x3]
)
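
As a quick sanity check (not part of the original run), the last timestep of "out" should equal "hidden[0]", since "hidden" holds the most recent hidden state:


In [ ]:
# Compare the last slice of "out" with the hidden state h_n; they should match exactly.
print(torch.equal(out[-1].data, hidden[0][0].data))  # expected: True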

In [10]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)


training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6


{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}
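
prepare_sequence just maps each token to its index and wraps the result in a Variable; for example (illustrative only):


In [ ]:
# Each word is replaced by its index in word_to_ix.
print(prepare_sequence("The dog ate the apple".split(), word_to_ix))  # indices 0, 1, 2, 3, 4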

In [41]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()
        
    def init_hidden(self):
        # Before we've done anything, we don't have any hidden state.
        # Refer to the PyTorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores
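
A quick shape check of the forward pass (demo_model and demo_sentence are throwaway names, not from the original notebook):


In [ ]:
demo_model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
demo_sentence = prepare_sequence(training_data[0][0], word_to_ix)
# embeds: 5 x EMBEDDING_DIM, viewed as 5 x 1 x EMBEDDING_DIM for the LSTM;
# lstm_out: 5 x 1 x HIDDEN_DIM, viewed as 5 x HIDDEN_DIM for the linear layer;
# tag_scores: 5 x 3, one row of log-probabilities per word.
print(demo_model(demo_sentence).size())  # expected: torch.Size([5, 3])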

In [42]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(params=model.parameters(), lr=0.5)

print(model(prepare_sequence(training_data[0][0], word_to_ix)))
for e in range(300):
    for seq, tags in training_data:
        X = prepare_sequence(seq, word_to_ix)
        y_gold = prepare_sequence(tags, tag_to_ix)
        model.zero_grad()
        model.hidden = model.init_hidden()
        
        y_pred = model(X)
        loss = loss_function(y_pred, y_gold)
        
        loss.backward()
        optimizer.step()

print(model(prepare_sequence(training_data[0][0], word_to_ix)))


Variable containing:
-0.9910 -1.1752 -1.1393
-1.0081 -1.1000 -1.1965
-1.0978 -1.1130 -1.0853
-0.9537 -1.1596 -1.2003
-0.9241 -1.2080 -1.1898
[torch.FloatTensor of size 5x3]

Variable containing:
-0.0604 -4.6273 -3.0198
-8.1220 -0.0021 -6.2955
-5.8761 -7.2554 -0.0035
-0.0062 -6.4301 -5.3880
-5.4609 -0.0044 -8.7629
[torch.FloatTensor of size 5x3]
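
To read these log-probabilities as tags (a small decoding sketch, not in the original notebook), take the argmax of each row and map it back through tag_to_ix:


In [ ]:
ix_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}
scores = model(prepare_sequence(training_data[0][0], word_to_ix))
_, best = scores.data.max(dim=1)  # index of the largest log-probability in each row
print([ix_to_tag[i] for i in best.view(-1).tolist()])  # expected: DET, NN, V, DET, NN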

Character-Level Embeddings for the LSTM Sequence Tagger


In [130]:
class Tagger2(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, word2ix, char2ix, tagset_size):
        super(Tagger2, self).__init__()
        self.hidden_dim = hidden_dim
        self.char2ix = char2ix
        self.word2ix = word2ix
        vocab_size, alphabet_size = len(word2ix), len(char2ix)
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.char_embeddings = nn.Embedding(alphabet_size, embedding_dim)
        self.char_lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.char_hidden = self.init_hidden() # Hidden state for char LSTM

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim. 
        # we concatenate character and word embeddings
        self.lstm = nn.LSTM(embedding_dim + hidden_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

        
    def init_hidden(self):
        # Before we've done anything, we don't have any hidden state.
        # Refer to the PyTorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, sentence):
        word_reprs = []
        for word in sentence:
            word_embed = self.word_embeddings(autograd.Variable(torch.LongTensor([self.word2ix[word]])))
            self.char_hidden = self.init_hidden()
            ch_ixs = [self.char2ix[ch] for ch in word]
            ch_seq = autograd.Variable(torch.LongTensor(ch_ixs))
            ch_embeds = self.char_embeddings(ch_seq)
            ch_repr, self.char_hidden = self.char_lstm(ch_embeds.view(len(word), 1, -1), self.char_hidden)
            word_repr = torch.cat((word_embed, ch_repr[-1]), dim=1)  # word embedding + char LSTM output at the last timestep
            word_reprs.append(word_repr)
        embeds = torch.cat(word_reprs)
        #print(len(sentence), embeds.size())
        lstm_out, self.hidden = self.lstm(embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [122]:
uniq_chars = set(ch for seq, _ in training_data for word in seq for ch in word)  # nested generators read left to right
char_to_ix = {ch: ix for ix, ch in enumerate(sorted(uniq_chars))}
print(len(char_to_ix), char_to_ix)


16 {'E': 0, 'T': 1, 'a': 2, 'b': 3, 'd': 4, 'e': 5, 'g': 6, 'h': 7, 'k': 8, 'l': 9, 'o': 10, 'p': 11, 'r': 12, 't': 13, 'v': 14, 'y': 15}
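
Inside Tagger2.forward, each word's representation is its word embedding concatenated with the char LSTM's output at the last timestep. A standalone sketch of that step (demo_tagger is a throwaway name):


In [ ]:
demo_tagger = Tagger2(EMBEDDING_DIM, HIDDEN_DIM, word_to_ix, char_to_ix, len(tag_to_ix))
word = "apple"
word_embed = demo_tagger.word_embeddings(autograd.Variable(torch.LongTensor([word_to_ix[word]])))
ch_seq = autograd.Variable(torch.LongTensor([char_to_ix[ch] for ch in word]))
ch_out, _ = demo_tagger.char_lstm(demo_tagger.char_embeddings(ch_seq).view(len(word), 1, -1),
                                  demo_tagger.init_hidden())
# Concatenate a 1 x EMBEDDING_DIM word embedding with a 1 x HIDDEN_DIM char summary.
print(torch.cat((word_embed, ch_out[-1]), dim=1).size())  # expected: torch.Size([1, 12])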

In [131]:
tagger = Tagger2(EMBEDDING_DIM, HIDDEN_DIM, word_to_ix, char_to_ix, len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(params=tagger.parameters(), lr=0.5)

print(tagger(training_data[0][0]))
for e in range(300):
    for seq, tags in training_data:
        y_gold = prepare_sequence(tags, tag_to_ix)
        tagger.zero_grad()
        tagger.hidden = tagger.init_hidden()
        
        y_pred = tagger(seq)
        loss = loss_function(y_pred, y_gold)
        
        loss.backward()
        optimizer.step()

print(tagger(training_data[0][0]))


Variable containing:
-1.4640 -0.9649 -0.9476
-1.5087 -0.9130 -0.9742
-1.4906 -0.9598 -0.9370
-1.5099 -0.9626 -0.9234
-1.5170 -1.0029 -0.8823
[torch.FloatTensor of size 5x3]

Variable containing:
-0.0436 -3.2314 -5.7466
-7.7805 -0.0017 -6.6383
-7.6889 -6.2214 -0.0024
-0.0042 -5.8647 -6.6115
-6.2718 -0.0036 -6.3794
[torch.FloatTensor of size 5x3]

Column indices correspond to [DET, NN, V], i.e. the order defined in tag_to_ix.


Scrap Space


In [84]:
V = 10
E = 5
H = 6
e = nn.Embedding(V, E)
l = nn.LSTM(E, H)

In [88]:
e(autograd.Variable(torch.LongTensor([5])))


Out[88]:
Variable containing:
 0.4068  1.5358 -0.1027 -2.9540  0.7994
[torch.FloatTensor of size 1x5]

In [85]:
hh = (autograd.Variable(torch.zeros(1, 1, 6)), autograd.Variable(torch.zeros(1, 1, 6)))
chw = autograd.Variable(torch.LongTensor([5, 1, 3, 4]))
print('1', chw.size())
seq = e(chw)
print('2', seq.size())
out, hh = l(seq.view(4, 1, -1), hh)
print('3', out.size())
# take the output at the last timestep
out[-1]


1 torch.Size([4])
2 torch.Size([4, 5])
3 torch.Size([4, 1, 6])

In [86]:
out


Out[86]:
Variable containing:
(0 ,.,.) = 
  0.1137  0.0481 -0.0472  0.1024 -0.0824  0.2085

(1 ,.,.) = 
  0.1944  0.1903 -0.0111  0.0966 -0.1379 -0.0205

(2 ,.,.) = 
  0.0041  0.0633  0.0476  0.0258 -0.0292  0.0463

(3 ,.,.) = 
  0.0173 -0.0504 -0.1395 -0.1017 -0.0999  0.1025
[torch.FloatTensor of size 4x1x6]

In [97]:
slices = [out[-1], out[-2]]  # use a new name: "l" above already refers to the LSTM module
torch.cat(slices)


Out[97]:
Variable containing:
 0.0173 -0.0504 -0.1395 -0.1017 -0.0999  0.1025
 0.0041  0.0633  0.0476  0.0258 -0.0292  0.0463
[torch.FloatTensor of size 2x6]

In [ ]: