003.1 - LSTM Tagger with Word+Char Embeddings

Thamme Gowda , April 26, 2018

Taken from Pytorch tutorials


In [26]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os

torch.manual_seed(1)

print(f'Torch Version {torch.__version__}')
assert int(torch.__version__.split('.')[1]) >= 4 # should be greater than equal to 0.4
print(f'Cuda is available: {torch.cuda.is_available()}')
print(f'CUDA_VISIBLE_DEVICES : {os.environ["CUDA_VISIBLE_DEVICES"]}')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)


Torch Version 0.4.0
Cuda is available: True
CUDA_VISIBLE_DEVICES : 0,2
cuda

In [3]:
def prepare_sequence(seq, to_ix):
    return torch.tensor([to_ix[w] for w in seq], dtype=torch.long, device=device)

training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

tag_to_ix = {"DET": 0, "NN": 1, "V": 2}


uniq_chars = set(ch for seq, _ in training_data for word in seq for ch in word) # read left to right
char_to_ix = {ch: ix for ix, ch in enumerate(sorted(uniq_chars))}

print(word_to_ix)
print(len(char_to_ix), char_to_ix)


# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6


{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}
16 {'E': 0, 'T': 1, 'a': 2, 'b': 3, 'd': 4, 'e': 5, 'g': 6, 'h': 7, 'k': 8, 'l': 9, 'o': 10, 'p': 11, 'r': 12, 't': 13, 'v': 14, 'y': 15}

In [30]:
class CharWordLSTMTagger(nn.Module):
    '''
    Character + Word Embedding based LSTM Tagger
    '''

    def __init__(self, embedding_dim, hidden_dim, word2ix, char2ix, tag2ix):
        super(CharWordLSTMTagger, self).__init__()
        self.num_dirs = 1
        self.num_layers = 2
        self.hidden_dim = hidden_dim
        self.char2ix = char2ix
        self.word2ix = word2ix
        self.tag2ix = tag2ix
        self.ix2tag = {v: k for k,v in tag2ix.items()}
        vocab_size, alphabet_size, tagset_size = len(word2ix), len(char2ix), len(tag2ix)
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.char_embeddings = nn.Embedding(alphabet_size, embedding_dim)
        self.char_lstm = nn.LSTM(embedding_dim, hidden_dim,
                                 num_layers=self.num_layers,
                                 bidirectional=self.num_dirs == 2)
        self.char_hidden = self.init_hidden() # Hidden state for char LSTM

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim. 
        # we concatenate character and word embeddings
        self.lstm = nn.LSTM(self.num_dirs * embedding_dim + hidden_dim, hidden_dim,
                            num_layers=self.num_layers,
                            bidirectional=self.num_dirs == 2)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(self.num_dirs * hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

        
    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers*directions, minibatch_size, hidden_dim)
        return (torch.zeros(self.num_layers * self.num_dirs, 1, self.hidden_dim, device=device),
                torch.zeros(self.num_layers * self.num_dirs, 1, self.hidden_dim, device=device))

    def forward(self, sentence):
        word_reprs = []
        for word in sentence:
            word_idx = torch.tensor([self.word2ix[word]], dtype=torch.long, device=device)
            word_embed = self.word_embeddings(word_idx)
            self.char_hidden = self.init_hidden()
            ch_ixs = [self.char2ix[ch] for ch in word]
            ch_seq = torch.tensor(ch_ixs, dtype=torch.long, device=device)
            ch_embeds = self.char_embeddings(ch_seq)
            ch_repr, self.char_hidden = self.char_lstm(ch_embeds.view(len(word), 1, -1), self.char_hidden)
            word_repr = torch.cat((word_embed, ch_repr[-1]), dim=1) # char LSTM output's last time stamp output
            word_reprs.append(word_repr)
        embeds = torch.cat(word_reprs)
        #print(len(sentence), embeds.size())
        lstm_out, self.hidden = self.lstm(embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=0)
        return tag_scores
    
    def tag(self, sentence):
        vals, idx = self(sentence).max(dim=1)
        return [self.ix2tag[i.item()] for i in idx]

In [33]:
tagger = CharWordLSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, word_to_ix, char_to_ix, tag_to_ix)
tagger.to(device)
loss_func = nn.NLLLoss()
optimizer = optim.SGD(params=tagger.parameters(), lr=0.5)

print(tagger(training_data[0][0]))
for e in range(200):
    for seq, tags in training_data:
        y_gold = prepare_sequence(tags, tag_to_ix)
        tagger.zero_grad()
        tagger.hidden = tagger.init_hidden()
        
        y_pred = tagger(seq)
        loss = loss_func(y_pred, y_gold)
        
        loss.backward()
        optimizer.step()

xseq = training_data[0][0]
print(xseq)
print(tagger.tag(xseq))
print(tagger(xseq))


tensor([[-1.5695, -1.6457, -1.6651],
        [-1.6009, -1.6211, -1.6356],
        [-1.6193, -1.5984, -1.6045],
        [-1.6261, -1.5889, -1.5791],
        [-1.6327, -1.5942, -1.5662]], device='cuda:0')
['The', 'dog', 'ate', 'the', 'apple']
['DET', 'NN', 'V', 'DET', 'NN']
tensor([[-1.1046, -5.7964, -5.0518],
        [-7.5461, -0.6958, -5.3074],
        [-4.9072, -5.7974, -0.0184],
        [-0.4229, -6.2712, -5.0101],
        [-5.1899, -0.7065, -8.2660]], device='cuda:0')

TODO

  1. Learn how to use CRF + viterbi decoding with these features

In [ ]: