In [26]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os
torch.manual_seed(1)
print(f'Torch Version {torch.__version__}')
assert tuple(int(x) for x in torch.__version__.split('.')[:2]) >= (0, 4)  # requires torch >= 0.4; also works for 1.x versions
print(f'Cuda is available: {torch.cuda.is_available()}')
print(f'CUDA_VISIBLE_DEVICES : {os.environ.get("CUDA_VISIBLE_DEVICES", "<not set>")}')  # avoid KeyError when the variable is unset
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
In [3]:
def prepare_sequence(seq, to_ix):
    return torch.tensor([to_ix[w] for w in seq], dtype=torch.long, device=device)

training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}
uniq_chars = set(ch for seq, _ in training_data for word in seq for ch in word) # read left to right
char_to_ix = {ch: ix for ix, ch in enumerate(sorted(uniq_chars))}
print(word_to_ix)
print(len(char_to_ix), char_to_ix)
# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6
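A quick sanity check, not part of the original notebook: prepare_sequence should map the first training sentence and its tags to index tensors on the selected device, using the word_to_ix and tag_to_ix dictionaries built above.
In [ ]:
# Illustrative check: encode the first training pair as LongTensors.
sample_sentence, sample_tags = training_data[0]
print(sample_sentence)
print(prepare_sequence(sample_sentence, word_to_ix))  # indices 0..4, in order of first occurrence
print(prepare_sequence(sample_tags, tag_to_ix))       # DET=0, NN=1, V=2 -> [0, 1, 2, 0, 1]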
In [30]:
class CharWordLSTMTagger(nn.Module):
    '''
    Character + Word Embedding based LSTM Tagger
    '''

    def __init__(self, embedding_dim, hidden_dim, word2ix, char2ix, tag2ix):
        super(CharWordLSTMTagger, self).__init__()
        self.num_dirs = 1
        self.num_layers = 2
        self.hidden_dim = hidden_dim
        self.char2ix = char2ix
        self.word2ix = word2ix
        self.tag2ix = tag2ix
        self.ix2tag = {v: k for k, v in tag2ix.items()}
        vocab_size, alphabet_size, tagset_size = len(word2ix), len(char2ix), len(tag2ix)
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.char_embeddings = nn.Embedding(alphabet_size, embedding_dim)
        # Character-level LSTM: consumes one word's characters and summarizes them
        self.char_lstm = nn.LSTM(embedding_dim, hidden_dim,
                                 num_layers=self.num_layers,
                                 bidirectional=self.num_dirs == 2)
        self.char_hidden = self.init_hidden()  # hidden state for the char LSTM
        # The word-level LSTM takes the concatenation of the word embedding and the
        # char LSTM summary as input, and outputs hidden states of size hidden_dim.
        self.lstm = nn.LSTM(embedding_dim + self.num_dirs * hidden_dim, hidden_dim,
                            num_layers=self.num_layers,
                            bidirectional=self.num_dirs == 2)
        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(self.num_dirs * hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we don't have any hidden state.
        # Refer to the PyTorch documentation to see exactly why it has this dimensionality.
        # The axes semantics are (num_layers * num_dirs, minibatch_size, hidden_dim).
        return (torch.zeros(self.num_layers * self.num_dirs, 1, self.hidden_dim, device=device),
                torch.zeros(self.num_layers * self.num_dirs, 1, self.hidden_dim, device=device))

    def forward(self, sentence):
        word_reprs = []
        for word in sentence:
            word_idx = torch.tensor([self.word2ix[word]], dtype=torch.long, device=device)
            word_embed = self.word_embeddings(word_idx)   # (1, embedding_dim)
            self.char_hidden = self.init_hidden()         # fresh char LSTM state for every word
            ch_ixs = [self.char2ix[ch] for ch in word]
            ch_seq = torch.tensor(ch_ixs, dtype=torch.long, device=device)
            ch_embeds = self.char_embeddings(ch_seq)
            ch_repr, self.char_hidden = self.char_lstm(ch_embeds.view(len(word), 1, -1), self.char_hidden)
            # use the char LSTM output at the last time step as the word's character summary
            word_repr = torch.cat((word_embed, ch_repr[-1]), dim=1)
            word_reprs.append(word_repr)
        embeds = torch.cat(word_reprs)
        lstm_out, self.hidden = self.lstm(embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)  # normalize over the tag dimension
        return tag_scores

    def tag(self, sentence):
        vals, idx = self(sentence).max(dim=1)
        return [self.ix2tag[i.item()] for i in idx]
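To make the shapes inside forward() concrete, the following cell (an illustrative sketch, not part of the original notebook; _demo is a throwaway model) rebuilds the representation of a single word the same way forward() does: the word embedding is (1, EMBEDDING_DIM), the char LSTM's last output is (1, HIDDEN_DIM) in this unidirectional setup, and their concatenation is one row of the input to the word-level LSTM.
In [ ]:
# Shape walk-through for one word (illustrative only).
_demo = CharWordLSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, word_to_ix, char_to_ix, tag_to_ix).to(device)
word = "apple"
word_embed = _demo.word_embeddings(torch.tensor([word_to_ix[word]], device=device))
ch_seq = torch.tensor([char_to_ix[ch] for ch in word], device=device)
ch_out, _ = _demo.char_lstm(_demo.char_embeddings(ch_seq).view(len(word), 1, -1), _demo.init_hidden())
word_repr = torch.cat((word_embed, ch_out[-1]), dim=1)
print(word_embed.shape, ch_out[-1].shape, word_repr.shape)
# expected: torch.Size([1, 6]) torch.Size([1, 6]) torch.Size([1, 12])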
In [33]:
tagger = CharWordLSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, word_to_ix, char_to_ix, tag_to_ix)
tagger.to(device)
loss_func = nn.NLLLoss()
optimizer = optim.SGD(params=tagger.parameters(), lr=0.5)

# Tag scores before training, for comparison with the scores after training
print(tagger(training_data[0][0]))

for e in range(200):
    for seq, tags in training_data:
        y_gold = prepare_sequence(tags, tag_to_ix)
        tagger.zero_grad()
        tagger.hidden = tagger.init_hidden()  # clear the hidden state (and its history) before each sentence
        y_pred = tagger(seq)
        loss = loss_func(y_pred, y_gold)
        loss.backward()
        optimizer.step()

# Tag scores after training
xseq = training_data[0][0]
print(xseq)
print(tagger.tag(xseq))
print(tagger(xseq))
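The forward passes used for inspection above still build autograd graphs; inference is normally wrapped in torch.no_grad(). The cell below is an optional, illustrative check on the second training sentence (with only two training sentences, this confirms memorization rather than generalization).
In [ ]:
# Optional check (illustrative): tag the other training sentence without tracking gradients.
with torch.no_grad():
    xseq2 = training_data[1][0]
    print(xseq2)
    print(tagger.tag(xseq2))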