In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.manual_seed(1)
Out[1]:
In [40]:
lstm = nn.LSTM(3, 3)  # Input dim is 3, output dim is 3
inputs = [autograd.Variable(torch.randn((1, 3))) for _ in range(5)]  # make a sequence of length 5

# Initialize the hidden state.
hidden = (autograd.Variable(torch.randn(1, 1, 3)),
          autograd.Variable(torch.randn((1, 1, 3))))
for i in inputs:
    # Step through the sequence one element at a time.
    # After each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1, 1, -1), hidden)

# Alternatively, we can do the entire sequence all at once.
# The first value returned by the LSTM is all of the hidden states throughout
# the sequence; the second is just the most recent hidden state
# (compare the last slice of "out" with "hidden" below; they are the same).
# The reason for this is that:
# "out" gives you access to all hidden states in the sequence;
# "hidden" lets you continue the sequence and backpropagate,
# by passing it as an argument to the LSTM at a later time.
# Add the extra 2nd dimension.
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
hidden = (autograd.Variable(torch.randn(1, 1, 3)),
          autograd.Variable(torch.randn((1, 1, 3))))  # clean out the hidden state
out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)
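As a quick sanity check on the comment above, the last slice of out should match the hidden state h returned in the tuple (a minimal sketch reusing the variables from the cell above):
In [ ]:
# Sketch: the last time step of "out" equals h_n, the first element of the
# "hidden" tuple (the second element is the cell state c_n).
print(out[-1])      # shape (1, 3): output at the final time step
print(hidden[0])    # shape (1, 1, 3): h_n returned by the LSTM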
In [10]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)

training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}

# These will usually be more like 32 or 64 dimensional.
# We keep them small so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6
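For reference, prepare_sequence just maps each token to its index and wraps the result in a Variable (a sketch of the expected output; the exact indices depend on the insertion order into word_to_ix above):
In [ ]:
# Sketch: "The dog ate the apple" -> indices [0, 1, 2, 3, 4] with the data above.
prepare_sequence(training_data[0][0], word_to_ix)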
In [41]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we don't have any hidden state.
        # Refer to the PyTorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim).
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores
In [42]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(params=model.parameters(), lr=0.5)

# Scores before training
print(model(prepare_sequence(training_data[0][0], word_to_ix)))

for e in range(300):
    for seq, tags in training_data:
        X = prepare_sequence(seq, word_to_ix)
        y_gold = prepare_sequence(tags, tag_to_ix)
        # Clear accumulated gradients and reset the hidden state
        # before each training example.
        model.zero_grad()
        model.hidden = model.init_hidden()
        y_pred = model(X)
        loss = loss_function(y_pred, y_gold)
        loss.backward()
        optimizer.step()

# Scores after training
print(model(prepare_sequence(training_data[0][0], word_to_ix)))
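To read off the predicted tags from the log-softmax scores, take the argmax of each row and map it back through tag_to_ix (a small sketch; ix_to_tag is a helper introduced here, not part of the cells above):
In [ ]:
# Sketch: convert the (len(sentence), tagset_size) score matrix to tag names.
# ix_to_tag is a hypothetical helper that inverts tag_to_ix.
ix_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}
scores = model(prepare_sequence(training_data[0][0], word_to_ix))
_, predicted_ixs = torch.max(scores, dim=1)
print([ix_to_tag[int(ix)] for ix in predicted_ixs.data])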
In [130]:
class Tagger2(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, word2ix, char2ix, tagset_size):
        super(Tagger2, self).__init__()
        self.hidden_dim = hidden_dim
        self.char2ix = char2ix
        self.word2ix = word2ix
        vocab_size, alphabet_size = len(word2ix), len(char2ix)
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.char_embeddings = nn.Embedding(alphabet_size, embedding_dim)
        self.char_lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.char_hidden = self.init_hidden()  # hidden state for the char LSTM
        # The word-level LSTM takes the concatenation of a word embedding and
        # the char-level representation as input, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim + hidden_dim, hidden_dim)
        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we don't have any hidden state.
        # Refer to the PyTorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim).
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, sentence):
        word_reprs = []
        for word in sentence:
            word_embed = self.word_embeddings(autograd.Variable(torch.LongTensor([self.word2ix[word]])))
            # Run the word's characters through the char LSTM and keep the
            # output at the last time step as the char-level representation.
            self.char_hidden = self.init_hidden()
            ch_ixs = [self.char2ix[ch] for ch in word]
            ch_seq = autograd.Variable(torch.LongTensor(ch_ixs))
            ch_embeds = self.char_embeddings(ch_seq)
            ch_repr, self.char_hidden = self.char_lstm(ch_embeds.view(len(word), 1, -1), self.char_hidden)
            word_repr = torch.cat((word_embed, ch_repr[-1]), dim=1)  # shape (1, embedding_dim + hidden_dim)
            word_reprs.append(word_repr)
        embeds = torch.cat(word_reprs)
        lstm_out, self.hidden = self.lstm(embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores
In [122]:
uniq_chars = set(ch for seq, _ in training_data for word in seq for ch in word) # read left to right
char_to_ix = {ch: ix for ix, ch in enumerate(sorted(uniq_chars))}
print(len(char_to_ix), char_to_ix)
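For a single word, the char LSTM input in Tagger2.forward is just the sequence of its character indices (a quick sketch using the mapping printed above):
In [ ]:
# Sketch: the character index sequence for one word from the training data.
word = training_data[0][0][1]   # "dog"
print([char_to_ix[ch] for ch in word])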
In [131]:
tagger = Tagger2(EMBEDDING_DIM, HIDDEN_DIM, word_to_ix, char_to_ix, len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(params=tagger.parameters(), lr=0.5)

# Scores before training
print(tagger(training_data[0][0]))

for e in range(300):
    for seq, tags in training_data:
        y_gold = prepare_sequence(tags, tag_to_ix)
        tagger.zero_grad()
        tagger.hidden = tagger.init_hidden()
        y_pred = tagger(seq)
        loss = loss_function(y_pred, y_gold)
        loss.backward()
        optimizer.step()

# Scores after training
print(tagger(training_data[0][0]))
The columns of the score matrices printed above correspond to the tag indices [DET, NN, V].
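Given that column ordering, the predicted tag for each word is the argmax of its row (a sketch using the tagger trained above):
In [ ]:
# Sketch: decode the char-augmented tagger's predictions into tag names.
scores = tagger(training_data[0][0])
_, predicted_ixs = torch.max(scores, dim=1)
print([["DET", "NN", "V"][int(ix)] for ix in predicted_ixs.data])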
In [84]:
V = 10
E = 5
H = 6
e = nn.Embedding(V, E)
l = nn.LSTM(E, H)
In [88]:
e(autograd.Variable(torch.LongTensor([5])))
Out[88]:
In [85]:
hh = (autograd.Variable(torch.zeros(1, 1, 6)), autograd.Variable(torch.zeros(1, 1, 6)))
chw = autograd.Variable(torch.LongTensor([5, 1, 3, 4]))
print('1', chw.size())   # character indices: (4,)
seq = e(chw)
print('2', seq.size())   # character embeddings: (4, 5)
out, hh = l(seq.view(4, 1, -1), hh)
print('3', out.size())   # LSTM outputs at every time step: (4, 1, 6)
# take the last time step's output
out[-1]
In [86]:
out
Out[86]:
In [97]:
pieces = [out[-1], out[-2]]   # avoid reusing the name "l", which holds the LSTM above
torch.cat(pieces)
Out[97]:
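Putting these pieces together: the word representation fed to Tagger2's word-level LSTM is the word embedding (size E) concatenated with the char LSTM's last output (size H), giving a vector of size E + H (a sketch reusing the scratch objects above; ew is a name introduced here):
In [ ]:
# Sketch: build one word representation the way Tagger2.forward does,
# reusing e (Embedding) and out (char LSTM outputs) from the cells above.
ew = e(autograd.Variable(torch.LongTensor([2])))   # word embedding, shape (1, 5)
word_repr = torch.cat((ew, out[-1]), dim=1)        # shape (1, 5 + 6) = (1, 11)
print(word_repr.size())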