In [1]:
%matplotlib inline
import re, pickle, collections, bcolz, numpy as np, keras, sklearn, math, operator, random, time, os
import matplotlib.pyplot as plt
In [2]:
from gensim.models import KeyedVectors
import torch, torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
In [3]:
headlines = pickle.load( open('Dissertation/headlines.pkl', 'rb') )
articles = pickle.load( open('Dissertation/articles.pkl', 'rb') )
In [4]:
pairs = [(' '.join(hdln), ' '.join(art)) for hdln, art in zip(headlines, articles)]
Because the data takes a while to load, we save the paired sentences so they can be reloaded quickly later.
In [9]:
pickle.dump(pairs, open('art-hdln-pairs.pkl', 'wb'))
In [11]:
pairs = pickle.load(open('art-hdln-pairs.pkl', 'rb'))
Special tokens used to pad the ends of sentences, to mark the start of a sentence, and to stand in for out-of-vocabulary words.
In [5]:
PAD = 0; SOS = 1; UNK = 2
Enumerate the unique words (the vocab) in the corpus and create mappings in both directions (index->word and word->index). Then use the word->index mapping to encode every sentence as a list of int indices.
In [6]:
def toks2ids(sents, voc_size=200000):
    # Count token frequencies and keep the voc_size most common words.
    voc_cnt = collections.Counter(t for sent in sents for t in sent)
    vocab = sorted(voc_cnt, key=voc_cnt.get, reverse=True)
    vocab = vocab[:voc_size]
    # Reserve the special token positions at the start of the vocab.
    vocab.insert(PAD, "<PAD>")
    vocab.insert(SOS, "<SOS>")
    vocab.insert(UNK, "<UNK>")
    w2id = {w: i for i, w in enumerate(vocab)}
    id2w = {i: w for i, w in enumerate(vocab)}
    # Encode each sentence, mapping unseen words to <UNK>
    # (look up in w2id rather than scanning the vocab list).
    ids = [[w2id[t] if t in w2id else UNK for t in sent] for sent in sents]
    return ids, vocab, w2id, id2w, voc_cnt
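As a quick sanity check, here is what the mapping produces on a tiny made-up corpus (a hypothetical example, not part of the dataset):
In [ ]:
# Hypothetical toy corpus: two tokenised 'sentences'.
toy_sents = [['the', 'cat', 'sat'], ['the', 'dog', 'barked']]
toy_ids, toy_vocab, toy_w2id, toy_id2w, toy_counts = toks2ids(toy_sents)
print(toy_vocab[:6])    # specials occupy indices 0-2, then words by frequency
print(toy_ids)          # each sentence becomes a list of int indices
print(toy_w2id['the'])  # 3 - 'the' is the most frequent non-special token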
In [7]:
art_ids, art_vocab, art_w2id, art_id2w, art_counts = toks2ids(articles)
hdln_ids, hdln_vocab, hdln_w2id, hdln_id2w, hdln_counts = toks2ids(headlines)
Stanford's GloVe word vectors can be downloaded from https://nlp.stanford.edu/projects/glove/ (the code below loads a version we have preprocessed into a bcolz array). We use GloVe because each individual word has a single word vector, which is what our sequence-to-sequence model needs; word2vec releases, on the other hand, often include multi-word phrases.
In [8]:
def load_glove(loc):
    return (bcolz.open(loc+'.dat')[:],
            pickle.load(open(loc+'_words.pkl', 'rb'), encoding='latin1'),
            pickle.load(open(loc+'_idx.pkl', 'rb'), encoding='latin1'))
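For reference, here is a rough sketch of the preprocessing that produces the .dat/.pkl files load_glove expects. The input filename and output location are assumptions for illustration, not the exact script used here.
In [ ]:
# Sketch only: convert a raw GloVe text file into the bcolz array + pickles read by load_glove().
def preprocess_glove(txt_path, out_loc):
    words, idx, vecs = [], {}, []
    with open(txt_path, encoding='utf8') as f:
        for i, line in enumerate(f):
            parts = line.rstrip().split(' ')
            words.append(parts[0]); idx[parts[0]] = i
            vecs.append(np.array(parts[1:], dtype=np.float32))
    arr = bcolz.carray(np.vstack(vecs), rootdir=out_loc+'.dat', mode='w'); arr.flush()
    pickle.dump(words, open(out_loc+'_words.pkl', 'wb'))
    pickle.dump(idx, open(out_loc+'_idx.pkl', 'wb'))
# preprocess_glove('glove.6B.300d.txt', '6B.300d')   # hypothetical paths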
In [9]:
hdln_vecs, hdln_wv_word, hdln_wv_idx = load_glove('/mnt/cvl-store-0/home/psxca1/data/glove/6B.300d')
hdln_w2v = {w: hdln_vecs[hdln_wv_idx[w]] for w in hdln_wv_word}
n_hdln_vec, dim_hdln_vec = hdln_vecs.shape
In [10]:
hdln_w2v['king']
Out[10]:
For the article word vectors, we load the same GloVe 6B.300d embeddings as for the headlines (both sides are English).
In [11]:
art_vecs, art_wv_word, art_wv_idx = load_glove('/mnt/cvl-store-0/home/psxca1/data/glove/6B.300d')
art_w2v = {w: art_vecs[art_wv_idx[w]] for w in art_wv_word}
n_art_vec, dim_art_vec = art_vecs.shape
We need to map each word index in our vocabs to its word vector. Not every word in our vocabs will be in our word vectors, since our tokenization approach won't be identical to that of the word vectors' creators - in those cases we simply create a random vector.
In [12]:
def create_emb(w2v, targ_vocab, dim_vec):
    vocab_size = len(targ_vocab)
    emb = np.zeros((vocab_size, dim_vec))
    found = 0
    for i, word in enumerate(targ_vocab):
        try: emb[i] = w2v[word]; found += 1
        except KeyError: emb[i] = np.random.normal(scale=0.6, size=(dim_vec,))
    return emb, found
In [13]:
hdln_embs, found = create_emb(hdln_w2v, hdln_vocab, dim_hdln_vec); hdln_embs.shape, found
Out[13]:
In [14]:
art_embs, found = create_emb(art_w2v, art_vocab, dim_art_vec); art_embs.shape, found
Out[14]:
Every sentence has to be the same length. Keras has a convenient function pad_sequences to truncate and/or pad each sentence as required - even though we're not using keras for the neural net, we can still use any functions from it we need!
In [15]:
from keras.preprocessing.sequence import pad_sequences
hdln_len = 30
art_len = 50
hdln_padded = pad_sequences(hdln_ids, hdln_len, 'int64', "post", "post")
art_padded = pad_sequences(art_ids, art_len, 'int64', "post", "post")
hdln_padded.shape, art_padded.shape, hdln_embs.shape
Out[15]:
And of course we need to separate our training and test sets...
In [29]:
from sklearn import model_selection
art_train, art_test, hdln_train, hdln_test = model_selection.train_test_split(
    art_padded, hdln_padded, test_size=0.1)
[o.shape for o in (art_train, art_test, hdln_train, hdln_test)]
Out[29]:
In [16]:
art_train = pickle.load( open('1art_train.pkl', 'rb') )
art_test = pickle.load( open('1art_test.pkl', 'rb') )
hdln_train = pickle.load( open('1hdln_train.pkl', 'rb') )
hdln_test = pickle.load( open('1hdln_test.pkl', 'rb') )
Here's an example of an article and headline pair, after encoding and padding.
In [17]:
art_train[0], hdln_train[0]
Out[17]:
In [18]:
def long_t(arr): return Variable(torch.LongTensor(arr)).cuda()
In [19]:
art_emb_t = torch.FloatTensor(art_embs).cuda()
hdln_emb_t = torch.FloatTensor(hdln_embs).cuda()
In [20]:
def create_emb(emb_mat, non_trainable=False):
    # Wrap a pre-trained embedding matrix in an nn.Embedding layer.
    # (Note: this redefines the earlier NumPy create_emb helper.)
    output_size, emb_size = emb_mat.size()
    emb = nn.Embedding(output_size, emb_size)
    emb.load_state_dict({'weight': emb_mat})
    if non_trainable:
        for param in emb.parameters():
            param.requires_grad = False
    return emb, emb_size, output_size
Turning a sequence into a representation can be done using an RNN (called the 'encoder'). This approach is useful because RNNs are able to keep track of state and memory, which is important in forming a complete understanding of a sentence. Setting bidirectional=True passes the original sequence through one RNN and the reversed sequence through another, then concatenates the results, so the encoder can look both forwards and backwards.
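A minimal sketch (toy sizes, not our real hyperparameters) showing why the bidirectional encoder's outputs and hidden state carry the extra factor of 2 that the code below accounts for:
In [ ]:
# Toy bidirectional GRU: batch of 4 sequences, length 10, embedding size 300, hidden size 128.
toy_gru = nn.GRU(300, 128, batch_first=True, num_layers=1, bidirectional=True)
toy_inp = Variable(torch.randn(4, 10, 300))
toy_h0 = Variable(torch.zeros(2, 4, 128))   # num_layers * 2 directions
toy_out, toy_h = toy_gru(toy_inp, toy_h0)
print(toy_out.size())  # (4, 10, 256) - forward and backward outputs concatenated
print(toy_h.size())    # (2, 4, 128)  - one final hidden state per direction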
In [21]:
class EncoderRNN(nn.Module):
    def __init__(self, embs, hidden_size, n_layers=4, dropout=0.2):
        super(EncoderRNN, self).__init__()
        self.emb, emb_size, output_size = create_emb(embs)
        self.n_layers = n_layers
        self.dropout = dropout
        self.hidden_size = hidden_size
        self.gru = nn.GRU(emb_size, hidden_size, batch_first=True, num_layers=self.n_layers,
                          bidirectional=True, dropout=self.dropout)

    def forward(self, inp, hidden):
        outputs, hidden = self.gru(self.emb(inp), hidden)
        return outputs, hidden

    def initHidden(self, batch_size):
        return Variable(torch.zeros(self.n_layers * 2, batch_size, self.hidden_size))  # * 2 for bidirectional
In [22]:
def encode(inp, encoder):
    batch_size, input_length = inp.size()
    hidden = encoder.initHidden(batch_size).cuda()
    enc_outputs, hidden = encoder(inp, hidden)
    return long_t([SOS]*batch_size), enc_outputs, hidden
Finally, we arrive at a vector representation of the sequence which captures everything we need to decode it. We feed this representation into further RNN layers (the 'decoder'), which generate the output tokens; a linear layer then classifies which word should appear at each position of the output sequence.
In [36]:
class DecoderRNN(nn.Module):
    def __init__(self, embs, hidden_size, n_layers=2):
        super(DecoderRNN, self).__init__()
        self.emb, emb_size, output_size = create_emb(embs)
        self.gru = nn.GRU(emb_size, hidden_size, batch_first=True, num_layers=n_layers)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, inp, hidden):
        emb = self.emb(inp).unsqueeze(1)
        res, hidden = self.gru(emb, hidden)
        res = F.log_softmax(self.out(res[:, 0]))
        return res, hidden
With a plain encode/decode approach, accuracy decays as input sequences get longer, because the whole input has to be squeezed into a single fixed-size vector.
This can be mitigated using an attentional model.
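The attention used below is the additive ('Bahdanau-style') form: at every decoding step we score each encoder output against the current decoder hidden state, softmax the scores, and take a weighted sum of the encoder outputs as a context vector. A minimal NumPy sketch with toy tensors (names and sizes here are illustrative, not the model's own):
In [ ]:
# Toy additive attention: batch 4, source length 10, encoder outputs of size 256, hidden size 128.
B, S, H2, H = 4, 10, 256, 128
enc_out = np.random.randn(B, S, H2)            # encoder outputs
dec_h = np.random.randn(B, H)                  # current decoder hidden state
W1, W2, V = np.random.randn(H2, H), np.random.randn(H, H), np.random.randn(H)
u = np.tanh(enc_out @ W1 + (dec_h @ W2)[:, None, :])                # (B, S, H)
scores = (u * V).sum(axis=2)                                        # one score per source position
e = np.exp(scores - scores.max(axis=1, keepdims=True))
attn = e / e.sum(axis=1, keepdims=True)                             # softmax over source positions
context = (attn[:, :, None] * enc_out).sum(axis=1)                  # weighted sum of encoder outputs
print(attn.shape, context.shape)                                    # (4, 10) (4, 256)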
In [23]:
def unit_prefix(x, n=1):
    for i in range(n): x = x.unsqueeze(0)
    return x

def align(x, y, start_dim=2):
    xd, yd = x.dim(), y.dim()
    if xd > yd: y = unit_prefix(y, xd - yd)
    elif yd > xd: x = unit_prefix(x, yd - xd)
    xs, ys = list(x.size()), list(y.size())
    nd = len(ys)
    for i in range(start_dim, nd):
        td = nd-i-1
        if ys[td] == 1: ys[td] = xs[td]
        elif xs[td] == 1: xs[td] = ys[td]
    return x.expand(*xs), y.expand(*ys)
In [24]:
def aligned_op(x,y,f): return f(*align(x,y,0))
def add(x, y): return aligned_op(x, y, operator.add)
def sub(x, y): return aligned_op(x, y, operator.sub)
def mul(x, y): return aligned_op(x, y, operator.mul)
def div(x, y): return aligned_op(x, y, operator.truediv)
In [25]:
def dot(x, y):
    assert(1 < y.dim() < 5)
    x, y = align(x, y)
    if y.dim() == 2: return x.mm(y)
    elif y.dim() == 3: return x.bmm(y)
    else:
        xs, ys = x.size(), y.size()
        res = torch.zeros(*(xs[:-1] + (ys[-1],)))
        for i in range(xs[0]): res[i].baddbmm_(x[i], (y[i]))
        return res
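A quick illustration of what dot gives us over a plain matrix multiply: a 2-D weight matrix is broadcast across the batch dimension of a 3-D tensor, which is how the W1/W2/W3 matrices are applied inside the attention decoder below (toy sizes):
In [ ]:
x = Variable(torch.randn(4, 10, 256))   # e.g. a batch of encoder outputs
W = Variable(torch.randn(256, 128))     # a single weight matrix
print(dot(x, W).size())                 # (4, 10, 128): W applied to every batch element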
In [26]:
def Arr(*sz): return torch.randn(sz)/math.sqrt(sz[0])
def Var(*sz): return nn.Parameter(Arr(*sz).cuda())  # wrap the cuda tensor so it registers as a module parameter
In [27]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, embs, hidden_size, n_layers=2, p=0.2):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.p = p
        self.emb_dropout = nn.Dropout(p)
        self.emb, emb_size, output_size = create_emb(embs)
        self.W1 = Var(hidden_size*2, hidden_size)  # * 2 for bidirectional
        self.W2 = Var(hidden_size, hidden_size)
        self.W3 = Var(emb_size+hidden_size*2, hidden_size)  # * 2 for bidirectional
        self.b2 = Var(1, hidden_size)
        self.b3 = Var(1, hidden_size)
        self.V = Var(1, 1, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers=n_layers, bidirectional=True, dropout=self.p)
        self.out = nn.Linear(hidden_size*2, output_size)  # * 2 for bidirectional

    def forward(self, inp, hidden, enc_outputs):
        emb_inp = self.emb_dropout(self.emb(inp))
        # Additive attention: score each encoder output against the current hidden state.
        w1e = dot(enc_outputs, self.W1)
        w2h = dot(hidden[-1], self.W2)
        w2h = (w2h + self.b2.expand_as(w2h)).unsqueeze(1)
        u = F.tanh(w1e + w2h.expand_as(w1e))
        a = (self.V.expand_as(u)*u).sum(2).squeeze(2)
        a = F.softmax(a).unsqueeze(2)
        # Context vector: attention-weighted sum of the encoder outputs.
        Xa = (a.expand_as(enc_outputs) * enc_outputs).sum(1)
        res = dot(torch.cat([emb_inp, Xa.squeeze(1)], 1), self.W3)
        res = (res + self.b3.expand_as(res)).unsqueeze(0)
        res, hidden = self.gru(res, hidden)
        res = F.log_softmax(self.out(res.squeeze(0)))
        return res, hidden
In [28]:
def get_batch(x, y, batch_size=16):
    idxs = np.random.permutation(len(x))[:batch_size]
    return x[idxs], y[idxs]
PyTorch has limited functionality for training models automatically - you will generally have to write your own training loops. However, PyTorch makes it far easier to customise how training is done, for example by using teacher forcing.
In [29]:
def train(inp, targ, encoder, decoder, enc_opt, dec_opt, crit, teacher_forcing_ratio):
    decoder_input, encoder_outputs, hidden = encode(inp, encoder)
    target_length = targ.size()[1]
    enc_opt.zero_grad(); dec_opt.zero_grad()
    loss = 0
    if random.random() < teacher_forcing_ratio:  # teacher forcing: feed the ground truth as the next input
        for di in range(target_length):
            decoder_output, hidden = decoder(decoder_input, hidden, encoder_outputs)
            loss += crit(decoder_output, targ[:, di])
            decoder_input = targ[:, di]
    else:  # no teacher forcing: feed the model's own prediction as the next input
        for di in range(target_length):
            decoder_output, hidden = decoder(decoder_input, hidden, encoder_outputs)
            loss += crit(decoder_output, targ[:, di])
            topv, topi = decoder_output.data.topk(1)
            decoder_input = Variable(topi.squeeze()).cuda()
    loss.backward()
    enc_opt.step(); dec_opt.step()
    return loss.data[0] / target_length
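Before launching a long run it can be worth a single-step smoke test, roughly like the following sketch (tiny batch, arbitrary hyperparameters), just to confirm that shapes line up and the loss comes back finite:
In [ ]:
# One training step on a small random batch (sanity check only).
enc = EncoderRNN(art_emb_t, 128, n_layers=1).cuda()
dec = AttnDecoderRNN(hdln_emb_t, 128, n_layers=1).cuda()
enc_opt, dec_opt = optim.Adam(enc.parameters(), lr=1e-3), optim.Adam(dec.parameters(), lr=1e-3)
crit = nn.NLLLoss().cuda()
art, hdln = get_batch(art_train, hdln_train, 8)
print(train(long_t(art), long_t(hdln), enc, dec, enc_opt, dec_opt, crit, teacher_forcing_ratio=0.5))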
In [30]:
def calc_minutes(since):
    now = time.time()
    s = now - since
    return s/60

def as_minutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def time_since(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%-7s (- %s)' % (as_minutes(s), as_minutes(rs))
In [31]:
def req_grad_params(o):
    return (p for p in o.parameters() if p.requires_grad)
In [32]:
def trainEpochs(encoder, decoder, n_epochs, start_time, times_list, avg_loss_list, epochs_list,
                print_every=200, lr=0.01, plot_loss_every=20, teacher_forcing='graduated'):
    print('LEARNING RATE: %f' % (lr))
    print_loss = 0  # reset every print_every
    plot_loss = 0   # reset every plot_loss_every
    enc_opt = optim.Adam(req_grad_params(encoder), lr=lr)
    dec_opt = optim.Adam(decoder.parameters(), lr=lr)
    crit = nn.NLLLoss().cuda()
    if not isinstance(teacher_forcing, (str, float, int)):
        raise TypeError("teacher_forcing must be a string mode or a numeric ratio")
    for epoch in range(n_epochs):
        art, hdln = get_batch(art_train, hdln_train, 128)
        inp = long_t(art)
        targ = long_t(hdln)
        # Choose the teacher forcing ratio for this epoch.
        if teacher_forcing == 'graduated':
            teacher_forcing_ratio = 1 - epoch/n_epochs
        elif teacher_forcing == 'full':
            teacher_forcing_ratio = 1
        elif teacher_forcing == 'none':
            teacher_forcing_ratio = 0
        elif not isinstance(teacher_forcing, str) and 0 <= teacher_forcing <= 1:
            teacher_forcing_ratio = teacher_forcing
        else:
            raise ValueError("teacher_forcing must be 'graduated', 'full', 'none' or a ratio in [0, 1]")
        loss = train(inp, targ, encoder, decoder, enc_opt, dec_opt, crit, teacher_forcing_ratio)
        print_loss += loss
        plot_loss += loss
        if epoch % print_every == 0 and epoch != 0:
            print('%s\t%d\t%d%%\t%.4f' % (time_since(start_time, epoch / n_epochs),
                                          epoch, epoch / n_epochs * 100, print_loss / print_every))
            print_loss = 0
        if epoch % plot_loss_every == 0 and epoch != 0:
            times_list.append(calc_minutes(start_time))
            avg_loss_list.append(plot_loss / plot_loss_every)
            epochs_list.append(epoch)
            plot_loss = 0
In [33]:
def multi_train(encoder, decoder, times_list, avg_loss_list, epochs_list, teacher_forcing_type):
    start_time = time.time()
    trainEpochs(encoder, decoder, 5000, start_time, times_list, avg_loss_list, epochs_list, lr=0.003, teacher_forcing=teacher_forcing_type)
    trainEpochs(encoder, decoder, 5000, start_time, times_list, avg_loss_list, epochs_list, lr=0.003, teacher_forcing=teacher_forcing_type)
    trainEpochs(encoder, decoder, 5000, start_time, times_list, avg_loss_list, epochs_list, lr=0.001, teacher_forcing=teacher_forcing_type)
    trainEpochs(encoder, decoder, 5000, start_time, times_list, avg_loss_list, epochs_list, lr=0.001, teacher_forcing=teacher_forcing_type)
    trainEpochs(encoder, decoder, 5000, start_time, times_list, avg_loss_list, epochs_list, lr=0.0003, teacher_forcing=teacher_forcing_type)
    trainEpochs(encoder, decoder, 5000, start_time, times_list, avg_loss_list, epochs_list, lr=0.0003, teacher_forcing=teacher_forcing_type)
    trainEpochs(encoder, decoder, 5000, start_time, times_list, avg_loss_list, epochs_list, lr=0.0001, teacher_forcing=teacher_forcing_type)
    trainEpochs(encoder, decoder, 5000, start_time, times_list, avg_loss_list, epochs_list, lr=0.00003, teacher_forcing=teacher_forcing_type)
In [ ]:
def evaluate(inp):
    decoder_input, encoder_outputs, hidden = encode(inp, encoder)
    target_length = hdln_len
    decoded_words = []
    for di in range(target_length):
        decoder_output, hidden = decoder(decoder_input, hidden, encoder_outputs)
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        if ni == PAD: break
        decoded_words.append(hdln_vocab[ni])
        decoder_input = long_t([ni])
    return decoded_words
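Once an encoder and decoder have been trained (evaluate uses whatever encoder and decoder are currently in scope), a single test example can be inspected with a sketch like this:
In [ ]:
idx = 0  # any test index
inp = long_t(art_test[idx]).unsqueeze(0)
print('article  :', ' '.join(art_id2w[t] for t in art_test[idx] if t != PAD))
print('generated:', ' '.join(evaluate(inp)))
print('reference:', ' '.join(hdln_id2w[t] for t in hdln_test[idx] if t != PAD))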
In [ ]:
def test(f_path):
    # Write one reference (real headline) and one system (generated headline) file per
    # test article, in the layout expected by our ROUGE evaluation.
    ref_dir = f_path+'/reference/'
    system_dir = f_path+'/system/'
    if not os.path.exists(ref_dir): os.makedirs(os.path.dirname(ref_dir), exist_ok=True)
    if not os.path.exists(system_dir): os.makedirs(os.path.dirname(system_dir), exist_ok=True)
    for idx in range(len(art_test)):
        real_sent = [hdln_id2w[t] for t in hdln_test[idx] if t != 0]
        if real_sent:
            with open(ref_dir + 'news%d_reference%d' % (idx, idx), 'w') as f:
                f.write(' '.join(real_sent))
        else:
            continue
        ids = long_t(art_test[idx]); ids = ids.unsqueeze(0)
        translation = evaluate(ids)
        with open(system_dir + 'news%d_system%d' % (idx, idx), 'w') as f:
            f.write(' '.join(translation))
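The reference/ and system/ directories written above are laid out for ROUGE scoring. Assuming pyrouge with a local ROUGE-1.5.5 install is available (an assumption - any ROUGE wrapper that reads per-file references would do), scoring a run might look like this sketch:
In [ ]:
# Sketch: score one run's generated headlines against the references with pyrouge.
from pyrouge import Rouge155
run_dir = './Dissertation/rouge_eval/test1/graduated_reset'  # i.e. eval_path+'graduated_reset' below
r = Rouge155()
r.system_dir = run_dir + '/system/'
r.model_dir = run_dir + '/reference/'
r.system_filename_pattern = r'news(\d+)_system\d+'
r.model_filename_pattern = 'news#ID#_reference#ID#'
print(r.convert_and_evaluate())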
In [34]:
test_num = 1
list_path = './Dissertation/lists/test{}/'.format(test_num)
enc_dec_path = './Dissertation/encoders_and_decoders/test{}/'.format(test_num)
eval_path = './Dissertation/rouge_eval/test{}/'.format(test_num)
plot_path = './Dissertation/plots/test{}/'.format(test_num)
list_path, enc_dec_path, eval_path, plot_path
Out[34]:
In [83]:
n_layers = 1
hidden_size = 128
graduated_reset_epochs_list = []
graduated_reset_times_list = []
graduated_reset_avg_loss_list = []
encoder = EncoderRNN(art_emb_t, hidden_size, n_layers).cuda()
decoder = AttnDecoderRNN(hdln_emb_t, hidden_size, n_layers).cuda()
multi_train(encoder, decoder, graduated_reset_times_list, graduated_reset_avg_loss_list, graduated_reset_epochs_list, 'graduated')
In [84]:
with open(list_path+'graduated_reset_epochs_list.pkl', 'wb') as f:
    pickle.dump(graduated_reset_epochs_list, f)
with open(list_path+'graduated_reset_times_list.pkl', 'wb') as f:
    pickle.dump(graduated_reset_times_list, f)
with open(list_path+'graduated_reset_avg_loss_list.pkl', 'wb') as f:
    pickle.dump(graduated_reset_avg_loss_list, f)
In [85]:
torch.save(encoder, enc_dec_path+'graduated_reset_encoder.pth')
torch.save(decoder, enc_dec_path+'graduated_reset_decoder.pth')
In [86]:
plt.plot(graduated_reset_times_list, graduated_reset_avg_loss_list)
plt.xlabel('Training Time (Minutes)')
plt.ylabel('Average Loss')
plt.title('Graduated (Reset) Teacher Forcing Loss over Time')
plt.savefig(plot_path+'grad_reset.png')
plt.show()
In [87]:
test(eval_path+'graduated_reset')
In [94]:
def fullgraduated_trainEpochs(encoder, decoder, total_epochs, start_time, times_list, avg_loss_list, epochs_list,
                              print_every=200, lr=0.003, plot_loss_every=20):
    print("LEARNING RATE: %f" % (lr))
    print_loss = 0  # reset every print_every
    plot_loss = 0   # reset every plot_loss_every
    enc_opt = optim.Adam(req_grad_params(encoder), lr=lr)
    dec_opt = optim.Adam(decoder.parameters(), lr=lr)
    crit = nn.NLLLoss().cuda()
    for epoch in range(total_epochs):
        art, hdln = get_batch(art_train, hdln_train, 128)
        inp = long_t(art)
        targ = long_t(hdln)
        # Teacher forcing ratio decays over the whole run rather than resetting every 5000 epochs.
        teacher_forcing_ratio = 1 - epoch/total_epochs
        loss = train(inp, targ, encoder, decoder, enc_opt, dec_opt, crit, teacher_forcing_ratio)
        print_loss += loss
        plot_loss += loss
        if epoch % print_every == 0 and epoch != 0:
            print('%s\t%d\t%d%%\t%.4f' % (time_since(start_time, epoch / total_epochs),
                                          epoch, epoch / total_epochs * 100, print_loss / print_every))
            print_loss = 0
        if epoch % plot_loss_every == 0 and epoch != 0:
            times_list.append(calc_minutes(start_time))
            avg_loss_list.append(plot_loss / plot_loss_every)
            epochs_list.append(epoch)
            plot_loss = 0
        # Step the learning rate down at fixed points during training.
        if epoch in (10000, 20000, 30000, 35000):
            lr = {10000: .001, 20000: .0003, 30000: .0001, 35000: .00003}[epoch]
            print("LEARNING RATE: %f" % (lr))
            enc_opt = optim.Adam(req_grad_params(encoder), lr=lr)
            dec_opt = optim.Adam(decoder.parameters(), lr=lr)
In [95]:
n_layers = 1
hidden_size = 128
graduated_epochs_list = []
graduated_times_list = []
graduated_avg_loss_list = []
encoder = EncoderRNN(art_emb_t, hidden_size, n_layers).cuda()
decoder = AttnDecoderRNN(hdln_emb_t, hidden_size, n_layers).cuda()
fullgraduated_trainEpochs(encoder, decoder, 40000, time.time(), graduated_times_list, graduated_avg_loss_list,
                          graduated_epochs_list)
In [96]:
torch.save(encoder, enc_dec_path+'graduated_encoder.pth')
torch.save(decoder, enc_dec_path+'graduated_decoder.pth')
In [97]:
with open(list_path+'graduated_epochs_list.pkl', 'wb') as f:
    pickle.dump(graduated_epochs_list, f)
with open(list_path+'graduated_times_list.pkl', 'wb') as f:
    pickle.dump(graduated_times_list, f)
with open(list_path+'graduated_avg_loss_list.pkl', 'wb') as f:
    pickle.dump(graduated_avg_loss_list, f)
In [98]:
plt.plot(graduated_times_list, graduated_avg_loss_list)
plt.xlabel('Training Time (Minutes)')
plt.ylabel('Average Loss')
plt.title('Graduated Teacher Forcing Loss over Time')
plt.savefig(plot_path+'grad.png')
plt.show()
In [99]:
test(eval_path+'graduated')
In [101]:
n_layers = 1
hidden_size = 128
full_epochs_list = []
full_times_list = []
full_avg_loss_list = []
encoder = EncoderRNN(art_emb_t, hidden_size, n_layers).cuda()
decoder = AttnDecoderRNN(hdln_emb_t, hidden_size, n_layers).cuda()
multi_train(encoder, decoder, full_times_list, full_avg_loss_list, full_epochs_list, 'full')
In [102]:
torch.save(encoder, enc_dec_path+'full_tf_encoder.pth')
torch.save(decoder, enc_dec_path+'full_tf_decoder.pth')
In [103]:
with open(list_path+'full_epochs_list.pkl', 'wb') as f:
    pickle.dump(full_epochs_list, f)
with open(list_path+'full_times_list.pkl', 'wb') as f:
    pickle.dump(full_times_list, f)
with open(list_path+'full_avg_loss_list.pkl', 'wb') as f:
    pickle.dump(full_avg_loss_list, f)
In [104]:
plt.plot(full_times_list, full_avg_loss_list)
plt.xlabel('Training Time (Minutes)')
plt.ylabel('Average Loss')
plt.title('Full Teacher Forcing Loss over Time')
plt.savefig(plot_path+'full_tf.png')
plt.show()
In [105]:
test(eval_path+'full_tf')
In [35]:
n_layers = 1
hidden_size = 128
none_epochs_list = []
none_times_list = []
none_avg_loss_list = []
encoder = EncoderRNN(art_emb_t, hidden_size, n_layers).cuda()
decoder = AttnDecoderRNN(hdln_emb_t, hidden_size, n_layers).cuda()
multi_train(encoder, decoder, none_times_list, none_avg_loss_list, none_epochs_list, 'none')
In [36]:
torch.save(encoder, enc_dec_path+'no_tf_encoder.pth')
torch.save(decoder, enc_dec_path+'no_tf_decoder.pth')
In [37]:
with open(list_path+'none_epochs_list.pkl', 'wb') as f:
    pickle.dump(none_epochs_list, f)
with open(list_path+'none_times_list.pkl', 'wb') as f:
    pickle.dump(none_times_list, f)
with open(list_path+'none_avg_loss_list.pkl', 'wb') as f:
    pickle.dump(none_avg_loss_list, f)
In [ ]:
plt.plot(none_times_list, none_avg_loss_list)
plt.xlabel('Training Time (Minutes)')
plt.ylabel('Average Loss')
plt.title('No Teacher Forcing Loss over Time')
plt.savefig(plot_path+'no_tf.png')
plt.show()
In [41]:
test(eval_path+'no_tf')
In [55]:
n_layers = 1
hidden_size = 128
p25_epochs_list = []
p25_times_list = []
p25_avg_loss_list = []
encoder = EncoderRNN(art_emb_t, hidden_size, n_layers).cuda()
decoder = AttnDecoderRNN(hdln_emb_t, hidden_size, n_layers).cuda()
multi_train(encoder, decoder, p25_times_list, p25_avg_loss_list, p25_epochs_list, teacher_forcing_type=.25)
In [56]:
torch.save(encoder, enc_dec_path+'p25_tf_encoder.pth')
torch.save(decoder, enc_dec_path+'p25_tf_decoder.pth')
In [57]:
with open(list_path+'p25_epochs_list.pkl', 'wb') as f:
    pickle.dump(p25_epochs_list, f)
with open(list_path+'p25_times_list.pkl', 'wb') as f:
    pickle.dump(p25_times_list, f)
with open(list_path+'p25_avg_loss_list.pkl', 'wb') as f:
    pickle.dump(p25_avg_loss_list, f)
In [58]:
plt.plot(p25_times_list, p25_avg_loss_list)
plt.xlabel('Training Time (Minutes)')
plt.ylabel('Average Loss')
plt.title('25% Teacher Forcing Loss over Time')
plt.savefig(plot_path+'p25_tf.png')
plt.show()
In [71]:
test(eval_path+'p25_tf')
In [73]:
n_layers = 1
hidden_size = 128
p50_epochs_list = []
p50_times_list = []
p50_avg_loss_list = []
encoder = EncoderRNN(art_emb_t, hidden_size, n_layers).cuda()
decoder = AttnDecoderRNN(hdln_emb_t, hidden_size, n_layers).cuda()
multi_train(encoder, decoder, p50_times_list, p50_avg_loss_list, p50_epochs_list, .5)
In [74]:
torch.save(encoder, enc_dec_path+'p50_tf_encoder.pth')
torch.save(decoder, enc_dec_path+'p50_tf_decoder.pth')
In [75]:
with open(list_path+'p50_epochs_list.pkl', 'wb') as f:
    pickle.dump(p50_epochs_list, f)
with open(list_path+'p50_times_list.pkl', 'wb') as f:
    pickle.dump(p50_times_list, f)
with open(list_path+'p50_avg_loss_list.pkl', 'wb') as f:
    pickle.dump(p50_avg_loss_list, f)
In [76]:
plt.plot(p50_times_list, p50_avg_loss_list)
plt.xlabel('Training Time (Minutes)')
plt.ylabel('Average Loss')
plt.title('50% Teacher Forcing Loss over Time')
plt.savefig(plot_path+'p50_tf.png')
plt.show()
In [77]:
test(eval_path+'p50_tf')
In [78]:
n_layers = 1
hidden_size = 128
p75_epochs_list = []
p75_times_list = []
p75_avg_loss_list = []
encoder = EncoderRNN(art_emb_t, hidden_size, n_layers).cuda()
decoder = AttnDecoderRNN(hdln_emb_t, hidden_size, n_layers).cuda()
multi_train(encoder, decoder, p75_times_list, p75_avg_loss_list, p75_epochs_list, .75)
In [79]:
torch.save(encoder, enc_dec_path+'p75_tf_encoder.pth')
torch.save(decoder, enc_dec_path+'p75_tf_decoder.pth')
In [80]:
with open(list_path+'p75_epochs_list.pkl', 'wb') as f:
    pickle.dump(p75_epochs_list, f)
with open(list_path+'p75_times_list.pkl', 'wb') as f:
    pickle.dump(p75_times_list, f)
with open(list_path+'p75_avg_loss_list.pkl', 'wb') as f:
    pickle.dump(p75_avg_loss_list, f)
In [81]:
plt.plot(p75_times_list, p75_avg_loss_list)
plt.xlabel('Training Time (Minutes)')
plt.ylabel('Average Loss')
plt.title('75% Teacher Forcing Loss over Time')
plt.savefig(plot_path+'p75_tf.png')
plt.show()
In [82]:
test(eval_path+'p75_tf')