STATIC GRAPH


In [7]:
import torch
from torch import nn
import torch.nn.functional as F
from torch import optim
from torch.autograd import Variable

DATA


In [8]:
import data_utils
metadata, idx_q, idx_a = data_utils.load_data('../data/')

In [9]:
# add special symbol
i2w = metadata['idx2w'] + ['GO']
w2i = metadata['w2idx']
w2i['GO'] = len(i2w)-1

Parameters


In [10]:
batch_size = 128
L = len(idx_q[0])
vocab_size = len(i2w)
hidden_size = 256
dataset_size = len(idx_q)

In [11]:
class Config:
    pass

config = Config()
config.printsize = True

In [12]:
len(idx_q)


Out[12]:
267518

In [38]:
import numpy as np
x = np.arange(batch_size)
np.random.shuffle(x)
x = x * dataset_size//batch_size
print(x)
print(idx_q[x].shape)


[179738 229898 156748 238258 219448 108679  41799 165108 181828 100319
 127489 121219 169288  58519 125399 117039 250798   2089  52249  81509
  43889 154658  91959 148388 265428 114949  25079  16719  66879  77329
 112859 257068 240348 150478  83599 244528 242438  94049 171378 225718
      0 194368  75239 183918  39709  37619  12539 146298  31349 234078
 192278 110769 177648 129579 223628  71059 106589 211088 208998 221538
 167198 102409 248708  85689  48069 186008  54339  50159  60609 173468
 142118 204818 104499 254978  27169 190188 196458  73149 227808 163018
 131669 213178  62699 198548 246618 188098  56429 160928 231988  64789
 252888 137938 259158  45979 119129   8359  10449 123309  68969 236168
 152568 202728  18809  14629 135848 206908   6269 217358 158838  35529
 261248 263338  22989  79419 215268  87779 140028 144208 133759  20899
 200638  98229   4179  89869  29259  33439 175558  96139]
(128, 21)

Graph


In [23]:
def initial_state(batch_size, hidden_size):
    state = torch.zeros([batch_size, hidden_size])
    return Variable(state.cuda())

def psize(name, variable):
    if config.printsize:
        print(name, variable.size(), type(variable.data))
        
class Encoder(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super(Encoder, self).__init__()
        
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
                
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.encode = nn.LSTMCell(hidden_size, hidden_size)
            
    def forward(self, enc_inputs, hidden, batch_size):
        input_length = enc_inputs.size()[0]
        psize('enc_inputs', enc_inputs)
        enc_embeddings = self.embed(enc_inputs)
        psize('enc_embeddings', enc_embeddings)
        enc_embeddings = enc_embeddings.view(input_length, 
                                            batch_size, 
                                            hidden_size)            #LxBxH       
                
        psize('enc_embeddings', enc_embeddings)        
        hidden, cell_state = hidden
        for i in range(enc_embeddings.size()[0]):
            hidden, cell_state = self.encode(enc_embeddings[i], (hidden, cell_state))
            
        return hidden, cell_state
    
class Decoder(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super(Decoder, self).__init__()
        
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        
        self.decode = nn.LSTMCell(hidden_size, hidden_size)
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.project = nn.Linear(hidden_size, vocab_size)
        
    def forward(self, outputs, hidden, batch_size):
        length = outputs.size()[0]
        psize('hidden', hidden[0]), psize('hidden', hidden[1])
        predicted_outputs = []
    
        dec_embeddings = self.embed(outputs).view(length,
                                                 batch_size,
                                                 hidden_size)           #LxBxH
    
        GO = torch.LongTensor([w2i['GO']] * batch_size).cuda()            
        GO = Variable(GO)
        psize('GO', GO)
        dec_input = GO
        hidden, cell_state = hidden
        for i in range(length):
            psize('\tdec_input', dec_input)
            dec_input_emb = self.embed(dec_input)
            psize('\tdec_input_emb', dec_input_emb)

            hidden, cell_state = self.decode(dec_input_emb, (hidden, cell_state))
            predicted_outputs.append(hidden)

            topv, topi = self.project(F.log_softmax(hidden)).topk(1)
            psize('\ttopi', topi)
            dec_input = topi.squeeze(1)
            
        predicted_outputs = torch.stack(predicted_outputs).squeeze(1)
        psize('predicted_outputs', predicted_outputs)
              
        predicted_outputs = self.project(predicted_outputs.view(length*batch_size, hidden_size))
        psize('predicted_outputs', predicted_outputs)
        predicted_outputs = predicted_outputs.view(length, batch_size, vocab_size)
        psize('predicted_outputs', predicted_outputs)

        return predicted_outputs

TRAINING


In [14]:
from pprint import pprint
from tqdm import tqdm
def train_epochs(epochs, encoder, decoder, eoptim, doptim, criterion, print_every=1):
    encoder.train()
    decoder.train()
    losses = []
    config.printsize = True

    for epoch in tqdm(range(epochs+1)):
        loss = train(encoder, decoder, eoptim, doptim, criterion, idx_q, idx_a,
                    print_every=print_every*100)    
        if epoch % print_every == 0:
            losses.append(loss)
            print('{} - loss: {}'.format(epoch, loss))

        
def train(encoder, decoder, eoptim, doptim, criterion, question_ids, answer_ids, print_every=100):
    input_length = len(question_ids[0])
    dataset_size = len(idx_q)
    batch_count  = dataset_size//batch_size
    for batch_index in range(batch_count):
        #l,r = batch_index * batch_size, (batch_index + 1) * batch_size
        import numpy as np
        indices = np.arange(batch_size)
        np.random.shuffle(indices)
        indices = indices * batch_count
        
        question_id, answer_id = question_ids[indices], answer_ids[indices]
        _batch_size = len(question_id)
        if _batch_size != batch_size:
            print('breaking because batch sizes do not match')
            break

        data = Variable(torch.from_numpy(question_id).long().cuda().t())
        target = Variable(torch.from_numpy(answer_id).long().cuda().t())

        eoptim.zero_grad(), doptim.zero_grad()    
        initial_hidden = initial_state(batch_size, hidden_size).cuda(), initial_state(batch_size, hidden_size).cuda()
        
        encoder_output = encoder(data, initial_hidden, _batch_size)
        decoder_output = decoder(target, encoder_output, _batch_size)
        loss = 0
        for i in range(input_length):
            logits = F.log_softmax(decoder_output[i])
            loss += criterion(logits, target[i])    
            
        loss.backward()
        eoptim.step(), doptim.step()
        config.printsize = False
        
        if batch_index % print_every == 0:
            print('\t{} - loss: {}'.format(batch_index, loss.data[0]))
        
    return loss.data[0]

In [24]:
encoder = Encoder(vocab_size, hidden_size)
decoder = Decoder(vocab_size, hidden_size)

encoder.cuda()
decoder.cuda()

criterion = nn.NLLLoss()

eoptim = optim.SGD(encoder.parameters(), lr=0.1, momentum=0.1)
doptim = optim.SGD(decoder.parameters(), lr=0.1, momentum=0.1)

In [25]:
train_epochs(10, encoder, decoder, eoptim, doptim, criterion,)


  0%|          | 0/11 [00:00<?, ?it/s]
enc_inputs torch.Size([21, 128]) <class 'torch.cuda.LongTensor'>
enc_embeddings torch.Size([21, 128, 256]) <class 'torch.cuda.FloatTensor'>
enc_embeddings torch.Size([21, 128, 256]) <class 'torch.cuda.FloatTensor'>
hidden torch.Size([128, 256]) <class 'torch.cuda.FloatTensor'>
hidden torch.Size([128, 256]) <class 'torch.cuda.FloatTensor'>
GO torch.Size([128]) <class 'torch.cuda.LongTensor'>
	dec_input torch.Size([128]) <class 'torch.cuda.LongTensor'>
	dec_input_emb torch.Size([128, 256]) <class 'torch.cuda.FloatTensor'>
	topi torch.Size([128, 1]) <class 'torch.cuda.LongTensor'>
	dec_input torch.Size([128]) <class 'torch.cuda.LongTensor'>
	dec_input_emb torch.Size([128, 256]) <class 'torch.cuda.FloatTensor'>
	topi torch.Size([128, 1]) <class 'torch.cuda.LongTensor'>
	dec_input torch.Size([128]) <class 'torch.cuda.LongTensor'>
	dec_input_emb torch.Size([128, 256]) <class 'torch.cuda.FloatTensor'>
	topi torch.Size([128, 1]) <class 'torch.cuda.LongTensor'>
	dec_input torch.Size([128]) <class 'torch.cuda.LongTensor'>
	dec_input_emb torch.Size([128, 256]) <class 'torch.cuda.FloatTensor'>
	topi torch.Size([128, 1]) <class 'torch.cuda.LongTensor'>
	dec_input torch.Size([128]) <class 'torch.cuda.LongTensor'>
	dec_input_emb torch.Size([128, 256]) <class 'torch.cuda.FloatTensor'>
	topi torch.Size([128, 1]) <class 'torch.cuda.LongTensor'>
	dec_input torch.Size([128]) <class 'torch.cuda.LongTensor'>
	dec_input_emb torch.Size([128, 256]) <class 'torch.cuda.FloatTensor'>
	topi torch.Size([128, 1]) <class 'torch.cuda.LongTensor'>
	dec_input torch.Size([128]) <class 'torch.cuda.LongTensor'>
	dec_input_emb torch.Size([128, 256]) <class 'torch.cuda.FloatTensor'>
	topi torch.Size([128, 1]) <class 'torch.cuda.LongTensor'>
	dec_input torch.Size([128]) <class 'torch.cuda.LongTensor'>
	dec_input_emb torch.Size([128, 256]) <class 'torch.cuda.FloatTensor'>
	topi torch.Size([128, 1]) <class 'torch.cuda.LongTensor'>
	dec_input torch.Size([128]) <class 'torch.cuda.LongTensor'>
	dec_input_emb torch.Size([128, 256]) <class 'torch.cuda.FloatTensor'>
	topi torch.Size([128, 1]) <class 'torch.cuda.LongTensor'>
	dec_input torch.Size([128]) <class 'torch.cuda.LongTensor'>
	dec_input_emb torch.Size([128, 256]) <class 'torch.cuda.FloatTensor'>
	topi torch.Size([128, 1]) <class 'torch.cuda.LongTensor'>
	dec_input torch.Size([128]) <class 'torch.cuda.LongTensor'>
	dec_input_emb torch.Size([128, 256]) <class 'torch.cuda.FloatTensor'>
	topi torch.Size([128, 1]) <class 'torch.cuda.LongTensor'>
	dec_input torch.Size([128]) <class 'torch.cuda.LongTensor'>
	dec_input_emb torch.Size([128, 256]) <class 'torch.cuda.FloatTensor'>
	topi torch.Size([128, 1]) <class 'torch.cuda.LongTensor'>
	dec_input torch.Size([128]) <class 'torch.cuda.LongTensor'>
	dec_input_emb torch.Size([128, 256]) <class 'torch.cuda.FloatTensor'>
	topi torch.Size([128, 1]) <class 'torch.cuda.LongTensor'>
	dec_input torch.Size([128]) <class 'torch.cuda.LongTensor'>
	dec_input_emb torch.Size([128, 256]) <class 'torch.cuda.FloatTensor'>
	topi torch.Size([128, 1]) <class 'torch.cuda.LongTensor'>
	dec_input torch.Size([128]) <class 'torch.cuda.LongTensor'>
	dec_input_emb torch.Size([128, 256]) <class 'torch.cuda.FloatTensor'>
	topi torch.Size([128, 1]) <class 'torch.cuda.LongTensor'>
	dec_input torch.Size([128]) <class 'torch.cuda.LongTensor'>
	dec_input_emb torch.Size([128, 256]) <class 'torch.cuda.FloatTensor'>
	topi torch.Size([128, 1]) <class 'torch.cuda.LongTensor'>
	dec_input torch.Size([128]) <class 'torch.cuda.LongTensor'>
	dec_input_emb torch.Size([128, 256]) <class 'torch.cuda.FloatTensor'>
	topi torch.Size([128, 1]) <class 'torch.cuda.LongTensor'>
	dec_input torch.Size([128]) <class 'torch.cuda.LongTensor'>
	dec_input_emb torch.Size([128, 256]) <class 'torch.cuda.FloatTensor'>
	topi torch.Size([128, 1]) <class 'torch.cuda.LongTensor'>
	dec_input torch.Size([128]) <class 'torch.cuda.LongTensor'>
	dec_input_emb torch.Size([128, 256]) <class 'torch.cuda.FloatTensor'>
	topi torch.Size([128, 1]) <class 'torch.cuda.LongTensor'>
	dec_input torch.Size([128]) <class 'torch.cuda.LongTensor'>
	dec_input_emb torch.Size([128, 256]) <class 'torch.cuda.FloatTensor'>
	topi torch.Size([128, 1]) <class 'torch.cuda.LongTensor'>
	dec_input torch.Size([128]) <class 'torch.cuda.LongTensor'>
	dec_input_emb torch.Size([128, 256]) <class 'torch.cuda.FloatTensor'>
	topi torch.Size([128, 1]) <class 'torch.cuda.LongTensor'>
predicted_outputs torch.Size([21, 128, 256]) <class 'torch.cuda.FloatTensor'>
predicted_outputs torch.Size([2688, 6005]) <class 'torch.cuda.FloatTensor'>
predicted_outputs torch.Size([21, 128, 6005]) <class 'torch.cuda.FloatTensor'>
	0 - loss: 182.9229736328125
	100 - loss: 71.56849670410156
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-25-91808252fcff> in <module>()
----> 1 train_epochs(10, encoder, decoder, eoptim, doptim, criterion,)

<ipython-input-14-857c2bff2705> in train_epochs(epochs, encoder, decoder, eoptim, doptim, criterion, print_every)
      9     for epoch in tqdm(range(epochs+1)):
     10         loss = train(encoder, decoder, eoptim, doptim, criterion, idx_q, idx_a,
---> 11                     print_every=print_every*100)    
     12         if epoch % print_every == 0:
     13             losses.append(loss)

<ipython-input-14-857c2bff2705> in train(encoder, decoder, eoptim, doptim, criterion, question_ids, answer_ids, print_every)
     39 
     40         encoder_output = encoder(data, initial_hidden, _batch_size)
---> 41         decoder_output = decoder(target, encoder_output, _batch_size)
     42         loss = 0
     43         for i in range(input_length):

/home/paarulakan/environments/python/pytorch-py35/lib/python3.5/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    204 
    205     def __call__(self, *input, **kwargs):
--> 206         result = self.forward(*input, **kwargs)
    207         for hook in self._forward_hooks.values():
    208             hook_result = hook(self, input, result)

<ipython-input-23-30da90a7f90a> in forward(self, outputs, hidden, batch_size)
     63             psize('\tdec_input_emb', dec_input_emb)
     64 
---> 65             hidden, cell_state = self.decode(dec_input_emb, (hidden, cell_state))
     66             predicted_outputs.append(hidden)
     67 

/home/paarulakan/environments/python/pytorch-py35/lib/python3.5/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    204 
    205     def __call__(self, *input, **kwargs):
--> 206         result = self.forward(*input, **kwargs)
    207         for hook in self._forward_hooks.values():
    208             hook_result = hook(self, input, result)

/home/paarulakan/environments/python/pytorch-py35/lib/python3.5/site-packages/torch/nn/modules/rnn.py in forward(self, input, hx)
    497             input, hx,
    498             self.weight_ih, self.weight_hh,
--> 499             self.bias_ih, self.bias_hh,
    500         )
    501 

/home/paarulakan/environments/python/pytorch-py35/lib/python3.5/site-packages/torch/nn/_functions/rnn.py in LSTMCell(input, hidden, w_ih, w_hh, b_ih, b_hh)
     25     ingate = F.sigmoid(ingate)
     26     forgetgate = F.sigmoid(forgetgate)
---> 27     cellgate = F.tanh(cellgate)
     28     outgate = F.sigmoid(outgate)
     29 

/home/paarulakan/environments/python/pytorch-py35/lib/python3.5/site-packages/torch/nn/functional.py in tanh(input)
    420 
    421 def tanh(input):
--> 422     return torch.tanh(input)
    423 
    424 

/home/paarulakan/environments/python/pytorch-py35/lib/python3.5/site-packages/torch/autograd/variable.py in tanh(self)
    346 
    347     def tanh(self):
--> 348         return Tanh()(self)
    349 
    350     def tanh_(self):

/home/paarulakan/environments/python/pytorch-py35/lib/python3.5/site-packages/torch/autograd/_functions/pointwise.py in forward(self, i)
     46             result = i.tanh_()
     47         else:
---> 48             result = i.tanh()
     49         self.save_for_backward(result)
     50         return result

KeyboardInterrupt: 

In [433]:
torch.save(encoder.state_dict(), 'graph.pytorch.encoder.pth')
torch.save(decoder.state_dict(), 'graph.pytorch.decoder.pth')

Test


In [446]:
encoder_test = Encoder(vocab_size, hidden_size)
decoder_test = Decoder(vocab_size, hidden_size)
encoder_test.cuda()
decoder_test.cuda()
encoder_test.load_state_dict(torch.load('graph.pytorch.encoder.pth'))
decoder_test.load_state_dict(torch.load('graph.pytorch.decoder.pth'))

In [450]:
batch = 0
l, r = batch * B, (batch + 1) * B
test_q, test_a = idx_q[0], idx_a[0]

encoder_test.eval()
decoder_test.eval()

test_q = Variable(torch.from_numpy(test_q).long().cuda())
test_a = Variable(torch.from_numpy(test_a).long().cuda())

config.printsize = True
batch_size = 1
hidden = initial_state(batch_size, hidden_size).cuda(), initial_state(batch_size, hidden_size).cuda()
predictions = decoder_test.predict(test_a, encoder_test(test_q, hidden, 1), 1)
predictions = predictions.squeeze(1)
predictions = F.log_softmax(predictions).max(1)[1].squeeze(1)


enc_inputs torch.Size([21]) <class 'torch.cuda.LongTensor'>
enc_embeddings torch.Size([21, 256]) <class 'torch.cuda.FloatTensor'>
enc_embeddings torch.Size([21, 1, 256]) <class 'torch.cuda.FloatTensor'>
hidden torch.Size([1, 256]) <class 'torch.cuda.FloatTensor'>
hidden torch.Size([1, 256]) <class 'torch.cuda.FloatTensor'>
GO torch.Size([1]) <class 'torch.cuda.LongTensor'>
GO_emd torch.Size([1, 256]) <class 'torch.cuda.FloatTensor'>
predicted_outputs torch.Size([21, 256]) <class 'torch.cuda.FloatTensor'>
predicted_outputs torch.Size([21, 6005]) <class 'torch.cuda.FloatTensor'>

In [216]:
def arr2sent(arr):
    return ' '.join([i2w[item] for item in arr])

In [451]:
print(predictions)
print(arr2sent(predictions.cpu().data.numpy()))
print(arr2sent(test_a.cpu().data.numpy()))


Variable containing:
  131
  289
   90
 5149
   75
   36
   46
   25
   25
   25
  141
   91
  213
   25
   25
   60
   20
  273
   14
   14
  122
[torch.cuda.LongTensor of size 21 (GPU 0)]

yeah cool great parade she all from me me me does need things me me thats not makes that that oh
yeah dude i would definitely consider a daniel unk super reliable and they are just bad ass EOS _ _ _

In [6]:
def train_epochs_(epochs, encoder, decoder, eoptim, doptim, criterion, print_every=1, validate_every=10):
    encoder.train()
    decoder.train()
    losses = []
    config.printsize = True

    for epoch in tqdm(range(epochs+1)):
        print('--- epoch: {} ---'.format(epoch))
        loss = train(encoder, decoder, eoptim, doptim, criterion, idx_q, idx_a,
                    print_every=print_every*1000)    
        if epoch % print_every == 0:
            losses.append(loss)
            print('{} - loss: {}'.format(epoch, loss))

        torch.save(encoder.state_dict(), 'graph.pytorch.encoder.pth')
        torch.save(decoder.state_dict(), 'graph.pytorch.decoder.pth')

        encoder_test.load_state_dict(torch.load('graph.pytorch.encoder.pth'))
        decoder_test.load_state_dict(torch.load('graph.pytorch.decoder.pth'))

        if epoch % validate_every == 0:
            test_q, test_a = idx_q[-1], idx_a[-1]

            encoder_test.eval()
            decoder_test.eval()

            test_q = Variable(torch.from_numpy(test_q).long().cuda())
            test_a = Variable(torch.from_numpy(test_a).long().cuda())

            #config.printsize = True
            _batch_size = 1
            hidden = initial_state(_batch_size, hidden_size).cuda(), initial_state(_batch_size, hidden_size).cuda()
            predictions = decoder_test.predict(test_a, encoder_test(test_q, hidden, _batch_size), _batch_size)
            predictions = predictions.squeeze(1)
            predictions = F.log_softmax(predictions).max(1)[1].squeeze(1)
            
            predictions_ = decoder_test(test_a, encoder_test(test_q, hidden, _batch_size), _batch_size)
            predictions_ = predictions_.squeeze(1)
            predictions_ = F.log_softmax(predictions_).max(1)[1].squeeze(1)

            print(arr2sent(predictions.cpu().data.numpy()))
            print(arr2sent(predictions_.cpu().data.numpy()))
            print(arr2sent(test_a.cpu().data.numpy()))