In [15]:
import sys
import torch
from torch import optim

from data_utils import load_dialog_task, vectorize_data, load_candidates, vectorize_candidates, tokenize
from six.moves import range, reduce
from itertools import chain
import numpy as np
import os
from sklearn import metrics
from torch.autograd import Variable as V

from model.mem_cnn_sim import MemCnnSim

In [2]:
# prepare the data 

def init(data_dir, task_id, OOV=False):
    # load candidates
    candidates, candid2indx = load_candidates(
        data_dir, task_id)
    n_cand = len(candidates)
    print("Candidate Size", n_cand)
    indx2candid = dict(
        (candid2indx[key], key) for key in candid2indx)

    # load task data
    train_data, test_data, val_data = load_dialog_task(
        data_dir, task_id, candid2indx, OOV)
    data = train_data + test_data + val_data

    # build parameters
    word_idx, sentence_size, \
    candidate_sentence_size, memory_size, \
    vocab_size = build_vocab(data, candidates)

    candidates_vec = vectorize_candidates(
        candidates, word_idx, candidate_sentence_size)

    return candid2indx, \
           indx2candid, \
           candidates_vec, \
           word_idx, \
           sentence_size, \
           candidate_sentence_size, \
           memory_size, \
           vocab_size, \
           train_data, test_data, val_data


def build_vocab(data, candidates, memory_size=50):
    vocab = reduce(lambda x, y: x | y, (set(
        list(chain.from_iterable(s)) + q) for s, q, a in data))
    vocab |= reduce(lambda x, y: x | y, (set(candidate)
                                         for candidate in candidates))
    vocab = sorted(vocab)
    word_idx = dict((c, i + 1) for i, c in enumerate(vocab))

    max_story_size = max(map(len, (s for s, _, _ in data)))
    mean_story_size = int(np.mean([len(s) for s, _, _ in data]))
    sentence_size = max(map(len, chain.from_iterable(s for s, _, _ in data)))
    candidate_sentence_size = max(map(len, candidates))
    query_size = max(map(len, (q for _, q, _ in data)))
    memory_size = min(memory_size, max_story_size)
    vocab_size = len(word_idx) + 1  # +1 for nil word
    sentence_size = max(query_size, sentence_size)  # queries and story sentences share one padded length
    # params
    print("vocab size:", vocab_size)
    print("Longest sentence length", sentence_size)
    print("Longest candidate sentence length", candidate_sentence_size)
    print("Longest story length", max_story_size)
    print("Average story length", mean_story_size)

    return word_idx, \
           sentence_size, \
           candidate_sentence_size, \
           memory_size, \
           vocab_size
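
For reference, a minimal sketch of the (story, query, answer) layout that build_vocab consumes; the toy tokens below are made up for illustration and are not from the bAbI files:

# each example is (story, query, answer): story is a list of tokenized sentences,
# query is the tokenized user utterance, answer is the index of the correct candidate
toy_data = [([['hello', 'world'], ['good', 'morning']], ['hi', 'there'], 0)]
toy_candidates = [['api_call', 'rome'], ['sure', 'thing']]
# build_vocab(toy_data, toy_candidates) would return word_idx (token -> index, starting at 1),
# the longest sentence/candidate lengths, the clipped memory size, and vocab_size (+1 for the nil word)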

In [3]:
def eval(utter_batch, memory_batch, answer_batch, dialog_idx, mem_cnn_sim, cuda=False):
    # cands_tensor and indx2candid are taken from the surrounding notebook scope
    mem_cnn_sim.eval()

    total_loss = []
    preds = []
    for start, end in dialog_idx:

        loss_per_dialog = []

        for j in range(start, end + 1):

            memory = V(torch.from_numpy(memory_batch[j])).unsqueeze(0)
            utter = V(torch.from_numpy(utter_batch[j])).unsqueeze(0)

            # +1 for the gold candidate, -1 for every other candidate
            flag = -1 * torch.ones(cands_tensor.size(0))
            flag[answer_batch[j]] = 1
            flag = V(flag)

            if cuda:
                memory = transfer_to_gpu(memory)
                utter = transfer_to_gpu(utter)
                flag = transfer_to_gpu(flag, dtype=torch.FloatTensor)

            context, cand_ = mem_cnn_sim(utter, memory, cands_tensor)
            loss = mem_cnn_sim.loss_op(context, cand_, flag)
            pred = mem_cnn_sim.predict(context, cand_)
            preds.append(pred.data[0])

            print('pred: {}, loss: {}'.format(indx2candid[pred.data[0]], loss.data[0]))

            loss_per_dialog.append(loss.data[0])

        total_loss += loss_per_dialog

    accuracy = metrics.accuracy_score(answer_batch[:len(preds)], preds)
    print()
    print('Validation accuracy: {}'.format(accuracy))
    print('Validation loss: {}'.format(sum(total_loss)))
    input()  # pause so the per-dialog output can be inspected

    return accuracy

In [4]:
def transfer_to_gpu(tensor, dtype=torch.LongTensor):
    tensor_cuda = dtype(tensor.size()).cuda()
    tensor_cuda = V(tensor_cuda)
    tensor_cuda.data.copy_(tensor.data)
    return tensor_cuda
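
# Usage note, mirroring how transfer_to_gpu is called later in this notebook:
# token-index tensors keep the default LongTensor dtype, while the ±1 targets for the
# cosine loss need dtype=torch.FloatTensor, e.g.
#     memory = transfer_to_gpu(memory)
#     flag = transfer_to_gpu(flag, dtype=torch.FloatTensor)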


def save_checkpoint(state, filename='checkpoint.pth.tar'):
    torch.save(state, filename)


def load_checkpoint(model, optimizer, path_to_model):
    if os.path.isfile(path_to_model):
        print("=> loading checkpoint '{}'".format(path_to_model))
        checkpoint = torch.load(path_to_model)
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("=> loaded checkpoint '{}' (epoch {})"
              .format(path_to_model, checkpoint['epoch']))
    else:
        print("=> no checkpoint found at '{}'".format(path_to_model))


def load_model(model, model_dir):
    load_checkpoint(model, model.optimizer, model_dir + 'best_model')
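
A hedged sketch of the checkpoint dict these helpers expect; the keys mirror what load_checkpoint reads back, while the epoch value and best-model filename are illustrative assumptions:

# save_checkpoint({
#     'epoch': epoch,
#     'state_dict': mem_cnn_sim.state_dict(),
#     'optimizer': mem_cnn_sim.optimizer.state_dict(),
# }, filename=model_dir + 'best_model')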

In [5]:
def test_model(mem_cnn_sim):
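    # sanity check: repeatedly fit one hard-coded (utterance, memory, candidates) example
    # and print the loss and prediction at each optimization step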

    for i in range(20):
        utter = V(torch.LongTensor([1,1,1])).unsqueeze(0)
        memory = V(torch.LongTensor([[1,2,3], [4,5,6]])).unsqueeze(0)
        cand = V(torch.LongTensor([[7,8,9], [10,11,12], [13,14,15], [16,17,18]]))
        flag = V(torch.FloatTensor([0,0,0,1]))

        context, cand_ = mem_cnn_sim(utter, memory, cand)
        loss = mem_cnn_sim.loss_op(context, cand_, flag)
        pred = mem_cnn_sim.predict(context, cand_)
        mem_cnn_sim.optimize(loss)

        print('loss: {}, pred: {}'.format(loss.data[0], pred.data[0]))

In [6]:
data_dir = "data/dialog-bAbI-tasks/"
task_id = 6
epochs = 10
model_dir = "task" + str(task_id) + "_model/"
if not os.path.exists(model_dir):
    os.makedirs(model_dir)
test_ = False

cuda = torch.cuda.is_available()
if cuda: print('Cuda is available.')

candid2indx, \
indx2candid, \
candidates_vec, \
word_idx, \
sentence_size, \
candidate_sentence_size, \
memory_size, \
vocab_size, \
train_data, test_data, val_data = init(data_dir, task_id)

trainS, trainQ, trainA, dialog_idx = vectorize_data(
    train_data, word_idx, sentence_size, memory_size)
valS, valQ, valA, dialog_idx_val = vectorize_data(
    val_data, word_idx, sentence_size, memory_size)
n_train = len(trainS)
n_val = len(valS)

print("Training Size", n_train)
print("Validation Size", n_val)


/home/chen/anaconda3/envs/pytorch0.2/lib/python3.6/re.py:212: FutureWarning: split() requires a non-empty pattern match.
  return _compile(pattern, flags).split(string, maxsplit)
Candidate Size 2407
vocab size: 1993
Longest sentence length 29
Longest candidate sentence length 27
Longest story length 812
Average story length 32
Training Size 14404
Validation Size 4159

In [7]:
param = {
    'hops': 3,                      # number of memory hops
    'vocab_size': vocab_size,
    'embedding_size': 80,
    'num_filters': 20,              # number of convolutional filters
    'cand_vocab_size': vocab_size,
    'max_grad_norm': 40.0           # gradient clipping threshold
}

mem_cnn_sim = MemCnnSim(param)

if test_:
    test_model(mem_cnn_sim)
    input()

In [8]:
best_validation_accuracy = 0
time = []

cands_tensor = V(torch.from_numpy(candidates_vec))
num_cand = cands_tensor.size(0)
num_dialog = len(dialog_idx)

if cuda:
    cands_tensor = transfer_to_gpu(cands_tensor)

In [16]:
mem_cnn_sim.optimizer = optim.Adam(mem_cnn_sim.parameters(), lr=0.01)
mem_cnn_sim.train()


Out[16]:
MemCnnSim (
  (memn2n): CNN (
    (embedding): Embedding(1993, 80)
    (cnn): Conv2d(1, 20, kernel_size=(2, 80), stride=(1, 1))
    (l1): Linear (20 -> 80)
    (l2): Linear (80 -> 80)
    (l3): Linear (80 -> 80)
  )
  (cnn): CNN (
    (embedding): Embedding(1993, 80)
    (cnn): Conv2d(1, 20, kernel_size=(2, 80), stride=(1, 1))
    (l1): Linear (20 -> 80)
    (l2): Linear (80 -> 80)
    (l3): Linear (80 -> 80)
  )
  (criterion): CosineEmbeddingLoss (
  )
)

In [49]:
# train on the first 20 utterances of the training set
for k in range(20):
    ans = trainA[k]

    memory = V(torch.from_numpy(trainS[k])).unsqueeze(0)
    utter = V(torch.from_numpy(trainQ[k])).unsqueeze(0)

    flag = -1 * torch.ones(num_cand)
    flag[ans] = 1
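    # target convention for the CosineEmbeddingLoss criterion: +1 marks the gold candidate
    # (pull its encoding toward the context), -1 marks every other candidate (push it away)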

    flag = V(flag)

    if cuda:
        mem_cnn_sim.cuda()

        memory = transfer_to_gpu(memory)
        utter = transfer_to_gpu(utter)
        flag = transfer_to_gpu(flag, dtype=torch.FloatTensor)

    context, cand_ = mem_cnn_sim(utter, memory, cands_tensor)
    loss = mem_cnn_sim.loss_op(context, cand_, flag)
    mem_cnn_sim.optimize(loss)

In [50]:
context, cand_ = mem_cnn_sim(utter, memory, cands_tensor)
pred = mem_cnn_sim.predict(cand_=cand_, context=context)
print(indx2candid[pred.data[0]])
print(pred.data[0])
print(loss.data[0])


The price range at curry_garden is expensive .
1306
0.00037769321352243423

In [40]:
print(train_data[k])
print(trainA[k])
print(trainQ[k])
print(trainS[k])


([], ['<silence>'], 1603)
1603
[812   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0]
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]

In [ ]: