In [1]:
import json

from local_settings import settings, datautils

from datautils.vocabulary import Vocabulary

import pandas as pd
import numpy as np

import torch
from torch import FloatTensor
from torch import nn
from torch.autograd import Variable
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from tqdm import tqdm, tqdm_notebook

Class Definitions

Data Model (a usage sketch follows the list):

  • Raw data
  • Vectorizer
  • Vectorized data
  • Data generator

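As a rough sketch (using the classes and helper defined in the cells below), these pieces chain together like this:

raw_data = RawSurnames().get_data()                                 # raw data
vectorizer, vec_train = SurnamesVectorizer.fit_transform(raw_data)  # vectorizer + vectorized data
batch_func = make_generator(vec_train, batch_size=16)               # data generator
for batch in batch_func():
    pass  # each batch is a dict with x_surnames, x_nationalities, y_surnames, x_lengths
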
In [46]:
class RawSurnames(object):
    def __init__(self, data_path=settings.SURNAMES_CSV, delimiter=","):
        self.data = pd.read_csv(data_path, delimiter=delimiter)

    def get_data(self, filter_to_nationality=None):
        if filter_to_nationality is not None:
            return self.data[self.data.nationality.isin(filter_to_nationality)]
        return self.data

# vectorizer

class SurnamesVectorizer(object):
    def __init__(self, surname_vocab, nationality_vocab, max_seq_length):
        self.surname_vocab = surname_vocab
        self.nationality_vocab = nationality_vocab
        self.max_seq_length = max_seq_length
        
    def save(self, filename):
        vec_dict = {"surname_vocab": self.surname_vocab.get_serializable_contents(),
                    "nationality_vocab": self.nationality_vocab.get_serializable_contents(),
                    'max_seq_length': self.max_seq_length}

        with open(filename, "w") as fp:
            json.dump(vec_dict, fp)
        
    @classmethod
    def load(cls, filename):
        with open(filename, "r") as fp:
            vec_dict = json.load(fp)

        vec_dict["surname_vocab"] = Vocabulary.deserialize_from_contents(vec_dict["surname_vocab"])
        vec_dict["nationality_vocab"] = Vocabulary.deserialize_from_contents(vec_dict["nationality_vocab"])
        return cls(**vec_dict)

    @classmethod
    def fit(cls, surname_df):
        """
        """
        surname_vocab = Vocabulary(use_unks=False,
                                   use_mask=True,
                                   use_start_end=True,
                                   start_token=settings.START_TOKEN,
                                   end_token=settings.END_TOKEN)

        nationality_vocab = Vocabulary(use_unks=False, use_start_end=False, use_mask=False)

        max_seq_length = 0
        for index, row in surname_df.iterrows():
            surname_vocab.add_many(row.surname)
            nationality_vocab.add(row.nationality)

            if len(row.surname) > max_seq_length:
                max_seq_length = len(row.surname)
        max_seq_length = max_seq_length + 2  # +2 for the start and end tokens

        return cls(surname_vocab, nationality_vocab, max_seq_length)

    @classmethod
    def fit_transform(cls, surname_df, split='train'):
        vectorizer = cls.fit(surname_df)
        return vectorizer, vectorizer.transform(surname_df, split)

    def transform(self, surname_df, split='train'):

        df = surname_df[surname_df.split == split].reset_index()
        n_data = len(df)
        
        # inputs are padded with the mask index (0); targets default to the
        # ignore index so padded positions are skipped by the loss
        x_surnames = np.zeros((n_data, self.max_seq_length), dtype=np.int64)
        y_surnames = np.ones((n_data, self.max_seq_length), dtype=np.int64) * settings.IGNORE_INDEX_VALUE
        x_nationalities = np.zeros(n_data, dtype=np.int64)

        for index, row in df.iterrows():
            vectorized_surname = list(self.surname_vocab.map(row.surname, 
                                                             include_start_end=True))
            # next-character prediction: targets are the inputs shifted by one
            x_part = vectorized_surname[:-1]
            y_part = vectorized_surname[1:]
            x_surnames[index, :len(x_part)] = x_part
            y_surnames[index, :len(y_part)] = y_part
            x_nationalities[index] = self.nationality_vocab[row.nationality]

        return VectorizedSurnames(x_surnames, x_nationalities, y_surnames)

# vec data

class VectorizedSurnames(Dataset):
    def __init__(self, x_surnames, x_nationalities, y_surnames):
        self.x_surnames = x_surnames
        self.x_nationalities = x_nationalities
        self.y_surnames = y_surnames

    def __len__(self):
        return len(self.x_surnames)

    def __getitem__(self, index):
        # x_lengths counts nonzero entries; index 0 is the mask/padding token
        return {'x_surnames': self.x_surnames[index],
                'x_nationalities': self.x_nationalities[index],
                'y_surnames': self.y_surnames[index],
                'x_lengths': len(self.x_surnames[index].nonzero()[0])}

# data generator

def make_generator(vectorized_data, batch_size, num_batches=-1,
                   num_workers=0, volatile_mode=False,
                   strict_batching=True):

    loaded_data = DataLoader(vectorized_data, batch_size=batch_size, 
                             shuffle=True, num_workers=num_workers)

    def inner_func(num_batches=num_batches, 
                   volatile_mode=volatile_mode):

        for batch_index, batch in enumerate(loaded_data):
            out = {}
            current_batch_size = list(batch.values())[0].size(0)
            # optionally drop the ragged final batch
            if current_batch_size < batch_size and strict_batching:
                break
            for key, value in batch.items():
                if not isinstance(value, Variable):
                    # volatile is a constructor flag, not a method, in this
                    # version of PyTorch; it disables autograd for inference
                    value = Variable(value, volatile=volatile_mode)
                if settings.CUDA:
                    value = value.cuda()
                out[key] = value
            yield out

            if num_batches > 0 and batch_index + 1 >= num_batches:
                break

    return inner_func

Class definitions for the model
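
ExplicitRNN below unrolls a vanilla (Elman) RNN by hand: at each time step it computes h_t = tanh(x_t @ W_in2hid + h_{t-1} @ W_hid2hid + b_hid). CharNN embeds the input characters, uses a learned nationality embedding as the initial hidden state (this is how generation is conditioned on nationality), and projects each hidden state through a linear layer onto the character vocabulary.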


In [3]:
def new_parameter(*size):
    out = Parameter(FloatTensor(*size))
    # Xavier/Glorot-normal initialization, in place
    torch.nn.init.xavier_normal(out)
    return out

class ExplicitRNN(nn.Module):
    def __init__(self, input_size, hidden_size, expect_batch_on_dim0=False):
        super(ExplicitRNN, self).__init__()
        self.W_in2hid = new_parameter(input_size, hidden_size)
        self.W_hid2hid = new_parameter(hidden_size, hidden_size)
            
        self.b_hid = new_parameter(1, hidden_size)
        
        self.hidden_size = hidden_size

        self.expect_batch_on_dim0 = expect_batch_on_dim0
    
    def _compute_next_hidden(self, x, h):
        return F.tanh(x.matmul(self.W_in2hid) + 
                      h.matmul(self.W_hid2hid) + 
                      self.b_hid)

    def forward(self, x_in, hid_t=None):
        if self.expect_batch_on_dim0:
            batch_size, seq_size, feat_size = x_in.size()
            x_in = x_in.permute(1, 0, 2)
        else:
            seq_size, batch_size, feat_size = x_in.size()

        hiddens = []
        if hid_t is None:
            hid_t = Variable(torch.zeros((batch_size, self.hidden_size)))
        
        if settings.CUDA:
            hid_t = hid_t.cuda()
            
        for t in range(seq_size):
            x_t = x_in[t]
            hid_t = self._compute_next_hidden(x_t, hid_t)
            
            hiddens.append(hid_t)
        hiddens = torch.stack(hiddens)

        if self.expect_batch_on_dim0:
            hiddens = hiddens.permute(1, 0, 2)

        return hiddens

    
class CharNN(nn.Module):
    def __init__(self, embedding_size, in_vocab_size, out_vocab_size, hidden_size, num_conditioning_states,
                 expect_batch_on_dim0=False):
        super(CharNN, self).__init__()
        
        self.emb = nn.Embedding(embedding_dim=embedding_size, 
                                num_embeddings=in_vocab_size, 
                                padding_idx=0)
        self.conditional_emb = nn.Embedding(embedding_dim=hidden_size, 
                                            num_embeddings=num_conditioning_states)
        self.fc = nn.Linear(in_features=hidden_size, out_features=out_vocab_size)
        self.rnn = ExplicitRNN(input_size=embedding_size, hidden_size=hidden_size, 
                               expect_batch_on_dim0=expect_batch_on_dim0)
    
    def forward(self, x_in, state_in, x_lengths=None, apply_softmax=False):
        # x_lengths is accepted for interface compatibility but is unused here
        x_in = self.emb(x_in)
        state_in = self.conditional_emb(state_in)
        y_out = self.rnn(x_in, state_in)

        # flatten to (batch*seq, hidden) so the linear layer applies per time step
        dim0, dim1, dim2 = y_out.size()
        y_out = y_out.contiguous().view(-1, dim2)

        y_out = self.fc(y_out)

        # optionally apply the softmax
        if apply_softmax:
            y_out = F.softmax(y_out)

        y_out = y_out.view(dim0, dim1, -1)
        
        return y_out
    
def normalize_sizes(net_output, y_true):
    net_output = net_output.cpu()
    y_true = y_true.cpu()
    # flatten (batch, seq, vocab) predictions and (batch, seq) targets
    # so every time step becomes one classification example
    if len(net_output.size()) == 3:
        net_output = net_output.contiguous().view(-1, net_output.size(2))
    if len(y_true.size()) == 2:
        y_true = y_true.contiguous().view(-1)
    return net_output, y_true
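
# Quick shape check for normalize_sizes (hypothetical sizes, for illustration):
#   net_output of size (16, 12, 30) -> (192, 30)   # (batch, seq, vocab)
#   y_true of size (16, 12)         -> (192,)      # (batch, seq)
# so F.cross_entropy can score every time step as one classification.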

def sequence_loss(net_output, y_true, loss_func=F.cross_entropy):
    net_output, y_true = normalize_sizes(net_output, y_true)
    return loss_func(net_output, y_true, ignore_index=settings.IGNORE_INDEX_VALUE)

def compute_accuracy(yhat, ytrue):
    yhat, ytrue = normalize_sizes(yhat, ytrue)
    _, yhat_indices = yhat.max(dim=1)
    n_correct = torch.eq(yhat_indices, ytrue).sum().data.numpy()[0]
    return n_correct / len(yhat_indices) * 100

def training_loop(net, datagen_func, optimizer, bar=None):
    if bar is None:
        bar = tqdm(position=2)
    accs = []
    for data_dictionary in datagen_func():
        net.zero_grad()
        optimizer.zero_grad()
        
        yhat = net(data_dictionary['x_surnames'],
                   data_dictionary['x_nationalities'], 
                   data_dictionary['x_lengths'])
        loss = sequence_loss(yhat, data_dictionary['y_surnames'])
        accs.append(compute_accuracy(yhat, data_dictionary['y_surnames']))
        
        bar.update(1)
        bar.set_postfix(loss=loss.cpu().data.numpy()[0], 
                        accuracy="{:0.2f}".format(np.mean(accs)))
        
        loss.backward()
        optimizer.step()
          
def val_loop(net, datagen_func, bar=None):
    if bar is None:
        bar = tqdm(position=1)
    accs = []
    for data_dictionary in datagen_func():
        yhat = net(data_dictionary['x_surnames'],
                   data_dictionary['x_nationalities'], 
                   data_dictionary['x_lengths'], apply_softmax=True)
        accs.append(compute_accuracy(yhat, data_dictionary['y_surnames']))
        bar.update(1)
        bar.set_postfix(accuracy="{:0.2f}".format(np.mean(accs)))

In [4]:
def sample(emb, rnn, fc, h_t=None, idx_t=None, n=20, temp=1):
    hiddens = [h_t]
    indices = [idx_t]
    out_dists = []
    
    for t in range(n):
        x_t = emb(idx_t)
        h_t = rnn._compute_next_hidden(x_t, h_t)
        
        y_t = fc(h_t)
        # temperature-scaled sampling: temp < 1 sharpens the distribution,
        # temp > 1 flattens it toward uniform
        y_t = F.softmax(y_t / temp)
        idx_t = torch.multinomial(y_t, 1)[:, 0]

        hiddens.append(h_t)
        indices.append(idx_t)
        out_dists.append(y_t)
     
    indices = torch.stack(indices).squeeze().permute(1, 0)
    return indices
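
# Temperature intuition (arbitrary example, softmax over logits [2.0, 1.0, 0.1]):
#   temp = 1.0 -> ~[0.66, 0.24, 0.10]
#   temp = 0.5 -> ~[0.86, 0.12, 0.02]  (sharper, less random)
#   temp = 2.0 -> ~[0.50, 0.30, 0.19]  (flatter, more random)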

def long_variable(indices):
    out = Variable(torch.LongTensor(indices))
    if settings.CUDA:
        out = out.cuda()
    return out

def make_initial_hidden(batch_size, hidden_size):
    out = Variable(torch.ones(batch_size, hidden_size))
    if settings.CUDA:
        out = out.cuda()
    return out

def make_initial_x(batch_size, vectorizer):
    out = Variable(torch.ones(batch_size) * vectorizer.surname_vocab.start_index).long()
    if settings.CUDA:
        out = out.cuda()
    return out

def decode_one(vectorizer, seq):
    out = []
    for i in seq:
        if vectorizer.surname_vocab.start_index == i:
            continue
        if vectorizer.surname_vocab.end_index == i:
            return ''.join(out)
        out.append(vectorizer.surname_vocab.lookup(i))
    return ''.join(out)
            
def decode_matrix(vectorizer, mat):
    mat = mat.cpu().data.numpy()
    return [decode_one(vectorizer, mat[i]) for i in range(len(mat))]

def n_random_nationalities(n):
    keys = np.random.choice(list(vectorizer.nationality_vocab.keys()), size=n, replace=True)
    indices = long_variable([vectorizer.nationality_vocab[key] for key in keys])
    return keys, indices

def sample_n(n=10, temp=0.8):
    init_names, init_vector = n_random_nationalities(n)
    init_vector = net.conditional_emb(init_vector)
    samples = decode_matrix(vectorizer, 
                            sample(net.emb, net.rnn, net.fc, 
                                   init_vector, 
                                   make_initial_x(n, vectorizer),
                                   temp=temp))
    return list(zip(init_names, samples))

def sample_n_for_nationality(nationality, n=10, temp=0.8):
    assert nationality in vectorizer.nationality_vocab.keys(), 'not a nationality we trained on'
    keys = [nationality] * n
    init_vector = long_variable([vectorizer.nationality_vocab[key] for key in keys])
    init_vector = net.conditional_emb(init_vector)
    samples = decode_matrix(vectorizer,
                            sample(net.emb, net.rnn, net.fc,
                                   init_vector,
                                   make_initial_x(n, vectorizer),
                                   temp=temp))
    return list(zip(keys, samples))

Make, Train, and Eval
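
This section loads the saved vectorizer and model weights from the zoo when they exist (otherwise it fits a new vectorizer and starts from a freshly initialized network), builds the train and test batch generators, runs the training loop, and samples surnames conditioned on nationality.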


In [5]:
from settings import ZOO
import os

batch_size = 16

raw_data = RawSurnames().get_data()

zoo_info = ZOO.charnn_surname_conditioned_predicter

if os.path.exists(zoo_info['vocab']):
    vectorizer = SurnamesVectorizer.load(zoo_info['vocab'])
    print("Loading vectorizer!")
else:
    vectorizer = SurnamesVectorizer.fit(raw_data)
    print("Creating a new vectorizer.")
    
vec_train = vectorizer.transform(raw_data, split='train')
vec_test = vectorizer.transform(raw_data, split='test')

train_data_func = make_generator(vec_train, batch_size=batch_size)
test_data_func = make_generator(vec_test, batch_size=batch_size)
    
parameters = dict(zoo_info['parameters'])    
parameters['in_vocab_size'] = len(vectorizer.surname_vocab)
parameters['out_vocab_size'] = len(vectorizer.surname_vocab)
parameters['expect_batch_on_dim0'] = True
parameters['num_conditioning_states'] = len(vectorizer.nationality_vocab)

net = CharNN(**parameters)

if os.path.exists(zoo_info['filename']):
    print("Loading state dict!")
    net.load_state_dict(torch.load(zoo_info['filename'], map_location=lambda storage, loc: storage))
else:
    print("Using newly initiated network!")

if settings.CUDA:
    print("CUDA mode enabled")
    net = net.cuda()
else:
    print("CUDA mode not enabled")
    net = net.cpu()


Loading vectorizer!
Loading state dict!
CUDA mode enabled

In [41]:
sample_n_for_nationality('russian')


Out[41]:
[('russian', 'Kallemanov'),
 ('russian', 'Valsak'),
 ('russian', 'Jukyachki'),
 ('russian', 'Avitov'),
 ('russian', 'Avlader'),
 ('russian', 'Tulakov'),
 ('russian', 'Morojanov'),
 ('russian', 'Vitsyansky'),
 ('russian', 'Yankovsky'),
 ('russian', 'Agashno')]

In [43]:
sample_n()


Out[43]:
[('russian', 'Balakin'),
 ('spanish', 'Zistili'),
 ('scottish', 'Grones'),
 ('english', 'Tompen'),
 ('japanese', 'Iyof'),
 ('portuguese', 'Cloza'),
 ('japanese', 'Iukatamo'),
 ('japanese', 'Maneihama'),
 ('polish', 'Bostaid'),
 ('vietnamese', 'oha')]

In [44]:
# Train
n_epochs = 100
optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)
bar = tqdm_notebook(total=n_epochs, position=0)
valbar = tqdm_notebook(position=2)
trainbar = tqdm_notebook(position=3)

try:
    
    for _ in range(n_epochs):

        net.train(False)
        val_loop(net, test_data_func, bar=valbar)
        net.train(True)
        training_loop(net, train_data_func, optimizer, bar=trainbar)
        init_names, init_vector = n_random_nationalities(2)
        init_vector = net.conditional_emb(init_vector)
        samples = decode_matrix(vectorizer, 
                                sample(net.emb, net.rnn, net.fc, 
                                       init_vector, 
                                       make_initial_x(2, vectorizer),
                                       temp=0.8))
        
        bar.update(1)
        postfix = dict(zip(init_names, samples))
        bar.set_postfix(**postfix)

    net.train(False)
    val_loop(net, test_data_func, valbar)
except KeyboardInterrupt:
    print("...")


...

In [59]:
sample_n(30, 0.9)


Out[59]:
[('portuguese', 'Rusa'),
 ('polish', 'Bamidev'),
 ('vietnamese', 'Shan'),
 ('russian', 'Dagarev'),
 ('scottish', 'Weinell'),
 ('italian', 'Valvallo'),
 ('italian', 'Vakhama'),
 ('vietnamese', 'Tran'),
 ('spanish', 'Bebrud'),
 ('italian', 'Firro'),
 ('czech', 'Cherensandeev'),
 ('greek', 'Alugoua'),
 ('scottish', 'Otres'),
 ('spanish', 'Molana'),
 ('czech', 'Pinko'),
 ('scottish', 'Bunsta'),
 ('irish', 'Lancen'),
 ('portuguese', 'Coma'),
 ('irish', "O'Houdleth"),
 ('english', 'Uttimo'),
 ('arabic', 'Hahal'),
 ('portuguese', 'Railborid'),
 ('arabic', 'Masour'),
 ('chinese', 'Monin'),
 ('english', 'Maller'),
 ('english', 'Miclles'),
 ('dutch', 'fehrey'),
 ('czech', 'Fengess'),
 ('czech', 'Mrostifi'),
 ('arabic', 'Atkin')]