In [1]:
import json

from local_settings import settings, datautils

from datautils.vocabulary import Vocabulary

import pandas as pd
import numpy as np

import torch
from torch import FloatTensor
from torch import nn
from torch.autograd import Variable
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from tqdm import tqdm, tqdm_notebook

Class Definitions

Data Model:

  • Raw data
  • Vectorizer
  • Vectorized Data
  • Data generator
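
A minimal sketch of how these four pieces hand off to one another, using the classes defined in the cells below (paths come from local_settings, so this is illustrative rather than exact):

# Raw data -> Vectorizer -> Vectorized data -> batch generator
raw = RawSurnames().get_data()                        # pandas DataFrame of surnames
vectorizer, vec_train = SurnamesVectorizer.fit_transform(raw, split='train')
batch_func = make_generator(vec_train, batch_size=16)
for batch in batch_func():                            # dicts of Variables
    print(batch['x_surnames'].size())                 # (16, max_seq_length)
    break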

In [2]:
class RawSurnames(object):
    def __init__(self, data_path=settings.SURNAMES_CSV, delimiter=","):
        self.data = pd.read_csv(data_path, delimiter=delimiter)

    def get_data(self, filter_to_nationality=None):
        if filter_to_nationality is not None:
            return self.data[self.data.nationality.isin(filter_to_nationality)]
        return self.data

# vectorizer

class SurnamesVectorizer(object):
    def __init__(self, surname_vocab, nationality_vocab, max_seq_length):
        self.surname_vocab = surname_vocab
        self.nationality_vocab = nationality_vocab
        self.max_seq_length = max_seq_length
        
    def save(self, filename):
        vec_dict = {"surname_vocab": self.surname_vocab.get_serializable_contents(),
                    "nationality_vocab": self.nationality_vocab.get_serializable_contents(),
                    'max_seq_length': self.max_seq_length}

        with open(filename, "w") as fp:
            json.dump(vec_dict, fp)
        
    @classmethod
    def load(cls, filename):
        with open(filename, "r") as fp:
            vec_dict = json.load(fp)

        vec_dict["surname_vocab"] = Vocabulary.deserialize_from_contents(vec_dict["surname_vocab"])
        vec_dict["nationality_vocab"] = Vocabulary.deserialize_from_contents(vec_dict["nationality_vocab"])
        return cls(**vec_dict)

    @classmethod
    def fit(cls, surname_df):
        """
        """
        surname_vocab = Vocabulary(use_unks=False,
                                   use_mask=True,
                                   use_start_end=True,
                                   start_token=settings.START_TOKEN,
                                   end_token=settings.END_TOKEN)

        nationality_vocab = Vocabulary(use_unks=False, use_start_end=False, use_mask=False)

        max_seq_length = 0
        for index, row in surname_df.iterrows():
            surname_vocab.add_many(row.surname)
            nationality_vocab.add(row.nationality)

            if len(row.surname) > max_seq_length:
                max_seq_length = len(row.surname)
        # +2 makes room for the start and end tokens added at vectorization time
        max_seq_length = max_seq_length + 2

        return cls(surname_vocab, nationality_vocab, max_seq_length)

    @classmethod
    def fit_transform(cls, surname_df, split='train'):
        vectorizer = cls.fit(surname_df)
        return vectorizer, vectorizer.transform(surname_df, split)

    def transform(self, surname_df, split='train'):

        df = surname_df[surname_df.split==split].reset_index()
        n_data = len(df)
        
        x_surnames = np.zeros((n_data, self.max_seq_length), dtype=np.int64)
        y_surnames = np.ones((n_data, self.max_seq_length), dtype=np.int64) * settings.IGNORE_INDEX_VALUE

        for index, row in df.iterrows():
            vectorized_surname = list(self.surname_vocab.map(row.surname, 
                                                             include_start_end=True))
            # next-character prediction: inputs drop the end token,
            # targets drop the start token (a one-step shift)
            x_part = vectorized_surname[:-1]
            y_part = vectorized_surname[1:]
            x_surnames[index, :len(x_part)] = x_part
            y_surnames[index, :len(y_part)] = y_part

        return VectorizedSurnames(x_surnames, y_surnames)

# vec data

class VectorizedSurnames(Dataset):
    def __init__(self, x_surnames, y_surnames):
        self.x_surnames = x_surnames
        self.y_surnames = y_surnames

    def __len__(self):
        return len(self.x_surnames)

    def __getitem__(self, index):
        return {'x_surnames': self.x_surnames[index],
                'y_surnames': self.y_surnames[index],
                'x_lengths': len(self.x_surnames[index].nonzero()[0])}

# data generator

def make_generator(vectorized_data, batch_size, num_batches=-1, 
                   num_workers=0, volatile_mode=False, 
                   strict_batching=True):

    loaded_data = DataLoader(vectorized_data, batch_size=batch_size, 
                             shuffle=True, num_workers=num_workers)

    def inner_func(num_batches=num_batches, 
                   volatile_mode=volatile_mode):

        for batch_index, batch in enumerate(loaded_data):
            out = {}
            current_batch_size = list(batch.values())[0].size(0)
            if current_batch_size < batch_size and strict_batching:
                break
            for key, value in batch.items():
                if not isinstance(value, Variable):
                    # volatile must be set when the Variable is constructed;
                    # Variable has no .volatile() method
                    value = Variable(value, volatile=volatile_mode)
                if settings.CUDA:
                    value = value.cuda()
                out[key] = value
            yield out

            # stop after exactly num_batches batches (enumerate is 0-indexed)
            if num_batches > 0 and batch_index + 1 >= num_batches:
                break

    return inner_func
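
To make the one-step shift in transform concrete, here is a toy example with hypothetical token indices (1 and 2 standing in for the start/end tokens, 4-6 for the letters of "Kim"):

vectorized = [1, 4, 5, 6, 2]   # [<s>, K, i, m, </s>]
x_part = vectorized[:-1]       # [<s>, K, i, m]  -- what the model reads
y_part = vectorized[1:]        # [K, i, m, </s>] -- what it must predict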

Class Definitions for the Model


In [3]:
def new_parameter(*size):
    out = Parameter(FloatTensor(*size))
    torch.nn.init.xavier_normal(out)
    return out

class ExplicitRNN(nn.Module):
    def __init__(self, input_size, hidden_size, expect_batch_on_dim0=False):
        super(ExplicitRNN, self).__init__()
        self.W_in2hid = new_parameter(input_size, hidden_size)
        self.W_hid2hid = new_parameter(hidden_size, hidden_size)
            
        self.b_hid = new_parameter(1, hidden_size)
        
        self.hidden_size = hidden_size

        self.expect_batch_on_dim0 = expect_batch_on_dim0
    
    def _compute_next_hidden(self, x, h):
        # Elman update: h_t = tanh(x_t W_in2hid + h_{t-1} W_hid2hid + b_hid)
        return F.tanh(x.matmul(self.W_in2hid) + 
                      h.matmul(self.W_hid2hid) + 
                      self.b_hid)

    def forward(self, x_in, hid_t=None):
        if self.expect_batch_on_dim0:
            batch_size, seq_size, feat_size = x_in.size()
            x_in = x_in.permute(1, 0, 2)
        else:
            seq_size, batch_size, feat_size = x_in.size()

        hiddens = []
        if hid_t is None:
            hid_t = Variable(torch.zeros((batch_size, self.hidden_size)))
        
        if settings.CUDA:
            hid_t = hid_t.cuda()
            
        for t in range(seq_size):
            x_t = x_in[t]
            hid_t = self._compute_next_hidden(x_t, hid_t)
            
            hiddens.append(hid_t)
        hiddens = torch.stack(hiddens)

        if self.expect_batch_on_dim0:
            hiddens = hiddens.permute(1, 0, 2)

        return hiddens

    
class CharNN(nn.Module):
    def __init__(self, embedding_size, in_vocab_size, out_vocab_size, hidden_size, 
                 expect_batch_on_dim0=False):
        super(CharNN, self).__init__()
        
        self.emb = nn.Embedding(embedding_dim=embedding_size, num_embeddings=in_vocab_size, padding_idx=0)
        self.fc = nn.Linear(in_features=hidden_size, out_features=out_vocab_size)
        self.rnn = ExplicitRNN(input_size=embedding_size, hidden_size=hidden_size, 
                               expect_batch_on_dim0=expect_batch_on_dim0)
    
    def forward(self, x_in, x_lengths=None, apply_softmax=False):
        # x_lengths is accepted for interface consistency but unused here
        x_in = self.emb(x_in)
        y_out = self.rnn(x_in)

        dim0, dim1, dim2 = y_out.size()
        y_out = y_out.contiguous().view(-1, dim2)

        y_out = self.fc(y_out)

        # optionally apply the softmax
        if apply_softmax:
            y_out = F.softmax(y_out)

        y_out = y_out.view(dim0, dim1, -1)
        
        return y_out
    
def normalize_sizes(net_output, y_true):
    net_output = net_output.cpu()
    y_true = y_true.cpu()
    if len(net_output.size()) == 3:
        # contiguous() returns a new tensor; the result must be reassigned
        net_output = net_output.contiguous().view(-1, net_output.size(2))
    if len(y_true.size()) == 2:
        y_true = y_true.contiguous().view(-1)
    return net_output, y_true

def sequence_loss(net_output, y_true, loss_func=F.cross_entropy):
    net_output, y_true = normalize_sizes(net_output, y_true)
    return loss_func(net_output, y_true, ignore_index=settings.IGNORE_INDEX_VALUE)

def compute_accuracy(yhat, ytrue):
    yhat, ytrue = normalize_sizes(yhat, ytrue)
    _, yhat_indices = yhat.max(dim=1)
    n_correct = torch.eq(yhat_indices, ytrue).sum().data.numpy()[0]
    return n_correct / len(yhat_indices) * 100

def training_loop(net, datagen_func, optimizer, bar=None):
    if bar is None:
        bar = tqdm(position=2)
    accs = []
    for data_dictionary in datagen_func():
        net.zero_grad()
        optimizer.zero_grad()
        
        yhat = net(data_dictionary['x_surnames'], data_dictionary['x_lengths'])
        loss = sequence_loss(yhat, data_dictionary['y_surnames'])
        accs.append(compute_accuracy(yhat, data_dictionary['y_surnames']))
        
        bar.update(1)
        bar.set_postfix(loss=loss.cpu().data.numpy()[0], 
                        accuracy="{:0.2f}".format(np.mean(accs)))
        
        loss.backward()
        optimizer.step()
          
def val_loop(net, datagen_func, bar=None):
    if bar is None:
        bar = tqdm(position=1)
    accs = []
    for data_dictionary in datagen_func():
        yhat = net(data_dictionary['x_surnames'], data_dictionary['x_lengths'], apply_softmax=True)
        accs.append(compute_accuracy(yhat, data_dictionary['y_surnames']))
        bar.update(1)
        bar.set_postfix(accuracy="{:0.2f}".format(np.mean(accs)))
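
Two quick shape checks, with made-up sizes and settings.CUDA assumed off: ExplicitRNN returns one hidden state per time step, and normalize_sizes flattens the batch and sequence dimensions so that every time step is scored as an ordinary classification row:

rnn = ExplicitRNN(input_size=10, hidden_size=16, expect_batch_on_dim0=True)
x = Variable(torch.randn(4, 7, 10))
print(rnn(x).size())           # (4, 7, 16): batch, seq, hidden

logits = Variable(torch.randn(4, 7, 30))
targets = Variable(torch.zeros(4, 7).long())
flat_logits, flat_targets = normalize_sizes(logits, targets)
print(flat_logits.size())      # (28, 30)
print(flat_targets.size())     # (28,)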

In [4]:
def sample(emb, rnn, fc, h_t=None, idx_t=None, n=20, temp=1):
    """Generate n characters per batch row by repeatedly sampling from the
    temperature-scaled softmax; h_t and idx_t should be supplied (see
    make_initial_hidden and make_initial_x below)."""
    hiddens = [h_t]
    indices = [idx_t]
    out_dists = []  # kept for inspection; not returned
    
    for t in range(n):
        x_t = emb(idx_t)
        h_t = rnn._compute_next_hidden(x_t, h_t)
        
        y_t = fc(h_t)
        y_t = F.softmax(y_t / temp)
        idx_t = torch.multinomial(y_t, 1)[:, 0]
        
        hiddens.append(h_t)
        indices.append(idx_t)
        out_dists.append(y_t)
     
    indices = torch.stack(indices).squeeze().permute(1, 0)
    return indices

def make_initial_hidden(batch_size, hidden_size):
    out = Variable(torch.ones(batch_size, hidden_size))
    if settings.CUDA:
        out = out.cuda()
    return out

def make_initial_x(batch_size, vectorizer):
    out = Variable(torch.ones(batch_size) * vectorizer.surname_vocab.start_index).long()
    if settings.CUDA:
        out = out.cuda()
    return out

def decode_one(vectorizer, seq):
    out = []
    for i in seq:
        if vectorizer.surname_vocab.start_index == i:
            continue
        if vectorizer.surname_vocab.end_index == i:
            return ''.join(out)
        out.append(vectorizer.surname_vocab.lookup(i))
    return ''.join(out)
            
def decode_matrix(vectorizer, mat):
    mat = mat.cpu().data.numpy()
    return [decode_one(vectorizer, mat[i]) for i in range(len(mat))]
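
The temp argument in sample rescales the logits before the softmax: temp < 1 sharpens the distribution toward its mode, temp > 1 flattens it toward uniform. A small illustration (values approximate):

y = Variable(torch.Tensor([[1.0, 2.0, 3.0]]))
print(F.softmax(y / 0.5))      # sharper: ~[0.02, 0.12, 0.87]
print(F.softmax(y / 2.0))      # flatter: ~[0.19, 0.31, 0.51]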

Make, Train, and Eval


In [5]:
from settings import ZOO
import os

batch_size = 16

raw_data = RawSurnames().get_data()

zoo_info = ZOO.charnn_surname_predicter

if os.path.exists(zoo_info['vocab']):
    vectorizer = SurnamesVectorizer.load(zoo_info['vocab'])
    print("Loading vectorizer!")
else:
    vectorizer = SurnamesVectorizer.fit(raw_data)
    print("Creating a new vectorizer.")

vec_train = vectorizer.transform(raw_data, split='train')
vec_test = vectorizer.transform(raw_data, split='test')

train_data_func = make_generator(vec_train, batch_size=batch_size)
test_data_func = make_generator(vec_test, batch_size=batch_size)
    
parameters = dict(zoo_info['parameters'])    
parameters['in_vocab_size'] = len(vectorizer.surname_vocab)
parameters['out_vocab_size'] = len(vectorizer.surname_vocab)
parameters['expect_batch_on_dim0'] = True

net = CharNN(**parameters)

if os.path.exists(zoo_info['filename']):
    print("Loading state dict!")
    net.load_state_dict(torch.load(zoo_info['filename'], map_location=lambda storage, loc: storage))
else:
    print("Using newly initiated network!")

if settings.CUDA:
    print("CUDA mode enabled")
    net = net.cuda()
else:
    print("CUDA mode not enabled")
    net = net.cpu()


Loading vectorizer!
Loading state dict!
CUDA mode enabled

In [6]:
# Train
n_epochs = 100
optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)
bar = tqdm_notebook(total=n_epochs, position=0)
valbar = tqdm_notebook(position=2)
trainbar = tqdm_notebook(position=3)

try:
    for _ in range(n_epochs):

        net.train(False)
        val_loop(net, test_data_func, bar=valbar)
        net.train(True)
        training_loop(net, train_data_func, optimizer, bar=trainbar)

        samples = decode_matrix(vectorizer, 
                                sample(net.emb, net.rnn, net.fc, 
                                       make_initial_hidden(2, parameters['hidden_size']), 
                                       make_initial_x(2, vectorizer),
                                       temp=0.8))
        
        bar.update(1)
        bar.set_postfix(sample0=samples[0], sample1=samples[1])

    net.train(False)
    val_loop(net, test_data_func, valbar)
except KeyboardInterrupt:
    print("...")


...

In [46]:
decode_matrix(vectorizer, 
              sample(net.emb, net.rnn, net.fc, 
                     make_initial_hidden(batch_size, parameters['hidden_size']), 
                     make_initial_x(batch_size, vectorizer),
                     temp=0.8))


Out[46]:
['Poldtoff',
 'Schestars',
 'Gordoud',
 'Kinsen',
 'Venzey',
 'Tumali',
 'Pets',
 'Aänchekin',
 'GDigkov',
 'Shadonov',
 'Boulyanson',
 'Gwae',
 'Zgerege',
 'Foxchevtsev',
 'Progkin',
 'Ussin']