In [1]:
import json

from local_settings import settings, datautils

from datautils.vocabulary import Vocabulary

import pandas as pd
import numpy as np

import torch
from torch import FloatTensor
from torch import nn
from torch.autograd import Variable
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from tqdm import tqdm, tqdm_notebook

Class Definitions

Data Model (the four pieces compose as sketched below):

  • Raw data
  • Vectorizer
  • Vectorized Data
  • Data generator
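
A minimal sketch of how the pieces fit together, assuming the classes defined
in the cells below (the "Make, Train, and Eval" cell later follows the same flow):

raw_data = RawSurnames().get_data()                        # raw data (DataFrame)
vectorizer = SurnamesVectorizer.fit(raw_data)              # vectorizer (vocabularies)
vec_train = vectorizer.transform(raw_data, split='train')  # vectorized data (Dataset)
train_batch_gen = make_generator(vec_train, batch_size=16) # data generator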

In [2]:
class RawSurnames(object):
    def __init__(self, data_path=settings.SURNAMES_CSV, delimiter=","):
        self.data = pd.read_csv(data_path, delimiter=delimiter)

    def get_data(self, filter_to_nationality=None):
        if filter_to_nationality is not None:
            return self.data[self.data.nationality.isin(filter_to_nationality)]
        return self.data

# vectorizer

class SurnamesVectorizer(object):
    def __init__(self, surname_vocab, nationality_vocab, max_seq_length):
        self.surname_vocab = surname_vocab
        self.nationality_vocab = nationality_vocab
        self.max_seq_length = max_seq_length
        
    def save(self, filename):
        vec_dict = {"surname_vocab": self.surname_vocab.get_serializable_contents(),
                    "nationality_vocab": self.nationality_vocab.get_serializable_contents(),
                    'max_seq_length': self.max_seq_length}

        with open(filename, "w") as fp:
            json.dump(vec_dict, fp)
        
    @classmethod
    def load(cls, filename):
        with open(filename, "r") as fp:
            vec_dict = json.load(fp)

        vec_dict["surname_vocab"] = Vocabulary.deserialize_from_contents(vec_dict["surname_vocab"])
        vec_dict["nationality_vocab"] = Vocabulary.deserialize_from_contents(vec_dict["nationality_vocab"])
        return cls(**vec_dict)

    @classmethod
    def fit(cls, surname_df):
        """
        """
        surname_vocab = Vocabulary(use_unks=False,
                                   use_mask=True,
                                   use_start_end=True,
                                   start_token=settings.START_TOKEN,
                                   end_token=settings.END_TOKEN)

        nationality_vocab = Vocabulary(use_unks=False, use_start_end=False, use_mask=False)

        max_seq_length = 0
        for index, row in surname_df.iterrows():
            surname_vocab.add_many(row.surname)
            nationality_vocab.add(row.nationality)

            if len(row.surname) > max_seq_length:
                max_seq_length = len(row.surname)
        max_seq_length = max_seq_length + 2  # +2 for the start and end tokens

        return cls(surname_vocab, nationality_vocab, max_seq_length)

    @classmethod
    def fit_transform(cls, surname_df, split='train'):
        vectorizer = cls.fit(surname_df)
        return vectorizer, vectorizer.transform(surname_df, split)

    def transform(self, surname_df, split='train'):

        df = surname_df[surname_df.split==split].reset_index()
        n_data = len(df)
        
        x_surnames = np.zeros((n_data, self.max_seq_length), dtype=np.int64)
        y_nationalities = np.zeros(n_data, dtype=np.int64)

        for index, row in df.iterrows():
            vectorized_surname = list(self.surname_vocab.map(row.surname, 
                                                             include_start_end=True))
            x_surnames[index, :len(vectorized_surname)] = vectorized_surname
            y_nationalities[index] = self.nationality_vocab[row.nationality]

        return VectorizedSurnames(x_surnames, y_nationalities)

# vec data


class VectorizedSurnames(Dataset):
    def __init__(self, x_surnames, y_nationalities):
        self.x_surnames = x_surnames
        self.y_nationalities = y_nationalities

    def __len__(self):
        return len(self.x_surnames)

    def __getitem__(self, index):
        # x_lengths counts the non-padding entries (the mask index is 0)
        return {'x_surnames': self.x_surnames[index],
                'y_nationalities': self.y_nationalities[index],
                'x_lengths': len(self.x_surnames[index].nonzero()[0])}

# data generator

def make_generator(vectorized_data, batch_size, num_batches=-1, 
                   num_workers=0, volatile_mode=False, 
                   strict_batching=True):

    loaded_data = DataLoader(vectorized_data, batch_size=batch_size, 
                             shuffle=True, num_workers=num_workers)

    def inner_func(num_batches=num_batches, 
                   volatile_mode=volatile_mode):

        for batch_index, batch in enumerate(loaded_data):
            out = {}
            current_batch_size = list(batch.values())[0].size(0)
            if current_batch_size < batch_size and strict_batching:
                break
            for key, value in batch.items():
                if not isinstance(value, Variable):
                    value = Variable(value)
                if settings.CUDA:
                    value = value.cuda()
                if volatile_mode:
                    # volatile is a construction-time flag, not a method
                    value = Variable(value.data, volatile=True)
                out[key] = value
            yield out

            if num_batches > 0 and batch_index + 1 >= num_batches:
                break

    return inner_func
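
# A hypothetical usage sketch: each call to the returned function starts a
# fresh shuffled pass over the data and yields dict batches of Variables:
#
#   batch_gen = make_generator(vec_train, batch_size=16)
#   for batch in batch_gen():
#       batch['x_surnames'], batch['x_lengths'], batch['y_nationalities']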

Class definitions for the model
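
ExplicitRNN is an Elman-style RNN with its parameters written out explicitly;
each step computes h_t = tanh(x_t W_in2hid + h_{t-1} W_hid2hid + b_hid).
CharNN stacks an embedding layer, this RNN, and a linear classifier, using
column_gather to read off the hidden state at each sequence's last non-padding
position.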


In [3]:
def new_parameter(*size):
    out = Parameter(FloatTensor(*size))
    torch.nn.init.xavier_normal(out)
    return out

def column_gather(y_out, x_lengths):
    '''Get a specific vector from each batch datapoint in `y_out`.

    More precisely, for each batch row index, select the vector at the sequence
    position given by the corresponding entry of `x_lengths` minus one (i.e.,
    the last non-padding step).

    Args:
        y_out (torch.FloatTensor, torch.cuda.FloatTensor)
            shape: (batch, sequence, feature)
        x_lengths (torch.LongTensor, torch.cuda.LongTensor)
            shape: (batch,)

    Returns:
        y_out (torch.FloatTensor, torch.cuda.FloatTensor)
            shape: (batch, feature)
    '''
    x_lengths = x_lengths.long().data.cpu().numpy() - 1
    # alternatively:
    # out = []
    # for batch_index, column_index in enumerate(x_lengths):
    #     out.append(y_out[batch_index, column_index])
    # return torch.stack(out)
    return torch.stack([y_out[batch_index, column_index]
                        for batch_index, column_index in enumerate(x_lengths)])
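
# A small worked example (hypothetical values): gather the last valid timestep
# of each row.
#   y_out_demo = Variable(torch.randn(2, 3, 4))        # (batch, seq, feature)
#   lengths_demo = Variable(torch.LongTensor([2, 3]))  # valid length per row
#   column_gather(y_out_demo, lengths_demo)            # shape: (2, 4)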

class ExplicitRNN(nn.Module):
    def __init__(self, input_size, hidden_size, expect_batch_on_dim0=False):
        super(ExplicitRNN, self).__init__()
        self.W_in2hid = new_parameter(input_size, hidden_size)
        self.W_hid2hid = new_parameter(hidden_size, hidden_size)
            
        self.b_hid = new_parameter(1, hidden_size)
        
        self.hidden_size = hidden_size

        self.expect_batch_on_dim0 = expect_batch_on_dim0
    
    def _compute_next_hidden(self, x, h):
        # vanilla RNN update: h_t = tanh(x_t W_in + h_{t-1} W_hid + b)
        return F.tanh(x.matmul(self.W_in2hid) + 
                      h.matmul(self.W_hid2hid) + 
                      self.b_hid)

    def forward(self, x_in, hid_t=None):
        if self.expect_batch_on_dim0:
            batch_size, seq_size, feat_size = x_in.size()
            x_in = x_in.permute(1, 0, 2)
        else:
            seq_size, batch_size, feat_size = x_in.size()

        hiddens = []
        if hid_t is None:
            hid_t = Variable(torch.zeros((batch_size, self.hidden_size)))
        
        if settings.CUDA:
            hid_t = hid_t.cuda()
            
        for t in range(seq_size):
            x_t = x_in[t]
            hid_t = self._compute_next_hidden(x_t, hid_t)
            
            hiddens.append(hid_t)
        hiddens = torch.stack(hiddens)

        if self.expect_batch_on_dim0:
            hiddens = hiddens.permute(1, 0, 2)

        return hiddens

    
class CharNN(nn.Module):
    def __init__(self, embedding_size, in_vocab_size, out_vocab_size, hidden_size, class_weights=None,
                 expect_batch_on_dim0=False):
        super(CharNN, self).__init__()
        
        self.emb = nn.Embedding(embedding_dim=embedding_size, num_embeddings=in_vocab_size, padding_idx=0)
        self.fc = nn.Linear(in_features=hidden_size, out_features=out_vocab_size)
        self.rnn = ExplicitRNN(input_size=embedding_size, hidden_size=hidden_size, 
                               expect_batch_on_dim0=expect_batch_on_dim0)
        
        self.class_weights = class_weights
    
    def cuda(self):
        if self.class_weights is not None:
            self.class_weights = self.class_weights.cuda()
        return super(CharNN, self).cuda()

    def cpu(self):
        if self.class_weights is not None:
            self.class_weights = self.class_weights.cpu()
        return super(CharNN, self).cpu()
        
    
    def forward(self, x_in, x_lengths=None, apply_softmax=False):
        x_in = self.emb(x_in)
        y_out = self.rnn(x_in)
        
        if x_lengths is not None:
            y_out = column_gather(y_out, x_lengths)
        else:
            y_out = y_out[:, -1, :]  # fall back to the final position (assumes no padding)

        y_out = self.fc(y_out)

        # optionally apply softmax over the class dimension
        if apply_softmax:
            y_out = F.softmax(y_out, dim=1)
            
        return y_out
    
def sequence_loss(net_output, y_true, loss_func=F.cross_entropy, class_weights=None):
    # flatten (batch, seq, vocab) outputs and (batch, seq) targets if needed;
    # contiguous() returns a new tensor, so its result must be assigned
    if len(net_output.size()) == 3:
        net_output = net_output.contiguous().view(-1, net_output.size(2))
    if len(y_true.size()) == 2:
        y_true = y_true.contiguous().view(-1)

    return loss_func(net_output, y_true, weight=class_weights)

def compute_accuracy(yhat, ytrue):
    ytrue = ytrue.cpu()
    yhat_indices = yhat.cpu().max(dim=1)[1]
    n_correct = torch.eq(yhat_indices, ytrue).sum().data.numpy()[0]
    return n_correct / len(yhat_indices) * 100

def training_loop(net, datagen_func, optimizer, bar=None):
    if bar is None:
        bar = tqdm(position=2)
    accs = []
    for data_dictionary in datagen_func():
        net.zero_grad()
        optimizer.zero_grad()
        
        yhat = net(data_dictionary['x_surnames'], data_dictionary['x_lengths'])
        loss = sequence_loss(yhat, data_dictionary['y_nationalities'], class_weights=net.class_weights)
        accs.append(compute_accuracy(yhat, data_dictionary['y_nationalities']))
        
        bar.update(1)
        bar.set_postfix(loss=loss.cpu().data.numpy()[0], 
                        accuracy="{:0.2f}".format(np.mean(accs)))
        
        loss.backward()
        optimizer.step()
          
def val_loop(net, datagen_func, bar=None):
    if bar is None:
        bar = tqdm(position=1)
    accs = []
    for data_dictionary in datagen_func():
        yhat = net(data_dictionary['x_surnames'], data_dictionary['x_lengths'], apply_softmax=True)
        accs.append(compute_accuracy(yhat, data_dictionary['y_nationalities']))
        bar.update(1)
        bar.set_postfix(accuracy="{:0.2f}".format(np.mean(accs)))

Make, Train, and Eval


In [5]:
from settings import ZOO
import os

batch_size = 16

raw_data = RawSurnames().get_data()

zoo_info = ZOO.charnn_surname_classifer

if os.path.exists(zoo_info['vocab']):
    vectorizer = SurnamesVectorizer.load(zoo_info['vocab'])
    print("Loading vectorizer!")
else:
    vectorizer = SurnamesVectorizer.fit(raw_data)
    print("Creating a new vectorizer.")
    
vec_train = vectorizer.transform(raw_data, split='train')
vec_test = vectorizer.transform(raw_data, split='test')

train_data_func = make_generator(vec_train, batch_size=batch_size)
test_data_func = make_generator(vec_test, batch_size=batch_size)
    
# inverse-frequency class weights, ordered to match the nationality vocabulary
class_counts = raw_data.nationality.value_counts().to_dict()
sorted_counts = sorted(class_counts.items(), key=lambda item: vectorizer.nationality_vocab[item[0]])
class_weights = 1.0 / torch.FloatTensor([float(count) for _, count in sorted_counts])

    
parameters = dict(zoo_info['parameters'])        
parameters['in_vocab_size'] = len(vectorizer.surname_vocab)
parameters['out_vocab_size'] = len(vectorizer.nationality_vocab)
parameters['expect_batch_on_dim0'] = True
parameters['class_weights'] = class_weights

net = CharNN(**parameters)

if os.path.exists(zoo_info['filename']):
    print("Loading state dict!")
    net.load_state_dict(torch.load(zoo_info['filename'], map_location=lambda storage, loc: storage))
else:
    print("Using newly initiated network!")

if settings.CUDA:
    print("CUDA mode enabled")
    net = net.cuda()
else:
    print("CUDA mode not enabled")
    net = net.cpu()


Loading vectorizer!
Loading state dict!
CUDA mode not enabled

In [6]:
def name_to_indices(name):
    name_indices = list(vectorizer.surname_vocab.map(name, include_start_end=True))
    out = torch.autograd.Variable(torch.LongTensor(name_indices)[None, :])
    length = torch.autograd.Variable(torch.LongTensor([len(name_indices)]))
    
    if settings.CUDA:
        out = out.cuda()
        length = length.cuda()
        
    return out, length
    
def predict_nationality(surname):
    y_prediction = net(*name_to_indices(surname))
    _, nationality_index = y_prediction.max(dim=1)
    return vectorizer.nationality_vocab.lookup(nationality_index.cpu().data.numpy()[0])

In [7]:
# Train
try:
    optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)
    valbar = tqdm_notebook(position=2)
    trainbar = tqdm_notebook(position=3)
    for _ in tqdm_notebook(range(1000), total=1000, position=0):

        net.train(False)
        val_loop(net, test_data_func, bar=valbar)
        net.train(True)
        training_loop(net, train_data_func, optimizer, bar=trainbar)

    net.train(False)
    val_loop(net, test_data_func, bar=valbar)
except KeyboardInterrupt:
    print("...")


...

In [8]:
predict_nationality('satoshi nakamoto')


Out[8]:
'japanese'

In [9]:
predict_nationality('mcmahan')


Out[9]:
'irish'

In [10]:
predict_nationality('bismarck')


Out[10]:
'dutch'

In [11]:
predict_nationality('anderson')


Out[11]:
'english'