In [1]:
import json
from local_settings import settings, datautils
from datautils.vocabulary import Vocabulary
import pandas as pd
import numpy as np
import torch
from torch import FloatTensor
from torch import nn
from torch.autograd import Variable
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm, tqdm_notebook
In [2]:
class RawSurnames(object):
    def __init__(self, data_path=settings.SURNAMES_CSV, delimiter=","):
        self.data = pd.read_csv(data_path, delimiter=delimiter)

    def get_data(self, filter_to_nationality=None):
        if filter_to_nationality is not None:
            return self.data[self.data.nationality.isin(filter_to_nationality)]
        return self.data
# vectorizer
class SurnamesVectorizer(object):
    def __init__(self, surname_vocab, nationality_vocab, max_seq_length):
        self.surname_vocab = surname_vocab
        self.nationality_vocab = nationality_vocab
        self.max_seq_length = max_seq_length

    def save(self, filename):
        vec_dict = {"surname_vocab": self.surname_vocab.get_serializable_contents(),
                    "nationality_vocab": self.nationality_vocab.get_serializable_contents(),
                    "max_seq_length": self.max_seq_length}
        with open(filename, "w") as fp:
            json.dump(vec_dict, fp)

    @classmethod
    def load(cls, filename):
        with open(filename, "r") as fp:
            vec_dict = json.load(fp)
        vec_dict["surname_vocab"] = Vocabulary.deserialize_from_contents(vec_dict["surname_vocab"])
        vec_dict["nationality_vocab"] = Vocabulary.deserialize_from_contents(vec_dict["nationality_vocab"])
        return cls(**vec_dict)
    @classmethod
    def fit(cls, surname_df):
        """Build character and nationality vocabularies from `surname_df`."""
        surname_vocab = Vocabulary(use_unks=False,
                                   use_mask=True,
                                   use_start_end=True,
                                   start_token=settings.START_TOKEN,
                                   end_token=settings.END_TOKEN)
        nationality_vocab = Vocabulary(use_unks=False, use_start_end=False, use_mask=False)
        max_seq_length = 0
        for index, row in surname_df.iterrows():
            surname_vocab.add_many(row.surname)
            nationality_vocab.add(row.nationality)
            if len(row.surname) > max_seq_length:
                max_seq_length = len(row.surname)
        # +2 makes room for the start and end tokens wrapped around each surname
        max_seq_length = max_seq_length + 2
        return cls(surname_vocab, nationality_vocab, max_seq_length)
    @classmethod
    def fit_transform(cls, surname_df, split='train'):
        vectorizer = cls.fit(surname_df)
        return vectorizer, vectorizer.transform(surname_df, split)

    def transform(self, surname_df, split='train'):
        df = surname_df[surname_df.split == split].reset_index()
        n_data = len(df)
        x_surnames = np.zeros((n_data, self.max_seq_length), dtype=np.int64)
        y_nationalities = np.zeros(n_data, dtype=np.int64)
        for index, row in df.iterrows():
            vectorized_surname = list(self.surname_vocab.map(row.surname,
                                                             include_start_end=True))
            x_surnames[index, :len(vectorized_surname)] = vectorized_surname
            y_nationalities[index] = self.nationality_vocab[row.nationality]
        return VectorizedSurnames(x_surnames, y_nationalities)
# vec data
class VectorizedSurnames(Dataset):
    def __init__(self, x_surnames, y_nationalities):
        self.x_surnames = x_surnames
        self.y_nationalities = y_nationalities

    def __len__(self):
        return len(self.x_surnames)

    def __getitem__(self, index):
        return {'x_surnames': self.x_surnames[index],
                'y_nationalities': self.y_nationalities[index],
                'x_lengths': len(self.x_surnames[index].nonzero()[0])}
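
# Hypothetical smoke test (not from the original notebook): fit the vectorizer
# on toy data and inspect the padded index matrix.  The column names here are
# assumptions that mirror the real surnames CSV.
toy_df = pd.DataFrame({'surname': ['smith', 'suzuki'],
                       'nationality': ['english', 'japanese'],
                       'split': ['train', 'train']})
toy_vectorizer, toy_vec = SurnamesVectorizer.fit_transform(toy_df, split='train')
# x_surnames: (n_rows, max_seq_length), start/end wrapped and zero padded
print(toy_vec.x_surnames.shape, toy_vec.y_nationalities)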
# data generator
def make_generator(vectorized_data, batch_size, num_batches=-1,
                   num_workers=0, volatile_mode=False,
                   strict_batching=True):
    loaded_data = DataLoader(vectorized_data, batch_size=batch_size,
                             shuffle=True, num_workers=num_workers)

    def inner_func(num_batches=num_batches, volatile_mode=volatile_mode):
        for batch_index, batch in enumerate(loaded_data):
            out = {}
            current_batch_size = list(batch.values())[0].size(0)
            if current_batch_size < batch_size and strict_batching:
                break
            for key, value in batch.items():
                if not isinstance(value, Variable):
                    value = Variable(value)
                if settings.CUDA:
                    value = value.cuda()
                if volatile_mode:
                    # `volatile` must be set at construction time; it is not a method
                    value = Variable(value.data, volatile=True)
                out[key] = value
            yield out
            if num_batches > 0 and batch_index + 1 >= num_batches:
                break

    return inner_func
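
# Hypothetical usage with the toy dataset above: the factory returns a
# generator function; each yielded batch maps names to (CUDA-ready) Variables.
toy_gen_func = make_generator(toy_vec, batch_size=2)
for toy_batch in toy_gen_func(num_batches=1):
    print(toy_batch['x_surnames'].size(), toy_batch['x_lengths'])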
In [3]:
def new_parameter(*size):
    out = Parameter(FloatTensor(*size))
    torch.nn.init.xavier_normal(out)
    return out
def column_gather(y_out, x_lengths):
    '''Get a specific vector from each batch datapoint in `y_out`.

    More precisely, iterate over batch row indices, get the vector that's at
    the position indicated by the corresponding value in `x_lengths` at the row
    index.

    Args:
        y_out (torch.FloatTensor, torch.cuda.FloatTensor)
            shape: (batch, sequence, feature)
        x_lengths (torch.LongTensor, torch.cuda.LongTensor)
            shape: (batch,)

    Returns:
        y_out (torch.FloatTensor, torch.cuda.FloatTensor)
            shape: (batch, feature)
    '''
    x_lengths = x_lengths.long().data.cpu().numpy() - 1
    # equivalently, with an explicit loop:
    # out = []
    # for batch_index, column_index in enumerate(x_lengths):
    #     out.append(y_out[batch_index, column_index])
    # return torch.stack(out)
    return torch.stack([y_out[batch_index, column_index]
                        for batch_index, column_index in enumerate(x_lengths)])
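
# Hypothetical sanity check for column_gather: for lengths [2, 3], expect the
# vectors at time steps 1 and 2 respectively, giving shape (batch, feature).
_y = Variable(torch.arange(0, 12).view(2, 3, 2))  # (batch=2, seq=3, feat=2)
_lengths = Variable(torch.LongTensor([2, 3]))
assert column_gather(_y, _lengths).size() == (2, 2)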
class ExplicitRNN(nn.Module):
    def __init__(self, input_size, hidden_size, expect_batch_on_dim0=False):
        super(ExplicitRNN, self).__init__()
        self.W_in2hid = new_parameter(input_size, hidden_size)
        self.W_hid2hid = new_parameter(hidden_size, hidden_size)
        self.b_hid = new_parameter(1, hidden_size)
        self.hidden_size = hidden_size
        self.expect_batch_on_dim0 = expect_batch_on_dim0

    def _compute_next_hidden(self, x, h):
        return F.tanh(x.matmul(self.W_in2hid) +
                      h.matmul(self.W_hid2hid) +
                      self.b_hid)

    def forward(self, x_in, hid_t=None):
        if self.expect_batch_on_dim0:
            batch_size, seq_size, feat_size = x_in.size()
            # the loop below expects (sequence, batch, feature)
            x_in = x_in.permute(1, 0, 2)
        else:
            seq_size, batch_size, feat_size = x_in.size()
        hiddens = []
        if hid_t is None:
            hid_t = Variable(torch.zeros((batch_size, self.hidden_size)))
            if settings.CUDA:
                hid_t = hid_t.cuda()
        for t in range(seq_size):
            x_t = x_in[t]
            hid_t = self._compute_next_hidden(x_t, hid_t)
            hiddens.append(hid_t)
        hiddens = torch.stack(hiddens)
        if self.expect_batch_on_dim0:
            hiddens = hiddens.permute(1, 0, 2)
        return hiddens
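
# Hypothetical shape check (assumes settings.CUDA is False): batch-first input
# of shape (batch, seq, feature) -> hiddens of shape (batch, seq, hidden_size).
_rnn = ExplicitRNN(input_size=4, hidden_size=8, expect_batch_on_dim0=True)
_x = Variable(torch.randn(2, 5, 4))
assert _rnn(_x).size() == (2, 5, 8)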
class CharNN(nn.Module):
    def __init__(self, embedding_size, in_vocab_size, out_vocab_size, hidden_size,
                 class_weights=None, expect_batch_on_dim0=False):
        super(CharNN, self).__init__()
        self.emb = nn.Embedding(embedding_dim=embedding_size, num_embeddings=in_vocab_size, padding_idx=0)
        self.fc = nn.Linear(in_features=hidden_size, out_features=out_vocab_size)
        self.rnn = ExplicitRNN(input_size=embedding_size, hidden_size=hidden_size,
                               expect_batch_on_dim0=expect_batch_on_dim0)
        self.class_weights = class_weights

    def cuda(self):
        if self.class_weights is not None:
            self.class_weights = self.class_weights.cuda()
        return super(CharNN, self).cuda()

    def cpu(self):
        if self.class_weights is not None:
            self.class_weights = self.class_weights.cpu()
        return super(CharNN, self).cpu()

    def forward(self, x_in, x_lengths=None, apply_softmax=False):
        x_in = self.emb(x_in)
        y_out = self.rnn(x_in)
        if x_lengths is not None:
            # pick the hidden state at each sequence's true final position
            y_out = column_gather(y_out, x_lengths)
        else:
            # fall back to the last time step (correct only for unpadded input)
            y_out = y_out[:, -1, :]
        y_out = self.fc(y_out)
        # optionally apply the softmax
        if apply_softmax:
            y_out = F.softmax(y_out)
        return y_out
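
# Hypothetical forward-pass check (assumes settings.CUDA is False): two
# zero-padded index sequences in, one row of class logits per name out.
_net = CharNN(embedding_size=8, in_vocab_size=10, out_vocab_size=3, hidden_size=16,
              expect_batch_on_dim0=True)
_xin = Variable(torch.LongTensor([[1, 4, 2, 0], [1, 3, 5, 2]]))
_lens = Variable(torch.LongTensor([3, 4]))
assert _net(_xin, _lens).size() == (2, 3)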
def sequence_loss(net_output, y_true, loss_func=F.cross_entropy, class_weights=None):
    if len(net_output.size()) == 3:
        # contiguous() is not in-place; rebind before reshaping
        net_output = net_output.contiguous().view(-1, net_output.size(2))
    if len(y_true.size()) == 2:
        y_true = y_true.contiguous().view(-1)
    return loss_func(net_output, y_true, weight=class_weights)
def compute_accuracy(yhat, ytrue):
    ytrue = ytrue.cpu()
    yhat_indices = yhat.cpu().max(dim=1)[1]
    n_correct = torch.eq(yhat_indices, ytrue).sum().data.numpy()[0]
    return n_correct / len(yhat_indices) * 100
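
# Hypothetical check under the PyTorch 0.3-era API used throughout:
# 2 of 3 argmax predictions match the targets -> ~66.67
_yhat = Variable(torch.FloatTensor([[0.9, 0.1], [0.2, 0.8], [0.7, 0.3]]))
_ytrue = Variable(torch.LongTensor([0, 1, 1]))
print(compute_accuracy(_yhat, _ytrue))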
def training_loop(net, datagen_func, optimizer, bar=None):
    if bar is None:
        bar = tqdm(position=2)
    accs = []
    for data_dictionary in datagen_func():
        net.zero_grad()
        optimizer.zero_grad()
        yhat = net(data_dictionary['x_surnames'], data_dictionary['x_lengths'])
        loss = sequence_loss(yhat, data_dictionary['y_nationalities'], class_weights=net.class_weights)
        accs.append(compute_accuracy(yhat, data_dictionary['y_nationalities']))
        bar.update(1)
        bar.set_postfix(loss=loss.cpu().data.numpy()[0],
                        accuracy="{:0.2f}".format(np.mean(accs)))
        loss.backward()
        optimizer.step()
def val_loop(net, datagen_func, bar=None):
    if bar is None:
        bar = tqdm(position=1)
    accs = []
    for data_dictionary in datagen_func():
        yhat = net(data_dictionary['x_surnames'], data_dictionary['x_lengths'], apply_softmax=True)
        accs.append(compute_accuracy(yhat, data_dictionary['y_nationalities']))
        bar.update(1)
        bar.set_postfix(accuracy="{:0.2f}".format(np.mean(accs)))
In [5]:
from settings import ZOO
import os

batch_size = 16
raw_data = RawSurnames().get_data()
zoo_info = ZOO.charnn_surname_classifer

if os.path.exists(zoo_info['vocab']):
    vectorizer = SurnamesVectorizer.load(zoo_info['vocab'])
    print("Loading vectorizer!")
else:
    vectorizer = SurnamesVectorizer.fit(raw_data)
    print("Creating a new vectorizer.")

vec_train = vectorizer.transform(raw_data, split='train')
vec_test = vectorizer.transform(raw_data, split='test')

train_data_func = make_generator(vec_train, batch_size=batch_size)
test_data_func = make_generator(vec_test, batch_size=batch_size)

class_counts = raw_data.nationality.value_counts().to_dict()
sorted_counts = sorted(class_counts.items(), key=lambda item: vectorizer.nationality_vocab[item[0]])
class_weights = 1.0 / torch.FloatTensor([float(count) for _, count in sorted_counts])

parameters = dict(zoo_info['parameters'])
parameters['in_vocab_size'] = len(vectorizer.surname_vocab)
parameters['out_vocab_size'] = len(vectorizer.nationality_vocab)
parameters['expect_batch_on_dim0'] = True
parameters['class_weights'] = class_weights

net = CharNN(**parameters)

if os.path.exists(zoo_info['filename']):
    print("Loading state dict!")
    net.load_state_dict(torch.load(zoo_info['filename'], map_location=lambda storage, loc: storage))
else:
    print("Using newly initialized network!")

if settings.CUDA:
    print("CUDA mode enabled")
    net = net.cuda()
else:
    print("CUDA mode not enabled")
    net = net.cpu()
In [6]:
def name_to_indices(name):
    name_indices = list(vectorizer.surname_vocab.map(name, include_start_end=True))
    out = torch.autograd.Variable(torch.LongTensor(name_indices)[None, :])
    length = torch.autograd.Variable(torch.LongTensor([len(name_indices)]))
    if settings.CUDA:
        out = out.cuda()
        length = length.cuda()
    return out, length

def predict_nationality(surname):
    y_prediction = net(*name_to_indices(surname))
    _, nationality_index = y_prediction.max(dim=1)
    return vectorizer.nationality_vocab.lookup(nationality_index.cpu().data.numpy()[0])
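
# Hypothetical extension (same vocab/net APIs as above, not from the original
# notebook): the k most probable nationalities with their softmax scores;
# assumes k <= the number of nationality classes.
def predict_topk_nationalities(surname, k=3):
    y_prediction = net(*name_to_indices(surname), apply_softmax=True)
    probs, indices = y_prediction.cpu().data.topk(k, dim=1)
    return [(vectorizer.nationality_vocab.lookup(int(i)), float(p))
            for p, i in zip(probs[0], indices[0])]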
In [7]:
# Train
try:
    optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)
    valbar = tqdm_notebook(position=2)
    trainbar = tqdm_notebook(position=3)
    for _ in tqdm_notebook(range(1000), total=1000, position=0):
        net.train(False)
        val_loop(net, test_data_func, bar=valbar)
        net.train(True)
        training_loop(net, train_data_func, optimizer, bar=trainbar)
    net.train(False)
    val_loop(net, test_data_func, bar=valbar)
except KeyboardInterrupt:
    print("...")
In [8]:
predict_nationality('satoshi nakamoto')
Out[8]:
In [9]:
predict_nationality('mcmahan')
Out[9]:
In [10]:
predict_nationality('bismarck')
Out[10]:
In [11]:
predict_nationality('anderson')
Out[11]: