In [1]:
import json
from local_settings import settings, datautils
from datautils.vocabulary import Vocabulary
import pandas as pd
import numpy as np
import torch
from torch import FloatTensor
from torch import nn
from torch.autograd import Variable
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm, tqdm_notebook
In [46]:
class RawSurnames(object):
    def __init__(self, data_path=settings.SURNAMES_CSV, delimiter=","):
        self.data = pd.read_csv(data_path, delimiter=delimiter)

    def get_data(self, filter_to_nationality=None):
        if filter_to_nationality is not None:
            return self.data[self.data.nationality.isin(filter_to_nationality)]
        return self.data

# vectorizer
class SurnamesVectorizer(object):
    def __init__(self, surname_vocab, nationality_vocab, max_seq_length):
        self.surname_vocab = surname_vocab
        self.nationality_vocab = nationality_vocab
        self.max_seq_length = max_seq_length

    def save(self, filename):
        vec_dict = {"surname_vocab": self.surname_vocab.get_serializable_contents(),
                    "nationality_vocab": self.nationality_vocab.get_serializable_contents(),
                    "max_seq_length": self.max_seq_length}
        with open(filename, "w") as fp:
            json.dump(vec_dict, fp)

    @classmethod
    def load(cls, filename):
        with open(filename, "r") as fp:
            vec_dict = json.load(fp)
        vec_dict["surname_vocab"] = Vocabulary.deserialize_from_contents(vec_dict["surname_vocab"])
        vec_dict["nationality_vocab"] = Vocabulary.deserialize_from_contents(vec_dict["nationality_vocab"])
        return cls(**vec_dict)

    @classmethod
    def fit(cls, surname_df):
        """Build the character and nationality vocabularies from the surname dataframe."""
        surname_vocab = Vocabulary(use_unks=False,
                                   use_mask=True,
                                   use_start_end=True,
                                   start_token=settings.START_TOKEN,
                                   end_token=settings.END_TOKEN)
        nationality_vocab = Vocabulary(use_unks=False, use_start_end=False, use_mask=False)

        max_seq_length = 0
        for index, row in surname_df.iterrows():
            surname_vocab.add_many(row.surname)
            nationality_vocab.add(row.nationality)
            if len(row.surname) > max_seq_length:
                max_seq_length = len(row.surname)
        # +2 accounts for the start and end tokens
        max_seq_length = max_seq_length + 2

        return cls(surname_vocab, nationality_vocab, max_seq_length)

    @classmethod
    def fit_transform(cls, surname_df, split='train'):
        vectorizer = cls.fit(surname_df)
        return vectorizer, vectorizer.transform(surname_df, split)

    def transform(self, surname_df, split='train'):
        df = surname_df[surname_df.split == split].reset_index()
        n_data = len(df)

        x_surnames = np.zeros((n_data, self.max_seq_length), dtype=np.int64)
        y_surnames = np.ones((n_data, self.max_seq_length), dtype=np.int64) * settings.IGNORE_INDEX_VALUE
        x_nationalities = np.zeros(n_data, dtype=np.int64)

        for index, row in df.iterrows():
            vectorized_surname = list(self.surname_vocab.map(row.surname,
                                                             include_start_end=True))
            # inputs are the sequence minus the last token; targets are the sequence shifted left by one
            x_part = vectorized_surname[:-1]
            y_part = vectorized_surname[1:]
            x_surnames[index, :len(x_part)] = x_part
            y_surnames[index, :len(y_part)] = y_part
            x_nationalities[index] = self.nationality_vocab[row.nationality]

        return VectorizedSurnames(x_surnames, x_nationalities, y_surnames)

# vec data
class VectorizedSurnames(Dataset):
    def __init__(self, x_surnames, x_nationalities, y_surnames):
        self.x_surnames = x_surnames
        self.x_nationalities = x_nationalities
        self.y_surnames = y_surnames

    def __len__(self):
        return len(self.x_surnames)

    def __getitem__(self, index):
        return {'x_surnames': self.x_surnames[index],
                'x_nationalities': self.x_nationalities[index],
                'y_surnames': self.y_surnames[index],
                'x_lengths': len(self.x_surnames[index].nonzero()[0])}

# data generator
def make_generator(vectorized_data, batch_size, num_batches=-1,
                   num_workers=0, volatile_mode=False,
                   strict_batching=True):
    loaded_data = DataLoader(vectorized_data, batch_size=batch_size,
                             shuffle=True, num_workers=num_workers)

    def inner_func(num_batches=num_batches,
                   volatile_mode=volatile_mode):
        for batch_index, batch in enumerate(loaded_data):
            out = {}
            current_batch_size = list(batch.values())[0].size(0)
            if current_batch_size < batch_size and strict_batching:
                break
            for key, value in batch.items():
                if not isinstance(value, Variable):
                    # pre-0.4 PyTorch: volatile Variables skip autograd history during inference
                    value = Variable(value, volatile=volatile_mode)
                if settings.CUDA:
                    value = value.cuda()
                out[key] = value
            yield out
            if num_batches > 0 and batch_index > num_batches:
                break

    return inner_func
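To make the input/target construction in `transform` concrete: the vectorizer maps a surname to ids with start/end tokens, uses everything but the last token as the input and everything but the first as the target, zero-pads the input, and pads the target with `settings.IGNORE_INDEX_VALUE`. A minimal sketch with made-up token ids (the actual ids and the ignore value depend on the fitted Vocabulary and settings):

# hypothetical ids for "lee": <start>=1, l=7, e=8, <end>=2; mask index is 0
vectorized_surname = [1, 7, 8, 8, 2]
x_part = vectorized_surname[:-1]     # [1, 7, 8, 8]   -> fed to the model
y_part = vectorized_surname[1:]      # [7, 8, 8, 2]   -> next-character targets
x_row = [1, 7, 8, 8, 0]              # zero-padded up to max_seq_length
y_row = [7, 8, 8, 2, -100]           # padded with IGNORE_INDEX_VALUE (e.g. -100) so the loss skips it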
In [3]:
def new_parameter(*size):
    out = Parameter(FloatTensor(*size))
    torch.nn.init.xavier_normal(out)
    return out

class ExplicitRNN(nn.Module):
    def __init__(self, input_size, hidden_size, expect_batch_on_dim0=False):
        super(ExplicitRNN, self).__init__()
        self.W_in2hid = new_parameter(input_size, hidden_size)
        self.W_hid2hid = new_parameter(hidden_size, hidden_size)
        self.b_hid = new_parameter(1, hidden_size)
        self.hidden_size = hidden_size
        self.expect_batch_on_dim0 = expect_batch_on_dim0

    def _compute_next_hidden(self, x, h):
        return F.tanh(x.matmul(self.W_in2hid) +
                      h.matmul(self.W_hid2hid) +
                      self.b_hid)

    def forward(self, x_in, hid_t=None):
        if self.expect_batch_on_dim0:
            batch_size, seq_size, feat_size = x_in.size()
            x_in = x_in.permute(1, 0, 2)
        else:
            seq_size, batch_size, feat_size = x_in.size()

        hiddens = []
        if hid_t is None:
            hid_t = Variable(torch.zeros((batch_size, self.hidden_size)))
            if settings.CUDA:
                hid_t = hid_t.cuda()

        for t in range(seq_size):
            x_t = x_in[t]
            hid_t = self._compute_next_hidden(x_t, hid_t)
            hiddens.append(hid_t)
        hiddens = torch.stack(hiddens)

        if self.expect_batch_on_dim0:
            hiddens = hiddens.permute(1, 0, 2)

        return hiddens

class CharNN(nn.Module):
    def __init__(self, embedding_size, in_vocab_size, out_vocab_size, hidden_size,
                 num_conditioning_states, expect_batch_on_dim0=False):
        super(CharNN, self).__init__()

        self.emb = nn.Embedding(embedding_dim=embedding_size,
                                num_embeddings=in_vocab_size,
                                padding_idx=0)
        self.conditional_emb = nn.Embedding(embedding_dim=hidden_size,
                                            num_embeddings=num_conditioning_states)
        self.fc = nn.Linear(in_features=hidden_size, out_features=out_vocab_size)
        self.rnn = ExplicitRNN(input_size=embedding_size, hidden_size=hidden_size,
                               expect_batch_on_dim0=expect_batch_on_dim0)

    def forward(self, x_in, state_in, x_lengths=None, apply_softmax=False):
        x_in = self.emb(x_in)
        # the nationality embedding serves as the initial hidden state
        state_in = self.conditional_emb(state_in)
        y_out = self.rnn(x_in, state_in)

        dim0, dim1, dim2 = y_out.size()
        y_out = y_out.contiguous().view(-1, dim2)
        y_out = self.fc(y_out)

        # optionally apply the softmax
        if apply_softmax:
            y_out = F.softmax(y_out)

        y_out = y_out.view(dim0, dim1, -1)
        return y_out

def normalize_sizes(net_output, y_true):
    net_output = net_output.cpu()
    y_true = y_true.cpu()
    if len(net_output.size()) == 3:
        net_output = net_output.contiguous().view(-1, net_output.size(2))
    if len(y_true.size()) == 2:
        y_true = y_true.contiguous().view(-1)
    return net_output, y_true

def sequence_loss(net_output, y_true, loss_func=F.cross_entropy):
    net_output, y_true = normalize_sizes(net_output, y_true)
    return loss_func(net_output, y_true, ignore_index=settings.IGNORE_INDEX_VALUE)

def compute_accuracy(yhat, ytrue):
    yhat, ytrue = normalize_sizes(yhat, ytrue)
    _, yhat_indices = yhat.max(dim=1)
    n_correct = torch.eq(yhat_indices, ytrue).sum().data.numpy()[0]
    return n_correct / len(yhat_indices) * 100

def training_loop(net, datagen_func, optimizer, bar=None):
    if bar is None:
        bar = tqdm(position=2)
    accs = []
    for data_dictionary in datagen_func():
        net.zero_grad()
        optimizer.zero_grad()

        yhat = net(data_dictionary['x_surnames'],
                   data_dictionary['x_nationalities'],
                   data_dictionary['x_lengths'])
        loss = sequence_loss(yhat, data_dictionary['y_surnames'])
        accs.append(compute_accuracy(yhat, data_dictionary['y_surnames']))

        bar.update(1)
        bar.set_postfix(loss=loss.cpu().data.numpy()[0],
                        accuracy="{:0.2f}".format(np.mean(accs)))

        loss.backward()
        optimizer.step()

def val_loop(net, datagen_func, bar=None):
    if bar is None:
        bar = tqdm(position=1)
    accs = []
    for data_dictionary in datagen_func():
        yhat = net(data_dictionary['x_surnames'],
                   data_dictionary['x_nationalities'],
                   data_dictionary['x_lengths'], apply_softmax=True)
        accs.append(compute_accuracy(yhat, data_dictionary['y_surnames']))

        bar.update(1)
        bar.set_postfix(accuracy="{:0.2f}".format(np.mean(accs)))
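`sequence_loss` relies on `normalize_sizes` to flatten the (batch, sequence, vocabulary) output and the (batch, sequence) targets so `F.cross_entropy` can be applied per time step, with padded positions masked via `ignore_index`. A quick shape sanity check, using arbitrary sizes:

# arbitrary sizes: batch of 4, sequence length 12, 30-character vocabulary
fake_output = Variable(torch.randn(4, 12, 30))
fake_targets = Variable(torch.zeros(4, 12).long())
flat_output, flat_targets = normalize_sizes(fake_output, fake_targets)
print(flat_output.size(), flat_targets.size())   # (48, 30) and (48,)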
In [4]:
def sample(emb, rnn, fc, h_t=None, idx_t=None, n=20, temp=1):
    hiddens = [h_t]
    indices = [idx_t]
    out_dists = []

    for t in range(n):
        x_t = emb(idx_t)
        h_t = rnn._compute_next_hidden(x_t, h_t)
        y_t = fc(h_t)
        # temperature < 1 sharpens the distribution; > 1 flattens it
        y_t = F.softmax(y_t / temp)
        idx_t = torch.multinomial(y_t, 1)[:, 0]

        hiddens.append(h_t)
        indices.append(idx_t)
        out_dists.append(y_t)

    indices = torch.stack(indices).squeeze().permute(1, 0)
    return indices

def long_variable(indices):
    out = Variable(torch.LongTensor(indices))
    if settings.CUDA:
        out = out.cuda()
    return out

def make_initial_hidden(batch_size, hidden_size):
    out = Variable(torch.ones(batch_size, hidden_size))
    if settings.CUDA:
        out = out.cuda()
    return out

def make_initial_x(batch_size, vectorizer):
    out = Variable(torch.ones(batch_size) * vectorizer.surname_vocab.start_index).long()
    if settings.CUDA:
        out = out.cuda()
    return out

def decode_one(vectorizer, seq):
    out = []
    for i in seq:
        if vectorizer.surname_vocab.start_index == i:
            continue
        if vectorizer.surname_vocab.end_index == i:
            return ''.join(out)
        out.append(vectorizer.surname_vocab.lookup(i))
    return ''.join(out)

def decode_matrix(vectorizer, mat):
    mat = mat.cpu().data.numpy()
    return [decode_one(vectorizer, mat[i]) for i in range(len(mat))]

def n_random_nationalities(n):
    keys = np.random.choice(list(vectorizer.nationality_vocab.keys()), size=n, replace=True)
    indices = long_variable([vectorizer.nationality_vocab[key] for key in keys])
    return keys, indices

def sample_n(n=10, temp=0.8):
    init_names, init_vector = n_random_nationalities(n)
    init_vector = net.conditional_emb(init_vector)
    samples = decode_matrix(vectorizer,
                            sample(net.emb, net.rnn, net.fc,
                                   init_vector,
                                   make_initial_x(n, vectorizer),
                                   temp=temp))
    return list(zip(init_names, samples))

def sample_n_for_nationality(nationality, n=10, temp=0.8):
    assert nationality in vectorizer.nationality_vocab.keys(), 'not a nationality we trained on'
    keys = [nationality] * n
    init_vector = long_variable([vectorizer.nationality_vocab[key] for key in keys])
    init_vector = net.conditional_emb(init_vector)
    samples = decode_matrix(vectorizer,
                            sample(net.emb, net.rnn, net.fc,
                                   init_vector,
                                   make_initial_x(n, vectorizer),
                                   temp=temp))
    return list(zip(keys, samples))
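The `temp` argument in `sample` divides the logits before the softmax, so values below 1 sharpen the distribution (more conservative spellings) and values above 1 flatten it (more varied, noisier names). A rough illustration with made-up logits:

logits = Variable(FloatTensor([[2.0, 1.0, 0.5]]))   # made-up scores for a 3-character vocabulary
print(F.softmax(logits / 1.0))   # baseline distribution
print(F.softmax(logits / 0.5))   # temp < 1: probability mass concentrates on the top character
print(F.softmax(logits / 2.0))   # temp > 1: distribution flattens, sampling gets more diverse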
In [5]:
from settings import ZOO
import os

batch_size = 16

raw_data = RawSurnames().get_data()
zoo_info = ZOO.charnn_surname_conditioned_predicter

if os.path.exists(zoo_info['vocab']):
    vectorizer = SurnamesVectorizer.load(zoo_info['vocab'])
    print("Loading vectorizer!")
else:
    vectorizer = SurnamesVectorizer.fit(raw_data)
    print("Creating a new vectorizer.")

vec_train = vectorizer.transform(raw_data, split='train')
vec_test = vectorizer.transform(raw_data, split='test')

train_data_func = make_generator(vec_train, batch_size=batch_size)
test_data_func = make_generator(vec_test, batch_size=batch_size)

parameters = dict(zoo_info['parameters'])
parameters['in_vocab_size'] = len(vectorizer.surname_vocab)
parameters['out_vocab_size'] = len(vectorizer.surname_vocab)
parameters['expect_batch_on_dim0'] = True
parameters['num_conditioning_states'] = len(vectorizer.nationality_vocab)

net = CharNN(**parameters)

if os.path.exists(zoo_info['filename']):
    print("Loading state dict!")
    net.load_state_dict(torch.load(zoo_info['filename'], map_location=lambda storage, loc: storage))
else:
    print("Using newly initialized network!")

if settings.CUDA:
    print("CUDA mode enabled")
    net = net.cuda()
else:
    print("CUDA mode not enabled")
    net = net.cpu()
In [41]:
sample_n_for_nationality('russian')
Out[41]:
In [43]:
sample_n()
Out[43]:
In [44]:
# Train
n_epochs = 100
optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)
bar = tqdm_notebook(total=n_epochs, position=0)
valbar = tqdm_notebook(position=2)
trainbar = tqdm_notebook(position=3)

try:
    for _ in range(n_epochs):
        net.train(False)
        val_loop(net, test_data_func, bar=valbar)

        net.train(True)
        training_loop(net, train_data_func, optimizer, bar=trainbar)

        # sample a couple of names to monitor progress
        init_names, init_vector = n_random_nationalities(2)
        init_vector = net.conditional_emb(init_vector)
        samples = decode_matrix(vectorizer,
                                sample(net.emb, net.rnn, net.fc,
                                       init_vector,
                                       make_initial_x(2, vectorizer),
                                       temp=0.8))
        bar.update(1)
        postfix = dict(zip(init_names, samples))
        bar.set_postfix(**postfix)

    net.train(False)
    val_loop(net, test_data_func, valbar)
except KeyboardInterrupt:
    print("...")
In [59]:
sample_n(30, 0.9)
Out[59]: