In [2]:
import json
from local_settings import settings, datautils
from datautils.vocabulary import Vocabulary
import pandas as pd
import numpy as np
from ast import literal_eval
import torch
from torch import FloatTensor
from torch import nn
from torch.autograd import Variable
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm, tqdm_notebook
In [3]:
class RawTrumpTweets(object):
    def __init__(self, data_path=settings.TRUMP_FILENAME):
        self.data = pd.read_csv(data_path)

    def get_data(self):
        return self.data
# vectorizer
class TrumpTweetVectorizer(object):
    def __init__(self, word_vocab, max_seq_length):
        self.word_vocab = word_vocab
        self.max_seq_length = max_seq_length

    def save(self, filename):
        vec_dict = {"word_vocab": self.word_vocab.get_serializable_contents(),
                    "max_seq_length": self.max_seq_length}
        with open(filename, "w") as fp:
            json.dump(vec_dict, fp)

    @classmethod
    def load(cls, filename):
        with open(filename, "r") as fp:
            vec_dict = json.load(fp)
        vec_dict["word_vocab"] = Vocabulary.deserialize_from_contents(vec_dict["word_vocab"])
        return cls(**vec_dict)

    @classmethod
    def fit(cls, tweet_df):
        """Build a word vocabulary from the tweet text and record the longest sequence length."""
        vocab = Vocabulary(use_unks=False,
                           use_start_end=True,
                           use_mask=True,
                           start_token=settings.START_TOKEN,
                           end_token=settings.END_TOKEN)
        max_seq_length = 0
        for text in tweet_df.text:
            split_text = text.split(" ")
            vocab.add_many(split_text)
            if len(split_text) > max_seq_length:
                max_seq_length = len(split_text)
        # +2 accounts for the start and end tokens wrapped around each tweet
        max_seq_length = max_seq_length + 2
        return cls(vocab, max_seq_length)

    @classmethod
    def fit_transform(cls, tweet_df, split='train'):
        vectorizer = cls.fit(tweet_df)
        return vectorizer, vectorizer.transform(tweet_df, split)

    def transform(self, tweet_df, split='train'):
        tweet_df = tweet_df[tweet_df.split == split].reset_index()
        num_data = len(tweet_df)

        x_words = np.zeros((num_data, self.max_seq_length), dtype=np.int64)
        y_words = np.ones((num_data, self.max_seq_length), dtype=np.int64)

        for index, row in tweet_df.iterrows():
            converted = list(self.word_vocab.map(row.text.split(' '), include_start_end=True))
            x_version = converted[:-1]
            y_version = converted[1:]
            x_words[index, :len(x_version)] = x_version
            y_words[index, :len(y_version)] = y_version

        return VectorizedTrumpTweets(x_words, y_words)
# vec data
class VectorizedTrumpTweets(Dataset):
    def __init__(self, x_words, y_words):
        self.x_words = x_words
        self.y_words = y_words

    def __len__(self):
        return len(self.x_words)

    def __getitem__(self, index):
        return {'x_words': self.x_words[index],
                'y_words': self.y_words[index],
                'x_lengths': len(self.x_words[index].nonzero()[0])}
# data generator
def make_generator(vectorized_data, batch_size, num_batches=-1,
                   num_workers=0, volatile_mode=False,
                   strict_batching=True):
    loaded_data = DataLoader(vectorized_data, batch_size=batch_size,
                             shuffle=True, num_workers=num_workers)

    def inner_func(num_batches=num_batches,
                   volatile_mode=volatile_mode):
        for batch_index, batch in enumerate(loaded_data):
            out = {}
            current_batch_size = list(batch.values())[0].size(0)
            if current_batch_size < batch_size and strict_batching:
                break
            for key, value in batch.items():
                if not isinstance(value, Variable):
                    # volatile is a construction-time flag on Variable, not a method
                    value = Variable(value, volatile=volatile_mode)
                if settings.CUDA:
                    value = value.cuda()
                out[key] = value
            yield out
            if num_batches > 0 and batch_index > num_batches:
                break

    return inner_func
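To make the language-modeling setup concrete: transform() maps each tweet to ids (with start/end markers), then writes all-but-the-last ids into x_words and all-but-the-first ids into y_words, so each target position holds the word that follows the corresponding input position. A minimal sketch with made-up integer ids (not the real Vocabulary output):

import numpy as np

# toy ids: 0 = mask/pad, 2 = <START>, 3 = <END>; 5, 7, 9 stand in for real words
converted = [2, 5, 7, 9, 3]          # output of vocab.map(..., include_start_end=True)
max_seq_length = 6                   # longest tweet + 2, as computed in fit()

x_row = np.zeros(max_seq_length, dtype=np.int64)
y_row = np.ones(max_seq_length, dtype=np.int64)   # transform() fills y_words with ones
x_row[:len(converted) - 1] = converted[:-1]       # <START>, w1, w2, w3
y_row[:len(converted) - 1] = converted[1:]        # w1, w2, w3, <END>

print(x_row)   # [2 5 7 9 0 0]
print(y_row)   # [5 7 9 3 1 1]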
In [4]:
def new_parameter(*size):
    out = Parameter(FloatTensor(*size))
    torch.nn.init.xavier_normal(out)
    return out
class ExplicitRNN(nn.Module):
    def __init__(self, input_size, hidden_size, expect_batch_on_dim0=False):
        super(ExplicitRNN, self).__init__()
        self.W_in2hid = new_parameter(input_size, hidden_size)
        self.W_hid2hid = new_parameter(hidden_size, hidden_size)
        self.b_hid = new_parameter(1, hidden_size)
        self.hidden_size = hidden_size
        self.expect_batch_on_dim0 = expect_batch_on_dim0

    def _compute_next_hidden(self, x, h):
        return F.tanh(x.matmul(self.W_in2hid) +
                      h.matmul(self.W_hid2hid) +
                      self.b_hid)

    def forward(self, x_in, hid_t=None):
        if self.expect_batch_on_dim0:
            batch_size, seq_size, feat_size = x_in.size()
            x_in = x_in.permute(1, 0, 2)
        else:
            seq_size, batch_size, feat_size = x_in.size()

        hiddens = []
        if hid_t is None:
            hid_t = Variable(torch.zeros((batch_size, self.hidden_size)))
            if settings.CUDA:
                hid_t = hid_t.cuda()

        for t in range(seq_size):
            x_t = x_in[t]
            hid_t = self._compute_next_hidden(x_t, hid_t)
            hiddens.append(hid_t)
        hiddens = torch.stack(hiddens)

        if self.expect_batch_on_dim0:
            hiddens = hiddens.permute(1, 0, 2)

        return hiddens
class WordRNN(nn.Module):
    def __init__(self, embedding_size, in_vocab_size, out_vocab_size, hidden_size,
                 expect_batch_on_dim0=True):
        super(WordRNN, self).__init__()
        self.emb = nn.Embedding(embedding_dim=embedding_size,
                                num_embeddings=in_vocab_size,
                                padding_idx=0)
        self.fc = nn.Linear(in_features=hidden_size, out_features=out_vocab_size)
        self.rnn = ExplicitRNN(input_size=embedding_size, hidden_size=hidden_size,
                               expect_batch_on_dim0=expect_batch_on_dim0)

    def forward(self, x_in, x_lengths=None, apply_softmax=False):
        x_in = self.emb(x_in)
        y_out = self.rnn(x_in)

        dim0, dim1, dim2 = y_out.size()
        y_out = y_out.contiguous().view(-1, dim2)
        y_out = self.fc(y_out)

        # optionally apply the softmax
        if apply_softmax:
            y_out = F.softmax(y_out)

        y_out = y_out.view(dim0, dim1, -1)
        return y_out
def normalize_sizes(net_output, y_true):
    net_output = net_output.cpu()
    y_true = y_true.cpu()
    if len(net_output.size()) == 3:
        net_output = net_output.contiguous().view(-1, net_output.size(2))
    if len(y_true.size()) == 2:
        y_true = y_true.contiguous().view(-1)
    return net_output, y_true

def sequence_loss(net_output, y_true, loss_func=F.cross_entropy):
    net_output, y_true = normalize_sizes(net_output, y_true)
    return loss_func(net_output, y_true, ignore_index=settings.IGNORE_INDEX_VALUE)

def compute_accuracy(yhat, ytrue):
    yhat, ytrue = normalize_sizes(yhat, ytrue)
    _, yhat_indices = yhat.max(dim=1)
    n_correct = torch.eq(yhat_indices, ytrue).sum().data.numpy()[0]
    return n_correct / len(yhat_indices) * 100
def training_loop(net, datagen_func, optimizer, bar=None):
    if bar is None:
        bar = tqdm(position=2)
    accs = []
    for data_dictionary in datagen_func():
        net.zero_grad()
        optimizer.zero_grad()

        yhat = net(data_dictionary['x_words'], data_dictionary['x_lengths'])
        loss = sequence_loss(yhat, data_dictionary['y_words'])
        accs.append(compute_accuracy(yhat, data_dictionary['y_words']))

        bar.update(1)
        bar.set_postfix(loss=loss.cpu().data.numpy()[0],
                        accuracy="{:0.2f}".format(np.mean(accs)))

        loss.backward()
        optimizer.step()

def val_loop(net, datagen_func, bar=None):
    if bar is None:
        bar = tqdm(position=1)
    accs = []
    for data_dictionary in datagen_func():
        yhat = net(data_dictionary['x_words'], data_dictionary['x_lengths'], apply_softmax=True)
        accs.append(compute_accuracy(yhat, data_dictionary['y_words']))
        bar.update(1)
        bar.set_postfix(accuracy="{:0.2f}".format(np.mean(accs)))
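As a sanity check on the shapes involved, the sketch below runs a tiny WordRNN with arbitrarily chosen sizes (not the zoo parameters) on a fake batch and shows how normalize_sizes flattens the (batch, sequence, vocab) scores and the (batch, sequence) targets into the 2-D/1-D pair that F.cross_entropy expects. It assumes the cells above have been run and settings.CUDA is False (otherwise move the model and tensors to the GPU first).

# hypothetical sizes: batch of 4, 10 time steps, vocabulary of 50, embedding 8, hidden 16
tiny_net = WordRNN(embedding_size=8, in_vocab_size=50, out_vocab_size=50,
                   hidden_size=16, expect_batch_on_dim0=True)
fake_x = Variable(torch.LongTensor(4, 10).random_(0, 50))
fake_y = Variable(torch.LongTensor(4, 10).random_(0, 50))

out = tiny_net(fake_x)
print(out.size())                        # (4, 10, 50): a score per vocabulary word at each position

flat_out, flat_y = normalize_sizes(out, fake_y)
print(flat_out.size(), flat_y.size())    # (40, 50) and (40,): ready for F.cross_entropy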
In [5]:
def sample(emb, rnn, fc, h_t=None, idx_t=None, n=20, temp=1):
    hiddens = [h_t]
    indices = [idx_t]
    out_dists = []

    for t in range(n):
        x_t = emb(idx_t)
        h_t = rnn._compute_next_hidden(x_t, h_t)
        y_t = fc(h_t)
        y_t = F.softmax(y_t / temp)
        idx_t = torch.multinomial(y_t, 1)[:, 0]

        hiddens.append(h_t)
        indices.append(idx_t)
        out_dists.append(y_t)

    indices = torch.stack(indices).squeeze().permute(1, 0)
    return indices
def make_initial_hidden(batch_size, hidden_size):
    out = Variable(torch.ones(batch_size, hidden_size))
    if settings.CUDA:
        out = out.cuda()
    return out

def make_initial_x(batch_size, vectorizer):
    out = Variable(torch.ones(batch_size) * vectorizer.word_vocab.start_index).long()
    if settings.CUDA:
        out = out.cuda()
    return out

def decode_one(vectorizer, seq):
    out = []
    for i in seq:
        if vectorizer.word_vocab.start_index == i:
            continue
        if vectorizer.word_vocab.end_index == i:
            return ' '.join(out)
        out.append(vectorizer.word_vocab.lookup(i))
    return ' '.join(out)

def decode_matrix(vectorizer, mat):
    mat = mat.cpu().data.numpy()
    return [decode_one(vectorizer, mat[i]) for i in range(len(mat))]
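The temp argument in sample() divides the logits before the softmax, so temperatures below 1 sharpen the output distribution toward the most likely next word and temperatures above 1 flatten it toward uniform. A small self-contained illustration with toy logits, using the same old-style F.softmax call as the code above:

toy_logits = Variable(torch.FloatTensor([[2.0, 1.0, 0.1]]))
for temp in (0.5, 1.0, 2.0):
    probs = F.softmax(toy_logits / temp)
    print(temp, probs.data.numpy().round(3))
# lower temp -> probability mass concentrates on the highest logit;
# higher temp -> the distribution approaches uniform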
In [6]:
from settings import ZOO
import os

batch_size = 16
raw_data = RawTrumpTweets().get_data()

zoo_info = ZOO.wordrnn_trump_tweet_predicter
if os.path.exists(zoo_info['vocab']):
    vectorizer = TrumpTweetVectorizer.load(zoo_info['vocab'])
    print("Loading vectorizer!")
else:
    vectorizer = TrumpTweetVectorizer.fit(raw_data)
    print("Creating a new vectorizer.")

vec_train = vectorizer.transform(raw_data, split='train')
vec_test = vectorizer.transform(raw_data, split='test')

parameters = dict(zoo_info['parameters'])
parameters['in_vocab_size'] = len(vectorizer.word_vocab)
parameters['out_vocab_size'] = len(vectorizer.word_vocab)
parameters['expect_batch_on_dim0'] = True

net = WordRNN(**parameters)
if settings.CUDA:
    print("CUDA mode enabled")
    net = net.cuda()
else:
    print("CUDA mode not enabled")
    net = net.cpu()
In [7]:
decode_matrix(vectorizer,
              sample(net.emb, net.rnn, net.fc,
                     make_initial_hidden(batch_size, parameters['hidden_size']),
                     make_initial_x(batch_size, vectorizer),
                     temp=0.8))
Out[7]:
In [8]:
FORCE_FRESH_INIT = False

if os.path.exists(zoo_info['filename']) and not FORCE_FRESH_INIT:
    print("Loading state dict!")
    net.load_state_dict(torch.load(zoo_info['filename'], map_location=lambda storage, loc: storage))
else:
    print("Using newly initialized network!")
In [9]:
decode_matrix(vectorizer,
              sample(net.emb, net.rnn, net.fc,
                     make_initial_hidden(batch_size, parameters['hidden_size']),
                     make_initial_x(batch_size, vectorizer),
                     temp=0.8))
Out[9]:
In [10]:
# Train
n_epochs = 100
optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)
bar = tqdm_notebook(total=n_epochs, position=0)
valbar = tqdm_notebook(position=2)
trainbar = tqdm_notebook(position=3)

train_data_func = make_generator(vec_train, batch_size=batch_size)
test_data_func = make_generator(vec_test, batch_size=batch_size)

try:
    for _ in range(n_epochs):
        net.train(False)
        val_loop(net, test_data_func, bar=valbar)
        net.train(True)
        training_loop(net, train_data_func, optimizer, bar=trainbar)

        samples = decode_matrix(vectorizer,
                                sample(net.emb, net.rnn, net.fc,
                                       make_initial_hidden(2, parameters['hidden_size']),
                                       make_initial_x(2, vectorizer),
                                       temp=0.8))
        bar.update(1)
        bar.set_postfix(sample0=samples[0], sample1=samples[1])

    net.train(False)
    val_loop(net, test_data_func, valbar)
except KeyboardInterrupt:
    print("...")
In [60]:
def load_word_vectors(filename=settings.GLOVE_FILENAME):
    word_to_index = {}
    word_vectors = []

    with open(filename) as fp:
        for line in tqdm(fp.readlines()):
            line = line.split(" ")
            word = line[0]
            word_to_index[word] = len(word_to_index)
            vec = np.array([float(x) for x in line[1:]])
            word_vectors.append(vec)

    word_vector_size = len(word_vectors[0])
    return word_to_index, word_vectors, word_vector_size

word_to_index, word_vectors, word_vector_size = load_word_vectors()
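load_word_vectors() assumes the usual GloVe text format: one word per line followed by its vector components, all separated by single spaces. A tiny illustration of that parsing on a made-up line (real entries are typically 100- or 300-dimensional):

fake_line = "hello 0.1 -0.2 0.3"                   # a hypothetical 3-dimensional entry
parts = fake_line.split(" ")
print(parts[0])                                    # 'hello'
print(np.array([float(x) for x in parts[1:]]))     # [ 0.1 -0.2  0.3]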
Now, we want to collate what we have from the word vectors with what is in our vocabulary!
In [8]:
net.emb.weight.size()
Out[8]:
In [73]:
net = WordRNN(**parameters)
if settings.CUDA:
    print("CUDA mode enabled")
    net = net.cuda()
else:
    print("CUDA mode not enabled")
    net = net.cpu()
In [80]:
n = 0
for word, emb_index in tqdm_notebook(vectorizer.word_vocab.items()):
    if word.lower() in word_to_index:
        n += 1
        glove_index = word_to_index[word.lower()]
        glove_vec = torch.FloatTensor(word_vectors[glove_index])
        if settings.CUDA:
            glove_vec = glove_vec.cuda()
        net.emb.weight.data[emb_index, :].set_(glove_vec)
print(n, 'replaced')
In [85]:
# Train
n_epochs = 100
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
bar = tqdm_notebook(total=n_epochs, position=0)
valbar = tqdm_notebook(position=2, desc='validation data')
trainbar = tqdm_notebook(position=3, desc='training data')
batch_size = 16

train_data_func = make_generator(vec_train, batch_size=batch_size)
test_data_func = make_generator(vec_test, batch_size=batch_size)

try:
    for _ in range(n_epochs):
        samples = decode_matrix(vectorizer,
                                sample(net.emb, net.rnn, net.fc,
                                       make_initial_hidden(2, parameters['hidden_size']),
                                       make_initial_x(2, vectorizer),
                                       temp=0.8))
        bar.set_postfix(sample0=samples[0], sample1=samples[1])

        net.train(False)
        val_loop(net, test_data_func, bar=valbar)
        net.train(True)
        training_loop(net, train_data_func, optimizer, bar=trainbar)
        bar.update(1)

    net.train(False)
    val_loop(net, test_data_func, valbar)
except KeyboardInterrupt:
    print("...")
In [11]:
batch_size = 100
decode_matrix(vectorizer,
              sample(net.emb, net.rnn, net.fc,
                     make_initial_hidden(batch_size, parameters['hidden_size']),
                     make_initial_x(batch_size, vectorizer),
                     temp=0.85))
Out[11]: