In [2]:
    
import json
from local_settings import settings, datautils
from datautils.vocabulary import Vocabulary
import pandas as pd
import numpy as np
from ast import literal_eval
import torch
from torch import FloatTensor
from torch import nn
from torch.autograd import Variable
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm, tqdm_notebook
    
In [3]:
    
class RawTrumpTweets(object):
    def __init__(self, data_path=settings.TRUMP_FILENAME):
        self.data = pd.read_csv(data_path)
        
    def get_data(self):
        return self.data  
# vectorizer
class TrumpTweetVectorizer(object):
    def __init__(self, word_vocab, max_seq_length):
        self.word_vocab = word_vocab
        self.max_seq_length = max_seq_length
        
    def save(self, filename):
        vec_dict = {"word_vocab": self.word_vocab.get_serializable_contents(),
                    'max_seq_length': self.max_seq_length}
        with open(filename, "w") as fp:
            json.dump(vec_dict, fp)
        
    @classmethod
    def load(cls, filename):
        with open(filename, "r") as fp:
            vec_dict = json.load(fp)
        vec_dict["word_vocab"] = Vocabulary.deserialize_from_contents(vec_dict["word_vocab"])
        return cls(**vec_dict)
    @classmethod
    def fit(cls, tweet_df):
        """Build a Vocabulary over every tweet and record the longest
        sequence length (plus 2 to account for the start/end tokens)."""
        vocab = Vocabulary(use_unks=False,
                           use_start_end=True,
                           use_mask=True,
                           start_token=settings.START_TOKEN,
                           end_token=settings.END_TOKEN)
        max_seq_length = 0
        for text in tweet_df.text:
            split_text = text.split(" ")
            vocab.add_many(split_text)
            if len(split_text) > max_seq_length:
                max_seq_length = len(split_text)
        max_seq_length = max_seq_length + 2
        return cls(vocab, max_seq_length)
    @classmethod
    def fit_transform(cls, tweet_df, split='train'):
        vectorizer = cls.fit(tweet_df)
        return vectorizer, vectorizer.transform(tweet_df, split)
    def transform(self, tweet_df, split='train'):
        tweet_df = tweet_df[tweet_df.split==split].reset_index()
        num_data = len(tweet_df)
        
        x_words = np.zeros((num_data, self.max_seq_length), dtype=np.int64)
        y_words = np.ones((num_data, self.max_seq_length), dtype=np.int64)
        for index, row in tweet_df.iterrows():
            converted = list(self.word_vocab.map(row.text.split(' '), include_start_end=True))
            # language-model targets: y is x shifted one token to the left
            x_version = converted[:-1]
            y_version = converted[1:]
            
            x_words[index, :len(x_version)] = x_version
            y_words[index, :len(y_version)] = y_version
            
        return VectorizedTrumpTweets(x_words, y_words)
# vec data
class VectorizedTrumpTweets(Dataset):
    def __init__(self, x_words, y_words):
        self.x_words = x_words
        self.y_words = y_words
    def __len__(self):
        return len(self.x_words)
    def __getitem__(self, index):
        return {'x_words': self.x_words[index],
                'y_words': self.y_words[index],
                # number of non-padding (nonzero) tokens in the input row
                'x_lengths': len(self.x_words[index].nonzero()[0])}
# data generator
def make_generator(vectorized_data, batch_size, num_batches=-1, 
                               num_workers=0, volatile_mode=False, 
                               strict_batching=True):
    loaded_data = DataLoader(vectorized_data, batch_size=batch_size, 
                             shuffle=True, num_workers=num_workers)
    def inner_func(num_batches=num_batches, 
                   volatile_mode=volatile_mode):
        for batch_index, batch in enumerate(loaded_data):
            out = {}
            current_batch_size = list(batch.values())[0].size(0)
            if current_batch_size < batch_size and strict_batching:
                break
            for key, value in batch.items():
                if not isinstance(value, Variable):
                    # volatile must be set when the Variable is constructed
                    value = Variable(value, volatile=volatile_mode)
                if settings.CUDA:
                    value = value.cuda()
                out[key] = value
            yield out
            if num_batches > 0 and batch_index + 1 >= num_batches:
                break
    return inner_func
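
As a usage sketch (assuming the vec_train dataset built further down), the returned function is called once per epoch and yields ready-to-use batches:

train_data_func = make_generator(vec_train, batch_size=16)
for batch in train_data_func():
    print(batch['x_words'].size())   # e.g. torch.Size([16, max_seq_length])
    break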
    
In [4]:
    
def new_parameter(*size):
    out = Parameter(FloatTensor(*size))
    torch.nn.init.xavier_normal(out)
    return out
class ExplicitRNN(nn.Module):
    def __init__(self, input_size, hidden_size, expect_batch_on_dim0=False):
        super(ExplicitRNN, self).__init__()
        self.W_in2hid = new_parameter(input_size, hidden_size)
        self.W_hid2hid = new_parameter(hidden_size, hidden_size)
            
        self.b_hid = new_parameter(1, hidden_size)
        
        self.hidden_size = hidden_size
        self.expect_batch_on_dim0 = expect_batch_on_dim0
    
    def _compute_next_hidden(self, x, h):
        # Elman update: h_t = tanh(x_t W_in2hid + h_{t-1} W_hid2hid + b_hid)
        return F.tanh(x.matmul(self.W_in2hid) + 
                      h.matmul(self.W_hid2hid) + 
                      self.b_hid)
    def forward(self, x_in, hid_t=None):
        if self.expect_batch_on_dim0:
            batch_size, seq_size, feat_size = x_in.size()
            x_in = x_in.permute(1, 0, 2)
        else:
            seq_size, batch_size, feat_size = x_in.size()
        hiddens = []
        if hid_t is None:
            hid_t = Variable(torch.zeros((batch_size, self.hidden_size)))
        
        if settings.CUDA:
            hid_t = hid_t.cuda()
            
        for t in range(seq_size):
            x_t = x_in[t]
            hid_t = self._compute_next_hidden(x_t, hid_t)
            
            hiddens.append(hid_t)
        hiddens = torch.stack(hiddens)
        if self.expect_batch_on_dim0:
            hiddens = hiddens.permute(1, 0, 2)
        return hiddens
    
class WordRNN(nn.Module):
    def __init__(self, embedding_size, in_vocab_size, out_vocab_size, hidden_size, 
                 expect_batch_on_dim0=True):
        super(WordRNN, self).__init__()
        
        self.emb = nn.Embedding(embedding_dim=embedding_size, 
                                num_embeddings=in_vocab_size, 
                                padding_idx=0)
        self.fc = nn.Linear(in_features=hidden_size, out_features=out_vocab_size)
        self.rnn = ExplicitRNN(input_size=embedding_size, hidden_size=hidden_size, 
                               expect_batch_on_dim0=expect_batch_on_dim0)
    
    def forward(self, x_in, x_lengths=None, apply_softmax=False):
        x_in = self.emb(x_in)
        y_out = self.rnn(x_in)
        dim0, dim1, dim2 = y_out.size()
        y_out = y_out.contiguous().view(-1, dim2)
        y_out = self.fc(y_out)
        # optionally apply the softmax
        if apply_softmax:
            y_out = F.softmax(y_out)
        y_out = y_out.view(dim0, dim1, -1)
        
        return y_out
    
def normalize_sizes(net_output, y_true):
    net_output = net_output.cpu()
    y_true = y_true.cpu()
    if len(net_output.size()) == 3:
        net_output = net_output.contiguous().view(-1, net_output.size(2))
    if len(y_true.size()) == 2:
        y_true = y_true.contiguous().view(-1)
    return net_output, y_true
def sequence_loss(net_output, y_true, loss_func=F.cross_entropy):
    net_output, y_true = normalize_sizes(net_output, y_true)
    return loss_func(net_output, y_true, ignore_index=settings.IGNORE_INDEX_VALUE)
def compute_accuracy(yhat, ytrue):
    yhat, ytrue = normalize_sizes(yhat, ytrue)
    _, yhat_indices = yhat.max(dim=1)
    n_correct = torch.eq(yhat_indices, ytrue).sum().data.numpy()[0]
    return n_correct / len(yhat_indices) * 100
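
# Quick shape check (illustrative only): normalize_sizes flattens a
# (batch, seq, vocab) prediction and a (batch, seq) target so that
# cross-entropy sees a 2D score matrix and a 1D index vector.
_demo_out, _demo_true = normalize_sizes(Variable(torch.randn(4, 10, 32)),
                                        Variable(torch.zeros(4, 10).long()))
assert _demo_out.size() == (4 * 10, 32) and _demo_true.size() == (4 * 10,)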
def training_loop(net, datagen_func, optimizer, bar=None):
    if bar is None:
        bar = tqdm(position=2)
    accs = []
    for data_dictionary in datagen_func():
        net.zero_grad()
        optimizer.zero_grad()
        
        yhat = net(data_dictionary['x_words'], data_dictionary['x_lengths'])
        loss = sequence_loss(yhat, data_dictionary['y_words'])
        accs.append(compute_accuracy(yhat, data_dictionary['y_words']))
        
        bar.update(1)
        bar.set_postfix(loss=loss.cpu().data.numpy()[0], 
                        accuracy="{:0.2f}".format(np.mean(accs)))
        
        loss.backward()
        optimizer.step()
          
def val_loop(net, datagen_func, bar=None):
    if bar is None:
        bar = tqdm(position=1)
    accs = []
    for data_dictionary in datagen_func():
        yhat = net(data_dictionary['x_words'], data_dictionary['x_lengths'], apply_softmax=True)
        accs.append(compute_accuracy(yhat, data_dictionary['y_words']))
        bar.update(1)
        bar.set_postfix(accuracy="{:0.2f}".format(np.mean(accs)))
    
In [5]:
    
def sample(emb, rnn, fc, h_t=None, idx_t=None, n=20, temp=1):
    hiddens = [h_t]
    indices = [idx_t]
    out_dists = []
    
    for t in range(n):
        x_t = emb(idx_t)
        h_t = rnn._compute_next_hidden(x_t, h_t)
        
        # temperature-scaled softmax over the vocabulary, then sample the next index
        y_t = fc(h_t)
        y_t = F.softmax(y_t / temp)
        idx_t = torch.multinomial(y_t, 1)[:, 0]
        
        hiddens.append(h_t)
        indices.append(idx_t)
        out_dists.append(y_t)
     
    indices = torch.stack(indices).squeeze().permute(1, 0)
    return indices
def make_initial_hidden(batch_size, hidden_size):
    out = Variable(torch.ones(batch_size, hidden_size))
    if settings.CUDA:
        out = out.cuda()
    return out
def make_initial_x(batch_size, vectorizer):
    out = Variable(torch.ones(batch_size) * vectorizer.word_vocab.start_index).long()
    if settings.CUDA:
        out = out.cuda()
    return out
def decode_one(vectorizer, seq):
    out = []
    for i in seq:
        if vectorizer.word_vocab.start_index == i:
            continue
        if vectorizer.word_vocab.end_index == i:
            return ' '.join(out)
        out.append(vectorizer.word_vocab.lookup(i))
    return ' '.join(out)
            
def decode_matrix(vectorizer, mat):
    mat = mat.cpu().data.numpy()
    return [decode_one(vectorizer, mat[i]) for i in range(len(mat))]
    
In [6]:
    
from settings import ZOO
import os
batch_size = 16
raw_data = RawTrumpTweets().get_data()
zoo_info = ZOO.wordrnn_trump_tweet_predicter
if os.path.exists(zoo_info['vocab']):
    vectorizer = TrumpTweetVectorizer.load(zoo_info['vocab'])
    print("Loading vectorizer!")
else:
    vectorizer = TrumpTweetVectorizer.fit(raw_data)
    print("Creating a new vectorizer.")
vec_train = vectorizer.transform(raw_data, split='train')
vec_test = vectorizer.transform(raw_data, split='test')
parameters = dict(zoo_info['parameters'])    
parameters['in_vocab_size'] = len(vectorizer.word_vocab)
parameters['out_vocab_size'] = len(vectorizer.word_vocab)
parameters['expect_batch_on_dim0'] = True
net = WordRNN(**parameters)
if settings.CUDA:
    print("CUDA mode enabled")
    net = net.cuda()
else:
    print("CUDA mode not enabled")
    net = net.cpu()
    
    
In [7]:
    
decode_matrix(vectorizer, 
              sample(net.emb, net.rnn, net.fc, 
                     make_initial_hidden(batch_size, parameters['hidden_size']), 
                     make_initial_x(batch_size, vectorizer),
                     temp=0.8))
    
    Out[7]:
In [8]:
    
FORCE_FRESH_INIT = False
if os.path.exists(zoo_info['filename']) and not FORCE_FRESH_INIT:
    print("Loading state dict!")
    net.load_state_dict(torch.load(zoo_info['filename'], map_location=lambda storage, loc: storage))
else:
    print("Using newly initiated network!")
    
    
In [9]:
    
decode_matrix(vectorizer, 
              sample(net.emb, net.rnn, net.fc, 
                     make_initial_hidden(batch_size, parameters['hidden_size']), 
                     make_initial_x(batch_size, vectorizer),
                     temp=0.8))
    
    Out[9]:
In [10]:
    
# Train
n_epochs = 100
optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)
bar = tqdm_notebook(total=n_epochs, position=0)
valbar = tqdm_notebook(position=2)
trainbar = tqdm_notebook(position=3)
train_data_func = make_generator(vec_train, batch_size=batch_size)
test_data_func = make_generator(vec_test, batch_size=batch_size)
try:
    
    for _ in range(n_epochs):
        net.train(False)
        val_loop(net, test_data_func, bar=valbar)
        net.train(True)
        training_loop(net, train_data_func, optimizer, bar=trainbar)
        samples = decode_matrix(vectorizer, 
                                sample(net.emb, net.rnn, net.fc, 
                                       make_initial_hidden(2, parameters['hidden_size']), 
                                       make_initial_x(2, vectorizer),
                                       temp=0.8))
        
        bar.update(1)
        bar.set_postfix(sample0=samples[0], sample1=samples[1])
    net.train(False)
    val_loop(net, test_data_func, valbar)
except KeyboardInterrupt:
    print("...")
    
    
In [60]:
    
def load_word_vectors(filename=settings.GLOVE_FILENAME):
    word_to_index = {}
    word_vectors = []
    
    with open(filename) as fp:
        for line in tqdm(fp.readlines()):
            line = line.split(" ")
            
            word = line[0]
            word_to_index[word] = len(word_to_index)
            
            vec = np.array([float(x) for x in line[1:]])
            word_vectors.append(vec)
    word_vector_size = len(word_vectors[0])
    return word_to_index, word_vectors, word_vector_size
word_to_index, word_vectors, word_vector_size = load_word_vectors()
    
    
Now, we want to collate the word vectors we just loaded with what is in our vocabulary!
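For instance, a quick (hypothetical) coverage check before copying any vectors in:

n_covered = sum(word.lower() in word_to_index
                for word, _ in vectorizer.word_vocab.items())
print(n_covered, "of", len(vectorizer.word_vocab), "vocabulary words have a GloVe vector")
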
In [8]:
    
net.emb.weight.size()
    
    Out[8]:
In [73]:
    
net = WordRNN(**parameters)
if settings.CUDA:
    print("CUDA mode enabled")
    net = net.cuda()
else:
    print("CUDA mode not enabled")
    net = net.cpu()
    
    
In [80]:
    
n = 0
for word, emb_index in tqdm_notebook(vectorizer.word_vocab.items()):
    if word.lower() in word_to_index:
        n += 1
        glove_index = word_to_index[word.lower()]
        glove_vec = torch.FloatTensor(word_vectors[glove_index])
        if settings.CUDA:
            glove_vec = glove_vec.cuda()
        net.emb.weight.data[emb_index, :].set_(glove_vec)
print(n, 'replaced')
    
    
In [85]:
    
# Train
n_epochs = 100
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
bar = tqdm_notebook(total=n_epochs, position=0)
valbar = tqdm_notebook(position=2, desc='validation data')
trainbar = tqdm_notebook(position=3, desc='training data')
batch_size=16
train_data_func = make_generator(vec_train, batch_size=batch_size)
test_data_func = make_generator(vec_test, batch_size=batch_size)
try:
    
    for _ in range(n_epochs):
        samples = decode_matrix(vectorizer, 
                                sample(net.emb, net.rnn, net.fc, 
                                       make_initial_hidden(2, parameters['hidden_size']), 
                                       make_initial_x(2, vectorizer),
                                       temp=0.8))
        bar.set_postfix(sample0=samples[0], sample1=samples[1])
        
        net.train(False)
        val_loop(net, test_data_func, bar=valbar)
        net.train(True)
        training_loop(net, train_data_func, optimizer, bar=trainbar)
        
        bar.update(1)
    net.train(False)
    val_loop(net, test_data_func, valbar)
except KeyboardInterrupt:
    print("...")
    
    
In [11]:
    
batch_size=100
decode_matrix(vectorizer, 
              sample(net.emb, net.rnn, net.fc, 
                     make_initial_hidden(batch_size, parameters['hidden_size']), 
                     make_initial_x(batch_size, vectorizer),
                     temp=0.85))
    
    Out[11]: