For these experiments we use the pytorch_transformers package. It contains a variety of neural network architectures for transfer learning, together with pretrained models such as BERT and XLNet.
Two different BERT models are relevant for our experiments: the base model (bert-base-uncased) and its larger counterpart. The larger model requires a much smaller batch size, which we compensate for with gradient accumulation.
In [1]:
import torch
from pytorch_transformers.tokenization_bert import BertTokenizer
from pytorch_transformers.modeling_bert import BertForSequenceClassification

BERT_MODEL = 'bert-base-uncased'

# For models larger than bert-base we use a much smaller batch size
# and compensate with gradient accumulation.
BATCH_SIZE = 16 if "base" in BERT_MODEL else 2
GRADIENT_ACCUMULATION_STEPS = 1 if "base" in BERT_MODEL else 8

tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)
In [2]:
import ndjson
import glob

prefix = "junkfood_but"

train_file = f"../data/interim/{prefix}_train_withprompt_diverse200.ndjson"
synth_files = glob.glob(f"../data/interim/{prefix}_train_withprompt_*.ndjson")
dev_file = f"../data/interim/{prefix}_dev_withprompt.ndjson"
test_file = f"../data/interim/{prefix}_test_withprompt.ndjson"

with open(train_file) as i:
    train_data = ndjson.load(i)

synth_data = []
for f in synth_files:
    if "allsynth" in f:
        continue
    with open(f) as i:
        synth_data += ndjson.load(i)

with open(dev_file) as i:
    dev_data = ndjson.load(i)

with open(test_file) as i:
    test_data = ndjson.load(i)
Next, we build the label vocabulary, which maps every label in the training data to an index.
In [3]:
label2idx = {}
idx2label = {}
target_names = []

for item in train_data:
    if item["label"] not in label2idx:
        target_names.append(item["label"])
        idx = len(label2idx)
        label2idx[item["label"]] = idx
        idx2label[idx] = item["label"]

print(label2idx)
print(idx2label)
In [4]:
import random

def sample(train_data, synth_data, label2idx, number):
    """Sample a fixed number of items for every label from
    the training data and the synthetic data.
    """
    new_train_data = []
    for label in label2idx:
        data_for_label = [i for i in train_data if i["label"] == label]
        # If there is more training data than the required number,
        # take a random sample of n examples from the training data.
        if len(data_for_label) >= number:
            random.shuffle(data_for_label)
            new_train_data += data_for_label[:number]
        # If there is less training data than the required number,
        # combine training data with synthetic data.
        else:
            # Automatically add all training data.
            new_train_data += data_for_label
            # Compute the required number of additional examples.
            rest = number - len(data_for_label)
            # Collect the synthetic data for the label.
            synth_data_for_label = [i for i in synth_data if i["label"] == label]
            # If there is more synthetic data than required,
            # take a random sample from the synthetic data.
            if len(synth_data_for_label) > rest:
                random.shuffle(synth_data_for_label)
                new_train_data += synth_data_for_label[:rest]
            # If there is less synthetic data than required,
            # sample with replacement from this data until we have
            # the required number.
            else:
                new_train_data += random.choices(synth_data_for_label, k=rest)
    return new_train_data


def random_sample(train_data, train_size):
    random.shuffle(train_data)
    train_data = train_data[:train_size]
    return train_data


#train_data = train_data + synth_data
#train_data = sample(train_data, synth_data, label2idx, 200)
#train_data = random_sample(train_data, 200)

print("Train data size:", len(train_data))
In [5]:
model = BertForSequenceClassification.from_pretrained(BERT_MODEL, num_labels=len(label2idx))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
We preprocess the data by turning every example into an InputFeatures item. This item has all the attributes we need for fine-tuning BERT: the token ids, the input mask, the segment ids and the label id.
In [6]:
import logging
import numpy as np

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

MAX_SEQ_LENGTH = 100

class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


def convert_examples_to_features(examples, label2idx, max_seq_length, tokenizer, verbose=0):
    """Loads a data file into a list of `InputFeatures`."""
    features = []
    for (ex_index, ex) in enumerate(examples):
        # TODO: should deal better with sentences > max tok length
        input_ids = tokenizer.encode("[CLS] " + ex["text"] + " [SEP]")
        segment_ids = [0] * len(input_ids)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        label_id = label2idx[ex["label"]]

        if verbose and ex_index == 0:
            logger.info("*** Example ***")
            logger.info("text: %s" % ex["text"])
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: " + str(ex["label"]) + " id: " + str(label_id))

        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_id=label_id))

    return features


train_features = convert_examples_to_features(train_data, label2idx, MAX_SEQ_LENGTH, tokenizer, verbose=0)
dev_features = convert_examples_to_features(dev_data, label2idx, MAX_SEQ_LENGTH, tokenizer)
test_features = convert_examples_to_features(test_data, label2idx, MAX_SEQ_LENGTH, tokenizer, verbose=1)
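To see what these features look like in practice, here is a small optional check. It uses a made-up example sentence (not one from our data sets) and simply mirrors what convert_examples_to_features does, assuming the tokenizer and MAX_SEQ_LENGTH defined above:
In [ ]:
# Optional check with a made-up sentence: encode it the same way as above
# and inspect the padded ids and mask.
toy_text = "Junk food should be taxed."
toy_ids = tokenizer.encode("[CLS] " + toy_text + " [SEP]")
toy_mask = [1] * len(toy_ids) + [0] * (MAX_SEQ_LENGTH - len(toy_ids))
toy_ids = toy_ids + [0] * (MAX_SEQ_LENGTH - len(toy_ids))

print(len(toy_ids), len(toy_mask))                    # both equal MAX_SEQ_LENGTH
print(tokenizer.convert_ids_to_tokens(toy_ids[:10]))  # the beginning of the padded token sequence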
Next, we initialize data loaders for each of our data sets. These data loaders present the data for training, for example by grouping it into batches; the quick sanity check after the next cell shows what one such batch looks like.
In [7]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

def get_data_loader(features, max_seq_length, batch_size, shuffle=True):
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

    data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    dataloader = DataLoader(data, shuffle=shuffle, batch_size=batch_size)
    return dataloader


train_dataloader = get_data_loader(train_features, MAX_SEQ_LENGTH, BATCH_SIZE)
dev_dataloader = get_data_loader(dev_features, MAX_SEQ_LENGTH, BATCH_SIZE)
test_dataloader = get_data_loader(test_features, MAX_SEQ_LENGTH, BATCH_SIZE, shuffle=False)
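As an optional sanity check (assuming the cells above have been run), we can pull one batch from the training loader and verify that the tensors have the expected shapes:
In [ ]:
# Grab a single batch from the training loader and check the tensor shapes.
batch_input_ids, batch_input_mask, batch_segment_ids, batch_label_ids = next(iter(train_dataloader))
print(batch_input_ids.shape)    # (BATCH_SIZE, MAX_SEQ_LENGTH)
print(batch_input_mask.shape)   # (BATCH_SIZE, MAX_SEQ_LENGTH)
print(batch_segment_ids.shape)  # (BATCH_SIZE, MAX_SEQ_LENGTH)
print(batch_label_ids.shape)    # (BATCH_SIZE,)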
In [8]:
from tqdm import tqdm_notebook as tqdm

def evaluate(model, dataloader, verbose=False):
    eval_loss = 0
    nb_eval_steps = 0
    predicted_labels, correct_labels = [], []

    for step, batch in enumerate(tqdm(dataloader, desc="Evaluation iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        with torch.no_grad():
            tmp_eval_loss, logits = model(input_ids,
                                          attention_mask=input_mask,
                                          token_type_ids=segment_ids,
                                          labels=label_ids)

        outputs = np.argmax(logits.to('cpu'), axis=1)
        label_ids = label_ids.to('cpu').numpy()

        predicted_labels += list(outputs)
        correct_labels += list(label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    correct_labels = np.array(correct_labels)
    predicted_labels = np.array(predicted_labels)

    return eval_loss, correct_labels, predicted_labels
In [9]:
from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule

NUM_TRAIN_EPOCHS = 20
LEARNING_RATE = 1e-5
WARMUP_PROPORTION = 0.1

def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x / warmup
    return 1.0 - x

num_train_steps = int(len(train_data) / BATCH_SIZE / GRADIENT_ACCUMULATION_STEPS * NUM_TRAIN_EPOCHS)

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, correct_bias=False)
# Note: the training loop below sets the learning rate manually with warmup_linear.
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=100, t_total=num_train_steps)
Now we do the actual training. In each epoch, we present the model with all training data and compute the loss on the training set and the development set. We save the model whenever the development loss improves, and we stop training when we haven't seen an improvement in the development loss for a fixed number of epochs (the patience).
Optionally, we use gradient accumulation to accumulate the gradients of several training steps before updating the parameters. This is useful when we want to train with a larger effective batch size than our GPU memory allows.
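To make this concrete with the settings defined earlier: the effective batch size is the per-step batch size multiplied by the number of accumulation steps, so both configurations update the weights with the same amount of data.
In [ ]:
# Gradients from GRADIENT_ACCUMULATION_STEPS consecutive mini-batches are summed
# before the optimizer takes a step, so one update sees this many examples:
effective_batch_size = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
print(effective_batch_size)   # 16 * 1 = 16 for bert-base, 2 * 8 = 16 for the large model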
In [10]:
import os
from tqdm import trange
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import classification_report, precision_recall_fscore_support

OUTPUT_DIR = "/tmp/"
MODEL_FILE_NAME = "pytorch_model.bin"
PATIENCE = 5

global_step = 0
loss_history = []
best_epoch = 0

for epoch in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"):
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(tqdm(train_dataloader, desc="Training iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        outputs = model(input_ids, attention_mask=input_mask,
                        token_type_ids=segment_ids, labels=label_ids)
        loss = outputs[0]

        if GRADIENT_ACCUMULATION_STEPS > 1:
            loss = loss / GRADIENT_ACCUMULATION_STEPS

        loss.backward()
        tr_loss += loss.item()

        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            lr_this_step = LEARNING_RATE * warmup_linear(global_step / num_train_steps, WARMUP_PROPORTION)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step

            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

    # Evaluate on the development set after every epoch.
    model.eval()
    dev_loss, _, _ = evaluate(model, dev_dataloader)

    print("Loss history:", loss_history)
    print("Dev loss:", dev_loss)

    # Save the model whenever the development loss improves.
    if len(loss_history) == 0 or dev_loss < min(loss_history):
        model_to_save = model.module if hasattr(model, 'module') else model
        output_model_file = os.path.join(OUTPUT_DIR, MODEL_FILE_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        best_epoch = epoch

    # Stop training when the development loss has not improved for PATIENCE epochs.
    if epoch - best_epoch >= PATIENCE:
        print("No improvement on development set. Finish training.")
        break

    loss_history.append(dev_loss)
In [11]:
print("Loading model from", output_model_file)
device="cpu"
model_state_dict = torch.load(output_model_file, map_location=lambda storage, loc: storage)
model = BertForSequenceClassification.from_pretrained(BERT_MODEL, state_dict=model_state_dict, num_labels=len(label2idx))
model.to(device)
model.eval()
#_, train_correct, train_predicted = evaluate(model, train_dataloader)
#_, dev_correct, dev_predicted = evaluate(model, dev_dataloader)
_, test_correct, test_predicted = evaluate(model, test_dataloader, verbose=True)
#print("Training performance:", precision_recall_fscore_support(train_correct, train_predicted, average="micro"))
#print("Development performance:", precision_recall_fscore_support(dev_correct, dev_predicted, average="micro"))
print("Test performance:", precision_recall_fscore_support(test_correct, test_predicted, average="micro"))
print(classification_report(test_correct, test_predicted, target_names=target_names))
In [12]:
c = 0
for item, predicted, correct in zip(test_data, test_predicted, test_correct):
    assert item["label"] == idx2label[correct]
    c += (item["label"] == idx2label[predicted])
    print("{}#{}#{}".format(item["text"], idx2label[correct], idx2label[predicted]))

print(c)
print(c / len(test_data))
In [ ]: