In this notebook we repeat the experiments from the first BERT notebook, but this time we also feed the passage to the model: the input to BERT is now the passage followed by the text to classify, rather than the text alone.
Note that BERT only takes inputs with a maximum length of 512 tokens (after tokenization). This could become a problem now that we prepend a long passage, but our passages are typically well below that limit.
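Here is a minimal sketch for checking that assumption; it simply reuses the tokenizer, passage and train_data that are loaded in the next two cells, so run it after them:
# Tokenized length of passage + example text for every training example.
lengths = [len(tokenizer.encode("[CLS] " + passage + " " + item["text"] + " [SEP]"))
           for item in train_data]
print("Longest input:", max(lengths), "tokens")
print("Inputs over 512 tokens:", sum(l > 512 for l in lengths))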
In [1]:
import torch
from pytorch_transformers.tokenization_bert import BertTokenizer
from pytorch_transformers.modeling_bert import BertForSequenceClassification
BERT_MODEL = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)
In [2]:
import ndjson
import glob
train_files = glob.glob("../data/interim/eatingmeat_emma_train_withprompt*.ndjson")
dev_file = "../data/interim/eatingmeat_emma_dev_withprompt.ndjson"
test_file = "../data/interim/eatingmeat_emma_test_withprompt.ndjson"
passage_file = "../data/raw/eatingmeat_passage.txt"
train_data = []
for train_file in train_files:
    print(train_file)
    with open(train_file) as i:
        train_data += ndjson.load(i)

with open(dev_file) as i:
    dev_data = ndjson.load(i)

with open(test_file) as i:
    test_data = ndjson.load(i)

with open(passage_file) as i:
    passage = "".join(i.readlines())
Next, we build the label vocabulary, which maps every label in the training data to an index.
In [3]:
label2idx = {}
target_names = []
for item in train_data:
    if item["label"] not in label2idx:
        target_names.append(item["label"])
        label2idx[item["label"]] = len(label2idx)
label2idx
Out[3]:
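For inspecting predictions later on, it can also be handy to keep the reverse mapping around. This small addition is optional and not used by the training code itself:
idx2label = {idx: label for label, idx in label2idx.items()}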
In [4]:
model = BertForSequenceClassification.from_pretrained(BERT_MODEL, num_labels=len(label2idx))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
Out[4]:
We preprocess the data by turning every example into an InputFeatures item. This item has all the attributes we need for fine-tuning BERT:
In [5]:
import logging
import warnings
import numpy as np
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)
MAX_SEQ_LENGTH=512
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


def convert_examples_to_features(examples, passage, label2idx, max_seq_length, tokenizer, verbose=0):
    """Converts a list of examples into a list of `InputFeatures`."""
    features = []
    for (ex_index, ex) in enumerate(examples):
        # TODO: should deal better with sentences > max tok length
        input_ids = tokenizer.encode("[CLS] " + passage + " " + ex["text"] + " [SEP]")
        if len(input_ids) > max_seq_length:
            warnings.warn("Input longer than maximum sequence length.")
            input_ids = input_ids[:max_seq_length]

        segment_ids = [0] * len(input_ids)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        label_id = label2idx[ex["label"]]

        if verbose and ex_index == 0:
            logger.info("*** Example ***")
            logger.info("text: %s" % ex["text"])
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: " + str(ex["label"]) + " id: " + str(label_id))

        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_id=label_id))
    return features
train_features = convert_examples_to_features(train_data, passage, label2idx, MAX_SEQ_LENGTH, tokenizer, verbose=0)
dev_features = convert_examples_to_features(dev_data, passage, label2idx, MAX_SEQ_LENGTH, tokenizer)
test_features = convert_examples_to_features(test_data, passage, label2idx, MAX_SEQ_LENGTH, tokenizer, verbose=1)
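As an optional sanity check, the following sketch (reusing only the variables defined above) confirms that every split was converted and that all inputs were padded or truncated to the same length:
print(len(train_features), len(dev_features), len(test_features))
# Every feature should now have exactly MAX_SEQ_LENGTH input ids.
assert all(len(f.input_ids) == MAX_SEQ_LENGTH for f in train_features + dev_features + test_features)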
Next, we initialize a data loader for each of our data sets. These data loaders prepare the data for the model, for example by grouping it into batches and shuffling it.
In [6]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
def get_data_loader(features, max_seq_length, batch_size):
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

    data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    sampler = RandomSampler(data, replacement=False)
    dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size)
    return dataloader
BATCH_SIZE = 2
train_dataloader = get_data_loader(train_features, MAX_SEQ_LENGTH, BATCH_SIZE)
dev_dataloader = get_data_loader(dev_features, MAX_SEQ_LENGTH, BATCH_SIZE)
test_dataloader = get_data_loader(test_features, MAX_SEQ_LENGTH, BATCH_SIZE)
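To verify the loaders, this optional sketch pulls one batch from the training loader and prints the tensor shapes; with BATCH_SIZE = 2 and MAX_SEQ_LENGTH = 512 the first three tensors should be 2 x 512 and the label tensor should have 2 entries:
batch = next(iter(train_dataloader))
for name, tensor in zip(["input_ids", "input_mask", "segment_ids", "label_ids"], batch):
    print(name, tuple(tensor.shape))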
In [7]:
def evaluate(model, dataloader):
    eval_loss = 0
    nb_eval_steps = 0
    predicted_labels, correct_labels = [], []

    for step, batch in enumerate(tqdm(dataloader, desc="Evaluation iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        with torch.no_grad():
            # Pass the inputs by keyword so the call does not depend on the
            # positional argument order of the model's forward() method.
            tmp_eval_loss, logits = model(input_ids,
                                          token_type_ids=segment_ids,
                                          attention_mask=input_mask,
                                          labels=label_ids)

        outputs = np.argmax(logits.to('cpu'), axis=1)
        label_ids = label_ids.to('cpu').numpy()

        predicted_labels += list(outputs)
        correct_labels += list(label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps

    correct_labels = np.array(correct_labels)
    predicted_labels = np.array(predicted_labels)

    return eval_loss, correct_labels, predicted_labels
In [8]:
from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule
GRADIENT_ACCUMULATION_STEPS = 8
NUM_TRAIN_EPOCHS = 20
LEARNING_RATE = 1e-5
WARMUP_PROPORTION = 0.1
def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x / warmup
    return 1.0 - x


num_train_steps = int(len(train_data) / BATCH_SIZE / GRADIENT_ACCUMULATION_STEPS * NUM_TRAIN_EPOCHS)

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, correct_bias=False)
# Note: the training loop below sets the learning rate manually with warmup_linear;
# the scheduler is created here but never stepped.
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=100, t_total=num_train_steps)
Now we do the actual training. In each epoch, we present the model with all the training data and compute the loss on both the training set and the development set. We save the model whenever the development loss improves, and we stop training when the development loss has not improved for a given number of epochs (the patience).
Optionally, we use gradient accumulation to collect the gradients over several training steps before updating the weights. This is useful when we want a larger effective batch size than the GPU memory allows.
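With the settings above, the optimizer only takes a step every GRADIENT_ACCUMULATION_STEPS batches, so the effective batch size is BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS = 2 * 8 = 16. Below is a minimal, self-contained illustration of the pattern on a toy model (the toy model and numbers are purely illustrative and not part of this notebook's pipeline):
import torch

ACCUMULATION_STEPS = 8

toy_model = torch.nn.Linear(10, 2)
toy_optimizer = torch.optim.SGD(toy_model.parameters(), lr=0.01)

for step in range(32):
    x = torch.randn(2, 10)                      # a mini-batch of 2 examples
    y = torch.randint(0, 2, (2,))
    loss = torch.nn.functional.cross_entropy(toy_model(x), y)
    (loss / ACCUMULATION_STEPS).backward()      # gradients add up across mini-batches
    if (step + 1) % ACCUMULATION_STEPS == 0:
        toy_optimizer.step()                    # one update per 8 mini-batches (effective batch size 16)
        toy_optimizer.zero_grad()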
In [9]:
import os
from tqdm import trange
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import classification_report, precision_recall_fscore_support
OUTPUT_DIR = "/tmp/"
MODEL_FILE_NAME = "pytorch_model.bin"
PATIENCE = 5
global_step = 0
model.train()
loss_history = []
best_epoch = 0

for epoch in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(tqdm(train_dataloader, desc="Training iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        # Pass the inputs by keyword so the call does not depend on the
        # positional argument order of the model's forward() method.
        outputs = model(input_ids,
                        token_type_ids=segment_ids,
                        attention_mask=input_mask,
                        labels=label_ids)
        loss = outputs[0]

        if GRADIENT_ACCUMULATION_STEPS > 1:
            loss = loss / GRADIENT_ACCUMULATION_STEPS

        loss.backward()
        tr_loss += loss.item()

        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            lr_this_step = LEARNING_RATE * warmup_linear(global_step / num_train_steps, WARMUP_PROPORTION)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

    # Evaluate on the development set with dropout disabled.
    model.eval()
    dev_loss, _, _ = evaluate(model, dev_dataloader)
    model.train()

    print("Loss history:", loss_history)
    print("Dev loss:", dev_loss)

    if len(loss_history) == 0 or dev_loss < min(loss_history):
        model_to_save = model.module if hasattr(model, 'module') else model
        output_model_file = os.path.join(OUTPUT_DIR, MODEL_FILE_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        best_epoch = epoch

    if epoch - best_epoch >= PATIENCE:
        print("No improvement on development set. Finish training.")
        break

    loss_history.append(dev_loss)
In [10]:
print("Loading model from", output_model_file)
device="cpu"
model_state_dict = torch.load(output_model_file, map_location=lambda storage, loc: storage)
model = BertForSequenceClassification.from_pretrained(BERT_MODEL, state_dict=model_state_dict, num_labels=len(label2idx))
model.to(device)
model.eval()
_, train_correct, train_predicted = evaluate(model, train_dataloader)
_, dev_correct, dev_predicted = evaluate(model, dev_dataloader)
_, test_correct, test_predicted = evaluate(model, test_dataloader)
print("Training performance:", precision_recall_fscore_support(train_correct, train_predicted, average="micro"))
print("Development performance:", precision_recall_fscore_support(dev_correct, dev_predicted, average="micro"))
print("Test performance:", precision_recall_fscore_support(test_correct, test_predicted, average="micro"))
print(classification_report(test_correct, test_predicted, target_names=target_names))
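Finally, an optional sketch that prints a few gold/predicted pairs by label name, using the idx2label mapping defined earlier. Note that the data loaders shuffle the data, so these pairs do not follow the original order of the test set:
for gold, pred in list(zip(test_correct, test_predicted))[:10]:
    print("gold:", idx2label[gold], "- predicted:", idx2label[pred])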