In this notebook, we continue our BERT experiments: we fine-tune a single BERT model on several of our data sets at once, which makes our solution easier to deploy in production.
As a first test, we'll just train a BERT model that takes as input a response from any of several data sets, and outputs probabilities for all labels in all data sets. This is slightly suboptimal (after all, we don't need probabilities for labels that are not relevant to a specific prompt), but as long as we're not working with thousands of different labels, I don't think this is very problematic.
The setup and preprocessing procedure is very similar to that in the first "Bert experiments" notebook. I'll highlight the areas where it is different.
In [1]:
import torch
from pytorch_transformers.tokenization_bert import BertTokenizer
from pytorch_transformers.modeling_bert import BertForSequenceClassification
BERT_MODEL = 'bert-large-uncased'
BATCH_SIZE = 16 if "base" in BERT_MODEL else 2
GRADIENT_ACCUMULATION_STEPS = 1 if "base" in BERT_MODEL else 8
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)
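Both settings give the same effective batch size: the large model uses smaller batches but accumulates gradients over more steps before each optimizer update. A quick illustrative check:

effective_batch_size = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
assert effective_batch_size == 16  # 16 * 1 for bert-base, 2 * 8 for bert-large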
Since we build one "big" model, we combine the training data from all of our input files. We keep the test files separate, because we want to be able to evaluate each prompt separately.
We also remember which labels are relevant for each prompt, because in the prediction phase we only want to look at the probabilities of those labels.
In [2]:
import ndjson
import glob

file_prefixes = ["eatingmeat_but_large", "eatingmeat_because_large",
                 "junkfood_but", "junkfood_because"]

train_data = []
dev_data = []
test_data = {}
label2idx = {}
target_names = {}

for prefix in file_prefixes:
    train_files = glob.glob(f"../data/interim/{prefix}_train_withprompt*.ndjson")
    dev_file = f"../data/interim/{prefix}_dev_withprompt.ndjson"
    test_file = f"../data/interim/{prefix}_test_withprompt.ndjson"

    target_names[prefix] = []
    for train_file in train_files:
        with open(train_file) as i:
            new_train_data = ndjson.load(i)
            for item in new_train_data:
                if item["label"] not in label2idx:
                    target_names[prefix].append(item["label"])
                    label2idx[item["label"]] = len(label2idx)
            train_data += new_train_data

    with open(dev_file) as i:
        dev_data += ndjson.load(i)

    with open(test_file) as i:
        test_data[prefix] = ndjson.load(i)
In [3]:
print(label2idx)
print(target_names)
In [4]:
model = BertForSequenceClassification.from_pretrained(BERT_MODEL, num_labels=len(label2idx))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
Out[4]:
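As a quick illustrative sanity check on the size of the model we just loaded (bert-large has on the order of 340M parameters):

print(sum(p.numel() for p in model.parameters()))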
In [5]:
import logging
import numpy as np

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

MAX_SEQ_LENGTH = 100

class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id

def convert_examples_to_features(examples, label2idx, max_seq_length, tokenizer, verbose=0):
    """Loads a data file into a list of `InputFeatures`."""
    features = []
    for (ex_index, ex) in enumerate(examples):

        # TODO: should deal better with sentences > max tok length
        input_ids = tokenizer.encode("[CLS] " + ex["text"] + " [SEP]")
        input_ids = input_ids[:max_seq_length]  # naive truncation of over-long responses

        segment_ids = [0] * len(input_ids)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        label_id = label2idx[ex["label"]]

        if verbose and ex_index == 0:
            logger.info("*** Example ***")
            logger.info("text: %s" % ex["text"])
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s id: %s" % (ex["label"], label_id))

        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_id=label_id))
    return features

train_features = convert_examples_to_features(train_data, label2idx, MAX_SEQ_LENGTH, tokenizer, verbose=0)
dev_features = convert_examples_to_features(dev_data, label2idx, MAX_SEQ_LENGTH, tokenizer)

test_features = {}
for prefix in test_data:
    test_features[prefix] = convert_examples_to_features(test_data[prefix], label2idx, MAX_SEQ_LENGTH, tokenizer, verbose=1)
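A quick illustrative sanity check that every example was padded (or truncated) to the fixed length:

assert all(len(f.input_ids) == MAX_SEQ_LENGTH for f in train_features + dev_features)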
In [6]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
def get_data_loader(features, max_seq_length, batch_size):
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

    data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    sampler = RandomSampler(data)
    dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size)
    return dataloader

train_dataloader = get_data_loader(train_features, MAX_SEQ_LENGTH, BATCH_SIZE)
dev_dataloader = get_data_loader(dev_features, MAX_SEQ_LENGTH, BATCH_SIZE)

test_dataloaders = {}
for prefix in test_features:
    test_dataloaders[prefix] = get_data_loader(test_features[prefix], MAX_SEQ_LENGTH, BATCH_SIZE)
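Each batch these loaders yield is a tuple of four tensors (input ids, attention mask, segment ids, labels); an illustrative peek at their shapes:

batch = next(iter(train_dataloader))
print([tuple(t.shape) for t in batch])
# e.g. [(2, 100), (2, 100), (2, 100), (2,)] with BATCH_SIZE = 2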
In [7]:
from tqdm import tqdm_notebook as tqdm

def evaluate(model, dataloader):
    eval_loss = 0
    nb_eval_steps = 0
    predicted_labels, correct_labels = [], []

    for step, batch in enumerate(tqdm(dataloader, desc="Evaluation iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        with torch.no_grad():
            tmp_eval_loss, logits = model(input_ids,
                                          token_type_ids=segment_ids,
                                          attention_mask=input_mask,
                                          labels=label_ids)[:2]

        outputs = np.argmax(logits.to('cpu').numpy(), axis=1)
        label_ids = label_ids.to('cpu').numpy()

        predicted_labels += list(outputs)
        correct_labels += list(label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    correct_labels = np.array(correct_labels)
    predicted_labels = np.array(predicted_labels)

    return eval_loss, correct_labels, predicted_labels
In [8]:
from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule

NUM_TRAIN_EPOCHS = 100
LEARNING_RATE = 1e-5
WARMUP_PROPORTION = 0.1

def warmup_linear(x, warmup=0.002):
    """Linear warmup followed by linear decay; x is the fraction of training completed."""
    if x < warmup:
        return x / warmup
    return 1.0 - x

num_train_steps = int(len(train_data) / BATCH_SIZE / GRADIENT_ACCUMULATION_STEPS * NUM_TRAIN_EPOCHS)

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, correct_bias=False)
# Note: the training loop below sets the learning rate manually with warmup_linear;
# the WarmupLinearSchedule is kept here as an alternative but is not stepped.
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=100, t_total=num_train_steps)
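To see the shape of this schedule, here is an illustrative evaluation of warmup_linear at a few points of training (with WARMUP_PROPORTION = 0.1):

for frac in [0.0, 0.05, 0.1, 0.5, 1.0]:
    print(frac, warmup_linear(frac, WARMUP_PROPORTION))
# 0.0 -> 0.0, 0.05 -> 0.5, 0.1 -> 0.9, 0.5 -> 0.5, 1.0 -> 0.0 (fraction of LEARNING_RATE)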
In [9]:
import os
OUTPUT_DIR = "/tmp/"
MODEL_FILE_NAME = "pytorch_model.bin"
output_model_file = os.path.join(OUTPUT_DIR, MODEL_FILE_NAME)
In [10]:
from tqdm import trange
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import classification_report, precision_recall_fscore_support
PATIENCE = 5
global_step = 0
model.train()
loss_history = []
best_epoch = 0
for epoch in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(tqdm(train_dataloader, desc="Training iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        outputs = model(input_ids,
                        token_type_ids=segment_ids,
                        attention_mask=input_mask,
                        labels=label_ids)
        loss = outputs[0]

        if GRADIENT_ACCUMULATION_STEPS > 1:
            loss = loss / GRADIENT_ACCUMULATION_STEPS

        loss.backward()

        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            # Manual warmup followed by linear decay of the learning rate.
            lr_this_step = LEARNING_RATE * warmup_linear(global_step / num_train_steps, WARMUP_PROPORTION)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

    dev_loss, _, _ = evaluate(model, dev_dataloader)

    print("Loss history:", loss_history)
    print("Dev loss:", dev_loss)

    # Save the model whenever the development loss improves.
    if len(loss_history) == 0 or dev_loss < min(loss_history):
        model_to_save = model.module if hasattr(model, 'module') else model
        torch.save(model_to_save.state_dict(), output_model_file)
        best_epoch = epoch

    # Early stopping: stop when the development loss has not improved for PATIENCE epochs.
    if epoch - best_epoch >= PATIENCE:
        print("No improvement on development set. Finish training.")
        break

    loss_history.append(dev_loss)
In [11]:
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import classification_report, precision_recall_fscore_support
device="cpu"
print("Loading model from", output_model_file)
model_state_dict = torch.load(output_model_file, map_location=lambda storage, loc: storage)
model = BertForSequenceClassification.from_pretrained(BERT_MODEL, state_dict=model_state_dict, num_labels=len(label2idx))
model.to(device)
model.eval()
#_, train_correct, train_predicted = evaluate(model, train_dataloader)
#_, dev_correct, dev_predicted = evaluate(model, dev_dataloader)
#print("Training performance:", precision_recall_fscore_support(train_correct, train_predicted, average="micro"))
#print("Development performance:", precision_recall_fscore_support(dev_correct, dev_predicted, average="micro"))
for prefix in test_dataloaders:
    print(prefix)
    _, test_correct, test_predicted = evaluate(model, test_dataloaders[prefix])
    print("Test performance:", precision_recall_fscore_support(test_correct, test_predicted, average="micro"))
    print(classification_report(test_correct, test_predicted, target_names=target_names[prefix]))
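As noted at the start of the notebook, at prediction time we only want to consider the labels that are relevant to a given prompt. The evaluation above still takes the argmax over all labels; a minimal sketch of the restriction, assuming logits holds the model outputs for one batch of responses to prompt prefix:

relevant_ids = [label2idx[label] for label in target_names[prefix]]
masked_logits = torch.full_like(logits, float("-inf"))
masked_logits[:, relevant_ids] = logits[:, relevant_ids]
predictions = masked_logits.argmax(dim=1)  # argmax restricted to the prompt's own labels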
In [ ]: