In [0]:
from pathlib import Path
In [2]:
!pip install transformers
In [0]:
from transformers import BertTokenizer
import pandas as pd
import numpy as np
import torch
In [4]:
from google.colab import drive
drive.mount('/content/drive')
home_dir = Path("/content/drive/My Drive/ml/transformers-experiment")
In [5]:
cd $home_dir
In [0]:
data_url = "data/leadsherpa-all.csv"
column_names = ['text', 'dnc', 'verified_status']
sentence_col, _, label_col = column_names
# data_df = pd.read_csv(data_url, names=column_names)
# data_df.shape
In [0]:
def read_data_file(data_path, column_names=None, sentence_col=None, label_col=None, sep=','):
    ''' reads a single file into a dataframe '''
    data_df = pd.read_csv(data_path, names=column_names, sep=sep)
    return (
        data_df,
        data_df[sentence_col].str.lower().values,
        data_df[label_col].str.lower().values
    )
In [8]:
data_df, sentences, labels = read_data_file(
    data_url,
    column_names=column_names,
    sentence_col='text',
    label_col='verified_status'
)
sentences, labels = data_df['text'].str.lower().values, data_df['verified_status'].values
labels = np.array([1 if x == 'verified' else 0 for x in labels])
label_names_ids = {0: 'unverified', 1: 'verified'}
sentences.shape, labels.shape, label_names_ids
Out[8]:
In [9]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f'There are {torch.cuda.device_count()} GPUs available')
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the cpu instead...')
    device = torch.device('cpu')
In [0]:
# data_df.to_csv('/content/drive/My Drive/ml/leadsherpa-all.csv', index=False)
In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
input_ids = []
MAX_LEN = 32
attention_masks = []
for sentence in sentences:
    encoded_sentence = tokenizer.encode(
        text=sentence,
        add_special_tokens=True,
        max_length=MAX_LEN,
        pad_to_max_length=True
    )
    attention_masks.append([0 if x == 0 else 1 for x in encoded_sentence])
    input_ids.append(encoded_sentence)
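As a quick sanity check (illustrative only, not part of the original flow), you can inspect one encoded sentence and its mask; the padding token id for bert-base-uncased is 0, which is why the mask rule above works.
In [ ]:
# Illustrative sanity check: assumes the loop above encoded at least one sentence.
# Padded positions have token id 0, so they get mask value 0.
print(tokenizer.convert_ids_to_tokens(input_ids[0]))
print(input_ids[0])
print(attention_masks[0])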
In [11]:
from sklearn.model_selection import train_test_split

train_inputs, valid_inputs, train_labels, valid_labels = train_test_split(
    input_ids, labels, random_state=2018, test_size=0.1
)
train_masks, valid_masks = train_test_split(
    attention_masks, random_state=2018, test_size=0.1
)
print(len(train_inputs), len(valid_inputs), len(valid_masks), len(train_masks))

train_input_tensor = torch.tensor(train_inputs)
train_label_tensor = torch.tensor(train_labels)
train_mask_tensor = torch.tensor(train_masks)
valid_input_tensor = torch.tensor(valid_inputs)
valid_label_tensor = torch.tensor(valid_labels)
valid_mask_tensor = torch.tensor(valid_masks)
print(train_input_tensor.shape,
      train_label_tensor.shape,
      train_mask_tensor.shape,
      valid_input_tensor.shape,
      valid_label_tensor.shape,
      valid_mask_tensor.shape,
)
In [0]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# DataLoader requires batch_size for training.
# Recommended batch sizes are 16 and 32.
BATCH_SIZE = 32

train_data = TensorDataset(
    train_input_tensor, train_mask_tensor, train_label_tensor
)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data, sampler=train_sampler, batch_size=BATCH_SIZE
)

valid_data = TensorDataset(
    valid_input_tensor, valid_mask_tensor, valid_label_tensor
)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(
    valid_data, sampler=valid_sampler, batch_size=BATCH_SIZE  # without batch_size, DataLoader defaults to 1
)
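Optionally, a quick check (illustrative only) that the loaders yield batches of the expected shapes:
In [ ]:
# Optional sanity check: pull one training batch and confirm tensor shapes.
sample_ids, sample_masks, sample_labels = next(iter(train_dataloader))
# Expected: (BATCH_SIZE, MAX_LEN), (BATCH_SIZE, MAX_LEN), (BATCH_SIZE,)
print(sample_ids.shape, sample_masks.shape, sample_labels.shape)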
In [13]:
from transformers import BertForSequenceClassification, AdamW, BertConfig
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',  # 12-layer BERT model with uncased vocab
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)
model.cuda();
In [14]:
# Get all of the model's parameters as a list of tuples
params = list(model.named_parameters())
print(f'The BERT model has {len(params)} named parameters')

print('==== Embedding Layer ====\n')
for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')
for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')
for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
In [15]:
from transformers import get_linear_schedule_with_warmup
optimizer = AdamW(
    params=model.parameters(),
    lr=2e-5
)
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=1,
    num_training_steps=total_steps
)
total_steps
Out[15]:
In [0]:
import numpy as np
import time, datetime

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    # Round to the nearest second
    elapsed_rounded = int(round(elapsed))
    # hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))
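A tiny illustration of flat_accuracy on made-up logits (the numbers are arbitrary): it takes the argmax of each logit row and compares it against the labels.
In [ ]:
# Illustration only: argmax per row is [0, 1, 0]; labels are [0, 1, 1] -> 2/3 correct.
demo_logits = np.array([[2.0, -1.0], [0.3, 0.9], [1.5, 0.2]])  # arbitrary values
demo_labels = np.array([0, 1, 1])
flat_accuracy(demo_logits, demo_labels)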
In [17]:
import random

SEED = 100
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

loss_values = []

# 1. Iterate over the number of epochs
for epoch in range(0, epochs):
    # =============================
    #           Training
    # =============================
    print(f'=============== Epoch {epoch+1} / {epochs} ======')
    print('Training...')
    t0 = time.time()
    total_loss = 0  # Reset the total loss for this epoch.
    model.train()

    # 2. Iterate over the batches in train_dataloader 👇🏾
    train_accuracy = 0.0
    for step, batch in enumerate(train_dataloader):
        # Progress update every 40 batches
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes
            elapsed = format_time(time.time() - t0)
            # Report progress
            print('Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from the dataloader and copy each
        # tensor to the GPU. A batch contains three tensors:
        # input_ids, attention_masks and labels.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_input_labels = batch[2].to(device)

        model.zero_grad()  # set accumulated gradients to 0

        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_input_labels
        )
        # The call to `model` returns a tuple, so we need to pull the
        # loss value out of it: outputs = (loss, logits)
        loss = outputs[0]

        tlogits = outputs[1].detach().cpu().numpy()
        tlabel_ids = b_input_labels.detach().cpu().numpy()
        tmp_tr_acc = flat_accuracy(tlogits, tlabel_ids)
        train_accuracy += tmp_tr_acc

        # 👇🏾 Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value
        # from the tensor.
        total_loss += loss.item()

        loss.backward()  # backward pass to calculate gradients

        # 👇🏾 Clip grads to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()  # Update the learning rate

    # Calculate the average loss over the training data
    average_train_loss = total_loss / len(train_dataloader)
    loss_values.append(average_train_loss)

    print("  Train Accuracy: {0:.2f}".format(train_accuracy / len(train_dataloader)))
    print(f'Average training loss: {average_train_loss}')
    print(f'Training epoch took: {format_time(time.time() - t0)}')

    # =====================================
    #             Validation
    # =====================================
    # After each training epoch, measure performance on the validation set.
    print()
    print('Running Validation...')
    t0 = time.time()

    # Put the model in evaluation mode -- the dropout layers behave
    # differently during evaluation.
    model.eval()

    # Tracking variables
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        # Unpack inputs from the dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # Tell the model not to compute or store gradients,
        # saving memory and speeding up validation.
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            # Only logits are returned (no loss) because we haven't
            # provided labels. token_type_ids is the same as segment_ids,
            # which indicates which token belongs to sentence 1 or 2 in
            # two-sentence tasks.
            outputs = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask
            )  # labels are not passed here in validation

        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like softmax.
        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of validation sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

    # Report the final accuracy for this validation run
    print(f'  Accuracy: {eval_accuracy/nb_eval_steps}')
    print(f'  Validation took: {format_time(time.time() - t0)}')

print()
print('Training Complete')
In [21]:
import plotly.express as px

f = pd.DataFrame(loss_values)
f.columns = ['Loss']
fig = px.line(f, x=f.index, y=f.Loss)
fig.update_layout(
    title='Training loss of the Model',
    xaxis_title='Epoch',
    yaxis_title='Loss'
)
fig.show()
In [0]:
from transformers import WEIGHTS_NAME, CONFIG_NAME
import os
output_dir = "./models/"
os.makedirs(output_dir, exist_ok=True)  # make sure the output directory exists before saving
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)
In [23]:
model.to('cpu')
model_to_save = model.module if hasattr(model, 'module') else model
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_pretrained(output_dir)
# torch.save(model, f'models/leadsherpa-bert2')
# torch.save(tokenizer, f'models/leadsherpa-bert-tokenizer2')
Out[23]:
In [27]:
output_config_file
Out[27]:
0. After training is complete, make sure to change the model device to CPU before saving the model
1. Load the tokenizer
2. Load the saved model
3. Receive the sentence/text
4. Preprocess the text and run inference (see the usage sketch after the cells below)
- encode/tokenize the sentence with attention mask, padding, and trimming to max length
- convert the tokenized input sequence into torch tensors
- set the model to eval mode
- set torch to no_grad() and run the sequence through the model
- get the logit outputs
- get the index with the maximum logit value
In [0]:
# model = torch.load('leadsherpa-bert').to('cpu')
# tokenizer = torch.load('leadsherpa-bert-tokenizer')
model = BertForSequenceClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)

def process_sequence_and_attention(tokenizer, sentence, **kwargs):
    encoded_sentence = tokenizer.encode(
        text=sentence,
        add_special_tokens=True,
        max_length=32,
        pad_to_max_length=True,
    )
    attention_masks = [0 if x == 0 else 1 for x in encoded_sentence]
    att_mask_tensor = torch.tensor([attention_masks])
    encoded_tensor = torch.tensor([encoded_sentence])
    return encoded_tensor, att_mask_tensor
In [0]:
def make_predictions(model, encoded_sentence, attention_mask, token_type_id=None):
    model.eval()
    with torch.no_grad():
        preds = model(
            encoded_sentence,
            token_type_ids=token_type_id,
            attention_mask=attention_mask
        )
        # Labels are not passed here, so only logits are returned.
        # The "logits" are the output values prior to applying an
        # activation function like softmax.
        logits = preds[0]
        probabilities = torch.nn.functional.softmax(logits, dim=1)
        probabilities = probabilities.detach().cpu().numpy()
    return probabilities[-1].round(5)
    # logits, label_names_ids[np.argmax(logits, axis=1)[-1]]
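An end-to-end usage sketch of the two helpers above; the example text is made up, so swap in a real message from your data.
In [ ]:
# End-to-end inference sketch using the helpers defined above.
# `example_text` is a hypothetical message, not taken from the dataset.
example_text = "please stop texting me"
encoded_tensor, att_mask_tensor = process_sequence_and_attention(tokenizer, example_text)
probs = make_predictions(model, encoded_tensor, att_mask_tensor)
print(probs, label_names_ids[int(np.argmax(probs))])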