In [0]:
from pathlib import Path

In [2]:
!pip install transformers


Collecting transformers
  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
     |████████████████████████████████| 573kB 8.7MB/s 
Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)
Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)
Collecting sacremoses
  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
     |████████████████████████████████| 890kB 52.2MB/s 
Requirement already satisfied: boto3 in /usr/local/lib/python3.6/dist-packages (from transformers) (1.12.47)
Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.23.0)
Collecting sentencepiece
  Downloading https://files.pythonhosted.org/packages/98/2c/8df20f3ac6c22ac224fff307ebc102818206c53fc454ecd37d8ac2060df5/sentencepiece-0.1.86-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
     |████████████████████████████████| 1.0MB 53.3MB/s 
Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.38.0)
Collecting tokenizers==0.5.2
  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
     |████████████████████████████████| 3.7MB 57.5MB/s 
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.18.3)
Requirement already satisfied: dataclasses; python_version < "3.7" in /usr/local/lib/python3.6/dist-packages (from transformers) (0.7)
Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.12.0)
Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.1.2)
Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (0.14.1)
Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.6/dist-packages (from boto3->transformers) (0.9.5)
Requirement already satisfied: botocore<1.16.0,>=1.15.47 in /usr/local/lib/python3.6/dist-packages (from boto3->transformers) (1.15.47)
Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /usr/local/lib/python3.6/dist-packages (from boto3->transformers) (0.3.3)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2020.4.5.1)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (3.0.4)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.9)
Requirement already satisfied: docutils<0.16,>=0.10 in /usr/local/lib/python3.6/dist-packages (from botocore<1.16.0,>=1.15.47->boto3->transformers) (0.15.2)
Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.6/dist-packages (from botocore<1.16.0,>=1.15.47->boto3->transformers) (2.8.1)
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... done
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893260 sha256=65a9f30b0ca50724bd0d772a4d9a1229fdcec16b845b2169da527fb359abdfd1
  Stored in directory: /root/.cache/pip/wheels/29/3c/fd/7ce5c3f0666dab31a50123635e6fb5e19ceb42ce38d4e58f45
Successfully built sacremoses
Installing collected packages: sacremoses, sentencepiece, tokenizers, transformers
Successfully installed sacremoses-0.0.43 sentencepiece-0.1.86 tokenizers-0.5.2 transformers-2.8.0

In [0]:
from transformers import BertTokenizer
import pandas as pd
import numpy as np
import torch

In [4]:
from google.colab import drive
drive.mount('/content/drive')

home_dir = Path("/content/drive/My\ Drive/ml/transformers-experiment")


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive

In [5]:
cd $home_dir


/content/drive/My Drive/ml/transformers-experiment

In [0]:
data_url = "data/leadsherpa-all.csv"

column_names = ['text', 'dnc', 'verified_status']
sentence_col, _, label_col = column_names

# data_df = pd.read_csv(data_url, names=column_names)
# data_df.shape

In [0]:
def read_data_file(data_path, column_names=None, sentence_col=None, label_col=None, sep=','):
    ''' Read a single delimited file and return (dataframe, lowercased sentences, lowercased labels) '''

    data_df = pd.read_csv(data_path, names=column_names, sep=sep)
    return (
        data_df, 
        data_df[sentence_col].str.lower().values, 
        data_df[label_col].str.lower().values
    )

In [8]:
data_df, sentences, labels = read_data_file(
    data_url, 
    column_names=column_names, 
    sentence_col='text', 
    label_col='verified_status'
)


# Re-derive the sentences and labels directly from the dataframe and binarize the label:
# 1 for 'verified', 0 for everything else
sentences, labels = data_df['text'].str.lower().values, data_df['verified_status'].values
labels = np.array([ 1 if x == 'verified' else 0 for x in labels ])
label_names_ids = {0:'unverified', 1:'verified'}

sentences.shape, labels.shape, label_names_ids


Out[8]:
((32895,), (32895,), {0: 'unverified', 1: 'verified'})
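
A quick, optional sanity check on the class balance before training; a small sketch that only uses the `labels` array and the `label_names_ids` mapping defined above.

In [ ]:
# Optional: inspect how many examples fall into each class
label_counts = pd.Series(labels).map(label_names_ids).value_counts()
print(label_counts)
print((label_counts / label_counts.sum()).round(3))  # class proportions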

In [9]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f'There are {torch.cuda.device_count()} GPUs available')
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the cpu instead...')
    device = torch.device('cpu')


There are 1 GPUs available
We will use the GPU: Tesla P100-PCIE-16GB

In [0]:
# data_df.to_csv('/content/drive/My Drive/ml/leadsherpa-all.csv', index=False)

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

input_ids = []
MAX_LEN = 32
attention_masks = []

for sentence in sentences:
    encoded_sentence = tokenizer.encode(
        text=sentence, 
        add_special_tokens=True,
        max_length=MAX_LEN,
        pad_to_max_length=True
    )

    attention_masks.append([ 0 if x==0 else 1 for x in encoded_sentence ])  # the [PAD] token id is 0 for bert-base-uncased
    input_ids.append(encoded_sentence)
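
To confirm that padding, truncation, and the attention mask line up as intended, it helps to decode one example back into tokens; a minimal check using the tokenizer loaded above.

In [ ]:
# Optional: spot-check the first encoded sentence
print(sentences[0])
print(input_ids[0])        # token ids, padded/truncated to MAX_LEN
print(attention_masks[0])  # 1 for real tokens, 0 for [PAD]
print(tokenizer.convert_ids_to_tokens(input_ids[0]))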



Training/Validation Split and Converting the NumPy Data to PyTorch Tensors


In [11]:
from sklearn.model_selection import train_test_split

train_inputs, valid_inputs, train_labels, valid_labels = train_test_split(
    input_ids, labels, random_state=2018, test_size=0.1
)

# The masks are split in a separate call; using the same random_state and
# test_size keeps the rows aligned with the input/label split above.
train_masks, valid_masks = train_test_split(
    attention_masks, random_state=2018, test_size=0.1
)

print(len(train_inputs), len(valid_inputs), len(valid_masks), len(train_masks))

train_input_tensor = torch.tensor(train_inputs)
train_label_tensor = torch.tensor(train_labels)
train_mask_tensor = torch.tensor(train_masks)
valid_input_tensor = torch.tensor(valid_inputs)
valid_label_tensor = torch.tensor(valid_labels)
valid_mask_tensor = torch.tensor(valid_masks)

print(train_input_tensor.shape, 
      train_label_tensor.shape, 
      train_mask_tensor.shape, 
      valid_input_tensor.shape, 
      valid_label_tensor.shape, 
      valid_mask_tensor.shape, 
      )


29605 3290 3290 29605
torch.Size([29605, 32]) torch.Size([29605]) torch.Size([29605, 32]) torch.Size([3290, 32]) torch.Size([3290]) torch.Size([3290, 32])
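
Because the input ids and the attention masks were split in two separate calls, it is worth verifying that the rows still line up; the assertion below is an addition (not part of the original run) that checks each training mask against the padding pattern of its input ids.

In [ ]:
# Sanity check: each training mask should match the padding pattern of its input ids,
# which only holds if the two train_test_split calls produced the same row order.
assert all(
    mask == [0 if tok == 0 else 1 for tok in ids]
    for ids, mask in zip(train_inputs, train_masks)
)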

In [0]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# DataLoader requires batch_size for training
# Recommended batch sizes are 16 and 32

BATCH_SIZE = 32

train_data = TensorDataset(
    train_input_tensor, train_mask_tensor, train_label_tensor
)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data, sampler=train_sampler, batch_size=BATCH_SIZE
)

valid_data = TensorDataset(
    valid_input_tensor, valid_mask_tensor, valid_label_tensor
)
valid_sampler = SequentialSampler(valid_data)
# No batch_size is passed here, so DataLoader falls back to its default of 1;
# this is why validation below runs one example at a time.
valid_dataloader = DataLoader(
    valid_data, sampler=valid_sampler
)
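
Pulling a single batch from the training DataLoader is a cheap way to confirm the tensor shapes before starting training; a minimal sketch.

In [ ]:
# Optional: fetch one batch and confirm shapes
# (input ids and masks should be [BATCH_SIZE, MAX_LEN], labels [BATCH_SIZE])
example_batch = next(iter(train_dataloader))
print([t.shape for t in example_batch])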

In [13]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', #12-layer BERT model with uncased vocab
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)
model.cuda();





In [14]:
# Get all of the model's parameters as a list of tuples

params = list(model.named_parameters())
print(f'The BERT model has {len(params)} named parameters')

print('==== Embedding Layer ====\n')
for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')
for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')
for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))


The BERT model has 201 named parameters
==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (768,)
bert.encoder.layer.0.attention.output.LayerNorm.weight        (768,)
bert.encoder.layer.0.attention.output.LayerNorm.bias          (768,)
bert.encoder.layer.0.intermediate.dense.weight           (3072, 768)
bert.encoder.layer.0.intermediate.dense.bias                 (3072,)
bert.encoder.layer.0.output.dense.weight                 (768, 3072)
bert.encoder.layer.0.output.dense.bias                        (768,)
bert.encoder.layer.0.output.LayerNorm.weight                  (768,)
bert.encoder.layer.0.output.LayerNorm.bias                    (768,)

==== Output Layer ====

bert.pooler.dense.weight                                  (768, 768)
bert.pooler.dense.bias                                        (768,)
classifier.weight                                           (2, 768)
classifier.bias                                                 (2,)
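
As a complement to the listing above, the total number of trainable parameters can be computed in one line.

In [ ]:
# Total number of trainable parameters in the model
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'{total_params:,} trainable parameters')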

In [15]:
from transformers import get_linear_schedule_with_warmup

optimizer = AdamW(
    params=model.parameters(),  
    lr=2e-5
)

epochs = 4
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=1,
    num_training_steps=total_steps
)
total_steps


Out[15]:
3704
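
`get_linear_schedule_with_warmup` scales the base learning rate up linearly over `num_warmup_steps` and then decays it linearly to zero by `num_training_steps`; with `num_warmup_steps=1` the warmup is effectively skipped. The sketch below reproduces that multiplier for illustration only and is not used by the training loop.

In [ ]:
# Illustration of the linear warmup + linear decay multiplier applied to lr=2e-5
def lr_multiplier(step, warmup_steps=1, training_steps=total_steps):
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    return max(0.0, (training_steps - step) / max(1, training_steps - warmup_steps))

[round(lr_multiplier(s), 3) for s in (0, 1, total_steps // 2, total_steps)]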

In [0]:
import numpy as np
import time, datetime

def flat_accuracy(preds, labels):
    ''' fraction of rows where the argmax of the logits matches the label '''
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    # Round to the nearest second
    elapsed_rounded = int(round(elapsed))

    # hh:mm:ss
    return(str(datetime.timedelta(seconds=elapsed_rounded)))
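
A tiny check of `flat_accuracy` on two made-up rows of logits, just to confirm it computes the fraction of correct argmax predictions.

In [ ]:
# Quick check with fabricated logits: row 0 predicts class 1, row 1 predicts class 0
demo_logits = np.array([[0.1, 0.9],
                        [0.8, 0.2]])
demo_labels = np.array([1, 0])
flat_accuracy(demo_logits, demo_labels)  # -> 1.0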

In [17]:
import random

SEED = 100

random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

loss_values = []

# 1. Iterating over the number of epochs
for epoch in range(0, epochs):

    # =============================
    # Training
    # =============================

    print(f'=============== Epoch {epoch+1} / {epochs} ======')
    print('Training...')

    t0 = time.time()
    total_loss = 0 # Reset the total loss for this epoch.
    
    model.train()

    # 2. Iterate over the batches in train_dataloader 👇🏾
    
    train_accuracy = 0.0

    for step, batch in enumerate(train_dataloader):
        # Progress update every 40 batches
        if step % 40 == 0 and step != 0:
            # Calculate elapsed time
            elapsed = format_time(time.time() - t0)

            # Report progress
            print('Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from the dataloader and copy each tensor to
        # the GPU; the batch contains input_ids, attention_masks and labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_input_labels = batch[2].to(device)
        
        model.zero_grad() # setting batch gradients to 0 

        outputs = model(
            b_input_ids, 
            token_type_ids=None, 
            attention_mask=b_input_mask,
            labels=b_input_labels
        )
        # The call to `model` always returns a tuple, so we need to pull the 
        # loss value out of the tuple.
        # outputs = (loss, logits)

        loss = outputs[0]
        
        tlogits = outputs[1].detach().cpu().numpy()
        tlabel_ids = b_input_labels.detach().cpu().numpy()

        tmp_tr_acc = flat_accuracy(tlogits, tlabel_ids)
        train_accuracy += tmp_tr_acc


        # 👇🏾Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_loss += loss.item()

        loss.backward() # backward pass to calculate gradients
        
        # 👇🏾 clipping grads to help prevent the "exploding gradients" problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step() 
        scheduler.step() # Update the learning rate

    # Calculate the average loss over the training data
    average_train_loss = total_loss/len(train_dataloader)

    loss_values.append(average_train_loss)
    print(" Train Accuracy: {0:.2f}".format(train_accuracy/step))
    print(f'Average training loss: {average_train_loss}')
    print(f'Training epoch took: {format_time(time.time() - t0)}')


    # =====================================
    #              Validation
    # =====================================

    # After the completion of each training epoch, measure your performance
    # on the validation set
    print()
    print('Running Validation...')

    t0 = time.time()

    # put the model in evaluation mode -- the dropout layers behave differently
    # during evaluation

    model.eval()

    # Tracking variables

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch

    for batch in valid_dataloader:

        batch = tuple(t.to(device) for t in batch)

        # Unpack inputs from dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            
            # Forward pass, calculate logit predictions
            # Only returning logits instead of loss because
            # we haven't provided labels

            # token_type_ids is the same as the segment_ids which 
            # indicates which token belongs to sentence 1 or 2 in 
            # 2 sentence tasks

            outputs = model(
                b_input_ids, 
                token_type_ids=None,
                attention_mask=b_input_mask
            ) # labels are not passed here in validation

            # Get the "logits" output by the model. The "logits" are the output
            # values prior to applying an activation function like softmax

            logits = outputs[0]

            # Move logits and labels to CPU

            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            # Calculate the accuracy for this batch of test sentences.
            tmp_eval_accuracy = flat_accuracy(logits, label_ids)
            eval_accuracy += tmp_eval_accuracy

            # track the number of batches
            nb_eval_steps += 1

    # Report the final accuracy for this validation run
    print(f' Accuracy: {eval_accuracy/nb_eval_steps}')
    print(f' Validation took: {format_time(time.time() - t0)}')

print()
print('Training Complete')


=============== Epoch 1 / 4 ======
Training...
/pytorch/torch/csrc/utils/python_arg_parser.cpp:756: UserWarning: This overload of add_ is deprecated:
	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)
Batch    40 of   926. Elapsed: 0:00:05.
Batch    80 of   926. Elapsed: 0:00:10.
Batch   120 of   926. Elapsed: 0:00:16.
Batch   160 of   926. Elapsed: 0:00:21.
Batch   200 of   926. Elapsed: 0:00:26.
Batch   240 of   926. Elapsed: 0:00:31.
Batch   280 of   926. Elapsed: 0:00:36.
Batch   320 of   926. Elapsed: 0:00:41.
Batch   360 of   926. Elapsed: 0:00:46.
Batch   400 of   926. Elapsed: 0:00:51.
Batch   440 of   926. Elapsed: 0:00:56.
Batch   480 of   926. Elapsed: 0:01:00.
Batch   520 of   926. Elapsed: 0:01:05.
Batch   560 of   926. Elapsed: 0:01:10.
Batch   600 of   926. Elapsed: 0:01:15.
Batch   640 of   926. Elapsed: 0:01:20.
Batch   680 of   926. Elapsed: 0:01:25.
Batch   720 of   926. Elapsed: 0:01:30.
Batch   760 of   926. Elapsed: 0:01:35.
Batch   800 of   926. Elapsed: 0:01:40.
Batch   840 of   926. Elapsed: 0:01:45.
Batch   880 of   926. Elapsed: 0:01:50.
Batch   920 of   926. Elapsed: 0:01:55.
 Train Accuracy: 0.93
Average training loss: 0.17928609801717826
Training epoch took: 0:01:56

Running Validation...
 Accuracy: 0.941033434650456
 Validation took: 0:00:29
=============== Epoch 2 / 4 ======
Training...
Batch    40 of   926. Elapsed: 0:00:05.
Batch    80 of   926. Elapsed: 0:00:10.
Batch   120 of   926. Elapsed: 0:00:15.
Batch   160 of   926. Elapsed: 0:00:20.
Batch   200 of   926. Elapsed: 0:00:25.
Batch   240 of   926. Elapsed: 0:00:30.
Batch   280 of   926. Elapsed: 0:00:35.
Batch   320 of   926. Elapsed: 0:00:40.
Batch   360 of   926. Elapsed: 0:00:45.
Batch   400 of   926. Elapsed: 0:00:50.
Batch   440 of   926. Elapsed: 0:00:55.
Batch   480 of   926. Elapsed: 0:01:00.
Batch   520 of   926. Elapsed: 0:01:05.
Batch   560 of   926. Elapsed: 0:01:10.
Batch   600 of   926. Elapsed: 0:01:15.
Batch   640 of   926. Elapsed: 0:01:20.
Batch   680 of   926. Elapsed: 0:01:25.
Batch   720 of   926. Elapsed: 0:01:30.
Batch   760 of   926. Elapsed: 0:01:35.
Batch   800 of   926. Elapsed: 0:01:40.
Batch   840 of   926. Elapsed: 0:01:45.
Batch   880 of   926. Elapsed: 0:01:49.
Batch   920 of   926. Elapsed: 0:01:54.
 Train Accuracy: 0.96
Average training loss: 0.11794726819516065
Training epoch took: 0:01:55

Running Validation...
 Accuracy: 0.9462006079027355
 Validation took: 0:00:29
=============== Epoch 3 / 4 ======
Training...
Batch    40 of   926. Elapsed: 0:00:05.
Batch    80 of   926. Elapsed: 0:00:10.
Batch   120 of   926. Elapsed: 0:00:15.
Batch   160 of   926. Elapsed: 0:00:20.
Batch   200 of   926. Elapsed: 0:00:25.
Batch   240 of   926. Elapsed: 0:00:30.
Batch   280 of   926. Elapsed: 0:00:35.
Batch   320 of   926. Elapsed: 0:00:40.
Batch   360 of   926. Elapsed: 0:00:45.
Batch   400 of   926. Elapsed: 0:00:50.
Batch   440 of   926. Elapsed: 0:00:55.
Batch   480 of   926. Elapsed: 0:01:00.
Batch   520 of   926. Elapsed: 0:01:05.
Batch   560 of   926. Elapsed: 0:01:09.
Batch   600 of   926. Elapsed: 0:01:14.
Batch   640 of   926. Elapsed: 0:01:19.
Batch   680 of   926. Elapsed: 0:01:24.
Batch   720 of   926. Elapsed: 0:01:29.
Batch   760 of   926. Elapsed: 0:01:34.
Batch   800 of   926. Elapsed: 0:01:39.
Batch   840 of   926. Elapsed: 0:01:44.
Batch   880 of   926. Elapsed: 0:01:49.
Batch   920 of   926. Elapsed: 0:01:54.
 Train Accuracy: 0.97
Average training loss: 0.09664508117036995
Training epoch took: 0:01:55

Running Validation...
 Accuracy: 0.9477203647416413
 Validation took: 0:00:29
=============== Epoch 4 / 4 ======
Training...
Batch    40 of   926. Elapsed: 0:00:05.
Batch    80 of   926. Elapsed: 0:00:10.
Batch   120 of   926. Elapsed: 0:00:15.
Batch   160 of   926. Elapsed: 0:00:20.
Batch   200 of   926. Elapsed: 0:00:25.
Batch   240 of   926. Elapsed: 0:00:30.
Batch   280 of   926. Elapsed: 0:00:35.
Batch   320 of   926. Elapsed: 0:00:40.
Batch   360 of   926. Elapsed: 0:00:45.
Batch   400 of   926. Elapsed: 0:00:50.
Batch   440 of   926. Elapsed: 0:00:54.
Batch   480 of   926. Elapsed: 0:00:59.
Batch   520 of   926. Elapsed: 0:01:04.
Batch   560 of   926. Elapsed: 0:01:09.
Batch   600 of   926. Elapsed: 0:01:14.
Batch   640 of   926. Elapsed: 0:01:19.
Batch   680 of   926. Elapsed: 0:01:24.
Batch   720 of   926. Elapsed: 0:01:29.
Batch   760 of   926. Elapsed: 0:01:34.
Batch   800 of   926. Elapsed: 0:01:39.
Batch   840 of   926. Elapsed: 0:01:44.
Batch   880 of   926. Elapsed: 0:01:49.
Batch   920 of   926. Elapsed: 0:01:54.
 Train Accuracy: 0.98
Average training loss: 0.07824270795531908
Training epoch took: 0:01:55

Running Validation...
 Accuracy: 0.9458966565349544
 Validation took: 0:00:29

Training Complete

In [21]:
import plotly.express as px
f = pd.DataFrame(loss_values)
f.columns=['Loss']
fig = px.line(f, x=f.index, y=f.Loss)
fig.update_layout(title='Training loss of the Model',
                  xaxis_title='Epoch',
                  yaxis_title='Loss')
fig.show()



In [0]:
from transformers import WEIGHTS_NAME, CONFIG_NAME
import os

output_dir = "./models/"

output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)

In [23]:
model.to('cpu')

model_to_save = model.module if hasattr(model, 'module') else model  # unwrap DataParallel if present

torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_pretrained(output_dir)


# torch.save(model, f'models/leadsherpa-bert2')
# torch.save(tokenizer, f'models/leadsherpa-bert-tokenizer2')


Out[23]:
('./models/vocab.txt',
 './models/special_tokens_map.json',
 './models/added_tokens.json')

In [27]:
output_config_file


Out[27]:
'./models/config.json'

Serving the Model

0. After training is complete, move the model to the CPU before saving it
1. Load the saved tokenizer
2. Load the saved model
3. Receive the sentence/text to classify
4. Preprocess the text
    - encode/tokenize the sentence with an attention mask, padding, and truncation to the max length
    - convert the tokenized input sequence into torch tensors
5. Run inference
    - set the model to eval mode
    - run the sequence through the model inside torch.no_grad()
    - get the logit outputs
    - take the index with the maximum logit value as the predicted class

In [0]:
# model = torch.load('leadsherpa-bert').to('cpu')
# tokenizer = torch.load('leadsherpa-bert-tokenizer')

model = BertForSequenceClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)  

def process_sequence_and_attention(tokenizer, sentence, **kwargs):

    encoded_sentence = tokenizer.encode(
        text=sentence, 
        add_special_tokens=True,
        max_length=32,
        pad_to_max_length=True, 
    )

    attention_masks = [ 0 if x==0 else 1 for x in encoded_sentence ]
    
    att_mask_tensor = torch.tensor([attention_masks])
    encoded_tensor = torch.tensor([encoded_sentence])
    
    return encoded_tensor, att_mask_tensor

In [0]:
def make_predictions(model, encoded_sentence, attention_mask, token_type_id=None):

    model.eval()
    with torch.no_grad():

        # Labels are not passed here, so the model returns only the logits:
        # the raw output values prior to applying an activation such as softmax
        preds = model(
            encoded_sentence,
            token_type_ids=token_type_id,
            attention_mask=attention_mask
        )

        logits = preds[0]
        probabilities = torch.nn.functional.softmax(logits, dim=1)
        # Move the probabilities to the CPU as a numpy array
        probabilities = probabilities.detach().cpu().numpy()

    return probabilities[-1].round(5)

# logits, label_names_ids[np.argmax(logits, axis=1)[-1]]
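
Putting the two helpers together: a minimal end-to-end sketch (the sample message below is made up) that encodes a single sentence, runs it through the reloaded model, and maps the argmax probability back to a label name via `label_names_ids`.

In [ ]:
# End-to-end inference on a single, made-up message
sample_text = "yes this is the right number, what is your offer?"

encoded_tensor, att_mask_tensor = process_sequence_and_attention(tokenizer, sample_text)
probabilities = make_predictions(model, encoded_tensor, att_mask_tensor)

predicted = label_names_ids[int(np.argmax(probabilities))]
probabilities, predicted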

In [0]:
# Stale duplicate of the commented-out loads above; the model and tokenizer
# used for serving are reloaded from `output_dir` instead.
# model = torch.load('leadsherpa-bert').to('cpu')
# tokenizer = torch.load('leadsherpa-bert-tokenizer')