Creates TensorFlow Graphs for Spark NLP NerDLApproach

TensorFlow: 1.15.0


In [ ]:
import numpy as np
import os
import logging

import tensorflow.compat.v1 as tf
import string
import random
import math
import sys
import shutil

sys.path.append('../lib/ner')
from ner_model import NerModel
from dataset_encoder import DatasetEncoder
from ner_model_saver import NerModelSaver
from pathlib import Path

In [ ]:
# Show which TensorFlow and bundled Keras versions this notebook is running on.
print(tf.__version__)
print(tf.keras.__version__)

# Only let TensorFlow log messages at ERROR level or above through.
# (One call is enough: tf.get_logger() is the same logger that `logger` names.)
logger = tf.get_logger()
logger.setLevel(logging.ERROR)

SETTINGS


In [ ]:
# GPU device to use. The default, 0, selects the first GPU.
# On a machine with several GPU devices you can pick a different one here
# (make sure that device is actually available!).
# If there is no GPU at all, the CPU is used instead.

gpu_device = 0

In [ ]:
def create_graph(ntags, embeddings_dim, nchars, lstm_size=128):
    """Build a NerModel graph and write it to the current directory as a .pb file.

    The file is named ``blstm_<ntags>_<embeddings_dim>_<lstm_size>_<nchars>.pb``
    and is written in binary (non-text) form.

    Nothing is built or written when the interpreter or TensorFlow version
    check fails; in that case a message is printed and the function returns.

    Args:
        ntags: Max number of tags. Must be at least equal to the number of
            unique labels, including O if IOB.
        embeddings_dim: Embeddings dimension.
        nchars: Max number of characters processed. Must be at least the
            largest possible amount of characters.
        lstm_size: LSTM size.

    Returns:
        None.
    """
    # NOTE(review): this rejects Python 3.7 and newer; confirm that is still
    # the intended cutoff for the TensorFlow 1.15.0 build in use.
    if sys.version_info[0] != 3 or sys.version_info[1] >= 7:
        print('Python 3.7 or above not supported by tensorflow')
        return
    if tf.__version__ != '1.15.0':
        print('Spark NLP is compiled with TensorFlow 1.15.0, Please use such version.')
        print('Current TensorFlow version: ', tf.__version__)
        return

    tf.disable_v2_behavior()
    # Start from an empty default graph so consecutive calls do not accumulate ops.
    tf.reset_default_graph()
    model_name = 'blstm_{}_{}_{}_{}'.format(ntags, embeddings_dim, lstm_size, nchars)

    # The context manager closes this session on exit, so no explicit close is needed.
    with tf.Session():
        # session=None is passed on purpose; the graph is later read from ner.session.
        ner = NerModel(session=None, use_gpu_device=gpu_device)
        try:
            ner.add_cnn_char_repr(nchars, 25, 30)
            ner.add_bilstm_char_repr(nchars, 25, 30)
            ner.add_pretrained_word_embeddings(embeddings_dim)
            ner.add_context_repr(ntags, lstm_size, 3)
            ner.add_inference_layer(True)
            ner.add_training_op(5)
            ner.init_variables()
            # The Saver constructor adds save/restore ops to the graph, so the
            # call has to stay even though the Saver object itself is not used.
            tf.train.Saver()
            tf.io.write_graph(ner.session.graph, './', model_name + '.pb', as_text=False)
        finally:
            # Release the model's resources even if a build step above raised.
            ner.close()

Arguments of create_graph

  • 1st argument (ntags): max number of tags (must be at least equal to the number of unique labels, including O if IOB)
  • 2nd argument (embeddings_dim): embeddings dimension
  • 3rd argument (nchars): max number of characters processed (must be at least the largest possible number of characters)
  • 4th argument (lstm_size): LSTM size (optional, defaults to 128)

In [ ]:
# Embedding dimensions a graph is generated for, in generation order:
#   100  - GloVe 100d
#   300  - GloVe 300d
#   512  - ELMO
#   768  - BERT, ALBERT and XLNET Base
#   1024 - BERT Large, XLNET Large and ELMO
#   2048 - ALBERT XLARGE
#   4096 - ALBERT XXLARGE
embedding_dims = (100, 300, 512, 768, 1024, 2048, 4096)

# (max number of tags, max number of characters) for each English dataset.
dataset_sizes = (
    (10, 120),  # CoNLL 2003
    (38, 200),  # OntoNotes
)

# One graph per dataset/embedding combination, with the default LSTM size.
for tag_count, char_count in dataset_sizes:
    for dim in embedding_dims:
        create_graph(tag_count, dim, char_count)

# You get the idea :)
# Set the numbers according to your own dataset if the graphs above do not work for it!