In [1]:
import tensorflow as tf
import tensorflow.contrib.keras as keras
import numpy as np
import os
import pickle
SENTENCE_LENGTH_MAX = 32
EMBEDDING_DIM=50
In [2]:
import nltk
from nltk.tokenize import TreebankWordTokenizer
sentence_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
tokenizer = TreebankWordTokenizer()
nltk.download('averaged_perceptron_tagger')
From the corpus download page : http://wortschatz.uni-leipzig.de/en/download/
Here's the paper that explains how the corpus was constructed :
In [3]:
corpus_dir = './data/RNN/'
corpus_text_file = os.path.join(corpus_dir, 'en.wikipedia.2010.100K.txt')
In [4]:
if not os.path.isfile( corpus_text_file ):
    raise RuntimeError("You need to download the corpus file : "+
                       "Use the downloader in 5-Text-Corpus-and-Embeddings.ipynb")
else:
    print("Corpus available locally")
In [5]:
def corpus_sentence_tokens(corpus_text_file=corpus_text_file):
    while True:
        with open(corpus_text_file, encoding='utf-8') as f:
            for line in f.readlines():
                n, l = line.split('\t')                   # Strip off the initial line numbers
                for s in sentence_splitter.tokenize(l):   # Split the line into sentences (~1 each)
                    tree_banked = tokenizer.tokenize(s)
                    if len(tree_banked) < SENTENCE_LENGTH_MAX:
                        yield tree_banked
        print("Corpus : Looping")
corpus_sentence_tokens_gen = corpus_sentence_tokens()
In [6]:
' | '.join(next(corpus_sentence_tokens_gen))
In [7]:
from nltk.tag.perceptron import PerceptronTagger
pos_tagger = PerceptronTagger(load=True)
' | '.join(list(pos_tagger.classes))
In [9]:
s = "Let 's see what part of speech analysis on Jeff 's sample text looks like .".split(' ')
#s = next(corpus_sentence_tokens_gen)
pos_tagger.tag(s)
In [10]:
tag_list = 'O E'.split(' ')
pos_tagger_entity_tags = set('NNP'.split(' '))
pos_tagger_to_idx = dict([ (t, (1 if t in pos_tagger_entity_tags else 0))
                           for i, t in enumerate(pos_tagger.classes) ])
TAG_SET_SIZE= len(tag_list)
pos_tagger_to_idx['NNP'], pos_tagger_to_idx['VBP']
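Since pos_tagger_entity_tags contains only 'NNP', the mapping is effectively binary : 'NNP' goes to the entity class (1), and every other Penn Treebank tag goes to class 0. A small sanity-check sketch :
sum(pos_tagger_to_idx.values()), len(pos_tagger_to_idx)   # expect (1, <total number of tags>)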
In [11]:
glove_dir = './data/RNN/'
glove_100k_50d = 'glove.first-100k.6B.50d.txt'
glove_100k_50d_path = os.path.join(glove_dir, glove_100k_50d)
if not os.path.isfile( glove_100k_50d_path ):
    raise RuntimeError("You need to download GloVe Embeddings "+
                       ": Use the downloader in 5-Text-Corpus-and-Embeddings.ipynb")
else:
    print("GloVe available locally")
In [12]:
# Due to size constraints, only use the first 100k vectors (i.e. 100k most frequently used words)
import glove
word_embedding = glove.Glove.load_stanford( glove_100k_50d_path )
word_embedding.word_vectors.shape
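As an optional sanity check on the loaded vectors (a small sketch -- it assumes 'king', 'queen' and 'banana' all fall within the first 100k words), cosine similarities can be computed directly from the word_vectors matrix and dictionary loaded above :
def cosine_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def vec(w):  # look up a word's GloVe vector (assumes w is in the loaded vocabulary)
    return word_embedding.word_vectors[ word_embedding.dictionary[w] ]

cosine_sim(vec('king'), vec('queen')), cosine_sim(vec('king'), vec('banana'))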
In [13]:
BATCH_SIZE = 64
RNN_HIDDEN_SIZE = EMBEDDING_DIM # ?+1 for capitalisation flag
In [14]:
word_embedding.word_vectors.shape
In [16]:
word_embedding_rnn = np.vstack([
    np.zeros( (1, EMBEDDING_DIM,), dtype='float32'),  # This is the 'zero' value (used as a mask in Keras)
    np.zeros( (1, EMBEDDING_DIM,), dtype='float32'),  # This is for 'UNK' (word == 1)
    word_embedding.word_vectors,
])
word_embedding_rnn.shape
In [17]:
def word_to_idx_rnn(word):
    idx = word_embedding.dictionary.get(word.lower(), -1)  # -1 if unknown, so that UNK becomes (-1+2) == 1
    return idx+2  # skip ahead 2 places (past the mask and UNK rows)
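# A quick (optional) sanity check on the offset scheme above -- a small sketch that
# assumes 'the' is within the first 100k GloVe words : row 0 of word_embedding_rnn
# is the Keras mask, row 1 is 'UNK', and every known word's row is shifted up by 2.
assert word_to_idx_rnn('zzqqxx-not-a-word') == 1
assert np.allclose( word_embedding_rnn[ word_to_idx_rnn('the') ],
                    word_embedding.word_vectors[ word_embedding.dictionary['the'] ] )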
from tensorflow.contrib.keras.python.keras.utils.np_utils import to_categorical
def sentences_for_network(list_of_sentences, include_targets=False, one_hot_targets=False):
    len_of_list = len(list_of_sentences)
    #print("sentences_for_network.sentences.length = %d" % (len_of_list,))
    input_values = np.zeros((len_of_list, SENTENCE_LENGTH_MAX), dtype='int32')
    for i, sent in enumerate(list_of_sentences):
        for j, word in enumerate(sent):
            input_values[i,j] = word_to_idx_rnn(word)
    if not include_targets:
        return (input_values, None)
    if one_hot_targets:
        # Add an extra dimension here to suit Keras' TimeDistributed(Dense(softmax))
        # as discussed : https://github.com/fchollet/keras/issues/6363
        target_values = np.zeros((len_of_list, SENTENCE_LENGTH_MAX, TAG_SET_SIZE), dtype='int32')
    else:
        target_values = np.zeros((len_of_list, SENTENCE_LENGTH_MAX), dtype='int32')
    for i, sent in enumerate(list_of_sentences):
        sentence_tags = pos_tagger.tag(sent)
        for j, word_tag in enumerate(sentence_tags):
            tag = word_tag[1]                    # tags are returned as (word, tag) tuples
            pos_class = pos_tagger_to_idx[tag]   # These are the class #s
            if one_hot_targets:
                target_values[i,j] = to_categorical(pos_class, num_classes=TAG_SET_SIZE)
            else:
                target_values[i,j] = pos_class
    return (input_values, target_values)

def batch_for_network_generator():
    while True:
        batch_of_sentences = [ next(corpus_sentence_tokens_gen) for i in range(BATCH_SIZE) ]
        yield sentences_for_network(batch_of_sentences, include_targets=True, one_hot_targets=True)
In [19]:
single_batch_input, single_batch_targets = next(batch_for_network_generator())
single_batch_input.shape, single_batch_targets.shape
#single_batch_input[0]
#single_batch_targets[0]
In [20]:
#from tensorflow.contrib.keras.api.keras.preprocessing import sequence
from tensorflow.contrib.keras.api.keras.layers import Input, Embedding, GRU, Dense #, Activation
from tensorflow.contrib.keras.api.keras.models import Model
# Hmm : The following is not in the API...
from tensorflow.contrib.keras.python.keras.layers import Bidirectional, TimeDistributed
In [21]:
tokens_input = Input(shape=(SENTENCE_LENGTH_MAX,), dtype='int32', name="SentencesTokens")
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedded_sequences = Embedding(word_embedding_rnn.shape[0],
                               EMBEDDING_DIM,
                               weights=[ word_embedding_rnn ],
                               input_length=SENTENCE_LENGTH_MAX,
                               trainable=False,
                               mask_zero=True,
                               name="SentencesEmbedded") (tokens_input)
#extra_input = ...
aggregate_vectors = embedded_sequences # concat...
rnn_outputs = Bidirectional( GRU(RNN_HIDDEN_SIZE, return_sequences=True), merge_mode='concat' )(aggregate_vectors)
is_ner_outputs = TimeDistributed( Dense(TAG_SET_SIZE, activation='softmax'),
                                  input_shape=(BATCH_SIZE, SENTENCE_LENGTH_MAX, RNN_HIDDEN_SIZE*2),
                                  name='POS-class')(rnn_outputs)
In [22]:
model = Model(inputs=[tokens_input], outputs=[is_ner_outputs])
model.summary()
In [23]:
model.compile(loss='categorical_crossentropy', optimizer="adam") # , metrics=['accuracy']
In [24]:
#model.fit(x, y_one_hot)
model.fit_generator(batch_for_network_generator(), 1000, epochs=1, verbose=1)
In [ ]:
weights_file = './data/cache/tagger_rnn_trained_keras.h5'
# Actually, this includes the embedding, which is a little redundant
if not os.path.isfile( weights_file ):
    model.save_weights(weights_file)
In [ ]:
if os.path.isfile( weights_file ):
    model.load_weights(weights_file)
In [25]:
def tag_results_for(test_sentences):
    #sentences_for_network(list_of_sentences, include_targets=False, one_hot_targets=False)
    input_values, target_values_int = sentences_for_network(test_sentences, include_targets=True)
    rnn_output = model.predict_on_batch(input_values)
    # rnn_output here is a softmax-vector at every word location
    for i, sent in enumerate(test_sentences):  # [0:5]):
        annotated = [
            "%s-%d-%d" % (word, target_values_int[i,j], np.argmax(rnn_output[i,j]), )
            for j, word in enumerate(sent)
        ]
        print(' '.join(annotated))
In [26]:
sentences = [
    "Dr. Andrews works at Red Cat Labs .",
    "Let 's see what part of speech analysis looks like .",
    "When are you off to New York , Chaitanya ?",
]
# Uncomment this for 8 sentences from the corpus
#sentences = [ ' '.join(next(corpus_sentence_tokens_gen)) for i in range(8) ]
test_sentences_mixed = [ s.split(' ') for s in sentences ]
test_sentences_title = [ s.title().split(' ') for s in sentences ]
test_sentences_single = [ s.lower().split(' ') for s in sentences ]
#test_sentences_single = [ s.upper().split(' ') for s in sentences ]
print("Format : WORD-NLTK-RNN\n")
tag_results_for(test_sentences_mixed)
print()
tag_results_for(test_sentences_title)
print()
tag_results_for(test_sentences_single)
In [ ]:
Make the tagger identify a different PoS (say : 'verbs')
Make the tagger return several different tags, instead of just the binary entity/other labelling
See whether more advanced 'LSTM' nodes would improve the scores
Add a special 'is_uppercase' element to the embedding vector (or, more simply, just replace one of the elements with an indicator). Does this help the NNP accuracy? One possible wiring is sketched below.
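For the last exercise, here is one possible wiring -- a rough, untested sketch rather than the notebook's own code : the Concatenate import path and the capitalisation_flags helper are assumptions. The idea is to feed a second per-word input carrying an is-uppercase flag and concatenate it onto the embedded sequence before the Bidirectional GRU.
from tensorflow.contrib.keras.python.keras.layers import Concatenate  # import path assumed, cf. Bidirectional above

def capitalisation_flags(list_of_sentences):
    # Shape (n_sentences, SENTENCE_LENGTH_MAX, 1) : 1.0 where the word starts with an upper-case letter
    flags = np.zeros((len(list_of_sentences), SENTENCE_LENGTH_MAX, 1), dtype='float32')
    for i, sent in enumerate(list_of_sentences):
        for j, word in enumerate(sent):
            if word[:1].isupper():
                flags[i, j, 0] = 1.0
    return flags

caps_input = Input(shape=(SENTENCE_LENGTH_MAX, 1), dtype='float32', name="CapsFlag")
aggregate_vectors = Concatenate()( [embedded_sequences, caps_input] )  # EMBEDDING_DIM+1 features per timestep
# ... then rebuild rnn_outputs and is_ner_outputs from aggregate_vectors as above, and use :
#model = Model(inputs=[tokens_input, caps_input], outputs=[is_ner_outputs])
# The batch generator would also need to yield ([input_values, capitalisation_flags(batch)], target_values)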
In [ ]: