In [ ]:
# Upload the IMDB_all_reviews.txt here...
In [ ]:
!wc IMDB*.txt
# 25000 6723817 33596339 IMDB_all_reviews.txt
In [ ]:
! head IMDB_all_reviews.txt
In [ ]:
# Now split into train / validation (and also lowercase it all)
import random
reviews_train_len, reviews_valid_len = 0,0
with open('IMDB_all_reviews.txt', 'rt') as fin, \
     open('reviews-train.txt', 'wt') as ftrain, \
     open('reviews-valid.txt', 'wt') as fvalid:
    for l in fin:
        if random.random() < 0.9:
            ftrain.write(l.lower())  # No need for +'\n' - l includes it
            reviews_train_len += 1
        else:
            fvalid.write(l.lower())
            reviews_valid_len += 1
In [ ]:
! wc reviews-*.txt
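In [ ]:
# Optional cross-check (not in the original flow): the Python-side counters
# should match the line counts reported by `wc` above.
print('train lines :', reviews_train_len)
print('valid lines :', reviews_valid_len)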
In [ ]:
In [ ]:
import os
import requests, shutil
import numpy as np
import tensorflow as tf
from tensorflow import keras
In [ ]:
# ! rm glove.first-100k.6B.50d.txt # Force download
In [ ]:
# Load the GloVe embedding, along with the words
glove_dir = './'
glove_100k_50d = 'glove.first-100k.6B.50d.txt'
glove_100k_50d_path = os.path.join(glove_dir, glove_100k_50d)

data_cache = './'
glove_full_tar = 'glove.6B.zip'
glove_full_50d = 'glove.6B.50d.txt'

#force_download_from_original=False
download_url = 'http://redcatlabs.com/downloads/deep-learning-workshop/notebooks/data/RNN/'+glove_100k_50d
original_url = 'http://nlp.stanford.edu/data/'+glove_full_tar

if not os.path.isfile( glove_100k_50d_path ):
    if not os.path.exists(glove_dir):
        os.makedirs(glove_dir)

    # First, try to download a pre-prepared file directly...
    response = requests.get(download_url, stream=True)
    if response.status_code == requests.codes.ok:
        print("Downloading 42Mb pre-prepared GloVe file from RedCatLabs")
        with open(glove_100k_50d_path, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
    else:
        # But, for some reason, RedCatLabs didn't give us the file directly
        if not os.path.exists(data_cache):
            os.makedirs(data_cache)

        if not os.path.isfile( os.path.join(data_cache, glove_full_50d) ):
            zipfilepath = os.path.join(data_cache, glove_full_tar)
            if not os.path.isfile( zipfilepath ):
                print("Downloading 860Mb GloVe file from Stanford")
                # NB: fall back to the original Stanford URL here (not the RedCatLabs one)
                response = requests.get(original_url, stream=True)
                with open(zipfilepath, 'wb') as out_file:
                    shutil.copyfileobj(response.raw, out_file)

            if os.path.isfile(zipfilepath):
                print("Unpacking 50d GloVe file from zip")
                import zipfile
                zipfile.ZipFile(zipfilepath, 'r').extract(glove_full_50d, data_cache)

        with open(os.path.join(data_cache, glove_full_50d), 'rt') as in_file:
            with open(glove_100k_50d_path, 'wt') as out_file:
                print("Reducing 50d GloVe file to first 100k words")
                for i, l in enumerate(in_file.readlines()):
                    if i >= 100000:
                        break
                    out_file.write(l)

        # Get rid of the zipfile source (the required text file itself will remain)
        #os.unlink(zipfilepath)
        #os.unlink(os.path.join(data_cache, glove_full_50d))

if os.path.isfile( glove_100k_50d_path ):
    print("GloVe available locally")
! head -3 {glove_100k_50d_path}
In [ ]:
# https://stackoverflow.com/questions/48677077/how-do-i-create-a-keras-embedding-layer-from-a-pre-trained-word-embedding-datase
In [ ]:
# Prepare the GloVe file
def readGloveFile(gloveFile):
    with open(gloveFile, 'r') as f:
        wordToGlove = {}  # map from a token (word) to its GloVe embedding vector
        wordToIndex = {}  # map from a token to an index
        indexToWord = {}  # map from an index to a token

        for line in f:
            record = line.strip().split()
            token = record[0]  # take the token (word) from the text line
            # associate the GloVe embedding vector with that token (word)
            wordToGlove[token] = np.array(record[1:], dtype=np.float64)

    # Prepend two special tokens : index 0 is the padding/mask token, index 1 the unknown-word token
    token_mask, token_unk = '<MASK>', '<UNK>'
    token_list = [token_mask, token_unk, ] + list(wordToGlove.keys())

    # Give the special tokens zero vectors, the same shape as the GloVe vectors
    wordToGlove[token_mask] = np.zeros_like(wordToGlove[token])
    wordToGlove[token_unk]  = np.zeros_like(wordToGlove[token])

    for idx, tok in enumerate(token_list):
        #kerasIdx = idx + 1  # 0 is reserved for masking in Keras - not needed here, since '<MASK>' already sits at index 0
        wordToIndex[tok] = idx  # associate an index with a token (word)
        indexToWord[idx] = tok  # associate a token (word) with an index
        # Note: inverse of the dictionary above

    return wordToIndex, indexToWord, wordToGlove, token_list
wordToIndex, indexToWord, wordToGlove, token_list = readGloveFile(glove_100k_50d_path)
[ indexToWord[i] for i in range(12)]
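In [ ]:
# Quick sanity check (a sketch, not part of the original flow): the two special
# tokens sit at indices 0 and 1, and a common word like 'movie' should have a
# 50-dimensional GloVe vector.
print(indexToWord[0], indexToWord[1], wordToIndex['movie'], wordToGlove['movie'].shape)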
In [ ]:
# Create a pretrained Keras Embedding layer
def createPretrainedEmbeddingLayer(wordToGlove, wordToIndex, isTrainable):
    #vocabLen = len(wordToIndex) + 1  # adding 1 to account for masking - not needed, '<MASK>' is already in the vocab
    vocabLen = len(wordToIndex)
    embDim = next(iter(wordToGlove.values())).shape[0]  # works with any GloVe dimension (e.g. 50)

    embeddingMatrix = np.zeros((vocabLen, embDim))  # initialize with zeros
    for word, index in wordToIndex.items():
        embeddingMatrix[index, :] = wordToGlove[word]  # row `index` holds the GloVe vector for `word`

    embeddingLayer = keras.layers.Embedding(vocabLen, embDim,
                                            weights=[embeddingMatrix],
                                            mask_zero=True,  # index 0 ('<MASK>') marks zero-padding
                                            trainable=isTrainable)
    return embeddingLayer, embDim

pretrainedEmbeddingLayer, embedding_dim = createPretrainedEmbeddingLayer(wordToGlove, wordToIndex, False)
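In [ ]:
# Rough shape check (a sketch) : the embedding covers the 100,000 GloVe words
# plus the '<MASK>' and '<UNK>' tokens, each as a 50-d vector.
print(len(wordToIndex), embedding_dim)  # expected : 100002 50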
In [ ]:
# usage
#model = Sequential()
#model.add(pretrainedEmbeddingLayer)
# or
#model.add(Embedding(max_features, 128, mask_zero = True)) # zero embedding for zero_padding
#model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
In [ ]:
In [ ]:
## Terrible documentation :
# https://www.tensorflow.org/guide/datasets#consuming_text_data
## Much better documentation :
# https://cs230-stanford.github.io/tensorflow-input-data.html#introduction-to-tfdata-with-a-text-example
In [ ]:
#https://www.tensorflow.org/api_docs/python/tf/contrib/lookup/index_table_from_tensor
mapping_strings = tf.constant(token_list)
embedding_mapping = tf.contrib.lookup.index_table_from_tensor(
    mapping=mapping_strings, num_oov_buckets=0, default_value=wordToIndex['<UNK>'])
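In [ ]:
# Optional check of the lookup table (a sketch, not in the original flow):
# a made-up out-of-vocabulary token should map to the '<UNK>' index (1), while
# a known word maps to its GloVe-derived index.
with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    print(sess.run(embedding_mapping.lookup(tf.constant(['movie', 'qwertyzzz']))))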
In [ ]:
reviews_train = tf.data.TextLineDataset("reviews-train.txt")
reviews_valid = tf.data.TextLineDataset("reviews-valid.txt")
In [ ]:
num_classes = 2
batch_size = 128

def preprocess_line(line):
    line_data = tf.string_split([line], delimiter='|').values
    label = tf.string_to_number( line_data[0], out_type=tf.int32)

    txt = tf.string_split([ line_data[1] ],
                          delimiter=' ',
                          skip_empty=True).values  # lower-case conversion done above
    # so now txt is a tf vector of strings - one per word

    # If we wanted to do a char-wise analysis, we could use similar code,
    # *if* we could split the original line_data[1] into
    # a vector of single character-length strings...

    # Other way to split into words ...
    #   https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/text_to_word_sequence
    #txt = tf.keras.preprocessing.text.text_to_word_sequence( line_data[1] )

    txt_ids = embedding_mapping.lookup(txt)
    label_onehot = tf.one_hot(label, depth=num_classes, axis=-1)

    # This gives us :
    #   txt_ids      : the review as a tensor of word indices, with variable length; and
    #   label_onehot : the label as a one-hot vector
    return txt_ids, label_onehot

def batch_padded(ds, is_training=False, buffer_size=100, batch_size=batch_size):
    if is_training:
        ds = ds.shuffle(buffer_size=buffer_size)
        ds = ds.repeat(100)  # "Forever"

    ds = ds.map(preprocess_line, num_parallel_calls=4)
    ds = ds.padded_batch(batch_size,
                         padded_shapes=(tf.TensorShape([None]), tf.TensorShape([num_classes])),
                         #padding_values=(0, 0)  # Defaults to 0 padding, which is '<MASK>' - which is fine
                        )
    ds = ds.prefetch(1)  # Makes it run async (prefetch 1 batch ahead)
    return ds
reviews_train_ds = batch_padded(reviews_train, is_training=True)
reviews_valid_ds = batch_padded(reviews_valid, is_training=False)
In [ ]:
#train_iterator = reviews_train_ds.make_one_shot_iterator() # Does not work : the dataset includes a lookup table
train_iterator = reviews_train_ds.make_initializable_iterator()
train_next_batch = train_iterator.get_next()
In [ ]:
tf.global_variables_initializer()  # NB: this only *creates* the init op - Keras initializes its own variables when training starts
In [ ]:
# This just proves that the dataset iterator can read a review,
# and then convert it back from the indices to words
with tf.Session() as sess:
    sess.run( tf.tables_initializer() )
    sess.run( train_iterator.initializer )

    for _ in range(1):
        sentence, label = sess.run(train_next_batch)
        print(label[0], ' :: ', ' '.join([indexToWord[idx] for idx in sentence[0]])[:200])
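In [ ]:
# The padded-batch shapes can also be inspected (the arrays from the cell above
# are plain numpy once sess.run returns) : each batch is padded to the length of
# its longest review, so the second dimension varies from batch to batch.
print(sentence.shape, label.shape)  # e.g. (128, <longest review in this batch>) and (128, 2)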
In [ ]:
# http://ruder.io/text-classification-tensorflow-estimators/  # TF Estimators => not relevant here
In [ ]:
# Props to :
# https://github.com/keras-team/keras/blob/master/examples/imdb_lstm.py
# Just for the model (since the built-in imdb dataset makes it 'too easy')
# https://stackoverflow.com/questions/46135499/how-to-properly-combine-tensorflows-dataset-api-and-keras
# https://gist.github.com/datlife/abfe263803691a8864b7a2d4f87c4ab8
# For the dataset direct to keras example
In [ ]:
inputs, targets = train_iterator.get_next()  # NB: a fresh get_next() op - the iterator is re-initialized below before training
In [ ]:
#model = keras.Sequential()
#model.add(keras.layers.Embedding(max_features, 128))
#model.add(pretrainedEmbeddingLayer)
#model.add(keras.layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2))
#model.add(keras.layers.Dense(num_classes, activation='softmax'))
In [ ]:
model_input = keras.layers.Input(tensor=inputs)
#x = keras.layers.Embedding(max_features, 128)(model_input)
x = pretrainedEmbeddingLayer(model_input)
x = keras.layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2)(x)
model_output = keras.layers.Dense(num_classes, activation='softmax')(x)
train_model = keras.models.Model(inputs=model_input, outputs=model_output)
print('Model built')
In [ ]:
optimizer = keras.optimizers.RMSprop(lr=2e-3, decay=1e-5)
train_model.compile(optimizer=optimizer,
                    loss='categorical_crossentropy',
                    target_tensors=[targets],
                    metrics=['accuracy'],
                   )
train_model.summary()
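In [ ]:
# Rough parameter-count check against the summary above (assuming the layer sizes used here) :
#   Embedding : 100,002 x 50                  = 5,000,100  (frozen / non-trainable)
#   LSTM(128) : 4 * (50*128 + 128*128 + 128)  =    91,648
#   Dense(2)  : 128*2 + 2                     =       258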
In [ ]:
epochs = 1  # Should get to acc:0.81 (each epoch ~5mins)
steps_per_epoch = reviews_train_len // batch_size

print('Train...')
with keras.backend.get_session().as_default() as sess:
    sess.run( tf.tables_initializer() )
    sess.run( train_iterator.initializer )
    train_model.fit(epochs=epochs, steps_per_epoch=steps_per_epoch, verbose=1)
print('Training ended')
In [ ]:
# Cannot predict directly from this dataset-trained model : need to save and reload as a regular model...
#train_model.predict({inputs:batch_predict}) # , batch_size=1
#train_model.predict(batch_predict) # , batch_size=1
In [ ]:
train_model.save('IMDB-saved-model.h5')
In [ ]:
test_model = keras.models.load_model('IMDB-saved-model.h5')
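In [ ]:
# Since the reloaded model takes plain numpy batches, part of the validation file
# can be scored directly - a hedged sketch, assuming the 'label|text' format
# written above, mirroring the training tokenisation (simple whitespace split),
# and using pad_sequences (0 == '<MASK>').  The 500-line cap is just to keep
# this quick; it is a sanity check, not a full evaluation.
labels, idx_lists = [], []
with open('reviews-valid.txt', 'rt') as f:
    for i, line in enumerate(f):
        if i >= 500:
            break
        lab, txt = line.split('|', 1)
        labels.append(int(lab))
        idx_lists.append([wordToIndex.get(w, wordToIndex['<UNK>']) for w in txt.split()])
x_check = keras.preprocessing.sequence.pad_sequences(idx_lists, padding='post', value=0)
y_check = keras.utils.to_categorical(labels, num_classes=num_classes)
test_model.evaluate(x_check, y_check, batch_size=batch_size, verbose=0)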
In [ ]:
import string
def to_idx(s):
    # Remove punctuation characters, except for the apostrophe
    #   https://docs.python.org/3/library/stdtypes.html?highlight=maketrans#str.maketrans
    translator = str.maketrans('', '', string.punctuation.replace("'", ''))
    tokens = s.translate(translator).lower().split()
    return [wordToIndex.get(t, wordToIndex['<UNK>']) for t in tokens]
    #return np.array(
    #    [1] + [word_index[t] + index_offset if t in word_index else 2 for t in tokens])

to_idx('this movie was excellent')
In [ ]:
batch_predict = np.array([
    to_idx('this movie was excellent'),
    to_idx('i hated the movie'),
])

test_model.predict(batch_predict)  # , batch_size=1
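In [ ]:
# Note : the two reviews above happen to have the same number of tokens, so
# np.array() gives a regular 2-d batch.  For reviews of different lengths they
# would need padding first - a sketch using pad_sequences (index 0 is '<MASK>',
# which the embedding layer ignores thanks to mask_zero=True) :
batch_ragged = [
    to_idx('this movie was excellent'),
    to_idx("one of the worst films i have ever seen - don't bother"),
]
batch_padded_np = keras.preprocessing.sequence.pad_sequences(batch_ragged, padding='post', value=0)
test_model.predict(batch_padded_np)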
In [ ]: