In [ ]:
# Upload the IMDB_all_reviews.txt here...

In [ ]:
!wc IMDB*.txt
#    25000  6723817 33596339 IMDB_all_reviews.txt

In [ ]:
! head IMDB_all_reviews.txt

In [ ]:
# Now split into train / validation (and also lowercase it all)
import random

reviews_train_len, reviews_valid_len = 0,0

with open('IMDB_all_reviews.txt', 'rt') as fin,      \
     open('reviews-train.txt', 'wt') as ftrain, \
     open('reviews-valid.txt', 'wt') as fvalid:
  for l in fin:
    if random.random()<0.9:
      ftrain.write(l.lower())  # No need for +'\n' - l includes it
      reviews_train_len += 1
    else:
      fvalid.write(l.lower())      
      reviews_valid_len += 1

In [ ]:
! wc reviews-*.txt
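
A quick sanity check of the line format: the rest of the notebook assumes each line is "label|review text", which is what preprocess_line (further down) splits on.


In [ ]:
# Peek at the first training line (assumes the 'label|review text' layout)
with open('reviews-train.txt', 'rt') as f:
  first_line = f.readline()
label_str, _, review_txt = first_line.partition('|')
print('label =', label_str, ':: review starts :', review_txt[:80])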

In [ ]:


In [ ]:
import os 

import requests, shutil

import numpy as np

import tensorflow as tf
from tensorflow import keras

In [ ]:
# ! rm glove.first-100k.6B.50d.txt  # Force download

In [ ]:
# Load the GloVe embedding, along with the words

glove_dir = './'
glove_100k_50d = 'glove.first-100k.6B.50d.txt'
glove_100k_50d_path = os.path.join(glove_dir, glove_100k_50d)

data_cache = './'
glove_full_tar = 'glove.6B.zip'
glove_full_50d = 'glove.6B.50d.txt'

#force_download_from_original=False
download_url= 'http://redcatlabs.com/downloads/deep-learning-workshop/notebooks/data/RNN/'+glove_100k_50d
original_url = 'http://nlp.stanford.edu/data/'+glove_full_tar

if not os.path.isfile( glove_100k_50d_path ):
    if not os.path.exists(glove_dir):
        os.makedirs(glove_dir)
    
    # First, try to download a pre-prepared file directly...
    response = requests.get(download_url, stream=True)
    if response.status_code == requests.codes.ok:
        print("Downloading 42Mb pre-prepared GloVE file from RedCatLabs")
        with open(glove_100k_50d_path, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
    else:
        # But, for some reason, RedCatLabs didn't give us the file directly
        if not os.path.exists(data_cache):
            os.makedirs(data_cache)
        
        if not os.path.isfile( os.path.join(data_cache, glove_full_50d) ):
            zipfilepath = os.path.join(data_cache, glove_full_tar)
            if not os.path.isfile( zipfilepath ):
                print("Downloading 860Mb GloVE file from Stanford")
                response = requests.get(download_url, stream=True)
                with open(zipfilepath, 'wb') as out_file:
                    shutil.copyfileobj(response.raw, out_file)
            if os.path.isfile(zipfilepath):
                print("Unpacking 50d GloVE file from zip")
                import zipfile
                zipfile.ZipFile(zipfilepath, 'r').extract(glove_full_50d, data_cache)

        with open(os.path.join(data_cache, glove_full_50d), 'rt') as in_file:
            with open(glove_100k_50d_path, 'wt') as out_file:
                print("Reducing 50d GloVE file to first 100k words")
                for i, l in enumerate(in_file.readlines()):
                    if i>=100000: break
                    out_file.write(l)
    
        # Get rid of the zip source (the required 100k text file itself will remain)
        #os.unlink(zipfilepath)
        #os.unlink(os.path.join(data_cache, glove_full_50d))

if os.path.isfile( glove_100k_50d_path ):
  print("GloVE available locally")
  ! head -3 {glove_100k_50d_path}

In [ ]:
# https://stackoverflow.com/questions/48677077/how-do-i-create-a-keras-embedding-layer-from-a-pre-trained-word-embedding-datase

In [ ]:
# Prepare Glove File
def readGloveFile(gloveFile):
    with open(gloveFile, 'r') as f:
        wordToGlove = {}  # map from a token (word) to a Glove embedding vector
        wordToIndex = {}  # map from a token to an index
        indexToWord = {}  # map from an index to a token 

        for line in f:
            record = line.strip().split()
            token = record[0] # take the token (word) from the text line
            # associate the GloVe embedding vector with that token (word)
            wordToGlove[token] = np.array(record[1:], dtype=np.float64) 
        
        #tokens = sorted(wordToGlove.keys())
        #tokens = wordToGlove.keys() # 
        
        # Prepend two special tokens: <MASK> must take index 0 (so that Keras'
        #   mask_zero=True treats it as padding), and <UNK> takes index 1
        token_mask, token_unk = '<MASK>', '<UNK>'
        token_list = [token_mask, token_unk,]+list(wordToGlove.keys())
        
        # Both special tokens get an all-zero vector (same shape as the last GloVe vector read)
        wordToGlove[token_mask] = np.zeros_like(wordToGlove[token]) 
        wordToGlove[token_unk]  = np.zeros_like(wordToGlove[token]) 
        
        for idx, tok in enumerate(token_list):
            # No '+1' offset for masking is needed here, because index 0 is
            #   already occupied by the <MASK> token prepended above
            wordToIndex[tok] = idx # token (word) -> index
            indexToWord[idx] = tok # index -> token (inverse of the dict above)

    return wordToIndex, indexToWord, wordToGlove, token_list

wordToIndex, indexToWord, wordToGlove, token_list = readGloveFile(glove_100k_50d_path)
[ indexToWord[i] for i in range(12)]
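
An optional sanity check that the vectors loaded sensibly: related words should get a noticeably higher cosine similarity than unrelated ones (the particular words below are just assumed to be within the first 100k tokens).


In [ ]:
# Cosine similarity between a few GloVe vectors - related words should score higher
def cosine_similarity(a, b):
  return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

for w1, w2 in [('good', 'great'), ('good', 'keyboard')]:
  print(w1, w2, cosine_similarity(wordToGlove[w1], wordToGlove[w2]))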

In [ ]:
# Create Pretrained Keras Embedding Layer
def createPretrainedEmbeddingLayer(wordToGlove, wordToIndex, isTrainable):
    vocabLen = len(wordToIndex)  # <MASK> and <UNK> are already counted, so no extra '+1' for masking
    embDim = next(iter(wordToGlove.values())).shape[0]  # works with any glove dimensions (e.g. 50)

    embeddingMatrix = np.zeros((vocabLen, embDim))    # initialize with zeros
    for word, index in wordToIndex.items():
        embeddingMatrix[index, :] = wordToGlove[word] # create embedding: word index to Glove word embedding

    embeddingLayer = keras.layers.Embedding(vocabLen, embDim, 
                                     weights=[embeddingMatrix], 
                                     mask_zero=True,  # index 0 (<MASK>) is treated as padding and masked downstream
                                     trainable=isTrainable)
    return embeddingLayer, embDim
  
pretrainedEmbeddingLayer, embedding_dim = createPretrainedEmbeddingLayer(wordToGlove, wordToIndex, False)
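
Before wiring this layer into a model, check that <MASK> really did land on index 0, since mask_zero=True treats index 0 (and only index 0) as padding.


In [ ]:
# <MASK> must be index 0 for mask_zero=True to behave as intended; <UNK> is index 1
assert wordToIndex['<MASK>'] == 0
assert wordToIndex['<UNK>']  == 1
print('vocab size :', len(wordToIndex), ' embedding dim :', embedding_dim)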

In [ ]:
# usage
#model = Sequential()
#model.add(pretrainedEmbeddingLayer)
# or
#model.add(Embedding(max_features, 128, mask_zero = True))  # zero embedding for zero_padding
#model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))

In [ ]:


In [ ]:
## Terrible documentation :
# https://www.tensorflow.org/guide/datasets#consuming_text_data

## Much better documentation :
# https://cs230-stanford.github.io/tensorflow-input-data.html#introduction-to-tfdata-with-a-text-example

In [ ]:
#https://www.tensorflow.org/api_docs/python/tf/contrib/lookup/index_table_from_tensor

mapping_strings = tf.constant(token_list)
embedding_mapping = tf.contrib.lookup.index_table_from_tensor(
    mapping=mapping_strings, num_oov_buckets=0, default_value=wordToIndex['<UNK>'])
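
A small check of the lookup table: known words map to their indices, and anything out-of-vocabulary falls back to the <UNK> index (a throw-away tf.Session is assumed to be fine here, since tables need explicit initialisation in TF 1.x).


In [ ]:
# Out-of-vocabulary strings should come back as wordToIndex['<UNK>'] (i.e. 1)
with tf.Session() as sess:
  sess.run( tf.tables_initializer() )
  ids = embedding_mapping.lookup( tf.constant(['the', 'movie', 'qwzxqwzx']) )
  print(sess.run(ids), ' <UNK> =', wordToIndex['<UNK>'])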

In [ ]:
reviews_train = tf.data.TextLineDataset("reviews-train.txt")
reviews_valid = tf.data.TextLineDataset("reviews-valid.txt")

In [ ]:
num_classes = 2
batch_size  = 128

def preprocess_line(line):
  line_data = tf.string_split([line], delimiter='|').values
  
  label = tf.string_to_number( line_data[0], out_type=tf.int32)
  txt = tf.string_split([ line_data[1] ], 
                        delimiter=' ', 
                        skip_empty=True).values  # lower-case conversion done above
    
  # so now txt is a tf vector of strings - one per word

  # For a character-level analysis, we would instead split line_data[1] into
  #   single-character strings (tf.string_split with delimiter='' does that)
  #   and build a character vocabulary in place of the GloVe word list

  # Another way to split into words ...
  # https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/text_to_word_sequence
  #txt = tf.keras.preprocessing.text.text_to_word_sequence( line_data[1] )
  #   (note: that is a plain-Python function, so it cannot be applied to a
  #    graph tensor inside this Dataset.map without a tf.py_func wrapper)

  txt_ids = embedding_mapping.lookup(txt)
  label_onehot = tf.one_hot(label, depth=num_classes, axis=-1)
  
  # This gives us :
  #   reviews[0] as a review tensor with variable length; and 
  #   reviews[1] as label
  return txt_ids, label_onehot

def batch_padded(ds, is_training=False, buffer_size=100, batch_size=batch_size):
  if is_training:
    ds = ds.shuffle(buffer_size=buffer_size)
  ds = ds.repeat(100)  # 100 passes over the data - effectively "forever" for this exercise
  ds = ds.map(preprocess_line, num_parallel_calls=4)
  ds = ds.padded_batch(batch_size, 
           padded_shapes=(tf.TensorShape([None]), tf.TensorShape([num_classes])), 
           #padding_values=(0,0)  # Defaults to 0 padding, which is <MASK> which is fine
          )
  
  ds = ds.prefetch(1)  # Makes it run async (prefetch 1 batch ahead)
  return ds  

reviews_train_ds = batch_padded(reviews_train, is_training=True)
reviews_valid_ds = batch_padded(reviews_valid, is_training=False)
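
Each padded batch should be (batch, variable-sequence-length) word ids plus (batch, num_classes) one-hot labels; in TF 1.x the dataset's output_shapes / output_types properties show this without running anything.


In [ ]:
# Expect shapes ((?, ?), (?, 2)) and dtypes (int64, float32)
print(reviews_train_ds.output_shapes)
print(reviews_train_ds.output_types)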

In [ ]:
#train_iterator = reviews_train_ds.make_one_shot_iterator()  # Does not work : DS includes a lookup table
train_iterator = reviews_train_ds.make_initializable_iterator()
train_next_batch = train_iterator.get_next()

In [ ]:
tf.global_variables_initializer()  # NB: this only builds the init op - it is not actually run here

In [ ]:
# This just proves that the dataset iterator can read a review,
#   and then convert it back from the indices to words

with tf.Session() as sess:
  sess.run( tf.tables_initializer() )
  sess.run( train_iterator.initializer )
  for _ in range(1):
    sentence, label = sess.run(train_next_batch)
    print(label[0], ' :: ', (' '.join([indexToWord[idx] for idx in sentence[0] ])[:200]),)

In [ ]:
# http://ruder.io/text-classification-tensorflow-estimators/  # TF Estimators => not relevant here

In [ ]:
# Props to :

# https://github.com/keras-team/keras/blob/master/examples/imdb_lstm.py
#   Just for the model (since the built-in imdb dataset makes it 'too easy')

# https://stackoverflow.com/questions/46135499/how-to-properly-combine-tensorflows-dataset-api-and-keras
# https://gist.github.com/datlife/abfe263803691a8864b7a2d4f87c4ab8
#   For the dataset direct to keras example

Now for the model...


In [ ]:
inputs, targets = train_iterator.get_next()

Sequential style


In [ ]:
#model = keras.Sequential()
#model.add(keras.layers.Embedding(max_features, 128))
#model.add(pretrainedEmbeddingLayer)
#model.add(keras.layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2))
#model.add(keras.layers.Dense(num_classes, activation='softmax'))

Functional style


In [ ]:
model_input = keras.layers.Input(tensor=inputs)
#x = keras.layers.Embedding(max_features, 128)(model_input)
x = pretrainedEmbeddingLayer(model_input)
x = keras.layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2)(x)
model_output = keras.layers.Dense(num_classes, activation='softmax')(x)

train_model = keras.models.Model(inputs=model_input, outputs=model_output)
print('Model built')

In [ ]:
optimizer = keras.optimizers.RMSprop(lr=2e-3, decay=1e-5)

train_model.compile(optimizer=optimizer,
                    loss='categorical_crossentropy',
                    target_tensors=[targets],
                    metrics=['accuracy'],
                   )

train_model.summary()

Fit the Model


In [ ]:
epochs=1  # Should reach ~0.81 accuracy (each epoch takes ~5 minutes)
steps_per_epoch=reviews_train_len // batch_size  

print('Train...')
with keras.backend.get_session().as_default() as sess:
  sess.run( tf.tables_initializer() )
  sess.run( train_iterator.initializer )
# fit() uses the same Keras-backend session, so it can run outside the 'with' block
train_model.fit(epochs=epochs, steps_per_epoch=steps_per_epoch, verbose=1)
print('Training ended')

In [ ]:
# Cannot predict directly from this dataset-trained model: we need to save it and reload it as a regular model...
#train_model.predict({inputs:batch_predict})  # , batch_size=1
#train_model.predict(batch_predict)  # , batch_size=1

In [ ]:
train_model.save('IMDB-saved-model.h5')

Load the saved model


In [ ]:
test_model = keras.models.load_model('IMDB-saved-model.h5')

In [ ]:
import string

def to_idx(s):
    # Remove punctuation characters except for the apostrophe
    #  https://docs.python.org/3/library/stdtypes.html?highlight=maketrans#str.maketrans
    translator = str.maketrans('', '', string.punctuation.replace("'", ''))
    tokens = s.translate(translator).lower().split()
    return [wordToIndex.get(t, wordToIndex['<UNK>']) for t in tokens]
to_idx('this movie was excellent')

In [ ]:
batch_predict = np.array([ 
    to_idx('this movie was excellent'),
    to_idx('i hated the movie'),
] )
test_model.predict(batch_predict)  # , batch_size=1
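
The two example reviews above happen to tokenise to the same number of words, so np.array can stack them into a rectangular batch. For reviews of different lengths, pad the shorter ones with the <MASK> index (0) first - a minimal sketch with keras' pad_sequences (padding='post' matches the end-padding that padded_batch applied during training):


In [ ]:
# Pad ragged reviews with 0 (= <MASK>) so they can be stacked into one batch
ragged = [ to_idx('this movie was excellent'),
           to_idx('an awful mess from start to finish, i hated it'), ]
padded = keras.preprocessing.sequence.pad_sequences(ragged, padding='post', value=0)
test_model.predict(padded)

As a rough end-to-end check, score a small sample of the held-out reviews-valid.txt with the reloaded model (a sketch: it assumes the "label|review" line format used above, and sends one review per predict call so no padding is needed).


In [ ]:
# Rough accuracy on a small sample of the validation split
correct, total = 0, 0
with open('reviews-valid.txt', 'rt') as f:
  for line in f:
    label_str, _, text = line.partition('|')
    ids = to_idx(text)
    if len(ids)==0: continue
    probs = test_model.predict(np.array([ ids ]))
    correct += int(np.argmax(probs[0]) == int(label_str))
    total += 1
    if total >= 200: break  # a small sample is enough for a sanity check
print('sample accuracy : %.3f' % (correct / total,))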

In [ ]: