In [1]:
# Based on
# https://github.com/fchollet/deep-learning-with-python-notebooks/blob/master/6.2-understanding-recurrent-neural-networks.ipynb
In [2]:
import warnings
warnings.filterwarnings('ignore')
In [3]:
%matplotlib inline
%pylab inline
In [4]:
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
print(tf.__version__)
In [5]:
# https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification
max_features = 1000 # number of words to consider as features
maxlen = 20 # cut texts after this number of words (among top max_features most common words)
# each review is encoded as a sequence of word indexes
# indexed by overall frequency in the dataset
# output is 0 (negative) or 1 (positive)
imdb = tf.keras.datasets.imdb.load_data(num_words=max_features)
(raw_input_train, y_train), (raw_input_test, y_test) = imdb
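To make these index sequences readable, the word index shipped with the dataset can be inverted; a minimal sketch (the offset of 3 accounts for the reserved indices 0 = padding, 1 = start, 2 = unknown used by load_data's defaults):
word_index = tf.keras.datasets.imdb.get_word_index()
reverse_word_index = {index + 3: word for word, index in word_index.items()}
# decode the first training review back into (approximate) text
decoded_review = ' '.join(reverse_word_index.get(i, '?') for i in raw_input_train[0])
decoded_review[:100]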
In [6]:
# tf.keras.datasets.imdb.load_data?
In [7]:
y_train.min()
Out[7]:
In [8]:
y_train.max()
Out[8]:
In [9]:
# 25000 texts
len(raw_input_train)
Out[9]:
In [10]:
# first text has 218 words
len(raw_input_train[0])
Out[10]:
In [11]:
raw_input_train[0]
Out[11]:
In [12]:
# tf.keras.preprocessing.sequence.pad_sequences?
In [13]:
# https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/pad_sequences
input_train = tf.keras.preprocessing.sequence.pad_sequences(raw_input_train, maxlen=maxlen)
input_test = tf.keras.preprocessing.sequence.pad_sequences(raw_input_test, maxlen=maxlen)
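A toy example of the padding behaviour assumed here: with the default settings, shorter sequences are left-padded with zeros and longer ones keep only their last maxlen entries.
tf.keras.preprocessing.sequence.pad_sequences([[1, 2], [1, 2, 3, 4, 5]], maxlen=4)
# expected result:
# [[0, 0, 1, 2],
#  [2, 3, 4, 5]]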
In [14]:
input_train.shape, input_test.shape, y_train.shape, y_test.shape
Out[14]:
In [15]:
# left padded with zeros
# as a convention, "0" does not stand for a specific word; it is reserved for padding (unknown words are encoded as 2 by default)
input_train[0]
Out[15]:
In [16]:
# tf.keras.layers.Embedding?
In [17]:
embedding_dim = 3
random_model = tf.keras.Sequential()
# Parameters: max_features * embedding_dim
random_model.add(tf.keras.layers.Embedding(name='embedding', input_dim=max_features, output_dim=embedding_dim, input_length=maxlen))
random_model.summary()
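A quick sanity check on the parameter count: the embedding layer stores one embedding_dim-sized vector per word index, i.e. max_features * embedding_dim = 1000 * 3 = 3000 weights.
embedding_weights = random_model.get_layer('embedding').get_weights()[0]
embedding_weights.shape  # (max_features, embedding_dim) == (1000, 3)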
In [18]:
random_model.predict(input_train[:1])
Out[18]:
In [19]:
embedding_dim = 3
model = tf.keras.Sequential()
# Parameters: max_features * embedding_dim
model.add(tf.keras.layers.Embedding(name='embedding', input_dim=max_features, output_dim=embedding_dim, input_length=maxlen))
# Output: maxlen * embedding_dim = 20 * 3 = 60
model.add(tf.keras.layers.Flatten(name='flatten'))
# binary classifier
model.add(tf.keras.layers.Dense(name='fc', units=32, activation='relu'))
model.add(tf.keras.layers.Dense(name='classifier', units=1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()
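The totals reported by model.summary() can be reproduced by hand from the sizes chosen above:
# embedding : max_features * embedding_dim  = 1000 * 3 = 3000
# flatten   : no parameters, output length  = maxlen * embedding_dim = 60
# fc        : 60 * 32 weights + 32 biases   = 1952
# classifier: 32 * 1 weights + 1 bias       = 33
print(3000 + 1952 + 33)  # 4985, should match the summary total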
In [20]:
batch_size = 128
%time history = model.fit(input_train, y_train, epochs=10, batch_size=batch_size, validation_split=0.2)
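To see how training and validation accuracy develop over the epochs, the history object can be plotted; note that the metric keys are 'acc'/'val_acc' in older TensorFlow versions and 'accuracy'/'val_accuracy' in newer ones.
plt.plot(history.history['acc'], label='train')
plt.plot(history.history['val_acc'], label='validation')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()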
In [21]:
train_loss, train_accuracy = model.evaluate(input_train, y_train, batch_size=batch_size)
train_accuracy
Out[21]:
In [22]:
test_loss, test_accuracy = model.evaluate(input_test, y_test, batch_size=batch_size)
test_accuracy
Out[22]:
In [23]:
# prediction
model.predict(input_test[0:5])
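predict returns sigmoid probabilities; thresholding them at 0.5 turns them into class labels that can be compared with the ground truth below.
(model.predict(input_test[0:5]) > 0.5).astype(int).ravel()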
Out[23]:
In [24]:
# ground truth
y_test[0:5]
Out[24]:
In [25]:
embedding_layer = model.get_layer('embedding')
In [26]:
model_stub = tf.keras.Model(inputs=model.input, outputs=embedding_layer.output)
In [27]:
embedding_prediction = model_stub.predict(input_test[0:5])
In [28]:
# 5 sample reviews, 20 words per review (maxlen), 3 dimensions per word (embedding_dim)
embedding_prediction.shape
Out[28]:
In [29]:
# 3 embedding dimensions of the first word of the first sample review
embedding_prediction[0][0]
Out[29]:
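The same vector can be read directly out of the learned embedding matrix: the embedding layer is just a lookup table, so row input_test[0][0] of its (max_features, embedding_dim) weight matrix should match the prediction above.
embedding_matrix = embedding_layer.get_weights()[0]
embedding_matrix[input_test[0][0]]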
In [30]:
input_train[0]
Out[30]:
In [31]:
model_stub.predict(input_train[:1])
Out[31]:
In [32]:
random_model.predict(input_train[:1])
Out[32]:
In [ ]: