In [ ]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
This notebook contains a solution to the multiclass classification exercise from the Basic text classification tutorial on tensorflow.org. It includes only a few comments; to learn more, please refer to the tutorial.
In [ ]:
!pip install tf-nightly
In [ ]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
In [ ]:
print(tf.__version__)
In [ ]:
url = "http://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz"
tf.keras.utils.get_file("stack_overflow_16k.tar.gz", url,
                        untar=True, cache_dir='.',
                        cache_subdir='')
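The archive extracts into train and test directories, each containing one subdirectory per class (this layout is described in the tutorial). As an optional sanity check, not part of the original solution, the cell below lists the class subdirectories.
In [ ]:
import pathlib
# List the class subdirectories under train/; this is the layout
# text_dataset_from_directory expects below.
for class_dir in sorted(pathlib.Path('train').iterdir()):
  print(class_dir)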
In [ ]:
batch_size = 32
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'train', batch_size=batch_size, validation_split=0.2, subset='training', seed=42)
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'train', batch_size=batch_size, validation_split=0.2, subset='validation', seed=42)
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'test', batch_size=batch_size)
In [ ]:
print(raw_train_ds.class_names)
In [ ]:
for text_batch, label_batch in raw_train_ds.take(1):
  for i in range(5):
    print(text_batch.numpy()[i])
    print(label_batch.numpy()[i])
    print()
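The labels printed above are integer indices. As an optional aside, they can be mapped back to class-name strings via the class_names attribute printed earlier.
In [ ]:
# Show which class name each integer label corresponds to.
for i, name in enumerate(raw_train_ds.class_names):
  print("Label", i, "corresponds to", name)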
In [ ]:
max_features = 5000
embedding_dim = 128
sequence_length = 500
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)
In [ ]:
# Make a text-only dataset (no labels) and call adapt
text_ds = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(text_ds)
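After calling adapt, the layer has computed a vocabulary from the training text. The optional cell below uses the layer's get_vocabulary method to peek at a few entries; index 0 is the padding token and index 1 is the out-of-vocabulary token.
In [ ]:
# Inspect the learned vocabulary as a sanity check.
vocab = vectorize_layer.get_vocabulary()
print("Vocabulary size:", len(vocab))
print("First ten tokens:", vocab[:10])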
In [ ]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)
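To see what vectorize_text produces, the optional cell below runs it on a single raw example; each question becomes a sequence of sequence_length integer token ids.
In [ ]:
# Vectorize one raw question from the training set.
text_batch, label_batch = next(iter(raw_train_ds))
first_question, first_label = text_batch[0], label_batch[0]
print("Question:", first_question.numpy())
print("Label:", raw_train_ds.class_names[int(first_label)])
vectorized_question, _ = vectorize_text(first_question, first_label)
print("First ten token ids:", vectorized_question.numpy()[0][:10])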
In [ ]:
AUTOTUNE = tf.data.experimental.AUTOTUNE
# Cache the vectorized datasets in memory and overlap preprocessing
# with model execution while training.
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)
In [ ]:
model = tf.keras.Sequential([
  layers.Embedding(max_features + 1, embedding_dim),
  layers.Dropout(0.2),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(4)])  # four output units, one logit per class
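A quick summary confirms the architecture; note the final Dense layer has 4 units, one logit per class.
In [ ]:
model.summary()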
In [ ]:
model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'])
In [ ]:
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=5)
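Optionally, the History object returned by model.fit can be plotted to compare training and validation accuracy over the epochs, as the main tutorial does. This assumes matplotlib is available in the environment.
In [ ]:
import matplotlib.pyplot as plt

history_dict = history.history
epochs = range(1, len(history_dict['accuracy']) + 1)

# Training vs. validation accuracy across epochs.
plt.plot(epochs, history_dict['accuracy'], 'bo-', label='Training accuracy')
plt.plot(epochs, history_dict['val_accuracy'], 'r--', label='Validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()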
In [ ]:
loss, accuracy = model.evaluate(test_ds)
print("Loss: ", loss)
print("Accuracy: ", accuracy)