In [0]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
So far, you have written some test sentences, generated a word index from them, and created sequences for those sentences.
Now you will tokenize and sequence a larger body of text, specifically reviews from Amazon and Yelp.
You will use a dataset containing Amazon and Yelp reviews of products and restaurants. This dataset was originally extracted from Kaggle.
The dataset includes reviews, and each review is labelled as 0 (bad) or 1 (good). However, in this exercise, you will only work with the reviews, not the labels, to practice tokenizing and sequencing the text.
Feel free to download the dataset from a drive folder belonging to Udacity and open it on your local machine to see more reviews.
In [0]:
# Import Tokenizer and pad_sequences
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Import numpy and pandas
import numpy as np
import pandas as pd
In [0]:
# Fetch the reviews CSV through Keras' download helper and keep the local
# file path it returns; later cells read the file from `path`.
path = tf.keras.utils.get_file(
    fname='reviews.csv',
    origin='https://drive.google.com/uc?id=13ySLC_ue6Umt9RJYSeM2t-V0kCv-4C-P',
)
# Show where the file ended up on disk.
print(path)
In [0]:
# Load the downloaded CSV into a DataFrame
dataset = pd.read_csv(filepath_or_buffer=path)
# Preview the first five rows (last expression in the cell, so it is displayed)
dataset.head(n=5)
In [0]:
# Pull the 'text' column out of the DataFrame as a plain Python list
text_column = dataset['text']
reviews = text_column.to_list()
In [0]:
# Create a tokenizer that uses "<OOV>" as its out-of-vocabulary token,
# then build its vocabulary from all of the reviews.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(reviews)

# word_index is the tokenizer's word -> integer id mapping.
word_index = tokenizer.word_index

# Print the vocabulary size, then the full mapping, each on its own line.
print(len(word_index), word_index, sep="\n")
In [0]:
# Turn every review into its sequence of word ids, then pad the sequences
# at the end (padding='post') so they can be stacked into one array.
sequences = tokenizer.texts_to_sequences(reviews)
padded_sequences = pad_sequences(sequences, padding='post')

# The shape gives the number of sequences and the length of each one.
print(padded_sequences.shape)

# Print the first review's text, then its padded sequence, one per line.
print(reviews[0], padded_sequences[0], sep='\n')

# Try printing the review and padded sequence for other elements.