In [0]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Embedding, Input, Flatten, Conv2D, MaxPooling2D
from google.cloud import bigquery
In [0]:
reviews_data = {
    "review_text": ["The food was great, but it took forever to get seated.", "The tacos were life changing.", "This food made me question the presence of my taste buds."],
    "meal_type": ["lunch", "dinner", "dinner"],
    "meal_total": [50, 75, 60],
    "rating": [4, 5, 1]
}
Step 1: Process review_text so it can be fed into an Embedding layer
In [27]:
vocab_size = 50
tokenize = keras.preprocessing.text.Tokenizer(num_words=vocab_size)
tokenize.fit_on_texts(reviews_data['review_text'])
reviews_train = tokenize.texts_to_sequences(reviews_data['review_text'])
max_sequence_len = 20
reviews_train = keras.preprocessing.sequence.pad_sequences(reviews_train, maxlen=max_sequence_len, padding='post')
print(reviews_train)
Step 2: Convert meal_type to a one-hot encoding
In [0]:
possible_meal_vocab = ['breakfast', 'lunch', 'dinner']
one_hot_meals = []
for meal in reviews_data['meal_type']:
    # Build a one-hot vector with a 1 at the index of this meal type
    one_hot_arr = [0] * len(possible_meal_vocab)
    one_hot_arr[possible_meal_vocab.index(meal)] = 1
    one_hot_meals.append(one_hot_arr)
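Equivalently, the loop above can be vectorized; a minimal sketch using tf.one_hot (the variable names here are illustrative):
In [0]:
# Vectorized alternative to the loop above; produces the same rows as one_hot_meals
meal_indices = [possible_meal_vocab.index(m) for m in reviews_data['meal_type']]
one_hot_meals_tf = tf.one_hot(meal_indices, depth=len(possible_meal_vocab)).numpy()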
Step 3: Combine the one-hot meal_type with meal_total into a single array
In [0]:
tabular_features = np.concatenate((np.array(one_hot_meals), np.expand_dims(reviews_data['meal_total'], axis=1)), axis=1)
Step 4: Build the tabular and embedding layers with the Keras functional API
In [0]:
embedding_input = Input(shape=(max_sequence_len,))
# The Embedding layer's input_dim must cover the tokenizer's index range,
# so use the vocabulary size rather than the number of examples
embedding_layer = Embedding(vocab_size, 64)(embedding_input)
embedding_layer = Flatten()(embedding_layer)
embedding_layer = Dense(3, activation='relu')(embedding_layer)
tabular_input = Input(shape=(len(tabular_features[0]),))
tabular_layer = Dense(32, activation='relu')(tabular_input)
Step 5: Concatenate the layers into a model
In [0]:
merged_input = keras.layers.concatenate([embedding_layer, tabular_layer])
merged_dense = Dense(16)(merged_input)
output = Dense(1)(merged_dense)
model = Model(inputs=[embedding_input, tabular_input], outputs=output)
In [32]:
# Preview the model architecture
model.summary()
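With the model defined, we can compile and fit it on the dummy data; a minimal training sketch (the optimizer, loss, and epoch count are illustrative choices, not from the original):
In [0]:
# Multi-input models take a list of arrays, in the same order as the
# `inputs` argument passed to Model above
model.compile(optimizer='adam', loss='mse')
model.fit([reviews_train, tabular_features], np.array(reviews_data['rating']), epochs=5)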
Step 1: Write a function to bucket ratings using a threshold
In [0]:
def good_or_bad(rating):
    # Ratings above 3 are "good" (1); everything else is "bad" (0)
    if rating > 3:
        return 1
    else:
        return 0
Step 2: Bucket the data and create a new input array with both the numeric and bucketed (boolean) rating
In [34]:
rating_processed = []
for rating in reviews_data['rating']:
    # Pair the raw numeric rating with its bucketed (boolean) version
    rating_processed.append([rating, good_or_bad(rating)])
print(rating_processed)
We'll use the Stack Overflow dataset in BigQuery to show how you can create a model with text represented as both Bag of Words and embeddings. To run the code in this section you'll need a Google Cloud project.
In [0]:
# Authenticate to connect to BigQuery
from google.colab import auth
auth.authenticate_user()
Step 1: Get the data from BigQuery and save it to a Pandas DataFrame.
Be sure to replace your-project below with the name of your Google Cloud project.
In [0]:
%%bigquery df --project your-project
SELECT
    title,
    answer_count,
    REPLACE(tags, "|", ",") AS tags
FROM
    `bigquery-public-data.stackoverflow.posts_questions`
WHERE
    REGEXP_CONTAINS(tags, r"(?:keras|matplotlib|pandas)")
LIMIT 1000
In [49]:
# Preview the dataset; note that we'll only use some of these columns as model inputs
df.head()
Out[49]:
Step 2: Define the vocab size and max sequence length, and create an instance of the Tokenizer class, fitting it on our Stack Overflow data.
In [0]:
stacko_vocab_size = 200
stacko_sequence_len = 40
# Create a tokenizer for this data
stacko_tokenize = keras.preprocessing.text.Tokenizer(num_words=stacko_vocab_size)
stacko_tokenize.fit_on_texts(df['title'].values)
In [39]:
# Preview the first 20 words in the Tokenizer's vocabulary
list(stacko_tokenize.word_index.keys())[:20]
Out[39]:
Step 3: Convert the questions to sequences for the embedding representation
In [0]:
questions_train_embedding = stacko_tokenize.texts_to_sequences(df['title'].values)
questions_train_embedding = keras.preprocessing.sequence.pad_sequences(questions_train_embedding, maxlen=stacko_sequence_len, padding='post')
In [41]:
# Preview the embedding representation with the actual input text
print(df['title'].iloc[0])
print(questions_train_embedding[0])
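To sanity-check the sequence representation, you can map the token indices back to words; a quick sketch using the tokenizer's index_word mapping (index 0 is reserved for padding):
In [0]:
# Decode the first padded sequence back into words (0 is the padding index)
print(' '.join(stacko_tokenize.index_word[idx]
               for idx in questions_train_embedding[0] if idx != 0))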
Step 4: Create the Bag of Words representation
In [42]:
questions_train_matrix = stacko_tokenize.texts_to_matrix(df['title'].values)
print(questions_train_matrix[0])
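By default texts_to_matrix produces a binary bag of words; its mode parameter also supports count-based weightings. A short sketch of one alternative:
In [0]:
# Besides the default 'binary', texts_to_matrix supports 'count', 'freq', and 'tfidf'
questions_train_counts = stacko_tokenize.texts_to_matrix(df['title'].values, mode='count')
print(questions_train_counts[0])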
Step 5: Create the embedding and BOW input layers
In [0]:
embedding_input = Input(shape=(stacko_sequence_len,))
# As before, the Embedding input_dim is the vocabulary size, not the number of examples
embedding_layer = Embedding(stacko_vocab_size, 64)(embedding_input)
embedding_layer = Flatten()(embedding_layer)
embedding_layer = Dense(32, activation='relu')(embedding_layer)
bow_input = Input(shape=(stacko_vocab_size,))
bow_layer = Dense(32, activation='relu')(bow_input)
Step 6: Create the model with the embedding and BOW layers
In [0]:
merged_text_input = keras.layers.concatenate([embedding_layer, bow_layer])
merged_dense_text = Dense(16)(merged_text_input)
merged_output = Dense(1)(merged_dense_text)
model = Model(inputs=[embedding_input, bow_input], outputs=merged_output)
In [45]:
model.summary()
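To train this model, both text representations are passed as inputs; a minimal sketch using answer_count as a regression target (the optimizer, loss, and epoch count are illustrative):
In [0]:
model.compile(optimizer='adam', loss='mse')
model.fit(
    [questions_train_embedding, questions_train_matrix],
    df['answer_count'].values,
    epochs=3,
    validation_split=0.1
)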
We'll create a new dataset of Stack Overflow questions, this time adding some tabular features extracted from the text and changing the prediction task to whether a question is answered.
Step 1: Get the data in BigQuery. Remember to replace your-project below with the name of your Google Cloud project.
In [0]:
%%bigquery df_tabular --project your-project
SELECT
    title,
    answer_count,
    LENGTH(title) AS title_len,
    ARRAY_LENGTH(SPLIT(title, " ")) AS word_count,
    ENDS_WITH(title, "?") AS ends_with_q_mark,
    REPLACE(tags, "|", ",") AS tags,
    IF(answer_count > 0, 1, 0) AS is_answered
FROM
    `bigquery-public-data.stackoverflow.posts_questions`
WHERE
    REGEXP_CONTAINS(tags, r"(?:keras|matplotlib|pandas)")
LIMIT 1000
In [52]:
df_tabular.head()
Out[52]:
Step 2: Extract the tabular features
In [57]:
# .copy() avoids pandas' SettingWithCopyWarning when casting the column below
stacko_tabular_features = df_tabular[['title_len', 'word_count', 'ends_with_q_mark']].copy()
stacko_tabular_features['ends_with_q_mark'] = stacko_tabular_features['ends_with_q_mark'].astype(int)
stacko_tabular_features.head()
Out[57]:
Step 3: Create an Input layer for the tabular features
In [0]:
stacko_tabular_input = Input(shape=(len(stacko_tabular_features.values[0]),))
stacko_tabular_layer = Dense(32, activation='relu')(stacko_tabular_input)
Step 4: Define a model using stacko_tabular_layer and our BOW layer from above
In [0]:
merged_mixed_input = keras.layers.concatenate([stacko_tabular_layer, bow_layer])
merged_mixed_text = Dense(16)(merged_mixed_input)
merged_mixed_output = Dense(1)(merged_mixed_text)
mixed_text_model = Model(inputs=[stacko_tabular_input, bow_input], outputs=merged_mixed_output)
In [66]:
mixed_text_model.summary()
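Since is_answered is a binary label and the output layer has no activation, a cross-entropy loss computed from logits fits here. A minimal training sketch; note that we rebuild the BOW matrix from df_tabular so its rows align with the labels (hyperparameters are illustrative):
In [0]:
# Rebuild the BOW matrix from df_tabular so rows line up with is_answered
bow_matrix_tabular = stacko_tokenize.texts_to_matrix(df_tabular['title'].values)
# The Dense(1) output has no activation, so compute the loss from logits
mixed_text_model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'])
mixed_text_model.fit(
    [stacko_tabular_features.values, bow_matrix_tabular],
    df_tabular['is_answered'].values,
    epochs=3)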
Step 1: Define the pixel value and tiled representation layers
In [0]:
# Define image input layer (same shape for both pixel and tiled representation)
image_input = Input(shape=(28,28,1))
# Define pixel representation
pixel_layer = Flatten()(image_input)
# Define tiled representation
tiled_layer = Conv2D(filters=16, kernel_size=3, activation='relu')(image_input)
tiled_layer = MaxPooling2D()(tiled_layer)
tiled_layer = Flatten()(tiled_layer)
Step 2: Concatenate the layers and create a model
In [0]:
merged_image_layers = keras.layers.concatenate([pixel_layer, tiled_layer])
merged_dense = Dense(16, activation='relu')(merged_image_layers)
merged_output = Dense(1)(merged_dense)
mixed_image_model = Model(inputs=image_input, outputs=merged_output)
In [70]:
mixed_image_model.summary()
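A minimal sketch of fitting this model on MNIST-style 28x28 grayscale digits; the dataset choice, loss, and hyperparameters are assumptions for illustration, not from the original:
In [0]:
# Illustrative only: treat the digit value as a regression target for the
# single Dense(1) output defined above
(x_train, y_train), _ = keras.datasets.mnist.load_data()
x_train = x_train.reshape(-1, 28, 28, 1).astype('float32') / 255.0
mixed_image_model.compile(optimizer='adam', loss='mse')
mixed_image_model.fit(x_train, y_train, epochs=1, batch_size=128)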
### Combining images and metadata
This section shows how to feed both images and their associated metadata into a single model. To demonstrate, we'll use the dummy tabular dataset below.
In [0]:
tabular_image_metadata = {
    'time': [9, 10, 2],
    'visibility': [0.2, 0.5, 0.1],
    'inclement_weather': [[0,0,1], [0,0,1], [1,0,0]],
    'location': [[0,1,0,0,0], [0,0,0,1,0], [1,0,0,0,0]]
}
Step 1: Concatenate all tabular features
In [0]:
tabular_image_features = np.concatenate((
    np.expand_dims(tabular_image_metadata['time'], axis=1),
    np.expand_dims(tabular_image_metadata['visibility'], axis=1),
    np.array(tabular_image_metadata['inclement_weather']),
    np.array(tabular_image_metadata['location'])
), axis=1)
In [82]:
# Preview the data
tabular_image_features
Out[82]:
Step 2: Define the tabular layer
In [0]:
image_tabular_input = Input(shape=(len(tabular_image_features[0]),))
image_tabular_layer = Dense(32, activation='relu')(image_tabular_input)
Step 3: Merge the tabular layer with the tiled layer we defined above
In [0]:
mixed_image_layers = keras.layers.concatenate([image_tabular_layer, tiled_layer])
merged_image_dense = Dense(16, activation='relu')(mixed_image_layers)
merged_image_output = Dense(1)(merged_image_dense)
# The tiled layer is fed by image_input, defined in the image model above
mixed_image_tabular_model = Model(inputs=[image_tabular_input, image_input], outputs=merged_image_output)
In [86]:
mixed_image_tabular_model.summary()
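A minimal sketch of fitting this model, assuming three random placeholder 28x28 images to pair with the three metadata rows (the images and label values are made up for illustration):
In [0]:
# Illustrative only: random placeholder images and labels for the three metadata rows
dummy_images = np.random.rand(3, 28, 28, 1).astype('float32')
dummy_labels = np.array([1.0, 0.0, 1.0])
mixed_image_tabular_model.compile(optimizer='adam', loss='mse')
mixed_image_tabular_model.fit([tabular_image_features, dummy_images], dummy_labels, epochs=2)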
Copyright 2020 Google Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.