Parsing and Querying AI Platform Prediction request-response logs in BigQuery

This tutorial shows you how to create a view to parse the raw request instances and response predictions logged from AI Platfrom Prediction to BigQuery. The tutorial covers the following steps:

  1. Define dataset metadata
  2. Generate the CREATE VIEW script that parse the raw data
  3. Execute the script
  4. Query the view to retreive the parsed data

Setup

Install packages and dependencies


In [ ]:
!pip install -U -q google-api-python-client
!pip install -U -q pandas

Configure GCP environment settings


In [ ]:
PROJECT_ID = "sa-data-validation"
MODEL_NAME = 'covertype_classifier'
VERSION_NAME = 'v1' 
BQ_DATASET_NAME = 'prediction_logs'
BQ_TABLE_NAME = 'covertype_classifier_logs'  
!gcloud config set project $PROJECT_ID

Authenticate your GCP account

This is required if you run the notebook in Colab


In [ ]:
try:
  from google.colab import auth
  auth.authenticate_user()
  print("Colab user is authenticated.")
except: pass

Import libraries


In [ ]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import pandas as pd
from google.cloud import bigquery

1. Define Dataset Metadata


In [ ]:
HEADER = ['Elevation', 'Aspect', 'Slope','Horizontal_Distance_To_Hydrology',
          'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
          'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
          'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area', 'Soil_Type',
          'Cover_Type']

TARGET_FEATURE_NAME = 'Cover_Type'

FEATURE_LABELS = ['0', '1', '2', '3', '4', '5', '6']

NUMERIC_FEATURE_NAMES = ['Aspect', 'Elevation', 'Hillshade_3pm', 
                         'Hillshade_9am', 'Hillshade_Noon', 
                         'Horizontal_Distance_To_Fire_Points',
                         'Horizontal_Distance_To_Hydrology',
                         'Horizontal_Distance_To_Roadways','Slope',
                         'Vertical_Distance_To_Hydrology']

CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    'Soil_Type': ['2702', '2703', '2704', '2705', '2706', '2717', '3501', '3502', 
                  '4201', '4703', '4704', '4744', '4758', '5101', '6101', '6102', 
                  '6731', '7101', '7102', '7103', '7201', '7202', '7700', '7701', 
                  '7702', '7709', '7710', '7745', '7746', '7755', '7756', '7757', 
                  '7790', '8703', '8707', '8708', '8771', '8772', '8776'], 
    'Wilderness_Area': ['Cache', 'Commanche', 'Neota', 'Rawah']
}

FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys()) + NUMERIC_FEATURE_NAMES

HEADER_DEFAULTS = [[0] if feature_name in NUMERIC_FEATURE_NAMES + [TARGET_FEATURE_NAME] else ['NA'] 
                   for feature_name in HEADER]

NUM_CLASSES = len(FEATURE_LABELS)

2. Generate the CREATE VIEW script


In [ ]:
LABEL_KEY = 'predicted_label'
SCORE_KEY = 'confidence'
SIGNATURE_NAME = 'serving_default'


def _extract_json(column, feature_name):
  return "JSON_EXTRACT({}, '$.{}')".format(column, feature_name)

def _replace_brackets(field):
  return "REPLACE(REPLACE({}, ']', ''), '[','')".format(field)

def _replace_quotes(field):
  return 'REPLACE({}, "\\"","")'.format(field)

def _cast_to_numeric(field):
  return "CAST({} AS NUMERIC)".format(field)

def _add_alias(field, feature_name):
  return "{} AS {}".format(field, feature_name)

view_name = "vw_"+BQ_TABLE_NAME+"_"+VERSION_NAME

colum_names = FEATURE_NAMES
input_features = ', \r\n  '.join(colum_names)

json_features_extraction = []
for feature_name in colum_names:
  field = _extract_json('instance', feature_name)
  field = _replace_brackets(field) 
  if feature_name in NUMERIC_FEATURE_NAMES:
    field = _cast_to_numeric(field)
  else:
    field = _replace_quotes(field)
  field = _add_alias(field, feature_name)
  json_features_extraction.append(field)

json_features_extraction = ', \r\n    '.join(json_features_extraction)

json_prediction_extraction = []
for feature_name in [LABEL_KEY, SCORE_KEY]:
  field = _extract_json('prediction', feature_name)
  field = _replace_brackets(field) 
  if feature_name == SCORE_KEY:
    field = _cast_to_numeric(field)
  else:
    field = _replace_quotes(field)
  field = _add_alias(field, feature_name)
  json_prediction_extraction.append(field)

json_prediction_extraction = ', \r\n    '.join(json_prediction_extraction)

In [ ]:
sql_script = '''
CREATE OR REPLACE VIEW @dataset_name.@view_name
AS

WITH step1
AS
(
  SELECT 
    model, 
    model_version, 
    time, 
    SPLIT(JSON_EXTRACT(raw_data, '$.instances'), '}],[{') instance_list, 
    SPLIT(JSON_EXTRACT(raw_prediction, '$.predictions'), '}],[{') as prediction_list
  FROM 
  `@project.@dataset_name.@table_name` 
  WHERE 
    model = '@model_name' AND
    model_version = '@version'
),

step2
AS
(
  SELECT
    model, 
    model_version, 
    time, 
    REPLACE(REPLACE(instance, '[{', '{'),'}]', '}') AS instance,
    REPLACE(REPLACE(prediction, '[{', '{'),'}]', '}') AS prediction,
  FROM step1
  JOIN UNNEST(step1.instance_list) AS instance
  WITH OFFSET AS f1
  JOIN UNNEST(step1.prediction_list) AS prediction
  WITH OFFSET AS f2
  ON f1=f2
),

step3 AS
(
  SELECT 
    model, 
    model_version, 
    time,
    @json_features_extraction,
    @json_prediction_extraction
  FROM step2
)

SELECT *
FROM step3

'''

In [ ]:
sql_script = sql_script.replace("@project", PROJECT_ID)
sql_script = sql_script.replace("@dataset_name", BQ_DATASET_NAME)
sql_script = sql_script.replace("@table_name", BQ_TABLE_NAME)
sql_script = sql_script.replace("@view_name", view_name)
sql_script = sql_script.replace("@model_name", MODEL_NAME)
sql_script = sql_script.replace("@version", VERSION_NAME)
sql_script = sql_script.replace("@input_features", input_features)
sql_script = sql_script.replace("@json_features_extraction", json_features_extraction)
sql_script = sql_script.replace("@json_prediction_extraction", json_prediction_extraction)

Print generated script


In [ ]:
print(sql_script)

3. Execute the CREATE VIEW scritp


In [ ]:
client = bigquery.Client(PROJECT_ID)
client.query(query = sql_script)
print("View was created or replaced.")

4. Query the view


In [ ]:
query = '''
  SELECT * FROM 
  `{}.{}` 
  LIMIT {}
'''.format(BQ_DATASET_NAME, view_name, 3)

pd.io.gbq.read_gbq(
    query, project_id=PROJECT_ID).T

In [ ]: