This tutorial shows you how to create a view to parse the raw request instances and response predictions logged from AI Platform Prediction to BigQuery. The tutorial covers the following steps:
In [ ]:
# Install/upgrade the client libraries used by this notebook (quiet mode).
!pip install -U -q google-api-python-client
!pip install -U -q pandas
In [ ]:
# GCP project and the deployed model/version whose prediction logs we parse.
PROJECT_ID = "sa-data-validation"
MODEL_NAME = 'covertype_classifier'
VERSION_NAME = 'v1'
# BigQuery dataset/table where AI Platform Prediction request-response
# logging writes the raw records.
BQ_DATASET_NAME = 'prediction_logs'
BQ_TABLE_NAME = 'covertype_classifier_logs'
!gcloud config set project $PROJECT_ID
In [ ]:
try:
    # google.colab is only importable inside Colab; anywhere else the import
    # raises ImportError and authentication is skipped (e.g. AI Platform
    # Notebooks are already authenticated via their service account).
    from google.colab import auth
    auth.authenticate_user()
    print("Colab user is authenticated.")
except ImportError:
    # Narrowed from a bare `except:` so that genuine authentication
    # failures inside Colab surface instead of being silently swallowed.
    pass
In [ ]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import pandas as pd
from google.cloud import bigquery
In [ ]:
# Covertype dataset schema: the column order must match the raw CSV header
# used when the model was trained.
HEADER = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
    'Wilderness_Area',
    'Soil_Type',
    'Cover_Type',
]

# Label column and the class ids it can take.
TARGET_FEATURE_NAME = 'Cover_Type'
FEATURE_LABELS = ['0', '1', '2', '3', '4', '5', '6']

# Columns that are numeric in the request payload.
NUMERIC_FEATURE_NAMES = [
    'Aspect',
    'Elevation',
    'Hillshade_3pm',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Horizontal_Distance_To_Fire_Points',
    'Horizontal_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Slope',
    'Vertical_Distance_To_Hydrology',
]

# Categorical columns together with their full vocabularies.
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    'Soil_Type': [
        '2702', '2703', '2704', '2705', '2706', '2717', '3501', '3502',
        '4201', '4703', '4704', '4744', '4758', '5101', '6101', '6102',
        '6731', '7101', '7102', '7103', '7201', '7202', '7700', '7701',
        '7702', '7709', '7710', '7745', '7746', '7755', '7756', '7757',
        '7790', '8703', '8707', '8708', '8771', '8772', '8776',
    ],
    'Wilderness_Area': ['Cache', 'Commanche', 'Neota', 'Rawah'],
}

# All input feature names: categorical first (dict insertion order), then numeric.
FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys()) + NUMERIC_FEATURE_NAMES

# Per-column CSV parsing defaults: 0 for numeric columns and the label,
# the string 'NA' for categorical columns.
HEADER_DEFAULTS = []
for column in HEADER:
    if column in NUMERIC_FEATURE_NAMES or column == TARGET_FEATURE_NAME:
        HEADER_DEFAULTS.append([0])
    else:
        HEADER_DEFAULTS.append(['NA'])

NUM_CLASSES = len(FEATURE_LABELS)
In [ ]:
# Keys of the prediction-output JSON written by the serving signature:
# the predicted class label and its confidence score.
LABEL_KEY = 'predicted_label'
SCORE_KEY = 'confidence'
SIGNATURE_NAME = 'serving_default'
def _extract_json(column, feature_name):
return "JSON_EXTRACT({}, '$.{}')".format(column, feature_name)
def _replace_brackets(field):
return "REPLACE(REPLACE({}, ']', ''), '[','')".format(field)
def _replace_quotes(field):
return 'REPLACE({}, "\\"","")'.format(field)
def _cast_to_numeric(field):
return "CAST({} AS NUMERIC)".format(field)
def _add_alias(field, feature_name):
return "{} AS {}".format(field, feature_name)
# Name of the view to (re)create, e.g. vw_covertype_classifier_logs_v1.
view_name = "vw_" + BQ_TABLE_NAME + "_" + VERSION_NAME

column_names = FEATURE_NAMES
colum_names = column_names  # NOTE: misspelled alias kept for backward compatibility.
input_features = ', \r\n '.join(column_names)


def _json_extraction_fields(column, field_names, numeric_names):
    """Build one aliased SQL extraction expression per name in *field_names*.

    Each expression JSON-extracts the value from *column*, strips the JSON
    list brackets, then either casts to NUMERIC (names in *numeric_names*)
    or strips double quotes (everything else), and finally aliases the
    result to the field name.  Shared by the feature and prediction loops,
    which previously duplicated this logic.
    """
    fields = []
    for name in field_names:
        field = _replace_brackets(_extract_json(column, name))
        if name in numeric_names:
            field = _cast_to_numeric(field)
        else:
            field = _replace_quotes(field)
        fields.append(_add_alias(field, name))
    return fields


# Input features are extracted from the logged request 'instance' JSON.
json_features_extraction = ', \r\n '.join(
    _json_extraction_fields('instance', column_names, NUMERIC_FEATURE_NAMES))

# Label and score come from the logged 'prediction' JSON; only the
# confidence score is numeric.
json_prediction_extraction = ', \r\n '.join(
    _json_extraction_fields('prediction', [LABEL_KEY, SCORE_KEY], [SCORE_KEY]))
In [ ]:
# Templated DDL for the parsing view; @-placeholders are substituted later.
# step1 filters the raw log table to one model/version and splits the JSON
# lists of instances/predictions; step2 pairs each instance with its
# prediction by UNNEST offset; step3 extracts typed columns from the JSON.
# NOTE(fix): the trailing comma after `AS prediction` (before FROM) was
# removed — it is a syntax error in strict GoogleSQL.
sql_script = '''
CREATE OR REPLACE VIEW @dataset_name.@view_name
AS
WITH step1
AS
(
SELECT
model,
model_version,
time,
SPLIT(JSON_EXTRACT(raw_data, '$.instances'), '}],[{') instance_list,
SPLIT(JSON_EXTRACT(raw_prediction, '$.predictions'), '}],[{') as prediction_list
FROM
`@project.@dataset_name.@table_name`
WHERE
model = '@model_name' AND
model_version = '@version'
),
step2
AS
(
SELECT
model,
model_version,
time,
REPLACE(REPLACE(instance, '[{', '{'),'}]', '}') AS instance,
REPLACE(REPLACE(prediction, '[{', '{'),'}]', '}') AS prediction
FROM step1
JOIN UNNEST(step1.instance_list) AS instance
WITH OFFSET AS f1
JOIN UNNEST(step1.prediction_list) AS prediction
WITH OFFSET AS f2
ON f1=f2
),
step3 AS
(
SELECT
model,
model_version,
time,
@json_features_extraction,
@json_prediction_extraction
FROM step2
)
SELECT *
FROM step3
'''
In [ ]:
# Substitute the concrete project/dataset/table/model values into the SQL
# template.  Insertion order matches the original replace sequence.
_substitutions = {
    "@project": PROJECT_ID,
    "@dataset_name": BQ_DATASET_NAME,
    "@table_name": BQ_TABLE_NAME,
    "@view_name": view_name,
    "@model_name": MODEL_NAME,
    "@version": VERSION_NAME,
    "@input_features": input_features,
    "@json_features_extraction": json_features_extraction,
    "@json_prediction_extraction": json_prediction_extraction,
}
for _placeholder, _value in _substitutions.items():
    sql_script = sql_script.replace(_placeholder, _value)
Print the generated SQL script.
In [ ]:
# Show the fully-substituted SQL for inspection before executing it.
print(sql_script)
In [ ]:
# Create or replace the parsing view in BigQuery.
client = bigquery.Client(PROJECT_ID)
# query() only *submits* the job; calling .result() blocks until it finishes
# so that any SQL error is raised here instead of being silently dropped,
# and the success message below is actually truthful.
client.query(sql_script).result()
print("View was created or replaced.")
In [ ]:
# Fetch a few rows from the new view to sanity-check the parsing.
query = '''
SELECT * FROM
`{}.{}`
LIMIT {}
'''.format(BQ_DATASET_NAME, view_name, 3)
# Use the public pandas entry point rather than the private pd.io.gbq module
# path; transpose so each logged example is displayed as a column.
pd.read_gbq(
    query, project_id=PROJECT_ID).T
In [ ]: