This notebook accompanies the article Training a recommendation model for Google Analytics data using BigQuery ML
In [27]:
%%bigquery df
WITH CTE_visitor_content_time AS (
SELECT
fullVisitorID AS visitorId,
visitNumber,
(SELECT MAX(IF(index=10, value, NULL)) FROM UNNEST(hits.customDimensions)) AS latestContentId,
hits.time AS hit_time
FROM
`cloud-training-demos.GA360_test.ga_sessions_sample`,
UNNEST(hits) AS hits
WHERE
# only include hits on pages
hits.type = "PAGE"
GROUP BY
fullVisitorId,
visitNumber,
latestContentId,
hits.time ),
CTE_visitor_page_content AS (
SELECT *,
# Schema: https://support.google.com/analytics/answer/3437719?hl=en
# For a completely unique visit-session ID, we combine combination of fullVisitorId and visitNumber:
(LEAD(hit_time, 1) OVER (PARTITION BY CONCAT(visitorId, visitNumber, latestContentId) ORDER BY hit_time ASC) - hit_time) AS session_duration
FROM CTE_visitor_content_time
)
-- Aggregate web stats
SELECT
visitorId,
latestContentId as contentId,
SUM(session_duration) AS session_duration
FROM
CTE_visitor_page_content
WHERE
latestContentId IS NOT NULL
GROUP BY
visitorId,
latestContentId
HAVING
session_duration > 0
In [28]:
df.head()
Out[28]:
In [30]:
df.describe()
Out[30]:
In [29]:
df[["session_duration"]].plot(kind="hist", logy=True, bins=100, figsize=[8,5]);
In [15]:
%%bigquery
CREATE TEMPORARY FUNCTION CLIP_LESS(x FLOAT64, a FLOAT64) AS (
IF (x < a, a, x)
);
CREATE TEMPORARY FUNCTION CLIP_GT(x FLOAT64, b FLOAT64) AS (
IF (x > b, b, x)
);
CREATE TEMPORARY FUNCTION CLIP(x FLOAT64, a FLOAT64, b FLOAT64) AS (
CLIP_GT(CLIP_LESS(x, a), b)
);
CREATE OR REPLACE TABLE advdata.ga360_recommendations_data
AS
WITH CTE_visitor_page_content AS (
SELECT
# Schema: https://support.google.com/analytics/answer/3437719?hl=en
# For a completely unique visit-session ID, we combine combination of fullVisitorId and visitNumber:
CONCAT(fullVisitorID,'-',CAST(visitNumber AS STRING)) AS visitorId,
(SELECT MAX(IF(index=10, value, NULL)) FROM UNNEST(hits.customDimensions)) AS latestContentId,
(LEAD(hits.time, 1) OVER (PARTITION BY fullVisitorId ORDER BY hits.time ASC) - hits.time) AS session_duration
FROM
`cloud-training-demos.GA360_test.ga_sessions_sample`,
UNNEST(hits) AS hits
WHERE
# only include hits on pages
hits.type = "PAGE"
GROUP BY
fullVisitorId,
visitNumber,
latestContentId,
hits.time ),
aggregate_web_stats AS (
-- Aggregate web stats
SELECT
visitorId,
latestContentId as contentId,
SUM(session_duration) AS session_duration
FROM
CTE_visitor_page_content
WHERE
latestContentId IS NOT NULL
GROUP BY
visitorId,
latestContentId
HAVING
session_duration > 0
),
normalized_session_duration AS (
SELECT APPROX_QUANTILES(session_duration,100)[OFFSET(50)] AS median_duration
FROM aggregate_web_stats
)
SELECT
* EXCEPT(session_duration, median_duration),
CLIP(0.3 * session_duration / median_duration, 0, 1.0) AS normalized_session_duration
FROM
aggregate_web_stats, normalized_session_duration
In [19]:
%%bigquery df_scaled
SELECT * FROM advdata.ga360_recommendations_data
In [20]:
df_scaled[["normalized_session_duration"]].plot(kind="hist", logy=True, bins=100, figsize=[8,5]);
In [21]:
df_scaled.head()
Out[21]:
In [ ]:
%%bash
cd ../flex_slots
./run_query_on_flex_slots.sh
In [22]:
%%bigquery
SELECT
visitorId,
ARRAY_AGG(STRUCT(contentId, predicted_normalized_session_duration)
ORDER BY predicted_normalized_session_duration DESC
LIMIT 3)
FROM ML.RECOMMEND(MODEL advdata.ga360_recommendations_model)
WHERE predicted_normalized_session_duration < 1
GROUP BY visitorId
LIMIT 5
Out[22]:
Copyright 2020 Google Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License
In [ ]: