In [1]:
%%bigquery
SELECT
bqutil.fn.median(ARRAY_AGG(TIMESTAMP_DIFF(a.creation_date, q.creation_date, SECOND))) AS time_to_answer
FROM `bigquery-public-data.stackoverflow.posts_questions` q
JOIN `bigquery-public-data.stackoverflow.posts_answers` a
ON q.accepted_answer_id = a.id
Out[1]:
Find the error metric of always predicting that it will take 2120 seconds to get an answer. This the baseline metric against which to report model performance.
In [2]:
%%bigquery
WITH benchmark_eval AS (
SELECT
2120 - TIMESTAMP_DIFF(a.creation_date, q.creation_date, SECOND) AS error
FROM `bigquery-public-data.stackoverflow.posts_questions` q
JOIN `bigquery-public-data.stackoverflow.posts_answers` a
ON q.accepted_answer_id = a.id
)
SELECT
AVG(ABS(error)) AS mean_absolute_error
FROM
benchmark_eval
Out[2]:
In [3]:
%%bigquery
SELECT
AVG(IF(a.last_edit_date IS NULL, 0, 1)) AS prob_edited
FROM `bigquery-public-data.stackoverflow.posts_questions` q
JOIN `bigquery-public-data.stackoverflow.posts_answers` a
ON q.accepted_answer_id = a.id
Out[3]:
Problem: Country from which a Stack Overflow question will be answered.
Benchmark: Fractions of answers written by people from France, India, and so on.
In [4]:
%%bigquery
SELECT
COUNTIF(ENDS_WITH(u.location, 'France')) / COUNT(u.location) AS from_france,
COUNTIF(ENDS_WITH(u.location, 'India')) / COUNT(u.location) AS from_india
FROM `bigquery-public-data.stackoverflow.posts_questions` q
JOIN `bigquery-public-data.stackoverflow.posts_answers` a
ON q.accepted_answer_id = a.id
JOIN `bigquery-public-data.stackoverflow.users` u
ON u.id = a.owner_user_id
Out[4]:
In [6]:
%%bigquery
With trips AS (
SELECT
total_amount,
ST_Distance(ST_GeogPoint(pickup_longitude, pickup_latitude),
ST_GeogPoint(dropoff_longitude, dropoff_latitude))/1000 AS dist
FROM `bigquery-public-data.new_york.tlc_yellow_trips_2015`
WHERE pickup_latitude BETWEEN 35 and 45
AND dropoff_latitude BETWEEN 35 and 45
AND pickup_longitude BETWEEN -80 and -70
AND dropoff_longitude BETWEEN -80 and -70
AND total_amount IS NOT NULL
)
SELECT AVG(total_amount)/AVG(dist)
FROM trips
Out[6]:
In [7]:
%%bigquery
CREATE TEMPORARY FUNCTION is_peak_hour(start_date TIMESTAMP) aS
(EXTRACT(DAYOFWEEK FROM start_date) BETWEEN 2 AND 6 -- weekday
AND (
EXTRACT(HOUR FROM start_date) BETWEEN 6 AND 10
OR
EXTRACT(HOUR FROM start_date) BETWEEN 15 AND 18))
;
SELECT
start_station_name,
is_peak_hour(start_date) AS is_peak,
AVG(duration) AS predicted_duration,
FROM `bigquery-public-data.london_bicycles.cycle_hire`
GROUP BY 1, 2
ORDER BY predicted_duration DESC
LIMIT 10
Out[7]:
In [ ]:
Copyright 2020 Google Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License