I decided after seeing the Instacart competition data was relational that it was finally time for me to pick up SQL. It didn't help me ascend past Sh1ng's feature engineering, but models using it got me a quasi-respectable .402x. After a while I decided that getting up the leaderboard was less important than learning...
Initially I worked in SQLite, then moved to Postgres to use it's arrays and aggregate functions.
The outputs are a set of dataframes grouped by different things (order/products, user, user/department, etc) that are joined in the models to save disk space etc.
In [2]:
import pickle
import numpy as np
import odo
import pandas as pd
In [3]:
# Not included in Kaggle Docker image - with docker-compose it only actually installs the package once anyway.
import os
os.system('pip install psycopg2')
import psycopg2
# note: database must be created by psql command line
conn_string = "host='db' dbname='instacart811' user='postgres' password='insta'"
conn = psycopg2.connect(conn_string)
In [4]:
def pd_cmd(s):
return pd.read_sql_query(s, conn)
def sql_cmd(s):
try:
cursor = conn.cursor()
cursor.execute(s)
except Exception as e:
print(e)
cursor.close()
conn.rollback()
cursor.close()
conn.commit()
def sql_explain(s):
try:
cursor = conn.cursor()
cursor.execute('EXPLAIN ANALYZE ' + s)
print(cursor.fetchall())
except Exception as e:
print(e)
cursor.close()
conn.rollback()
cursor.close()
conn.commit()
def sql_mktable(s, name, prikey = None):
try:
cursor = conn.cursor()
cursor.execute('DROP TABLE IF EXISTS ' + name + ' CASCADE; CREATE TABLE ' + name + ' AS ' + s)
if prikey is not None:
cursor.execute("ALTER TABLE " + name + " ADD PRIMARY KEY (" + prikey + ");")
except Exception as e:
print(e)
cursor.close()
conn.rollback()
cursor.close()
conn.commit()
Didn't spend enough time trying to get this to work. It was the biggest piece of Sh1ng's FE that I didn't reimplement!
In [107]:
# https://wiki.postgresql.org/wiki/Aggregate_Median
if False:
sql_cmd(
'''
CREATE OR REPLACE FUNCTION _final_median(NUMERIC[])
RETURNS NUMERIC AS
$$
SELECT AVG(val)
FROM (
SELECT val
FROM unnest($1) val
ORDER BY 1
LIMIT 2 - MOD(array_upper($1, 1), 2)
OFFSET CEIL(array_upper($1, 1) / 2.0) - 1
) sub;
$$
LANGUAGE 'sql' IMMUTABLE;
CREATE AGGREGATE median(NUMERIC) (
SFUNC=array_append,
STYPE=NUMERIC[],
FINALFUNC=_final_median,
INITCOND='{}'
);
'''
)
In [7]:
for f in ['aisles', 'departments', 'order_products__prior', 'order_products__train', 'orders', 'products']:
fname = '../input/{0}.csv'.format(f)
fd = open(fname, 'r')
#dshape = odo.discover(odo.resource(fname))
#print(dshape)
t = odo.odo(fname, 'postgresql://postgres:insta@db/instacart811::{0}'.format(f))
In [13]:
cursor = conn.cursor()
cursor.execute('''
ALTER TABLE "aisles" ADD PRIMARY KEY (aisle_id);
ALTER TABLE "departments" ADD PRIMARY KEY (department_id);
ALTER TABLE "orders" ADD PRIMARY KEY (order_id);
ALTER TABLE "products" ADD PRIMARY KEY (product_id);
-- The order_products__* tables have no primary key AFAIK
--ALTER TABLE "order_products__prior" ADD PRIMARY KEY (order_id);
--ALTER TABLE "order_products__train" ADD PRIMARY KEY (order_id);
''')
cursor.close()
conn.commit()
In [16]:
cursor = conn.cursor()
cursor.execute('''
DROP TABLE IF EXISTS orders_time;
CREATE TABLE orders_time AS
WITH orders_hdiff AS (
SELECT *,
lag(order_hour_of_day) OVER (PARTITION BY user_id ORDER BY order_number) as __ohod_lag,
lag(order_dow) OVER (PARTITION BY user_id ORDER BY order_number) as __odow_lag
FROM orders
--WHERE user_id < 10
), orders_gapcount AS (
SELECT *,
sum((COALESCE(orders.days_since_prior_order, 0) = 30)::int) OVER (PARTITION BY user_id ORDER BY order_number) AS order_month_count,
CASE WHEN days_since_prior_order = 30 THEN 0
WHEN days_since_prior_order is NULL THEN 0
ELSE days_since_prior_order + ((order_hour_of_day - __ohod_lag) / 24.0)
END as __dpo_corr_month,
-- Base is 38 to cover potential negative dow/time offsets. It's a clipped number *anyway*
CASE WHEN days_since_prior_order = 30 THEN 38 + (order_dow - __odow_lag) + ((order_hour_of_day - __ohod_lag) / 24.0)
WHEN days_since_prior_order is NULL THEN 0
ELSE days_since_prior_order + ((order_hour_of_day - __ohod_lag) / 24.0)
END as __dpo_corr
FROM orders_hdiff as orders
)
SELECT order_id,
-- user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order,
order_month_count,
__dpo_corr as order_dpo_decimal,
SUM(__dpo_corr_month) OVER (PARTITION BY user_id, order_month_count ORDER BY order_number) as order_elapsed_time_inmonth,
SUM(__dpo_corr) OVER (PARTITION BY user_id ORDER BY order_number) as order_elapsed_time
FROM orders_gapcount;
''')
cursor.close()
conn.commit()
In [17]:
sql_cmd('''ALTER TABLE "orders_time" ADD PRIMARY KEY (order_id);''')
In [39]:
pd.read_sql_query('''select * from orders_time WHERE order_id < 10;''', conn)
Out[39]:
In [233]:
cmd = '''
SELECT orders.*,
orders.order_number as user_last_order,
orders_time.order_elapsed_time as user_last_order_time
FROM orders
INNER JOIN orders_time ON orders_time.order_id = orders.order_id
WHERE orders.eval_set != 'prior' -- and orders.user_id < 10
--GROUP BY orders.user_id
'''
#pd_cmd(cmd)
sql_mktable(cmd, 'user_last_order', 'user_id')
In [7]:
pd_cmd('''SELECT * from user_last_order where user_id < 10''')
Out[7]:
In [114]:
cmd = '''
SELECT order_id,
MAX(order_products__prior.add_to_cart_order) as order_cart_size
FROM order_products__prior
GROUP BY order_id
'''
#pd_cmd(cmd)
sql_mktable(cmd, 'order_cart_size', "order_id")
In [209]:
cmd = '''
DROP VIEW IF EXISTS opp_rel;
CREATE OR REPLACE VIEW opp_rel AS
SELECT order_products__prior.*,
order_products__prior.add_to_cart_order / ocs.order_cart_size::float as add_to_cart_order_rel,
ocs.order_cart_size - order_products__prior.add_to_cart_order as add_to_cart_order_inverted,
(ocs.order_cart_size - order_products__prior.add_to_cart_order) / ocs.order_cart_size::float as add_to_cart_order_inverted_rel
FROM order_products__prior
INNER JOIN order_cart_size AS ocs ON ocs.order_id = order_products__prior.order_id
'''
sql_cmd(cmd)
In [46]:
cmd = '''
WITH user_product_l0 as (
SELECT orders.user_id, opp.product_id,
COUNT(*) as up_count,
AVG(opp.add_to_cart_order_rel) as up_add_to_cart_order_mean,
AVG(opp.add_to_cart_order_inverted_rel) as up_add_to_cart_order_inverted_mean,
ARRAY_AGG((ulo.user_last_order_time - order_elapsed_time) ORDER BY orders.order_number DESC) as up_order_times_last,
ARRAY_AGG(order_elapsed_time ORDER BY orders.order_number DESC) as up_order_times,
ARRAY_AGG(order_elapsed_time ORDER BY orders.order_number) as up_order_times_asc,
ARRAY_AGG(orders.order_number::int ORDER BY orders.order_number DESC) as up_order_numbers
FROM orders
INNER JOIN orders_time ON orders_time.order_id = orders.order_id
INNER JOIN order_cart_size ON order_cart_size.order_id = orders.order_id
INNER JOIN opp_rel as opp on opp.order_id = orders.order_id
INNER JOIN user_last_order as ulo on ulo.user_id = orders.user_id
-- WHERE orders.user_id = 1
GROUP BY orders.user_id, opp.product_id)
SELECT
CASE WHEN up_count >= 2 THEN (up_order_times[1] - up_order_times_asc[1]) / (up_count - 1)
ELSE 0
END AS up_reorder_rate,
up_count - 1 AS up_reorder_count,
*
FROM user_product_l0
'''
sql_mktable(cmd, 'user_product_l1', "user_id, product_id")
#x = pd_cmd(cmd)
#x.head()
#sql_cmd('EXPLAIN ' + cmd)
#pd_cmd(cmd)
In [47]:
pd_cmd('''SELECT * FROM user_product_l1 WHERE user_id = 1''')
Out[47]:
In [59]:
cmd = '''
SELECT user_id, product_id, reordered
FROM orders
LEFT JOIN order_products__train ON order_products__train.order_id = orders.order_id
WHERE orders.eval_set = 'train' and order_products__train.reordered = 1 -- and user_id = 1
'''
sql_mktable(cmd, 'train_reordered', "user_id, product_id")
In [51]:
cmd = '''
select up.user_id, up.product_id,
-- ulo.eval_set,
-- ulo.order_id,
CASE WHEN up_reorder_rate > 0 THEN up_reorder_rate ELSE -1 END as up_reorder_rate,
up_count,
GREATEST(reordered, 0) as reordered,
ulo.user_last_order as user_max_order,
ulo.user_last_order_time - up_order_times_last[1] as up_time_first,
ulo.user_last_order_time - up_order_times[1] as up_time_last,
(ulo.user_last_order_time - up_order_times[1]) / GREATEST(ulo.user_last_order_time, 1) as up_time_last_ratio,
ulo.user_last_order_time - up_order_times[2] as up_time_prev1,
ulo.user_last_order_time - up_order_times[3] as up_time_prev2,
ulo.user_last_order - up_order_numbers[1] as up_ordergap_last,
up_order_numbers[1] as up_last_order,
up_order_numbers[array_upper(up_order_numbers, 1)] as up_first_order,
up_count / GREATEST((ulo.user_last_order - 1), 1)::float as up_order_rate,
up.up_add_to_cart_order_mean, up.up_add_to_cart_order_inverted_mean
from user_product_l1 as up
inner join user_last_order as ulo on ulo.user_id = up.user_id
left outer join train_reordered as tro on tro.user_id = up.user_id and tro.product_id = up.product_id
--where up.user_id < 50000
--where up.user_id = 1
'''
# data['up_order_rate'] = data.up_order_count / (data.user_num_orders - 1)
df_up1 = pd_cmd(cmd)
df_up1.head(20)
Out[51]:
In [52]:
df_up1.to_pickle('testdata.pkl')
In [53]:
cmd = '''
select ulo.user_id,
ulo.eval_set,
ulo.order_id,
ulo.days_since_prior_order,
ulo.order_dow,
ulo.order_hour_of_day,
--ulo.order_dpo_decimal,
ulo.days_since_prior_order / GREATEST(user_last_order_time, 1) as u_dpo_ratio,
AVG((up.up_count > 1)::int) / GREATEST(SUM((up.up_count >= 1)::int), 1) as u_reordered_product_ratio,
SUM((up.up_count >= 1)::int) as u_num_products
from user_last_order as ulo
inner join user_product_l1 as up on up.user_id = ulo.user_id
--where ulo.eval_set = 'train' and ulo.user_id < 50000
--where ulo.eval_set = 'train' and ulo.user_id < 5
group by ulo.user_id
'''
df_u1 = pd_cmd(cmd)
df_u1.head(20)
Out[53]:
In [243]:
df_u1.to_pickle('testdata_user.pkl')
In [61]:
cmd = '''
select p.product_id,
SUM(up_reorder_rate) / GREATEST(SUM((up_reorder_count > 0)::int), 1)::float as p_reorder_rate,
AVG(up_count / (ulo.user_last_order - 1)) as p_order_rate,
AVG(up_add_to_cart_order_mean) as p_add_to_cart_order_mean,
SUM((up_count > 1)::int) / GREATEST(SUM((up_count >= 1)::int), 1)::float as p_reordered_product_ratio,
SUM((up_count > 0)::int) as p_users,
p.aisle_id, p.department_id
from user_product_l1 as up
inner join user_last_order as ulo on ulo.user_id = up.user_id
inner join products as p on p.product_id = up.product_id
left outer join train_reordered as tro on tro.user_id = up.user_id and tro.product_id = up.product_id
--where up.user_id < 500
group by p.product_id
'''
# data['up_order_rate'] = data.up_order_count / (data.user_num_orders - 1)
df_p1 = pd_cmd(cmd)
df_p1.head(20)
Out[61]:
In [62]:
df_p1.to_pickle('testdata_prod.pkl')
In [238]:
cmd = '''
select p.aisle_id,
AVG(up_count / (ulo.user_last_order - 1)) as a_order_rate,
SUM((up_count > 1)::int) / SUM((up_count >= 1)::int)::float as a_reordered_product_ratio,
SUM((up_count > 0)::int) as a_up_count
from user_product_l1 as up
inner join user_last_order as ulo on ulo.user_id = up.user_id
inner join products as p on p.product_id = up.product_id
left outer join train_reordered as tro on tro.user_id = up.user_id and tro.product_id = up.product_id
--where ulo.eval_set = 'train' and up.user_id < 50000
group by p.aisle_id
'''
# data['up_order_rate'] = data.up_order_count / (data.user_num_orders - 1)
df_a1 = pd_cmd(cmd)
df_a1.to_pickle('testdata_aisle.pkl')
df_a1.head(20)
Out[238]:
In [239]:
cmd = '''
select p.department_id,
SUM((up_count > 1)::int) / SUM((up_count >= 1)::int)::float as d_reordered_product_ratio,
AVG(up_count / (ulo.user_last_order - 1)) as d_order_rate,
SUM((up_count > 0)::int) as d_up_count
from user_product_l1 as up
inner join user_last_order as ulo on ulo.user_id = up.user_id
inner join products as p on p.product_id = up.product_id
left outer join train_reordered as tro on tro.user_id = up.user_id and tro.product_id = up.product_id
--where ulo.eval_set = 'train' and up.user_id < 50000
group by p.department_id
'''
# data['up_order_rate'] = data.up_order_count / (data.user_num_orders - 1)
df_d1 = pd_cmd(cmd)
df_d1.to_pickle('testdata_dept.pkl')
df_d1.head(20)
Out[239]:
In [57]:
cmd = '''
WITH uo0 AS
(
SELECT orders.user_id, orders.order_id, orders.order_number,
SUM(reordered) as o_reordered_count,
COUNT(*) as o_cart_size
FROM orders
INNER JOIN order_products__prior as opp on orders.order_id = opp.order_id
--WHERE user_id <= 5
GROUP by orders.order_id
), uo1 AS (
SELECT user_id,
SUM((o_reordered_count = 0)::int) uo_nonecount,
ARRAY_AGG(o_reordered_count ORDER BY order_number DESC) as uo_agg_reordered_count,
COUNT(*) as u_order_count,
AVG(o_cart_size) AS u_avg_cart_size
FROM uo0
GROUP BY user_id
)
SELECT user_id, uo_nonecount, u_order_count, u_avg_cart_size, (uo_agg_reordered_count[1] = 0)::int as uo_prevnone
FROM uo1
'''
df_uo = pd_cmd(cmd)
df_uo.set_index('user_id').to_pickle('testdata-uo.pkl')
df_uo.head()
Out[57]:
In [127]:
cmd = '''
SELECT orders.order_id,
AVG(reordered) as reorder_avg,
COUNT(*) as order_size
FROM orders
INNER JOIN order_products__prior as opp on orders.order_id = opp.order_id
-- WHERE user_id <= 2
GROUP by orders.order_id
'''
#pd_cmd(cmd)
sql_mktable(cmd, 'order_reordered_avg', 'order_id')
In [193]:
pd_cmd('''select * from order_reordered_avg limit 5;''')
Out[193]:
In [194]:
cmd = '''
WITH reordered_agg AS (
SELECT user_id,
-- ARRAY_AGG(orders.order_id ORDER BY order_number DESC) as order_id,
ARRAY_AGG(reorder_avg ORDER BY order_number DESC) as reorder_agg,
ARRAY_AGG(order_size ORDER BY order_number DESC) as order_size_agg
FROM orders
INNER JOIN order_reordered_avg ON order_reordered_avg.order_id = orders.order_id
WHERE orders.eval_set = 'prior' -- and user_id < 5
GROUP BY user_id
)
SELECT user_id,
reorder_agg[1] as u_reordered_prev1,
reorder_agg[2] as u_reordered_prev2,
reorder_agg[3] as u_reordered_prev3,
order_size_agg[1] as u_ordersize_prev1,
order_size_agg[2] as u_ordersize_prev2,
order_size_agg[3] as u_ordersize_prev3
FROM reordered_agg
'''
#INNER JOIN orders_time ON orders_time.order_id = orders.order_id
df_reorderedpast = pd_cmd(cmd)
In [196]:
df_reorderedpast.to_pickle('testdata-user_reordered.pkl')
In [195]:
df_reorderedpast.head()
Out[195]:
In [77]:
cmd = '''
WITH opc AS (
SELECT * FROM order_products__prior
UNION
SELECT * FROM order_products__train
), opd0 AS (
SELECT opc.order_id, opc.add_to_cart_order, opc.reordered, products.department_id
FROM opc
INNER JOIN products ON products.product_id = opc.product_id
-- WHERE opc.order_id < 5
)
SELECT
order_id,
department_id,
MIN(add_to_cart_order) as add_to_cart_order,
COUNT(*) as products,
SUM(reordered)::int as reordered_count,
(SUM(reordered) >= 1)::int as reordered
FROM opd0
GROUP BY order_id, department_id
'''
#pd_cmd(cmd)
# sql_mktable(cmd, 'order_departments')
In [79]:
cmd = '''
WITH opd0 AS (
SELECT opc.order_id, opc.add_to_cart_order, opc.reordered, products.department_id
FROM order_products__prior as opc
INNER JOIN products ON products.product_id = opc.product_id
-- WHERE opc.order_id < 5
)
SELECT
order_id,
department_id,
MIN(add_to_cart_order) as add_to_cart_order,
COUNT(*) as d_product_count,
SUM(reordered)::int as reordered_count,
(SUM(reordered) >= 1)::int as reordered
FROM opd0
GROUP BY order_id, department_id
'''
#pd_cmd(cmd)
sql_mktable(cmd, 'order_departments__prior')
In [88]:
cmd = '''
WITH opd0 AS (
SELECT opc.order_id, opc.add_to_cart_order, opc.reordered, products.department_id
FROM order_products__train as opc
INNER JOIN products ON products.product_id = opc.product_id
-- WHERE opc.order_id < 5
)
SELECT
order_id,
department_id,
MIN(add_to_cart_order) as add_to_cart_order,
COUNT(*) as d_product_count,
SUM(reordered)::int as reordered_count,
(SUM(reordered) >= 1)::int as reordered
FROM opd0
GROUP BY order_id, department_id
'''
#pd_cmd(cmd)
sql_mktable(cmd, 'order_departments__train')
In [83]:
cmd = '''
DROP VIEW IF EXISTS odp_rel;
CREATE OR REPLACE VIEW odp_rel AS
SELECT order_departments__prior.*,
order_departments__prior.add_to_cart_order / ocs.order_cart_size::float as add_to_cart_order_rel,
ocs.order_cart_size - order_departments__prior.add_to_cart_order as add_to_cart_order_inverted,
(ocs.order_cart_size - order_departments__prior.add_to_cart_order) / ocs.order_cart_size::float as add_to_cart_order_inverted_rel
FROM order_departments__prior
INNER JOIN order_cart_size AS ocs ON ocs.order_id = order_departments__prior.order_id
'''
sql_cmd(cmd)
#sql_mktable(cmd, 'order_cart_size', "order_id")
In [116]:
cmd = '''
WITH user_department_l0 as (
SELECT orders.user_id, opp.department_id,
COUNT(*) as ud_count,
AVG(opp.add_to_cart_order_rel) as ud_add_to_cart_order_mean,
AVG(opp.add_to_cart_order_inverted_rel) as ud_add_to_cart_order_inverted_mean,
ARRAY_AGG((ulo.user_last_order_time - order_elapsed_time) ORDER BY orders.order_number DESC) as ud_order_times_last,
ARRAY_AGG(order_elapsed_time ORDER BY orders.order_number DESC) as ud_order_times,
ARRAY_AGG(order_elapsed_time ORDER BY orders.order_number) as ud_order_times_asc,
ARRAY_AGG(orders.order_number::int ORDER BY orders.order_number DESC) as ud_order_numbers
FROM orders
INNER JOIN orders_time ON orders_time.order_id = orders.order_id
INNER JOIN order_cart_size ON order_cart_size.order_id = orders.order_id
INNER JOIN odp_rel as opp on opp.order_id = orders.order_id
INNER JOIN user_last_order as ulo on ulo.user_id = orders.user_id
-- WHERE orders.user_id = 1
GROUP BY orders.user_id, opp.department_id)
SELECT
CASE WHEN ud_count >= 2 THEN (ud_order_times[1] - ud_order_times_asc[1]) / (ud_count - 1)
ELSE 0
END AS ud_reorder_rate,
ud_count - 1 AS ud_reorder_count,
*
FROM user_department_l0
'''
sql_mktable(cmd, 'user_department_l1', "user_id, department_id")
#x = pd_cmd(cmd)
#x.head()
#sql_cmd('EXPLAIN ' + cmd)
#pd_cmd(cmd)
In [117]:
pd_cmd('''SELECT * FROM user_product_l1 WHERE user_id = 1''')
Out[117]:
In [89]:
cmd = '''
SELECT user_id, department_id, reordered
FROM orders
LEFT JOIN order_departments__train ON order_departments__train.order_id = orders.order_id
WHERE orders.eval_set = 'train' and order_departments__train.reordered = 1 -- and user_id = 1
'''
sql_mktable(cmd, 'train_departments_reordered', "user_id, department_id")
In [125]:
cmd = '''
select ud.user_id, ud.department_id,
CASE WHEN ud_reorder_rate > 0 THEN ud_reorder_rate ELSE -1 END as ud_reorder_rate,
ud_count,
GREATEST(reordered, 0) as ud_reordered,
-- ulo.user_last_order as user_max_order,
ulo.user_last_order_time - ud_order_times_last[1] as ud_time_first,
ulo.user_last_order_time - ud_order_times[1] as ud_time_last,
(ulo.user_last_order_time - ud_order_times[1]) / GREATEST(ulo.user_last_order_time, 1) as ud_time_last_ratio,
ulo.user_last_order_time - ud_order_times[2] as ud_time_prev1,
ulo.user_last_order_time - ud_order_times[3] as ud_time_prev2,
ulo.user_last_order - ud_order_numbers[1] as ud_ordergap_last,
ud_order_numbers[1] as ud_last_order,
ud_order_numbers[array_upper(ud_order_numbers, 1)] as ud_first_order,
ud_count / GREATEST((ulo.user_last_order - 1), 1)::float as ud_order_rate,
ud.ud_add_to_cart_order_mean, ud.ud_add_to_cart_order_inverted_mean
from user_department_l1 as ud
inner join user_last_order as ulo on ulo.user_id = ud.user_id
left outer join train_departments_reordered as tro on tro.user_id = ud.user_id and tro.department_id = ud.department_id
--where up.user_id < 50000
--where ud.user_id = 1
'''
# data['up_order_rate'] = data.up_order_count / (data.user_num_orders - 1)
df_udl1 = pd_cmd(cmd)
df_udl1.head(20)
Out[125]:
In [126]:
# merge w/data on user,dept
df_udl1.to_pickle('testdata_ud.pkl')
In [120]:
cmd = '''
select ulo.user_id,
AVG((ud.ud_count > 1)::int) / GREATEST(SUM((ud.ud_count >= 1)::int), 1) as u_reordered_department_ratio,
SUM((ud.ud_count >= 1)::int) as u_num_departments
from user_last_order as ulo
--inner join user_product_l1 as up on up.user_id = ulo.user_id
inner join user_department_l1 as ud on ud.user_id = ulo.user_id
--where ulo.eval_set = 'train' and ulo.user_id < 50000
--where ulo.eval_set = 'train' and ulo.user_id < 5
group by ulo.user_id
'''
df_ud1 = pd_cmd(cmd)
df_ud1.head(20)
Out[120]:
In [122]:
# merge on user_id
df_ud1.to_pickle('testdata_user_dept.pkl')
In [ ]: