(Postgres) SQL Feature Engineering

I decided after seeing the Instacart competition data was relational that it was finally time for me to pick up SQL. It didn't help me ascend past Sh1ng's feature engineering, but models using it got me a quasi-respectable .402x. After a while I decided that getting up the leaderboard was less important than learning...

Initially I worked in SQLite, then moved to Postgres to use it's arrays and aggregate functions.

The outputs are a set of dataframes grouped by different things (order/products, user, user/department, etc) that are joined in the models to save disk space etc.


In [2]:
import pickle 

import numpy as np
import odo
import pandas as pd


/opt/conda/lib/python3.6/site-packages/odo/backends/pandas.py:94: FutureWarning: pandas.tslib is deprecated and will be removed in a future version.
You can access NaTType as type(pandas.NaT)
  @convert.register((pd.Timestamp, pd.Timedelta), (pd.tslib.NaTType, type(None)))

In [3]:
# Not included in Kaggle Docker image - with docker-compose it only actually installs the package once anyway.

import os
os.system('pip install psycopg2')

import psycopg2

# note:  database must be created by psql command line

conn_string = "host='db' dbname='instacart811' user='postgres' password='insta'"
conn = psycopg2.connect(conn_string)

I'm well aware that these helper functions are not best practice...

...but they simplified many of my calls past here, allowing me to switch quickly between trial runs and things that actually created tables.


In [4]:
def pd_cmd(s):
    return pd.read_sql_query(s, conn)

def sql_cmd(s):
    try:
        cursor = conn.cursor()
        cursor.execute(s)
    except Exception as e:
        print(e)
        cursor.close()
        conn.rollback()
    
    cursor.close()
    conn.commit()
    

def sql_explain(s):
    try:
        cursor = conn.cursor()
        cursor.execute('EXPLAIN ANALYZE ' + s)
        print(cursor.fetchall())
    except Exception as e:
        print(e)
        cursor.close()
        conn.rollback()
    
    cursor.close()
    conn.commit()
    
def sql_mktable(s, name, prikey = None):
    try:
        cursor = conn.cursor()
        cursor.execute('DROP TABLE IF EXISTS ' + name + ' CASCADE; CREATE TABLE ' + name + ' AS ' + s)
        if prikey is not None:
            cursor.execute("ALTER TABLE " + name + " ADD PRIMARY KEY (" + prikey + ");")
    except Exception as e:
        print(e)
        cursor.close()
        conn.rollback()
    
    cursor.close()
    conn.commit()

Didn't spend enough time trying to get this to work. It was the biggest piece of Sh1ng's FE that I didn't reimplement!


In [107]:
# https://wiki.postgresql.org/wiki/Aggregate_Median
if False:
    sql_cmd(
    '''
    CREATE OR REPLACE FUNCTION _final_median(NUMERIC[])
       RETURNS NUMERIC AS
    $$
       SELECT AVG(val)
       FROM (
         SELECT val
         FROM unnest($1) val
         ORDER BY 1
         LIMIT  2 - MOD(array_upper($1, 1), 2)
         OFFSET CEIL(array_upper($1, 1) / 2.0) - 1
       ) sub;
    $$
    LANGUAGE 'sql' IMMUTABLE;

    CREATE AGGREGATE median(NUMERIC) (
      SFUNC=array_append,
      STYPE=NUMERIC[],
      FINALFUNC=_final_median,
      INITCOND='{}'
    );
    '''
    )

Database loading stage (do NOT rerun after creating database!)

NOTE: fix products to not have \""'s before running this!


In [7]:
for f in ['aisles', 'departments', 'order_products__prior', 'order_products__train', 'orders', 'products']:
    fname = '../input/{0}.csv'.format(f)
    fd = open(fname, 'r')
    #dshape = odo.discover(odo.resource(fname)) 
    #print(dshape)
    t = odo.odo(fname, 'postgresql://postgres:insta@db/instacart811::{0}'.format(f))

In [13]:
cursor = conn.cursor()
cursor.execute('''
    ALTER TABLE "aisles" ADD PRIMARY KEY (aisle_id);
    ALTER TABLE "departments" ADD PRIMARY KEY (department_id);
    ALTER TABLE "orders" ADD PRIMARY KEY (order_id);
    ALTER TABLE "products" ADD PRIMARY KEY (product_id);
    -- The order_products__* tables have no primary key AFAIK
    --ALTER TABLE "order_products__prior" ADD PRIMARY KEY (order_id);
    --ALTER TABLE "order_products__train" ADD PRIMARY KEY (order_id);
''')

cursor.close()
conn.commit()

Now prepare FE tables, starting with time


In [16]:
cursor = conn.cursor()
cursor.execute('''
    DROP TABLE IF EXISTS orders_time;
    
    CREATE TABLE orders_time AS
        WITH orders_hdiff AS (
        
            SELECT *,
                   lag(order_hour_of_day) OVER (PARTITION BY user_id ORDER BY order_number) as __ohod_lag,
                   lag(order_dow) OVER (PARTITION BY user_id ORDER BY order_number) as __odow_lag
            FROM orders
            --WHERE user_id < 10
            
        ), orders_gapcount AS (
        
            SELECT *,
                   sum((COALESCE(orders.days_since_prior_order, 0) = 30)::int) OVER (PARTITION BY user_id ORDER BY order_number) AS order_month_count,
                   CASE WHEN days_since_prior_order = 30 THEN 0
                        WHEN days_since_prior_order is NULL THEN 0
                        ELSE days_since_prior_order + ((order_hour_of_day - __ohod_lag) / 24.0)
                    END as __dpo_corr_month,
                   -- Base is 38 to cover potential negative dow/time offsets.  It's a clipped number *anyway*
                   CASE WHEN days_since_prior_order = 30 THEN 38 + (order_dow - __odow_lag) + ((order_hour_of_day - __ohod_lag) / 24.0)
                        WHEN days_since_prior_order is NULL THEN 0
                        ELSE days_since_prior_order + ((order_hour_of_day - __ohod_lag) / 24.0)
                    END as __dpo_corr
            FROM orders_hdiff as orders
            
            )
        SELECT order_id,
                -- user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order,
                order_month_count,
                __dpo_corr as order_dpo_decimal,
                SUM(__dpo_corr_month) OVER (PARTITION BY user_id, order_month_count ORDER BY order_number) as order_elapsed_time_inmonth,
                SUM(__dpo_corr) OVER (PARTITION BY user_id ORDER BY order_number) as order_elapsed_time
        FROM orders_gapcount;    

''')

cursor.close()
conn.commit()

In [17]:
sql_cmd('''ALTER TABLE "orders_time" ADD PRIMARY KEY (order_id);''')

In [39]:
pd.read_sql_query('''select * from orders_time WHERE order_id < 10;''', conn)


Out[39]:
order_id order_month_count order_dpo_decimal order_elapsed_time_inmonth order_elapsed_time
0 8 1 16.625000 23.583333 61.375000
1 6 2 37.958333 0.000000 77.916667
2 1 0 8.541667 30.833333 30.833333
3 9 0 5.500000 110.375000 110.375000
4 7 3 40.208333 0.000000 170.708333
5 5 0 9.083333 289.166667 289.166667
6 4 0 7.041667 224.000000 224.000000
7 2 0 7.833333 28.000000 28.000000
8 3 1 12.125000 101.125000 166.041667

build a table with the user_id's last order (train or eval)


In [233]:
cmd = '''
SELECT orders.*, 
       orders.order_number as user_last_order,      
       orders_time.order_elapsed_time as user_last_order_time
FROM orders
INNER JOIN orders_time ON orders_time.order_id = orders.order_id
WHERE orders.eval_set != 'prior' -- and orders.user_id < 10
--GROUP BY orders.user_id
'''

#pd_cmd(cmd)
sql_mktable(cmd, 'user_last_order', 'user_id')

In [7]:
pd_cmd('''SELECT * from user_last_order where user_id < 10''')


Out[7]:
order_id user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order user_last_order user_last_order_time
0 2774568 3 test 13 5 15 11.0 13 144.041667
1 1492625 2 train 15 1 11 30.0 15 253.000000
2 2196797 5 train 5 0 11 6.0 5 45.958333
3 1528013 6 test 4 3 16 22.0 4 39.916667
4 525192 7 train 21 2 11 6.0 21 222.083333
5 1094988 9 train 4 6 10 30.0 4 87.708333
6 329954 4 test 6 3 12 30.0 6 91.041667
7 1187899 1 train 11 4 8 14.0 11 201.000000
8 880375 8 train 4 1 14 10.0 4 85.583333

In [114]:
cmd = '''

SELECT order_id,
       MAX(order_products__prior.add_to_cart_order) as order_cart_size
       FROM order_products__prior
       GROUP BY order_id
'''

#pd_cmd(cmd)
sql_mktable(cmd, 'order_cart_size', "order_id")

In [209]:
cmd = '''
DROP VIEW IF EXISTS opp_rel;
CREATE OR REPLACE VIEW opp_rel AS
    SELECT order_products__prior.*,
           order_products__prior.add_to_cart_order /  ocs.order_cart_size::float as add_to_cart_order_rel,
           ocs.order_cart_size - order_products__prior.add_to_cart_order as add_to_cart_order_inverted,
           (ocs.order_cart_size - order_products__prior.add_to_cart_order) /  ocs.order_cart_size::float as add_to_cart_order_inverted_rel
           FROM order_products__prior
           INNER JOIN order_cart_size AS ocs ON ocs.order_id = order_products__prior.order_id
'''

sql_cmd(cmd)

This complex table produces information on user/product features including time and elapsed # of orders. This takes ~4-5 mins to run on a 3ghz Nehalem w/SSD


In [46]:
cmd = '''
WITH user_product_l0 as (
    SELECT orders.user_id, opp.product_id,
           COUNT(*) as up_count,
           AVG(opp.add_to_cart_order_rel) as up_add_to_cart_order_mean,
           AVG(opp.add_to_cart_order_inverted_rel) as up_add_to_cart_order_inverted_mean,
           ARRAY_AGG((ulo.user_last_order_time - order_elapsed_time) ORDER BY orders.order_number DESC) as up_order_times_last,
           ARRAY_AGG(order_elapsed_time ORDER BY orders.order_number DESC) as up_order_times,
           ARRAY_AGG(order_elapsed_time ORDER BY orders.order_number) as up_order_times_asc,
           ARRAY_AGG(orders.order_number::int ORDER BY orders.order_number DESC) as up_order_numbers
    FROM orders
    INNER JOIN orders_time ON orders_time.order_id = orders.order_id
    INNER JOIN order_cart_size ON order_cart_size.order_id = orders.order_id
    INNER JOIN opp_rel as opp on opp.order_id = orders.order_id
    INNER JOIN user_last_order as ulo on ulo.user_id = orders.user_id
    -- WHERE orders.user_id = 1
    GROUP BY orders.user_id, opp.product_id)

SELECT 
    CASE WHEN up_count >= 2 THEN (up_order_times[1] - up_order_times_asc[1]) / (up_count - 1)
        ELSE 0
        END AS up_reorder_rate,
    up_count - 1 AS up_reorder_count,
    *

FROM user_product_l0
'''

sql_mktable(cmd, 'user_product_l1', "user_id, product_id")
#x = pd_cmd(cmd)
#x.head()

#sql_cmd('EXPLAIN ' + cmd)
#pd_cmd(cmd)

In [47]:
pd_cmd('''SELECT * FROM user_product_l1 WHERE user_id = 1''')


Out[47]:
up_reorder_rate up_reorder_count user_id product_id up_count up_add_to_cart_order_mean up_add_to_cart_order_inverted_mean up_order_times_last up_order_times up_order_times_asc up_order_numbers
0 20.777778 9 1 196 10 0.245278 0.754722 [14.0, 54.6666666666667, 54.75, 68.95833333333... [187.0, 146.333333333333, 146.25, 132.04166666... [0.0, 14.9583333333333, 36.1666666666667, 64.9... [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
1 21.505208 8 1 10258 9 0.562037 0.437963 [14.0, 54.6666666666667, 54.75, 68.95833333333... [187.0, 146.333333333333, 146.25, 132.04166666... [14.9583333333333, 36.1666666666667, 64.958333... [10, 9, 8, 7, 6, 5, 4, 3, 2]
2 0.000000 0 1 10326 1 0.625000 0.375000 [107.708333333333] [93.2916666666667] [93.2916666666667] [5]
3 20.777778 9 1 12427 10 0.541667 0.458333 [14.0, 54.6666666666667, 54.75, 68.95833333333... [187.0, 146.333333333333, 146.25, 132.04166666... [0.0, 14.9583333333333, 36.1666666666667, 64.9... [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
4 86.020833 2 1 13032 3 0.962963 0.037037 [14.0, 68.9583333333333, 186.041666666667] [187.0, 132.041666666667, 14.9583333333333] [14.9583333333333, 132.041666666667, 187.0] [10, 7, 2]
5 78.333333 1 1 13176 2 0.833333 0.166667 [107.708333333333, 186.041666666667] [93.2916666666667, 14.9583333333333] [14.9583333333333, 93.2916666666667] [5, 2]
6 0.000000 0 1 14084 1 0.400000 0.600000 [201.0] [0.0] [0.0] [1]
7 0.000000 0 1 17122 1 0.750000 0.250000 [107.708333333333] [93.2916666666667] [93.2916666666667] [5]
8 21.547619 7 1 25133 8 0.702778 0.297222 [14.0, 54.6666666666667, 54.75, 68.95833333333... [187.0, 146.333333333333, 146.25, 132.04166666... [36.1666666666667, 64.9583333333333, 93.291666... [10, 9, 8, 7, 6, 5, 4, 3]
9 14.958333 1 1 26088 2 0.816667 0.183333 [186.041666666667, 201.0] [14.9583333333333, 0.0] [0.0, 14.9583333333333] [2, 1]
10 64.958333 1 1 26405 2 1.000000 0.000000 [136.041666666667, 201.0] [64.9583333333333, 0.0] [0.0, 64.9583333333333] [4, 1]
11 0.000000 0 1 30450 1 1.000000 0.000000 [164.833333333333] [36.1666666666667] [36.1666666666667] [3]
12 0.000000 0 1 35951 1 0.777778 0.222222 [14.0] [187.0] [187.0] [10]
13 0.000000 0 1 38928 1 0.444444 0.555556 [14.0] [187.0] [187.0] [10]
14 0.000000 0 1 39657 1 0.333333 0.666667 [14.0] [187.0] [187.0] [10]
15 0.000000 0 1 41787 1 0.875000 0.125000 [107.708333333333] [93.2916666666667] [93.2916666666667] [5]
16 20.375000 2 1 46149 3 0.462963 0.537037 [14.0, 54.6666666666667, 54.75] [187.0, 146.333333333333, 146.25] [146.25, 146.333333333333, 187.0] [10, 9, 8]
17 0.083333 1 1 49235 2 0.583333 0.416667 [54.6666666666667, 54.75] [146.333333333333, 146.25] [146.25, 146.333333333333] [9, 8]

This table creates a set of user/product ID's reordered in the training set


In [59]:
cmd = '''
    SELECT user_id, product_id, reordered
    FROM orders
    LEFT JOIN order_products__train ON order_products__train.order_id = orders.order_id
    WHERE orders.eval_set = 'train' and order_products__train.reordered = 1 -- and user_id = 1
'''

sql_mktable(cmd, 'train_reordered', "user_id, product_id")

Now for the output queries - these go directly into Pandas DataFrames and pickled to disk


In [51]:
cmd = '''
select up.user_id, up.product_id,
--       ulo.eval_set,
--       ulo.order_id,
       CASE WHEN up_reorder_rate > 0 THEN up_reorder_rate ELSE -1 END as up_reorder_rate,
       up_count,
       GREATEST(reordered, 0) as reordered,
       ulo.user_last_order as user_max_order,
       ulo.user_last_order_time - up_order_times_last[1] as up_time_first,
       ulo.user_last_order_time - up_order_times[1] as up_time_last,
       (ulo.user_last_order_time - up_order_times[1]) / GREATEST(ulo.user_last_order_time, 1) as up_time_last_ratio,
       ulo.user_last_order_time - up_order_times[2] as up_time_prev1,
       ulo.user_last_order_time - up_order_times[3] as up_time_prev2,
       ulo.user_last_order - up_order_numbers[1] as up_ordergap_last,
       up_order_numbers[1] as up_last_order,
       up_order_numbers[array_upper(up_order_numbers, 1)] as up_first_order,
       up_count / GREATEST((ulo.user_last_order - 1), 1)::float as up_order_rate,
       up.up_add_to_cart_order_mean, up.up_add_to_cart_order_inverted_mean
from user_product_l1 as up
inner join user_last_order as ulo on ulo.user_id = up.user_id
left outer join train_reordered as tro on tro.user_id = up.user_id and tro.product_id = up.product_id
--where up.user_id < 50000
--where up.user_id = 1
'''
# data['up_order_rate'] = data.up_order_count / (data.user_num_orders - 1)

df_up1 = pd_cmd(cmd)
df_up1.head(20)


Out[51]:
user_id product_id up_reorder_rate up_count reordered user_max_order up_time_first up_time_last up_time_last_ratio up_time_prev1 up_time_prev2 up_ordergap_last up_last_order up_first_order up_order_rate up_add_to_cart_order_mean up_add_to_cart_order_inverted_mean
0 1 196 20.777778 10 1 11 187.000000 14.000000 0.069652 54.666667 54.750000 1 10 1 1.000000 0.245278 0.754722
1 1 10258 21.505208 9 1 11 187.000000 14.000000 0.069652 54.666667 54.750000 1 10 2 0.900000 0.562037 0.437963
2 1 10326 -1.000000 1 0 11 93.291667 107.708333 0.535862 NaN NaN 6 5 5 0.100000 0.625000 0.375000
3 1 12427 20.777778 10 0 11 187.000000 14.000000 0.069652 54.666667 54.750000 1 10 1 1.000000 0.541667 0.458333
4 1 13032 86.020833 3 1 11 187.000000 14.000000 0.069652 68.958333 186.041667 1 10 2 0.300000 0.962963 0.037037
5 1 13176 78.333333 2 0 11 93.291667 107.708333 0.535862 186.041667 NaN 6 5 2 0.200000 0.833333 0.166667
6 1 14084 -1.000000 1 0 11 0.000000 201.000000 1.000000 NaN NaN 10 1 1 0.100000 0.400000 0.600000
7 1 17122 -1.000000 1 0 11 93.291667 107.708333 0.535862 NaN NaN 6 5 5 0.100000 0.750000 0.250000
8 1 25133 21.547619 8 1 11 187.000000 14.000000 0.069652 54.666667 54.750000 1 10 3 0.800000 0.702778 0.297222
9 1 26088 14.958333 2 1 11 14.958333 186.041667 0.925580 201.000000 NaN 9 2 1 0.200000 0.816667 0.183333
10 1 26405 64.958333 2 1 11 64.958333 136.041667 0.676824 201.000000 NaN 7 4 1 0.200000 1.000000 0.000000
11 1 30450 -1.000000 1 0 11 36.166667 164.833333 0.820066 NaN NaN 8 3 3 0.100000 1.000000 0.000000
12 1 35951 -1.000000 1 0 11 187.000000 14.000000 0.069652 NaN NaN 1 10 10 0.100000 0.777778 0.222222
13 1 38928 -1.000000 1 1 11 187.000000 14.000000 0.069652 NaN NaN 1 10 10 0.100000 0.444444 0.555556
14 1 39657 -1.000000 1 1 11 187.000000 14.000000 0.069652 NaN NaN 1 10 10 0.100000 0.333333 0.666667
15 1 41787 -1.000000 1 0 11 93.291667 107.708333 0.535862 NaN NaN 6 5 5 0.100000 0.875000 0.125000
16 1 46149 20.375000 3 1 11 187.000000 14.000000 0.069652 54.666667 54.750000 1 10 8 0.300000 0.462963 0.537037
17 1 49235 0.083333 2 1 11 146.333333 54.666667 0.271973 54.750000 NaN 2 9 8 0.200000 0.583333 0.416667
18 2 23 -1.000000 1 0 15 83.166667 169.833333 0.671278 NaN NaN 7 8 8 0.071429 0.750000 0.250000
19 2 79 -1.000000 1 0 15 204.000000 49.000000 0.193676 NaN NaN 2 13 13 0.071429 0.333333 0.666667

In [52]:
df_up1.to_pickle('testdata.pkl')

In [53]:
cmd = '''
select ulo.user_id,
       ulo.eval_set,
       ulo.order_id,
       ulo.days_since_prior_order,
       ulo.order_dow,
       ulo.order_hour_of_day,
       --ulo.order_dpo_decimal,
       ulo.days_since_prior_order / GREATEST(user_last_order_time, 1) as u_dpo_ratio,
       AVG((up.up_count > 1)::int) / GREATEST(SUM((up.up_count >= 1)::int), 1) as u_reordered_product_ratio,
       SUM((up.up_count >= 1)::int) as u_num_products
from user_last_order as ulo 
inner join user_product_l1 as up on up.user_id = ulo.user_id
--where ulo.eval_set = 'train' and ulo.user_id < 50000
--where ulo.eval_set = 'train' and ulo.user_id < 5
group by ulo.user_id
'''

df_u1 = pd_cmd(cmd)
df_u1.head(20)


Out[53]:
user_id eval_set order_id days_since_prior_order order_dow order_hour_of_day u_dpo_ratio u_reordered_product_ratio u_num_products
0 1 train 1187899 14.0 4 8 0.069652 0.030864 18
1 2 train 1492625 30.0 1 11 0.118577 0.003556 102

In [243]:
df_u1.to_pickle('testdata_user.pkl')

In [61]:
cmd = '''
select p.product_id,
       SUM(up_reorder_rate) / GREATEST(SUM((up_reorder_count > 0)::int), 1)::float as p_reorder_rate,
       AVG(up_count / (ulo.user_last_order - 1)) as p_order_rate,
       AVG(up_add_to_cart_order_mean) as p_add_to_cart_order_mean,
       SUM((up_count > 1)::int) / GREATEST(SUM((up_count >= 1)::int), 1)::float as p_reordered_product_ratio,
       SUM((up_count > 0)::int) as p_users,
       p.aisle_id, p.department_id
from user_product_l1 as up
inner join user_last_order as ulo on ulo.user_id = up.user_id
inner join products as p on p.product_id = up.product_id
left outer join train_reordered as tro on tro.user_id = up.user_id and tro.product_id = up.product_id
--where up.user_id < 500
group by p.product_id
'''
# data['up_order_rate'] = data.up_order_count / (data.user_num_orders - 1)

df_p1 = pd_cmd(cmd)
df_p1.head(20)


Out[61]:
product_id p_reorder_rate p_order_rate p_add_to_cart_order_mean p_reordered_product_ratio p_users aisle_id department_id
0 2848 50.572917 0.000000 0.631400 0.307692 13 90 7
1 36761 40.416667 0.000000 0.643354 0.071429 28 81 15
2 2026 45.872431 0.004484 0.610207 0.385650 223 125 19
3 10295 57.708333 0.000000 0.482497 0.166667 6 126 11
4 11890 0.000000 0.000000 0.675397 0.000000 6 22 11
5 46770 42.110721 0.006329 0.607975 0.544304 158 51 13
6 22262 55.481303 0.000000 0.737971 0.240741 54 105 13
7 43007 18.780093 0.000000 0.589927 0.500000 12 49 12
8 39118 8.811632 0.000000 0.709678 0.166667 12 28 5
9 39665 0.041667 0.000000 0.473901 0.500000 2 25 11
10 6122 36.881944 0.000000 0.762810 0.183333 60 69 15
11 49059 7.062500 0.000000 0.787212 0.250000 4 125 19
12 40448 60.912202 0.000000 0.610447 0.312500 112 29 13
13 12267 0.000000 0.000000 0.666998 0.000000 7 45 19
14 48394 30.369118 0.000000 0.732032 0.333333 15 54 17
15 28365 12.076389 0.000000 0.659852 0.285714 7 78 19
16 23772 44.083333 0.000000 0.571281 0.111111 9 116 1
17 8533 57.631944 0.000000 0.642477 0.666667 3 64 7
18 12377 56.870226 0.022222 0.608343 0.333333 45 98 7
19 9243 61.008642 0.000000 0.629577 0.244389 401 16 4

In [62]:
df_p1.to_pickle('testdata_prod.pkl')

In [238]:
cmd = '''
select p.aisle_id,
       AVG(up_count / (ulo.user_last_order - 1)) as a_order_rate,
       SUM((up_count > 1)::int) / SUM((up_count >= 1)::int)::float as a_reordered_product_ratio,
       SUM((up_count > 0)::int) as a_up_count
from user_product_l1 as up
inner join user_last_order as ulo on ulo.user_id = up.user_id
inner join products as p on p.product_id = up.product_id
left outer join train_reordered as tro on tro.user_id = up.user_id and tro.product_id = up.product_id
--where ulo.eval_set = 'train' and up.user_id < 50000
group by p.aisle_id
'''
# data['up_order_rate'] = data.up_order_count / (data.user_num_orders - 1)

df_a1 = pd_cmd(cmd)

df_a1.to_pickle('testdata_aisle.pkl')
df_a1.head(20)


Out[238]:
aisle_id a_order_rate a_reordered_product_ratio a_up_count
0 129 0.002903 0.366212 92318
1 106 0.002949 0.402499 135650
2 120 0.009356 0.493114 455325
3 8 0.003779 0.307340 17466
4 80 0.000177 0.183266 11306
5 16 0.001583 0.380415 182511
6 54 0.002241 0.379718 114693
7 47 0.000419 0.211295 31023
8 103 0.000365 0.177423 8212
9 115 0.019160 0.535125 227556
10 110 0.001319 0.302680 62191
11 99 0.003333 0.380328 38104
12 46 0.004201 0.365010 9997
13 48 0.005175 0.417448 29757
14 83 0.003477 0.447998 1385849
15 28 0.007433 0.370181 15876
16 36 0.002031 0.429786 107821
17 94 0.004160 0.327571 117785
18 15 0.003200 0.358646 10311
19 77 0.011841 0.430059 129131

In [239]:
cmd = '''
select p.department_id,
       SUM((up_count > 1)::int) / SUM((up_count >= 1)::int)::float as d_reordered_product_ratio,
       AVG(up_count / (ulo.user_last_order - 1)) as d_order_rate,
       SUM((up_count > 0)::int) as d_up_count
from user_product_l1 as up
inner join user_last_order as ulo on ulo.user_id = up.user_id
inner join products as p on p.product_id = up.product_id
left outer join train_reordered as tro on tro.user_id = up.user_id and tro.product_id = up.product_id
--where ulo.eval_set = 'train' and up.user_id < 50000
group by p.department_id
'''
# data['up_order_rate'] = data.up_order_count / (data.user_num_orders - 1)

df_d1 = pd_cmd(cmd)

df_d1.to_pickle('testdata_dept.pkl')
df_d1.head(20)


Out[239]:
department_id d_reordered_product_ratio d_order_rate d_up_count
0 14 0.393509 0.003431 311556
1 17 0.274462 0.001010 441591
2 12 0.411940 0.003664 306489
3 8 0.421312 0.005518 38964
4 15 0.325318 0.001757 579523
5 1 0.375686 0.004460 1024542
6 10 0.368392 0.003624 14623
7 11 0.217988 0.000652 303539
8 4 0.481560 0.006361 3318581
9 18 0.406287 0.002029 178433
10 16 0.472298 0.007928 1786795
11 6 0.243816 0.001001 169837
12 19 0.393108 0.004256 1229577
13 21 0.287069 0.003375 41774
14 2 0.243984 0.001815 21485
15 3 0.437396 0.005793 437599
16 20 0.428173 0.005633 412385
17 13 0.230859 0.000635 1225276
18 5 0.367453 0.008714 66101
19 9 0.328629 0.001509 467046

In [57]:
cmd = '''

WITH uo0 AS 
(    
    SELECT orders.user_id, orders.order_id, orders.order_number, 
           SUM(reordered) as o_reordered_count,
           COUNT(*) as o_cart_size
    FROM orders
    INNER JOIN order_products__prior as opp on orders.order_id = opp.order_id
    --WHERE user_id <= 5
    GROUP by orders.order_id
), uo1 AS (
    SELECT user_id, 
           SUM((o_reordered_count = 0)::int) uo_nonecount, 
           ARRAY_AGG(o_reordered_count ORDER BY order_number DESC) as uo_agg_reordered_count, 
           COUNT(*) as u_order_count,
           AVG(o_cart_size) AS u_avg_cart_size
    FROM uo0
    GROUP BY user_id
)
SELECT user_id, uo_nonecount, u_order_count, u_avg_cart_size, (uo_agg_reordered_count[1] = 0)::int as uo_prevnone
FROM uo1
'''

df_uo = pd_cmd(cmd)

df_uo.set_index('user_id').to_pickle('testdata-uo.pkl')
df_uo.head()


Out[57]:
user_id uo_nonecount u_order_count u_avg_cart_size uo_prevnone
0 1 1 10 5.900000 0
1 2 2 14 13.928571 0
2 3 1 12 7.333333 0
3 4 4 5 3.600000 1
4 5 1 4 9.250000 0

The last bit of feature engineering I did dealt with order sizes and reorder rates, to match the last/prev[12] features in Sh1ng's kernel.

I found that preparing average # of reorders per order first made prepping the output for this a lot easier!


In [127]:
cmd = '''
SELECT orders.order_id, 
       AVG(reordered) as reorder_avg,
       COUNT(*) as order_size
FROM orders
INNER JOIN order_products__prior as opp on orders.order_id = opp.order_id
-- WHERE user_id <= 2
GROUP by orders.order_id
'''

#pd_cmd(cmd)
sql_mktable(cmd, 'order_reordered_avg', 'order_id')

In [193]:
pd_cmd('''select * from order_reordered_avg limit 5;''')


Out[193]:
order_id reorder_avg order_size
0 1987167 0.800000 5
1 910309 0.636364 22
2 2291575 0.583333 12
3 2400369 0.750000 40
4 164444 0.666667 3

In [194]:
cmd = '''
WITH reordered_agg AS (
    SELECT user_id,
           -- ARRAY_AGG(orders.order_id ORDER BY order_number DESC) as order_id,
           ARRAY_AGG(reorder_avg ORDER BY order_number DESC) as reorder_agg,
           ARRAY_AGG(order_size ORDER BY order_number DESC) as order_size_agg
    FROM orders
    INNER JOIN order_reordered_avg ON order_reordered_avg.order_id = orders.order_id
    WHERE orders.eval_set = 'prior' -- and user_id < 5
    GROUP BY user_id
)
SELECT user_id, 
       reorder_agg[1] as u_reordered_prev1, 
       reorder_agg[2] as u_reordered_prev2, 
       reorder_agg[3] as u_reordered_prev3,
       order_size_agg[1] as u_ordersize_prev1, 
       order_size_agg[2] as u_ordersize_prev2, 
       order_size_agg[3] as u_ordersize_prev3 
FROM reordered_agg
'''
#INNER JOIN orders_time ON orders_time.order_id = orders.order_id

df_reorderedpast = pd_cmd(cmd)

In [196]:
df_reorderedpast.to_pickle('testdata-user_reordered.pkl')

In [195]:
df_reorderedpast.head()


Out[195]:
user_id u_reordered_prev1 u_reordered_prev2 u_reordered_prev3 u_ordersize_prev1 u_ordersize_prev2 u_ordersize_prev3
0 1 0.666667 1.0 0.666667 9 6 6
1 2 0.625000 0.0 0.578947 16 9 19
2 3 1.000000 1.0 0.833333 6 5 6
3 4 0.000000 0.0 0.142857 3 2 7
4 5 0.666667 0.4 0.444444 12 5 9

order_departments (instead of products) handling.

(largely a replication of above - was hoping these new features would give a bigger lift than they did!)


In [77]:
cmd = '''

WITH opc AS (
    SELECT * FROM order_products__prior
    UNION
    SELECT * FROM order_products__train
), opd0 AS (
    SELECT opc.order_id, opc.add_to_cart_order, opc.reordered, products.department_id
    FROM opc
    INNER JOIN products ON products.product_id = opc.product_id
--    WHERE opc.order_id < 5
) 
SELECT
    order_id, 
    department_id,
    MIN(add_to_cart_order) as add_to_cart_order, 
    COUNT(*) as products,
    SUM(reordered)::int as reordered_count,
    (SUM(reordered) >= 1)::int as reordered
FROM opd0
GROUP BY order_id, department_id
'''

#pd_cmd(cmd)
# sql_mktable(cmd, 'order_departments')

In [79]:
cmd = '''

WITH opd0 AS (
    SELECT opc.order_id, opc.add_to_cart_order, opc.reordered, products.department_id
    FROM order_products__prior as opc
    INNER JOIN products ON products.product_id = opc.product_id
--    WHERE opc.order_id < 5
) 
SELECT
    order_id, 
    department_id,
    MIN(add_to_cart_order) as add_to_cart_order, 
    COUNT(*) as d_product_count,
    SUM(reordered)::int as reordered_count,
    (SUM(reordered) >= 1)::int as reordered
FROM opd0
GROUP BY order_id, department_id
'''

#pd_cmd(cmd)
sql_mktable(cmd, 'order_departments__prior')

In [88]:
cmd = '''

WITH opd0 AS (
    SELECT opc.order_id, opc.add_to_cart_order, opc.reordered, products.department_id
    FROM order_products__train as opc
    INNER JOIN products ON products.product_id = opc.product_id
--    WHERE opc.order_id < 5
) 
SELECT
    order_id, 
    department_id,
    MIN(add_to_cart_order) as add_to_cart_order, 
    COUNT(*) as d_product_count,
    SUM(reordered)::int as reordered_count,
    (SUM(reordered) >= 1)::int as reordered
FROM opd0
GROUP BY order_id, department_id
'''

#pd_cmd(cmd)
sql_mktable(cmd, 'order_departments__train')

In [83]:
cmd = '''
DROP VIEW IF EXISTS odp_rel;
CREATE OR REPLACE VIEW odp_rel AS
    SELECT order_departments__prior.*,
           order_departments__prior.add_to_cart_order /  ocs.order_cart_size::float as add_to_cart_order_rel,
           ocs.order_cart_size - order_departments__prior.add_to_cart_order as add_to_cart_order_inverted,
           (ocs.order_cart_size - order_departments__prior.add_to_cart_order) /  ocs.order_cart_size::float as add_to_cart_order_inverted_rel
           FROM order_departments__prior
           INNER JOIN order_cart_size AS ocs ON ocs.order_id = order_departments__prior.order_id
'''

sql_cmd(cmd)
#sql_mktable(cmd, 'order_cart_size', "order_id")

In [116]:
cmd = '''
WITH user_department_l0 as (
    SELECT orders.user_id, opp.department_id,
           COUNT(*) as ud_count,
           AVG(opp.add_to_cart_order_rel) as ud_add_to_cart_order_mean,
           AVG(opp.add_to_cart_order_inverted_rel) as ud_add_to_cart_order_inverted_mean,
           ARRAY_AGG((ulo.user_last_order_time - order_elapsed_time) ORDER BY orders.order_number DESC) as ud_order_times_last,
           ARRAY_AGG(order_elapsed_time ORDER BY orders.order_number DESC) as ud_order_times,
           ARRAY_AGG(order_elapsed_time ORDER BY orders.order_number) as ud_order_times_asc,
           ARRAY_AGG(orders.order_number::int ORDER BY orders.order_number DESC) as ud_order_numbers
    FROM orders
    INNER JOIN orders_time ON orders_time.order_id = orders.order_id
    INNER JOIN order_cart_size ON order_cart_size.order_id = orders.order_id
    INNER JOIN odp_rel as opp on opp.order_id = orders.order_id
    INNER JOIN user_last_order as ulo on ulo.user_id = orders.user_id
--    WHERE orders.user_id = 1
    GROUP BY orders.user_id, opp.department_id)

SELECT 
    CASE WHEN ud_count >= 2 THEN (ud_order_times[1] - ud_order_times_asc[1]) / (ud_count - 1)
        ELSE 0
        END AS ud_reorder_rate,
    ud_count - 1 AS ud_reorder_count,
    *
FROM user_department_l0
'''

sql_mktable(cmd, 'user_department_l1', "user_id, department_id")
#x = pd_cmd(cmd)
#x.head()

#sql_cmd('EXPLAIN ' + cmd)
#pd_cmd(cmd)

In [117]:
pd_cmd('''SELECT * FROM user_product_l1 WHERE user_id = 1''')


Out[117]:
up_reorder_rate up_reorder_count user_id product_id up_count up_add_to_cart_order_mean up_add_to_cart_order_inverted_mean up_order_times_last up_order_times up_order_times_asc up_order_numbers
0 20.777778 9 1 196 10 0.245278 0.754722 [14.0, 54.6666666666667, 54.75, 68.95833333333... [187.0, 146.333333333333, 146.25, 132.04166666... [0.0, 14.9583333333333, 36.1666666666667, 64.9... [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
1 21.505208 8 1 10258 9 0.562037 0.437963 [14.0, 54.6666666666667, 54.75, 68.95833333333... [187.0, 146.333333333333, 146.25, 132.04166666... [14.9583333333333, 36.1666666666667, 64.958333... [10, 9, 8, 7, 6, 5, 4, 3, 2]
2 0.000000 0 1 10326 1 0.625000 0.375000 [107.708333333333] [93.2916666666667] [93.2916666666667] [5]
3 20.777778 9 1 12427 10 0.541667 0.458333 [14.0, 54.6666666666667, 54.75, 68.95833333333... [187.0, 146.333333333333, 146.25, 132.04166666... [0.0, 14.9583333333333, 36.1666666666667, 64.9... [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
4 86.020833 2 1 13032 3 0.962963 0.037037 [14.0, 68.9583333333333, 186.041666666667] [187.0, 132.041666666667, 14.9583333333333] [14.9583333333333, 132.041666666667, 187.0] [10, 7, 2]
5 78.333333 1 1 13176 2 0.833333 0.166667 [107.708333333333, 186.041666666667] [93.2916666666667, 14.9583333333333] [14.9583333333333, 93.2916666666667] [5, 2]
6 0.000000 0 1 14084 1 0.400000 0.600000 [201.0] [0.0] [0.0] [1]
7 0.000000 0 1 17122 1 0.750000 0.250000 [107.708333333333] [93.2916666666667] [93.2916666666667] [5]
8 21.547619 7 1 25133 8 0.702778 0.297222 [14.0, 54.6666666666667, 54.75, 68.95833333333... [187.0, 146.333333333333, 146.25, 132.04166666... [36.1666666666667, 64.9583333333333, 93.291666... [10, 9, 8, 7, 6, 5, 4, 3]
9 14.958333 1 1 26088 2 0.816667 0.183333 [186.041666666667, 201.0] [14.9583333333333, 0.0] [0.0, 14.9583333333333] [2, 1]
10 64.958333 1 1 26405 2 1.000000 0.000000 [136.041666666667, 201.0] [64.9583333333333, 0.0] [0.0, 64.9583333333333] [4, 1]
11 0.000000 0 1 30450 1 1.000000 0.000000 [164.833333333333] [36.1666666666667] [36.1666666666667] [3]
12 0.000000 0 1 35951 1 0.777778 0.222222 [14.0] [187.0] [187.0] [10]
13 0.000000 0 1 38928 1 0.444444 0.555556 [14.0] [187.0] [187.0] [10]
14 0.000000 0 1 39657 1 0.333333 0.666667 [14.0] [187.0] [187.0] [10]
15 0.000000 0 1 41787 1 0.875000 0.125000 [107.708333333333] [93.2916666666667] [93.2916666666667] [5]
16 20.375000 2 1 46149 3 0.462963 0.537037 [14.0, 54.6666666666667, 54.75] [187.0, 146.333333333333, 146.25] [146.25, 146.333333333333, 187.0] [10, 9, 8]
17 0.083333 1 1 49235 2 0.583333 0.416667 [54.6666666666667, 54.75] [146.333333333333, 146.25] [146.25, 146.333333333333] [9, 8]

In [89]:
cmd = '''
    SELECT user_id, department_id, reordered
    FROM orders
    LEFT JOIN order_departments__train ON order_departments__train.order_id = orders.order_id
    WHERE orders.eval_set = 'train' and order_departments__train.reordered = 1 -- and user_id = 1
'''

sql_mktable(cmd, 'train_departments_reordered', "user_id, department_id")

In [125]:
cmd = '''
select ud.user_id, ud.department_id,
       CASE WHEN ud_reorder_rate > 0 THEN ud_reorder_rate ELSE -1 END as ud_reorder_rate,
       ud_count,
       GREATEST(reordered, 0) as ud_reordered,
       -- ulo.user_last_order as user_max_order,
       ulo.user_last_order_time - ud_order_times_last[1] as ud_time_first,
       ulo.user_last_order_time - ud_order_times[1] as ud_time_last,
       (ulo.user_last_order_time - ud_order_times[1]) / GREATEST(ulo.user_last_order_time, 1) as ud_time_last_ratio,
       ulo.user_last_order_time - ud_order_times[2] as ud_time_prev1,
       ulo.user_last_order_time - ud_order_times[3] as ud_time_prev2,
       ulo.user_last_order - ud_order_numbers[1] as ud_ordergap_last,
       ud_order_numbers[1] as ud_last_order,
       ud_order_numbers[array_upper(ud_order_numbers, 1)] as ud_first_order,
       ud_count / GREATEST((ulo.user_last_order - 1), 1)::float as ud_order_rate,
       ud.ud_add_to_cart_order_mean, ud.ud_add_to_cart_order_inverted_mean
from user_department_l1 as ud
inner join user_last_order as ulo on ulo.user_id = ud.user_id
left outer join train_departments_reordered as tro on tro.user_id = ud.user_id and tro.department_id = ud.department_id
--where up.user_id < 50000
--where ud.user_id = 1
'''
# data['up_order_rate'] = data.up_order_count / (data.user_num_orders - 1)

df_udl1 = pd_cmd(cmd)
df_udl1.head(20)


Out[125]:
user_id department_id ud_reorder_rate ud_count ud_reordered ud_time_first ud_time_last ud_time_last_ratio ud_time_prev1 ud_time_prev2 ud_ordergap_last ud_last_order ud_first_order ud_order_rate ud_add_to_cart_order_mean ud_add_to_cart_order_inverted_mean
0 1 4 78.333333 2 0 93.291667 107.708333 0.535862 186.041667 NaN 6 5 2 0.200000 0.645833 0.354167
1 1 7 20.777778 10 1 187.000000 14.000000 0.069652 54.666667 54.750000 1 10 1 1.000000 0.211944 0.788056
2 1 13 -1.000000 1 0 36.166667 164.833333 0.820066 NaN NaN 8 3 3 0.100000 1.000000 0.000000
3 1 14 86.020833 3 1 187.000000 14.000000 0.069652 68.958333 186.041667 1 10 2 0.300000 0.962963 0.037037
4 1 16 23.375000 9 1 187.000000 14.000000 0.069652 54.666667 54.750000 1 10 1 0.900000 0.619753 0.380247
5 1 17 64.958333 2 1 64.958333 136.041667 0.676824 201.000000 NaN 7 4 1 0.200000 1.000000 0.000000
6 1 19 20.777778 10 1 187.000000 14.000000 0.069652 54.666667 54.750000 1 10 1 1.000000 0.421667 0.578333
7 2 1 32.666667 7 1 216.958333 36.041667 0.142457 118.041667 156.000000 1 14 4 0.500000 0.696934 0.303066
8 2 3 41.083333 2 0 204.000000 49.000000 0.193676 90.083333 NaN 2 13 12 0.142857 1.000000 0.000000
9 2 4 16.689103 14 1 216.958333 36.041667 0.142457 49.000000 90.083333 1 14 1 1.000000 0.357366 0.642634
10 2 6 59.979167 3 0 216.958333 36.041667 0.142457 118.041667 156.000000 1 14 10 0.214286 0.627778 0.372222
11 2 7 43.391667 6 0 216.958333 36.041667 0.142457 90.083333 169.833333 1 14 1 0.428571 0.496500 0.503500
12 2 11 -1.000000 1 0 29.000000 224.000000 0.885375 NaN NaN 10 5 5 0.071429 0.692308 0.307692
13 2 12 -1.000000 1 0 83.166667 169.833333 0.671278 NaN NaN 7 8 8 0.071429 0.750000 0.250000
14 2 13 27.119792 9 0 216.958333 36.041667 0.142457 49.000000 90.083333 1 14 1 0.642857 0.462231 0.537769
15 2 14 -1.000000 1 0 29.000000 224.000000 0.885375 NaN NaN 10 5 5 0.071429 0.461538 0.538462
16 2 15 -1.000000 1 0 204.000000 49.000000 0.193676 NaN NaN 2 13 13 0.071429 0.333333 0.666667
17 2 16 20.700000 11 0 216.958333 36.041667 0.142457 90.083333 118.041667 1 14 2 0.785714 0.382892 0.617108
18 2 19 18.079861 13 1 216.958333 36.041667 0.142457 49.000000 90.083333 1 14 1 0.928571 0.151627 0.848373
19 2 20 22.666667 10 1 204.000000 49.000000 0.193676 90.083333 118.041667 2 13 1 0.714286 0.474289 0.525711

In [126]:
# merge w/data on user,dept
df_udl1.to_pickle('testdata_ud.pkl')

In [120]:
cmd = '''
select ulo.user_id,
       AVG((ud.ud_count > 1)::int) / GREATEST(SUM((ud.ud_count >= 1)::int), 1) as u_reordered_department_ratio,
       SUM((ud.ud_count >= 1)::int) as u_num_departments
from user_last_order as ulo 
--inner join user_product_l1 as up on up.user_id = ulo.user_id
inner join user_department_l1 as ud on ud.user_id = ulo.user_id
--where ulo.eval_set = 'train' and ulo.user_id < 50000
--where ulo.eval_set = 'train' and ulo.user_id < 5
group by ulo.user_id
'''

df_ud1 = pd_cmd(cmd)
df_ud1.head(20)


Out[120]:
user_id u_reordered_department_ratio u_num_departments
0 202094 0.098765 9
1 2026 0.082645 11
2 81339 0.083333 6
3 103294 0.049587 11
4 121499 0.081633 7
5 197414 0.046875 8
6 164444 0.061224 14
7 188931 0.062500 4
8 22262 0.061224 7
9 160071 0.057851 11
10 155028 0.058594 16
11 137077 0.187500 4
12 55290 0.049587 11
13 59315 0.080000 10
14 191210 0.044444 15
15 92077 0.078125 8
16 108031 0.062500 16
17 56414 0.086420 9
18 62012 0.111111 3
19 49059 0.059172 13

In [122]:
# merge on user_id
df_ud1.to_pickle('testdata_user_dept.pkl')

In [ ]: