Sessionize

The MADlib sessionize function performs time-oriented session reconstruction on a data set comprising a sequence of events. A defined period of inactivity indicates the end of one session and beginning of the next session.


In [ ]:
%load_ext sql

In [ ]:
# %sql postgresql://gpdbchina@10.194.10.68:55000/madlib
%sql postgresql://fmcquillan@localhost:5432/madlib

In [15]:
%sql select madlib.version();


1 rows affected.
Out[15]:
version
MADlib version: 1.9.1, git revision: rc/v1.9-rc1-39-g1929aed, cmake configuration time: Tue Aug 30 00:17:02 UTC 2016, build type: RelWithDebInfo, build system: Darwin-14.5.0, C compiler: Clang, C++ compiler: Clang

The data set describes shopper behavior on a notional web site that sells beer and wine. A beacon fires an event to a log file when the shopper visits different pages on the site: landing page, beer selection page, wine selection page, and checkout. Each user is identified by a a user id, and every time a page is visited, the page and time stamp are logged.

Create the data table:


In [16]:
%%sql 
DROP TABLE IF EXISTS eventlog CASCADE; -- Use CASCADE because views created below depend on this table

CREATE TABLE eventlog (event_timestamp TIMESTAMP,
            user_id INT,
            page TEXT,
            revenue FLOAT);

INSERT INTO eventlog VALUES
('04/15/2015 02:19:00', 101331, 'CHECKOUT', 16),
('04/15/2015 02:17:00', 202201, 'WINE', 0),
('04/15/2015 03:18:00', 202201, 'BEER', 0),
('04/15/2015 01:03:00', 100821, 'LANDING', 0),
('04/15/2015 01:04:00', 100821, 'WINE', 0),
('04/15/2015 01:05:00', 100821, 'CHECKOUT', 39),
('04/15/2015 02:06:00', 100821, 'WINE', 0),
('04/15/2015 02:09:00', 100821, 'WINE', 0),
('04/15/2015 02:15:00', 101331, 'LANDING', 0),
('04/15/2015 02:16:00', 101331, 'WINE', 0),
('04/15/2015 02:17:00', 101331, 'HELP', 0),
('04/15/2015 02:18:00', 101331, 'WINE', 0),
('04/15/2015 02:29:00', 201881, 'LANDING', 0),
('04/15/2015 02:30:00', 201881, 'BEER', 0),
('04/15/2015 01:05:00', 202201, 'LANDING', 0),
('04/15/2015 01:06:00', 202201, 'HELP', 0),
('04/15/2015 01:09:00', 202201, 'LANDING', 0),
('04/15/2015 02:15:00', 202201, 'WINE', 0),
('04/15/2015 02:16:00', 202201, 'BEER', 0),
('04/15/2015 03:19:00', 202201, 'WINE', 0),
('04/15/2015 03:22:00', 202201, 'CHECKOUT', 21);

SELECT * FROM eventlog ORDER BY event_timestamp;


Done.
Done.
21 rows affected.
21 rows affected.
Out[16]:
event_timestamp user_id page revenue
2015-04-15 01:03:00 100821 LANDING 0.0
2015-04-15 01:04:00 100821 WINE 0.0
2015-04-15 01:05:00 202201 LANDING 0.0
2015-04-15 01:05:00 100821 CHECKOUT 39.0
2015-04-15 01:06:00 202201 HELP 0.0
2015-04-15 01:09:00 202201 LANDING 0.0
2015-04-15 02:06:00 100821 WINE 0.0
2015-04-15 02:09:00 100821 WINE 0.0
2015-04-15 02:15:00 101331 LANDING 0.0
2015-04-15 02:15:00 202201 WINE 0.0
2015-04-15 02:16:00 101331 WINE 0.0
2015-04-15 02:16:00 202201 BEER 0.0
2015-04-15 02:17:00 202201 WINE 0.0
2015-04-15 02:17:00 101331 HELP 0.0
2015-04-15 02:18:00 101331 WINE 0.0
2015-04-15 02:19:00 101331 CHECKOUT 16.0
2015-04-15 02:29:00 201881 LANDING 0.0
2015-04-15 02:30:00 201881 BEER 0.0
2015-04-15 03:18:00 202201 BEER 0.0
2015-04-15 03:19:00 202201 WINE 0.0
2015-04-15 03:22:00 202201 CHECKOUT 21.0

Sessionize the table by each user_id:


In [17]:
%%sql
DROP VIEW IF EXISTS sessionize_output_view;

 SELECT madlib.sessionize(
     'eventlog',             -- Name of input table
     'sessionize_output_view',   -- View to store sessionize results
      'user_id',             -- Partition input table by user id
     'event_timestamp',      -- Time column used to compute sessions
     '0:30:0'                -- Time out used to define a session (30 minutes)
    );
    
SELECT * FROM sessionize_output_view ORDER BY user_id, event_timestamp;


Done.
1 rows affected.
21 rows affected.
Out[17]:
event_timestamp user_id page revenue session_id
2015-04-15 01:03:00 100821 LANDING 0.0 1
2015-04-15 01:04:00 100821 WINE 0.0 1
2015-04-15 01:05:00 100821 CHECKOUT 39.0 1
2015-04-15 02:06:00 100821 WINE 0.0 2
2015-04-15 02:09:00 100821 WINE 0.0 2
2015-04-15 02:15:00 101331 LANDING 0.0 1
2015-04-15 02:16:00 101331 WINE 0.0 1
2015-04-15 02:17:00 101331 HELP 0.0 1
2015-04-15 02:18:00 101331 WINE 0.0 1
2015-04-15 02:19:00 101331 CHECKOUT 16.0 1
2015-04-15 02:29:00 201881 LANDING 0.0 1
2015-04-15 02:30:00 201881 BEER 0.0 1
2015-04-15 01:05:00 202201 LANDING 0.0 1
2015-04-15 01:06:00 202201 HELP 0.0 1
2015-04-15 01:09:00 202201 LANDING 0.0 1
2015-04-15 02:15:00 202201 WINE 0.0 2
2015-04-15 02:16:00 202201 BEER 0.0 2
2015-04-15 02:17:00 202201 WINE 0.0 2
2015-04-15 03:18:00 202201 BEER 0.0 3
2015-04-15 03:19:00 202201 WINE 0.0 3
2015-04-15 03:22:00 202201 CHECKOUT 21.0 3

Now let's say we want to see 3 minute sessions by a group of users with a certain range of user IDs. To do this, we need to sessionize the table based on a partition expression. Also, we want to persist a table output with a reduced set of columns in the table.


In [ ]:
%%sql
DROP TABLE IF EXISTS sessionize_output_table;

 SELECT madlib.sessionize(
     'eventlog',                    -- Name of input table
     'sessionize_output_table',     -- Table to store sessionize results
     'user_id < 200000',            -- Partition input table by subset of users
     'event_timestamp',             -- Order partitions in input table by time
     '180',                         -- Use 180 second time out to define sessions
                                    -- Note that this is the same as '0:03:0'
     'event_timestamp, user_id, user_id < 200000 AS "Department-A1"',    -- Select only user_id and event_timestamp columns, along with the session id as output
     'f'                            -- create a table
     );
    
    SELECT * FROM sessionize_output_table WHERE "Department-A1"='TRUE' ORDER BY event_timestamp;

In [ ]: