In [ ]:
%load_ext sql
In [ ]:
# %sql postgresql://gpdbchina@10.194.10.68:55000/madlib
%sql postgresql://fmcquillan@localhost:5432/madlib
In [15]:
%sql select madlib.version();
Out[15]:
The data set describes shopper behavior on a notional web site that sells beer and wine. A beacon fires an event to a log file when the shopper visits different pages on the site: landing page, beer selection page, wine selection page, and checkout. Each user is identified by a a user id, and every time a page is visited, the page and time stamp are logged.
Create the data table:
In [16]:
%%sql
DROP TABLE IF EXISTS eventlog CASCADE; -- Use CASCADE because views created below depend on this table
CREATE TABLE eventlog (event_timestamp TIMESTAMP,
user_id INT,
page TEXT,
revenue FLOAT);
INSERT INTO eventlog VALUES
('04/15/2015 02:19:00', 101331, 'CHECKOUT', 16),
('04/15/2015 02:17:00', 202201, 'WINE', 0),
('04/15/2015 03:18:00', 202201, 'BEER', 0),
('04/15/2015 01:03:00', 100821, 'LANDING', 0),
('04/15/2015 01:04:00', 100821, 'WINE', 0),
('04/15/2015 01:05:00', 100821, 'CHECKOUT', 39),
('04/15/2015 02:06:00', 100821, 'WINE', 0),
('04/15/2015 02:09:00', 100821, 'WINE', 0),
('04/15/2015 02:15:00', 101331, 'LANDING', 0),
('04/15/2015 02:16:00', 101331, 'WINE', 0),
('04/15/2015 02:17:00', 101331, 'HELP', 0),
('04/15/2015 02:18:00', 101331, 'WINE', 0),
('04/15/2015 02:29:00', 201881, 'LANDING', 0),
('04/15/2015 02:30:00', 201881, 'BEER', 0),
('04/15/2015 01:05:00', 202201, 'LANDING', 0),
('04/15/2015 01:06:00', 202201, 'HELP', 0),
('04/15/2015 01:09:00', 202201, 'LANDING', 0),
('04/15/2015 02:15:00', 202201, 'WINE', 0),
('04/15/2015 02:16:00', 202201, 'BEER', 0),
('04/15/2015 03:19:00', 202201, 'WINE', 0),
('04/15/2015 03:22:00', 202201, 'CHECKOUT', 21);
SELECT * FROM eventlog ORDER BY event_timestamp;
Out[16]:
Sessionize the table by each user_id:
In [17]:
%%sql
DROP VIEW IF EXISTS sessionize_output_view;
SELECT madlib.sessionize(
'eventlog', -- Name of input table
'sessionize_output_view', -- View to store sessionize results
'user_id', -- Partition input table by user id
'event_timestamp', -- Time column used to compute sessions
'0:30:0' -- Time out used to define a session (30 minutes)
);
SELECT * FROM sessionize_output_view ORDER BY user_id, event_timestamp;
Out[17]:
Now let's say we want to see 3 minute sessions by a group of users with a certain range of user IDs. To do this, we need to sessionize the table based on a partition expression. Also, we want to persist a table output with a reduced set of columns in the table.
In [ ]:
%%sql
DROP TABLE IF EXISTS sessionize_output_table;
SELECT madlib.sessionize(
'eventlog', -- Name of input table
'sessionize_output_table', -- Table to store sessionize results
'user_id < 200000', -- Partition input table by subset of users
'event_timestamp', -- Order partitions in input table by time
'180', -- Use 180 second time out to define sessions
-- Note that this is the same as '0:03:0'
'event_timestamp, user_id, user_id < 200000 AS "Department-A1"', -- Select only user_id and event_timestamp columns, along with the session id as output
'f' -- create a table
);
SELECT * FROM sessionize_output_table WHERE "Department-A1"='TRUE' ORDER BY event_timestamp;
In [ ]: