Pivot

The goal of the MADlib pivot function is to provide a data summarization tool that can do basic OLAP type operations on data stored in one table and output the summarized data to a second table.


In [ ]:
%load_ext sql

In [ ]:
# %sql postgresql://gpdbchina@10.194.10.68:55000/madlib
%sql postgresql://fmcquillan@localhost:5432/madlib

In [36]:
%sql select madlib.version();


1 rows affected.
Out[36]:
version
MADlib version: 1.9.1, git revision: rc/v1.9-rc1-39-g1929aed, cmake configuration time: Tue Aug 30 00:17:02 UTC 2016, build type: RelWithDebInfo, build system: Darwin-14.5.0, C compiler: Clang, C++ compiler: Clang

Create a simple dataset to demonstrate a basic pivot:


In [37]:
%%sql 
DROP TABLE IF EXISTS pivset CASCADE;
CREATE TABLE pivset(
                  id INTEGER,
                  piv INTEGER,
                  val FLOAT8
                );
INSERT INTO pivset VALUES
    (0, 10, 1),
    (0, 10, 2),
    (0, 20, 3),
    (1, 20, 4),
    (1, 30, 5),
    (1, 30, 6),
    (1, 10, 7),
    (NULL, 10, 8),
    (1, NULL, 9),
    (1, 10, NULL);

SELECT * FROM pivset ORDER BY id;


Done.
Done.
10 rows affected.
10 rows affected.
Out[37]:
id piv val
0 20 3.0
0 10 1.0
0 10 2.0
1 30 6.0
1 10 7.0
1 None 9.0
1 10 None
1 20 4.0
1 30 5.0
None 10 8.0

In [38]:
%%sql
DROP TABLE IF EXISTS pivout;
SELECT madlib.pivot('pivset',     -- source data
                    'pivout',     -- output data
                    'id',         -- index (rows in the output table)
                    'piv',        -- pivot columns
                    'val');       -- values to be summarized in the output table
SELECT * FROM pivout ORDER BY id;


Done.
1 rows affected.
3 rows affected.
Out[38]:
id val_avg_piv_10 val_avg_piv_20 val_avg_piv_30
0 1.5 3.0 None
1 7.0 4.0 5.5
None 8.0 None None

Now let's add some more columns to our data set and create a view:


In [39]:
%%sql
DROP VIEW IF EXISTS pivset_ext;
CREATE VIEW pivset_ext AS
    SELECT *,
    COALESCE(id + (val / 3)::integer, 0) AS id2,
    COALESCE(100*(val / 3)::integer, 0) AS piv2,
    COALESCE(val + 10, 0) AS val2
   FROM pivset;
SELECT id,id2,piv,piv2,val,val2 FROM pivset_ext
ORDER BY id,id2,piv,piv2,val,val2;


Done.
Done.
10 rows affected.
Out[39]:
id id2 piv piv2 val val2
0 0 10 0 1.0 11.0
0 1 10 100 2.0 12.0
0 1 20 100 3.0 13.0
1 0 10 0 None 0.0
1 2 20 100 4.0 14.0
1 3 10 200 7.0 17.0
1 3 30 200 5.0 15.0
1 3 30 200 6.0 16.0
1 4 None 300 9.0 19.0
None 0 10 300 8.0 18.0

Let's use a different aggregate function on the view we just created:


In [40]:
%%sql
DROP TABLE IF EXISTS pivout;
SELECT madlib.pivot('pivset_ext', 'pivout', 'id', 'piv', 'val', 'sum');
SELECT * FROM pivout ORDER BY id;


Done.
1 rows affected.
3 rows affected.
Out[40]:
id val_sum_piv_10 val_sum_piv_20 val_sum_piv_30
0 3.0 3.0 None
1 7.0 4.0 11.0
None 8.0 None None

Now create a custom aggregate. Note that the aggregate must have a strict transition function:


In [41]:
%%sql
DROP FUNCTION IF EXISTS array_add1 (ANYARRAY, ANYELEMENT) CASCADE;
CREATE FUNCTION array_add1(ANYARRAY, ANYELEMENT) RETURNS ANYARRAY AS $$
  SELECT $1 || $2
$$ LANGUAGE sql STRICT;

DROP AGGREGATE IF EXISTS array_accum1 (anyelement);
CREATE AGGREGATE array_accum1 (anyelement) (
    sfunc = array_add1,     -- state transition function
    stype = anyarray,       -- current internal state of the aggregate (temp variable)
    initcond = '{}'         -- IC is NULL                                                                                                                                     
);
DROP TABLE IF EXISTS pivout;
SELECT madlib.pivot('pivset_ext', 'pivout', 'id', 'piv', 'val', 'array_accum1'); -- OK since STRICT
-- SELECT madlib.pivot('pivset_ext', 'pivout', 'id', 'piv', 'val', 'array_agg'); -- this will throw error since not STRICT
SELECT * FROM pivout ORDER BY id;


Done.
Done.
Done.
Done.
Done.
1 rows affected.
3 rows affected.
Out[41]:
id val_array_accum1_piv_10 val_array_accum1_piv_20 val_array_accum1_piv_30
0 [1.0, 2.0] [3.0] []
1 [7.0] [4.0] [5.0, 6.0]
None [8.0] [] []

Keep null values in the pivot column:


In [ ]:
%%sql
DROP TABLE IF EXISTS pivout;
SELECT madlib.pivot('pivset_ext', 'pivout', 'id', 'piv', 'val', 'sum', True);
SELECT * FROM pivout ORDER BY id;

Fill null results with a value of interest:


In [ ]:
%%sql
DROP TABLE IF EXISTS pivout;
SELECT madlib.pivot('pivset_ext', 'pivout', 'id', 'piv', 'val', 'sum', '111');
SELECT * FROM pivout ORDER BY id;

Use multiple index columns:


In [ ]:
%%sql
DROP TABLE IF EXISTS pivout;
SELECT madlib.pivot('pivset_ext', 'pivout', 'id,id2', 'piv', 'val');
SELECT * FROM pivout ORDER BY id,id2;

Use multiple pivot columns:


In [ ]:
%%sql
DROP TABLE IF EXISTS pivout;
SELECT madlib.pivot('pivset_ext', 'pivout', 'id', 'piv, piv2', 'val');
SELECT * FROM pivout ORDER BY id;

Use multiple value columns:


In [ ]:
%%sql
DROP TABLE IF EXISTS pivout;
SELECT madlib.pivot('pivset_ext', 'pivout', 'id', 'piv', 'val, val2');
SELECT * FROM pivout ORDER BY id;

Use multiple aggregate functions on the same value column (cross product):


In [ ]:
%%sql
DROP TABLE IF EXISTS pivout;
SELECT madlib.pivot('pivset_ext', 'pivout', 'id', 'piv', 'val', 'avg, sum');
SELECT * FROM pivout ORDER BY id;

Use different aggregate functions for different value columns:


In [ ]:
%%sql
DROP TABLE IF EXISTS pivout;
SELECT madlib.pivot('pivset_ext', 'pivout', 'id', 'piv', 'val, val2',
    'val=avg, val2=sum');
SELECT * FROM pivout ORDER BY id;

Use multiple aggregate functions for different value columns:


In [ ]:
%%sql
DROP TABLE IF EXISTS pivout;
SELECT madlib.pivot('pivset_ext', 'pivout', 'id, id2', 'piv, piv2', 'val, val2',
    'val=avg, val2=[avg,sum]', '111', True);
SELECT * FROM pivout ORDER BY id,id2;

Combine all of the options:


In [42]:
%%sql
DROP TABLE IF EXISTS pivout;
SELECT madlib.pivot('pivset_ext', 'pivout', 'id, id2', 'piv, piv2', 'val, val2',
    'val=avg, val2=[avg,sum]', '111', True);
SELECT * FROM pivout ORDER BY id,id2;


Done.
1 rows affected.
7 rows affected.
Out[42]:
id id2 val_avg_piv_null_piv2_0 val_avg_piv_null_piv2_100 val_avg_piv_null_piv2_200 val_avg_piv_null_piv2_300 val_avg_piv_10_piv2_0 val_avg_piv_10_piv2_100 val_avg_piv_10_piv2_200 val_avg_piv_10_piv2_300 val_avg_piv_20_piv2_0 val_avg_piv_20_piv2_100 val_avg_piv_20_piv2_200 val_avg_piv_20_piv2_300 val_avg_piv_30_piv2_0 val_avg_piv_30_piv2_100 val_avg_piv_30_piv2_200 val_avg_piv_30_piv2_300 val2_avg_piv_null_piv2_0 val2_avg_piv_null_piv2_100 val2_avg_piv_null_piv2_200 val2_avg_piv_null_piv2_300 val2_avg_piv_10_piv2_0 val2_avg_piv_10_piv2_100 val2_avg_piv_10_piv2_200 val2_avg_piv_10_piv2_300 val2_avg_piv_20_piv2_0 val2_avg_piv_20_piv2_100 val2_avg_piv_20_piv2_200 val2_avg_piv_20_piv2_300 val2_avg_piv_30_piv2_0 val2_avg_piv_30_piv2_100 val2_avg_piv_30_piv2_200 val2_avg_piv_30_piv2_300 val2_sum_piv_null_piv2_0 val2_sum_piv_null_piv2_100 val2_sum_piv_null_piv2_200 val2_sum_piv_null_piv2_300 val2_sum_piv_10_piv2_0 val2_sum_piv_10_piv2_100 val2_sum_piv_10_piv2_200 val2_sum_piv_10_piv2_300 val2_sum_piv_20_piv2_0 val2_sum_piv_20_piv2_100 val2_sum_piv_20_piv2_200 val2_sum_piv_20_piv2_300 val2_sum_piv_30_piv2_0 val2_sum_piv_30_piv2_100 val2_sum_piv_30_piv2_200 val2_sum_piv_30_piv2_300
0 0 111.0 111.0 111.0 111.0 1.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 11.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 11.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0
0 1 111.0 111.0 111.0 111.0 111.0 2.0 111.0 111.0 111.0 3.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 12.0 111.0 111.0 111.0 13.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 12.0 111.0 111.0 111.0 13.0 111.0 111.0 111.0 111.0 111.0 111.0
1 0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 0.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 0.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0
1 2 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 4.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 14.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 14.0 111.0 111.0 111.0 111.0 111.0 111.0
1 3 111.0 111.0 111.0 111.0 111.0 111.0 7.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 5.5 111.0 111.0 111.0 111.0 111.0 111.0 111.0 17.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 15.5 111.0 111.0 111.0 111.0 111.0 111.0 111.0 17.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 31.0 111.0
1 4 111.0 111.0 111.0 9.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 19.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 19.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0
None 0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 8.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 18.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 18.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0 111.0

Create a dictionary for output column names:


In [ ]:
%%sql
DROP TABLE IF EXISTS pivout, pivout_dictionary;
SELECT madlib.pivot('pivset_ext', 'pivout', 'id, id2', 'piv, piv2', 'val, val2',
    'val=avg, val2=[avg,sum]', '111', True, True);
SELECT * FROM pivout_dictionary;

In [ ]:
%%sql
SELECT * FROM pivout ORDER BY id,id2;