Import raw data



In [1]:

    
import graphlab as gl
# Point GraphLab Canvas at the 'ipynb' target so that later .show() calls
# are directed to this notebook.
gl.canvas.set_target('ipynb')



In [6]:

    
# Bakery training transactions; each row is one line item of a receipt
# (Receipt, SaleDate, EmpId, StoreNum, Quantity, Item).
BAKERY_TRAIN_URL = 'https://static.turi.com/datasets/extended-bakery/bakery_train.sf'
sf = gl.SFrame(BAKERY_TRAIN_URL)









    



PROGRESS: Downloading https://static.turi.com/datasets/extended-bakery/bakery_train.sf/dir_archive.ini to /var/tmp/graphlab-srikris/58364/b6c4ccbb-55d5-403e-b02b-ce9bd9dfa500.ini
PROGRESS: Downloading https://static.turi.com/datasets/extended-bakery/bakery_train.sf/objects.bin to /var/tmp/graphlab-srikris/58364/0faea7c9-a42a-4eea-8a47-c076f95c84dc.bin
PROGRESS: Downloading https://static.turi.com/datasets/extended-bakery/bakery_train.sf/m_a39fbefdaa582db5.frame_idx to /var/tmp/graphlab-srikris/58364/681f37b3-4c2c-4dde-b19b-85906a1837e2.frame_idx
PROGRESS: Downloading https://static.turi.com/datasets/extended-bakery/bakery_train.sf/m_a39fbefdaa582db5.sidx to /var/tmp/graphlab-srikris/58364/65c0cb92-9bae-46b1-8658-c561488314fe.sidx



In [7]:

    
# Display the head of the raw transactions to check the columns and values.
sf









    



PROGRESS: Downloading https://static.turi.com/datasets/extended-bakery/bakery_train.sf/m_a39fbefdaa582db5.0000 to /var/tmp/graphlab-srikris/58364/e700f270-0e0c-4f9c-9089-9bb567176a03.0000






    Out[7]:





    
        Receipt
        SaleDate
        EmpId
        StoreNum
        Quantity
        Item
    
    
        1
        12-JAN-2000
        20
        20
        1
        GanacheCookie
    
    
        1
        12-JAN-2000
        20
        20
        5
        ApplePie
    
    
        2
        15-JAN-2000
        35
        10
        1
        CoffeeEclair
    
    
        2
        15-JAN-2000
        35
        10
        3
        ApplePie
    
    
        2
        15-JAN-2000
        35
        10
        4
        AlmondTwist
    
    
        2
        15-JAN-2000
        35
        10
        3
        HotCoffee
    
    
        3
        8-JAN-2000
        13
        13
        5
        OperaCake
    
    
        3
        8-JAN-2000
        13
        13
        3
        OrangeJuice
    
    
        3
        8-JAN-2000
        13
        13
        3
        CheeseCroissant
    
    
        4
        24-JAN-2000
        16
        16
        1
        TruffleCake
    

[212933 rows x 6 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.



In [8]:

    
# Open the Canvas view of the Item column to inspect which products occur.
sf['Item'].show()

Find Patterns



In [9]:

    
# Mine frequent item sets. Rows that share the same (Receipt, StoreNum) pair
# are grouped into one transaction, and min_length=2 keeps only patterns that
# contain at least two items.
transaction_columns = ['Receipt', 'StoreNum']
model = gl.frequent_pattern_mining.create(
    sf,
    item='Item',
    features=transaction_columns,
    min_length=2)









    



PROGRESS: Indexing complete. Found 50 unique items.
PROGRESS: Preprocessing complete. Found 59999 unique transactions.
PROGRESS: Building frequent pattern tree.
PROGRESS: +-----------+----------------+------------+----------------------+------------------+
PROGRESS: | Iteration | Num. Patterns  | Support    | Current Min Support  | Elapsed Time     |
PROGRESS: +-----------+----------------+------------+----------------------+------------------+
PROGRESS: | 0         | 0              | 6582       | 103                  | 1.006ms          |
PROGRESS: | 1         | 0              | 6131       | 103                  | 1.409ms          |
PROGRESS: | 2         | 1              | 6011       | 103                  | 1.724ms          |
PROGRESS: | 3         | 3              | 5624       | 103                  | 2.161ms          |
PROGRESS: | 4         | 6              | 5613       | 103                  | 2.694ms          |
PROGRESS: | 5         | 10             | 5582       | 103                  | 3.14ms           |
PROGRESS: | 6         | 15             | 5495       | 103                  | 3.686ms          |
PROGRESS: | 7         | 21             | 5437       | 103                  | 4.109ms          |
PROGRESS: | 8         | 28             | 5378       | 103                  | 4.453ms          |
PROGRESS: | 9         | 38             | 5087       | 103                  | 4.971ms          |
PROGRESS: | 10        | 48             | 5030       | 103                  | 5.631ms          |
PROGRESS: | 11        | 59             | 5021       | 103                  | 6.232ms          |
PROGRESS: | 12        | 71             | 5000       | 103                  | 6.86ms           |
PROGRESS: | 13        | 83             | 4990       | 103                  | 7.59ms           |
PROGRESS: | 14        | 97             | 4959       | 103                  | 8.532ms          |
PROGRESS: | 15        | 100            | 4953       | 239                  | 9.555ms          |
PROGRESS: | 16        | 100            | 4940       | 252                  | 10.74ms          |
PROGRESS: | 17        | 100            | 4899       | 265                  | 11.826ms         |
PROGRESS: | 18        | 100            | 4884       | 274                  | 13.175ms         |
PROGRESS: | 19        | 100            | 4661       | 279                  | 14.334ms         |
PROGRESS: | 20        | 100            | 4622       | 282                  | 15.054ms         |
PROGRESS: | 21        | 100            | 4613       | 283                  | 15.772ms         |
PROGRESS: | 22        | 100            | 4566       | 288                  | 16.797ms         |
PROGRESS: | 23        | 100            | 4550       | 293                  | 17.895ms         |
PROGRESS: | 24        | 100            | 4525       | 295                  | 19.002ms         |
PROGRESS: | 25        | 100            | 4511       | 296                  | 19.925ms         |
PROGRESS: | 26        | 100            | 4401       | 297                  | 20.986ms         |
PROGRESS: | 27        | 100            | 4131       | 297                  | 22.021ms         |
PROGRESS: | 28        | 100            | 4123       | 297                  | 22.813ms         |
PROGRESS: | 29        | 100            | 4111       | 297                  | 24.168ms         |
PROGRESS: | 30        | 100            | 4071       | 297                  | 25.278ms         |
PROGRESS: | 31        | 100            | 4058       | 298                  | 26.721ms         |
PROGRESS: | 32        | 100            | 4054       | 300                  | 28.235ms         |
PROGRESS: | 33        | 100            | 4050       | 301                  | 30.849ms         |
PROGRESS: | 34        | 100            | 4048       | 302                  | 32.357ms         |
PROGRESS: | 35        | 100            | 4031       | 304                  | 34.162ms         |
PROGRESS: | 36        | 100            | 3779       | 306                  | 35.882ms         |
PROGRESS: | 37        | 100            | 3680       | 315                  | 37.767ms         |
PROGRESS: | 38        | 100            | 2697       | 320                  | 39.707ms         |
PROGRESS: | 39        | 100            | 2608       | 320                  | 41.195ms         |
PROGRESS: | 40        | 100            | 2580       | 320                  | 42.59ms          |
PROGRESS: | 41        | 100            | 2564       | 320                  | 44.011ms         |
PROGRESS: | 42        | 100            | 2557       | 320                  | 45.347ms         |
PROGRESS: | 43        | 100            | 2551       | 320                  | 46.798ms         |
PROGRESS: | 44        | 100            | 2549       | 320                  | 48.766ms         |
PROGRESS: | 45        | 100            | 2540       | 320                  | 50.667ms         |
PROGRESS: | 46        | 100            | 2538       | 320                  | 52.198ms         |
PROGRESS: | 47        | 100            | 2532       | 320                  | 53.951ms         |
PROGRESS: | 48        | 100            | 2513       | 320                  | 55.735ms         |
PROGRESS: | 49        | 100            | 2503       | 320                  | 57.286ms         |
PROGRESS: | Final     | 100            | -          | 320                  | 59.06ms          |
PROGRESS: +-----------+----------------+------------+----------------------+------------------+
PROGRESS: Pattern mining complete. Found 100 unique closed patterns.



In [11]:

    
# Fetch the mined patterns together with their support counts.
patterns = model.get_frequent_patterns()
# Widen the printed columns so that longer item lists are not truncated.
patterns.print_rows(max_column_width=100)
# Persist the trained model to a path relative to the working directory.
model.save('pattern_mining_model.gl')









    



+----------------------------------------+---------+
|                pattern                 | support |
+----------------------------------------+---------+
|      [CherryTart, ApricotDanish]       |   3209  |
|     [TuileCookie, MarzipanCookie]      |   3023  |
|    [ChocolateCake, ChocolateCoffee]    |   2652  |
|        [CherryTart, OperaCake]         |   2625  |
|     [GongolaisCookie, TruffleCake]     |   2620  |
|     [StrawberryCake, NapoleonCake]     |   2615  |
|       [ApricotDanish, OperaCake]       |   2604  |
|   [ApricotCroissant, BlueberryTart]    |   2599  |
|     [OrangeJuice, CheeseCroissant]     |   2575  |
| [CherryTart, ApricotDanish, OperaCake] |   2487  |
+----------------------------------------+---------+
[100 rows x 2 columns]

Making predictions!



In [12]:

    
# Build a single unseen basket (receipt 1356 at store 2) and ask the model
# which items are most likely to accompany what is already in it.
basket_items = ['CherryTart', 'ApplePie']
basket_size = len(basket_items)
new_data = gl.SFrame({
    'Receipt': [1356] * basket_size,
    'StoreNum': [2] * basket_size,
    'Item': basket_items,
})
model.predict(new_data)









    



PROGRESS: Preprocessing complete. Found 1 unique transactions.






    Out[12]:





    
        Receipt
        StoreNum
        prefix
        prediction
        confidence
        prefix support
        joint support
    
    
        1356
        2
        [CherryTart]
        [ApricotDanish]
        0.571708533761
        5613
        3209
    

[1 rows x 7 columns]



In [13]:

    
# List several ranked candidate predictions for the same basket rather than
# only the single best one returned by predict().
model.predict_topk(new_data)









    



PROGRESS: Preprocessing complete. Found 1 unique transactions.






    Out[13]:





    
        Receipt
        StoreNum
        prefix
        prediction
        confidence
        prefix support
        joint support
    
    
        1356
        2
        [CherryTart]
        [ApricotDanish]
        0.571708533761
        5613
        3209
    
    
        1356
        2
        [ApplePie]
        [CoffeeEclair]
        0.483340545219
        4622
        2234
    
    
        1356
        2
        [ApplePie]
        [AlmondTwist]
        0.475984422328
        4622
        2200
    
    
        1356
        2
        [CherryTart]
        [OperaCake]
        0.467664350615
        5613
        2625
    
    
        1356
        2
        [ApplePie]
        [AlmondTwist,
CoffeeEclair] ...
        0.44634357421
        4622
        2063
    

[5 rows x 7 columns]

Extract features



In [14]:

    
# Encode every transaction of the training data as a numeric feature vector
# derived from the trained pattern model; the result has one row per
# (Receipt, StoreNum) pair with an 'extracted_features' column.
# NOTE(review): presumably one vector entry per mined pattern - confirm
# against the GraphLab documentation.
pattern_sf = model.extract_features(sf)
pattern_sf









    



PROGRESS: Preprocessing complete. Found 59999 unique transactions.






    Out[14]:





    
        Receipt
        StoreNum
        extracted_features
    
    
        47850
        2
        [0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0, 0.0, ...
    
    
        26659
        3
        [0.0, 0.0, 1.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0, 0.0, ...
    
    
        28397
        3
        [0.0, 0.0, 1.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0, 0.0, ...
    
    
        13610
        3
        [0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0, 0.0, ...
    
    
        27382
        12
        [0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 1.0, 0.0, ...
    
    
        38454
        3
        [0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0, 0.0, ...
    
    
        17365
        12
        [0.0, 0.0, 0.0, 0.0, 1.0,
0.0, 0.0, 0.0, 0.0, 0.0, ...
    
    
        48835
        3
        [0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0, 0.0, ...
    
    
        35951
        4
        [0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0, 0.0, ...
    
    
        35648
        13
        [1.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0, 0.0, ...
    

[59999 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

Cluster in Employee Space



In [15]:

    
# Collapse the line items to one row per (StoreNum, Receipt) transaction and
# keep one employee id for each of them via SELECT_ONE.
# NOTE(review): assumes every line of a receipt carries the same EmpId - confirm.
one_employee_per_receipt = {'EmpId': gl.aggregate.SELECT_ONE('EmpId')}
emps = sf.groupby(['StoreNum', 'Receipt'], one_employee_per_receipt)
emps









    Out[15]:





    
        Receipt
        StoreNum
        EmpId
    
    
        47850
        2
        23
    
    
        26659
        3
        3
    
    
        28397
        3
        3
    
    
        13610
        3
        3
    
    
        27382
        12
        45
    
    
        38454
        3
        3
    
    
        17365
        12
        44
    
    
        48835
        3
        3
    
    
        35951
        4
        26
    
    
        35648
        13
        13
    

[59999 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.



In [17]:

    
# Attach each transaction's pattern features to the employee who handled it.
# No join keys are given, so the join relies on the columns the two frames
# have in common (Receipt and StoreNum in the outputs shown).
receipts_with_features = emps.join(pattern_sf)

# Sum the per-transaction feature vectors for every employee, giving one
# aggregated vector ("all_features") per EmpId.
summed_features = {'all_features': gl.aggregate.SUM('extracted_features')}
emp_space = receipts_with_features.groupby('EmpId', summed_features)
emp_space









    Out[17]:





    
        EmpId
        all_features
    
    
        49
        [56.0, 55.0, 40.0, 43.0,
42.0, 39.0, 42.0, 49.0, ...
    
    
        13
        [184.0, 164.0, 122.0,
143.0, 94.0, 128.0, ...
    
    
        36
        [41.0, 48.0, 50.0, 34.0,
41.0, 38.0, 31.0, 46.0, ...
    
    
        24
        [39.0, 42.0, 35.0, 35.0,
34.0, 31.0, 36.0, 27.0, ...
    
    
        2
        [29.0, 39.0, 30.0, 28.0,
46.0, 30.0, 25.0, 38.0, ...
    
    
        46
        [39.0, 32.0, 14.0, 30.0,
41.0, 43.0, 29.0, 34.0, ...
    
    
        30
        [23.0, 19.0, 18.0, 21.0,
16.0, 12.0, 20.0, 19.0, ...
    
    
        42
        [25.0, 22.0, 23.0, 21.0,
17.0, 28.0, 20.0, 20.0, ...
    
    
        44
        [17.0, 30.0, 20.0, 16.0,
19.0, 21.0, 16.0, 12.0, ...
    
    
        31
        [21.0, 19.0, 21.0, 18.0,
16.0, 16.0, 20.0, 20.0, ...
    

[50 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.



In [18]:

    
# Group the employees into clusters using only their aggregated pattern
# feature vector.
NUM_EMPLOYEE_CLUSTERS = 3
cl_model = gl.kmeans.create(
    emp_space,
    num_clusters=NUM_EMPLOYEE_CLUSTERS,
    features=['all_features'])









    



PROGRESS: Initializing data.
PROGRESS: Batch size is larger than the input dataset. Switching to an exact Kmeans method.
PROGRESS: Choosing initial cluster centers with Kmeans++.
PROGRESS: +---------------+-----------+
PROGRESS: | Center number | Row index |
PROGRESS: +---------------+-----------+
PROGRESS: | 0             | 27        |
PROGRESS: | 1             | 39        |
PROGRESS: | 2             | 28        |
PROGRESS: +---------------+-----------+
PROGRESS: Starting kmeans model training.
PROGRESS: Assigning points to initial cluster centers.
PROGRESS: +-----------+-------------------------------+
PROGRESS: | Iteration | Number of changed assignments |
PROGRESS: +-----------+-------------------------------+
PROGRESS: | 1         | 0                             |
PROGRESS: +-----------+-------------------------------+



In [19]:

    
# Copy the cluster label of every employee back onto emp_space.
# NOTE(review): this is a positional assignment, so it assumes the rows of the
# model's 'cluster_id' table are in the same order as emp_space - confirm.
cluster_assignments = cl_model['cluster_id']
emp_space['cluster_id'] = cluster_assignments['cluster_id']
emp_space









    Out[19]:





    
        EmpId
        all_features
        cluster_id
    
    
        49
        [56.0, 55.0, 40.0, 43.0,
42.0, 39.0, 42.0, 49.0, ...
        0
    
    
        13
        [184.0, 164.0, 122.0,
143.0, 94.0, 128.0, ...
        1
    
    
        36
        [41.0, 48.0, 50.0, 34.0,
41.0, 38.0, 31.0, 46.0, ...
        0
    
    
        24
        [39.0, 42.0, 35.0, 35.0,
34.0, 31.0, 36.0, 27.0, ...
        0
    
    
        2
        [29.0, 39.0, 30.0, 28.0,
46.0, 30.0, 25.0, 38.0, ...
        0
    
    
        46
        [39.0, 32.0, 14.0, 30.0,
41.0, 43.0, 29.0, 34.0, ...
        0
    
    
        30
        [23.0, 19.0, 18.0, 21.0,
16.0, 12.0, 20.0, 19.0, ...
        2
    
    
        42
        [25.0, 22.0, 23.0, 21.0,
17.0, 28.0, 20.0, 20.0, ...
        2
    
    
        44
        [17.0, 30.0, 20.0, 16.0,
19.0, 21.0, 16.0, 12.0, ...
        2
    
    
        31
        [21.0, 19.0, 21.0, 18.0,
16.0, 16.0, 20.0, 20.0, ...
        2
    

[50 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.



In [20]:

    
# emp_space only holds EmpId, all_features and cluster_id, so the StoreNum
# column requested for the y-axis does not exist there. Join one store per
# employee back in from the raw transactions before plotting.
# NOTE(review): SELECT_ONE keeps an arbitrary store per employee; confirm that
# each employee works in a single store, otherwise choose an explicit rule.
emp_stores = sf.groupby('EmpId', {'StoreNum': gl.aggregate.SELECT_ONE('StoreNum')})
emp_space.join(emp_stores, on='EmpId')\
         .show(x='cluster_id', y='StoreNum', view='Bar Chart')



In [ ]:

Receipt	SaleDate	EmpId	StoreNum	Quantity	Item
1	12-JAN-2000	20	20	1	GanacheCookie
1	12-JAN-2000	20	20	5	ApplePie
2	15-JAN-2000	35	10	1	CoffeeEclair
2	15-JAN-2000	35	10	3	ApplePie
2	15-JAN-2000	35	10	4	AlmondTwist
2	15-JAN-2000	35	10	3	HotCoffee
3	8-JAN-2000	13	13	5	OperaCake
3	8-JAN-2000	13	13	3	OrangeJuice
3	8-JAN-2000	13	13	3	CheeseCroissant
4	24-JAN-2000	16	16	1	TruffleCake

Receipt	StoreNum	prefix	prediction	confidence	prefix support	joint support
1356	2	[CherryTart]	[ApricotDanish]	0.571708533761	5613	3209
1356	2	[ApplePie]	[CoffeeEclair]	0.483340545219	4622	2234
1356	2	[ApplePie]	[AlmondTwist]	0.475984422328	4622	2200
1356	2	[CherryTart]	[OperaCake]	0.467664350615	5613	2625
1356	2	[ApplePie]	[AlmondTwist, CoffeeEclair] ...	0.44634357421	4622	2063

Receipt	StoreNum	extracted_features
47850	2	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
26659	3	[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
28397	3	[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
13610	3	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
27382	12	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...
38454	3	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
17365	12	[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
48835	3	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
35951	4	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
35648	13	[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...

EmpId	all_features
49	[56.0, 55.0, 40.0, 43.0, 42.0, 39.0, 42.0, 49.0, ...
13	[184.0, 164.0, 122.0, 143.0, 94.0, 128.0, ...
36	[41.0, 48.0, 50.0, 34.0, 41.0, 38.0, 31.0, 46.0, ...
24	[39.0, 42.0, 35.0, 35.0, 34.0, 31.0, 36.0, 27.0, ...
2	[29.0, 39.0, 30.0, 28.0, 46.0, 30.0, 25.0, 38.0, ...
46	[39.0, 32.0, 14.0, 30.0, 41.0, 43.0, 29.0, 34.0, ...
30	[23.0, 19.0, 18.0, 21.0, 16.0, 12.0, 20.0, 19.0, ...
42	[25.0, 22.0, 23.0, 21.0, 17.0, 28.0, 20.0, 20.0, ...
44	[17.0, 30.0, 20.0, 16.0, 19.0, 21.0, 16.0, 12.0, ...
31	[21.0, 19.0, 21.0, 18.0, 16.0, 16.0, 20.0, 20.0, ...