Market Basket Analysis Introduction

Companion notebook to http://pbpython.com/market-basket-analysis.html


In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
df = pd.read_excel('http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx')

In [3]:
df.head()


Out[3]:
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country
0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 2010-12-01 08:26:00 2.55 17850.0 United Kingdom
1 536365 71053 WHITE METAL LANTERN 6 2010-12-01 08:26:00 3.39 17850.0 United Kingdom
2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 2010-12-01 08:26:00 2.75 17850.0 United Kingdom
3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 2010-12-01 08:26:00 3.39 17850.0 United Kingdom
4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 2010-12-01 08:26:00 3.39 17850.0 United Kingdom

In [4]:
# Clean up spaces in description and remove any rows that don't have a valid invoice
df['Description'] = df['Description'].str.strip()
df.dropna(axis=0, subset=['InvoiceNo'], inplace=True)

In [5]:
df['InvoiceNo'] = df['InvoiceNo'].astype('str')
df = df[~df['InvoiceNo'].str.contains('C')]

In [6]:
basket = (df[df['Country'] =="France"]
          .groupby(['InvoiceNo', 'Description'])['Quantity']
          .sum().unstack().reset_index().fillna(0)
          .set_index('InvoiceNo'))

In [7]:
basket.head()


Out[7]:
Description 10 COLOUR SPACEBOY PEN 12 COLOURED PARTY BALLOONS 12 EGG HOUSE PAINTED WOOD 12 MESSAGE CARDS WITH ENVELOPES 12 PENCIL SMALL TUBE WOODLAND 12 PENCILS SMALL TUBE RED RETROSPOT 12 PENCILS SMALL TUBE SKULL 12 PENCILS TALL TUBE POSY 12 PENCILS TALL TUBE RED RETROSPOT 12 PENCILS TALL TUBE WOODLAND ... WRAP VINTAGE PETALS DESIGN YELLOW COAT RACK PARIS FASHION YELLOW GIANT GARDEN THERMOMETER YELLOW SHARK HELICOPTER ZINC STAR T-LIGHT HOLDER ZINC FOLKART SLEIGH BELLS ZINC HERB GARDEN CONTAINER ZINC METAL HEART DECORATION ZINC T-LIGHT HOLDER STAR LARGE ZINC T-LIGHT HOLDER STARS SMALL
InvoiceNo
536370 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
536852 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
536974 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
537065 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
537463 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 1563 columns


In [8]:
# Show a subset of columns
basket.iloc[:,[0,1,2,3,4,5,6, 7]].head()


Out[8]:
Description 10 COLOUR SPACEBOY PEN 12 COLOURED PARTY BALLOONS 12 EGG HOUSE PAINTED WOOD 12 MESSAGE CARDS WITH ENVELOPES 12 PENCIL SMALL TUBE WOODLAND 12 PENCILS SMALL TUBE RED RETROSPOT 12 PENCILS SMALL TUBE SKULL 12 PENCILS TALL TUBE POSY
InvoiceNo
536370 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
536852 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
536974 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
537065 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
537463 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

In [9]:
# Convert the units to 1 hot encoded values
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1

In [10]:
basket_sets = basket.applymap(encode_units)

In [11]:
# No need to track postage
basket_sets.drop('POSTAGE', inplace=True, axis=1)

In [12]:
basket_sets.head()


Out[12]:
Description 10 COLOUR SPACEBOY PEN 12 COLOURED PARTY BALLOONS 12 EGG HOUSE PAINTED WOOD 12 MESSAGE CARDS WITH ENVELOPES 12 PENCIL SMALL TUBE WOODLAND 12 PENCILS SMALL TUBE RED RETROSPOT 12 PENCILS SMALL TUBE SKULL 12 PENCILS TALL TUBE POSY 12 PENCILS TALL TUBE RED RETROSPOT 12 PENCILS TALL TUBE WOODLAND ... WRAP VINTAGE PETALS DESIGN YELLOW COAT RACK PARIS FASHION YELLOW GIANT GARDEN THERMOMETER YELLOW SHARK HELICOPTER ZINC STAR T-LIGHT HOLDER ZINC FOLKART SLEIGH BELLS ZINC HERB GARDEN CONTAINER ZINC METAL HEART DECORATION ZINC T-LIGHT HOLDER STAR LARGE ZINC T-LIGHT HOLDER STARS SMALL
InvoiceNo
536370 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
536852 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
536974 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
537065 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
537463 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 1562 columns


In [13]:
# Build up the frequent items
frequent_itemsets = apriori(basket_sets, min_support=0.07, use_colnames=True)

In [14]:
frequent_itemsets.head()


Out[14]:
support itemsets
0 0.071429 [4 TRADITIONAL SPINNING TOPS]
1 0.096939 [ALARM CLOCK BAKELIKE GREEN]
2 0.102041 [ALARM CLOCK BAKELIKE PINK]
3 0.094388 [ALARM CLOCK BAKELIKE RED]
4 0.081633 [BAKING SET 9 PIECE RETROSPOT]

In [15]:
# Create the rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules


Out[15]:
antecedants consequents support confidence lift
0 (PLASTERS IN TIN CIRCUS PARADE) (PLASTERS IN TIN WOODLAND ANIMALS) 0.168367 0.606061 3.545907
1 (PLASTERS IN TIN WOODLAND ANIMALS) (PLASTERS IN TIN CIRCUS PARADE) 0.170918 0.597015 3.545907
2 (ALARM CLOCK BAKELIKE PINK) (ALARM CLOCK BAKELIKE GREEN) 0.102041 0.725000 7.478947
3 (ALARM CLOCK BAKELIKE GREEN) (ALARM CLOCK BAKELIKE PINK) 0.096939 0.763158 7.478947
4 (SET/6 RED SPOTTY PAPER PLATES) (SET/6 RED SPOTTY PAPER CUPS) 0.127551 0.960000 6.968889
5 (SET/6 RED SPOTTY PAPER CUPS) (SET/6 RED SPOTTY PAPER PLATES) 0.137755 0.888889 6.968889
6 (SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO... (SET/6 RED SPOTTY PAPER PLATES) 0.102041 0.975000 7.644000
7 (SET/6 RED SPOTTY PAPER PLATES, SET/6 RED SPOT... (SET/20 RED RETROSPOT PAPER NAPKINS) 0.122449 0.812500 6.125000
8 (SET/6 RED SPOTTY PAPER PLATES, SET/20 RED RET... (SET/6 RED SPOTTY PAPER CUPS) 0.102041 0.975000 7.077778
9 (SET/6 RED SPOTTY PAPER CUPS) (SET/6 RED SPOTTY PAPER PLATES, SET/20 RED RET... 0.137755 0.722222 7.077778
10 (SET/20 RED RETROSPOT PAPER NAPKINS) (SET/6 RED SPOTTY PAPER PLATES, SET/6 RED SPOT... 0.132653 0.750000 6.125000
11 (SET/6 RED SPOTTY PAPER PLATES) (SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO... 0.127551 0.780000 7.644000
12 (PLASTERS IN TIN CIRCUS PARADE) (PLASTERS IN TIN SPACEBOY) 0.168367 0.530303 3.849607
13 (PLASTERS IN TIN SPACEBOY) (PLASTERS IN TIN CIRCUS PARADE) 0.137755 0.648148 3.849607
14 (PLASTERS IN TIN WOODLAND ANIMALS) (PLASTERS IN TIN SPACEBOY) 0.170918 0.611940 4.442233
15 (PLASTERS IN TIN SPACEBOY) (PLASTERS IN TIN WOODLAND ANIMALS) 0.137755 0.759259 4.442233
16 (ALARM CLOCK BAKELIKE PINK) (ALARM CLOCK BAKELIKE RED) 0.102041 0.725000 7.681081
17 (ALARM CLOCK BAKELIKE RED) (ALARM CLOCK BAKELIKE PINK) 0.094388 0.783784 7.681081
18 (ALARM CLOCK BAKELIKE GREEN) (ALARM CLOCK BAKELIKE RED) 0.096939 0.815789 8.642959
19 (ALARM CLOCK BAKELIKE RED) (ALARM CLOCK BAKELIKE GREEN) 0.094388 0.837838 8.642959
20 (SET/6 RED SPOTTY PAPER CUPS) (SET/20 RED RETROSPOT PAPER NAPKINS) 0.137755 0.740741 5.584046
21 (SET/20 RED RETROSPOT PAPER NAPKINS) (SET/6 RED SPOTTY PAPER CUPS) 0.132653 0.769231 5.584046
22 (SPACEBOY LUNCH BOX) (DOLLY GIRL LUNCH BOX) 0.125000 0.571429 5.743590
23 (DOLLY GIRL LUNCH BOX) (SPACEBOY LUNCH BOX) 0.099490 0.717949 5.743590
24 (SET/6 RED SPOTTY PAPER PLATES) (SET/20 RED RETROSPOT PAPER NAPKINS) 0.127551 0.800000 6.030769
25 (SET/20 RED RETROSPOT PAPER NAPKINS) (SET/6 RED SPOTTY PAPER PLATES) 0.132653 0.769231 6.030769

In [16]:
rules[ (rules['lift'] >= 6) &
       (rules['confidence'] >= 0.8) ]


Out[16]:
antecedants consequents support confidence lift
4 (SET/6 RED SPOTTY PAPER PLATES) (SET/6 RED SPOTTY PAPER CUPS) 0.127551 0.960000 6.968889
5 (SET/6 RED SPOTTY PAPER CUPS) (SET/6 RED SPOTTY PAPER PLATES) 0.137755 0.888889 6.968889
6 (SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO... (SET/6 RED SPOTTY PAPER PLATES) 0.102041 0.975000 7.644000
7 (SET/6 RED SPOTTY PAPER PLATES, SET/6 RED SPOT... (SET/20 RED RETROSPOT PAPER NAPKINS) 0.122449 0.812500 6.125000
8 (SET/6 RED SPOTTY PAPER PLATES, SET/20 RED RET... (SET/6 RED SPOTTY PAPER CUPS) 0.102041 0.975000 7.077778
18 (ALARM CLOCK BAKELIKE GREEN) (ALARM CLOCK BAKELIKE RED) 0.096939 0.815789 8.642959
19 (ALARM CLOCK BAKELIKE RED) (ALARM CLOCK BAKELIKE GREEN) 0.094388 0.837838 8.642959
24 (SET/6 RED SPOTTY PAPER PLATES) (SET/20 RED RETROSPOT PAPER NAPKINS) 0.127551 0.800000 6.030769

In [17]:
basket['ALARM CLOCK BAKELIKE GREEN'].sum()


Out[17]:
340.0

In [18]:
basket['ALARM CLOCK BAKELIKE RED'].sum()


Out[18]:
316.0

In [20]:
basket2 = (df[df['Country'] =="Germany"]
          .groupby(['InvoiceNo', 'Description'])['Quantity']
          .sum().unstack().reset_index().fillna(0)
          .set_index('InvoiceNo'))

In [21]:
basket_sets2 = basket2.applymap(encode_units)

In [22]:
basket_sets2.drop('POSTAGE', inplace=True, axis=1)

In [23]:
frequent_itemsets2 = apriori(basket_sets2, min_support=0.05, use_colnames=True)

In [24]:
rules2 = association_rules(frequent_itemsets2, metric="lift", min_threshold=1)
rules2


Out[24]:
antecedants consequents support confidence lift
0 (WOODLAND CHARLOTTE BAG) (ROUND SNACK BOXES SET OF4 WOODLAND) 0.126915 0.500000 2.040179
1 (ROUND SNACK BOXES SET OF4 WOODLAND) (WOODLAND CHARLOTTE BAG) 0.245077 0.258929 2.040179
2 (PLASTERS IN TIN CIRCUS PARADE) (ROUND SNACK BOXES SET OF4 WOODLAND) 0.115974 0.490566 2.001685
3 (ROUND SNACK BOXES SET OF4 WOODLAND) (PLASTERS IN TIN CIRCUS PARADE) 0.245077 0.232143 2.001685
4 (PLASTERS IN TIN WOODLAND ANIMALS) (PLASTERS IN TIN SPACEBOY) 0.137856 0.444444 4.145125
5 (PLASTERS IN TIN SPACEBOY) (PLASTERS IN TIN WOODLAND ANIMALS) 0.107221 0.571429 4.145125
6 (WOODLAND CHARLOTTE BAG) (RED RETROSPOT CHARLOTTE BAG) 0.126915 0.465517 6.648168
7 (RED RETROSPOT CHARLOTTE BAG) (WOODLAND CHARLOTTE BAG) 0.070022 0.843750 6.648168
8 (ROUND SNACK BOXES SET OF4 WOODLAND) (ROUND SNACK BOXES SET OF 4 FRUITS) 0.245077 0.535714 3.400298
9 (ROUND SNACK BOXES SET OF 4 FRUITS) (ROUND SNACK BOXES SET OF4 WOODLAND) 0.157549 0.833333 3.400298
10 (PLASTERS IN TIN WOODLAND ANIMALS) (ROUND SNACK BOXES SET OF4 WOODLAND) 0.137856 0.539683 2.202098
11 (ROUND SNACK BOXES SET OF4 WOODLAND) (PLASTERS IN TIN WOODLAND ANIMALS) 0.245077 0.303571 2.202098
12 (PLASTERS IN TIN CIRCUS PARADE) (PLASTERS IN TIN WOODLAND ANIMALS) 0.115974 0.584906 4.242887
13 (PLASTERS IN TIN WOODLAND ANIMALS) (PLASTERS IN TIN CIRCUS PARADE) 0.137856 0.492063 4.242887
14 (PLASTERS IN TIN CIRCUS PARADE) (ROUND SNACK BOXES SET OF 4 FRUITS) 0.115974 0.433962 2.754455
15 (ROUND SNACK BOXES SET OF 4 FRUITS) (PLASTERS IN TIN CIRCUS PARADE) 0.157549 0.319444 2.754455
16 (SPACEBOY LUNCH BOX) (ROUND SNACK BOXES SET OF4 WOODLAND) 0.102845 0.680851 2.778116
17 (ROUND SNACK BOXES SET OF4 WOODLAND) (SPACEBOY LUNCH BOX) 0.245077 0.285714 2.778116

In [25]:
rules2[ (rules2['lift'] >= 4) &
        (rules2['confidence'] >= 0.5) ]


Out[25]:
antecedants consequents support confidence lift
5 (PLASTERS IN TIN SPACEBOY) (PLASTERS IN TIN WOODLAND ANIMALS) 0.107221 0.571429 4.145125
7 (RED RETROSPOT CHARLOTTE BAG) (WOODLAND CHARLOTTE BAG) 0.070022 0.843750 6.648168
12 (PLASTERS IN TIN CIRCUS PARADE) (PLASTERS IN TIN WOODLAND ANIMALS) 0.115974 0.584906 4.242887