Companion notebook to http://pbpython.com/market-basket-analysis.html
In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
In [2]:
# Load the Online Retail workbook from the UCI machine-learning repository.
# The file is fetched over the network every time this cell runs.
df = pd.read_excel('http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx')
In [3]:
# Preview the first few rows of the raw data
df.head()
Out[3]:
In [4]:
# Discard rows that have no invoice number, then trim leading/trailing
# whitespace from the product descriptions of the rows that remain.
df.dropna(subset=['InvoiceNo'], inplace=True)
df['Description'] = df['Description'].str.strip()
In [5]:
# Treat invoice numbers as text so they can be searched, then keep only the
# rows whose invoice number does not contain the letter 'C'.
df['InvoiceNo'] = df['InvoiceNo'].astype('str')
df = df.loc[~df['InvoiceNo'].str.contains('C')]
In [6]:
# Build the French basket matrix: one row per invoice, one column per product
# description, each cell holding the summed quantity (0 where the product
# does not appear on the invoice).
basket = (
    df.loc[df['Country'] == "France"]
    .groupby(['InvoiceNo', 'Description'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)
In [7]:
# Preview the first few invoices of the French basket matrix
basket.head()
Out[7]:
In [8]:
# Show a subset of columns: the first eight products. A positional slice
# replaces the hand-enumerated index list and, unlike that list, does not
# raise if the frame happens to have fewer than eight columns.
basket.iloc[:, :8].head()
Out[8]:
In [9]:
# Convert purchased quantities into 0/1 "was this item bought" flags
def encode_units(x):
    """Return 1 when at least one unit was purchased, otherwise 0.

    Args:
        x: Numeric quantity for a single cell.

    Returns:
        int: 1 if ``x >= 1``; 0 for every other value, including zero,
        negative quantities, fractions below one and NaN.
    """
    # A single comparison guarantees an int result. The earlier pair of
    # independent ``if`` tests fell through and returned None for values
    # strictly between 0 and 1 and for NaN.
    return 1 if x >= 1 else 0
In [10]:
# Flag every invoice/product cell as purchased (1) or not (0) with a single
# vectorized comparison. For whole-number quantities this yields the same 0/1
# values as calling encode_units on each cell, but it avoids one Python call
# per cell and does not rely on DataFrame.applymap, which recent pandas
# releases deprecate.
basket_sets = (basket >= 1).astype(int)
In [11]:
# No need to track postage. errors='ignore' keeps this cell from raising a
# KeyError when the selected baskets contain no POSTAGE column.
basket_sets.drop('POSTAGE', axis=1, inplace=True, errors='ignore')
In [12]:
# Preview the first few rows of the 0/1 encoded baskets
basket_sets.head()
Out[12]:
In [13]:
# Mine the frequent itemsets from the encoded baskets. min_support=0.07 is
# the minimum support an itemset needs in order to be kept, and
# use_colnames=True reports items by column name instead of column index.
frequent_itemsets = apriori(basket_sets, min_support=0.07, use_colnames=True)
In [14]:
# Preview the first few frequent itemsets
frequent_itemsets.head()
Out[14]:
In [15]:
# Derive association rules from the frequent itemsets, using lift as the
# evaluation metric with a minimum threshold of 1, then display them.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules
Out[15]:
In [16]:
# Narrow the rules down to those with a lift of at least 6 and a confidence
# of at least 0.8.
rules.loc[rules['lift'].ge(6) & rules['confidence'].ge(0.8)]
Out[16]:
In [17]:
# Total quantity of this product summed over every French invoice
basket['ALARM CLOCK BAKELIKE GREEN'].sum()
Out[17]:
In [18]:
# Total quantity of this product summed over every French invoice
basket['ALARM CLOCK BAKELIKE RED'].sum()
Out[18]:
In [20]:
# Build the German basket matrix: one row per invoice, one column per product
# description, each cell holding the summed quantity (0 where the product
# does not appear on the invoice).
basket2 = (
    df.loc[df['Country'] == "Germany"]
    .groupby(['InvoiceNo', 'Description'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)
In [21]:
# Flag every invoice/product cell as purchased (1) or not (0) with a single
# vectorized comparison. For whole-number quantities this yields the same 0/1
# values as calling encode_units on each cell, but it avoids one Python call
# per cell and does not rely on DataFrame.applymap, which recent pandas
# releases deprecate.
basket_sets2 = (basket2 >= 1).astype(int)
In [22]:
# Postage is not a product, so leave it out of the analysis. errors='ignore'
# keeps this cell from raising a KeyError when the selected baskets contain
# no POSTAGE column.
basket_sets2.drop('POSTAGE', axis=1, inplace=True, errors='ignore')
In [23]:
# Mine frequent itemsets from the German baskets. The minimum support here
# (0.05) is lower than the 0.07 used for the French baskets.
frequent_itemsets2 = apriori(basket_sets2, min_support=0.05, use_colnames=True)
In [24]:
# Derive association rules from the German frequent itemsets, using lift as
# the evaluation metric with a minimum threshold of 1, then display them.
rules2 = association_rules(frequent_itemsets2, metric="lift", min_threshold=1)
rules2
Out[24]:
In [25]:
# Narrow the German rules down to those with a lift of at least 4 and a
# confidence of at least 0.5.
rules2.loc[rules2['lift'].ge(4) & rules2['confidence'].ge(0.5)]
Out[25]: