In [253]:
# Generate item ids
item_ids = range(1, 101) # 100 items, numbered from 1 to 100
print items
In [254]:
# Item i is in basket b if and only if i divides b with no remainder.
# Thus, item 1 is in all the baskets, item 2 is in all fifty of the
# even-numbered baskets, and so on
# Item Generator
def generate_items(basket_id, item_ids):
"""
Generate items belonging to a given basket id
:param: basket_id: the id of the basket
:param: items: the list of items
:return: set of items whose ids divide the basket id with no remainder
"""
basket_items = {item for item in item_ids if item % basket_id == 0}
return basket_items
# Baskets Generator
baskets = {b: generate_items(b, item_ids) for b in range(1, 101)}
# View contents of Basket #:20
print "Basket ID 20: ", baskets[20]
In [269]:
# Generate the singleton item sets
# here, we use tuples instead of sets since we want to use them as keys for dicts
items = [tuple([x]) for x in item_ids]
print items
In [270]:
def get_support_values(items, baskets):
"""
Find and return support value of the items
Support of an item I is given by the number of baskets of which I is a subset
:param items: List of item subsets. Each list subset can contain one or more items
:param baskets: A dict of baskets with basket id as key and basket contents as sets
:return: support_values: A dict of Support value for each of the item subsets
"""
# get the baskets that each itemsets are part of
item_baskets = {item: [basket_id for basket_id in baskets.keys()
if set(item) <= baskets[basket_id]] for item in items}
# get the number of baskets (support value) that each itemsets are part of
support_values = { item: len(item_baskets[item]) for item in item_baskets.keys()}
return support_values, item_baskets
In [271]:
support_values, item_baskets = get_support_values(items, baskets)
print support_values
In [272]:
def find_frequent_items(support_values, support_threshold):
"""
Return item subsets with support values greater than support_threshold
:param support_values: List of support values corresponding to the items subsets
:param support_threshold: items with support value greater than support_threshold are 'frequent'
:return: list of item sets with support value greater than the threshold
"""
support_threshold_exceeded = [item for item in support_values.keys() if support_values[item] >= support_threshold]
return support_threshold_exceeded
In [278]:
frequent_singletons = find_frequent_items(support_values, 5)
print frequent_singletons
In [279]:
import itertools
# generate all possible two item combinations
two_itemsets = list(itertools.combinations(item_ids, 2))
# get the support values and the baskets for the doubleton item sets
doubletons_support_values, doubletons_baskets = get_support_values(two_itemsets, baskets)
# get the frequent doubleons with support threshold = 5
frequent_doubletons = find_frequent_items(doubletons_support_values, 5)
print frequent_doubletons
In [319]:
# Warning: Naive Implementation
# Refer next notebooks for a faster implementation
def calc_confidence(itemset1, itemset2, item_ids, baskets):
"""
Calculate the confidence of association rule itemset1 -> itemset2
The 'confidence' is defined as the ratio of support for (itemset1 U itemset2) to
the support for itemset1 only.
:param itemset1: set containing itemset 1
:param itemset2: set containing itemset 2
:param item_ids: list of item ids
:param baskets: list of all baskets
:return: support (itemset1 U itemset2) / support(itemset1)
"""
# find the length of the union of itemset1 & itemset2
itemset_len = len(itemset1.union(itemset2))
# generate all possible item combinations of length of the combined set
all_itemsets = list(itertools.combinations(item_ids, itemset_len))
# generate the support values & baskets for the length of combined set
support_values, baskets_info = get_support_values(all_itemsets, baskets)
# get the support value for itemset1 union itemset2
union_itemset_support = support_values[tuple(sorted(list(itemset1.union(itemset2))))]
# get support value for itemset1
itemset1_baskets = [tuple(basket_id) for itemset, basket_id in baskets_info.iteritems() if itemset1 <= set(itemset)]
itemset1_support = len(set(itemset1_baskets))
return union_itemset_support / (itemset1_support * 1.0)
In [320]:
print calc_confidence({4,12}, {2}, item_ids, baskets)
In [322]:
print calc_confidence({24,60}, {8}, item_ids, baskets)
In [323]:
# Warning : takes a long time
print calc_confidence({2,3,4}, {5}, item_ids, baskets)