In [33]:
import numpy as np
from collections import defaultdict

dataset_filename = "affinity_dataset.txt"
X = np.loadtxt(dataset_filename)
n_sample,n_features = X.shape

num_apple_purchases = 0
for sample in X:
    if sample[3] == 1:
        num_apple_purchases +=1
print ("%d prople bought apples " % num_apple_purchases)


#规则应验
valid_rules = defaultdict(int)
#规则无效
invalid_rules = defaultdict(int)
#条件相同的规则数量
num_occurances = defaultdict(int)


for sample in X:
    for premise in range(4):
        if sample[premise] == 0:continue
        num_occurances[premise] +=1
        for conclusion in range(4):
            if premise == conclusion: continue
            if sample[conclusion] == 1:
                valid_rules[(premise, conclusion)] += 1
            else:
                invalid_rules[(premise, conclusion)] += 1
        
print (num_occurances)
#print (invalid_rules)
#print (valid_rules)

support = valid_rules
confidence = defaultdict(float)

for premise,conclusion in valid_rules.keys():
    rule =(premise,conclusion)
    confidence[rule] = '%.3f' % (float(support[rule])/float(num_occurances[premise]))
print (confidence)


36 prople bought apples 
defaultdict(<type 'int'>, {0: 27, 1: 46, 2: 41, 3: 36})
defaultdict(<type 'float'>, {(0, 1): '0.519', (1, 2): '0.152', (3, 2): '0.694', (1, 3): '0.196', (3, 0): '0.139', (3, 1): '0.250', (2, 1): '0.171', (2, 0): '0.098', (2, 3): '0.610', (1, 0): '0.304', (0, 3): '0.185', (0, 2): '0.148'})

In [34]:
# 标签集合
features = ["面包","牛奶","奶酪","苹果","香蕉"]

def print_rule(premise,conclusion,features):
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("Rule: If a person buys {0} they will also buy{1}".format(premise_name, conclusion_name))
    print(" - Support: {0}".format(support[(premise,conclusion)]))
    print(" - Confidence: %.3f" % (float(confidence[(premise,conclusion)])))

premise=1
conclusion=2
print_rule(premise,conclusion,features)


Rule: If a person buys 牛奶 they will also buy奶酪
 - Support: 7
 - Confidence: 0.152

In [39]:
sorted_confidence = sorted(confidence.items(), key=lambda k:k[1],reverse=True)

for index in range(len(features)):
    print ("rule #{0}".format(index+1))
    premise,conclusion = sorted_confidence[index][0]
    print_rule(premise,conclusion,features)


rule #1
Rule: If a person buys 苹果 they will also buy奶酪
 - Support: 25
 - Confidence: 0.694
rule #2
Rule: If a person buys 奶酪 they will also buy苹果
 - Support: 25
 - Confidence: 0.610
rule #3
Rule: If a person buys 面包 they will also buy牛奶
 - Support: 14
 - Confidence: 0.519
rule #4
Rule: If a person buys 牛奶 they will also buy面包
 - Support: 14
 - Confidence: 0.304
rule #5
Rule: If a person buys 苹果 they will also buy牛奶
 - Support: 9
 - Confidence: 0.250

In [ ]: