In [115]:
import json
import datetime
import numpy as np

In [116]:
# 1,3,4
# 2,3,5
# 1,2,3,5
# 2,5
def loadDataSet():
    """Return the toy transaction database used throughout this notebook.

    Each inner list is one transaction; each integer is an item id.
    """
    transactions = [
        [1, 3, 4],
        [2, 3, 5],
        [1, 2, 3, 5],
        [2, 5],
    ]
    return transactions
dataSet = loadDataSet()

In [117]:
dataSet


Out[117]:
[[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

In [118]:
# Collect the set of all distinct items appearing in dataSet.
# frozenset (immutable) is used instead of set so an itemset can be a dict key.
def createC1(dataSet):
    """Build the sorted list of candidate 1-itemsets (C1) from the transactions.

    Parameters:
        dataSet: iterable of transactions, each an iterable of item ids.
    Returns:
        list of frozensets, one singleton frozenset per distinct item,
        sorted by item value.
    """
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if [item] not in C1:
                C1.append([item])
    C1.sort()
    # list() so the result can be iterated repeatedly: Python 3's map is a
    # one-shot iterator (Python 2's map already returned a list), and scanD
    # walks the candidates once per transaction.
    return list(map(frozenset, C1))

In [119]:
c1 = createC1(dataSet)

In [120]:
c1


Out[120]:
[frozenset({1}),
 frozenset({2}),
 frozenset({3}),
 frozenset({4}),
 frozenset({5})]

In [121]:
# Scan the dataset and keep the candidate itemsets whose support >= minSupport.
# The dict supportData stores the support value of every counted itemset.
def scanD(D, Ck, minSupport):
    """Count candidate itemsets over the transactions and filter by support.

    Parameters:
        D: iterable of transactions (each a set-like of item ids).
        Ck: iterable of candidate frozensets.
        minSupport: minimum support threshold in [0, 1].
    Returns:
        (retList, supportData, ssCnt):
            retList     - candidates whose support >= minSupport
            supportData - dict: candidate -> support value
            ssCnt       - dict: candidate -> raw occurrence count
    """
    # Materialize both so they can be traversed more than once even if the
    # caller passes one-shot iterators (e.g. Python 3's map), and so len(D)
    # is well defined.
    D = list(D)
    Ck = list(Ck)
    ssCnt = {}
    # Count, for every candidate, the number of transactions containing it.
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                # dict.get replaces the Python-2-only dict.has_key
                ssCnt[can] = ssCnt.get(can, 0) + 1
    numItems = float(len(D))
    retList = []
    supportData = {}
    # retList keeps only the itemsets at or above minSupport;
    # supportData records the support of every counted itemset.
    for key in ssCnt:
        support = ssCnt[key] / numItems
        if support >= minSupport:
            retList.append(key)
        supportData[key] = support
    return retList, supportData, ssCnt

In [122]:
retList, supportData , ssCnt = scanD(dataSet, c1, 0.5)

In [123]:
# item 4 is excluded: its support (0.25) is below the 0.5 threshold
retList


Out[123]:
[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})]

In [124]:
supportData


Out[124]:
{frozenset({4}): 0.25,
 frozenset({5}): 0.75,
 frozenset({2}): 0.75,
 frozenset({3}): 0.75,
 frozenset({1}): 0.5}

In [127]:
# Merge small frequent itemsets into larger candidates, two at a time.
def aprioriGen(Lk, k): #creates Ck
    """Generate candidate k-itemsets (Ck) from the frequent (k-1)-itemsets Lk.

    Two itemsets are unioned only when their first k-2 sorted elements agree;
    this clever prefix test produces every candidate exactly once with no
    duplicates.

    Parameters:
        Lk: list of frozensets, each of size k-1.
        k: target itemset size.
    Returns:
        list of frozensets of size k.
    """
    retList = []
    lenLk = len(Lk)
    # Compare every unordered pair (i, j) with i < j.
    for i in range(lenLk):
        # Sort BEFORE slicing: list(frozenset) ordering is a CPython
        # implementation detail, so the k-2 prefix must be taken from the
        # sorted elements (the original sliced first, then sorted the slice).
        L1 = sorted(Lk[i])[:k-2]
        for j in range(i+1, lenLk):
            L2 = sorted(Lk[j])[:k-2]
            if L1 == L2: #if first k-2 elements are equal
                retList.append(Lk[i] | Lk[j]) #set union
    return retList

# Given a dataset, grow itemsets level by level (small to large) and return
# those at or above minSupport.
def apriori(dataSet, minSupport = 0.5):
    """Run the full Apriori algorithm over a transaction dataset.

    Parameters:
        dataSet: list of transactions (each an iterable of item ids).
        minSupport: minimum support threshold in [0, 1].
    Returns:
        (L, supportData):
            L           - list of levels; L[k-1] holds the frequent k-itemsets
                          (the last level is always empty)
            supportData - dict: itemset -> support value
    """
    C1 = createC1(dataSet)
    # Materialize: Python 3's map is a one-shot iterator, so without list()
    # the second scanD call would see an empty D and len(D) would fail.
    D = list(map(set, dataSet))
    L1, supportData, ssCnt = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    # Keep growing until a level produces no frequent itemsets.
    while len(L[k-2]) > 0:
        Ck = aprioriGen(L[k-2], k)
        Lk, supK, ssCnt = scanD(D, Ck, minSupport) #scan DB to get Lk
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData

In [128]:
L,supportData = apriori(dataSet,0.5)

In [129]:
L


Out[129]:
[[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})],
 [frozenset({3, 5}), frozenset({2, 3}), frozenset({2, 5}), frozenset({1, 3})],
 [frozenset({2, 3, 5})],
 []]

In [131]:
supportData


Out[131]:
{frozenset({5}): 0.75,
 frozenset({3}): 0.75,
 frozenset({2, 3, 5}): 0.5,
 frozenset({1, 2}): 0.25,
 frozenset({1, 5}): 0.25,
 frozenset({3, 5}): 0.5,
 frozenset({4}): 0.25,
 frozenset({2, 3}): 0.5,
 frozenset({2, 5}): 0.75,
 frozenset({1}): 0.5,
 frozenset({1, 3}): 0.5,
 frozenset({2}): 0.75}

In [133]:
# Derive the association rules.
def generateRules(L, supportData, minConf=0.7):  #supportData is a dict coming from scanD
    """Generate association rules from the frequent itemsets produced by apriori.

    Parameters:
        L: list of levels of frequent itemsets (L[0] = singletons).
        supportData: dict mapping itemset -> support (from scanD).
        minConf: minimum confidence threshold.
    Returns:
        list of (antecedent, consequent, confidence) triples.
    """
    bigRuleList = []
    # Rules need at least two items, so skip L[0] (singletons such as
    # frozenset([5])) and start from L[1] (pairs such as frozenset([2, 5])).
    for i in range(1, len(L)):
        for freqSet in L[i]:
            singletons = [frozenset([item]) for item in freqSet]
            if i > 1:
                # Larger itemsets: recursively try bigger consequents.
                rulesFromConseq(freqSet, singletons, supportData, bigRuleList, minConf)
            else:
                # Pairs: just compute the confidence directly.
                calcConf(freqSet, singletons, supportData, bigRuleList, minConf)
    return bigRuleList

def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """Keep the consequents in H whose rule confidence meets minConf.

    For each candidate consequent, evaluates the rule
    (freqSet - conseq) -> conseq.  E.g. for soy-milk & lettuce:
    antecedent = freqSet - conseq (soy milk), consequent = conseq (lettuce),
    and the confidence is the probability of buying lettuce given soy milk.

    Parameters:
        freqSet: the frequent itemset the rule is drawn from.
        H: list of candidate consequent frozensets.
        supportData: dict mapping itemset -> support.
        brl: the big rule list; passing rules are appended in place.
        minConf: minimum confidence threshold.
    Returns:
        prunedH: the consequents whose rules passed the threshold.
    """
    prunedH = [] #create new list to return
    for conseq in H:
        # confidence(A -> B) = support(A u B) / support(A),
        # with A = freqSet - conseq and A u B = freqSet.
        conf = supportData[freqSet]/supportData[freqSet-conseq] #calc confidence
        if conf >= minConf:
            # Single formatted string so this statement is valid on both
            # Python 2 and Python 3 (the old `print a, b` form is a
            # SyntaxError on Python 3) while producing the same text.
            print('%s --> %s conf: %s' % (freqSet-conseq, conseq, conf))
            brl.append((freqSet-conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH

def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """Recursively build rules from freqSet with ever-larger consequents.

    Parameters:
        freqSet: the frequent itemset rules are drawn from.
        H: list of current candidate consequents, all the same size.
        supportData: dict mapping itemset -> support.
        brl: the big rule list; passing rules are appended in place.
        minConf: minimum confidence threshold.
    """
    # Size of each candidate consequent currently in H.
    m = len(H[0])
    # Only merge further while the antecedent (freqSet minus consequent)
    # would stay non-empty.
    if len(freqSet) > m + 1:
        # Grow the consequents by one item, keep only those whose rule
        # passes minConf, and recurse while at least two survivors remain
        # (aprioriGen needs two sets to merge).
        grown = aprioriGen(H, m + 1)
        survivors = calcConf(freqSet, grown, supportData, brl, minConf)
        if len(survivors) > 1:
            rulesFromConseq(freqSet, survivors, supportData, brl, minConf)

In [134]:
rule = generateRules(L,supportData,0.7)


frozenset([5]) --> frozenset([2]) conf: 1.0
frozenset([2]) --> frozenset([5]) conf: 1.0
frozenset([1]) --> frozenset([3]) conf: 1.0

In [135]:
rule


Out[135]:
[(frozenset({5}), frozenset({2}), 1.0),
 (frozenset({2}), frozenset({5}), 1.0),
 (frozenset({1}), frozenset({3}), 1.0)]

In [136]:
# data source: retail.dat (~4.2 MB), a retail market-basket dataset

In [137]:
# NOTE(review): hardcoded absolute local path — adjust to your environment
# (ideally a configurable data directory).
datapath = '/Users/wy/Desktop/retail.dat'
dataList = []
# Each line of retail.dat is one transaction: whitespace-separated item ids.
with open(datapath, 'r') as f:
    # Iterate the file directly instead of readlines() to avoid holding the
    # whole 4.2 MB file in memory at once; the unused enumerate index is gone.
    for line in f:
        # split() (no argument) strips surrounding whitespace and tolerates
        # repeated separators; list() materializes the transaction because
        # Python 3's map is a one-shot iterator.
        dataList.append(list(map(int, line.split())))

In [138]:
# Time the full Apriori run plus rule generation on the retail dataset
# (support threshold 0.1, confidence threshold 0.2).
start = datetime.datetime.now()

L,supportData = apriori(dataList,0.1)
rule = generateRules(L,supportData,0.2)

end = datetime.datetime.now()
# Wall-clock duration of the run as a datetime.timedelta.
runtime = end - start


frozenset([41]) --> frozenset([48]) conf: 0.603412512546
frozenset([48]) --> frozenset([41]) conf: 0.214026343895
frozenset([39]) --> frozenset([38]) conf: 0.204144055254
frozenset([38]) --> frozenset([39]) conf: 0.663311105412
frozenset([39]) --> frozenset([48]) conf: 0.575076467686
frozenset([48]) --> frozenset([39]) conf: 0.691634033464
frozenset([39]) --> frozenset([41]) conf: 0.225239269857
frozenset([41]) --> frozenset([39]) conf: 0.763733690197

In [139]:
runtime


Out[139]:
datetime.timedelta(0, 321, 831933)

In [152]:
def store(data,path):
    """Write the string `data` to the file at `path`, overwriting any
    existing content."""
    with open(path, 'w') as outfile:
        outfile.write(data)

In [153]:
# Flatten the level-structured frequent itemsets L into plain lists of ints
# (frozensets are not JSON-serializable) and persist them as JSON.
freItem_list = []
for level in L:
    for itemset in level:
        freItem_list.append(list(itemset))
freItem_json = json.dumps(freItem_list,indent=4)
store(freItem_json,'/Users/wy/Desktop/freItem.json')

In [158]:
freItem_list


Out[158]:
[[48], [38], [32], [39], [41], [48, 41], [38, 39], [48, 39], [41, 39]]

In [155]:
# Serialize every itemset's support as a sorted list of "items:support"
# strings and persist it as JSON.
supportData_list = []
# .items() works on both Python 2 and 3; dict.iteritems() was removed in
# Python 3 and raised AttributeError there.
for key, value in supportData.items():
    s = str(list(key))+':'+str(value)
    supportData_list.append(s)
supportData_list.sort()
supportData_json = json.dumps(supportData_list,indent=4)
store(supportData_json,'/Users/wy/Desktop/supportData.json')

In [160]:
# first 10 entries
supportData_list[:10]


Out[160]:
['[0]:0.00200766770264',
 '[10000]:2.26855107643e-05',
 '[10001]:1.13427553821e-05',
 '[10002]:4.53710215285e-05',
 '[10003]:3.40282661464e-05',
 '[10004]:7.9399287675e-05',
 '[10005]:0.000113427553821',
 '[10006]:6.80565322928e-05',
 '[10007]:1.13427553821e-05',
 '[10008]:0.000238197863025']

In [156]:
# Render each rule triple (antecedent, consequent, confidence) as a
# human-readable "a-->b---conf" string and persist the list as JSON.
# NOTE: list(...)[0] assumes single-item antecedents/consequents, which holds
# for the rules printed above.
strongRule = []
for antecedent, consequent, confidence in rule:
    strongRule.append(str(list(antecedent)[0])+'-->'+str(list(consequent)[0])+'---'+str(confidence))
strongRule_json = json.dumps(strongRule,indent=4)
store(strongRule_json,'/Users/wy/Desktop/strongRule.json')

In [161]:
strongRule


Out[161]:
['41-->48---0.603412512546',
 '48-->41---0.214026343895',
 '39-->38---0.204144055254',
 '38-->39---0.663311105412',
 '39-->48---0.575076467686',
 '48-->39---0.691634033464',
 '39-->41---0.225239269857',
 '41-->39---0.763733690197']

In [ ]:
# Reference: "Machine Learning in Action" by Peter Harrington (Apriori chapter)