In [115]:
import json
import datetime
import numpy as np

In [116]:
# 1,3,4
# 2,3,5
# 1,2,3,5
# 2,5
def loadDataSet():
    """Return the toy transaction database used throughout this notebook.

    Each inner list is one transaction; each integer is an item id.
    """
    transactions = [
        [1, 3, 4],
        [2, 3, 5],
        [1, 2, 3, 5],
        [2, 5],
    ]
    return transactions
dataSet = loadDataSet()

In [117]:
dataSet


Out[117]:
[[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

In [118]:
# Collect the set of all distinct items appearing in dataSet.
# frozenset (immutable) is used instead of set so an itemset can be a dict key.
def createC1(dataSet):
    """Build the sorted list of candidate 1-itemsets (C1) from the transactions.

    Parameters:
        dataSet: iterable of transactions, each an iterable of item ids.
    Returns:
        list of frozensets, one singleton frozenset per distinct item,
        sorted by item value.
    """
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if [item] not in C1:
                C1.append([item])
    C1.sort()
    # list() so the result can be iterated repeatedly: Python 3's map is a
    # one-shot iterator (Python 2's map already returned a list), and scanD
    # walks the candidates once per transaction.
    return list(map(frozenset, C1))

In [119]:
c1 = createC1(dataSet)

In [120]:
c1


Out[120]:
[frozenset({1}),
 frozenset({2}),
 frozenset({3}),
 frozenset({4}),
 frozenset({5})]

In [121]:
# Scan the dataset and keep the candidate itemsets whose support >= minSupport.
# The dict supportData stores the support value of every counted itemset.
def scanD(D, Ck, minSupport):
    """Count candidate itemsets over the transactions and filter by support.

    Parameters:
        D: iterable of transactions (each a set-like of item ids).
        Ck: iterable of candidate frozensets.
        minSupport: minimum support threshold in [0, 1].
    Returns:
        (retList, supportData, ssCnt):
            retList     - candidates whose support >= minSupport
            supportData - dict: candidate -> support value
            ssCnt       - dict: candidate -> raw occurrence count
    """
    # Materialize both so they can be traversed more than once even if the
    # caller passes one-shot iterators (e.g. Python 3's map), and so len(D)
    # is well defined.
    D = list(D)
    Ck = list(Ck)
    ssCnt = {}
    # Count, for every candidate, the number of transactions containing it.
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                # dict.get replaces the Python-2-only dict.has_key
                ssCnt[can] = ssCnt.get(can, 0) + 1
    numItems = float(len(D))
    retList = []
    supportData = {}
    # retList keeps only the itemsets at or above minSupport;
    # supportData records the support of every counted itemset.
    for key in ssCnt:
        support = ssCnt[key] / numItems
        if support >= minSupport:
            retList.append(key)
        supportData[key] = support
    return retList, supportData, ssCnt

In [122]:
retList, supportData , ssCnt = scanD(dataSet, c1, 0.5)

In [123]:
# item 4 is excluded: its support (0.25) is below the 0.5 threshold
retList


Out[123]:
[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})]

In [124]:
supportData


Out[124]:
{frozenset({4}): 0.25,
 frozenset({5}): 0.75,
 frozenset({2}): 0.75,
 frozenset({3}): 0.75,
 frozenset({1}): 0.5}

In [127]:
# Merge small frequent itemsets into larger candidates, two at a time.
def aprioriGen(Lk, k): #creates Ck
    """Generate candidate k-itemsets (Ck) from the frequent (k-1)-itemsets Lk.

    Two itemsets are unioned only when their first k-2 sorted elements agree;
    this clever prefix test produces every candidate exactly once with no
    duplicates.

    Parameters:
        Lk: list of frozensets, each of size k-1.
        k: target itemset size.
    Returns:
        list of frozensets of size k.
    """
    retList = []
    lenLk = len(Lk)
    # Compare every unordered pair (i, j) with i < j.
    for i in range(lenLk):
        # Sort BEFORE slicing: list(frozenset) ordering is a CPython
        # implementation detail, so the k-2 prefix must be taken from the
        # sorted elements (the original sliced first, then sorted the slice).
        L1 = sorted(Lk[i])[:k-2]
        for j in range(i+1, lenLk):
            L2 = sorted(Lk[j])[:k-2]
            if L1 == L2: #if first k-2 elements are equal
                retList.append(Lk[i] | Lk[j]) #set union
    return retList

# Given a dataset, grow itemsets level by level (small to large) and return
# those at or above minSupport.
def apriori(dataSet, minSupport = 0.5):
    """Run the full Apriori algorithm over a transaction dataset.

    Parameters:
        dataSet: list of transactions (each an iterable of item ids).
        minSupport: minimum support threshold in [0, 1].
    Returns:
        (L, supportData):
            L           - list of levels; L[k-1] holds the frequent k-itemsets
                          (the last level is always empty)
            supportData - dict: itemset -> support value
    """
    C1 = createC1(dataSet)
    # Materialize: Python 3's map is a one-shot iterator, so without list()
    # the second scanD call would see an empty D and len(D) would fail.
    D = list(map(set, dataSet))
    L1, supportData, ssCnt = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    # Keep growing until a level produces no frequent itemsets.
    while len(L[k-2]) > 0:
        Ck = aprioriGen(L[k-2], k)
        Lk, supK, ssCnt = scanD(D, Ck, minSupport) #scan DB to get Lk
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData

In [128]:
L,supportData = apriori(dataSet,0.5)

In [129]:
L


Out[129]:
[[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})],
 [frozenset({3, 5}), frozenset({2, 3}), frozenset({2, 5}), frozenset({1, 3})],
 [frozenset({2, 3, 5})],
 []]

In [131]:
supportData


Out[131]:
{frozenset({5}): 0.75,
 frozenset({3}): 0.75,
 frozenset({2, 3, 5}): 0.5,
 frozenset({1, 2}): 0.25,
 frozenset({1, 5}): 0.25,
 frozenset({3, 5}): 0.5,
 frozenset({4}): 0.25,
 frozenset({2, 3}): 0.5,
 frozenset({2, 5}): 0.75,
 frozenset({1}): 0.5,
 frozenset({1, 3}): 0.5,
 frozenset({2}): 0.75}

In [133]:
# Derive the association rules.
def generateRules(L, supportData, minConf=0.7):  #supportData is a dict coming from scanD
    """Generate association rules from the frequent itemsets produced by apriori.

    Parameters:
        L: list of levels of frequent itemsets (L[0] = singletons).
        supportData: dict mapping itemset -> support (from scanD).
        minConf: minimum confidence threshold.
    Returns:
        list of (antecedent, consequent, confidence) triples.
    """
    bigRuleList = []
    # Rules need at least two items, so skip L[0] (singletons such as
    # frozenset([5])) and start from L[1] (pairs such as frozenset([2, 5])).
    for i in range(1, len(L)):
        for freqSet in L[i]:
            singletons = [frozenset([item]) for item in freqSet]
            if i > 1:
                # Larger itemsets: recursively try bigger consequents.
                rulesFromConseq(freqSet, singletons, supportData, bigRuleList, minConf)
            else:
                # Pairs: just compute the confidence directly.
                calcConf(freqSet, singletons, supportData, bigRuleList, minConf)
    return bigRuleList

def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """Keep the consequents in H whose rule confidence meets minConf.

    For each candidate consequent, evaluates the rule
    (freqSet - conseq) -> conseq.  E.g. for soy-milk & lettuce:
    antecedent = freqSet - conseq (soy milk), consequent = conseq (lettuce),
    and the confidence is the probability of buying lettuce given soy milk.

    Parameters:
        freqSet: the frequent itemset the rule is drawn from.
        H: list of candidate consequent frozensets.
        supportData: dict mapping itemset -> support.
        brl: the big rule list; passing rules are appended in place.
        minConf: minimum confidence threshold.
    Returns:
        prunedH: the consequents whose rules passed the threshold.
    """
    prunedH = [] #create new list to return
    for conseq in H:
        # confidence(A -> B) = support(A u B) / support(A),
        # with A = freqSet - conseq and A u B = freqSet.
        conf = supportData[freqSet]/supportData[freqSet-conseq] #calc confidence
        if conf >= minConf:
            # Single formatted string so this statement is valid on both
            # Python 2 and Python 3 (the old `print a, b` form is a
            # SyntaxError on Python 3) while producing the same text.
            print('%s --> %s conf: %s' % (freqSet-conseq, conseq, conf))
            brl.append((freqSet-conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH

def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """Recursively build rules from freqSet with ever-larger consequents.

    Parameters:
        freqSet: the frequent itemset rules are drawn from.
        H: list of current candidate consequents, all the same size.
        supportData: dict mapping itemset -> support.
        brl: the big rule list; passing rules are appended in place.
        minConf: minimum confidence threshold.
    """
    # Size of each candidate consequent currently in H.
    m = len(H[0])
    # Only merge further while the antecedent (freqSet minus consequent)
    # would stay non-empty.
    if len(freqSet) > m + 1:
        # Grow the consequents by one item, keep only those whose rule
        # passes minConf, and recurse while at least two survivors remain
        # (aprioriGen needs two sets to merge).
        grown = aprioriGen(H, m + 1)
        survivors = calcConf(freqSet, grown, supportData, brl, minConf)
        if len(survivors) > 1:
            rulesFromConseq(freqSet, survivors, supportData, brl, minConf)

In [134]:
rule = generateRules(L,supportData,0.7)


frozenset([5]) --> frozenset([2]) conf: 1.0
frozenset([2]) --> frozenset([5]) conf: 1.0
frozenset([1]) --> frozenset([3]) conf: 1.0

In [135]:
rule


Out[135]:
[(frozenset({5}), frozenset({2}), 1.0),
 (frozenset({2}), frozenset({5}), 1.0),
 (frozenset({1}), frozenset({3}), 1.0)]

In [136]:
# data source: retail.dat (~4.2 MB), a retail market-basket dataset

In [137]:
# NOTE(review): hardcoded absolute local path — adjust to your environment
# (ideally a configurable data directory).
datapath = '/Users/wy/Desktop/retail.dat'
dataList = []
# Each line of retail.dat is one transaction: whitespace-separated item ids.
with open(datapath, 'r') as f:
    # Iterate the file directly instead of readlines() to avoid holding the
    # whole 4.2 MB file in memory at once; the unused enumerate index is gone.
    for line in f:
        # split() (no argument) strips surrounding whitespace and tolerates
        # repeated separators; list() materializes the transaction because
        # Python 3's map is a one-shot iterator.
        dataList.append(list(map(int, line.split())))

In [138]:
# Time the full Apriori run plus rule generation on the retail dataset
# (support threshold 0.1, confidence threshold 0.2).
start = datetime.datetime.now()

L,supportData = apriori(dataList,0.1)
rule = generateRules(L,supportData,0.2)

end = datetime.datetime.now()
# Wall-clock duration of the run as a datetime.timedelta.
runtime = end - start


frozenset([41]) --> frozenset([48]) conf: 0.603412512546
frozenset([48]) --> frozenset([41]) conf: 0.214026343895
frozenset([39]) --> frozenset([38]) conf: 0.204144055254
frozenset([38]) --> frozenset([39]) conf: 0.663311105412
frozenset([39]) --> frozenset([48]) conf: 0.575076467686
frozenset([48]) --> frozenset([39]) conf: 0.691634033464
frozenset([39]) --> frozenset([41]) conf: 0.225239269857
frozenset([41]) --> frozenset([39]) conf: 0.763733690197

In [139]:
runtime


Out[139]:
datetime.timedelta(0, 321, 831933)

In [152]:
def store(data,path):
    """Write the string `data` to the file at `path`, overwriting any
    existing content."""
    with open(path, 'w') as outfile:
        outfile.write(data)

In [153]:
# Flatten the level-structured frequent itemsets L into plain lists of ints
# (frozensets are not JSON-serializable) and persist them as JSON.
freItem_list = []
for level in L:
    for itemset in level:
        freItem_list.append(list(itemset))
freItem_json = json.dumps(freItem_list,indent=4)
store(freItem_json,'/Users/wy/Desktop/freItem.json')

In [158]:
freItem_list


Out[158]:
[[48], [38], [32], [39], [41], [48, 41], [38, 39], [48, 39], [41, 39]]

In [155]:
# Serialize every itemset's support as a sorted list of "items:support"
# strings and persist it as JSON.
supportData_list = []
# .items() works on both Python 2 and 3; dict.iteritems() was removed in
# Python 3 and raised AttributeError there.
for key, value in supportData.items():
    s = str(list(key))+':'+str(value)
    supportData_list.append(s)
supportData_list.sort()
supportData_json = json.dumps(supportData_list,indent=4)
store(supportData_json,'/Users/wy/Desktop/supportData.json')

In [160]:
# first 10 entries
supportData_list[:10]


Out[160]:
['[0]:0.00200766770264',
 '[10000]:2.26855107643e-05',
 '[10001]:1.13427553821e-05',
 '[10002]:4.53710215285e-05',
 '[10003]:3.40282661464e-05',
 '[10004]:7.9399287675e-05',
 '[10005]:0.000113427553821',
 '[10006]:6.80565322928e-05',
 '[10007]:1.13427553821e-05',
 '[10008]:0.000238197863025']

In [156]:
# Render each rule triple (antecedent, consequent, confidence) as a
# human-readable "a-->b---conf" string and persist the list as JSON.
# NOTE: list(...)[0] assumes single-item antecedents/consequents, which holds
# for the rules printed above.
strongRule = []
for antecedent, consequent, confidence in rule:
    strongRule.append(str(list(antecedent)[0])+'-->'+str(list(consequent)[0])+'---'+str(confidence))
strongRule_json = json.dumps(strongRule,indent=4)
store(strongRule_json,'/Users/wy/Desktop/strongRule.json')

In [161]:
strongRule


Out[161]:
['41-->48---0.603412512546',
 '48-->41---0.214026343895',
 '39-->38---0.204144055254',
 '38-->39---0.663311105412',
 '39-->48---0.575076467686',
 '48-->39---0.691634033464',
 '39-->41---0.225239269857',
 '41-->39---0.763733690197']

In [ ]:
# Reference: "Machine Learning in Action" by Peter Harrington (Apriori chapter)