In [1]:
import numpy as np
# linalg:Linear algebra

In [2]:
# Collaborative filtering PPT 搭配服用
# http://www.slideshare.net/ssuserf88631/collaborative-filtering-45262678

In [3]:
#     日式  中式  美式  泰式  韓式
# ---------------------------
# sam  2    0    0    4    4
#john  5    5    5    3    3
# tim  2    4    2    1    2
#以下矩陣可看此 user 對 item 進行評分
#藉由協同過濾 猜出 sam對中式和美式的評價後,推薦sam吃什麼

In [4]:
def loadItemScore():
    """Return a small user-item rating matrix (7 users x 5 items).

    A value of 0 means the user has not rated that item.

    Returns:
        np.ndarray of shape (7, 5), dtype float64.
    """
    tmp = [[4, 4, 0, 2, 2],
           [4, 0, 0, 3, 3],
           [4, 0, 0, 1, 1],
           [1, 1, 1, 2, 0],
           [2, 2, 2, 0, 0],
           [5, 5, 5, 0, 0],
           [1, 1, 1, 0, 0]]
    # np.float was a deprecated alias of the builtin float and was removed
    # in NumPy 1.24; the builtin float gives the same float64 dtype.
    itemScore = np.array(tmp, dtype=float)
    return itemScore

In [5]:
def loadItemScore2():
    """Return a larger, sparser user-item rating matrix (11 x 11).

    A value of 0 means the user has not rated that item.

    Returns:
        np.ndarray of shape (11, 11), dtype float64.
    """
    tmp = [[0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
           [0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
           [0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
           [3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
           [5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
           [0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
           [4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
           [0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
           [0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
           [0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
           [1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0]]
    # np.float was a deprecated alias of the builtin float and was removed
    # in NumPy 1.24; the builtin float gives the same float64 dtype.
    itemScore = np.array(tmp, dtype=float)
    return itemScore

In [6]:
# SVD (singular value decomposition) of the rating matrix.
# A truncated SVD can greatly reduce the storage needed for the matrix.
U, sigma, VT = np.linalg.svd(loadItemScore2())
# The matrix "energy" is sum(sigma**2); keeping 90% of it is the typical rule.
sig2 = sigma**2
print("矩陣總能量為:")
print(sum(sig2))
print("90%矩陣能量:")
print(sum(sig2)*0.9)
print("轉換為二維矩陣能量:")
print(sum(sig2[:2]))
print("轉換三維矩陣能量:")
print(sum(sig2[:3]))
# Two singular values keep too little energy, so three are retained.
# svdMatrix is the rank-3 reconstruction of the original matrix.
# np.mat/np.matrix is deprecated: build a diagonal matrix with np.diag and
# multiply plain ndarrays with the @ operator instead.
Sig3 = np.diag(sigma[:3])
svdMatrix = U[:, :3] @ Sig3 @ VT[:3, :]
# Reference: http://en.wikipedia.org/wiki/Singular_value_decomposition


矩陣總能量為:
542.0
90%矩陣能量:
487.8
轉換為二維矩陣能量:
378.829559511
轉換三維矩陣能量:
500.500289128

In [7]:
# Euclid Similarity
def eulidSim(a,b):
    """Euclidean-distance similarity, mapped into (0, 1].

    Identical vectors give 1.0; the similarity decays toward 0
    as the distance between the vectors grows.
    """
    distance = np.linalg.norm(a - b)
    return 1.0 / (1.0 + distance)

In [8]:
# Adjusted Cosine Similarity 
# 不超過3 Similarity =1
def adjCosSim(a,b):
    """Adjusted cosine similarity rescaled from [-1, 1] to [0, 1].

    Each vector is centred on its own mean before the cosine is taken.
    Vectors shorter than 3 are treated as trivially similar (returns 1).
    """
    if len(a) < 3:
        return 1
    centredA = a - a.mean()
    centredB = b - b.mean()
    cosine = float(np.inner(centredA, centredB)) / (
        np.linalg.norm(centredA) * np.linalg.norm(centredB))
    # Shift/scale the cosine from [-1, 1] into [0, 1].
    return 0.5 + 0.5 * cosine

In [9]:
# Cosine Similarity 
def cosSim(a,b):
    """Cosine similarity rescaled from [-1, 1] to [0, 1]."""
    dot = float(np.inner(a, b))
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    # Shift/scale the cosine from [-1, 1] into [0, 1].
    return 0.5 + 0.5 * (dot / denom)

In [10]:
# Pearson Correlation Coefficient Similarity
# 不超過3 Similarity =1
def pearsSim(a,b):
    """Pearson-correlation similarity rescaled from [-1, 1] to [0, 1].

    Vectors shorter than 3 are treated as trivially similar (returns 1).
    """
    if len(a) < 3:
        return 1
    # For 1-D inputs np.corrcoef builds a 2x2 correlation matrix;
    # the off-diagonal entry is the correlation of a with b.
    correlation = np.corrcoef(a, b, rowvar=0)[0][1]
    return 0.5 + 0.5 * correlation
# 資料來源:http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient

In [11]:
# Pearson Correlation Coefficient Similarity 和 Adjusted Cosine Similarity 比較 
# Pearson Correlation Coefficient 是針對物品的平均 Adjusted Cosine Similarity是針對使用者評價的平均
# In pearson correlation 和 cosine 的比較 pearson correlation會減去平均項,降低bias
# In adjusted cosine correlation 不同使用者可能對物品會有較高或是較低的評分表現 因此減去使用者平均的評分 e.x.(4,5)(1,2)視為行為相似
# 資料來源:http://www.cs.carleton.edu/cs_comps/0607/recommend/recommender/itembased.html

In [12]:
# 概念藉由使用者都評分物品的分數做相似度運算,其結果*該未評分之物品 總和取平均 即為可能的分數
def itemSimRec(itemScore, user, simMethods, item):
    """Estimate the rating `user` would give `item`.

    For every other item the user has rated, the similarity between that
    item and the target item is computed over the users who rated both;
    the estimate is the similarity-weighted average of the user's ratings.

    Args:
        itemScore: (users x items) rating matrix; 0 marks "not rated".
        user: row index of the target user.
        simMethods: similarity function f(a, b) -> float.
        item: column index of the item whose rating is estimated.

    Returns:
        Weighted-average estimated rating, or 0 when no similarity exists.
    """
    numItems = np.shape(itemScore)[1]
    simTotal = 0.0
    ratSimTotal = 0.0
    for other in range(numItems):
        rating = itemScore[user, other]
        # Skip items the user has not rated, and the target item itself.
        if rating == 0 or other == item:
            continue
        # Users who rated both the target item and this one.
        overlap = np.nonzero(np.logical_and(itemScore[:, item] > 0,
                                            itemScore[:, other] > 0))[0]
        if len(overlap) == 0:
            similarity = 0
        else:
            similarity = simMethods(itemScore[overlap, item],
                                    itemScore[overlap, other])
        simTotal += similarity
        ratSimTotal += similarity * rating
    return 0 if simTotal == 0 else ratSimTotal / simTotal

In [13]:
def recommend(itemScore, user, N=3, simMethods=cosSim, estMethod=itemSimRec):
    """Recommend the top-N unrated items for `user`.

    Args:
        itemScore: (users x items) rating matrix; 0 marks "not rated".
        user: row index of the target user.
        N: number of recommendations to return.
        simMethods: similarity function passed through to `estMethod`.
        estMethod: rating estimator f(itemScore, user, simMethods, item).

    Returns:
        List of up to N (item_index, estimated_score) tuples sorted by
        descending score, or the string 'you rated everything' when the
        user has no unrated items.
    """
    # Find the items this user has not rated yet.
    unratedItems = np.nonzero(itemScore[user, :] == 0)[0]
    if len(unratedItems) == 0:
        return 'you rated everything'
    itemScores = []
    for item in unratedItems:
        estimatedScore = estMethod(itemScore, user, simMethods, item)
        itemScores.append((item, estimatedScore))
    # Sort once after all estimates are collected (the original re-sorted
    # the growing list on every loop iteration), best score first.
    return sorted(itemScores, key=lambda x: x[1], reverse=True)[:N]

In [14]:
# SVD分解再還原 只有有多出SVD的部分,其餘和itemSimRec算法相同
def svdItemSimRec(itemScore, user, simMethods, item):
    n = np.shape(itemScore)[1]
    simTotal = 0.0; ratSimTotal = 0.0
    U,Sigma,VT = np.linalg.svd(itemScore)
    Sig4 = np.mat(np.eye(3)*Sigma[:3]) #arrange Sig4 into a diagonal matrix
    xformedItems = U[:,:3] * Sig4 * VT[:3,:]  #create transformed items    
    for i in range(n):
        userItemRating = itemScore[user,i]
        # pass norating item
        if userItemRating == 0 or i==item:
            pass
        else:
            # find same rating item
            sameRatingItem = np.nonzero(np.logical_and(itemScore[:,item] > 0,itemScore[:,i] > 0))[0]
            if len(sameRatingItem) == 0: 
                similarity = 0
            else: 
                similarity = simMethods(itemScore[sameRatingItem,item],itemScore[sameRatingItem,i])
#                 print(itemScore[sameRatingItem,item])
#                 print(itemScore[sameRatingItem,i])
#             print 'the %d and %d similarity is: %f' % (item, i, similarity)
            simTotal += similarity
            ratSimTotal += similarity * userItemRating
    if simTotal == 0: 
        return 0
    else: 
        return ratSimTotal/simTotal

In [15]:
# [(4, 5.0), (9, 5.0), (10, 4.7297297297297298)]
# 物品4 分數5 物品9 分數5 物品10 分數4.7 依序推薦
print("estMethod=itemSimRec")
print("--------------------")
print("simMethods=eulidSim")
print(recommend(loadItemScore2(),4,simMethods=eulidSim,estMethod=itemSimRec))
print("simMethods=cosSim")
print(recommend(loadItemScore2(),4,simMethods=cosSim,estMethod=itemSimRec))
print("simMethods=adjCosSim")
print(recommend(loadItemScore2(),4,simMethods=adjCosSim,estMethod=itemSimRec))
print("simMethods=pearsSim")
print(recommend(loadItemScore2(),4,simMethods=pearsSim,estMethod=itemSimRec))


estMethod=itemSimRec
--------------------
simMethods=eulidSim
[(4, 5.0), (9, 5.0), (10, 4.7297297297297298)]
simMethods=cosSim
[(4, 5.0), (9, 5.0), (10, 4.7999999999999998)]
simMethods=adjCosSim
[(4, 5.0), (9, 5.0), (10, 4.7999999999999998)]
simMethods=pearsSim
[(4, 5.0), (9, 5.0), (10, 4.7999999999999998)]

In [16]:
# 照理說經過SVD分解後,可大幅減少儲存量,但對於矩陣還原可能會失真,故會比itemSimRec中有更多誤差存在
print("estMethod=svdItemSimRec")
print("-----------------------")
print("simMethods=eulidSim")
print(recommend(loadItemScore2(),4,simMethods=eulidSim,estMethod=svdItemSimRec))
print("simMethods=cosSim")
print(recommend(loadItemScore2(),4,simMethods=cosSim,estMethod=svdItemSimRec))
print("simMethods=adjCosSim")
print(recommend(loadItemScore2(),4,simMethods=adjCosSim,estMethod=svdItemSimRec))
print("simMethods=pearsSim")
print(recommend(loadItemScore2(),4,simMethods=pearsSim,estMethod=svdItemSimRec))


estMethod=svdItemSimRec
-----------------------
simMethods=eulidSim
[(4, 5.0), (9, 5.0), (10, 4.7297297297297298)]
simMethods=cosSim
[(4, 5.0), (9, 5.0), (10, 4.7999999999999998)]
simMethods=adjCosSim
[(4, 5.0), (9, 5.0), (10, 4.7999999999999998)]
simMethods=pearsSim
[(4, 5.0), (9, 5.0), (10, 4.7999999999999998)]

In [17]:
# 參考來源 Machine Learning in Action