In [1]:
from scipy.sparse import csr_matrix
In [2]:
def get_top_k(sparse_row, k):
    """Return the column indices of the largest stored values of a one-row matrix.

    Args:
        sparse_row (csr_matrix): sparse matrix with a single row; only its
            ``data`` and ``indices`` arrays are read.
        k (int): maximum number of indices to return; expected to be > 0.

    Returns:
        list: column indices ordered by descending value, at most ``k`` of
        them. Equal values keep their storage order. Empty when the row
        stores no entries.
    """
    value_column_pairs = list(zip(sparse_row.data, sparse_row.indices))
    # list.sort is stable, and reverse=True still keeps tied values in
    # their original relative order.
    value_column_pairs.sort(key=lambda pair: pair[0], reverse=True)
    ranked_columns = [column for _, column in value_column_pairs]
    if not ranked_columns:
        return []
    return ranked_columns[: min(len(ranked_columns), k)]
In [3]:
def second_order_association_rule(rating_matrice, min_support, min_confidence, rule_alpha=0.05, k_1=10, k_2=20):
    """Mine pairwise (second-order) association rules and rank items per user.

    Each user is one transaction made of the items that user rated with a
    positive value. For every pair of frequent items (i, j) the score is

        score[i, j] = rule_alpha * count(i, j) / count(j)
                      + (1 - rule_alpha) * count(i, j) / count(i)

    where count(i, j) is the number of users who rated both items and
    count(i) is the number of users who rated item i.

    Args:
        rating_matrice (csr_matrix): rating matrix of shape (n, m), with
            n users and m items.
        min_support (float): minimum support, below 1; an item is frequent
            when at least ``min_support * n`` users rated it.
        min_confidence (float): minimum score, below 1; pairs scoring lower
            are dropped.
        rule_alpha (float): blend weight between 0 and 1 (see the formula).
        k_1 (int): maximum number of associated items kept per item.
        k_2 (int): maximum number of recommended items kept per user.

    Returns:
        tuple: ``(rule_dict, rec_dict)``.
            rule_dict (dict): {item index: [associated item indices,
                highest score first]}.
            rec_dict (dict): {user index: [recommended item indices,
                highest score first]}.
            Items and users with nothing to list are left out.

    Raises:
        ValueError: if ``rating_matrice`` is not 2-dimensional.
    """
    if len(rating_matrice.shape) != 2:
        raise ValueError("rating matrice must be 2-dimensional")
    # Normalise to CSR so the matrix products and row extraction below
    # behave the same for any 2-D input.
    rating_matrice = csr_matrix(rating_matrice)
    n, m = rating_matrice.shape

    rating_binary = 1 * (rating_matrice > 0)
    # freq_matrice is m x m: entry (i, j) counts the transactions holding
    # both item i and item j; the diagonal (i, i) is the count of item i.
    freq_matrice = rating_binary.transpose().dot(rating_binary)

    minimum = min_support * n
    item_counts = freq_matrice.diagonal()
    # The "> 0" guard avoids dividing by zero when min_support is 0.
    frequent_items = ((item_counts >= minimum) & (item_counts > 0)).nonzero()[0]
    diagonal_index = (frequent_items, frequent_items)

    # Diagonal matrix holding 1 / count(i) for every frequent item i.
    scaler_matrice = csr_matrix((1.0 / item_counts[frequent_items], diagonal_index), shape=(m, m))
    # Diagonal 0/1 mask: multiplying on both sides clears the rows and
    # columns of infrequent items without item assignment on a sparse matrix.
    keep_matrice = csr_matrix(([1.0] * len(frequent_items), diagonal_index), shape=(m, m))
    freq_matrice = keep_matrice.dot(freq_matrice).dot(keep_matrice)

    conf_matrice = (
        rule_alpha * freq_matrice.dot(scaler_matrice)
        + (1 - rule_alpha) * scaler_matrice.dot(freq_matrice)
    )
    # scipy does not support ">=" against 0 on sparse matrices, and a
    # non-positive threshold cannot drop any of the non-negative scores.
    if min_confidence > 0:
        conf_matrice = conf_matrice.multiply(conf_matrice >= min_confidence)
    conf_matrice = csr_matrix(conf_matrice)
    # Make sure explicitly stored zeros are never ranked.
    conf_matrice.eliminate_zeros()

    # NOTE(review): the diagonal is not removed, and score[i, i] is about 1
    # for every frequent item, so an item normally heads its own rule list.
    rule_dict = {}
    for item in range(m):
        top_k = get_top_k(conf_matrice.getrow(item), k_1)
        if top_k:
            rule_dict[item] = top_k

    # User-by-item recommendation scores: ratings propagated through the rules.
    # NOTE(review): items the user already rated are not excluded here.
    rec_matrice = csr_matrix(rating_matrice.dot(conf_matrice))
    rec_dict = {}
    for user in range(n):
        top_k = get_top_k(rec_matrice.getrow(user), k_2)
        if top_k:
            rec_dict[user] = top_k
    return rule_dict, rec_dict
In [ ]: