In [1]:
from scipy.sparse import csr_matrix

In [2]:
def get_top_k(sparse_row, k):
    """Return the column indices of the k largest stored values of a sparse row.

    Only explicitly stored entries take part in the ranking. Entries with
    equal values keep the order in which they are stored in the row.

    Args:
        sparse_row(csr_matrix) :- sparse matrix with exactly one row
        k(int) :- number of indices wanted, greater than 0
    Returns:
        list of column indices ordered by descending value; it holds fewer
        than k entries when the row stores fewer than k values, and is
        empty when the row stores nothing.
    """
    value_column_pairs = list(zip(sparse_row.data, sparse_row.indices))
    # list.sort is stable, also with reverse=True, so ties stay in stored order.
    value_column_pairs.sort(key=lambda pair: pair[0], reverse=True)
    ranked_columns = [column for _, column in value_column_pairs]
    # Slicing already copes with rows holding fewer than k values (or none).
    return ranked_columns[:k]

In [3]:
def second_order_association_rule(rating_matrice, min_support, min_confidence, rule_alpha=0.05, k_1=10, k_2=20):
    """Mine pairwise (second-order) association rules and derive recommendations.

    Every user is treated as one transaction made of the items that user
    rated with a positive score.

    The score of the ordered pair (i, j) is
        rule_alpha * count(i, j) / count(j) + (1 - rule_alpha) * count(i, j) / count(i)
    where count(i, j) is the number of users who rated both items and
    count(i) the number of users who rated item i. Only pairs whose two items
    both reach the minimum support and whose score reaches min_confidence
    are kept.

    Args:
        rating_matrice(csr_matrix) :- rating matrix of shape n * m, n users
                                    by m items; any 2-dimensional input
                                    accepted by csr_matrix() also works
        min_support(float) :- minimum support, below 1; an item is kept when
                              at least min_support * n users rated it
        min_confidence(float) :- minimum score of a kept rule, below 1
        rule_alpha(float) :- weight of the two confidence directions,
                             between 0 and 1
        k_1(int) :- maximum number of associated items per item
        k_2(int) :- maximum number of recommended items per user
    Returns:
        rule_dict, dict, {item index: [associated item indices, by descending score]}
        rec_dict, dict, {user index: [recommended item indices, by descending score]}
        Items / users without any entry get no key.
    Raises:
        ValueError: if rating_matrice is not 2-dimensional.
    """
    if len(rating_matrice.shape) != 2:
        raise ValueError("rating matrice must be 2-dimensional")
    ratings = csr_matrix(rating_matrice)
    n, m = ratings.shape

    # 0/1 indicator of "user rated item with a positive score".
    rating_binary = (ratings > 0).astype("int64")

    # freq_matrice is m * m: entry (i, j) counts the transactions holding both
    # item i and item j, the diagonal entry (i, i) is the frequency of item i.
    freq_matrice = (rating_binary.T @ rating_binary).tocsr()
    item_counts = freq_matrice.diagonal()

    minimum = min_support * n
    # Items never rated are excluded even when min_support is 0, so the
    # reciprocal below is always finite.
    frequent = (item_counts >= minimum) & (item_counts > 0)
    kept = frequent.nonzero()[0]

    # Diagonal selector that blanks rows and columns of infrequent items
    # without writing explicit zeros into the sparse structure.
    selector = csr_matrix(([1] * len(kept), (kept, kept)), shape=(m, m))
    freq_kept = selector @ freq_matrice @ selector
    # Diagonal matrix holding 1 / count(i) for every kept item.
    scaler_matrice = csr_matrix((1.0 / item_counts[kept], (kept, kept)), shape=(m, m))

    conf_matrice = ((rule_alpha * freq_kept) @ scaler_matrice
                    + ((1 - rule_alpha) * scaler_matrice) @ freq_kept).tocsr()
    # Threshold on the stored values only; this works for any min_confidence.
    conf_matrice.data[conf_matrice.data < min_confidence] = 0
    conf_matrice.eliminate_zeros()
    conf_matrice.sort_indices()
    # NOTE(review): the diagonal score of a kept item is about 1, so each kept
    # item is listed among its own associated items, and items a user already
    # rated are not removed from the recommendations. Left unchanged.

    rule_dict = _top_k_per_row(conf_matrice, k_1)

    # Recommendation score of item j for user u: sum over i of
    # rating(u, i) * score(i, j).
    rec_matrice = (ratings @ conf_matrice).tocsr()
    rec_matrice.sort_indices()
    rec_dict = _top_k_per_row(rec_matrice, k_2)

    return rule_dict, rec_dict


def _top_k_per_row(matrix, k):
    """Map each row of a CSR matrix to the columns of its k largest stored values.

    Rows without a result are left out. Equal values keep their stored order.
    """
    indptr, indices, data = matrix.indptr, matrix.indices, matrix.data
    top = {}
    for row in range(matrix.shape[0]):
        start, stop = indptr[row], indptr[row + 1]
        if start == stop:
            continue
        # Stable sort on the negated values gives a descending order.
        order = (-data[start:stop]).argsort(kind="stable")[:k]
        ranked = indices[start:stop][order].tolist()
        if ranked:
            top[row] = ranked
    return top

In [ ]: