In [1]:
import numpy as np 
import pandas as pd
import os

# Load the MovieLens ml-100k ratings file (tab-separated, no header row)
file_name = os.path.join(os.getcwd(), "data/ml-100k/u.data")
all_ratings = pd.read_csv(file_name, delimiter="\t", header=None,
                          names=["UserID", "MovieID", "Rating", "Datetime"], encoding="utf-8")
# Timestamps are seconds since the Unix epoch
all_ratings['Datetime'] = pd.to_datetime(all_ratings['Datetime'], unit='s')
all_ratings.iloc[:5]  # .ix is removed in current pandas; use .iloc


Out[1]:
UserID MovieID Rating Datetime
0 196 242 3 1997-12-04 15:55:49
1 186 302 3 1998-04-04 19:22:22
2 22 377 1 1997-11-07 07:18:36
3 244 51 2 1997-11-27 05:02:03
4 166 346 1 1998-02-02 05:33:16

In [2]:
# A rating greater than 3 counts as a favorable review
all_ratings['Favorable'] = all_ratings['Rating'] > 3
all_ratings.loc[:5]  # label-based slicing is inclusive, so this shows rows 0-5


Out[2]:
UserID MovieID Rating Datetime Favorable
0 196 242 3 1997-12-04 15:55:49 False
1 186 302 3 1998-04-04 19:22:22 False
2 22 377 1 1997-11-07 07:18:36 False
3 244 51 2 1997-11-27 05:02:03 False
4 166 346 1 1998-02-02 05:33:16 False
5 298 474 4 1998-01-07 14:20:06 True

In [3]:
# Keep only the ratings from the first 200 users so the dataset stays small
ratings = all_ratings[all_ratings['UserID'].isin(range(200))]
favorable_ratings = ratings[ratings["Favorable"]]
# For each user, the frozenset of movies they reviewed favorably
favorable_reviews_by_users = dict((k, frozenset(v.values))
                                  for k, v in favorable_ratings.groupby("UserID")["MovieID"])

# Count how many favorable reviews each movie received
num_favorable_by_movie = ratings[["MovieID", "Favorable"]].groupby("MovieID").sum()
# The five movies with the most favorable reviews
num_favorable_by_movie.sort_values(by=['Favorable'], ascending=False)[:5]


Out[3]:
Favorable
MovieID
50 100.0
100 89.0
258 83.0
181 79.0
174 74.0

The Apriori algorithm

  • (1) Place each item into its own singleton itemset to form the initial frequent itemsets, keeping only the items that reach the minimum support.
  • (2) Build supersets of the existing frequent itemsets to discover new frequent itemsets, and use these as the new candidate itemsets.
  • (3) Test how frequent each newly generated candidate itemset is, and discard those that are not frequent enough. If no new frequent itemsets are found, skip to the last step.
  • (4) Store the newly discovered frequent itemsets and go back to step (2).
  • (5) Return all of the frequent itemsets that were found (a schematic sketch of this loop is given below).
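
Before walking through the notebook cells, here is a minimal sketch of the whole loop as one function. The names (apriori, transactions) are only illustrative assumptions: transactions is taken to be a list of frozensets of item IDs, and this is not the exact counting scheme used in the cells below.

In [ ]:
from collections import defaultdict

def apriori(transactions, min_support):
    # Step (1): count singleton itemsets and keep those reaching the minimum support
    counts = defaultdict(int)
    for transaction in transactions:
        for item in transaction:
            counts[frozenset((item,))] += 1
    frequent = {itemset: count for itemset, count in counts.items() if count >= min_support}
    all_frequent = dict(frequent)
    # Steps (2)-(4): repeatedly extend the current frequent itemsets by one item
    while frequent:
        counts = defaultdict(int)
        for transaction in transactions:
            # candidate supersets supported by this transaction (each counted once per transaction)
            candidates = set()
            for itemset in frequent:
                if itemset.issubset(transaction):
                    for other in transaction - itemset:
                        candidates.add(itemset | frozenset((other,)))
            for candidate in candidates:
                counts[candidate] += 1
        frequent = {itemset: count for itemset, count in counts.items() if count >= min_support}
        all_frequent.update(frequent)
    # Step (5): return every frequent itemset that was found
    return all_frequent

# Example: apriori([frozenset((1, 2, 3)), frozenset((1, 2)), frozenset((2, 3))], min_support=2)
# finds {1}, {2}, {3}, {1, 2} and {2, 3} together with their support counts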

In [4]:
frequent_itemsets = {}  # discovered frequent itemsets, keyed by itemset length
min_support = 50        # minimum support count for an itemset to be kept

1. We store the discovered frequent itemsets in a dictionary keyed by the length of the itemset, which makes it easy to look them up by length and therefore to retrieve the most recently discovered frequent itemsets. Start by initializing this dictionary.


In [5]:
frequent_itemsets[1] = dict((frozenset((movie_id,)),row['Favorable'])
                        for movie_id,row in num_favorable_by_movie.iterrows()
                        if row['Favorable'] > min_support)
frequent_itemsets[1]


Out[5]:
{frozenset({286}): 59.0,
 frozenset({7}): 67.0,
 frozenset({64}): 58.0,
 frozenset({79}): 58.0,
 frozenset({258}): 83.0,
 frozenset({50}): 100.0,
 frozenset({313}): 60.0,
 frozenset({174}): 74.0,
 frozenset({100}): 89.0,
 frozenset({181}): 79.0,
 frozenset({1}): 66.0,
 frozenset({127}): 70.0,
 frozenset({172}): 59.0,
 frozenset({98}): 70.0,
 frozenset({56}): 67.0,
 frozenset({9}): 53.0}

2. A single function implements steps (2) and (3): it takes the most recently discovered frequent itemsets, builds their supersets, and tests how frequent they are.


In [9]:
from collections import defaultdict
def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    counts = defaultdict(int)
    # Each user ID together with the set of movies they reviewed favorably
    for user, reviews in favorable_reviews_by_users.items():
        # Check whether each previously found itemset is a subset of this user's reviews;
        # if it is, the user has favorably reviewed every movie in that itemset
        for itemset in k_1_itemsets:
            if not itemset.issubset(reviews):
                continue
            # For every movie the user reviewed favorably that is not yet in the itemset,
            # build the superset and update its count
            for other_reviewed_movie in reviews - itemset:
                # "liked these movies, also liked this one": extend the itemset by one movie
                current_superset = itemset | frozenset((other_reviewed_movie,))
                counts[current_superset] += 1
    # Finally, keep only the candidate itemsets that reach the minimum support
    return dict([(itemset, frequency) for itemset, frequency in counts.items() if frequency >= min_support])

In [10]:
import sys
# Grow the itemsets one item at a time until no new frequent itemsets are found
for k in range(2, 20):
    cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users, frequent_itemsets[k-1], min_support)
    if len(cur_frequent_itemsets) == 0:
        print("Did not find any frequent itemsets of length {}".format(k))
        sys.stdout.flush()
        break
    else:
        print("I found {} frequent itemsets of length {}".format(len(cur_frequent_itemsets), k))
        sys.stdout.flush()
    frequent_itemsets[k] = cur_frequent_itemsets


I found 93 frequent itemsets of length 2
I found 295 frequent itemsets of length 3
I found 593 frequent itemsets of length 4
I found 785 frequent itemsets of length 5
I found 677 frequent itemsets of length 6
I found 373 frequent itemsets of length 7
I found 126 frequent itemsets of length 8
I found 24 frequent itemsets of length 9
I found 2 frequent itemsets of length 10
Did not find any frequent itemsets of length 11

In [19]:
#del frequent_itemsets[1]
frequent_itemsets.keys() 
frequent_itemsets[10]
#frequent_itemsets[8]


Out[19]:
{frozenset({1, 7, 50, 56, 64, 79, 98, 172, 174, 181}): 100,
 frozenset({7, 50, 56, 64, 79, 98, 100, 172, 174, 181}): 100}

Extracting association rules

  • If a user likes all of the movies in the premise, they will also like the movie in the conclusion (a toy example of extracting such rules and measuring their confidence is sketched below).
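
As a minimal illustration with invented data (the movie IDs and the toy_reviews dictionary below are made up, not taken from MovieLens), each frequent itemset of length n yields n candidate rules, and a rule's confidence is the fraction of users who like every movie in the premise that also like the conclusion:

In [ ]:
# One frequent itemset {1, 5} and three hypothetical users
toy_itemset = frozenset((1, 5))
toy_reviews = {10: frozenset((1, 5, 7)),   # likes both 1 and 5
               11: frozenset((1, 2)),      # likes 1 but not 5
               12: frozenset((1, 5))}      # likes both 1 and 5

# Each movie in the itemset can play the role of the conclusion; the rest form the premise
toy_rules = [(toy_itemset - frozenset((conclusion,)), conclusion) for conclusion in toy_itemset]

for premise, conclusion in toy_rules:
    applicable = [reviews for reviews in toy_reviews.values() if premise.issubset(reviews)]
    correct = [reviews for reviews in applicable if conclusion in reviews]
    # confidence = users liking premise and conclusion / users liking the premise
    print(premise, "->", conclusion, "confidence:", len(correct) / len(applicable))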

In [20]:
candidate_rules = []
for itemset_length, itemset_counts in frequent_itemsets.items():
    for itemset in itemset_counts.keys():
        # Each movie in the itemset can serve as the conclusion; the remaining movies form the premise
        for conclusion in itemset:
            premise = itemset - set((conclusion,))
            candidate_rules.append((premise, conclusion))
print(candidate_rules[:5])


[(frozenset({79}), 258), (frozenset({258}), 79), (frozenset({50}), 64), (frozenset({64}), 50), (frozenset({127}), 181)]

In [33]:
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
# Iterate over every user and the movies they liked, testing each candidate rule against them
for user, reviews in favorable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if not premise.issubset(reviews):  # the rule only applies if the user liked every premise movie
            continue
        if conclusion in reviews:
            correct_counts[candidate_rule] += 1
        else:
            incorrect_counts[candidate_rule] += 1
# Confidence of a rule = correct / (correct + incorrect)
rule_confidence = {candidate_rule: correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])
                   for candidate_rule in candidate_rules}

In [55]:
# Sort the confidence dictionary and print the five rules with the highest confidence
from operator import itemgetter
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)
print(sorted_confidence[0])
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise, conclusion))
    print(" - Confidence:{0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("")


((frozenset({98, 172, 127, 174, 7}), 64), 1.0)
Rule #1
Rule: If a person recommends frozenset({98, 172, 127, 174, 7}) they will also recommend 64
 - Confidence:1.000

Rule #2
Rule: If a person recommends frozenset({56, 1, 64, 127}) they will also recommend 98
 - Confidence:1.000

Rule #3
Rule: If a person recommends frozenset({64, 100, 181, 174, 79}) they will also recommend 56
 - Confidence:1.000

Rule #4
Rule: If a person recommends frozenset({56, 100, 181, 174, 127}) they will also recommend 50
 - Confidence:1.000

Rule #5
Rule: If a person recommends frozenset({98, 100, 172, 79, 50, 56}) they will also recommend 7
 - Confidence:1.000


In [89]:
# Load the movie metadata (titles, release dates and genres)
movie_filename = os.path.join(os.getcwd(), "data/ml-100k/u.item")
movie_data = pd.read_csv(movie_filename, delimiter="|", header=None, encoding="mac-roman")
movie_data.columns = ["MovieID", "Title", "Release Date",
                      "Video Release", "IMDB", "<UNK>", "Action", "Adventure",
                      "Animation", "Children's", "Comedy", "Crime", "Documentary",
                      "Drama", "Fantasy", "Film-Noir",
                      "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller",
                      "War", "Western"]
movie_data.iloc[:3]  # .ix is removed in current pandas; use .iloc


Out[89]:
MovieID Title Release Date Video Release IMDB <UNK> Action Adventure Animation Children's ... Fantasy Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War Western
0 1 Toy Story (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Toy%20Story%2... 0 0 0 1 1 ... 0 0 0 0 0 0 0 0 0 0
1 2 GoldenEye (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?GoldenEye%20(... 0 1 1 0 0 ... 0 0 0 0 0 0 0 1 0 0
2 3 Four Rooms (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Four%20Rooms%... 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0

3 rows × 24 columns


In [116]:
def get_movie_name(movie_id):
    # Look up a movie's title by its MovieID
    title_obj = movie_data[movie_data['MovieID'] == movie_id]['Title']
    return title_obj.values[0]

print(get_movie_name(98))  # e.g. movie 98


Silence of the Lambs, The (1991)

In [117]:
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_name = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise_name, conclusion_name))
    print(" - Confidence:{0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("")


Rule #1
Rule: If a person recommends Silence of the Lambs, The (1991), Empire Strikes Back, The (1980), Godfather, The (1972), Raiders of the Lost Ark (1981), Twelve Monkeys (1995) they will also recommend Shawshank Redemption, The (1994)
 - Confidence:1.000

Rule #2
Rule: If a person recommends Pulp Fiction (1994), Toy Story (1995), Shawshank Redemption, The (1994), Godfather, The (1972) they will also recommend Silence of the Lambs, The (1991)
 - Confidence:1.000

Rule #3
Rule: If a person recommends Shawshank Redemption, The (1994), Fargo (1996), Return of the Jedi (1983), Raiders of the Lost Ark (1981), Fugitive, The (1993) they will also recommend Pulp Fiction (1994)
 - Confidence:1.000

Rule #4
Rule: If a person recommends Pulp Fiction (1994), Fargo (1996), Return of the Jedi (1983), Raiders of the Lost Ark (1981), Godfather, The (1972) they will also recommend Star Wars (1977)
 - Confidence:1.000

Rule #5
Rule: If a person recommends Silence of the Lambs, The (1991), Fargo (1996), Empire Strikes Back, The (1980), Fugitive, The (1993), Star Wars (1977), Pulp Fiction (1994) they will also recommend Twelve Monkeys (1995)
 - Confidence:1.000


In [ ]: