In [1]:
import numpy as np
import pandas as pd
import os

# MovieLens 100k ratings file: tab-separated user/movie/rating/timestamp.
file_name = os.path.join(os.getcwd(), "data", "ml-100k", "u.data")
all_ratings = pd.read_csv(file_name, delimiter="\t", header=None,
                          names=["UserID", "MovieID", "Rating", "Datetime"],
                          encoding="utf-8")
# Timestamps in u.data are Unix epoch seconds.
all_ratings['Datetime'] = pd.to_datetime(all_ratings['Datetime'], unit='s')
# .ix was removed in pandas 1.0; .loc is the label-based (inclusive) equivalent
# on the default RangeIndex, so this still shows rows 0..4.
all_ratings.loc[0:4]
Out[1]:
In [2]:
# A rating strictly greater than 3 counts as a favorable review.
all_ratings['Favorable'] = all_ratings['Rating'] > 3
# .ix was removed in pandas 1.0; .loc[:5] is the label-inclusive equivalent.
all_ratings.loc[:5]
Out[2]:
In [3]:
# Restrict the analysis to the first 200 users as a training subset.
ratings = all_ratings[all_ratings['UserID'].isin(range(200))]
favorable_ratings = ratings[ratings["Favorable"]]
# Map each user id to the frozenset of movie ids they reviewed favorably.
favorable_reviews_by_users = {
    user_id: frozenset(movie_ids.values)
    for user_id, movie_ids in favorable_ratings.groupby("UserID")["MovieID"]
}
# Favorable-review count per movie (booleans sum as 0/1).
num_favorable_by_movie = ratings[["MovieID", "Favorable"]].groupby("MovieID").sum()
# The five most favorably reviewed movies.
num_favorable_by_movie.sort_values(by=['Favorable'], ascending=False, axis=0)[:5]
Out[3]:
The Apriori algorithm
In [4]:
frequent_itemsets = {}  # maps itemset length k -> {frozenset of movie ids: support count}
min_support = 50  # minimum number of supporting users for an itemset to be "frequent"
In [5]:
# Frequent itemsets of length 1: every movie whose favorable-review count
# meets the support threshold. Use >= so the threshold is applied the same
# way as in find_frequent_itemsets (which keeps frequency >= min_support);
# the original `>` silently excluded itemsets with exactly min_support users.
frequent_itemsets[1] = dict(
    (frozenset((movie_id,)), row['Favorable'])
    for movie_id, row in num_favorable_by_movie.iterrows()
    if row['Favorable'] >= min_support
)
frequent_itemsets[1]
Out[5]:
In [9]:
from collections import defaultdict
def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    """Grow frequent itemsets of length k from frequent itemsets of length k-1.

    Parameters
    ----------
    favorable_reviews_by_users : dict
        Maps a user id to the frozenset of movie ids that user liked.
    k_1_itemsets : iterable of frozenset
        The frequent itemsets of length k-1.
    min_support : int
        Minimum number of supporting users for a candidate to be kept.

    Returns
    -------
    dict mapping each frozenset of length k to its support count.
    """
    counts = defaultdict(int)
    for reviews in favorable_reviews_by_users.values():
        for itemset in k_1_itemsets:
            # Only a user who liked every movie in the (k-1)-itemset can
            # support any superset of it.
            if itemset.issubset(reviews):
                # Extend the itemset with each additional movie this user
                # liked; each extension gains one supporting user.
                for extra_movie in reviews - itemset:
                    candidate = itemset | frozenset((extra_movie,))
                    counts[candidate] += 1
    # Keep only the candidates that meet the support threshold.
    return {itemset: freq for itemset, freq in counts.items() if freq >= min_support}
In [10]:
import sys

# Repeatedly grow itemsets of increasing length until no frequent itemset
# of the next length exists.
for k in range(2, 20):
    cur_frequent_itemsets = find_frequent_itemsets(
        favorable_reviews_by_users, frequent_itemsets[k - 1], min_support)
    if not cur_frequent_itemsets:
        print("Did not find any frequent itemsets of length {}".format(k))
        sys.stdout.flush()
        break
    print("I found {} frequent itemsets of length {}".format(len(cur_frequent_itemsets), k))
    sys.stdout.flush()
    frequent_itemsets[k] = cur_frequent_itemsets
In [19]:
# NOTE(review): leftover interactive exploration; the commented lines are
# earlier experiments kept for reference.
#del frequent_itemsets[1]
frequent_itemsets.keys()  # value discarded -- only the last expression is displayed
# Display the frequent itemsets of length 10 (raises KeyError if the growth
# loop stopped before reaching length 10 -- TODO confirm against the run above).
frequent_itemsets[10]
#frequent_itemsets[8]
Out[19]:
In [20]:
# Every frequent itemset yields one candidate rule per member: that member
# becomes the conclusion and the remaining movies form the premise.
candidate_rules = [
    (itemset - {conclusion}, conclusion)
    for itemset_counts in frequent_itemsets.values()
    for itemset in itemset_counts
    for conclusion in itemset
]
print(candidate_rules[:5])
In [33]:
# For every rule, count the users who liked all premise movies and split
# them into those who also liked the conclusion (correct) and those who
# did not (incorrect).
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for reviews in favorable_reviews_by_users.values():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1
# Confidence = correct / (correct + incorrect) per rule.
rule_confidence = {
    rule: correct_counts[rule] / float(correct_counts[rule] + incorrect_counts[rule])
    for rule in candidate_rules
}
In [55]:
# 对置信度字典进行排序后,输出置信度最高的前五条规则
# Sort the rules by confidence (highest first) and print the top five.
from operator import itemgetter
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    # Fixed the user-facing typo "alsorecommend"; also removed the stray
    # debug prints of sorted_confidence[0] and the bare loop index.
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise, conclusion))
    print(" - Confidence:{0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("")
In [89]:
#加载电影信息
# Load the movie metadata; u.item is pipe-separated and mac-roman encoded.
# os.path.join fixes the doubled slash in the original "data//ml-100k" path.
movie_filename = os.path.join(os.getcwd(), "data", "ml-100k", "u.item")
movie_data = pd.read_csv(movie_filename, delimiter="|", header=None, encoding="mac-roman")
movie_data.columns = ["MovieID", "Title", "Release Date",
                      "Video Release", "IMDB", "<UNK>", "Action", "Adventure",
                      "Animation", "Children's", "Comedy", "Crime", "Documentary",
                      "Drama", "Fantasy", "Film-Noir",
                      "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller",
                      "War", "Western"]
# .ix was removed in pandas 1.0; .loc is the label-inclusive equivalent.
movie_data.loc[0:2]
Out[89]:
In [116]:
def get_movie_name(movie_id):
    """Return the title of the movie whose MovieID equals movie_id.

    Looks the id up in the notebook-global movie_data frame; raises
    IndexError when no row matches.
    """
    matching_titles = movie_data[movie_data['MovieID'] == movie_id]['Title']
    return matching_titles.values[0]
In [117]:
# Print the top five rules with human-readable movie titles.
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_name = ",".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    # Bug fix: the original printed premise_name in both placeholders and
    # never used conclusion_name; also fixed the "alsorecommend" typo.
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise_name, conclusion_name))
    print(" - Confidence:{0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("")
In [ ]: