In [2]:
import numpy as np
import pandas as pd
import os
# Load the CSV data with pandas.
movies_path = os.path.expanduser("~/ml-latest-small/movies.csv")
ratings_path = os.path.expanduser("~/ml-latest-small/ratings.csv")
movies = pd.read_csv(movies_path)
# Drop the unused 'timestamp' dimension while loading the ratings.
ratings = pd.read_csv(ratings_path).drop(columns=['timestamp'])
movies.head()
Out[2]:
In [3]:
# Preview the first rows of ratings (the 'timestamp' column has been dropped).
ratings.head()
Out[3]:
In [4]:
# Replace each movieId in `ratings` with the corresponding movie title.
def replace_name(x):
    """Return the title of the first row of `movies` whose movieId equals x.

    Raises:
        IndexError: if no row of `movies` has that movieId.
    """
    return movies[movies["movieId"] == x].title.values[0]


# Build the id -> title lookup once and map the whole column through it,
# instead of rescanning `movies` for every single rating row.
# drop_duplicates keeps the first row per movieId, matching replace_name().
title_by_id = movies.drop_duplicates("movieId").set_index("movieId")["title"]
unknown_ids = ~ratings.movieId.isin(title_by_id.index)
if unknown_ids.any():
    # Fail loudly (as the per-row lookup would) rather than writing NaN titles.
    # This also triggers if the cell is re-run after movieId already holds titles.
    raise KeyError(
        f"{int(unknown_ids.sum())} ratings reference a movieId not present in movies, "
        f"e.g. {ratings.movieId[unknown_ids].iloc[0]!r}"
    )
ratings.movieId = ratings.movieId.map(title_by_id)
In [5]:
# Preview ratings again: the movieId column now holds movie titles instead of ids.
ratings.head()
Out[5]:
In [6]:
# Build a pivot table: one row per userId, one column per movieId value
# (movie titles at this point), cells holding the rating.
M = pd.pivot_table(
    ratings,
    values='rating',
    index=['userId'],
    columns=['movieId'],
)
In [7]:
# Current dimensions of the pivot table, as (rows, columns).
M.shape
Out[7]:
In [8]:
# M is a very sparse pivot table.
M
Out[8]:
In [14]:
# Algorithm implementation
def pearson(s1, s2):
    """Correlation-style similarity between two rating vectors.

    Each input is centred on its own mean; the result is the sum of the
    products of the centred values divided by the square root of the product
    of their sums of squares.

    Args:
        s1, s2: array-like objects supporting ``.mean()`` and element-wise
            arithmetic (pandas Series in this notebook).

    Returns:
        The similarity score, or 0 when the normalising term is zero.

    NOTE(review): with pandas Series that contain NaN, the mean and each sum
    appear to skip NaN independently, so the numerator and the two variance
    terms may run over different sets of rows -- confirm this is the intended
    similarity measure.
    """
    dev1 = s1 - s1.mean()
    dev2 = s2 - s2.mean()
    norm = np.sqrt(np.sum(dev1 ** 2) * np.sum(dev2 ** 2))
    # A zero norm would mean dividing by zero; report "no correlation" instead.
    return 0 if norm == 0 else np.sum(dev1 * dev2) / norm
In [15]:
# Erin Brockovich vs. Mission: Impossible II
pearson(M['Erin Brockovich (2000)'],M['Mission: Impossible II (2000)'])
# Erin Brockovich vs. The Lord of the Rings
# NOTE(review): the label says "The Lord of the Rings" but the call below uses 'Fingers (1978)' -- confirm which title was intended.
# pearson(M['Erin Brockovich (2000)'],M['Fingers (1978)'])
# Erin Brockovich vs. Harry Potter and the Chamber of Secrets
# pearson(M['Erin Brockovich (2000)'],M['Harry Potter and the Chamber of Secrets (2002)'])
# Harry Potter and the Chamber of Secrets vs. Harry Potter and the Prisoner of Azkaban
# pearson(M['Harry Potter and the Chamber of Secrets (2002)'],M['Harry Potter and the Prisoner of Azkaban (2004)'])
Out[15]:
In [16]:
def get_recs(movie_name, M, num):
    """Return the `num` columns of M that score highest against `movie_name`.

    Every other column of M is scored against ``M[movie_name]`` with
    ``pearson``; columns whose score is NaN are skipped.

    Args:
        movie_name: label of the reference column in M.
        M: DataFrame whose columns are compared (the user x movie rating
            table in this notebook).
        num: maximum number of results to return.

    Returns:
        A list of ``(column label, score)`` tuples sorted by score in
        descending order, at most `num` entries long.

    Raises:
        KeyError: if `movie_name` is not a column of M.
    """
    # Look the reference column up once instead of once per candidate column.
    target = M[movie_name]
    reviews = []
    for title in M.columns:
        if title == movie_name:
            continue
        cor = pearson(target, M[title])
        if not np.isnan(cor):
            reviews.append((title, cor))
    reviews.sort(key=lambda tup: tup[1], reverse=True)
    return reviews[:num]
In [22]:
# %%time
# Top 10 recommendations for 'Clerks (1994)'. get_recs already returns at most
# 10 entries here, so the slice below does not drop anything.
recs = get_recs('Clerks (1994)', M, 10)
recs[:10]
Out[22]:
In [23]:
# %%time
# "Anti-recommendations": the ten lowest-scoring titles for 'Clerks (1994)'.
# Ask for as many results as M has columns so the complete ranking is returned
# whatever the dataset size (a hard-coded count would silently truncate the
# ranking on a larger dataset), then take its tail.
anti_recs = get_recs('Clerks (1994)', M, len(M.columns))
anti_recs[-10:]
Out[23]:
In [ ]: