In [4]:
import pandas as pd
# Directory containing the MovieLens .dat files. The trailing slash matters:
# file names are appended to this string with plain `+` concatenation.
# NOTE(review): absolute, machine-specific path — adjust it to your own checkout.
path = 'E:/git/pydata-book/ch02/movielens/'
In [6]:
# Column labels for users.dat; the file is read with header=None, so the
# labels are supplied here instead of being taken from the first row.
user_names = ['user_id', 'gender', 'age', 'occupation', 'zip']
# '::' is a multi-character separator, which only the Python parsing engine
# handles. Requesting that engine explicitly avoids the ParserWarning pandas
# emits when it has to fall back from the default C engine.
users = pd.read_table(path + 'users.dat', sep='::', header=None,
                      names=user_names, engine='python')
In [7]:
# Column labels applied, in order, to the fields of ratings.dat.
rating_names = [
    'user_id',
    'movie_id',
    'rating',
    'timestamp',
]
In [8]:
# Load ratings.dat, labelling its columns with rating_names.
# The separator is passed by keyword: recent pandas versions accept only the
# file path positionally, so a positional '::' raises TypeError there.
# engine='python' is required for a multi-character separator and silences
# the C-engine fallback warning.
ratings = pd.read_table(path + 'ratings.dat', sep='::', header=None,
                        names=rating_names, engine='python')
In [9]:
# Column labels applied, in order, to the fields of movies.dat.
movie_names = [
    'movie_id',
    'title',
    'genres',
]
In [10]:
# Load movies.dat, labelling its columns with movie_names.
# The separator is passed by keyword: recent pandas versions accept only the
# file path positionally, so a positional '::' raises TypeError there.
# engine='python' is required for a multi-character separator and silences
# the C-engine fallback warning.
# NOTE(review): if this raises UnicodeDecodeError the file is presumably not
# UTF-8 — confirm its encoding and pass e.g. encoding='latin-1'.
movies = pd.read_table(path + 'movies.dat', sep='::', header=None,
                       names=movie_names, engine='python')
In [11]:
# Preview the first five user rows.
users.head(5)
Out[11]:
In [13]:
# Preview the first five rating rows.
ratings.head(5)
Out[13]:
In [14]:
# Preview the first five movie rows.
movies.head(5)
Out[14]:
In [15]:
# Show the whole ratings frame as the cell output.
# NOTE(review): ratings.head() would keep the rendered output short.
ratings
Out[15]:
In [16]:
# Combine the three tables into one frame: ratings joined with users, then the
# result joined with movies. No `on=` is given, so pandas joins on whatever
# column names the two sides share.
data = ratings.merge(users).merge(movies)
In [17]:
# Show the merged frame as the cell output.
# NOTE(review): data.head() would keep the rendered output short.
data
Out[17]:
In [19]:
# Inspect the first merged record as a Series.
# `.ix` has been removed from pandas; `.iloc[0]` selects the first row by
# position. A column-on-column merge returns a fresh 0..n-1 index, so this is
# the same row the old label lookup `.ix[0]` returned.
data.iloc[0]
Out[19]:
In [25]:
# Mean rating of each movie broken down by gender: one row per title, one
# column per gender value.
# `rows=` / `cols=` were the original keyword names and have been removed from
# pandas; the current names are `index=` / `columns=`.
mean_ratings = data.pivot_table('rating', index='title', columns='gender', aggfunc='mean')
In [27]:
# Preview the first five rows of the per-gender mean ratings.
mean_ratings.head(5)
Out[27]:
In [28]:
# Number of rows in `data` for each title (a Series indexed by title).
ratings_by_title = (
    data
    .groupby('title')
    .size()
)
In [29]:
# Preview the first ten per-title counts.
ratings_by_title.head(10)
Out[29]:
In [38]:
# Titles with at least 250 rows in `data`: filter the counts first, then keep
# only the index labels of the surviving entries.
active_titles = ratings_by_title[ratings_by_title >= 250].index
In [43]:
# Show the Index of titles that met the 250-row threshold.
active_titles
Out[43]:
In [44]:
# Keep only the rows whose title is in active_titles.
# `.ix` has been removed from pandas; `.loc` is the label-based selector and
# matches what `.ix` did here, since active_titles holds title labels.
mean_ratings = mean_ratings.loc[active_titles]
In [51]:
# Show the per-gender mean ratings after restricting them to active_titles.
mean_ratings
Out[51]:
In [55]:
# Order the movies by the 'F' column, highest mean rating first.
# `sort_index(by=...)` has been removed from pandas; sorting by a column's
# values is done with `sort_values`.
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)
In [56]:
# Show the ten highest rows of the 'F'-sorted table.
top_female_ratings.head(10)
Out[56]:
In [59]:
# Add a 'diff' column: the 'M' mean minus the 'F' mean for each title.
# Positive values mean the 'M' column is higher, negative the 'F' column.
mean_ratings['diff'] = mean_ratings['M'].sub(mean_ratings['F'])
In [61]:
# Order the movies by the 'diff' column, ascending (most negative first).
# `sort_index(by=...)` has been removed from pandas; sorting by a column's
# values is done with `sort_values`.
sorted_by_diff = mean_ratings.sort_values(by='diff')
In [62]:
# Movies with the largest rating gap that female viewers preferred
# (the first 15 rows of the ascending 'diff' ordering).
sorted_by_diff.head(15)
Out[62]:
In [63]:
# Movies with the largest rating gap that male viewers preferred:
# reverse the row order, then take the first 15 rows.
sorted_by_diff.iloc[::-1].head(15)
Out[63]:
In [65]:
# Standard deviation of the 'rating' column, grouped by movie title.
rating_std_by_title = (
    data
    .groupby('title')['rating']
    .std()
)
In [66]:
# Filter down to the titles in active_titles.
# `.ix` has been removed from pandas; `.loc` is the label-based selector and
# matches what `.ix` did here, since active_titles holds title labels.
rating_std_by_title = rating_std_by_title.loc[active_titles]
In [67]:
# Sort the Series by value in descending order and show the top ten:
# the movies viewers disagreed on most, regardless of gender.
# `Series.order` has been removed from pandas; `sort_values` replaces it.
rating_std_by_title.sort_values(ascending=False).head(10)
Out[67]: