In [1]:
%matplotlib inline
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from __future__ import division
In [2]:
# Read in the movielens data
# Every '<name>.dat' file under dataDir is '::'-delimited and has no header row,
# so the column names are supplied explicitly. A multi-character separator is
# not handled by the C parser, hence engine='python'.
# NOTE(review): no `encoding` is passed; if any of the .dat files is not in the
# reader's default encoding this will fail on Python 3 - confirm against the data.
dataDir = 'ml-1m/'
colNames = {
    'users': ['user_id', 'gender', 'age', 'occupation', 'zip'],
    'movies': ['movie_id', 'title', 'genres'],
    'ratings': ['user_id', 'movie_id', 'rating', 'timestamp'],
}
fnames = ['users', 'movies', 'ratings']
all_data = {}
for fname in fnames:
    fpath = ''.join([dataDir, fname, '.dat'])
    all_data[fname] = pd.read_table(
        fpath,
        delimiter='::',
        header=None,
        names=colNames[fname],
        engine='python',
    )
In [3]:
# Display the trailing rows of the users table.
all_data['users'].tail()
Out[3]:
In [4]:
# Show the first five movies by position. `DataFrame.irow` has been removed
# from pandas; `.iloc` is the positional indexer that replaces it.
all_data['movies'].iloc[:5]
Out[4]:
In [5]:
# Each movie's 'genres' field is a '|'-separated list; collect the distinct
# genre names across all movies into a sorted list.
genr_iter = (set(entry.split('|')) for entry in all_data['movies']['genres'])
genres = list(set.union(*genr_iter))
genres.sort()
In [6]:
# Create a mapping from genre to a power of 2 code so we can use bit masking to test for genre membership
# The i-th genre (in sorted order) gets the code 2**i, i.e. a single set bit.
genreCode = np.power(2, np.arange(len(genres)))
genreMap = {name: code for name, code in zip(genres, genreCode)}
In [7]:
# `reduce` is a builtin only on Python 2; importing it from functools works on
# both Python 2.6+ and Python 3, so this cell no longer raises NameError there.
from functools import reduce
help(reduce)
In [8]:
# Example: combined code for a two-genre string. Summing the per-genre codes
# equals OR-ing them as long as no genre is repeated in the string.
# `sum` replaces `reduce`, which is not a builtin on Python 3.
sum(genreMap[name] for name in 'Comedy|Romance'.split('|'))
Out[8]:
In [9]:
# Encode each movie's '|'-separated genre list as one integer: the sum of the
# power-of-two codes of its genres (equal to their bitwise OR provided a genre
# is not listed twice for the same movie).
# `sum` replaces `reduce`, which is not a builtin on Python 3, and removes the
# inner lambda whose parameter shadowed the outer one.
all_data['movies']['genre code'] = all_data['movies']['genres'].map(
    lambda genre_str: sum(genreMap[name] for name in genre_str.split('|')))
In [10]:
# Display the leading rows of the movies table.
all_data['movies'].head()
Out[10]:
In [11]:
# Time the bitmask membership test: AND the integer 'genre code' column with the Animation code.
%timeit all_data['movies']['genre code'] & genreMap['Animation']
In [12]:
# Time the string-based alternative: a per-row substring test on the raw 'genres' text.
%timeit all_data['movies']['genres'].map(lambda s: 'Animation' in s)
In [13]:
# Join users -> ratings -> movies into one table, letting pandas pick the join
# keys from the column names the tables share.
frame = (
    all_data['users']
    .merge(all_data['ratings'])
    .merge(all_data['movies'])
)
In [14]:
# Number of distinct movie titles present in the merged table.
len(pd.unique(frame['title']))
Out[14]:
In [15]:
# Group ratings by gender and by the Action bit of the genre code (0 when the
# movie is not Action, the Action code otherwise), then count ratings per group.
result = frame['rating'].groupby(
    [frame['gender'], frame['genre code'] & genreMap['Action']])
result.count()
Out[15]:
In [16]:
# Mean rating for each (gender, Action-bit) group held in `result`.
result.mean()
Out[16]:
In [17]:
# Consider all the people who watch films of a particular genre - is there a difference in gender ratings?
def gender_diff(genre, male_index, genre_index):
    """Compare the mean rating of the two gender groups within one genre.

    Reads the module-level ``frame`` for the 'rating' column.

    Parameters
    ----------
    genre
        Unused; kept so existing calls keep working.
    male_index
        Boolean Series aligned with ``frame``; True rows form the male group,
        all remaining rows form the female group.
    genre_index
        Series aligned with ``frame`` marking the rows that belong to the
        genre. May be boolean or an integer bitmask (non-zero means member).

    Returns
    -------
    tuple
        ``(diff, stderr)`` where ``diff`` is male mean minus female mean and
        ``stderr`` is ``sqrt(var_m / n_m + var_f / n_f)``.
    """
    # Normalise the mask to boolean first. Combining a boolean Series with an
    # integer bitmask via `&` is a *bitwise* AND of 1 with the code, which is 0
    # for every code other than 1, silently selecting no rows at all.
    in_genre = genre_index != 0
    ratings = frame['rating']
    male = ratings[male_index & in_genre]
    female = ratings[(~male_index) & in_genre]
    diff = male.mean() - female.mean()
    stderr = np.sqrt(male.var() / male.count() + female.var() / female.count())
    return diff, stderr
In [29]:
# Per-genre results are accumulated here, keyed by genre name.
gender_data = dict()
# Boolean mask over the rows of `frame` whose gender is 'M', plus the number of
# rows on each side of that mask.
male_index = frame['gender'].eq('M')
nFemale = (~male_index).sum()
nMale = male_index.sum()
def genre_index(genre):
    """Return the bitwise AND of each row's 'genre code' with the code of `genre`.

    The result is an integer Series (0 where the bit is clear, the genre's code
    where it is set), not a boolean mask - compare against 0 before combining
    it with boolean Series.
    """
    code = genreMap[genre]
    return frame['genre code'] & code
# For every genre record the male-minus-female rating difference, its standard
# error, and the share of each gender's rows that fall in the genre.
for genre in genres:
    # Convert the genre mask to boolean before combining it with male_index.
    # `bool_series & int_series` is a bitwise AND of 1 with the genre code,
    # which is 0 for every code other than 1 - so an integer mask would make
    # the selections below empty for all but one genre.
    gind = genre_index(genre) != 0
    mu, st = gender_diff(genre, male_index, gind)
    male_prop = (male_index & gind).sum() / nMale
    female_prop = ((~male_index) & gind).sum() / nFemale
    gender_data[genre] = {
        'diff': mu,
        'stderr': st,
        'm': male_prop,
        'f': female_prop,
        'diff pop': male_prop - female_prop,
    }
In [30]:
# Turn the per-genre dict into a table with one row per genre.
gender_frame = pd.DataFrame(gender_data).transpose()
In [31]:
# Flag genres whose absolute difference is at least 1.96 standard errors
# (1.96 is the usual two-sided 5% cutoff for a normal statistic).
gender_frame['is sig'] = gender_frame['diff'].abs() >= gender_frame['stderr'] * 1.96
In [32]:
# Significant genres, largest male-minus-female difference first.
# `DataFrame.sort` has been removed from pandas; `sort_values` replaces it.
gender_frame[gender_frame['is sig']].sort_values('diff', ascending=False)
Out[32]:
In [22]:
# Genres whose difference was not flagged as significant. `~` is the logical
# negation for a boolean mask (as used for `~male_index` elsewhere in this
# notebook); unary `-` is arithmetic negation and is not defined for booleans
# in numpy.
gender_frame[~gender_frame['is sig']]
Out[22]:
In [23]:
# Number of True entries in male_index (the same quantity stored in nMale).
male_index.sum()
Out[23]:
In [24]:
# Number of rows where male_index is False.
nFemale
Out[24]:
In [25]:
# Number of rows where male_index is True.
nMale
Out[25]:
In [48]:
# Cross-tabulate rating values (rows) against gender x Children's-bit (columns),
# with row/column totals included.
ct = pd.crosstab(
    frame['rating'],
    [frame['gender'], genre_index("Children's")],
    margins=True,
)
In [60]:
# Inspect the column labels of the crosstab `ct`.
ct.columns
Out[60]:
In [42]:
# Mean rating for each combination of gender and Horror bit.
grouped = frame.groupby([frame['gender'], genre_index('Horror')])['rating'].mean()
In [43]:
# Display the mean rating per (gender, Horror-bit) group.
grouped
Out[43]:
In [44]:
# Walk the (gender, Horror-bit) groups and print each group's key followed by
# the ratings that fall into it.
horror_groups = frame['rating'].groupby([frame['gender'], genre_index('Horror')])
for name, group in horror_groups:
    print(name)
    print(group)
In [ ]: