In [1]:
%matplotlib inline
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from __future__ import division

In [2]:
# Load the three MovieLens '.dat' tables into a dict of DataFrames keyed by table name.
# The files have no header row and use '::' between fields, so the column names are
# supplied here and the python parsing engine is requested explicitly.
dataDir = 'ml-1m/'
fnames = ['users','movies','ratings']
colNames = {
    'users': ['user_id', 'gender', 'age', 'occupation', 'zip'],
    'movies': ['movie_id', 'title', 'genres'],
    'ratings': ['user_id', 'movie_id', 'rating', 'timestamp'],
}
all_data = {}
for fname in fnames:
    fpath = '{0}{1}{2}'.format(dataDir, fname, '.dat')
    all_data[fname] = pd.read_table(
        fpath,
        sep='::',
        header=None,
        names=colNames[fname],
        engine='python',
    )

In [3]:
# Display the last rows of the users table as a quick check of the parsed columns.
all_data['users'].tail()


Out[3]:
user_id gender age occupation zip
6035 6036 F 25 15 32603
6036 6037 F 45 1 76006
6037 6038 F 56 1 14706
6038 6039 F 45 0 01060
6039 6040 M 25 6 11106

In [4]:
# Display the first five movie rows by position.
# `.iloc` is used because `DataFrame.irow` was deprecated and later removed from pandas.
all_data['movies'].iloc[:5]


Out[4]:
movie_id title genres
0 1 Toy Story (1995) Animation|Children's|Comedy
1 2 Jumanji (1995) Adventure|Children's|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama
4 5 Father of the Bride Part II (1995) Comedy

In [5]:
# Each movie's 'genres' field is a '|'-separated list of labels. Turn every field into a
# set, take the union over all movies, and keep the distinct labels in sorted order.
genr_iter = (set(field.split('|')) for field in all_data['movies']['genres'])
genres = list(set.union(*genr_iter))
genres.sort()

In [6]:
# Give every genre its own power-of-two code (1, 2, 4, ...) so that a movie's whole genre
# set fits in one integer and membership can be tested with a bitwise AND.
genreCode = np.power(2, np.arange(len(genres)))
genreMap = {name: code for name, code in zip(genres, genreCode)}

In [7]:
# Look up the signature of reduce; it is used below to fold genre names into one code.
help(reduce)


Help on built-in function reduce in module __builtin__:

reduce(...)
    reduce(function, sequence[, initial]) -> value
    
    Apply a function of two arguments cumulatively to the items of a sequence,
    from left to right, so as to reduce the sequence to a single value.
    For example, reduce(lambda x, y: x+y, [1, 2, 3, 4, 5]) calculates
    ((((1+2)+3)+4)+5).  If initial is present, it is placed before the items
    of the sequence in the calculation, and serves as a default when the
    sequence is empty.


In [8]:
# Fold a '|'-separated genre string into a single bitmask.
# The codes are combined with OR rather than addition: each genre owns one bit, so OR keeps
# the mask correct even if a genre name is repeated, whereas `+` would carry into another bit.
reduce(lambda acc, name: acc | genreMap[name], 'Comedy|Romance'.split('|'), 0)


Out[8]:
8208

In [9]:
# Add a 'genre code' column holding each movie's genres as one integer bitmask.
# Codes are OR-ed together (not summed) so a repeated genre name cannot corrupt the mask,
# and the inner lambda no longer reuses the outer lambda's argument name.
all_data['movies']['genre code'] = all_data['movies']['genres'].map(
    lambda genre_field: reduce(lambda acc, name: acc | genreMap[name], genre_field.split('|'), 0))

In [10]:
# Display the first rows of the movies table, now including the 'genre code' column.
all_data['movies'].head()


Out[10]:
movie_id title genres genre code
0 1 Toy Story (1995) Animation|Children's|Comedy 28
1 2 Jumanji (1995) Adventure|Children's|Fantasy 266
2 3 Grumpier Old Men (1995) Comedy|Romance 8208
3 4 Waiting to Exhale (1995) Comedy|Drama 144
4 5 Father of the Bride Part II (1995) Comedy 16

In [11]:
# Time the genre test done as one bitwise AND over the integer 'genre code' column.
%timeit all_data['movies']['genre code'] & genreMap['Animation']


The slowest run took 5.19 times longer than the fastest. This could mean that an intermediate result is being cached 
1000 loops, best of 3: 238 µs per loop

In [12]:
# Time the same genre test done as a per-row substring search on the raw 'genres' strings.
%timeit all_data['movies']['genres'].map(lambda s: 'Animation' in s)


1000 loops, best of 3: 1.04 ms per loop

In [13]:
# Join the users table with the ratings table, then attach the movie details.
frame = all_data['users'].merge(all_data['ratings']).merge(all_data['movies'])

In [14]:
# Count the distinct movie titles present in the merged frame.
frame['title'].unique().size


Out[14]:
3706

In [15]:
# Group the ratings by gender and by whether the movie carries the Action genre bit.
# The masked code is compared with 0 so the second key is an explicit True/False value
# rather than depending on pandas to coerce the integer result of `&` to bool.
result = frame.groupby(['gender', (frame['genre code'] & genreMap['Action']) != 0])['rating']
result.count()


Out[15]:
gender  genre code
F       False         200790
        True           45650
M       False         541962
        True          211807
Name: rating, dtype: int64

In [16]:
# Mean rating within each (gender, genre-flag) group defined in the previous cell.
result.mean()


Out[16]:
gender  genre code
F       False         3.649948
        True          3.490252
M       False         3.599164
        True          3.491386
Name: rating, dtype: float64

In [17]:
# Consider all the people who watch films of a particular genre - is there a difference in gender ratings?
def gender_diff(genre, male_index, genre_index):
    """Compare mean male and female ratings among the rows selected by a genre mask.

    Reads the ratings from the module-level ``frame['rating']``.

    Parameters
    ----------
    genre : unused; kept so existing calls keep working.
    male_index : boolean Series aligned with ``frame``; rows where it is False are
        treated as female.
    genre_index : Series aligned with ``frame`` that is truthy for rows in the genre.
        Either a boolean mask or the raw integer result of a bitmask AND is accepted.

    Returns
    -------
    tuple
        ``(male mean - female mean, sqrt(var_m / n_m + var_f / n_f))``.
    """
    # Coerce to bool so a non-zero masked code counts as "in the genre". Without this,
    # AND-ing a bool mask with an integer mask does not give a usable row selector on
    # pandas versions that keep the result of `&` as integers.
    in_genre = genre_index.astype(bool)
    ratings = frame['rating']
    male = ratings[male_index & in_genre]
    female = ratings[(~male_index) & in_genre]
    mean_gap = male.mean() - female.mean()
    std_err = np.sqrt(male.var()/male.count() + female.var()/female.count())
    return mean_gap, std_err

In [29]:
# Per-genre results are collected in this dict, keyed by genre name.
gender_data = dict()
# Boolean mask over the rows of `frame`: True where the rating's user is recorded as 'M'.
male_index = (frame['gender'] == 'M')
# Number of rating rows for each side of the mask.
nMale, nFemale = male_index.sum(), (~male_index).sum()
def genre_index(genre):
    """Return a boolean mask over the rows of ``frame`` for one genre.

    ``genre`` is looked up in the module-level ``genreMap`` and its code is ANDed with
    ``frame['genre code']``. The result is compared with 0 so the mask is explicitly
    boolean; the raw result of ``&`` is only boolean on pandas versions that coerce it.
    The returned Series keeps the name 'genre code'.
    """
    return (frame['genre code'] & genreMap[genre]) != 0

# For every genre record the male-minus-female gap in mean rating, its standard error,
# and the share of each gender's ratings that fall in the genre.
# The share calculations rely on true division (`from __future__ import division`).
for genre in genres:
    in_genre = genre_index(genre)
    rating_gap, gap_stderr = gender_diff(genre, male_index, in_genre)
    share_m = (male_index & in_genre).sum() / nMale
    share_f = ((~male_index) & in_genre).sum() / nFemale
    gender_data[genre] = {
        'diff': rating_gap,
        'stderr': gap_stderr,
        'm': share_m,
        'f': share_f,
        'diff pop': share_m - share_f,
    }

In [30]:
# Build a table from the per-genre dict and flip it so each genre becomes a row.
gender_frame = DataFrame(gender_data).transpose()

In [31]:
# Flag genres whose rating gap is at least 1.96 standard errors away from zero
# (1.96 is the two-sided 5% critical value of the normal distribution).
gender_frame['is sig'] = gender_frame['diff'].abs() >= 1.96 * gender_frame['stderr']

In [32]:
# Show the genres flagged as significant, largest rating gap first.
# `sort_values` replaces `DataFrame.sort`, which has been removed from pandas.
gender_frame[gender_frame['is sig']].sort_values('diff', ascending=False)


Out[32]:
diff diff pop f m stderr is sig
Western 0.103208 0.008718 0.014109 0.022827 0.020662 True
Film-Noir 0.074167 0.001601 0.017051 0.018652 0.016861 True
Crime 0.024388 0.016993 0.066718 0.083711 0.009546 True
Sci-Fi 0.019697 0.061143 0.111183 0.172326 0.007803 True
Adventure -0.044753 0.030543 0.110907 0.141450 0.007652 True
Comedy -0.068271 -0.045304 0.390647 0.345343 0.004210 True
Animation -0.083367 -0.008368 0.049590 0.041222 0.011477 True
Fantasy -0.086473 0.001218 0.035376 0.036593 0.013875 True
Romance -0.100317 -0.075108 0.204094 0.128986 0.005835 True
Musical -0.212775 -0.017617 0.054800 0.037184 0.011246 True
Children's -0.213586 -0.019014 0.086500 0.067486 0.009273 True

In [22]:
# Show the genres that were not flagged as significant.
# `~` is the boolean NOT for a mask; unary `-` on a boolean Series is rejected by current
# numpy, and `~` matches how masks are negated elsewhere in this notebook.
gender_frame[~gender_frame['is sig']]


Out[22]:
diff diff pop f m stderr is sig
Action 0.001134 0.095759 0.185238 0.280997 0.005879 False
Documentary -0.017581 0.000048 0.007872 0.007920 0.026968 False
Drama 0.000927 -0.058158 0.398284 0.340125 0.003925 False
Horror 0.015021 0.022537 0.059386 0.081923 0.011466 False
Mystery -0.024539 -0.000412 0.040480 0.040068 0.012599 False
Thriller -0.003675 0.034606 0.163561 0.198167 0.006319 False
War 0.000237 0.015029 0.057186 0.072216 0.010074 False

In [23]:
# Number of True entries in male_index.
male_index.sum()


Out[23]:
753769

In [24]:
# Number of rows where male_index is False, as computed earlier.
nFemale


Out[24]:
246440

In [25]:
# Number of rows where male_index is True, as computed earlier.
nMale


Out[25]:
753769

In [48]:
# Cross-tabulate rating value (rows) against gender and the Children's genre flag
# (columns), with row and column totals included.
ct = pd.crosstab(
    index=frame['rating'],
    columns=[frame['gender'], genre_index("Children's")],
    margins=True,
)

In [60]:
# Inspect the column index that crosstab built for the two column keys and the margins.
ct.columns


Out[60]:
MultiIndex(levels=[[u'All', u'F', u'M'], [False, True, u'']],
           labels=[[1, 1, 2, 2, 0], [0, 1, 0, 1, 2]],
           names=[u'gender', u'genre code'])

In [42]:
# Mean rating for each combination of gender and the Horror genre flag.
grouped = frame.groupby([frame['gender'], genre_index('Horror')])['rating'].mean()

In [43]:
# Display the grouped means computed in the previous cell.
grouped


Out[43]:
gender  genre code
F       False         3.646725
        True          3.202870
M       False         3.600198
        True          3.217891
Name: rating, dtype: float64

In [44]:
# Iterate over the (gender, Horror flag) groups, printing each group key followed by
# the ratings that belong to it.
for key, ratings in frame.groupby([frame['gender'], genre_index('Horror')])['rating']:
    print(key)
    print(ratings)


('F', False)
0     5
5     4
7     5
8     3
19    4
20    4
22    5
23    5
24    5
26    3
27    2
36    4
37    4
39    4
43    5
...
1000176    5
1000177    4
1000182    4
1000184    4
1000186    1
1000188    4
1000192    4
1000196    4
1000197    3
1000198    1
1000199    5
1000200    3
1000201    3
1000202    4
1000207    5
Name: rating, Length: 231805, dtype: int64
('F', True)
104706    4
104708    1
104716    3
104728    1
104732    3
104737    1
104756    2
104758    1
104759    2
104767    4
104774    2
104778    4
104782    3
104783    4
104784    3
...
998891     1
998898     3
998905     1
999276     3
999282     2
999283     2
999550     4
999697     2
999702     1
999841     4
999870     1
999936     4
999942     3
999943     2
1000003    3
Name: rating, Length: 14635, dtype: int64
('M', False)
1     5
2     4
3     4
4     5
6     5
9     5
10    5
11    3
12    4
13    4
14    4
15    4
16    5
17    5
18    5
...
1000181    2
1000183    4
1000185    3
1000187    3
1000189    5
1000190    3
1000191    3
1000193    3
1000194    1
1000195    5
1000203    3
1000204    5
1000205    3
1000206    1
1000208    4
Name: rating, Length: 692018, dtype: int64
('M', True)
104705    3
104707    3
104709    5
104710    3
104711    4
104712    4
104713    3
104714    3
104715    3
104717    3
104718    4
104719    1
104720    4
104721    4
104722    4
...
999937     3
999938     2
999939     2
999940     4
999941     3
1000001    2
1000002    1
1000004    1
1000013    1
1000014    2
1000091    1
1000092    1
1000093    2
1000104    3
1000113    3
Name: rating, Length: 61751, dtype: int64

In [ ]: