notebook.community

Edit and run



In [1]:

    
from __future__ import division

%matplotlib inline
from functools import reduce

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import Series, DataFrame



In [2]:

    
# Load the movielens tables (users, movies, ratings) from dataDir into a dict keyed by table name
dataDir = 'ml-1m/'
fnames = ['users', 'movies', 'ratings']
colNames = {
    'users': ['user_id', 'gender', 'age', 'occupation', 'zip'],
    'movies': ['movie_id', 'title', 'genres'],
    'ratings': ['user_id', 'movie_id', 'rating', 'timestamp'],
}
all_data = {}
for fname in fnames:
    fpath = '{0}{1}.dat'.format(dataDir, fname)
    # The files carry no header row, so column names are supplied from colNames
    all_data[fname] = pd.read_table(
        fpath,
        delimiter='::',
        header=None,
        names=colNames[fname],
        engine='python')



In [3]:

    
# Show the last rows of the users table
all_data['users'].tail()



In [4]:

    
# First five movie rows by position. `.iloc` replaces `DataFrame.irow`, which no longer
# exists in current pandas releases.
all_data['movies'].iloc[:5]









    Out[4]:






  
    
      
      movie_id
      title
      genres
    
  
  
    
      0
       1
                         Toy Story (1995)
        Animation|Children's|Comedy
    
    
      1
       2
                           Jumanji (1995)
       Adventure|Children's|Fantasy
    
    
      2
       3
                  Grumpier Old Men (1995)
                     Comedy|Romance
    
    
      3
       4
                 Waiting to Exhale (1995)
                       Comedy|Drama
    
    
      4
       5
       Father of the Bride Part II (1995)
                             Comedy



In [5]:

    
# Gather the distinct genre labels found in the '|'-separated 'genres' column, sorted alphabetically
genr_iter = (set(x.split('|')) for x in all_data['movies']['genres'])
# Start the union from an empty set: `set.union(*genr_iter)` raises TypeError when there are no rows
genres = sorted(set().union(*genr_iter))



In [6]:

    
# Give every genre its own power-of-two code (1, 2, 4, ...) so that a movie's genres can be
# packed into one integer and genre membership tested with a bit mask
genreCode = np.power(2, np.arange(len(genres)))
genreMap = {label: code for label, code in zip(genres, genreCode)}



In [7]:

    
# Print the documentation for reduce, which the following cells use to fold genre names into one code
help(reduce)









    



Help on built-in function reduce in module __builtin__:

reduce(...)
    reduce(function, sequence[, initial]) -> value
    
    Apply a function of two arguments cumulatively to the items of a sequence,
    from left to right, so as to reduce the sequence to a single value.
    For example, reduce(lambda x, y: x+y, [1, 2, 3, 4, 5]) calculates
    ((((1+2)+3)+4)+5).  If initial is present, it is placed before the items
    of the sequence in the calculation, and serves as a default when the
    sequence is empty.



In [8]:

    
# Fold a '|'-separated genre string into one integer mask. Bits are combined with `|` rather
# than `+` so a label that appears twice cannot carry into a neighbouring genre's bit.
reduce(lambda code, name: code | genreMap[name], 'Comedy|Romance'.split('|'), 0)









    Out[8]:





8208



In [9]:

    
# Encode each movie's '|'-separated genre list as a single integer bit mask.
# The two lambdas use distinct argument names (the original reused `s` for both), and bits are
# OR-ed rather than added so a repeated label cannot spill into another genre's bit.
all_data['movies']['genre code'] = all_data['movies']['genres'].map(
    lambda genre_string: reduce(
        lambda code, name: code | genreMap[name], genre_string.split('|'), 0))



In [10]:

    
# Show the first rows of the movies table, now including the 'genre code' column
all_data['movies'].head()









    Out[10]:






  
    
      
      movie_id
      title
      genres
      genre code
    
  
  
    
      0
       1
                         Toy Story (1995)
        Animation|Children's|Comedy
         28
    
    
      1
       2
                           Jumanji (1995)
       Adventure|Children's|Fantasy
        266
    
    
      2
       3
                  Grumpier Old Men (1995)
                     Comedy|Romance
       8208
    
    
      3
       4
                 Waiting to Exhale (1995)
                       Comedy|Drama
        144
    
    
      4
       5
       Father of the Bride Part II (1995)
                             Comedy
         16



In [11]:

    
# Time the bit-mask membership test: bitwise AND of the 'genre code' column with the 'Animation' code
%timeit all_data['movies']['genre code'] & genreMap['Animation']









    



The slowest run took 5.19 times longer than the fastest. This could mean that an intermediate result is being cached 
1000 loops, best of 3: 238 µs per loop



In [12]:

    
# Time the string-based alternative: a per-row substring check against the raw 'genres' text
%timeit all_data['movies']['genres'].map(lambda s: 'Animation' in s)









    



1000 loops, best of 3: 1.04 ms per loop



In [13]:

    
# Join users with their ratings, then attach the movie details, giving one flat table
users_ratings = all_data['users'].merge(all_data['ratings'])
frame = users_ratings.merge(all_data['movies'])



In [14]:

    
# Number of distinct movie titles present in the merged frame
len(frame['title'].unique())









    Out[14]:





3706



In [15]:

    
# Group ratings by gender and by whether the movie carries the 'Action' genre bit.
# Comparing the bitwise AND against zero makes the second grouping key an explicit boolean
# instead of relying on pandas to coerce the integer AND result.
is_action = (frame['genre code'] & genreMap['Action']) != 0
result = frame.groupby(['gender', is_action])['rating']
result.count()









    Out[15]:





gender  genre code
F       False         200790
        True           45650
M       False         541962
        True          211807
Name: rating, dtype: int64



In [16]:

    
# Mean rating for each (gender, Action flag) group
result.mean()









    Out[16]:





gender  genre code
F       False         3.649948
        True          3.490252
M       False         3.599164
        True          3.491386
Name: rating, dtype: float64



In [17]:

    
# Consider all the people who watch films of a particular genre - is there a difference in gender ratings?
def gender_diff(genre, male_index, genre_index):
    """Compare male and female ratings among the rows selected by a genre mask.

    Ratings are read from the notebook-level ``frame['rating']`` column.

    Parameters
    ----------
    genre : str
        Genre label. Not used in the computation; kept so existing callers keep working.
    male_index : boolean Series
        True on the rows of ``frame`` rated by a male user; its negation is treated as female.
    genre_index : Series
        Row mask for the genre. Booleans or integer bit-mask results (non-zero meaning
        "in the genre") are both accepted.

    Returns
    -------
    tuple of (float, float)
        ``male.mean() - female.mean()`` and the standard error of that difference,
        ``sqrt(var_m / n_m + var_f / n_f)``.
    """
    # An integer mask AND-ed with a boolean mask is evaluated bitwise against 0/1, which clears
    # every bit except the lowest. Coerce to bool first so any non-zero value selects the row.
    genre_mask = genre_index.astype(bool)
    female = frame['rating'][(~male_index) & genre_mask]
    male = frame['rating'][male_index & genre_mask]
    diff = male.mean() - female.mean()
    stderr = np.sqrt(male.var() / male.count() + female.var() / female.count())
    return diff, stderr



In [29]:

    
# Per-genre results are accumulated here, keyed by genre label
gender_data = dict()
# Boolean mask over the rows of `frame`: True where the rating came from a male user
male_index = (frame['gender'] == 'M')
# Row counts on each side of the mask; everything not flagged male is counted as female
nMale = male_index.sum()
nFemale = len(male_index) - nMale
def genre_index(genre):
    """Return a boolean Series marking the rows of `frame` whose movie belongs to `genre`.

    Membership is tested by AND-ing the 'genre code' column with the genre's code from
    `genreMap`. The result is compared against zero so callers always get a boolean mask
    that can safely be combined with other boolean masks or used as a grouping key,
    rather than the raw integer AND result.
    """
    return (frame['genre code'] & genreMap[genre]) != 0

# For every genre record: the male-minus-female mean rating and its standard error, the share
# of each gender's ratings that fall in the genre, and the difference between those shares
for genre in genres:
    gind = genre_index(genre)
    mu, st = gender_diff(genre, male_index, gind)
    male_in_genre = male_index & gind
    female_in_genre = (~male_index) & gind
    male_prop = male_in_genre.sum() / nMale
    female_prop = female_in_genre.sum() / nFemale
    gender_data[genre] = {
        'diff': mu,
        'stderr': st,
        'm': male_prop,
        'f': female_prop,
        'diff pop': male_prop - female_prop,
    }



In [30]:

    
# Build a table from the per-genre dicts and transpose it so each genre becomes a row
gender_frame = DataFrame(gender_data).transpose()



In [31]:

    
# Flag genres whose absolute rating difference is at least 1.96 standard errors
# (1.96 is presumably the two-sided 95% normal critical value -- confirm the intended level)
abs_diff = gender_frame['diff'].abs()
gender_frame['is sig'] = abs_diff >= 1.96 * gender_frame['stderr']



In [32]:

    
# Genres flagged as significant, largest male-minus-female difference first.
# `sort_values` replaces `DataFrame.sort`, which no longer exists in current pandas releases.
gender_frame[gender_frame['is sig']].sort_values('diff', ascending=False)



In [22]:

    
# Genres NOT flagged as significant. A boolean mask is inverted with `~`; unary minus on
# booleans is rejected by current numpy/pandas.
gender_frame[~gender_frame['is sig']]









    Out[22]:






  
    
      
      diff
      diff pop
      f
      m
      stderr
      is sig
    
  
  
    
      Action
       0.001134
       0.095759
       0.185238
       0.280997
       0.005879
       False
    
    
      Documentary
      -0.017581
       0.000048
       0.007872
       0.007920
       0.026968
       False
    
    
      Drama
       0.000927
      -0.058158
       0.398284
       0.340125
       0.003925
       False
    
    
      Horror
       0.015021
       0.022537
       0.059386
       0.081923
       0.011466
       False
    
    
      Mystery
      -0.024539
      -0.000412
       0.040480
       0.040068
       0.012599
       False
    
    
      Thriller
      -0.003675
       0.034606
       0.163561
       0.198167
       0.006319
       False
    
    
      War
       0.000237
       0.015029
       0.057186
       0.072216
       0.010074
       False



In [23]:

    
# Number of rows in `frame` where gender == 'M' (the True entries of male_index)
male_index.sum()









    Out[23]:





753769



In [24]:

    
# Number of rows in `frame` where male_index is False
nFemale









    Out[24]:





246440



In [25]:

    
# Stored value of male_index.sum()
nMale









    Out[25]:





753769



In [48]:

    
# Cross-tabulate rating value against (gender, Children's-genre flag), including totals
childrens_mask = genre_index("Children's")
ct = pd.crosstab(frame['rating'], [frame['gender'], childrens_mask], margins=True)



In [60]:

    
# Inspect the column index of the crosstab
ct.columns









    Out[60]:





MultiIndex(levels=[[u'All', u'F', u'M'], [False, True, u'']],
           labels=[[1, 1, 2, 2, 0], [0, 1, 0, 1, 2]],
           names=[u'gender', u'genre code'])



In [42]:

    
# Mean rating for each combination of gender and Horror-genre flag
horror_keys = [frame['gender'], genre_index('Horror')]
grouped = frame['rating'].groupby(horror_keys).mean()



In [43]:

    
# Display the mean rating per (gender, Horror flag) group
grouped









    Out[43]:





gender  genre code
F       False         3.646725
        True          3.202870
M       False         3.600198
        True          3.217891
Name: rating, dtype: float64



In [44]:

    
# Walk the (gender, Horror flag) groups and print each group's key followed by its ratings
horror_groups = frame['rating'].groupby([frame['gender'], genre_index('Horror')])
for name, group in horror_groups:
    print(name)
    print(group)









    



('F', False)
0     5
5     4
7     5
8     3
19    4
20    4
22    5
23    5
24    5
26    3
27    2
36    4
37    4
39    4
43    5
...
1000176    5
1000177    4
1000182    4
1000184    4
1000186    1
1000188    4
1000192    4
1000196    4
1000197    3
1000198    1
1000199    5
1000200    3
1000201    3
1000202    4
1000207    5
Name: rating, Length: 231805, dtype: int64
('F', True)
104706    4
104708    1
104716    3
104728    1
104732    3
104737    1
104756    2
104758    1
104759    2
104767    4
104774    2
104778    4
104782    3
104783    4
104784    3
...
998891     1
998898     3
998905     1
999276     3
999282     2
999283     2
999550     4
999697     2
999702     1
999841     4
999870     1
999936     4
999942     3
999943     2
1000003    3
Name: rating, Length: 14635, dtype: int64
('M', False)
1     5
2     4
3     4
4     5
6     5
9     5
10    5
11    3
12    4
13    4
14    4
15    4
16    5
17    5
18    5
...
1000181    2
1000183    4
1000185    3
1000187    3
1000189    5
1000190    3
1000191    3
1000193    3
1000194    1
1000195    5
1000203    3
1000204    5
1000205    3
1000206    1
1000208    4
Name: rating, Length: 692018, dtype: int64
('M', True)
104705    3
104707    3
104709    5
104710    3
104711    4
104712    4
104713    3
104714    3
104715    3
104717    3
104718    4
104719    1
104720    4
104721    4
104722    4
...
999937     3
999938     2
999939     2
999940     4
999941     3
1000001    2
1000002    1
1000004    1
1000013    1
1000014    2
1000091    1
1000092    1
1000093    2
1000104    3
1000113    3
Name: rating, Length: 61751, dtype: int64



In [ ]:

	user_id	gender	age	occupation	zip
6035	6036	F	25	15	32603
6036	6037	F	45	1	76006
6037	6038	F	56	1	14706
6038	6039	F	45	0	01060
6039	6040	M	25	6	11106

	diff	diff pop	f	m	stderr	is sig
Western	0.103208	0.008718	0.014109	0.022827	0.020662	True
Film-Noir	0.074167	0.001601	0.017051	0.018652	0.016861	True
Crime	0.024388	0.016993	0.066718	0.083711	0.009546	True
Sci-Fi	0.019697	0.061143	0.111183	0.172326	0.007803	True
Adventure	-0.044753	0.030543	0.110907	0.141450	0.007652	True
Comedy	-0.068271	-0.045304	0.390647	0.345343	0.004210	True
Animation	-0.083367	-0.008368	0.049590	0.041222	0.011477	True
Fantasy	-0.086473	0.001218	0.035376	0.036593	0.013875	True
Romance	-0.100317	-0.075108	0.204094	0.128986	0.005835	True
Musical	-0.212775	-0.017617	0.054800	0.037184	0.011246	True
Children's	-0.213586	-0.019014	0.086500	0.067486	0.009273	True

	movie_id	title	genres
0	1	Toy Story (1995)	Animation\|Children's\|Comedy
1	2	Jumanji (1995)	Adventure\|Children's\|Fantasy
2	3	Grumpier Old Men (1995)	Comedy\|Romance
3	4	Waiting to Exhale (1995)	Comedy\|Drama
4	5	Father of the Bride Part II (1995)	Comedy

	diff	diff pop	f	m	stderr	is sig
Action	0.001134	0.095759	0.185238	0.280997	0.005879	False
Documentary	-0.017581	0.000048	0.007872	0.007920	0.026968	False
Drama	0.000927	-0.058158	0.398284	0.340125	0.003925	False
Horror	0.015021	0.022537	0.059386	0.081923	0.011466	False
Mystery	-0.024539	-0.000412	0.040480	0.040068	0.012599	False
Thriller	-0.003675	0.034606	0.163561	0.198167	0.006319	False
War	0.000237	0.015029	0.057186	0.072216	0.010074	False