MovieLens 1M Dataset


In [1]:
import pandas as pd

In [2]:
# Column labels for each raw table; the .dat files are read without a header row.
# users.dat
unames = [
    'user_id',
    'gender',
    'age',
    'occupation',
    'zip',
]
# movies.dat
mnames = [
    'movie_id',
    'title',
    'genre',
]
# ratings.dat
rnames = [
    'user_id',
    'movie_id',
    'rating',
    'timestamp',
]

In [5]:
# Wrapper function to read files
def read_table(fname, colnames, encoding='latin-1'):
    """Read one '::'-delimited file from the ``ml-1m/`` directory into a DataFrame.

    Parameters
    ----------
    fname : str
        File name relative to ``ml-1m/`` (for example ``'users.dat'``).
    colnames : list of str
        Column labels to assign; the file is read with ``header=None``.
    encoding : str, default 'latin-1'
        Text encoding used to decode the file. Non-ASCII titles previously
        rendered with a replacement character (e.g. ``Alien? (1992)``), which
        is consistent with the data being Latin-1 rather than UTF-8. Pass a
        different value if a file is known to use another encoding.

    Returns
    -------
    pandas.DataFrame
        One row per input line, with columns named by ``colnames``.
    """
    # A multi-character separator is treated as a regular expression, which
    # only the python engine handles; selecting it explicitly avoids the
    # fallback warning pandas would otherwise emit.
    return pd.read_table(
        'ml-1m/' + fname,
        sep='::',
        header=None,
        names=colnames,
        engine='python',
        encoding=encoding,
    )

In [7]:
# Load each raw table, pairing every file with its column labels
users = read_table(fname='users.dat', colnames=unames)
movies = read_table(fname='movies.dat', colnames=mnames)
ratings = read_table(fname='ratings.dat', colnames=rnames)

In [8]:
# Preview the first five user records
users.head()


Out[8]:
user_id gender age occupation zip
0 1 F 1 10 48067
1 2 M 56 16 70072
2 3 M 25 15 55117
3 4 M 45 7 02460
4 5 M 25 20 55455

In [9]:
# Preview the first five rating records
ratings.head()


Out[9]:
user_id movie_id rating timestamp
0 1 1193 5 978300760
1 1 661 3 978302109
2 1 914 3 978301968
3 1 3408 4 978300275
4 1 2355 5 978824291

In [10]:
# Preview the first five movie records
movies.head()


Out[10]:
movie_id title genre
0 1 Toy Story (1995) Animation|Children's|Comedy
1 2 Jumanji (1995) Adventure|Children's|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama
4 5 Father of the Bride Part II (1995) Comedy

In [11]:
# Combine all three tables into one frame: attach each user's ratings via
# user_id, then attach the movie details via movie_id. The join keys are named
# explicitly rather than inferred from overlapping column names, so the result
# does not silently change if a table ever gains another same-named column.
data = users.merge(ratings, on='user_id').merge(movies, on='movie_id')

In [13]:
# Preview the first five rows of the merged frame
data.head()


Out[13]:
user_id gender age occupation zip movie_id rating timestamp title genre
0 1 F 1 10 48067 1193 5 978300760 One Flew Over the Cuckoo's Nest (1975) Drama
1 2 M 56 16 70072 1193 5 978298413 One Flew Over the Cuckoo's Nest (1975) Drama
2 12 M 25 12 32793 1193 4 978220179 One Flew Over the Cuckoo's Nest (1975) Drama
3 15 M 25 7 22903 1193 4 978199279 One Flew Over the Cuckoo's Nest (1975) Drama
4 17 M 50 1 95350 1193 5 978158471 One Flew Over the Cuckoo's Nest (1975) Drama

In [20]:
# Average rating per title, with one column per gender.
# pivot_table performs the grouping and the reshape in a single step.
mean_ratings = data.pivot_table(
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
)

In [21]:
# Preview the first five titles of the per-gender means
mean_ratings.head()


Out[21]:
gender F M
title
$1,000,000 Duck (1971) 3.375000 2.761905
'Night Mother (1986) 3.388889 3.352941
'Til There Was You (1997) 2.675676 2.733333
'burbs, The (1989) 2.793478 2.962085
...And Justice for All (1979) 3.828571 3.689024

In [22]:
# Count how many ratings each title received
ratings_by_title = data.groupby(by='title').size()
ratings_by_title.head(10)


Out[22]:
title
$1,000,000 Duck (1971)                37
'Night Mother (1986)                  70
'Til There Was You (1997)             52
'burbs, The (1989)                   303
...And Justice for All (1979)        199
1-900 (1994)                           2
10 Things I Hate About You (1999)    700
101 Dalmatians (1961)                565
101 Dalmatians (1996)                364
12 Angry Men (1957)                  616
dtype: int64

In [23]:
# Keep only the titles that received at least 250 ratings
active_titles = ratings_by_title[ratings_by_title >= 250].index

In [24]:
# Inspect the index of titles that passed the rating-count filter
active_titles


Out[24]:
Index([u''burbs, The (1989)', u'10 Things I Hate About You (1999)', u'101 Dalmatians (1961)', u'101 Dalmatians (1996)', u'12 Angry Men (1957)', u'13th Warrior, The (1999)', u'2 Days in the Valley (1996)', u'20,000 Leagues Under the Sea (1954)', u'2001: A Space Odyssey (1968)', u'2010 (1984)', u'28 Days (2000)', u'39 Steps, The (1935)', u'54 (1998)', u'7th Voyage of Sinbad, The (1958)', u'8MM (1999)', u'About Last Night... (1986)', u'Absent Minded Professor, The (1961)', u'Absolute Power (1997)', u'Abyss, The (1989)', u'Ace Ventura: Pet Detective (1994)', u'Ace Ventura: When Nature Calls (1995)', u'Addams Family Values (1993)', u'Addams Family, The (1991)', u'Adventures in Babysitting (1987)', u'Adventures of Buckaroo Bonzai Across the 8th Dimension, The (1984)', u'Adventures of Priscilla, Queen of the Desert, The (1994)', u'Adventures of Robin Hood, The (1938)', u'African Queen, The (1951)', u'Age of Innocence, The (1993)', u'Agnes of God (1985)', u'Air America (1990)', u'Air Force One (1997)', u'Airplane II: The Sequel (1982)', u'Airplane! 
(1980)', u'Akira (1988)', u'Aladdin (1992)', u'Alice in Wonderland (1951)', u'Alien (1979)', u'Alien Nation (1988)', u'Alien: Resurrection (1997)', u'Aliens (1986)', u'Alien� (1992)', u'Alive (1993)', u'All About Eve (1950)', u'All About My Mother (Todo Sobre Mi Madre) (1999)', u'All Quiet on the Western Front (1930)', u'All That Jazz (1979)', u'Almost Famous (2000)', u'Amadeus (1984)', u'American Beauty (1999)', u'American Gigolo (1980)', u'American Graffiti (1973)', u'American History X (1998)', u'American Movie (1999)', u'American Pie (1999)', u'American President, The (1995)', u'American Psycho (2000)', u'American Tail, An (1986)', u'American Werewolf in London, An (1981)', u'American Werewolf in Paris, An (1997)', u'American in Paris, An (1951)', u'Amistad (1997)', u'Amityville Horror, The (1979)', u'Anaconda (1997)', u'Analyze This (1999)', u'Anastasia (1997)', u'And Now for Something Completely Different (1971)', u'Angel Heart (1987)', u'Animal House (1978)', u'Anna and the King (1999)', u'Annie Hall (1977)', u'Antz (1998)', u'Any Given Sunday (1999)', u'Apartment, The (1960)', u'Apocalypse Now (1979)', u'Apollo 13 (1995)', u'Apostle, The (1997)', u'Arachnophobia (1990)', u'Aristocats, The (1970)', u'Arlington Road (1999)', u'Armageddon (1998)', u'Army of Darkness (1993)', u'Around the World in 80 Days (1956)', u'Arrival, The (1996)', u'Arsenic and Old Lace (1944)', u'Arthur (1981)', u'As Good As It Gets (1997)', u'Astronaut's Wife, The (1999)', u'Atlantic City (1980)', u'Auntie Mame (1958)', u'Austin Powers: International Man of Mystery (1997)', u'Austin Powers: The Spy Who Shagged Me (1999)', u'Avengers, The (1998)', u'Awakenings (1990)', u'Babe (1995)', u'Babe: Pig in the City (1998)', u'Bachelor Party (1984)', u'Bachelor, The (1999)', u'Back to School (1986)', u'Back to the Future (1985)', ...], dtype='object')

In [25]:
# Restrict the per-gender means to the titles in active_titles.
# .loc is the label-based indexer; .ix was deprecated in pandas 0.20 and
# removed in pandas 1.0.
mean_ratings = mean_ratings.loc[active_titles]

In [26]:
# Display the filtered per-gender means (pandas abbreviates long frames when rendering)
mean_ratings


Out[26]:
gender F M
title
'burbs, The (1989) 2.793478 2.962085
10 Things I Hate About You (1999) 3.646552 3.311966
101 Dalmatians (1961) 3.791444 3.500000
101 Dalmatians (1996) 3.240000 2.911215
12 Angry Men (1957) 4.184397 4.328421
13th Warrior, The (1999) 3.112000 3.168000
2 Days in the Valley (1996) 3.488889 3.244813
20,000 Leagues Under the Sea (1954) 3.670103 3.709205
2001: A Space Odyssey (1968) 3.825581 4.129738
2010 (1984) 3.446809 3.413712
28 Days (2000) 3.209424 2.977707
39 Steps, The (1935) 3.965517 4.107692
54 (1998) 2.701754 2.782178
7th Voyage of Sinbad, The (1958) 3.409091 3.658879
8MM (1999) 2.906250 2.850962
About Last Night... (1986) 3.188679 3.140909
Absent Minded Professor, The (1961) 3.469388 3.446809
Absolute Power (1997) 3.469136 3.327759
Abyss, The (1989) 3.659236 3.689507
Ace Ventura: Pet Detective (1994) 3.000000 3.197917
Ace Ventura: When Nature Calls (1995) 2.269663 2.543333
Addams Family Values (1993) 3.000000 2.878531
Addams Family, The (1991) 3.186170 3.163498
Adventures in Babysitting (1987) 3.455782 3.208122
Adventures of Buckaroo Bonzai Across the 8th Dimension, The (1984) 3.308511 3.402321
Adventures of Priscilla, Queen of the Desert, The (1994) 3.989071 3.688811
Adventures of Robin Hood, The (1938) 4.166667 3.918367
African Queen, The (1951) 4.324232 4.223822
Age of Innocence, The (1993) 3.827068 3.339506
Agnes of God (1985) 3.534884 3.244898
... ... ...
White Men Can't Jump (1992) 3.028777 3.231061
Who Framed Roger Rabbit? (1988) 3.569378 3.713251
Who's Afraid of Virginia Woolf? (1966) 4.029703 4.096939
Whole Nine Yards, The (2000) 3.296552 3.404814
Wild Bunch, The (1969) 3.636364 4.128099
Wild Things (1998) 3.392000 3.459082
Wild Wild West (1999) 2.275449 2.131973
William Shakespeare's Romeo and Juliet (1996) 3.532609 3.318644
Willow (1988) 3.658683 3.453543
Willy Wonka and the Chocolate Factory (1971) 4.063953 3.789474
Witness (1985) 4.115854 3.941504
Wizard of Oz, The (1939) 4.355030 4.203138
Wolf (1994) 3.074074 2.899083
Women on the Verge of a Nervous Breakdown (1988) 3.934307 3.865741
Wonder Boys (2000) 4.043796 3.913649
Working Girl (1988) 3.606742 3.312500
World Is Not Enough, The (1999) 3.337500 3.388889
Wrong Trousers, The (1993) 4.588235 4.478261
Wyatt Earp (1994) 3.147059 3.283898
X-Files: Fight the Future, The (1998) 3.489474 3.493797
X-Men (2000) 3.682310 3.851702
Year of Living Dangerously (1982) 3.951220 3.869403
Yellow Submarine (1968) 3.714286 3.689286
You've Got Mail (1998) 3.542424 3.275591
Young Frankenstein (1974) 4.289963 4.239177
Young Guns (1988) 3.371795 3.425620
Young Guns II (1990) 2.934783 2.904025
Young Sherlock Holmes (1985) 3.514706 3.363344
Zero Effect (1998) 3.864407 3.723140
eXistenZ (1999) 3.098592 3.289086

1216 rows × 2 columns


In [28]:
# Rank titles by mean female rating, highest first.
# sort_values is the supported way to order rows by a column;
# sort_index(by=...) is no longer available in current pandas.
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)
top_female_ratings[:10]


Out[28]:
gender F M
title
Close Shave, A (1995) 4.644444 4.473795
Wrong Trousers, The (1993) 4.588235 4.478261
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) 4.572650 4.464589
Wallace & Gromit: The Best of Aardman Animation (1996) 4.563107 4.385075
Schindler's List (1993) 4.562602 4.491415
Shawshank Redemption, The (1994) 4.539075 4.560625
Grand Day Out, A (1992) 4.537879 4.293255
To Kill a Mockingbird (1962) 4.536667 4.372611
Creature Comforts (1990) 4.513889 4.272277
Usual Suspects, The (1995) 4.513317 4.518248

In [31]:
# Rating disagreement: absolute gap between the male and female mean ratings
mean_ratings['diff'] = (mean_ratings['M'] - mean_ratings['F']).abs()

In [32]:
# Titles with the largest male/female gap first.
# sort_values is the supported way to order rows by a column;
# sort_index(by=...) is no longer available in current pandas.
sorted_by_diff = mean_ratings.sort_values(by='diff', ascending=False)
sorted_by_diff[:10]


Out[32]:
gender F M diff
title
Dirty Dancing (1987) 3.790378 2.959596 0.830782
Good, The Bad and The Ugly, The (1966) 3.494949 4.221300 0.726351
Kentucky Fried Movie, The (1977) 2.878788 3.555147 0.676359
Jumpin' Jack Flash (1986) 3.254717 2.578358 0.676359
Dumb & Dumber (1994) 2.697987 3.336595 0.638608
Longest Day, The (1962) 3.411765 4.031447 0.619682
Cable Guy, The (1996) 2.250000 2.863787 0.613787
Evil Dead II (Dead By Dawn) (1987) 3.297297 3.909283 0.611985
Grease (1978) 3.975265 3.367041 0.608224
Hidden, The (1987) 3.137931 3.745098 0.607167

In [33]:
# Another view of disagreement: the standard deviation of each title's
# ratings across all users
ratings_std_by_title = data['rating'].groupby(data['title']).std()

In [35]:
# Restrict the per-title standard deviations to the titles in active_titles.
# .loc is the label-based indexer; .ix was deprecated in pandas 0.20 and
# removed in pandas 1.0.
ratings_std_by_title = ratings_std_by_title.loc[active_titles]

In [36]:
# Ten titles with the widest spread of ratings.
# Series.sort_values replaces Series.order, which is no longer available
# in current pandas.
ratings_std_by_title.sort_values(ascending=False)[:10]


Out[36]:
title
Dumb & Dumber (1994)                     1.321333
Blair Witch Project, The (1999)          1.316368
Natural Born Killers (1994)              1.307198
Tank Girl (1995)                         1.277695
Rocky Horror Picture Show, The (1975)    1.260177
Eyes Wide Shut (1999)                    1.259624
Evita (1996)                             1.253631
Billy Madison (1995)                     1.249970
Fear and Loathing in Las Vegas (1998)    1.246408
Bicentennial Man (1999)                  1.245533
Name: rating, dtype: float64

In [ ]: