In [1]:
import pandas as pd
In [14]:
# Column names for the three '::'-delimited files; none of them is read with a
# header row, so the names are supplied explicitly.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
mnames = ['movie_id', 'title', 'genres']

# engine='python' is requested because the separator is two characters long.
users = pd.read_table(
    'movie/users.dat',
    sep='::',
    header=None,
    names=unames,
    engine='python',
)
ratings = pd.read_table(
    'movie/ratings.dat',
    sep='::',
    header=None,
    names=rnames,
    engine='python',
)
movies = pd.read_table(
    'movie/movies.dat',
    sep='::',
    header=None,
    names=mnames,
    engine='python',
)
In [11]:
# Preview the first five user records.
users.head()
Out[11]:
user_id
gender
age
occupation
zip
0
1
F
1
10
48067
1
2
M
56
16
70072
2
3
M
25
15
55117
3
4
M
45
7
02460
4
5
M
25
20
55455
In [12]:
# Preview the first five rating records.
ratings.head()
Out[12]:
user_id
movie_id
rating
timestamp
0
1
1193
5
978300760
1
1
661
3
978302109
2
1
914
3
978301968
3
1
3408
4
978300275
4
1
2355
5
978824291
In [15]:
# Preview the first five movie records.
movies.head()
Out[15]:
movie_id
title
genres
0
1
Toy Story (1995)
Animation|Children's|Comedy
1
2
Jumanji (1995)
Adventure|Children's|Fantasy
2
3
Grumpier Old Men (1995)
Comedy|Romance
3
4
Waiting to Exhale (1995)
Comedy|Drama
4
5
Father of the Bride Part II (1995)
Comedy
In [16]:
# Join ratings with users, then with movies. No keys are passed, so each merge
# uses the columns the two frames share (user_id first, movie_id second).
data = ratings.merge(users).merge(movies)
In [17]:
# Display the merged ratings/users/movies frame; the rendered output elides
# the middle rows and reports the total shape at the bottom.
data
Out[17]:
user_id
movie_id
rating
timestamp
gender
age
occupation
zip
title
genres
0
1
1193
5
978300760
F
1
10
48067
One Flew Over the Cuckoo's Nest (1975)
Drama
1
2
1193
5
978298413
M
56
16
70072
One Flew Over the Cuckoo's Nest (1975)
Drama
2
12
1193
4
978220179
M
25
12
32793
One Flew Over the Cuckoo's Nest (1975)
Drama
3
15
1193
4
978199279
M
25
7
22903
One Flew Over the Cuckoo's Nest (1975)
Drama
4
17
1193
5
978158471
M
50
1
95350
One Flew Over the Cuckoo's Nest (1975)
Drama
5
18
1193
4
978156168
F
18
3
95825
One Flew Over the Cuckoo's Nest (1975)
Drama
6
19
1193
5
982730936
M
1
10
48073
One Flew Over the Cuckoo's Nest (1975)
Drama
7
24
1193
5
978136709
F
25
7
10023
One Flew Over the Cuckoo's Nest (1975)
Drama
8
28
1193
3
978125194
F
25
1
14607
One Flew Over the Cuckoo's Nest (1975)
Drama
9
33
1193
5
978557765
M
45
3
55421
One Flew Over the Cuckoo's Nest (1975)
Drama
10
39
1193
5
978043535
M
18
4
61820
One Flew Over the Cuckoo's Nest (1975)
Drama
11
42
1193
3
978038981
M
25
8
24502
One Flew Over the Cuckoo's Nest (1975)
Drama
12
44
1193
4
978018995
M
45
17
98052
One Flew Over the Cuckoo's Nest (1975)
Drama
13
47
1193
4
977978345
M
18
4
94305
One Flew Over the Cuckoo's Nest (1975)
Drama
14
48
1193
4
977975061
M
25
4
92107
One Flew Over the Cuckoo's Nest (1975)
Drama
15
49
1193
4
978813972
M
18
12
77084
One Flew Over the Cuckoo's Nest (1975)
Drama
16
53
1193
5
977946400
M
25
0
96931
One Flew Over the Cuckoo's Nest (1975)
Drama
17
54
1193
5
977944039
M
50
1
56723
One Flew Over the Cuckoo's Nest (1975)
Drama
18
58
1193
5
977933866
M
25
2
30303
One Flew Over the Cuckoo's Nest (1975)
Drama
19
59
1193
4
977934292
F
50
1
55413
One Flew Over the Cuckoo's Nest (1975)
Drama
20
62
1193
4
977968584
F
35
3
98105
One Flew Over the Cuckoo's Nest (1975)
Drama
21
80
1193
4
977786172
M
56
1
49327
One Flew Over the Cuckoo's Nest (1975)
Drama
22
81
1193
5
977785864
F
25
0
60640
One Flew Over the Cuckoo's Nest (1975)
Drama
23
88
1193
5
977694161
F
45
1
02476
One Flew Over the Cuckoo's Nest (1975)
Drama
24
89
1193
5
977683596
F
56
9
85749
One Flew Over the Cuckoo's Nest (1975)
Drama
25
95
1193
5
977626632
M
45
0
98201
One Flew Over the Cuckoo's Nest (1975)
Drama
26
96
1193
3
977621789
F
25
16
78028
One Flew Over the Cuckoo's Nest (1975)
Drama
27
99
1193
2
982791053
F
1
10
19390
One Flew Over the Cuckoo's Nest (1975)
Drama
28
102
1193
5
1040737607
M
35
19
20871
One Flew Over the Cuckoo's Nest (1975)
Drama
29
104
1193
2
977546620
M
25
12
00926
One Flew Over the Cuckoo's Nest (1975)
Drama
...
...
...
...
...
...
...
...
...
...
...
1000179
4933
3084
3
962757020
M
25
15
94040
Home Page (1999)
Documentary
1000180
4802
2218
2
1014866656
M
56
1
40601
Juno and Paycock (1930)
Drama
1000181
4812
2308
2
962932391
M
18
14
25301
Detroit 9000 (1973)
Action|Crime
1000182
4874
624
4
962781918
F
25
4
70808
Condition Red (1995)
Action|Drama|Thriller
1000183
5059
1434
4
962484364
M
45
16
22652
Stranger, The (1994)
Action
1000184
5947
1434
4
957190428
F
45
16
97215
Stranger, The (1994)
Action
1000185
5077
1868
3
962417299
M
25
2
20037
Truce, The (1996)
Drama|War
1000186
5944
1868
1
957197520
F
18
10
27606
Truce, The (1996)
Drama|War
1000187
5105
404
3
962337582
M
50
7
18977
Brother Minister: The Assassination of Malcolm...
Documentary
1000188
5185
404
4
963402617
F
35
4
44485
Brother Minister: The Assassination of Malcolm...
Documentary
1000189
5532
404
5
959619841
M
25
17
27408
Brother Minister: The Assassination of Malcolm...
Documentary
1000190
5543
404
3
960127592
M
25
17
97401
Brother Minister: The Assassination of Malcolm...
Documentary
1000191
5220
2543
3
961546137
M
25
7
91436
Six Ways to Sunday (1997)
Comedy
1000192
5754
2543
4
958272316
F
18
1
60640
Six Ways to Sunday (1997)
Comedy
1000193
5227
591
3
961475931
M
18
10
64050
Tough and Deadly (1995)
Action|Drama|Thriller
1000194
5795
591
1
958145253
M
25
1
92688
Tough and Deadly (1995)
Action|Drama|Thriller
1000195
5313
3656
5
960920392
M
56
0
55406
Lured (1947)
Crime
1000196
5328
2438
4
960838075
F
25
4
91740
Outside Ozona (1998)
Drama|Thriller
1000197
5334
3323
3
960796159
F
56
13
46140
Chain of Fools (2000)
Comedy|Crime
1000198
5334
127
1
960795494
F
56
13
46140
Silence of the Palace, The (Saimt el Qusur) (1...
Drama
1000199
5334
3382
5
960796159
F
56
13
46140
Song of Freedom (1936)
Drama
1000200
5420
1843
3
960156505
F
1
19
14850
Slappy and the Stinkers (1998)
Children's|Comedy
1000201
5433
286
3
960240881
F
35
17
45014
Nemesis 2: Nebula (1995)
Action|Sci-Fi|Thriller
1000202
5494
3530
4
959816296
F
35
17
94306
Smoking/No Smoking (1993)
Comedy
1000203
5556
2198
3
959445515
M
45
6
92103
Modulations (1998)
Documentary
1000204
5949
2198
5
958846401
M
18
17
47901
Modulations (1998)
Documentary
1000205
5675
2703
3
976029116
M
35
14
30030
Broken Vessels (1998)
Drama
1000206
5780
2845
1
958153068
M
18
17
92886
White Boys (1999)
Drama
1000207
5851
3607
5
957756608
F
18
20
55410
One Little Indian (1973)
Comedy|Drama|Western
1000208
5938
2909
4
957273353
M
25
1
35401
Five Wives, Three Secretaries and Me (1998)
Documentary
1000209 rows × 10 columns
In [19]:
# Show the row labelled 0 of the merged frame as a Series.
# `.ix` was removed from pandas; `.loc` performs the same label-based lookup.
data.loc[0]
Out[19]:
user_id 1
movie_id 1193
rating 5
timestamp 978300760
gender F
age 1
occupation 10
zip 48067
title One Flew Over the Cuckoo's Nest (1975)
genres Drama
Name: 0, dtype: object
In [22]:
# Mean rating per title (rows) split by gender (columns).
mean_ratings = data.pivot_table(
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
)
In [23]:
# Preview the first five titles of the per-gender mean ratings.
mean_ratings.head()
Out[23]:
gender
F
M
title
$1,000,000 Duck (1971)
3.375000
2.761905
'Night Mother (1986)
3.388889
3.352941
'Til There Was You (1997)
2.675676
2.733333
'burbs, The (1989)
2.793478
2.962085
...And Justice for All (1979)
3.828571
3.689024
In [27]:
# Number of rating rows per title.
ratings_by_title = data.groupby(by='title').size()
In [29]:
# Preview the rating counts of the first ten titles.
ratings_by_title.head(10)
Out[29]:
title
$1,000,000 Duck (1971) 37
'Night Mother (1986) 70
'Til There Was You (1997) 52
'burbs, The (1989) 303
...And Justice for All (1979) 199
1-900 (1994) 2
10 Things I Hate About You (1999) 700
101 Dalmatians (1961) 565
101 Dalmatians (1996) 364
12 Angry Men (1957) 616
dtype: int64
In [30]:
# Titles that received at least 250 ratings.
active_titles = ratings_by_title[ratings_by_title >= 250].index
In [31]:
# Inspect the Index of titles that passed the rating-count filter.
active_titles
Out[31]:
Index([u''burbs, The (1989)', u'10 Things I Hate About You (1999)',
u'101 Dalmatians (1961)', u'101 Dalmatians (1996)',
u'12 Angry Men (1957)', u'13th Warrior, The (1999)',
u'2 Days in the Valley (1996)', u'20,000 Leagues Under the Sea (1954)',
u'2001: A Space Odyssey (1968)', u'2010 (1984)',
...
u'X-Men (2000)', u'Year of Living Dangerously (1982)',
u'Yellow Submarine (1968)', u'You've Got Mail (1998)',
u'Young Frankenstein (1974)', u'Young Guns (1988)',
u'Young Guns II (1990)', u'Young Sherlock Holmes (1985)',
u'Zero Effect (1998)', u'eXistenZ (1999)'],
dtype='object', name=u'title', length=1216)
In [32]:
# Restrict the per-gender mean ratings to the active titles.
# `.ix` was removed from pandas; `.loc` performs the same label-based selection.
mean_ratings = mean_ratings.loc[active_titles]
In [33]:
# Display the per-gender mean ratings after restricting them to active titles.
mean_ratings
Out[33]:
gender
F
M
title
'burbs, The (1989)
2.793478
2.962085
10 Things I Hate About You (1999)
3.646552
3.311966
101 Dalmatians (1961)
3.791444
3.500000
101 Dalmatians (1996)
3.240000
2.911215
12 Angry Men (1957)
4.184397
4.328421
13th Warrior, The (1999)
3.112000
3.168000
2 Days in the Valley (1996)
3.488889
3.244813
20,000 Leagues Under the Sea (1954)
3.670103
3.709205
2001: A Space Odyssey (1968)
3.825581
4.129738
2010 (1984)
3.446809
3.413712
28 Days (2000)
3.209424
2.977707
39 Steps, The (1935)
3.965517
4.107692
54 (1998)
2.701754
2.782178
7th Voyage of Sinbad, The (1958)
3.409091
3.658879
8MM (1999)
2.906250
2.850962
About Last Night... (1986)
3.188679
3.140909
Absent Minded Professor, The (1961)
3.469388
3.446809
Absolute Power (1997)
3.469136
3.327759
Abyss, The (1989)
3.659236
3.689507
Ace Ventura: Pet Detective (1994)
3.000000
3.197917
Ace Ventura: When Nature Calls (1995)
2.269663
2.543333
Addams Family Values (1993)
3.000000
2.878531
Addams Family, The (1991)
3.186170
3.163498
Adventures in Babysitting (1987)
3.455782
3.208122
Adventures of Buckaroo Bonzai Across the 8th Dimension, The (1984)
3.308511
3.402321
Adventures of Priscilla, Queen of the Desert, The (1994)
3.989071
3.688811
Adventures of Robin Hood, The (1938)
4.166667
3.918367
African Queen, The (1951)
4.324232
4.223822
Age of Innocence, The (1993)
3.827068
3.339506
Agnes of God (1985)
3.534884
3.244898
...
...
...
White Men Can't Jump (1992)
3.028777
3.231061
Who Framed Roger Rabbit? (1988)
3.569378
3.713251
Who's Afraid of Virginia Woolf? (1966)
4.029703
4.096939
Whole Nine Yards, The (2000)
3.296552
3.404814
Wild Bunch, The (1969)
3.636364
4.128099
Wild Things (1998)
3.392000
3.459082
Wild Wild West (1999)
2.275449
2.131973
William Shakespeare's Romeo and Juliet (1996)
3.532609
3.318644
Willow (1988)
3.658683
3.453543
Willy Wonka and the Chocolate Factory (1971)
4.063953
3.789474
Witness (1985)
4.115854
3.941504
Wizard of Oz, The (1939)
4.355030
4.203138
Wolf (1994)
3.074074
2.899083
Women on the Verge of a Nervous Breakdown (1988)
3.934307
3.865741
Wonder Boys (2000)
4.043796
3.913649
Working Girl (1988)
3.606742
3.312500
World Is Not Enough, The (1999)
3.337500
3.388889
Wrong Trousers, The (1993)
4.588235
4.478261
Wyatt Earp (1994)
3.147059
3.283898
X-Files: Fight the Future, The (1998)
3.489474
3.493797
X-Men (2000)
3.682310
3.851702
Year of Living Dangerously (1982)
3.951220
3.869403
Yellow Submarine (1968)
3.714286
3.689286
You've Got Mail (1998)
3.542424
3.275591
Young Frankenstein (1974)
4.289963
4.239177
Young Guns (1988)
3.371795
3.425620
Young Guns II (1990)
2.934783
2.904025
Young Sherlock Holmes (1985)
3.514706
3.363344
Zero Effect (1998)
3.864407
3.723140
eXistenZ (1999)
3.098592
3.289086
1216 rows × 2 columns
In [35]:
# Order titles by the mean rating in the 'F' column, highest first.
top_female_ratings = mean_ratings.sort_values('F', ascending=False)
In [36]:
# Ten titles with the highest mean rating in the 'F' column.
top_female_ratings.head(10)
Out[36]:
gender
F
M
title
Close Shave, A (1995)
4.644444
4.473795
Wrong Trousers, The (1993)
4.588235
4.478261
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)
4.572650
4.464589
Wallace & Gromit: The Best of Aardman Animation (1996)
4.563107
4.385075
Schindler's List (1993)
4.562602
4.491415
Shawshank Redemption, The (1994)
4.539075
4.560625
Grand Day Out, A (1992)
4.537879
4.293255
To Kill a Mockingbird (1962)
4.536667
4.372611
Creature Comforts (1990)
4.513889
4.272277
Usual Suspects, The (1995)
4.513317
4.518248
In [37]:
# Add a 'diff' column: 'M' mean minus 'F' mean (positive means 'M' is higher).
mean_ratings['diff'] = mean_ratings['M'].sub(mean_ratings['F'])
In [38]:
# Ascending by 'diff': rows where 'F' exceeds 'M' the most come first.
sorted_by_diff = mean_ratings.sort_values('diff', ascending=True)
In [39]:
# The fifteen titles with the smallest (most negative) 'diff'.
sorted_by_diff.head(15)
Out[39]:
gender
F
M
diff
title
Dirty Dancing (1987)
3.790378
2.959596
-0.830782
Jumpin' Jack Flash (1986)
3.254717
2.578358
-0.676359
Grease (1978)
3.975265
3.367041
-0.608224
Little Women (1994)
3.870588
3.321739
-0.548849
Steel Magnolias (1989)
3.901734
3.365957
-0.535777
Anastasia (1997)
3.800000
3.281609
-0.518391
Rocky Horror Picture Show, The (1975)
3.673016
3.160131
-0.512885
Color Purple, The (1985)
4.158192
3.659341
-0.498851
Age of Innocence, The (1993)
3.827068
3.339506
-0.487561
Free Willy (1993)
2.921348
2.438776
-0.482573
French Kiss (1995)
3.535714
3.056962
-0.478752
Little Shop of Horrors, The (1960)
3.650000
3.179688
-0.470312
Guys and Dolls (1955)
4.051724
3.583333
-0.468391
Mary Poppins (1964)
4.197740
3.730594
-0.467147
Patch Adams (1998)
3.473282
3.008746
-0.464536
In [40]:
# Reverse the row order, then take the fifteen titles with the largest 'diff'.
sorted_by_diff.iloc[::-1].head(15)
Out[40]:
gender
F
M
diff
title
Good, The Bad and The Ugly, The (1966)
3.494949
4.221300
0.726351
Kentucky Fried Movie, The (1977)
2.878788
3.555147
0.676359
Dumb & Dumber (1994)
2.697987
3.336595
0.638608
Longest Day, The (1962)
3.411765
4.031447
0.619682
Cable Guy, The (1996)
2.250000
2.863787
0.613787
Evil Dead II (Dead By Dawn) (1987)
3.297297
3.909283
0.611985
Hidden, The (1987)
3.137931
3.745098
0.607167
Rocky III (1982)
2.361702
2.943503
0.581801
Caddyshack (1980)
3.396135
3.969737
0.573602
For a Few Dollars More (1965)
3.409091
3.953795
0.544704
Porky's (1981)
2.296875
2.836364
0.539489
Animal House (1978)
3.628906
4.167192
0.538286
Exorcist, The (1973)
3.537634
4.067239
0.529605
Fright Night (1985)
2.973684
3.500000
0.526316
Barb Wire (1996)
1.585366
2.100386
0.515020
In [43]:
# Standard deviation of the ratings, grouped by title
rating_std_by_title = data['rating'].groupby(data['title']).std()
In [45]:
# Keep only the active titles.
# `.ix` was removed from pandas; `.loc` performs the same label-based selection.
rating_std_by_title = rating_std_by_title.loc[active_titles]
In [49]:
# Sort the Series by value, largest first, and keep the top ten entries
rating_std_by_title = rating_std_by_title.sort_values(ascending=False).head(10)
In [50]:
# Display the ten largest per-title rating standard deviations.
rating_std_by_title
Out[50]:
title
Dumb & Dumber (1994) 1.321333
Blair Witch Project, The (1999) 1.316368
Natural Born Killers (1994) 1.307198
Tank Girl (1995) 1.277695
Rocky Horror Picture Show, The (1975) 1.260177
Eyes Wide Shut (1999) 1.259624
Evita (1996) 1.253631
Billy Madison (1995) 1.249970
Fear and Loathing in Las Vegas (1998) 1.246408
Bicentennial Man (1999) 1.245533
Name: rating, dtype: float64
In [ ]:
Content source: batazor/MyExampleAndExperiments
Similar notebooks: