In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#column headers for the dataset
data_cols = ['user id','movie id','rating','timestamp']
item_cols = ['movie id','movie title','release date','video release date','IMDb URL','unknown','Action',
'Adventure','Animation','Childrens','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror',
'Musical','Mystery','Romance ','Sci-Fi','Thriller','War' ,'Western']
user_cols = ['user id','age','gender','occupation','zip code']

#importing the data files onto dataframes
users = pd.read_csv('ml-100k/u.user', sep='|', names=user_cols, encoding='latin-1')
item = pd.read_csv('ml-100k/u.item', sep='|', names=item_cols, encoding='latin-1')
data = pd.read_csv('ml-100k/u.data', sep='\t', names=data_cols, encoding='latin-1')

In [12]:
#printing the head of these dataframes
print(users.head())
print(item.head())
print(data.head())


   user id  age gender  occupation zip code
0        1   24      M  technician    85711
1        2   53      F       other    94043
2        3   23      M      writer    32067
3        4   24      M  technician    43537
4        5   33      F       other    15213
   movie id        movie title release date  video release date  \
0         1   Toy Story (1995)  01-Jan-1995                 NaN   
1         2   GoldenEye (1995)  01-Jan-1995                 NaN   
2         3  Four Rooms (1995)  01-Jan-1995                 NaN   
3         4  Get Shorty (1995)  01-Jan-1995                 NaN   
4         5     Copycat (1995)  01-Jan-1995                 NaN   

                                            IMDb URL  unknown  Action  \
0  http://us.imdb.com/M/title-exact?Toy%20Story%2...        0       0   
1  http://us.imdb.com/M/title-exact?GoldenEye%20(...        0       1   
2  http://us.imdb.com/M/title-exact?Four%20Rooms%...        0       0   
3  http://us.imdb.com/M/title-exact?Get%20Shorty%...        0       1   
4  http://us.imdb.com/M/title-exact?Copycat%20(1995)        0       0   

   Adventure  Animation  Childrens   ...     Fantasy  Film-Noir  Horror  \
0          0          1          1   ...           0          0       0   
1          1          0          0   ...           0          0       0   
2          0          0          0   ...           0          0       0   
3          0          0          0   ...           0          0       0   
4          0          0          0   ...           0          0       0   

   Musical  Mystery  Romance   Sci-Fi  Thriller  War  Western  
0        0        0         0       0         0    0        0  
1        0        0         0       0         1    0        0  
2        0        0         0       0         1    0        0  
3        0        0         0       0         0    0        0  
4        0        0         0       0         1    0        0  

[5 rows x 24 columns]
   user id  movie id  rating  timestamp
0      196       242       3  881250949
1      186       302       3  891717742
2       22       377       1  878887116
3      244        51       2  880606923
4      166       346       1  886397596

In [13]:
#Create one data frame from the three
dataset = pd.merge(pd.merge(item, data),users)
print(dataset.head())


   movie id            movie title release date  video release date  \
0         1       Toy Story (1995)  01-Jan-1995                 NaN   
1         4      Get Shorty (1995)  01-Jan-1995                 NaN   
2         5         Copycat (1995)  01-Jan-1995                 NaN   
3         7  Twelve Monkeys (1995)  01-Jan-1995                 NaN   
4         8            Babe (1995)  01-Jan-1995                 NaN   

                                            IMDb URL  unknown  Action  \
0  http://us.imdb.com/M/title-exact?Toy%20Story%2...        0       0   
1  http://us.imdb.com/M/title-exact?Get%20Shorty%...        0       1   
2  http://us.imdb.com/M/title-exact?Copycat%20(1995)        0       0   
3  http://us.imdb.com/M/title-exact?Twelve%20Monk...        0       0   
4     http://us.imdb.com/M/title-exact?Babe%20(1995)        0       0   

   Adventure  Animation  Childrens    ...     Thriller  War  Western  user id  \
0          0          1          1    ...            0    0        0      308   
1          0          0          0    ...            0    0        0      308   
2          0          0          0    ...            1    0        0      308   
3          0          0          0    ...            0    0        0      308   
4          0          0          1    ...            0    0        0      308   

   rating  timestamp  age  gender  occupation  zip code  
0       4  887736532   60       M     retired     95076  
1       5  887737890   60       M     retired     95076  
2       4  887739608   60       M     retired     95076  
3       4  887738847   60       M     retired     95076  
4       5  887736696   60       M     retired     95076  

[5 rows x 31 columns]

In [14]:
ratings_total = dataset.groupby('movie title').size()
print(ratings_total.head())


movie title
'Til There Was You (1997)      9
1-900 (1994)                   5
101 Dalmatians (1996)        109
12 Angry Men (1957)          125
187 (1997)                    41
dtype: int64

In [15]:
ratings_mean = (dataset.groupby('movie title'))['movie title','rating'].mean()
print(ratings_mean.head())


                             rating
movie title                        
'Til There Was You (1997)  2.333333
1-900 (1994)               2.600000
101 Dalmatians (1996)      2.908257
12 Angry Men (1957)        4.344000
187 (1997)                 3.024390

In [16]:
#modify the dataframes so that we can merge the two
ratings_total = pd.DataFrame({'movie title':ratings_total.index,
'total ratings': ratings_total.values})
ratings_mean['movie title'] = ratings_mean.index

In [17]:
final = pd.merge(ratings_mean, ratings_total).sort_values(by = 'total ratings',
ascending= False)
print(final.head())


        rating                movie title  total ratings
1398  4.358491           Star Wars (1977)            583
333   3.803536             Contact (1997)            509
498   4.155512               Fargo (1996)            508
1234  4.007890  Return of the Jedi (1983)            507
860   3.156701           Liar Liar (1997)            485

In [18]:
print(final.describe())


            rating  total ratings
count  1664.000000    1664.000000
mean      3.077018      60.096154
std       0.780418      80.956484
min       1.000000       1.000000
25%       2.665094       7.000000
50%       3.162132      27.000000
75%       3.651808      80.250000
max       5.000000     583.000000

In [19]:
final = final[:300].sort_values(by = 'rating',
ascending = False)
print(final.head())


        rating                       movie title  total ratings
1281  4.466443           Schindler's List (1993)            298
1652  4.466102        Wrong Trousers, The (1993)            118
273   4.456790                 Casablanca (1942)            243
1317  4.445230  Shawshank Redemption, The (1994)            283
1215  4.387560                Rear Window (1954)            209