In [1]:
import os

# Make the repository root the working directory, so that the `src` package
# imported below and the `data` folder resolve relative to it.
# A bare os.chdir('..') is not idempotent: every re-run of this cell (or a
# kernel that already starts at the root) would climb one level too far.
# Only step up while `src` is not visible from the current directory.
# NOTE(review): assumes the notebook lives exactly one level below the root
# and that its own folder has no `src` subdirectory - confirm.
if not os.path.isdir('src'):
    os.chdir('..')

In [2]:
import pandas as pd
import os 
import src.utils as utils

# Make log messages visible in the notebook: take the root logger
# (getLogger() called without a name) and set its threshold to DEBUG.
import logging

logger = logging.getLogger()
logger.setLevel(level=logging.DEBUG)

 Download and load the data from MovieLens


In [3]:
# Download and unzip the MovieLens dataset published at
# 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
# The target is the `data` folder directly under the current working directory.
working_dir = os.getcwd()
dataset_folder = os.path.join(working_dir, 'data')

# load_dataset returns the folder that holds the ready-to-use dataset
# (see the "dataset stored in" log line it emits).
dataset_folder_ready = utils.load_dataset(dataset_folder)


INFO:root:downloading dataset http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
INFO:root:dataset stored in: /Users/hcorona/github/recsys-101-workshop/data/ml-latest-small

How to export your IMDb ratings

  1. Go to IMDb and make an account if you don't have one
  2. If you have no ratings in your account, rate at least 10 movies you liked and 10 movies you didn't like.
  3. Go to your account (top right) and click on "Your Ratings"
  4. Go to the bottom of the page, next to the "next" button, you will find an "Export this list" button. It will export your ratings to your Downloads folder.
  5. Move the file into the data/ folder of this repo and rename it to ratings-imdb.csv

In [4]:
# Convert the ratings exported from IMDb into the dataset's ratings format.
# Inputs: the IMDb export and the MovieLens links file
# (presumably used to match IMDb ids to MovieLens movies - verify in utils).
# Output: ratings-merged.csv inside the MovieLens folder.
movielens_folder = os.path.join(dataset_folder, 'ml-latest-small')
imdb_ratings = os.path.join(dataset_folder, 'ratings-imdb.csv')
links_file = os.path.join(movielens_folder, 'links.csv')
ratings_file = os.path.join(movielens_folder, 'ratings-merged.csv')
utils.import_imdb_ratings(imdb_ratings, links_file, ratings_file)

# Append the personal ratings to the original dataset ratings; the call
# returns the combined ratings together with the customer number under which
# the personal ratings are stored.
ratings, my_customer_number = utils.merge_datasets(dataset_folder_ready, ratings_file)


INFO:root:wrote IMDB ratings into the dataset format to /Users/hcorona/github/recsys-101-workshop/data/ml-latest-small/ratings-merged.csv
INFO:root:loaded 44 personal ratings
INFO:root:loaded 9125 movies
INFO:root:loaded 100048 ratings in total

Understand the data


In [5]:
# `ratings` is a long table: one row per (customer, movie, rating).
# Pivot it to one row per customer and one column per movie, writing 0 where
# a customer has no rating for a movie, then transpose so the final matrix is
# [movie x customer] (movies on the index, customers on the columns).
ratings_matrix = (
    ratings
    .pivot_table(index='customer', columns='movie', values='rating', fill_value=0)
    .transpose()
)

In [6]:
# The personal ratings live in the same table as everybody else's ratings;
# select just the rows recorded under our own customer number.
is_my_rating = ratings['customer'] == my_customer_number
ratings.loc[is_my_rating]


Out[6]:
customer movie rating
39381 672 Exit Through the Gift Shop (2010) 4.0
39493 672 Inception (2010) 4.5
39547 672 127 Hours (2010) 4.5
41243 672 Gran Torino (2008) 4.5
41398 672 Harry Potter and the Deathly Hallows: Part 2 (... 5.0
65134 672 Slumdog Millionaire (2008) 4.0
65204 672 Wrestler, The (2008) 4.5
65771 672 Serious Man, A (2009) 4.0
65841 672 Up in the Air (2009) 3.5
66046 672 Shutter Island (2010) 4.5
66318 672 Social Network, The (2010) 3.5
66423 672 King's Speech, The (2010) 5.0
66683 672 Beginners (2010) 4.5
66895 672 Shame (2011) 3.5
67132 672 Moonrise Kingdom (2012) 5.0
67243 672 Argo (2012) 4.5
67277 672 Flight (2012) 3.5
67323 672 Zero Dark Thirty (2012) 4.5
67383 672 Django Unchained (2012) 4.5
76289 672 Intouchables (2011) 4.5
81558 672 Blue Jasmine (2013) 4.0
83096 672 Education, An (2009) 4.0
83313 672 Perks of Being a Wallflower, The (2012) 5.0
83362 672 Amour (2012) 5.0
84307 672 Single Man, A (2009) 5.0
84321 672 Fish Tank (2009) 3.5
84379 672 Help, The (2011) 4.5
84396 672 We Need to Talk About Kevin (2011) 4.0
84472 672 Way, Way Back, The (2013) 4.5
87092 672 Milk (2008) 4.5
87207 672 Invictus (2009) 3.5
87301 672 Win Win (2011) 4.5
87312 672 Submarine (2010) 4.5
87345 672 Carnage (2011) 4.5
91368 672 The Artist (2011) 4.0
91827 672 Descendants, The (2011) 4.5
94868 672 Skin I Live In, The (La piel que habito) (2011) 4.5
97567 672 Bling Ring, The (2013) 4.0
98257 672 Buried (2010) 4.5
98286 672 Wave, The (Welle, Die) (2008) 4.0
99087 672 Iron Lady, The (2011) 5.0
99696 672 Wall Street: Money Never Sleeps (2010) 3.0
99747 672 Sessions, The (Surrogate, The) (2012) 4.5
99976 672 Patrik Age 1.5 (Patrik 1,5) (2008) 3.5

In [7]:
# The index of the ratings matrix holds the movie titles; wrap it in a
# DataFrame and display the first 20 entries as a sample of the catalogue.
movie_list = pd.DataFrame(ratings_matrix.index)
movie_list.head(n=20)


Out[7]:
movie
0 "Great Performances" Cats (1998)
1 $9.99 (2008)
2 'Hellboy': The Seeds of Creation (2004)
3 'Neath the Arizona Skies (1934)
4 'Round Midnight (1986)
5 'Salem's Lot (2004)
6 'Til There Was You (1997)
7 'burbs, The (1989)
8 'night Mother (1986)
9 (500) Days of Summer (2009)
10 *batteries not included (1987)
11 ...And God Spoke (1993)
12 ...And Justice for All (1979)
13 1-900 (06) (1994)
14 10 (1979)
15 10 Attitudes (2001)
16 10 Cloverfield Lane (2016)
17 10 Items or Less (2006)
18 10 Things I Hate About You (1999)
19 10 Years (2011)