In [1]:

    
import os


def enter_repo_root(marker='src'):
    """Move the working directory up one level, but only when still needed.

    The notebook is started from a sub-folder, while the rest of the cells
    import the ``src`` package and build the ``data`` path from
    ``os.getcwd()``. A bare ``os.chdir('..')`` climbs one level *every* time
    the cell is executed, so re-running it leaves the kernel in the wrong
    folder. Checking for ``marker`` first makes the cell safe to re-run.

    Parameters
    ----------
    marker : str
        Name of a directory expected in the target working directory
        (defaults to the ``src`` package folder that later cells import).

    Returns
    -------
    None
        Nothing is returned, so the cell does not display any output.
    """
    # Already in a folder that contains the marker: nothing to do.
    if not os.path.isdir(marker):
        os.chdir('..')


enter_repo_root()



In [2]:

    
import pandas as pd
import os 
import src.utils as utils

# Let log records show up in the notebook output: grab the root logger
# (getLogger without a name) and lower its threshold to DEBUG.
import logging
logger = logging.getLogger()
logger.setLevel("DEBUG")

Download and load the data from MovieLens



In [3]:

    
# Fetch and unpack the MovieLens "latest-small" archive into <working dir>/data.
# Archive location: 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
project_root = os.getcwd()
dataset_folder = os.path.join(project_root, 'data')
# presumably the returned value is the folder holding the unpacked files — confirm in src/utils
dataset_folder_ready = utils.load_dataset(dataset_folder)









    



INFO:root:downloading dataset http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
INFO:root:dataset stored in: /Users/hcorona/github/recsys-101-workshop/data/ml-latest-small

How to export your IMDb ratings

Go to IMDb and make an account if you don't have one
If you have no ratings in your account, rate at least 10 movies you liked and 10 movies you didn't like.
Go to your account menu (top right) and click on "Your Ratings".
Scroll to the bottom of the page: next to the "Next" button you will find an "Export this list" button. Clicking it downloads your ratings as a CSV file to your Downloads folder.
Move the downloaded file into the data/ folder of this repo and rename it to ratings-imdb.csv.



In [4]:

    
# Convert the personal IMDb export into the ratings layout used by the dataset.
movielens_folder = os.path.join(dataset_folder, 'ml-latest-small')
imdb_ratings = os.path.join(dataset_folder, 'ratings-imdb.csv')      # path of the IMDb CSV export
links_file = os.path.join(movielens_folder, 'links.csv')             # presumably the MovieLens <-> IMDb id mapping — confirm in src/utils
ratings_file = os.path.join(movielens_folder, 'ratings-merged.csv')  # file the converted ratings are written to
utils.import_imdb_ratings(imdb_ratings, links_file, ratings_file)

# Combine the converted personal ratings with the dataset's own ratings.
# The helper hands back the combined ratings frame together with the
# customer id under which the personal ratings were stored.
ratings, my_customer_number = utils.merge_datasets(dataset_folder_ready, ratings_file)









    



INFO:root:wrote IMDB ratings into the dataset format to /Users/hcorona/github/recsys-101-workshop/data/ml-latest-small/ratings-merged.csv
INFO:root:loaded 44 personal ratings
INFO:root:loaded 9125 movies
INFO:root:loaded 100048 ratings in total

Understand the data



In [5]:

    
# `ratings` is a long table with one row per (customer, movie, rating).
# Spread it into a wide table with customers as rows and movies as columns
# (customer/movie pairs without a rating are filled with 0), then flip it so
# the final matrix has one row per movie and one column per customer.
ratings_matrix = ratings.pivot_table(
    values='rating',
    index='customer',
    columns='movie',
    fill_value=0,
).T



In [6]:

    
# The personal ratings live in the same frame as everybody else's;
# keep only the rows whose customer id is the personal one.
ratings[ratings['customer'].eq(my_customer_number)]









    Out[6]:






  
    
      
      customer
      movie
      rating
    
  
  
    
      39381
      672
      Exit Through the Gift Shop (2010)
      4.0
    
    
      39493
      672
      Inception (2010)
      4.5
    
    
      39547
      672
      127 Hours (2010)
      4.5
    
    
      41243
      672
      Gran Torino (2008)
      4.5
    
    
      41398
      672
      Harry Potter and the Deathly Hallows: Part 2 (...
      5.0
    
    
      65134
      672
      Slumdog Millionaire (2008)
      4.0
    
    
      65204
      672
      Wrestler, The (2008)
      4.5
    
    
      65771
      672
      Serious Man, A (2009)
      4.0
    
    
      65841
      672
      Up in the Air (2009)
      3.5
    
    
      66046
      672
      Shutter Island (2010)
      4.5
    
    
      66318
      672
      Social Network, The (2010)
      3.5
    
    
      66423
      672
      King's Speech, The (2010)
      5.0
    
    
      66683
      672
      Beginners (2010)
      4.5
    
    
      66895
      672
      Shame (2011)
      3.5
    
    
      67132
      672
      Moonrise Kingdom (2012)
      5.0
    
    
      67243
      672
      Argo (2012)
      4.5
    
    
      67277
      672
      Flight (2012)
      3.5
    
    
      67323
      672
      Zero Dark Thirty (2012)
      4.5
    
    
      67383
      672
      Django Unchained (2012)
      4.5
    
    
      76289
      672
      Intouchables (2011)
      4.5
    
    
      81558
      672
      Blue Jasmine (2013)
      4.0
    
    
      83096
      672
      Education, An (2009)
      4.0
    
    
      83313
      672
      Perks of Being a Wallflower, The (2012)
      5.0
    
    
      83362
      672
      Amour (2012)
      5.0
    
    
      84307
      672
      Single Man, A (2009)
      5.0
    
    
      84321
      672
      Fish Tank (2009)
      3.5
    
    
      84379
      672
      Help, The (2011)
      4.5
    
    
      84396
      672
      We Need to Talk About Kevin (2011)
      4.0
    
    
      84472
      672
      Way, Way Back, The (2013)
      4.5
    
    
      87092
      672
      Milk (2008)
      4.5
    
    
      87207
      672
      Invictus (2009)
      3.5
    
    
      87301
      672
      Win Win (2011)
      4.5
    
    
      87312
      672
      Submarine (2010)
      4.5
    
    
      87345
      672
      Carnage (2011)
      4.5
    
    
      91368
      672
      The Artist (2011)
      4.0
    
    
      91827
      672
      Descendants, The (2011)
      4.5
    
    
      94868
      672
      Skin I Live In, The (La piel que habito) (2011)
      4.5
    
    
      97567
      672
      Bling Ring, The (2013)
      4.0
    
    
      98257
      672
      Buried (2010)
      4.5
    
    
      98286
      672
      Wave, The (Welle, Die) (2008)
      4.0
    
    
      99087
      672
      Iron Lady, The (2011)
      5.0
    
    
      99696
      672
      Wall Street: Money Never Sleeps (2010)
      3.0
    
    
      99747
      672
      Sessions, The (Surrogate, The) (2012)
      4.5
    
    
      99976
      672
      Patrik Age 1.5 (Patrik 1,5) (2008)
      3.5



In [7]:

    
# The row labels of the ratings matrix are the movie titles: wrap them in a
# frame and preview the first 20 entries.
movie_list = pd.DataFrame(data=ratings_matrix.index)
movie_list.head(n=20)









    Out[7]:






  
    
      
      movie
    
  
  
    
      0
      "Great Performances" Cats (1998)
    
    
      1
      $9.99 (2008)
    
    
      2
      'Hellboy': The Seeds of Creation (2004)
    
    
      3
      'Neath the Arizona Skies (1934)
    
    
      4
      'Round Midnight (1986)
    
    
      5
      'Salem's Lot (2004)
    
    
      6
      'Til There Was You (1997)
    
    
      7
      'burbs, The (1989)
    
    
      8
      'night Mother (1986)
    
    
      9
      (500) Days of Summer (2009)
    
    
      10
      *batteries not included (1987)
    
    
      11
      ...And God Spoke (1993)
    
    
      12
      ...And Justice for All (1979)
    
    
      13
      1-900 (06) (1994)
    
    
      14
      10 (1979)
    
    
      15
      10 Attitudes (2001)
    
    
      16
      10 Cloverfield Lane (2016)
    
    
      17
      10 Items or Less (2006)
    
    
      18
      10 Things I Hate About You (1999)
    
    
      19
      10 Years (2011)

	customer	movie	rating
39381	672	Exit Through the Gift Shop (2010)	4.0
39493	672	Inception (2010)	4.5
39547	672	127 Hours (2010)	4.5
41243	672	Gran Torino (2008)	4.5
41398	672	Harry Potter and the Deathly Hallows: Part 2 (...	5.0
65134	672	Slumdog Millionaire (2008)	4.0
65204	672	Wrestler, The (2008)	4.5
65771	672	Serious Man, A (2009)	4.0
65841	672	Up in the Air (2009)	3.5
66046	672	Shutter Island (2010)	4.5
66318	672	Social Network, The (2010)	3.5
66423	672	King's Speech, The (2010)	5.0
66683	672	Beginners (2010)	4.5
66895	672	Shame (2011)	3.5
67132	672	Moonrise Kingdom (2012)	5.0
67243	672	Argo (2012)	4.5
67277	672	Flight (2012)	3.5
67323	672	Zero Dark Thirty (2012)	4.5
67383	672	Django Unchained (2012)	4.5
76289	672	Intouchables (2011)	4.5
81558	672	Blue Jasmine (2013)	4.0
83096	672	Education, An (2009)	4.0
83313	672	Perks of Being a Wallflower, The (2012)	5.0
83362	672	Amour (2012)	5.0
84307	672	Single Man, A (2009)	5.0
84321	672	Fish Tank (2009)	3.5
84379	672	Help, The (2011)	4.5
84396	672	We Need to Talk About Kevin (2011)	4.0
84472	672	Way, Way Back, The (2013)	4.5
87092	672	Milk (2008)	4.5
87207	672	Invictus (2009)	3.5
87301	672	Win Win (2011)	4.5
87312	672	Submarine (2010)	4.5
87345	672	Carnage (2011)	4.5
91368	672	The Artist (2011)	4.0
91827	672	Descendants, The (2011)	4.5
94868	672	Skin I Live In, The (La piel que habito) (2011)	4.5
97567	672	Bling Ring, The (2013)	4.0
98257	672	Buried (2010)	4.5
98286	672	Wave, The (Welle, Die) (2008)	4.0
99087	672	Iron Lady, The (2011)	5.0
99696	672	Wall Street: Money Never Sleeps (2010)	3.0
99747	672	Sessions, The (Surrogate, The) (2012)	4.5
99976	672	Patrik Age 1.5 (Patrik 1,5) (2008)	3.5

	movie
0	"Great Performances" Cats (1998)
1	$9.99 (2008)
2	'Hellboy': The Seeds of Creation (2004)
3	'Neath the Arizona Skies (1934)
4	'Round Midnight (1986)
5	'Salem's Lot (2004)
6	'Til There Was You (1997)
7	'burbs, The (1989)
8	'night Mother (1986)
9	(500) Days of Summer (2009)
10	*batteries not included (1987)
11	...And God Spoke (1993)
12	...And Justice for All (1979)
13	1-900 (06) (1994)
14	10 (1979)
15	10 Attitudes (2001)
16	10 Cloverfield Lane (2016)
17	10 Items or Less (2006)
18	10 Things I Hate About You (1999)
19	10 Years (2011)