In [1]:
import os

# Switch the working directory to the folder one level above the directory the
# kernel started in. The target is resolved to an absolute path only on the
# first run and remembered in PROJECT_ROOT, so re-running this cell returns to
# the same folder instead of climbing one more level each time (a bare
# os.chdir('..') is relative to wherever the previous run left the kernel).
if 'PROJECT_ROOT' not in globals():
    PROJECT_ROOT = os.path.abspath('..')
os.chdir(PROJECT_ROOT)
In [2]:
import pandas as pd
import os
import src.utils as utils
# Make log records visible in the notebook: grab the root logger and lower its
# threshold to DEBUG so that records of every severity pass through it.
import logging
logger = logging.getLogger(name=None)
logger.setLevel('DEBUG')
In [3]:
# The dataset lives in a 'data' folder directly under the current working directory.
working_dir = os.getcwd()
dataset_folder = os.path.join(working_dir, 'data')

# Per the original note, utils.load_dataset downloads and unzips the MovieLens
# archive ('http://files.grouplens.org/datasets/movielens/ml-latest-small.zip').
# NOTE(review): the meaning of the returned value is not visible here; it is
# passed to utils.merge_datasets further down - confirm against src/utils.
dataset_folder_ready = utils.load_dataset(dataset_folder)
In [4]:
# Locations of the files involved: the personal IMDB export sits at the top of
# the dataset folder, the other two inside the 'ml-latest-small' sub-folder.
movielens_folder = os.path.join(dataset_folder, 'ml-latest-small')
imdb_ratings = os.path.join(dataset_folder, 'ratings-imdb.csv')
links_file = os.path.join(movielens_folder, 'links.csv')
ratings_file = os.path.join(movielens_folder, 'ratings-merged.csv')

# Bring the IMDB ratings into the right format.
# NOTE(review): presumably links_file maps IMDB ids to MovieLens ids and the
# result is written to ratings_file - confirm in src/utils.
utils.import_imdb_ratings(imdb_ratings, links_file, ratings_file)

# Add the personal ratings to the original dataset ratings; the call hands back
# the ratings table together with the customer number used for the personal entries.
ratings, my_customer_number = utils.merge_datasets(dataset_folder_ready, ratings_file)
In [5]:
# `ratings` is a long dataframe with 'customer', 'movie' and 'rating' columns.
# Pivot it with customers as rows and movies as columns, putting 0 wherever a
# customer/movie combination has no rating, then flip the axes: the resulting
# ratings_matrix has one row per movie and one column per customer.
ratings_matrix = ratings.pivot_table(
    values='rating',
    index='customer',
    columns='movie',
    fill_value=0,
).T
In [6]:
# The personal ratings now live in the same table as everyone else's;
# display just the rows whose customer equals my_customer_number.
is_my_rating = ratings['customer'].eq(my_customer_number)
ratings.loc[is_my_rating]
Out[6]:
In [7]:
# The row labels of ratings_matrix are the movies; collect them in a dataframe
# and display the first 20 as a sample of what the dataset contains.
movie_list = pd.DataFrame(data=ratings_matrix.index)
movie_list.iloc[:20]
Out[7]: