In [1]:
import os
# Make the parent of the notebook's start-up folder the working directory, so
# that relative paths used afterwards resolve from there.
# The start-up directory is captured only the first time this cell runs;
# re-running the cell therefore lands in the same place instead of climbing
# one more level on every execution.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.dirname(NOTEBOOK_DIR))
In [2]:
# Import all the packages we need to generate recommendations
import numpy as np
import pandas as pd
import src.utils as utils
import src.recommenders as recommenders
import src.similarity as similarity
# imports necessary for plotting
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# Turn on logging inside the notebook: take the root logger (no name given)
# and lower its threshold to DEBUG so records of every level are processed.
import logging
logger = logging.getLogger(name=None)
logger.setLevel(level=logging.DEBUG)
In [3]:
# Load the dataset from the 'data' folder under the current working directory.
dataset_folder = os.path.abspath('data')
dataset_folder_ready = utils.load_dataset(dataset_folder)
# Merge personal ratings with the original dataset ratings
# (NOTE(review): exact semantics live in utils.merge_datasets - confirm there).
# The helper returns the ratings table and the personal customer number.
ratings_file = os.path.join(
    dataset_folder,
    'ml-latest-small',
    'ratings-merged.csv',
)
ratings, my_customer_number = utils.merge_datasets(dataset_folder_ready, ratings_file)
In [4]:
# `ratings` is a long-format frame (one row per rating).
# Pivot it into a wide matrix and flip it, so the final layout is
# [movie x customer]: one row per movie, one column per customer.
# Cells without a rating are filled with 0.
ratings_matrix = (
    ratings
    .pivot_table(values='rating', index='customer', columns='movie', fill_value=0)
    .T
)
In [5]:
# Size of the dataset: distinct customers, distinct movies, total rating rows
n_users = ratings['customer'].unique().size
n_items = ratings['movie'].unique().size
n_ratings = len(ratings.index)
print('number of users', n_users)
print('number of items', n_items)
print('number of ratings', n_ratings)
In [6]:
# Plot distribution of ratings using the pandas plot functionality
# x axis: the (binned) rating values; y axis: how many ratings fall in each bin
histogram = ratings['rating'].hist()
histogram.set_xlabel('rating value')
histogram.set_ylabel('number of ratings')
Out[6]:
In [7]:
# movie popularity distribution: number of ratings received by each movie.
# value_counts() returns the counts ordered from most- to least-rated; the
# ordering is requested explicitly here, so no extra sort_values() is needed.
popularity = pd.DataFrame(ratings['movie'].value_counts(sort=True, ascending=False))
# NOTE(review): the x axis follows the frame's index (the movie identifiers),
# not the popularity rank - confirm this is the intended view.
popularity.plot(figsize=(15, 5), title="Movie Popularity").grid(True)
In [8]:
# Same popularity plot restricted to the first 100 rows of `popularity`
# (the 100 most-rated movies when the frame is ordered by descending count).
top100_axes = popularity.head(100).plot(title="Movie Popularity", figsize=(15, 5))
top100_axes.grid(True)