Data Exploration and other examples



In [1]:

    
import os

# The cells below import `src.*` and read `data/` relative to the current working
# directory, so the notebook has to run from the folder that contains `src`.
# A bare os.chdir('..') moves one level up on EVERY execution of this cell; the guard
# makes the cell safe to re-run (and a no-op when the kernel already starts in that folder).
if not os.path.isdir('src'):
    os.chdir('..')



In [2]:

    
# Import all the packages we need to generate recommendations
# (the `src.*` modules are project-local and are resolved relative to the working directory)
import numpy as np
import pandas as pd
import src.utils as utils
import src.recommenders as recommenders
import src.similarity as similarity

# imports necessary for plotting
import matplotlib
import matplotlib.pyplot as plt
# render matplotlib figures inline, below the cell that produces them
%matplotlib inline  

# Enable logging on Jupyter notebook
# logging.getLogger() with no name returns the root logger, so the level set here
# applies to every logger that does not define its own level.
# NOTE(review): DEBUG on the root logger may also surface debug messages emitted by
# third-party libraries — confirm this verbosity is intended.
import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)



In [3]:

    
# Locate the dataset: everything lives under ./data of the current working directory.
dataset_folder = os.path.join(os.getcwd(), 'data')

# Path of the merged ratings file handed to the merge helper below.
ratings_file = os.path.join(
    dataset_folder,
    'ml-latest-small',
    'ratings-merged.csv',
)

# Load the dataset through the project helper (see src/utils for what it does exactly).
dataset_folder_ready = utils.load_dataset(dataset_folder)

# Combine the personal ratings with the original dataset ratings.
# The helper returns the ratings dataframe together with the customer number
# (presumably the id assigned to the personal ratings — verify in src/utils).
ratings, my_customer_number = utils.merge_datasets(dataset_folder_ready, ratings_file)









    



INFO:root:dataset was already downloaded
INFO:root:dataset stored in: /Users/hcorona/github/recsys-101-workshop/data/ml-latest-small
INFO:root:loaded 44 personal ratings
INFO:root:loaded 9125 movies
INFO:root:loaded 100048 ratings in total



In [4]:

    
# `ratings` is stored as a long pandas dataframe.
# Pivot it into a customer x movie table (missing ratings filled with 0) and then
# transpose it, so the final matrix has one row per movie and one column per customer.
ratings_matrix = ratings.pivot_table(
    index='customer',
    columns='movie',
    values='rating',
    fill_value=0,
).T

Basic data exploration



In [5]:

    
# Basic size of the dataset: distinct customers, distinct movies and total rating rows.
# dropna=False keeps a missing value counted as its own entry, like unique() does.
n_users = ratings['customer'].nunique(dropna=False)
n_items = ratings['movie'].nunique(dropna=False)
n_ratings = len(ratings)

print('number of users', n_users)
print('number of items', n_items)
print('number of ratings', n_ratings)









    



number of users 672
number of items 9064
number of ratings 100048



In [6]:

    
# Plot distribution of ratings using the pandas plot functionality
# x axis: the rating value of each bin; y axis: how many ratings fall into that bin
# (the two labels were previously swapped).
histogram = ratings['rating'].hist()
histogram.set_xlabel('rating value')
# trailing ';' keeps the Text object returned by set_ylabel out of the cell output
histogram.set_ylabel('number of ratings');









    Out[6]:





<matplotlib.text.Text at 0x1099cc400>



In [7]:

    
# movie popularity distribution: number of ratings per movie, most-rated movie first.
# value_counts() returns the counts sorted in descending order; the ordering is spelled
# out here instead of relying on a separate sort_values() call, whose result was being
# discarded (sort_values returns a new frame, it does not sort in place).
popularity = pd.DataFrame(ratings['movie'].value_counts(sort=True, ascending=False))
popularity.plot(figsize=(15, 5), title="Movie Popularity").grid(True)



In [8]:

    
# movie popularity distribution, restricted to the first 100 rows of `popularity`
# (the top-100 movies, given that `popularity` is ordered from most to least rated)
top_100 = popularity.head(100)
top_100_axes = top_100.plot(figsize=(15, 5), title="Movie Popularity")
top_100_axes.grid(True)