In [1]:
import math

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import missingno
import seaborn as sns

%matplotlib inline

In [4]:
header = ['user_id', 'item_id', 'rating', 'timestamp']
data = pd.read_csv('dataset/u.data', sep='\t', names=header)

In [5]:
data.describe()


Out[5]:
             user_id        item_id         rating     timestamp
count   100000.00000  100000.000000  100000.000000  1.000000e+05
mean       462.48475     425.530130       3.529860  8.835289e+08
std        266.61442     330.798356       1.125674  5.343856e+06
min          1.00000       1.000000       1.000000  8.747247e+08
25%        254.00000     175.000000       3.000000  8.794487e+08
50%        447.00000     322.000000       4.000000  8.828269e+08
75%        682.00000     631.000000       4.000000  8.882600e+08
max        943.00000    1682.000000       5.000000  8.932866e+08

In [6]:
n_users = data.user_id.unique().shape[0]
n_items = data.item_id.unique().shape[0]
print('Number of users : {}\tNumber of movies : {}'.format(n_users, n_items))


Number of users : 943	Number of movies : 1682

In [8]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(data, test_size=0.25)

Memory-Based Collaborative Filtering

Memory-Based Collaborative Filtering approaches can be divided into two main classes: user-item filtering and item-item filtering. User-item filtering takes a particular user, finds users that are similar to that user based on similarity of ratings, and recommends items that those similar users liked. In contrast, item-item filtering takes an item, finds users who liked that item, and then finds other items that those users (or similar users) also liked: it takes items as input and outputs other items as recommendations. A toy sketch of the similarity computation follows the list below.

  • Item-Item Collaborative Filtering: “Users who liked this item also liked …”
  • User-Item Collaborative Filtering: “Users who are similar to you also liked …”
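
As a quick illustration of "similar users", here is a minimal sketch on a made-up 3 x 4 rating matrix (toy data, not the MovieLens matrix; 0 means unrated):

In [ ]:
# Toy user-item matrix: 3 users x 4 items, 0 = unrated (made-up data)
toy = np.array([[5, 4, 0, 1],
                [4, 5, 0, 2],
                [1, 0, 5, 4]], dtype=float)

# Cosine similarity of user 0 against every user:
# sim(u, v) = u . v / (||u|| * ||v||)
norms = np.linalg.norm(toy, axis=1)
sims = toy.dot(toy[0]) / (norms * norms[0])
print(sims)  # user 1 (similar tastes) scores much higher than user 2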

In [9]:
# Create two user-item matrices, one for training and one for testing.
# itertuples yields (index, user_id, item_id, rating, timestamp); ids are 1-based.
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]  # matrix[user, item] = rating

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]
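
Only a small fraction of the matrix is actually filled in: the full dataset has 100,000 ratings for 943 x 1682 = 1,586,126 user-item pairs, i.e. roughly 6.3% density. A quick check on the training matrix (which holds 75% of the ratings):

In [ ]:
# Fraction of the training matrix that is empty
sparsity = 1.0 - np.count_nonzero(train_data_matrix) / float(n_users * n_items)
print('Training matrix sparsity: {:.1%}'.format(sparsity))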

In [10]:
# Pairwise cosine distances between rows; note that pairwise_distances with
# metric='cosine' returns the cosine *distance* (1 - cosine similarity)
from sklearn.metrics.pairwise import pairwise_distances

user_similarity = pairwise_distances(train_data_matrix, metric='cosine')    # users x users
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')  # items x items

In [15]:
def predict(ratings, similarity, kind='user'):
    if kind == 'user':
        # Mean-center each user's ratings, then add back a similarity-weighted
        # average of the other users' deviations. np.newaxis keeps
        # mean_user_rating broadcastable against the (n_users, n_items) ratings.
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = ratings - mean_user_rating[:, np.newaxis]
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif kind == 'item':
        # Similarity-weighted average over items; no per-user mean-centering here.
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

item_prediction = predict(train_data_matrix, item_similarity, kind='item')
user_prediction = predict(train_data_matrix, user_similarity, kind='user')
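
For reference, the user-based rule implemented above is

    x̂(k, m) = x̄(k) + Σ_a sim(u_k, u_a) · (x(a, m) − x̄(a)) / Σ_a |sim(u_k, u_a)|

and the item-based rule is

    x̂(k, m) = Σ_b sim(i_m, i_b) · x(k, b) / Σ_b |sim(i_m, i_b)|

Subtracting each user's mean rating x̄(k) corrects for users who rate systematically high or low; there is no analogous per-user bias to remove on the item axis.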

In [16]:
from sklearn.metrics import mean_squared_error

def rmse(prediction, ground_truth):
    # Score only the (user, item) pairs that are actually rated in
    # ground_truth; unrated cells (zeros) are ignored.
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return math.sqrt(mean_squared_error(prediction, ground_truth))

print('User-based CF RMSE: {}'.format(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: {}'.format(rmse(item_prediction, test_data_matrix)))


User-based CF RMSE: 3.138932867315602
Item-based CF RMSE: 3.4651598697913806
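
For reference, the metric above is the root-mean-square error over the rated test pairs only: RMSE = sqrt( (1/|T|) · Σ_{(u,i) ∈ T} (x̂(u, i) − x(u, i))² ), where T is the set of nonzero entries in test_data_matrix.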

Model-based Collaborative Filtering

Model-based Collaborative Filtering is based on matrix factorization (MF), which has received wide exposure mainly as an unsupervised learning method for latent-variable decomposition and dimensionality reduction. Matrix factorization is widely used in recommender systems because it handles scalability and sparsity better than memory-based CF. The goal of MF is to learn the latent preferences of users and the latent attributes of items from the known ratings (that is, to learn the features that describe the characteristics of the ratings), and then to predict the unknown ratings through the dot product of the latent features of users and items.

When you have a very sparse matrix with many dimensions, matrix factorization lets you restructure the user-item matrix into a low-rank form: the matrix is represented as the product of two low-rank matrices whose rows contain the latent vectors. You fit these factors so that their product approximates the original matrix as closely as possible, which fills in the entries missing from the original matrix.
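
The cell below is a toy illustration of this low-rank idea on a made-up 4 x 4 matrix (not part of the MovieLens analysis): it computes a full SVD, keeps only the top k = 2 singular values, and shows that the truncated product already reconstructs the matrix closely.

In [ ]:
# Toy low-rank reconstruction (made-up data)
X = np.array([[5, 4, 1, 1],
              [4, 5, 1, 2],
              [1, 1, 5, 4],
              [2, 1, 4, 5]], dtype=float)

U, sv, Vt = np.linalg.svd(X)  # X = U . diag(sv) . Vt, with sv sorted descending
k = 2                         # number of latent factors to keep
X_k = np.dot(np.dot(U[:, :k], np.diag(sv[:k])), Vt[:k, :])
print(np.round(X_k, 1))       # close to X, using far fewer parameters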


In [20]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

# Get the truncated SVD components of the train matrix;
# k is the number of latent factors (singular values) to keep.
u, s, vt = svds(train_data_matrix, k=4)
s_diag_matrix = np.diag(s)  # svds returns the singular values as a 1-D array
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print('SVD-based CF RMSE: {}'.format(rmse(X_pred, test_data_matrix)))


SVD-based CF RMSE: 2.7901741266119973

In [ ]: