In [1]:

    
import math

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import missingno
import seaborn as sns

%matplotlib inline



In [4]:

    
header = ['user_id', 'item_id', 'rating', 'timestamp']
data = pd.read_csv('dataset/u.data', sep='\t', names=header)



In [5]:

    
data.describe()









    Out[5]:







  
    
      
      user_id
      item_id
      rating
      timestamp
    
  
  
    
      count
      100000.00000
      100000.000000
      100000.000000
      1.000000e+05
    
    
      mean
      462.48475
      425.530130
      3.529860
      8.835289e+08
    
    
      std
      266.61442
      330.798356
      1.125674
      5.343856e+06
    
    
      min
      1.00000
      1.000000
      1.000000
      8.747247e+08
    
    
      25%
      254.00000
      175.000000
      3.000000
      8.794487e+08
    
    
      50%
      447.00000
      322.000000
      4.000000
      8.828269e+08
    
    
      75%
      682.00000
      631.000000
      4.000000
      8.882600e+08
    
    
      max
      943.00000
      1682.000000
      5.000000
      8.932866e+08



In [6]:

    
n_users = data.user_id.unique().shape[0]
n_items = data.item_id.unique().shape[0]
print('Number of users : {}\tNumber of movies : {}'.format(n_users, n_items))









    



Number of users : 943	Number of movies : 1682



In [8]:

    
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(data, test_size=0.25)

Memory-Based Collaborative Filtering

Memory-Based Collaborative Filtering approaches can be divided into two main sections: user-item filtering and item-item filtering. A user-item filtering takes a particular user, find users that are similar to that user based on similarity of ratings, and recommend items that those similar users liked. In contrast, item-item filtering will take an item, find users who liked that item, and find other items that those users or similar users also liked. It takes items and outputs other items as recommendations.

Item-Item Collaborative Filtering: “Users who liked this item also liked …”
User-Item Collaborative Filtering: “Users who are similar to you also liked …”



In [9]:

    
# Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]



In [10]:

    
# Using cosine similarity
from sklearn.metrics.pairwise import pairwise_distances

user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')



In [15]:

    
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')



In [16]:

    
from sklearn.metrics import mean_squared_error

def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return math.sqrt(mean_squared_error(prediction, ground_truth))

print('User-based CF RMSE: {}'.format(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: {}'.format(rmse(item_prediction, test_data_matrix)))









    



User-based CF RMSE: 3.138932867315602
Item-based CF RMSE: 3.4651598697913806

Model-based Collaborative Filtering

Model-based Collaborative Filtering is based on matrix factorization (MF) which has received greater exposure, mainly as an unsupervised learning method for latent variable decomposition and dimensionality reduction. Matrix factorization is widely used for recommender systems where it can deal better with scalability and sparsity than Memory-based CF. The goal of MF is to learn the latent preferences of users and the latent attributes of items from known ratings (learn features that describe the characteristics of ratings) to then predict the unknown ratings through the dot product of the latent features of users and items. When you have a very sparse matrix, with a lot of dimensions, by doing matrix factorization you can restructure the user-item matrix into low-rank structure, and you can represent the matrix by the multiplication of two low-rank matrices, where the rows contain the latent vector. You fit this matrix to approximate your original matrix, as closely as possible, by multiplying the low-rank matrices together, which fills in the entries missing in the original matrix.



In [20]:

    
import scipy.sparse as sp
from scipy.sparse.linalg import svds

#get SVD components from train matrix.
u, s, vt = svds(train_data_matrix, k = 4)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print('User-based CF MSE: {}'.format(rmse(X_pred, test_data_matrix)))









    



User-based CF MSE: 2.7901741266119973



In [ ]:

	user_id	item_id	rating	timestamp
count	100000.00000	100000.000000	100000.000000	1.000000e+05
mean	462.48475	425.530130	3.529860	8.835289e+08
std	266.61442	330.798356	1.125674	5.343856e+06
min	1.00000	1.000000	1.000000	8.747247e+08
25%	254.00000	175.000000	3.000000	8.794487e+08
50%	447.00000	322.000000	4.000000	8.828269e+08
75%	682.00000	631.000000	4.000000	8.882600e+08
max	943.00000	1682.000000	5.000000	8.932866e+08