In [8]:
import numpy as np
import pandas as pd

In [9]:
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('ml-100k/u.data', sep='\t', names=header)

In [10]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print 'Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)


Number of users = 943 | Number of movies = 1682

In [12]:
from sklearn import model_selection as cv
train_data, test_data = cv.train_test_split(df, test_size=0.25)

In [13]:
#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [14]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [15]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [16]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [17]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [18]:
print 'User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix))
print 'Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix))


User-based CF RMSE: 3.11993772752
Item-based CF RMSE: 3.44734950369

In [19]:
sparsity=round(1.0-len(df)/float(n_users*n_items),3)
print 'The sparsity level of MovieLens100K is ' +  str(sparsity*100) + '%'


The sparsity level of MovieLens100K is 93.7%

In [20]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

#get SVD components from train matrix. Choose k.
u, s, vt = svds(train_data_matrix, k = 20)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print 'User-based CF MSE: ' + str(rmse(X_pred, test_data_matrix))


User-based CF MSE: 2.70864990911

In [21]:
print(item_prediction)


[[ 0.36039398  0.38148084  0.3932973  ...,  0.44092359  0.43175966
   0.42579034]
 [ 0.0970445   0.11197577  0.10748179 ...,  0.11253196  0.11390592
   0.11494079]
 [ 0.06553295  0.06811509  0.06730834 ...,  0.06226559  0.06694988
   0.06735553]
 ..., 
 [ 0.02788395  0.03560212  0.03421997 ...,  0.0397138   0.03924781
   0.03931998]
 [ 0.11144224  0.12107344  0.12713479 ...,  0.13434465  0.13307707
   0.13383019]
 [ 0.2106959   0.21101617  0.23363854 ...,  0.26696695  0.25851901
   0.25914029]]