Matrix Factorisation



In [ ]:

    
import os, sys, gzip
from surprise import Dataset
from surprise import Reader
from surprise import SVD
#from surprise import SVDpp
#from surprise import NMF
import pickle as pkl
import numpy as np
from tqdm import tqdm



In [ ]:

    
from tools import calc_RPrecision_HitRate



In [ ]:

    
# --- Notebook-wide configuration -------------------------------------------
# Seed handed to the stochastic learner so repeated runs give the same model.
SEED = 13579

# Root folder of the data files and the dataset names available beneath it.
data_dir = 'data'
datasets = ['aotm2011', '30music']

# Cut-offs passed as `tops` when computing hit rates.
TOPs = [
    5, 10, 20, 30, 50,
    100, 200, 300, 500, 1000,
]



In [ ]:

    
# Position in `datasets` of the dataset this run works on (1 selects '30music').
dix = 1
dataset_name = datasets[dix]
# Bare expression: shows the selected name as the cell output.
dataset_name



In [ ]:

    
# Path of the training CSV for the selected dataset, located under `data_dir`.
fname = os.path.join(data_dir, '%s/setting2/mftrain_%s.3.csv' % (dataset_name, dataset_name))
# Each line of the file is read as "user item rating" with ',' as the separator.
reader = Reader(line_format='user item rating', sep=',')
# Parse the CSV into a Surprise dataset using the reader configured above.
data_train = Dataset.load_from_file(fname, reader=reader)



In [ ]:

    
# Matrix-factorisation model from Surprise; parameter reference:
# http://surprise.readthedocs.io/en/stable/matrix_factorization.html
# Commented-out alternative: NMF(n_factors=300, n_epochs=100, verbose=True)

# Use every loaded rating for training (no hold-out is carved out here).
trainset = data_train.build_full_trainset()

algo = SVD(
    n_factors=500,
    reg_all=0.01,
    random_state=SEED,  # fixed seed so the fit is repeatable
    verbose=True,
)
algo.fit(trainset)



In [ ]:

    
# Folder holding the pickled matrices of the selected dataset.  Built from
# `data_dir` so it stays consistent with the path of the training CSV.
base_dir = os.path.join(data_dir, '%s/setting2' % dataset_name)

# NOTE(security): pickle.load can execute arbitrary code; only load files that
# were produced by a trusted pipeline.
# The `with` blocks close the gzip handles as soon as the objects are loaded.
with gzip.open(os.path.join(base_dir, 'Y.pkl.gz'), 'rb') as fd:
    Y = pkl.load(fd)
with gzip.open(os.path.join(base_dir, 'PU_test.pkl.gz'), 'rb') as fd:
    PU_test = pkl.load(fd)

# The test part of Y is its trailing PU_test.shape[1] columns.  An explicit
# start index is used because a negative slice `-k:` would select *all*
# columns when k == 0.
Y_test = Y[:, Y.shape[1] - PU_test.shape[1]:]
print(Y_test.shape)



In [ ]:

    
# Dimensions of Y: N rows and K columns.
N, K = Y.shape

# String ids later handed to `algo.predict`: 'U<row>' for each row index of Y
# and 'P<column>' for each column index of Y.
# NOTE(review): presumably these match the raw ids in the training CSV - confirm.
ustrs = ['U%d' % row for row in range(N)]
istrs = ['P%d' % col for col in range(K)]



In [ ]:

    
# Evaluate the fitted model column by column on the test part of Y.
# Collected results: one R-Precision value per test column, and for every
# cut-off in TOPs one hit-rate value per test column.
rps_mf = []
hitrates_mf = {top: [] for top in TOPs}

assert Y_test.shape == PU_test.shape
# Column j of Y_test corresponds to column j + offset of Y (and of `istrs`).
offset = Y.shape[1] - PU_test.shape[1]
for j in tqdm(range(Y_test.shape[1])):
    labels = Y_test[:, j].toarray().ravel()
    observed = PU_test[:, j].toarray().ravel()

    # Only rows whose PU_test entry is zero take part in the evaluation.
    indices = np.flatnonzero(observed == 0)
    y_true = labels[indices]

    # The item id is the same for every row of this column, so look it up once.
    item_id = istrs[j + offset]
    y_pred = np.array([algo.predict(ustrs[i], item_id).est for i in indices]).ravel()

    rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
    rps_mf.append(rp)
    for top in TOPs:
        hitrates_mf[top].append(hr_dict[top])



In [ ]:

    
# Summarise the per-column results as means, nested as
# {dataset name: {'Test': {'R-Precision': ..., 'Hit-Rate': {top: ...}}}}.
mf_perf = {
    dataset_name: {
        'Test': {
            'R-Precision': np.mean(rps_mf),
            'Hit-Rate': {top: np.mean(rates) for top, rates in hitrates_mf.items()},
        },
    },
}
mf_perf



In [ ]:

    
# File in which the matrix-factorisation performance numbers are stored.
fperf_mf = os.path.join(base_dir, 'perf-mf.pkl')
print(fperf_mf)

# Saving is disabled; uncomment to (over)write the stored results:
#with open(fperf_mf, 'wb') as fd:
#    pkl.dump(mf_perf, fd)

# Read the stored results back.  The `with` block closes the file handle,
# which a bare `pkl.load(open(...))` leaves open.
# NOTE(security): only unpickle files produced by a trusted pipeline.
with open(fperf_mf, 'rb') as fd:
    saved_perf = pkl.load(fd)
saved_perf

Example: SVD on the MovieLens-100k dataset with Surprise



In [ ]:

    
# pip install scikit-surprise
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split



In [ ]:

    
# Load the built-in MovieLens-100k dataset (it is downloaded first if it is
# not available locally).
data = Dataset.load_builtin('ml-100k')



In [ ]:

    
# Randomly partition the ratings: 25% form the test set, the remainder the
# training set.  The split is seeded with SEED so that re-running the notebook
# produces the same partition.
trainset, testset = train_test_split(data, test_size=.25, random_state=SEED)



In [ ]:

    
# We'll use the famous SVD algorithm with its default hyper-parameters.
# The model is seeded with SEED so its random initialisation is the same on
# every run.
algo = SVD(random_state=SEED)



In [ ]:

    
# Fit the SVD model on the training part of the split.
algo.fit(trainset)



In [ ]:

    
# IPython help query (disabled): `algo.test?` shows the documentation of algo.test
#algo.test?



In [ ]:

    
# Run the fitted model over the held-out test set and keep the predictions.
predictions = algo.test(testset)



In [ ]:

    
# Inspect what kind of container `algo.test` returned.
type(predictions)



In [ ]:

    
# Look at one individual prediction record (the one at index 2).
predictions[2]



In [ ]:

    
# `.est` holds the estimated rating of that prediction.
predictions[2].est



In [ ]:

    
# Finally, compute the root-mean-square error (RMSE) of the test predictions.
accuracy.rmse(predictions)