Matrix Factorisation


In [ ]:
import os, sys, gzip
from surprise import Dataset
from surprise import Reader
from surprise import SVD
#from surprise import SVDpp
#from surprise import NMF
import pickle as pkl
import numpy as np
from tqdm import tqdm

In [ ]:
from tools import calc_RPrecision_HitRate

In [ ]:
# Seed handed to the factorisation model as its random_state.
SEED = 13579
# Root folder under which the dataset files are looked up.
data_dir = 'data'
# Names of the available datasets; one of them is selected by index later on.
datasets = ['aotm2011', '30music']
# Cut-offs passed to the metric helper; one Hit-Rate is reported per cut-off.
TOPs = [5, 10, 20, 30, 50, 100, 200, 300, 500, 1000]

In [ ]:
# Index into `datasets` choosing which dataset this run works on
# (1 selects '30music', 0 would select 'aotm2011').
dix = 1
dataset_name = datasets[dix]
# Bare expression: shows the chosen name as the cell output.
dataset_name

In [ ]:
# Each record of the file is parsed as comma-separated "user item rating".
reader = Reader(line_format='user item rating', sep=',')

# Path of the `mftrain_<dataset>.3.csv` file inside the dataset's
# `setting2` folder.
fname = os.path.join(
    data_dir,
    '%s/setting2/mftrain_%s.3.csv' % (dataset_name, dataset_name),
)
data_train = Dataset.load_from_file(fname, reader=reader)

In [ ]:
# Matrix-factorisation models offered by Surprise:
# http://surprise.readthedocs.io/en/stable/matrix_factorization.html

# Build the training set from the whole loaded file (no split is made here).
trainset = data_train.build_full_trainset()

# Alternative model kept for reference:
#algo = NMF(n_factors=300, n_epochs=100, verbose=True)
algo = SVD(
    n_factors=500,
    reg_all=0.01,
    random_state=SEED,  # fixed seed so repeated runs start from the same state
    verbose=True,
)
algo.fit(trainset)

In [ ]:
# Folder holding the pickled matrices of the chosen dataset. It is derived
# from `data_dir` (rather than a hard-coded 'data') so it always points at
# the same root the training CSV is read from.
base_dir = os.path.join(data_dir, '%s/setting2' % dataset_name)

# Context managers close each gzip handle as soon as its matrix is unpickled,
# instead of leaving the open files to the garbage collector.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with gzip.open(os.path.join(base_dir, 'Y.pkl.gz'), 'rb') as fd:
    Y = pkl.load(fd)
with gzip.open(os.path.join(base_dir, 'PU_test.pkl.gz'), 'rb') as fd:
    PU_test = pkl.load(fd)

# Y_test is the block made of the last PU_test.shape[1] columns of Y.
Y_test = Y[:, -PU_test.shape[1]:]
print(Y_test.shape)
#Y_test.sum(axis=0)

In [ ]:
# N = number of rows of Y, K = number of columns of Y. Judging by the
# 'U' / 'P' prefixes below these are presumably users and playlists --
# confirm against the data preparation code.
N, K = Y.shape
# String ids later handed to algo.predict(): row i -> 'U<i>', column j -> 'P<j>'.
# Presumably these match the ids written into the training CSV -- TODO confirm.
ustrs = ['U%d' % i for i in range(N)]
istrs = ['P%d' % j for j in range(K)]

In [ ]:
# Results are gathered per test column: one R-Precision value, plus one
# Hit-Rate value for every cut-off in TOPs.
rps_mf = []
hitrates_mf = {top: [] for top in TOPs}

assert Y_test.shape == PU_test.shape
# Column j of the test block is column j + offset of Y, and therefore
# entry j + offset of the item id list `istrs`.
offset = Y.shape[1] - PU_test.shape[1]
n_test_cols = Y_test.shape[1]
for j in tqdm(range(n_test_cols)):
    item_id = istrs[j + offset]
    y1 = Y_test[:, j].toarray().ravel()
    y2 = PU_test[:, j].toarray().ravel()
    # Only the rows whose PU_test entry is zero are scored for this column.
    indices = np.flatnonzero(y2 == 0)
    y_true = y1[indices]
    # One model estimate per scored row, in the same order as y_true.
    y_pred = np.asarray(
        [algo.predict(ustrs[i], item_id).est for i in indices]
    ).reshape(-1)

    rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
    rps_mf.append(rp)
    for top in TOPs:
        hitrates_mf[top].append(hr_dict[top])

In [ ]:
# Average the per-column scores and nest them as
# {dataset name: {'Test': {metric name: value}}}.
mean_rp = np.mean(rps_mf)
mean_hitrates = {top: np.mean(scores) for top, scores in hitrates_mf.items()}
mf_perf = {
    dataset_name: {
        'Test': {
            'R-Precision': mean_rp,
            'Hit-Rate': mean_hitrates,
        },
    },
}
# Bare expression: shows the summary as the cell output.
mf_perf

In [ ]:
# Location of the pickled performance summary for the chosen dataset.
fperf_mf = os.path.join(base_dir, 'perf-mf.pkl')
print(fperf_mf)
# Uncomment to (over)write the stored results with the ones just computed:
#with open(fperf_mf, 'wb') as fd:
#    pkl.dump(mf_perf, fd)

# Read the stored results back inside a context manager so the file handle
# is closed right away instead of being left open until garbage collection.
with open(fperf_mf, 'rb') as fd:
    saved_perf = pkl.load(fd)
# Bare expression: shows the loaded results as the cell output.
saved_perf

Example


In [ ]:
# pip install scikit-surprise
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

In [ ]:
# Load the built-in movielens-100k dataset (it is downloaded first if it is
# not available locally).
data = Dataset.load_builtin('ml-100k')

In [ ]:
# Randomly split the ratings into a training set and a test set;
# test_size=.25 puts 25% of the ratings into the test set.
# No random_state is passed, so the split presumably differs between runs.
# NOTE: this rebinds `trainset`, replacing the one built for the MF model above.
trainset, testset = train_test_split(data, test_size=.25)

In [ ]:
# SVD constructed without arguments, i.e. with the library's defaults.
# NOTE: this rebinds `algo`, replacing the model fitted earlier in this notebook.
algo = SVD()

In [ ]:
# Fit the model on the training split only; `testset` is not passed in here.
algo.fit(trainset)

In [ ]:
#algo.test?

In [ ]:
# Run the fitted model over the test split and keep the resulting predictions.
predictions = algo.test(testset)

In [ ]:
# Inspect what kind of container algo.test() returned.
type(predictions)

In [ ]:
# Look at a single prediction object (the third one).
predictions[2]

In [ ]:
# `est` is the value the model estimated for that prediction.
predictions[2].est

In [ ]:
# Compute the RMSE (root-mean-square error) over all test predictions.
accuracy.rmse(predictions)