In [1]:
from __future__ import print_function
from collections import namedtuple
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
In [2]:
from polara.recommender.evaluation import assemble_scoring_matrices, build_rank_matrix, matrix_from_observations, split_positive, generate_hits_data
from polara.recommender.evaluation import get_mrr_score, get_ndcr_discounts, get_ndcg_score, get_ndcl_score
from polara.recommender.evaluation import get_hits, get_relevance_scores, get_ranking_scores
from polara.datasets.movielens import get_movielens_data
from polara.recommender.data import RecommenderData
from polara.recommender.models import SVDModel
Based on https://en.wikipedia.org/wiki/Discounted_cumulative_gain
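For reference, the standard definitions used below (following that page): for a list of $k$ recommended items, where $rel_i$ is the relevance (rating) of the item at position $i$,

$$\mathrm{DCG@k} = \sum_{i=1}^{k} \frac{rel_i}{\log_2(i+1)}, \qquad \mathrm{nDCG@k} = \frac{\mathrm{DCG@k}}{\mathrm{IDCG@k}},$$

where IDCG@k is the DCG of the ideally (by relevance) ordered items; the `alternative` formulation replaces $rel_i$ with $2^{rel_i}-1$.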
In [3]:
swp = None
data = pd.DataFrame({'userid': [0, 0, 0, 0, 0, 0, 0, 0],
                     'movieid': [0, 1, 2, 3, 4, 5, 6, 7],
                     'rating': [3, 2, 3, 0, 1, 2, 3, 2]})
recs = np.array([[0,1,2,3,4,5]])
hsz = data.shape[0]
In [4]:
topk = recs.shape[1]
shp = (recs.shape[0], max(recs.max(), data['movieid'].max())+1)
In [5]:
rankm, hrank, mrank, evalm, ehits, emiss = assemble_scoring_matrices(recs, data, 'userid', 'movieid', None, 'rating')
In [6]:
discm, idisc = get_ndcr_discounts(rankm, evalm, topk)
In [7]:
get_ndcg_score(ehits, discm, idisc, alternative=False)
Out[7]:
The result is slightly worse than expected (the expected value is 0.785), as the normalization is based on the full holdout, not just the top-k elements.
This is intentional behavior in order to support NDCL score calculation when switch_positive is set.
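As a quick sanity check, here is a plain NumPy recomputation of the toy example (not a polara call): normalizing by the ideal ordering of the top-k items only gives ≈0.785, while normalizing by the full holdout gives a slightly lower value, consistent with the note above.
In [ ]:
ratings = np.array([3, 2, 3, 0, 1, 2, 3, 2])  # holdout relevances for items 0..7
rec_items = np.array([0, 1, 2, 3, 4, 5])      # the 6 recommended items
discount = np.log2(np.arange(2, len(ratings) + 2))

dcg = (ratings[rec_items] / discount[:len(rec_items)]).sum()
ideal = np.sort(ratings)[::-1]                # ideally ordered holdout relevances
print('nDCG, top-k ideal:  ', dcg / (ideal[:len(rec_items)] / discount[:len(rec_items)]).sum())
print('nDCG, full holdout: ', dcg / (ideal / discount).sum())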
In [8]:
swp = 3
data = pd.DataFrame({'userid': [0, 0, 1, 1, 2, 2],
                     'movieid': [0, 1, 2, 3, 4, 5],
                     'rating': [2, 3, 1, 3, 5, 4]})
recs = np.array([[1,0], [2,3], [5,4]])
hsz = 2
In [9]:
topk = recs.shape[1]
shp = (recs.shape[0], max(recs.max(), data['movieid'].max())+1)
In [10]:
data.set_index(['userid', 'movieid']).sort_index()
Out[10]:
In [11]:
if swp is None:
    is_positive = None
else:
    is_positive = data.rating >= swp
In [12]:
rankm, hrank, mrank, evalm, ehits, emiss = assemble_scoring_matrices(recs, data, 'userid', 'movieid', is_positive, 'rating')
In [13]:
discm, idisc = get_ndcr_discounts(rankm, evalm, topk)
In [14]:
get_ndcg_score(ehits, discm, idisc, alternative=False)
Out[14]:
In [15]:
get_ndcl_score(emiss, discm, idisc, swp, alternative=False)
Out[15]:
In [16]:
ml_data = get_movielens_data()
In [17]:
ml_data.head()
Out[17]:
In [18]:
dm = RecommenderData(ml_data, 'userid', 'movieid', 'rating', seed=0)
In [19]:
dm.get_configuration()
Out[19]:
In [20]:
dm.random_holdout = True
dm.prepare()
In [21]:
svd = SVDModel(dm)
svd.rank = 50
In [22]:
svd.build()
In [23]:
swp = 4
svd.switch_positive = swp
data = dm.test.holdout
recs = svd.recommendations
hsz = dm.holdout_size
In [24]:
topk = recs.shape[1]
shp = (recs.shape[0], max(recs.max(), data['movieid'].max())+1)
In [25]:
if swp is None:
    is_positive = None
else:
    is_positive = (data.rating >= swp).values
In [26]:
rankm, hrank, mrank, evalm, ehits, emiss = assemble_scoring_matrices(recs, data, 'userid', 'movieid', is_positive, 'rating')
In [27]:
evalm
Out[27]:
In [28]:
ehits
Out[28]:
In [29]:
emiss
Out[29]:
In [30]:
discm, idisc = get_ndcr_discounts(rankm, evalm, topk)
In [31]:
discm
Out[31]:
In [32]:
idisc
Out[32]:
In [33]:
get_ndcg_score(ehits, discm, idisc, alternative=False)
Out[33]:
In [34]:
get_ndcl_score(emiss, discm, idisc, swp, alternative=False)
Out[34]:
In [35]:
get_mrr_score(hrank)
Out[35]:
Compare with the previous implementation
In [36]:
def get_matched_predictions(eval_data, holdout_size, recs):
    # boolean tensor of shape (users, topk, holdout): True where the item
    # recommended at position i equals holdout item j for a given user
    userid, itemid = 'userid', 'movieid'
    holdout_data = eval_data[itemid]
    holdout_matrix = holdout_data.values.reshape(-1, holdout_size).astype(np.int64)
    matched_predictions = (recs[:, :, None] == holdout_matrix[:, None, :])
    return matched_predictions

def get_feedback_data(eval_data, holdout_size):
    # holdout ratings reshaped into a (users, holdout_size) matrix
    feedback = 'rating'
    eval_data = eval_data[feedback].values
    feedback_data = eval_data.reshape(-1, holdout_size)
    return feedback_data

def get_rnkng_scores(eval_data, holdout_size, recs, switch_positive=None, alternative=False):
    # reference (dense numpy) implementation of the nDCG / nDCL ranking scores
    matched_predictions = get_matched_predictions(eval_data, holdout_size, recs)
    feedback_data = get_feedback_data(eval_data, holdout_size)
    users_num, topk, holdout = matched_predictions.shape

    ideal_scores_idx = np.argsort(feedback_data, axis=1)[:, ::-1]  # returns column index only
    ideal_scores_idx = np.ravel_multi_index((np.arange(feedback_data.shape[0])[:, None],
                                             ideal_scores_idx), dims=feedback_data.shape)

    where = np.where
    is_positive = feedback_data >= switch_positive
    positive_feedback = where(is_positive, feedback_data, 0)
    negative_feedback = where(~is_positive, feedback_data - switch_positive, 0)

    relevance_scores_pos = (matched_predictions * positive_feedback[:, None, :]).sum(axis=2)
    relevance_scores_neg = (matched_predictions * negative_feedback[:, None, :]).sum(axis=2)
    ideal_scores_pos = positive_feedback.ravel()[ideal_scores_idx]
    ideal_scores_neg = negative_feedback.ravel()[ideal_scores_idx]

    if alternative:
        relevance_scores_pos = 2**relevance_scores_pos - 1
        relevance_scores_neg = 2.0**relevance_scores_neg - 1
        ideal_scores_pos = 2**ideal_scores_pos - 1
        ideal_scores_neg = 2.0**ideal_scores_neg - 1

    disc_num = max(topk, holdout)
    discount = np.log2(np.arange(2, disc_num + 2))
    dcg = (relevance_scores_pos / discount[:topk]).sum(axis=1)
    dcl = (relevance_scores_neg / -discount[:topk]).sum(axis=1)
    idcg = (ideal_scores_pos / discount[:holdout]).sum(axis=1)
    idcl = (ideal_scores_neg / -discount[:holdout]).sum(axis=1)

    with np.errstate(invalid='ignore'):
        ndcg = np.nansum(dcg / idcg) / users_num
        ndcl = np.nansum(dcl / idcl) / users_num

    ranking_score = namedtuple('Ranking', ['nDCG', 'nDCL'])._make([ndcg, ndcl])
    return ranking_score
In [37]:
get_rnkng_scores(data, hsz, recs, switch_positive=swp, alternative=False)
Out[37]:
In [38]:
get_ranking_scores(rankm, hrank, mrank, evalm, ehits, emiss, switch_positive=swp, topk=topk, alternative=False)
Out[38]:
In [ ]:
In [39]:
svd.evaluate('hits', not_rated_penalty=None)
Out[39]:
In [40]:
svd.evaluate('relevance')
Out[40]:
In [ ]:
In [41]:
from polara.recommender import defaults
In [42]:
defaults.ndcg_alternative = False
In [43]:
svd.evaluate('ranking')
Out[43]:
In [ ]:
In [44]:
svd.evaluate('ranking', topk=1)
Out[44]:
In [45]:
test_user = 98
test_data = svd.data.test.holdout.query('userid=={}'.format(test_user))
test_recs = svd.recommendations[test_user, :]
In [46]:
topk = len(test_recs)
In [47]:
print(test_recs)
test_data
Out[47]:
In [48]:
test_data.loc[:, 'movieid'].isin(test_recs)
Out[48]:
In [49]:
(rankm, hrank, mrank,
 evalm, ehits, emiss) = assemble_scoring_matrices(test_recs, test_data,
                                                  svd._key, svd._target,
                                                  (test_data.rating >= swp).values,
                                                  feedback='rating')
In [50]:
hrank.data
Out[50]:
In [51]:
hrank.indices
Out[51]:
In [52]:
ehits.data
Out[52]:
In [53]:
ehits.indices
Out[53]:
In [54]:
discm, idisc = get_ndcr_discounts(rankm, evalm, topn=2)
In [55]:
discm.data
Out[55]:
In [56]:
discm.indices
Out[56]:
In [57]:
idisc.data
Out[57]:
In [58]:
idisc.indices
Out[58]:
NDCG
In [59]:
get_ndcg_score(ehits, discm, idisc, alternative=False)
Out[59]:
In [60]:
print('rec rank', np.where(np.isin(test_recs, test_data.movieid))[0] + 1)
print('rec item', test_recs[np.isin(test_recs, test_data.movieid)])
NDCL
In [61]:
emiss.data
Out[61]:
In [62]:
emiss.indices
Out[62]:
In [63]:
emiss.multiply(discm).data
Out[63]:
In [64]:
emiss.multiply(idisc)
Out[64]:
In [65]:
get_ndcl_score(emiss, discm, idisc, swp, alternative=False)
Out[65]:
In [ ]:
This is basically due to the NDCL metric being "the lower the better", which means that the ideal score is 0.
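In terms of the reference implementation above (with switch_positive $s$, where $rel_i$ is the rating of the item recommended at position $i$):

$$\mathrm{DCL@k} = \sum_{i=1}^{k} \frac{(rel_i - s)\,\mathbb{1}[rel_i < s]}{-\log_2(i+1)}, \qquad \mathrm{nDCL} = \frac{\mathrm{DCL@k}}{\mathrm{IDCL}},$$

so DCL is non-negative, its best attainable value is 0 (no negatively rated items recommended), and IDCL can itself be 0 when the normalization set contains no negative items.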
Regular case
In [66]:
cg = lambda rel, pos: rel / np.log2(1+pos)
print('dcg ', cg(5, 9))
print('idcg', cg(5, 1) + cg(5, 2))
print('ndcg', cg(5, 9) / (cg(5, 1) + cg(5, 2)))
Singular, but still OK
In [67]:
cl = lambda rel, pos: (np.exp(rel-4)-1) / (-np.log2(1+pos))
print('dcl ', 0)
print('idcl', 0)
with np.errstate(invalid='ignore'):
    print('ndcl', np.array([0.]) / np.array([0.]))
Broken case
when DCL is above zero and IDCL is exactly 0 (because only the top-k results are selected, where negative examples are not included at all)
In [68]:
cl = lambda rel, pos: (np.exp(rel-4)-1) / (-np.log2(1+pos))
print('dcl ', cl(3, 3))
print('idcl', 0)
with np.errstate(invalid='ignore'): # will not catch an error
    print('ndcl', cl(3, 3) / np.array([0.]))
Therefore, with standard normalization, NDCL may produce inf, which doesn't make a lot of sense, especially when trying to average across many users.
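To illustrate why this hurts averaging (hypothetical per-user nDCL values, not produced by polara): np.nansum only ignores NaN, not inf, so a single "broken" user makes the averaged score infinite.
In [ ]:
# hypothetical per-user nDCL values; one user hits the broken inf case
per_user_ndcl = np.array([0.2, 0.0, np.inf, 0.1])
print(np.nansum(per_user_ndcl) / len(per_user_ndcl))  # inf -- one broken user dominates the average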