In [1]:
from __future__ import print_function
from collections import namedtuple

import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

In [2]:
from polara.recommender.evaluation import assemble_scoring_matrices, build_rank_matrix, matrix_from_observations, split_positive, generate_hits_data
from polara.recommender.evaluation import get_mrr_score, get_ndcr_discounts, get_ndcg_score, get_ndcl_score
from polara.recommender.evaluation import get_hits, get_relevance_scores, get_ranking_scores
from polara.datasets.movielens import get_movielens_data
from polara.recommender.data import RecommenderData
from polara.recommender.models import SVDModel

Simple examples


In [3]:
swp = None

data = pd.DataFrame({'userid': [0,0,0,0,0,0,0,0],
                     'movieid': [0,1,2,3,4,5,6,7],
                    'rating':[3, 2, 3, 0, 1, 2, 3, 2]})
recs = np.array([[0,1,2,3,4,5]])
hsz = data.shape[0]

In [4]:
topk = recs.shape[1]
shp = (recs.shape[0], max(recs.max(), data['movieid'].max())+1)

In [5]:
rankm, hrank, mrank, evalm, ehits, emiss = assemble_scoring_matrices(recs, data, 'userid', 'movieid', None, 'rating')

In [6]:
discm, idisc = get_ndcr_discounts(rankm, evalm, topk)

In [7]:
get_ndcg_score(ehits, discm, idisc, alternative=False)


Out[7]:
0.7561640298168335

the result is slightly worse than the standard NDCG@k (expected value is 0.785), because normalization is based on the full holdout, not just the top-k elements
this is intentional behavior, needed to support the NDCL score calculation when switch_positive is set
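
Both numbers can be reproduced by hand for this example (a quick sketch, assuming plain ratings are used as gains since alternative=False; items 0..5 are recommended in exactly that order, so their ratings serve directly as the per-position gains):

import numpy as np

ratings = np.array([3, 2, 3, 0, 1, 2, 3, 2])          # holdout feedback of user 0
discount = np.log2(np.arange(2, 2 + len(ratings)))    # positional discounts log2(pos + 1)

dcg = (ratings[:6] / discount[:6]).sum()              # gains of the 6 recommended items
ideal = np.sort(ratings)[::-1]
idcg_topk = (ideal[:6] / discount[:6]).sum()          # standard normalization: top-6 only
idcg_full = (ideal / discount).sum()                  # full-holdout normalization

print(dcg / idcg_topk)   # ~0.785, the "expected" standard NDCG@6
print(dcg / idcg_full)   # ~0.756, the value returned above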

hand-crafted example


In [8]:
swp = 3

data = pd.DataFrame({'userid': [0,0, 1,1, 2,2],
                     'movieid': [0,1, 2,3, 4,5],
                    'rating':[2,3, 1,3, 5,4]})
recs = np.array([[1,0], [2,3], [5,4]])
hsz = 2

In [9]:
topk = recs.shape[1]
shp = (recs.shape[0], max(recs.max(), data['movieid'].max())+1)

In [10]:
data.set_index(['userid', 'movieid']).sort_index()


Out[10]:
                rating
userid movieid
0      0             2
       1             3
1      2             1
       3             3
2      4             5
       5             4

In [11]:
if swp is None:
    is_positive = None
else:
    is_positive = data.rating>=swp

In [12]:
rankm, hrank, mrank, evalm, ehits, emiss = assemble_scoring_matrices(recs, data, 'userid', 'movieid', is_positive, 'rating')

In [13]:
discm, idisc = get_ndcr_discounts(rankm, evalm, topk)

In [14]:
get_ndcg_score(ehits, discm, idisc, alternative=False)


Out[14]:
0.8606251743711292

In [15]:
get_ndcl_score(emiss, discm, idisc, swp, alternative=False)


Out[15]:
0.861654166907052
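
Both scores for this hand-crafted example can also be checked by hand. Below is a minimal sketch of the per-user computation, assuming positives keep their rating as gain, negatives contribute rating - switch_positive, and a user with zero ideal DCL contributes zero (as the np.nansum in the reference implementation later in this notebook does):

import numpy as np

disc = np.log2(np.arange(2, 4))  # discounts for positions 1 and 2: [1.0, ~1.585]

# gains in recommendation order / in ideal (holdout sorted by rating) order;
# positives keep their rating, negatives contribute rating - swp (swp = 3 here)
pos_gains = [[3, 0], [0, 3], [4, 5]]
ideal_pos = [[3, 0], [3, 0], [5, 4]]
neg_gains = [[0, -1], [-2, 0], [0, 0]]
ideal_neg = [[0, -1], [0, -2], [0, 0]]

ndcg = ndcl = 0.0
for u in range(3):
    dcg = (np.array(pos_gains[u]) / disc).sum()
    idcg = (np.array(ideal_pos[u]) / disc).sum()
    dcl = (np.array(neg_gains[u]) / -disc).sum()
    idcl = (np.array(ideal_neg[u]) / -disc).sum()
    ndcg += dcg / idcg
    ndcl += dcl / idcl if idcl else 0  # user 2 has no negatives -> contributes 0
print(ndcg / 3, ndcl / 3)  # ~0.8606 and ~0.8617, matching the two outputs above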

Movielens


In [16]:
ml_data = get_movielens_data()

In [17]:
ml_data.head()


Out[17]:
   userid  movieid  rating
0       1     1193       5
1       1      661       3
2       1      914       3
3       1     3408       4
4       1     2355       5

In [18]:
dm = RecommenderData(ml_data, 'userid', 'movieid', 'rating', seed=0)

In [19]:
dm.get_configuration()


Out[19]:
{'holdout_size': 3,
 'negative_prediction': False,
 'permute_tops': False,
 'random_holdout': False,
 'shuffle_data': False,
 'test_fold': 5,
 'test_ratio': 0.2,
 'test_sample': None,
 'warm_start': True}

In [20]:
dm.random_holdout = True
dm.prepare()


Preparing data...
19 unique movieid's within 26 testset interactions were filtered. Reason: not in the training data.
1 unique movieid's within 1 holdout interactions were filtered. Reason: not in the training data.
1 of 1208 userid's were filtered out from holdout. Reason: not enough items.
1 userid's were filtered out from testset. Reason: inconsistent with holdout.
Done.

In [21]:
svd = SVDModel(dm)
svd.rank = 50

In [22]:
svd.build()


PureSVD training time: 0.4176868981995412s

In [23]:
swp = 4

svd.switch_positive = swp
data = dm.test.holdout
recs = svd.recommendations
hsz = dm.holdout_size

In [24]:
topk = recs.shape[1]
shp = (recs.shape[0], max(recs.max(), data['movieid'].max())+1)

In [25]:
if swp is None:
    is_positive = None
else:
    is_positive = (data.rating>=swp).values

In [26]:
rankm, hrank, mrank, evalm, ehits, emiss = assemble_scoring_matrices(recs, data, 'userid', 'movieid', is_positive, 'rating')

In [27]:
evalm


Out[27]:
<1207x3687 sparse matrix of type '<class 'numpy.int64'>'
	with 3621 stored elements in Compressed Sparse Row format>

In [28]:
ehits


Out[28]:
<1207x3687 sparse matrix of type '<class 'numpy.int64'>'
	with 2346 stored elements in Compressed Sparse Row format>

In [29]:
emiss


Out[29]:
<1207x3687 sparse matrix of type '<class 'numpy.int64'>'
	with 1275 stored elements in Compressed Sparse Row format>

In [30]:
discm, idisc = get_ndcr_discounts(rankm, evalm, topk)

In [31]:
discm


Out[31]:
<1207x3687 sparse matrix of type '<class 'numpy.float64'>'
	with 12070 stored elements in Compressed Sparse Row format>

In [32]:
idisc


Out[32]:
<1207x3687 sparse matrix of type '<class 'numpy.float64'>'
	with 3621 stored elements in Compressed Sparse Row format>

In [33]:
get_ndcg_score(ehits, discm, idisc, alternative=False)


Out[33]:
0.1699440242225603

In [34]:
get_ndcl_score(emiss, discm, idisc, swp, alternative=False)


Out[34]:
0.06406889699069644

In [35]:
get_mrr_score(hrank)


Out[35]:
Ranking(mrr=0.20079365079365077)

compare with previous implementation


In [36]:
def get_matched_predictions(eval_data, holdout_size, recs):
    userid, itemid = 'userid', 'movieid'
    holdout_data = eval_data[itemid]
    holdout_matrix = holdout_data.values.reshape(-1, holdout_size).astype(np.int64)

    matched_predictions = (recs[:, :, None] == holdout_matrix[:, None, :])
    return matched_predictions

def get_feedback_data(eval_data, holdout_size):
    feedback = 'rating'
    eval_data = eval_data[feedback].values
    feedback_data = eval_data.reshape(-1, holdout_size)
    return feedback_data

def get_rnkng_scores(eval_data, holdout_size, recs, switch_positive=None, alternative=False):
    matched_predictions = get_matched_predictions(eval_data, holdout_size, recs)
    feedback_data = get_feedback_data(eval_data, holdout_size)
    
    users_num, topk, holdout = matched_predictions.shape
    ideal_scores_idx = np.argsort(feedback_data, axis=1)[:, ::-1] #returns column index only
    ideal_scores_idx = np.ravel_multi_index((np.arange(feedback_data.shape[0])[:, None],
                                             ideal_scores_idx), dims=feedback_data.shape)
        
    where = np.where
    is_positive = feedback_data >= switch_positive
    positive_feedback = where(is_positive, feedback_data, 0)
    negative_feedback = where(~is_positive, feedback_data-switch_positive, 0)
    
    relevance_scores_pos = (matched_predictions * positive_feedback[:, None, :]).sum(axis=2)
    relevance_scores_neg = (matched_predictions * negative_feedback[:, None, :]).sum(axis=2)
    ideal_scores_pos = positive_feedback.ravel()[ideal_scores_idx]
    ideal_scores_neg = negative_feedback.ravel()[ideal_scores_idx]
    
    if alternative:
        relevance_scores_pos = 2**relevance_scores_pos - 1
        relevance_scores_neg = 2.0**relevance_scores_neg - 1
        ideal_scores_pos = 2**ideal_scores_pos - 1
        ideal_scores_neg = 2.0**ideal_scores_neg - 1

    disc_num = max(topk, holdout)
    discount = np.log2(np.arange(2, disc_num+2))            
    dcg = (relevance_scores_pos / discount[:topk]).sum(axis=1)
    dcl = (relevance_scores_neg / -discount[:topk]).sum(axis=1)
    idcg = (ideal_scores_pos / discount[:holdout]).sum(axis=1)
    idcl = (ideal_scores_neg / -discount[:holdout]).sum(axis=1)
    
    with np.errstate(invalid='ignore'):
        ndcg = np.nansum(dcg / idcg) / users_num
        ndcl = np.nansum(dcl / idcl) / users_num

    ranking_score = namedtuple('Ranking', ['nDCG', 'nDCL'])._make([ndcg, ndcl])
    return ranking_score

In [37]:
get_rnkng_scores(data, hsz, recs, switch_positive=swp, alternative=False)


Out[37]:
Ranking(nDCG=0.1699440242225603, nDCL=0.06406889699069644)

In [38]:
get_ranking_scores(rankm, hrank, mrank, evalm, ehits, emiss, switch_positive=swp, topk=topk, alternative=False)


Out[38]:
Ranking(nDCG=0.1699440242225603, nDCL=0.06406889699069644)

In [39]:
svd.evaluate('hits', not_rated_penalty=None)


Out[39]:
Hits(true_positive=602, false_positive=132, true_negative=1143, false_negative=1744)

In [40]:
svd.evaluate('relevance')


Out[40]:
Relevance(precision=0.39215686274509803, recall=0.24247445457056063, fallout=0.06890361778514222, specifity=0.6096382214857774, miss_rate=0.6871030102181718)

In [41]:
from polara.recommender import defaults

In [42]:
defaults.ndcg_alternative = False

In [43]:
svd.evaluate('ranking')


Out[43]:
Ranking(nDCG=0.1699440242225603, nDCL=0.06406889699069644)

In [44]:
svd.evaluate('ranking', topk=1)


Out[44]:
Ranking(nDCG=0.07359347041824198, nDCL=0.022039537078199615)

Hand-picked test


In [45]:
test_user = 98
test_data = svd.data.test.holdout.query('userid=={}'.format(test_user))
test_recs = svd.recommendations[test_user, :]

In [46]:
topk = len(test_recs)

In [47]:
print(test_recs)
test_data


[1045 2469 1126 1173 2489  846 2638  524 1130 2553]
Out[47]:
        userid  movieid  rating
820166      98     1130       5
820164      98     1108       5
820140      98     1045       3

In [48]:
test_data.loc[:, 'movieid'].isin(test_recs)


Out[48]:
820166     True
820164    False
820140     True
Name: movieid, dtype: bool

In [49]:
(rankm, hrank, mrank,
 evalm, ehits, emiss) = assemble_scoring_matrices(test_recs, test_data,
                                                  svd._key, svd._target,
                                                  (test_data.rating>=swp).values, feedback='rating')

In [50]:
hrank.data


Out[50]:
array([9], dtype=uint8)

In [51]:
hrank.indices


Out[51]:
array([1130], dtype=int64)

In [52]:
ehits.data


Out[52]:
array([5, 5], dtype=int64)

In [53]:
ehits.indices


Out[53]:
array([1130, 1108])

In [54]:
discm, idisc = get_ndcr_discounts(rankm, evalm, topn=2)

In [55]:
discm.data


Out[55]:
array([1.        , 0.63092975, 0.5       , 0.43067656, 0.38685281,
       0.35620719, 0.33333333, 0.31546488, 0.30103   , 0.28906483])

In [56]:
discm.indices


Out[56]:
array([1045, 2469, 1126, 1173, 2489,  846, 2638,  524, 1130, 2553],
      dtype=int32)

In [57]:
idisc.data


Out[57]:
array([1.        , 0.63092975, 0.5       ])

In [58]:
idisc.indices


Out[58]:
array([1108, 1130, 1045])

NDCG


In [59]:
get_ndcg_score(ehits, discm, idisc, alternative=False)


Out[59]:
0.18457569677956817

In [60]:
print('rec rank', np.where(np.isin(test_recs, test_data.movieid))[0] + 1)
print('rec item', test_recs[np.isin(test_recs, test_data.movieid)])


rec rank [1 9]
rec item [1045 1130]

NDCL


In [61]:
emiss.data


Out[61]:
array([3], dtype=int64)

In [62]:
emiss.indices


Out[62]:
array([1045])

In [63]:
emiss.multiply(discm).data


Out[63]:
array([3.])

In [64]:
emiss.multiply(idisc)


Out[64]:
<1x2639 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [65]:
get_ndcl_score(emiss, discm, idisc, swp, alternative=False)


Out[65]:
2.0
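
A value above 1 already hints at the problem discussed next. It can be reproduced from the matrices shown earlier (a sketch, assuming per-user NDCL is the ratio of discount-weighted negative gains to their ideal counterparts; with a single negative item the exact gain cancels in the ratio anyway): item 1045 (rating 3 < swp) is the only negative item in the holdout, recommended at position 1 (discount 1.0), while in the ideal holdout ranking it would sit last (discount 0.5).

swp = 4                # as set above
gain = 3 - swp         # negative relevance of item 1045 (rating 3)
dcl = gain * 1.0       # discm value at column 1045: recommended at position 1
idcl = gain * 0.5      # idisc value at column 1045: last place in the ideal ranking
print(dcl / idcl)      # 2.0, so NDCL can exceed 1 under this normalization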

Why normalization in NDCG is changed

basically due to the NDCL metric, which follows a "the lower, the better" convention
this means that its ideal score is 0

regular case


In [66]:
cg = lambda rel, pos: rel / np.log2(1+pos)

print('dcg ', cg(5, 9))
print('idcg', cg(5, 1) + cg(5, 2))
print('ndcg', cg(5, 9) / (cg(5, 1) + cg(5, 2)))


dcg  1.505149978319906
idcg 8.154648767857287
ndcg 0.1845756967795682

singular, but still ok


In [67]:
cl = lambda rel, pos: (np.exp(rel-4)-1) / (-np.log2(1+pos))

print('dcl ', 0)
print('idcl', 0)
with np.errstate(invalid='ignore'):
    print('ndcl', np.array([0.]) / np.array([0.]))


dcl  0
idcl 0
ndcl [nan]

broken case
when DCL is above zero but IDCL is exactly 0 (because only the top-k results are selected, so negative examples are not included in the ideal ranking at all)


In [68]:
cl = lambda rel, pos: (np.exp(rel-4)-1) / (-np.log2(1+pos))

print('dcl ', cl(3, 3))
print('idcl', 0)
with np.errstate(invalid='ignore'): # does not suppress the divide-by-zero warning
    print('ndcl', cl(3, 3) / np.array([0.]))


dcl  0.31606027941427883
idcl 0
ndcl [inf]
C:\Users\evfro\Anaconda3\envs\py3_polara\lib\site-packages\ipykernel_launcher.py:6: RuntimeWarning: divide by zero encountered in true_divide
  

therefore, with standard normalization NDCL may produce inf, which doesn't make much sense, especially when trying to average across many users
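
With the full-holdout normalization the ideal list always contains the negative item itself, so IDCL cannot be zero while DCL is positive and the ratio stays finite. A minimal sketch continuing the illustrative lambda above (the ideal position 5 is an arbitrary choice for the item's place in the full holdout ranking):

import numpy as np

cl = lambda rel, pos: (np.exp(rel - 4) - 1) / (-np.log2(1 + pos))

dcl = cl(3, 3)    # negative item hit at recommendation position 3
idcl = cl(3, 5)   # the same item is present in the full holdout, e.g. at position 5
print('ndcl', dcl / idcl)  # finite instead of inf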