In [1]:
from __future__ import print_function
from collections import namedtuple

import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

In [2]:
from polara.recommender.evaluation import assemble_scoring_matrices, build_rank_matrix, matrix_from_observations, split_positive, generate_hits_data
from polara.recommender.evaluation import get_mrr_score, get_ndcr_discounts, get_ndcg_score, get_ndcl_score
from polara.recommender.evaluation import get_hits, get_relevance_scores, get_ranking_scores
from polara.datasets.movielens import get_movielens_data
from polara.recommender.data import RecommenderData
from polara.recommender.models import SVDModel

Simple examples


In [3]:
swp = None

data = pd.DataFrame({'userid': [0,0,0,0,0,0,0,0],
                     'movieid': [0,1,2,3,4,5,6,7],
                    'rating':[3, 2, 3, 0, 1, 2, 3, 2]})
recs = np.array([[0,1,2,3,4,5]])
hsz = data.shape[0]

In [4]:
topk = recs.shape[1]
shp = (recs.shape[0], max(recs.max(), data['movieid'].max())+1)

In [5]:
rankm, hrank, mrank, evalm, ehits, emiss = assemble_scoring_matrices(recs, data, 'userid', 'movieid', None, 'rating')

In [6]:
discm, idisc = get_ndcr_discounts(rankm, evalm, topk)

In [7]:
get_ndcg_score(ehits, discm, idisc, alternative=False)


Out[7]:
0.7561640298168335

the result is slightly worse than the standard NDCG@k (expected value is 0.785), because normalization is based on the full holdout, not just the top-k elements
this is intentional behavior, needed to support the NDCL score calculation when switch_positive is set
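
Both numbers can be reproduced by hand for this example (a quick sketch, assuming plain ratings are used as gains since alternative=False; items 0..5 are recommended in exactly that order, so their ratings serve directly as the per-position gains):

import numpy as np

ratings = np.array([3, 2, 3, 0, 1, 2, 3, 2])          # holdout feedback of user 0
discount = np.log2(np.arange(2, 2 + len(ratings)))    # positional discounts log2(pos + 1)

dcg = (ratings[:6] / discount[:6]).sum()              # gains of the 6 recommended items
ideal = np.sort(ratings)[::-1]
idcg_topk = (ideal[:6] / discount[:6]).sum()          # standard normalization: top-6 only
idcg_full = (ideal / discount).sum()                  # full-holdout normalization

print(dcg / idcg_topk)   # ~0.785, the "expected" standard NDCG@6
print(dcg / idcg_full)   # ~0.756, the value returned above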

hand-crafted example


In [8]:
swp = 3

data = pd.DataFrame({'userid': [0,0, 1,1, 2,2],
                     'movieid': [0,1, 2,3, 4,5],
                    'rating':[2,3, 1,3, 5,4]})
recs = np.array([[1,0], [2,3], [5,4]])
hsz = 2

In [9]:
topk = recs.shape[1]
shp = (recs.shape[0], max(recs.max(), data['movieid'].max())+1)

In [10]:
data.set_index(['userid', 'movieid']).sort_index()


Out[10]:
                rating
userid movieid
0      0             2
       1             3
1      2             1
       3             3
2      4             5
       5             4

In [11]:
if swp is None:
    is_positive = None
else:
    is_positive = data.rating>=swp

In [12]:
rankm, hrank, mrank, evalm, ehits, emiss = assemble_scoring_matrices(recs, data, 'userid', 'movieid', is_positive, 'rating')

In [13]:
discm, idisc = get_ndcr_discounts(rankm, evalm, topk)

In [14]:
get_ndcg_score(ehits, discm, idisc, alternative=False)


Out[14]:
0.8606251743711292

In [15]:
get_ndcl_score(emiss, discm, idisc, swp, alternative=False)


Out[15]:
0.861654166907052
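
Both scores for this hand-crafted example can also be checked by hand. Below is a minimal sketch of the per-user computation, assuming positives keep their rating as gain, negatives contribute rating - switch_positive, and a user with zero ideal DCL contributes zero (as the np.nansum in the reference implementation later in this notebook does):

import numpy as np

disc = np.log2(np.arange(2, 4))  # discounts for positions 1 and 2: [1.0, ~1.585]

# gains in recommendation order / in ideal (holdout sorted by rating) order;
# positives keep their rating, negatives contribute rating - swp (swp = 3 here)
pos_gains = [[3, 0], [0, 3], [4, 5]]
ideal_pos = [[3, 0], [3, 0], [5, 4]]
neg_gains = [[0, -1], [-2, 0], [0, 0]]
ideal_neg = [[0, -1], [0, -2], [0, 0]]

ndcg = ndcl = 0.0
for u in range(3):
    dcg = (np.array(pos_gains[u]) / disc).sum()
    idcg = (np.array(ideal_pos[u]) / disc).sum()
    dcl = (np.array(neg_gains[u]) / -disc).sum()
    idcl = (np.array(ideal_neg[u]) / -disc).sum()
    ndcg += dcg / idcg
    ndcl += dcl / idcl if idcl else 0  # user 2 has no negatives -> contributes 0
print(ndcg / 3, ndcl / 3)  # ~0.8606 and ~0.8617, matching the two outputs above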

Movielens


In [16]:
ml_data = get_movielens_data()

In [17]:
ml_data.head()


Out[17]:
   userid  movieid  rating
0       1     1193       5
1       1      661       3
2       1      914       3
3       1     3408       4
4       1     2355       5

In [18]:
dm = RecommenderData(ml_data, 'userid', 'movieid', 'rating', seed=0)

In [19]:
dm.get_configuration()


Out[19]:
{'holdout_size': 3,
 'negative_prediction': False,
 'permute_tops': False,
 'random_holdout': False,
 'shuffle_data': False,
 'test_fold': 5,
 'test_ratio': 0.2,
 'test_sample': None,
 'warm_start': True}

In [20]:
dm.random_holdout = True
dm.prepare()


Preparing data...
19 unique movieid's within 26 testset interactions were filtered. Reason: not in the training data.
1 unique movieid's within 1 holdout interactions were filtered. Reason: not in the training data.
1 of 1208 userid's were filtered out from holdout. Reason: not enough items.
1 userid's were filtered out from testset. Reason: inconsistent with holdout.
Done.

In [21]:
svd = SVDModel(dm)
svd.rank = 50

In [22]:
svd.build()


PureSVD training time: 0.4176868981995412s

In [23]:
swp = 4

svd.switch_positive = swp
data = dm.test.holdout
recs = svd.recommendations
hsz = dm.holdout_size

In [24]:
topk = recs.shape[1]
shp = (recs.shape[0], max(recs.max(), data['movieid'].max())+1)

In [25]:
if swp is None:
    is_positive = None
else:
    is_positive = (data.rating>=swp).values

In [26]:
rankm, hrank, mrank, evalm, ehits, emiss = assemble_scoring_matrices(recs, data, 'userid', 'movieid', is_positive, 'rating')

In [27]:
evalm


Out[27]:
<1207x3687 sparse matrix of type '<class 'numpy.int64'>'
	with 3621 stored elements in Compressed Sparse Row format>

In [28]:
ehits


Out[28]:
<1207x3687 sparse matrix of type '<class 'numpy.int64'>'
	with 2346 stored elements in Compressed Sparse Row format>

In [29]:
emiss


Out[29]:
<1207x3687 sparse matrix of type '<class 'numpy.int64'>'
	with 1275 stored elements in Compressed Sparse Row format>

In [30]:
discm, idisc = get_ndcr_discounts(rankm, evalm, topk)

In [31]:
discm


Out[31]:
<1207x3687 sparse matrix of type '<class 'numpy.float64'>'
	with 12070 stored elements in Compressed Sparse Row format>

In [32]:
idisc


Out[32]:
<1207x3687 sparse matrix of type '<class 'numpy.float64'>'
	with 3621 stored elements in Compressed Sparse Row format>

In [33]:
get_ndcg_score(ehits, discm, idisc, alternative=False)


Out[33]:
0.1699440242225603

In [34]:
get_ndcl_score(emiss, discm, idisc, swp, alternative=False)


Out[34]:
0.06406889699069644

In [35]:
get_mrr_score(hrank)


Out[35]:
Ranking(mrr=0.20079365079365077)

compare with previous implementation


In [36]:
def get_matched_predictions(eval_data, holdout_size, recs):
    userid, itemid = 'userid', 'movieid'
    holdout_data = eval_data[itemid]
    holdout_matrix = holdout_data.values.reshape(-1, holdout_size).astype(np.int64)

    matched_predictions = (recs[:, :, None] == holdout_matrix[:, None, :])
    return matched_predictions

def get_feedback_data(eval_data, holdout_size):
    feedback = 'rating'
    eval_data = eval_data[feedback].values
    feedback_data = eval_data.reshape(-1, holdout_size)
    return feedback_data

def get_rnkng_scores(eval_data, holdout_size, recs, switch_positive=None, alternative=False):
    matched_predictions = get_matched_predictions(eval_data, holdout_size, recs)
    feedback_data = get_feedback_data(eval_data, holdout_size)
    
    users_num, topk, holdout = matched_predictions.shape
    ideal_scores_idx = np.argsort(feedback_data, axis=1)[:, ::-1] #returns column index only
    ideal_scores_idx = np.ravel_multi_index((np.arange(feedback_data.shape[0])[:, None],
                                             ideal_scores_idx), dims=feedback_data.shape)
        
    where = np.where
    is_positive = feedback_data >= switch_positive
    positive_feedback = where(is_positive, feedback_data, 0)
    negative_feedback = where(~is_positive, feedback_data-switch_positive, 0)
    
    relevance_scores_pos = (matched_predictions * positive_feedback[:, None, :]).sum(axis=2)
    relevance_scores_neg = (matched_predictions * negative_feedback[:, None, :]).sum(axis=2)
    ideal_scores_pos = positive_feedback.ravel()[ideal_scores_idx]
    ideal_scores_neg = negative_feedback.ravel()[ideal_scores_idx]
    
    if alternative:
        relevance_scores_pos = 2**relevance_scores_pos - 1
        relevance_scores_neg = 2.0**relevance_scores_neg - 1
        ideal_scores_pos = 2**ideal_scores_pos - 1
        ideal_scores_neg = 2.0**ideal_scores_neg - 1

    disc_num = max(topk, holdout)
    discount = np.log2(np.arange(2, disc_num+2))            
    dcg = (relevance_scores_pos / discount[:topk]).sum(axis=1)
    dcl = (relevance_scores_neg / -discount[:topk]).sum(axis=1)
    idcg = (ideal_scores_pos / discount[:holdout]).sum(axis=1)
    idcl = (ideal_scores_neg / -discount[:holdout]).sum(axis=1)
    
    with np.errstate(invalid='ignore'):
        ndcg = np.nansum(dcg / idcg) / users_num
        ndcl = np.nansum(dcl / idcl) / users_num

    ranking_score = namedtuple('Ranking', ['nDCG', 'nDCL'])._make([ndcg, ndcl])
    return ranking_score

In [37]:
get_rnkng_scores(data, hsz, recs, switch_positive=swp, alternative=False)


Out[37]:
Ranking(nDCG=0.1699440242225603, nDCL=0.06406889699069644)

In [38]:
get_ranking_scores(rankm, hrank, mrank, evalm, ehits, emiss, switch_positive=swp, topk=topk, alternative=False)


Out[38]:
Ranking(nDCG=0.1699440242225603, nDCL=0.06406889699069644)

In [39]:
svd.evaluate('hits', not_rated_penalty=None)


Out[39]:
Hits(true_positive=602, false_positive=132, true_negative=1143, false_negative=1744)

In [40]:
svd.evaluate('relevance')


Out[40]:
Relevance(precision=0.39215686274509803, recall=0.24247445457056063, fallout=0.06890361778514222, specifity=0.6096382214857774, miss_rate=0.6871030102181718)

In [41]:
from polara.recommender import defaults

In [42]:
defaults.ndcg_alternative = False

In [43]:
svd.evaluate('ranking')


Out[43]:
Ranking(nDCG=0.1699440242225603, nDCL=0.06406889699069644)

In [44]:
svd.evaluate('ranking', topk=1)


Out[44]:
Ranking(nDCG=0.07359347041824198, nDCL=0.022039537078199615)

Hand-picked test


In [45]:
test_user = 98
test_data = svd.data.test.holdout.query('userid=={}'.format(test_user))
test_recs = svd.recommendations[test_user, :]

In [46]:
topk = len(test_recs)

In [47]:
print(test_recs)
test_data


[1045 2469 1126 1173 2489  846 2638  524 1130 2553]
Out[47]:
        userid  movieid  rating
820166      98     1130       5
820164      98     1108       5
820140      98     1045       3

In [48]:
test_data.loc[:, 'movieid'].isin(test_recs)


Out[48]:
820166     True
820164    False
820140     True
Name: movieid, dtype: bool

In [49]:
(rankm, hrank, mrank,
 evalm, ehits, emiss) = assemble_scoring_matrices(test_recs, test_data,
                                                  svd._key, svd._target,
                                                  (test_data.rating>=swp).values, feedback='rating')

In [50]:
hrank.data


Out[50]:
array([9], dtype=uint8)

In [51]:
hrank.indices


Out[51]:
array([1130], dtype=int64)

In [52]:
ehits.data


Out[52]:
array([5, 5], dtype=int64)

In [53]:
ehits.indices


Out[53]:
array([1130, 1108])

In [54]:
discm, idisc = get_ndcr_discounts(rankm, evalm, topn=2)

In [55]:
discm.data


Out[55]:
array([1.        , 0.63092975, 0.5       , 0.43067656, 0.38685281,
       0.35620719, 0.33333333, 0.31546488, 0.30103   , 0.28906483])

In [56]:
discm.indices


Out[56]:
array([1045, 2469, 1126, 1173, 2489,  846, 2638,  524, 1130, 2553],
      dtype=int32)

In [57]:
idisc.data


Out[57]:
array([1.        , 0.63092975, 0.5       ])

In [58]:
idisc.indices


Out[58]:
array([1108, 1130, 1045])

NDCG


In [59]:
get_ndcg_score(ehits, discm, idisc, alternative=False)


Out[59]:
0.18457569677956817

In [60]:
print('rec rank', np.where(np.isin(test_recs, test_data.movieid))[0] + 1)
print('rec item', test_recs[np.isin(test_recs, test_data.movieid)])


rec rank [1 9]
rec item [1045 1130]

NDCL


In [61]:
emiss.data


Out[61]:
array([3], dtype=int64)

In [62]:
emiss.indices


Out[62]:
array([1045])

In [63]:
emiss.multiply(discm).data


Out[63]:
array([3.])

In [64]:
emiss.multiply(idisc)


Out[64]:
<1x2639 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [65]:
get_ndcl_score(emiss, discm, idisc, swp, alternative=False)


Out[65]:
2.0
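
A value above 1 already hints at the problem discussed next. It can be reproduced from the matrices shown earlier (a sketch, assuming per-user NDCL is the ratio of discount-weighted negative gains to their ideal counterparts; with a single negative item the exact gain cancels in the ratio anyway): item 1045 (rating 3 < swp) is the only negative item in the holdout, recommended at position 1 (discount 1.0), while in the ideal holdout ranking it would sit last (discount 0.5).

swp = 4                # as set above
gain = 3 - swp         # negative relevance of item 1045 (rating 3)
dcl = gain * 1.0       # discm value at column 1045: recommended at position 1
idcl = gain * 0.5      # idisc value at column 1045: last place in the ideal ranking
print(dcl / idcl)      # 2.0, so NDCL can exceed 1 under this normalization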

Why normalization in NDCG is changed

basically due to the NDCL metric, which follows a "the lower, the better" convention
this means that its ideal score is 0

regular case


In [66]:
cg = lambda rel, pos: rel / np.log2(1+pos)

print('dcg ', cg(5, 9))
print('idcg', cg(5, 1) + cg(5, 2))
print('ndcg', cg(5, 9) / (cg(5, 1) + cg(5, 2)))


dcg  1.505149978319906
idcg 8.154648767857287
ndcg 0.1845756967795682

singular, but still ok


In [67]:
cl = lambda rel, pos: (np.exp(rel-4)-1) / (-np.log2(1+pos))

print('dcl ', 0)
print('idcl', 0)
with np.errstate(invalid='ignore'):
    print('ndcl', np.array([0.]) / np.array([0.]))


dcl  0
idcl 0
ndcl [nan]

broken case
when DCL is above zero but IDCL is exactly 0 (because only the top-k results are selected, so negative examples are not included in the ideal ranking at all)


In [68]:
cl = lambda rel, pos: (np.exp(rel-4)-1) / (-np.log2(1+pos))

print('dcl ', cl(3, 3))
print('idcl', 0)
with np.errstate(invalid='ignore'): # does not suppress the divide-by-zero warning
    print('ndcl', cl(3, 3) / np.array([0.]))


dcl  0.31606027941427883
idcl 0
ndcl [inf]
C:\Users\evfro\Anaconda3\envs\py3_polara\lib\site-packages\ipykernel_launcher.py:6: RuntimeWarning: divide by zero encountered in true_divide
  

therefore, with standard normalization NDCL may produce inf, which doesn't make much sense, especially when trying to average across many users
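
With the full-holdout normalization the ideal list always contains the negative item itself, so IDCL cannot be zero while DCL is positive and the ratio stays finite. A minimal sketch continuing the illustrative lambda above (the ideal position 5 is an arbitrary choice for the item's place in the full holdout ranking):

import numpy as np

cl = lambda rel, pos: (np.exp(rel - 4) - 1) / (-np.log2(1 + pos))

dcl = cl(3, 3)    # negative item hit at recommendation position 3
idcl = cl(3, 5)   # the same item is present in the full holdout, e.g. at position 5
print('ndcl', dcl / idcl)  # finite instead of inf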