In [1]:
from __future__ import division
import sys
import numpy as np
import scipy as sp
import pandas as pd
from timeit import default_timer as timer
import matplotlib.pyplot as plt
%matplotlib inline
from os.path import basename

import seaborn as sns
sns.set_style('white')

from polara.recommender.data import RecommenderData, RecommenderDataPositive
from polara.recommender.models import SVDModel, CoffeeModel, NonPersonalized
from polara.evaluation import evaluation_engine as ee
from polara.evaluation.plotting import show_hits, show_hit_rates, show_precision_recall, show_ranking, show_relevance, show_ranking_positivity
from polara.tools.mymedialite.mmlwrapper import MyMediaLiteWrapper
from polara.tools.movielens import get_movielens_data, filter_short_head
from polara.tools.printing import print_frames

In [2]:
ml_file = "ml-1m.zip"
if sys.platform == 'win32':
    lib_path = 'MyMediaLite-3.11/lib/mymedialite' 
else:
    lib_path = 'MyMediaLite-3.11/bin'
data_folder = 'MyMediaLiteData'

def get_file_name(filepath):
    return ''.join(basename(filepath).split('.')[:-1])

In [3]:
ml_data = get_movielens_data(local_file=ml_file)
movielens = RecommenderData(ml_data, 'userid', 'movieid', 'rating')
movielens.name = get_file_name(ml_file)

In [4]:
movielens.holdout_size = 1
movielens.shuffle_data = True
movielens.test_sample = None
movielens.random_holdout = False
movielens.permute_tops = True

In [5]:
coffee = CoffeeModel(movielens)
coffee.mlrank


Out[5]:
(13, 10, 2)

In [6]:
coffee.build()


Preparing data
CoFFee model training time: 2.18131835875s

In [7]:
v, w = coffee._items_factors, coffee._feedback_factors

In [8]:
rating_model = movielens
rating_model.index.feedback.T


Out[8]:
0 1 2 3 4
new 0 1 2 3 4
old 1 2 3 4 5

In [9]:
pos_rating_idx = rating_model.index.feedback.set_index('old').loc[coffee.switch_positive, 'new']
pos_rating_idx


Out[9]:
3

In [10]:
hit_score = {}
almost_score = {}
fail_score = {}
hidden_pos = {}
rating_diff = {}

num_users = rating_model.test.testset.userid.nunique()

for user_id in rating_model.test.testset.userid.unique():
    user_data = rating_model.test.testset.query('userid==@user_id')
    rating_data = rating_model.index.feedback.set_index('old').loc[user_data.rating.values, 'new'].values
    movies_data = user_data.movieid.values

    user_pref = sp.sparse.coo_matrix((np.ones_like(movies_data), (movies_data, rating_data)), shape = (v.shape[0], w.shape[0]))
    recs = v.dot((v.T.dot(user_pref.A).dot(w)).dot(w.T))
        
    hidden_movie = rating_model.test.evalset.query('userid==@user_id').movieid.iloc[0]
    hidden_rating = rating_model.test.evalset.query('userid==@user_id').rating.iloc[0]
    hidden_rating_idx = rating_model.index.feedback.query('old == @hidden_rating').new.iloc[0]
    
    predicted_rating_idx = recs[hidden_movie, :].argmax()

    if predicted_rating_idx == hidden_rating_idx:
        hit_score[user_id] = 1
    else:
        rating_diff[user_id] = hidden_rating_idx - predicted_rating_idx
        
        is_almost_top = ((predicted_rating_idx >= pos_rating_idx) and (hidden_rating_idx >= pos_rating_idx) or
                         (predicted_rating_idx <  pos_rating_idx) and (hidden_rating_idx <  pos_rating_idx))
        if is_almost_top:
            almost_score[user_id] = 1
        else:
            fail_score[user_id] = 1

RMSE result


In [11]:
RMSE = np.sqrt(np.sum(rating_diff.values())/num_users)
RMSE


Out[11]:
0.76718717334194098

In [12]:
bingo = sum(hit_score.values())
bingo


Out[12]:
592

In [13]:
positive = sum(almost_score.values())
positive


Out[13]:
538

In [14]:
fail = sum(fail_score.values())
fail


Out[14]:
78

In [15]:
num_users


Out[15]:
1208

In [16]:
bingo / num_users


Out[16]:
0.4900662251655629

In [17]:
positive / num_users


Out[17]:
0.445364238410596

In [18]:
fail / num_users


Out[18]:
0.06456953642384106