In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
from polara import (SVDModel, PopularityModel, RandomModel,
                    RecommenderData, get_movielens_data)
from polara.recommender.models import CoffeeModel
from polara.evaluation import evaluation_engine as ee
from polara.tools.preprocessing import filter_sessions_by_length
from polara.evaluation.plotting import show_hit_rates, show_precision_recall, show_ranking
In [2]:
DATA_NAME = 'ml-1m'
DATA_FILE = 'D:/datasets/recsys/movielens/{}.zip'.format(DATA_NAME)  # path to the Movielens-1M zip file
# set DATA_FILE to None to download the data from GroupLens automatically
SESS_SIZE = 20  # minimum number of ratings required to keep a user
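If the zip file is not present at that path, the loader can fetch the dataset from GroupLens directly, as the comment above notes; a minimal sketch:

# download Movielens-1M from GroupLens instead of reading a local file
ml_data = get_movielens_data(local_file=None, get_genres=False)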
In [3]:
ml_data = get_movielens_data(local_file=DATA_FILE, get_genres=False)
In [4]:
ml_data = filter_sessions_by_length(ml_data, min_session_length=SESS_SIZE)  # drop users with fewer than SESS_SIZE ratings
data_model = RecommenderData(ml_data, 'userid', 'movieid', 'rating', seed=0)
data_model.name = DATA_NAME
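Before fitting anything, it can be useful to inspect the split that RecommenderData produces; a short sketch, assuming polara's standard prepare/training/test attributes:

data_model.prepare()                  # performs the train/test split internally
print(data_model.training.shape)      # training portion of the ratings
print(data_model.test.testset.shape)  # test users' observed feedback
print(data_model.test.evalset.shape)  # held-out items used for scoring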
In [5]:
data_model.get_configuration()
Out[5]:
In [6]:
svd = SVDModel(data_model)
popular = PopularityModel(data_model)
random = RandomModel(data_model, seed=0)
coffee = CoffeeModel(data_model)
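All four models expose the same two-step interface, which the evaluation engine drives internally; a sketch using polara's build/get_recommendations methods:

svd.build()                       # fit the model on the training data
recs = svd.get_recommendations()  # top-n recommendations for the test users
print(recs.shape)                 # (number of test users, top-n)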
In [7]:
coffee.flattener = [3, 4]  # rating-mode indices 3 and 4, i.e. ratings 4 and 5
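In the CoFFee model the flattener selects which slices of the rating mode of the reconstructed tensor contribute to item scores; with the five rating values stored at zero-based indices 0-4, slices 3 and 4 correspond to ratings 4 and 5. A quick sanity check of that mapping (illustrative only):

print([idx + 1 for idx in coffee.flattener])  # index -> rating value: [4, 5]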
In [8]:
models = [svd, coffee, popular, random]
model_names = [model.method for model in models]
model_names
Out[8]:
In [9]:
for model in models:
    model.switch_positive = 4  # ratings of 4 and above count as positive feedback in evaluation
In [10]:
for model in models:
    try:
        rank = model.rank  # matrix factorization rank (SVD)
    except AttributeError:
        try:
            rank = model.mlrank  # multilinear rank of the tensor factorization (CoFFee)
        except AttributeError:
            continue  # non-factorization baselines have no rank attribute
    print('{} rank: {}'.format(model.method, rank))
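The printed values are the models' default ranks; both can be tuned before training (the numbers below are purely illustrative, not recommended settings):

svd.rank = 50                # rank of the truncated SVD (hypothetical value)
coffee.mlrank = (13, 10, 2)  # multilinear rank of the Tucker decomposition (hypothetical value)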
In [11]:
metrics = ['ranking', 'relevance', 'experience']  # metric families to compute
topk_list = [1, 2, 3, 5, 10, 15, 20, 30, 50, 70, 100]  # top-n values to evaluate at
test_samples = [0, -1]  # 0: full known history; -1: sample from the lowest-rated (negative) feedback
folds = [1, 2, 3, 4, 5]  # 5-fold cross-validation
In [12]:
data_model.holdout_size = 10  # withhold 10 items per test user for evaluation
data_model.random_holdout = True  # choose held-out items at random rather than by rating
In [13]:
result = {}
for test_sample in test_samples:
    data_model.test_sample = test_sample
    result[test_sample] = ee.run_cv_experiment(models,
                                               folds,
                                               metrics,
                                               fold_experiment=ee.topk_test,
                                               topk_list=topk_list)
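run_cv_experiment returns one DataFrame per sampling mode; its row index carries the top-n and model levels used in the aggregation below, plus (presumably) a fold level. A quick way to check the layout (a sketch):

res = result[0]
print(res.index.names)  # expected to include 'top-n' and 'model'
print(res.columns)      # metric columns, grouped into the requested families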
In [14]:
test_sample = 0  # full-history scenario
In [15]:
scores = result[test_sample].mean(axis=0, level=['top-n', 'model'])  # average over folds
deviation = result[test_sample].std(axis=0, level=['top-n', 'model'])  # fold-to-fold spread
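Note that the level= argument of mean/std shown above was deprecated and later removed in pandas 2.0; on recent pandas the same aggregation is written with groupby:

scores = result[test_sample].groupby(level=['top-n', 'model']).mean()
deviation = result[test_sample].groupby(level=['top-n', 'model']).std()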
In [16]:
scores['ranking'].unstack('model')  # ranking metrics with one column per model
Out[16]:
In [17]:
ERR_ALPHA = 0.1  # opacity of the error bands drawn around the curves
In [18]:
show_hit_rates(scores, errors=deviation, err_alpha=ERR_ALPHA)
In [19]:
show_precision_recall(scores, errors=deviation, err_alpha=ERR_ALPHA)
In [20]:
show_ranking(scores, errors=deviation, err_alpha=ERR_ALPHA)
In [21]:
test_sample = -1  # negative-feedback scenario
In [22]:
scores = result[test_sample].mean(axis=0, level=['top-n', 'model'])  # same aggregation for the negative-feedback results
deviation = result[test_sample].std(axis=0, level=['top-n', 'model'])
In [23]:
show_hit_rates(scores, errors=deviation, err_alpha=ERR_ALPHA)
In [24]:
show_precision_recall(scores, errors=deviation, err_alpha=ERR_ALPHA)
In [25]:
show_ranking(scores, errors=deviation, err_alpha=ERR_ALPHA)