In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(font_scale=1.3)
sns.set_style('white')
mpl.rcParams['text.usetex'] = True #Type1 Fonts
from polara.recommender.data import RecommenderData, RecommenderDataPositive
from polara.recommender.models import SVDModel, CoffeeModel, NonPersonalized
from polara.evaluation import evaluation_engine as ee
from polara.tools.mymedialite.mmlwrapper import MyMediaLiteWrapper
from polara.tools.movielens import get_movielens_data
from polara.evaluation.plotting import show_hit_rates, show_precision_recall, show_ranking
In [2]:
# Dataset identifier: used to build the archive file name, to name the data model
# and to build the names of the saved result files.
DATA_NAME = 'ml-1m'
DATA_FILE = '{}.zip'.format(DATA_NAME)  # path to the MovieLens-1M zip file;
# set it to None to automatically download the data from GroupLens
# Default value of the `session_length` argument of filter_by_length().
SESS_SIZE = 20
In [3]:
# Path to the MyMediaLite binaries; it differs between Windows and other platforms.
LIB_PATH = ('MyMediaLite-3.11/lib/mymedialite' if sys.platform == 'win32'
            else 'MyMediaLite-3.11/bin')
# Folder to store MyMediaLite data (models, data mappings, etc.).
# The folder must exist!
MML_DATA = 'MyMediaLiteData'
In [4]:
# Folder passed to ee.save_scores() as `save_folder`; also the first component
# of the result file path template.
RESULTS_DIR = 'results'
# Experiment tag passed to ee.save_scores() and embedded in the result file names.
EXPERIMENT_NAME = 'ACM'
In [5]:
# Passed as `err_alpha` to the show_* plotting helpers
# (presumably the opacity of the shaded error band -- TODO confirm against polara).
ERR_ALPHA = 0.05
In [6]:
# Read the ratings from the local archive named by DATA_FILE; genres are not requested.
ml_data = get_movielens_data(local_file=DATA_FILE, get_genres=False)
In [7]:
def filter_by_length(data, user_id='userid', session_length=SESS_SIZE):
    """Drop users with an insufficient number of items.

    A user is kept only when the number of rows recorded for that user is
    strictly greater than `session_length`; this is the same selection the
    previous implementation applied, so the filtered data is unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Interaction log with one row per (user, item) event.
    user_id : str
        Name of the column that identifies the user.
    session_length : int
        Threshold on the number of rows per user.

    Returns
    -------
    pandas.DataFrame
        `data` itself when no user has to be removed (including empty input),
        otherwise a new frame restricted to the remaining users.
    """
    sz = data.groupby(user_id, sort=False).size()
    long_enough = sz > session_length
    # Test the mask itself: comparing the boolean mask with `session_length`
    # (as the old check did) is true for every user and made the check vacuous.
    if long_enough.all():
        return data

    valid_users = sz.index[long_enough]
    # Single-argument call form behaves the same on Python 2 and Python 3.
    print('Sessions are filtered by length')
    # Select through the `user_id` argument instead of a hard-coded `userid` column.
    return data[data[user_id].isin(valid_users)]
In [8]:
# Apply the session-length filter, then wrap the ratings into a polara data model
# built from the 'userid', 'movieid' and 'rating' columns.
# NOTE: `ml_data` is rebound here, so re-running this cell filters already filtered data.
ml_data = filter_by_length(ml_data)
data_model = RecommenderData(ml_data, 'userid', 'movieid', 'rating')
data_model.name = DATA_NAME
In [9]:
# Every model below is built on the same `data_model` instance.
# BPRMF and WRMF are run through the MyMediaLite wrapper, which needs the
# binaries location (LIB_PATH) and the working data folder (MML_DATA).
bpr = MyMediaLiteWrapper(LIB_PATH, MML_DATA, 'BPRMF', data_model)
wrmf = MyMediaLiteWrapper(LIB_PATH, MML_DATA, 'WRMF', data_model)
svd = SVDModel(data_model)
popular = NonPersonalized('mostpopular', data_model)
# NOTE: this name would shadow the stdlib `random` module if it were ever imported here.
random = NonPersonalized('random', data_model)
coffee = CoffeeModel(data_model)
In [10]:
# Order matters: the following cell reports ranks for the first four entries only.
models = [bpr, wrmf, svd, coffee, popular, random]
model_names = [model.method for model in models]
# Metric groups requested from ee.topk_test(); the same names are used later
# to locate the saved result files.
metrics = ['ranking', 'relevance']
model_names
Out[10]:
In [11]:
# Report the rank of the first four models in `models`.
for model in models[:4]:
    try:
        rank = model.rank
    except AttributeError:
        # models without a `rank` attribute are read through `mlrank` instead
        rank = model.mlrank
    # Single-argument call form behaves the same on Python 2 and Python 3.
    print('{} rank: {}'.format(model.method, rank))
In [12]:
# Values passed as `topk_list` to ee.topk_test().
topk_list = [1, 2, 3, 5, 10, 15, 20, 30, 50, 70, 100]
# Values assigned in turn to data_model.test_sample
# (the meaning of -1 and 0 is defined by polara -- TODO confirm).
test_samples = [-1, 0]
# Values assigned in turn to data_model.test_fold.
folds = [1, 2, 3, 4, 5]
In [13]:
# Holdout settings of the data model (presumably 10 randomly selected items
# are held out per test user -- TODO confirm against polara).
# NOTE(review): no random seed is set anywhere in this notebook.
data_model.holdout_size = 10
data_model.random_holdout = True
In [14]:
# Run every model for each test-sample setting and each fold, then save the scores.
# NOTE(review): the exported source carries no indentation; the nesting below
# (folds inside test samples, consolidation once per test sample, a single save
# at the very end) is reconstructed from the data flow -- confirm against the notebook.
result = {}
topk_result = {}
for test_sample in test_samples:
    data_model.test_sample = test_sample
    # Single-argument call form behaves the same on Python 2 and Python 3.
    print('\n\n========= Test sample: {} =========\n'.format(test_sample))
    for fold in folds:
        print('\n============ Fold: {} ============='.format(fold))
        data_model.test_fold = fold
        # every fold key is overwritten on each pass of the outer loop
        topk_result[fold] = ee.topk_test(models, topk_list=topk_list, metrics=metrics)
    result[test_sample] = ee.consolidate_folds(topk_result, folds, metrics)
ee.save_scores(result, DATA_NAME, EXPERIMENT_NAME, save_folder=RESULTS_DIR)
In [15]:
# Path template of a saved result file. RESULTS_DIR, DATA_NAME and EXPERIMENT_NAME
# are substituted right away; the doubled braces leave two placeholders that
# get_results() fills with the test sample and the metric name.
file_path_tmpl = "{}/{}_{}_({{}})_{{}}.csv".format(RESULTS_DIR, DATA_NAME, EXPERIMENT_NAME)
In [16]:
def get_results(metric, test_sample):
    """Load the saved scores of one metric group and average them.

    The CSV file is located by filling `file_path_tmpl` with `test_sample`
    and `metric`; it is read with two header rows and two index columns and
    handed to `ee.average_results` as a one-entry dict keyed by `metric`.

    Returns the pair produced by `ee.average_results` (presumably the
    averaged scores and their errors -- confirm against polara).
    """
    scores = pd.read_csv(file_path_tmpl.format(test_sample, metric),
                         header=[0, 1], index_col=[0, 1])
    averaged, errors = ee.average_results({metric: scores})
    return averaged, errors
In [17]:
def format_plots(ax):
    """Anchor every y-axis at zero and reuse the first panel's colours.

    For each axes in `ax` the lower y-limit is set to 0. The colours of the
    legend handles of the first axes are recorded and then applied, position
    by position, to the legend handles of every following axes.

    Handles are read through `get_color`/`set_color`; artists lacking these
    methods fall back to `get_facecolor`/`set_facecolor`. Panels with no
    legend entries, or with more entries than the first panel, are handled
    without raising: surplus handles simply keep their own colour.

    The previous version additionally sorted and renamed the legend labels
    into local variables that were never used or returned; that code had no
    visible effect and has been removed. Returns None.
    """
    reference_colors = []
    for i, axi in enumerate(ax):
        # `bottom` is the documented keyword; the `ymin` alias was deprecated
        # in matplotlib 3.0 and removed afterwards.
        axi.set_ylim(bottom=0)
        handles, _ = axi.get_legend_handles_labels()
        if i == 0:
            for handle in handles:
                try:  # workaround for matplotlib < 1.5
                    reference_colors.append(handle.get_color())
                except AttributeError:
                    reference_colors.append(handle.get_facecolor())
        else:
            # zip stops at the shorter sequence, so extra handles are left as is
            for handle, color in zip(handles, reference_colors):
                try:
                    handle.set_color(color)
                except AttributeError:
                    handle.set_facecolor(color)
def format_labels(ax):
    """Set the axis labels of the second, third and fourth panels.

    ax[1] gets Recall/Precision labels; ax[2] and ax[3] share the
    'top-$n$' x-label and get the nDCG and nDCL y-labels respectively.
    ax[0] is left untouched.
    """
    panel_labels = (
        (2, 'top-$n$', 'nDCG@$n$'),
        (3, 'top-$n$', 'nDCL@$n$'),
        (1, 'Recall@$n$', 'Precision@$n$'),
    )
    for position, x_text, y_text in panel_labels:
        panel = ax[position]
        panel.set_xlabel(x_text)
        panel.set_ylabel(y_text)
In [18]:
# Test-sample setting whose saved results are loaded and plotted by the next two cells.
test_sample = -1
In [19]:
# Load the saved 'relevance' and 'ranking' results for the selected test sample;
# each call returns a (results, errors) pair.
res, err = get_results('relevance', test_sample)
resr, errr = get_results('ranking', test_sample)
In [20]:
# One row of four panels: hit rates, precision-recall and two ranking plots.
fig, ax = plt.subplots(1, 4, figsize=(16, 3))
show_hit_rates(res, errors=err, err_alpha=ERR_ALPHA, ax=ax[0], ROC_middle=True);
show_precision_recall(res, errors=err, err_alpha=ERR_ALPHA, ax=ax[1])
show_ranking(resr, errors=errr, err_alpha=ERR_ALPHA, ax=(ax[2], ax[3]))
format_plots(ax)
format_labels(ax)
handles, labels = ax[3].get_legend_handles_labels()
#don't put errors shaded area into legend
# Filter handle/label pairs together: testing the label strings against
# PolyCollection never matches, which left the two lists misaligned
# whenever a handle was dropped.
legend_items = [(hdl, lbl) for hdl, lbl in zip(handles, labels)
                if not isinstance(hdl, mpl.collections.PolyCollection)]
handles = [hdl for hdl, lbl in legend_items]
labels = [lbl for hdl, lbl in legend_items]
lgnd = fig.legend(handles, labels, loc='upper left', bbox_to_anchor=(1,0.8))
txt = ax[0].text(-0.07, 0.087, 'A', fontsize=20)
plt.tight_layout()
# plt.savefig("ml1m.pdf", bbox_extra_artists=(lgnd, txt), bbox_inches='tight')
In [21]:
# Test-sample setting whose saved results are loaded and plotted by the next two cells.
test_sample = 0
In [22]:
# Load the saved 'relevance' and 'ranking' results for the selected test sample;
# each call returns a (results, errors) pair.
res, err = get_results('relevance', test_sample)
resr, errr = get_results('ranking', test_sample)
In [23]:
# One row of four panels: hit rates, precision-recall and two ranking plots.
fig, ax = plt.subplots(1, 4, figsize=(16, 3))
show_hit_rates(res, errors=err, err_alpha=ERR_ALPHA, ax=ax[0], ROC_middle=True);
show_precision_recall(res, errors=err, err_alpha=ERR_ALPHA, ax=ax[1])
show_ranking(resr, errors=errr, err_alpha=ERR_ALPHA, ax=(ax[2], ax[3]))
format_plots(ax)
format_labels(ax)
handles, labels = ax[3].get_legend_handles_labels()
#don't put errors shaded area into legend
# Filter handle/label pairs together: testing the label strings against
# PolyCollection never matches, which left the two lists misaligned
# whenever a handle was dropped.
legend_items = [(hdl, lbl) for hdl, lbl in zip(handles, labels)
                if not isinstance(hdl, mpl.collections.PolyCollection)]
handles = [hdl for hdl, lbl in legend_items]
labels = [lbl for hdl, lbl in legend_items]
lgnd = fig.legend(handles, labels, loc='upper left', bbox_to_anchor=(1,0.8))
txt = ax[0].text(-0.17, 0.27, 'C', fontsize=20)
plt.tight_layout()
# plt.savefig("ml1m_all.pdf", bbox_extra_artists=(lgnd, txt), bbox_inches='tight')
In [ ]: