In [1]:
import itertools
import os
import sys
os.environ['OPENBLAS_NUM_THREADS'] = '1'
import numpy as np
import pandas as pd
from scipy import sparse
In [2]:
import content_wmf
import batched_inv_joblib
import rec_eval
Change DATA_DIR below to wherever you saved the pre-processed data produced by the pre-processing notebook.
In [3]:
DATA_DIR = '/hdd2/dawen/data/ml-20m/pro/'
In [4]:
unique_uid = list()
with open(os.path.join(DATA_DIR, 'unique_uid.txt'), 'r') as f:
    for line in f:
        unique_uid.append(line.strip())

unique_sid = list()
with open(os.path.join(DATA_DIR, 'unique_sid.txt'), 'r') as f:
    for line in f:
        unique_sid.append(line.strip())
In [5]:
n_items = len(unique_sid)
n_users = len(unique_uid)
print(n_users, n_items)
In [6]:
def load_data(csv_file, shape=(n_users, n_items)):
    tp = pd.read_csv(csv_file)
    timestamps, rows, cols = np.array(tp['timestamp']), np.array(tp['uid']), np.array(tp['sid'])
    # Raw (uid, sid, 1, timestamp) tuples; returned for reference, not used below.
    seq = np.concatenate((rows[:, None], cols[:, None], np.ones((rows.size, 1), dtype='int'), timestamps[:, None]), axis=1)
    # Binary user-item click matrix (duplicate (uid, sid) pairs, if any, are summed).
    data = sparse.csr_matrix((np.ones_like(rows), (rows, cols)), dtype=np.int16, shape=shape)
    return data, seq
In [7]:
train_data, train_raw = load_data(os.path.join(DATA_DIR, 'train.csv'))
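As a quick sanity check on the loaded matrix (the exact counts depend on your pre-processing split, so treat this as a sketch rather than expected output):

# Hedged sanity check: nnz and density depend on the pre-processing split.
print('%d users x %d items, %d clicks (density %.4f%%)' % (
    train_data.shape[0], train_data.shape[1], train_data.nnz,
    100. * train_data.nnz / (n_users * n_items)))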
In [8]:
vad_data, vad_raw = load_data(os.path.join(DATA_DIR, 'validation.csv'))
In [9]:
num_factors = 100
num_iters = 50
batch_size = 1000
n_jobs = 4
# L2 regularization strength for the user (theta) and item (beta) factors.
lam_theta = lam_beta = 1e-5
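Following the WMF formulation of Hu et al., each observation gets confidence c_ui = 1 + alpha * r_ui; the "surplus" part alpha * r_ui is all the solver needs, and it preserves the sparsity of the click matrix. A minimal sketch of what content_wmf.linear_surplus_confidence_matrix is assumed to compute (the function name below is hypothetical, for illustration only):

# Sketch (assumption about content_wmf's implementation): the surplus
# confidence S = alpha * R, so the full confidence C = 1 + S never has
# to be materialized and S keeps R's sparsity pattern.
def linear_surplus_confidence_sketch(R, alpha):
    S = R.copy().astype(np.float32)
    S.data *= alpha
    return S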
In [ ]:
best_ndcg = -np.inf
U_best = None
V_best = None
best_alpha = 0

# Sweep the confidence weight alpha and keep the factors that achieve
# the best validation NDCG.
for alpha in [2, 5, 10, 30, 50]:
    S = content_wmf.linear_surplus_confidence_matrix(train_data, alpha=alpha)
    U, V, vad_ndcg = content_wmf.factorize(S, num_factors, vad_data=vad_data, num_iters=num_iters,
                                           init_std=0.01, lambda_U_reg=lam_theta, lambda_V_reg=lam_beta,
                                           dtype='float32', random_state=98765, verbose=False,
                                           recompute_factors=batched_inv_joblib.recompute_factors_batched,
                                           batch_size=batch_size, n_jobs=n_jobs)
    if vad_ndcg > best_ndcg:
        best_ndcg = vad_ndcg
        U_best = U.copy()
        V_best = V.copy()
        best_alpha = alpha
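factorize returns the user factors U, the item factors V, and the validation NDCG used for model selection; ranking scores are plain inner products of factor rows. A minimal sketch (the helper name is hypothetical, assuming only the factor shapes):

# Sketch: the predicted preference of user u for every item is the
# inner product of u's factor row with each item's factor row.
def predict_user(U, V, u):
    return V.dot(U[u])  # shape: (n_items,)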
In [11]:
print(best_alpha, best_ndcg)
In [12]:
test_data, _ = load_data(os.path.join(DATA_DIR, 'test.csv'))
# Binarize the test matrix (duplicate (uid, sid) pairs are summed when
# the sparse matrix is built).
test_data.data = np.ones_like(test_data.data)
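rec_eval.recall_at_k ranks each user's unseen items by predicted score and measures how many held-out test items land in the top k; passing vad_data excludes validation items from the ranking as well. A hedged single-user sketch of this metric (my reading of the standard definition, not rec_eval's exact code; assumes the user has at least one test item):

# Sketch of recall@k for one user; rec_eval's exact normalization may differ.
def recall_at_k_one_user(u, train_data, vad_data, test_data, U, V, k=20):
    scores = V.dot(U[u])
    # Never rank items the user already interacted with in train/validation.
    seen = np.concatenate((train_data[u].nonzero()[1], vad_data[u].nonzero()[1]))
    scores[seen] = -np.inf
    top_k = np.argsort(-scores)[:k]
    held_out = test_data[u].nonzero()[1]
    return len(np.intersect1d(top_k, held_out)) / float(min(k, len(held_out)))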
In [13]:
# alpha = 10 gives the best validation performance
print('Test Recall@20: %.4f' % rec_eval.recall_at_k(train_data, test_data, U_best, V_best, k=20, vad_data=vad_data))
print('Test Recall@50: %.4f' % rec_eval.recall_at_k(train_data, test_data, U_best, V_best, k=50, vad_data=vad_data))
print('Test NDCG@100: %.4f' % rec_eval.normalized_dcg_at_k(train_data, test_data, U_best, V_best, k=100, vad_data=vad_data))
print('Test MAP@100: %.4f' % rec_eval.map_at_k(train_data, test_data, U_best, V_best, k=100, vad_data=vad_data))
In [14]:
np.savez('WMF_K100_ML20M.npz', U=U_best, V=V_best)
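The saved factors can be reloaded later without retraining:

params = np.load('WMF_K100_ML20M.npz')
U_best, V_best = params['U'], params['V']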