Fit WMF (weighted matrix factorization) to the binarized ML20M
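
WMF (Hu et al., 2008) learns user factors $\theta_u$ and item factors $\beta_i$ from binarized feedback $y_{ui} \in \{0, 1\}$ by minimizing a confidence-weighted squared loss. For reference, with the linear confidence used below and the regularization weights lam_theta and lam_beta:

$$\mathcal{L} = \sum_{u, i} c_{ui} \left( y_{ui} - \theta_u^\top \beta_i \right)^2 + \lambda_\theta \sum_u \| \theta_u \|_2^2 + \lambda_\beta \sum_i \| \beta_i \|_2^2, \qquad c_{ui} = 1 + \alpha \, y_{ui}.$$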


In [1]:
import itertools
import os
import sys
# keep OpenBLAS single-threaded; parallelism is handled by joblib below
os.environ['OPENBLAS_NUM_THREADS'] = '1'

import numpy as np
import pandas as pd
from scipy import sparse

In [2]:
# helper modules that ship alongside this notebook
import content_wmf
import batched_inv_joblib
import rec_eval

Load pre-processed data

Change DATA_DIR below to wherever you saved the data produced by the pre-processing notebook.


In [3]:
DATA_DIR = '/hdd2/dawen/data/ml-20m/pro/'

In [4]:
# user and item IDs, one per line, in the order used as matrix indices
unique_uid = list()
with open(os.path.join(DATA_DIR, 'unique_uid.txt'), 'r') as f:
    for line in f:
        unique_uid.append(line.strip())
    
unique_sid = list()
with open(os.path.join(DATA_DIR, 'unique_sid.txt'), 'r') as f:
    for line in f:
        unique_sid.append(line.strip())

In [5]:
n_items = len(unique_sid)
n_users = len(unique_uid)

print(n_users, n_items)


111148 11711

In [6]:
def load_data(csv_file, shape=(n_users, n_items)):
    tp = pd.read_csv(csv_file)
    timestamps, rows, cols = np.array(tp['timestamp']), np.array(tp['uid']), np.array(tp['sid'])
    # raw event sequence: one (uid, sid, count=1, timestamp) row per interaction
    seq = np.concatenate((rows[:, None], cols[:, None], np.ones((rows.size, 1), dtype='int'), timestamps[:, None]), axis=1)
    # binarized user-by-item matrix: 1 wherever the user interacted with the item
    data = sparse.csr_matrix((np.ones_like(rows), (rows, cols)), dtype=np.int16, shape=shape)
    return data, seq

In [7]:
train_data, train_raw = load_data(os.path.join(DATA_DIR, 'train.csv'))

In [8]:
vad_data, vad_raw = load_data(os.path.join(DATA_DIR, 'validation.csv'))

Train the model


In [9]:
num_factors = 100  # latent dimensionality K
num_iters = 50     # alternating least squares iterations
batch_size = 1000  # rows per batch when recomputing factors

n_jobs = 4
lam_theta = lam_beta = 1e-5  # L2 regularization for user and item factors

In [ ]:
best_ndcg = -np.inf
U_best = None
V_best = None
best_alpha = 0

# grid-search the confidence weight alpha, keeping the factors with the best validation NDCG
for alpha in [2, 5, 10, 30, 50]:
    S = content_wmf.linear_surplus_confidence_matrix(train_data, alpha=alpha)

    U, V, vad_ndcg = content_wmf.factorize(S, num_factors, vad_data=vad_data, num_iters=num_iters, 
                                           init_std=0.01, lambda_U_reg=lam_theta, lambda_V_reg=lam_beta, 
                                           dtype='float32', random_state=98765, verbose=False, 
                                           recompute_factors=batched_inv_joblib.recompute_factors_batched, 
                                           batch_size=batch_size, n_jobs=n_jobs)
    if vad_ndcg > best_ndcg:
        best_ndcg = vad_ndcg
        U_best = U.copy()
        V_best = V.copy()
        best_alpha = alpha
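
For intuition, the "surplus" confidence matrix passed to factorize stores only c_ui - 1 = alpha * y_ui on the nonzero entries, so it stays exactly as sparse as the click data (zeros implicitly keep the baseline confidence of 1). A minimal sketch of that construction, written from scratch under this assumption rather than taken from content_wmf:

def linear_surplus_confidence_matrix_sketch(B, alpha):
    # scale the nonzero (clicked) entries by alpha; unclicked entries
    # implicitly carry the baseline confidence of 1
    S = B.copy().astype(np.float32)
    S.data = alpha * S.data
    return S

Each ALS half-step then solves a ridge regression in closed form per user, $\theta_u = (\beta^\top C_u \beta + \lambda_\theta I)^{-1} \beta^\top C_u y_u$ (and symmetrically per item), which recompute_factors_batched evaluates in batches of batch_size rows across n_jobs parallel jobs.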

In [11]:
print(best_alpha, best_ndcg)


10 0.35510611042

In [12]:
test_data, _ = load_data(os.path.join(DATA_DIR, 'test.csv'))
test_data.data = np.ones_like(test_data.data)  # ensure the held-out matrix is strictly binary

In [13]:
# alpha = 10 gives the best validation performance
print('Test Recall@20: %.4f' % rec_eval.recall_at_k(train_data, test_data, U_best, V_best, k=20, vad_data=vad_data))
print('Test Recall@50: %.4f' % rec_eval.recall_at_k(train_data, test_data, U_best, V_best, k=50, vad_data=vad_data))
print('Test NDCG@100: %.4f' % rec_eval.normalized_dcg_at_k(train_data, test_data, U_best, V_best, k=100, vad_data=vad_data))
print('Test MAP@100: %.4f' % rec_eval.map_at_k(train_data, test_data, U_best, V_best, k=100, vad_data=vad_data))


Test Recall@20: 0.1333
Test Recall@50: 0.1647
Test NDCG@100: 0.1602
Test MAP@100: 0.0473
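
These metrics rank each user's items by the predicted score $\theta_u^\top \beta_i$, excluding items already seen in training (and, via the vad_data argument, in validation), and compare the top of the ranking against the held-out test items. A minimal sketch of Recall@k under those assumptions, written from scratch rather than taken from rec_eval:

def recall_at_k_sketch(train_data, heldout_data, U, V, k=20):
    # dense score matrix for clarity; the real evaluation batches over users
    scores = U.dot(V.T)
    # never recommend items the user already interacted with in training
    scores[train_data.nonzero()] = -np.inf
    # unordered top-k per user (order within the top k does not affect recall)
    top_k = np.argpartition(-scores, k, axis=1)[:, :k]
    hits = np.array([heldout_data[u, top_k[u]].sum() for u in range(U.shape[0])])
    # each user's recall is capped by min(k, number of held-out items)
    denom = np.minimum(k, np.asarray(heldout_data.sum(axis=1)).ravel())
    mask = denom > 0
    return np.mean(hits[mask] / denom[mask])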

In [14]:
np.savez('WMF_K100_ML20M.npz', U=U_best, V=V_best)
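
The saved factors can be reloaded later with np.load:

npz = np.load('WMF_K100_ML20M.npz')
U_best, V_best = npz['U'], npz['V']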
