Fit Hu et al.'s weighted matrix factorization (WMF) to the binarized Taste Profile dataset
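
For reference, the objective being fit is the weighted matrix factorization of Hu et al. (2008): binary preferences $p_{ui}$ are reconstructed under per-observation confidence weights $c_{ui}$, written here with the separate user/item regularizers matching the lambda_U_reg / lambda_V_reg arguments used below:

$$\min_{U,V} \sum_{u,i} c_{ui}\,(p_{ui} - \mathbf{u}_u^\top \mathbf{v}_i)^2 + \lambda_U \sum_u \|\mathbf{u}_u\|^2 + \lambda_V \sum_i \|\mathbf{v}_i\|^2$$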


In [1]:
import os
os.environ['OMP_NUM_THREADS'] = '1'   # keep OpenMP single-threaded so it doesn't clash with joblib's worker processes

import numpy as np
import scipy.sparse
import pandas as pd

In [2]:
import content_wmf
import batched_inv_joblib

In [3]:
# load the user and song ID lists written out during preprocessing
unique_uid = list()
with open('unique_uid.txt', 'r') as f:
    for line in f:
        unique_uid.append(line.strip())

unique_sid = list()
with open('unique_sid.txt', 'r') as f:
    for line in f:
        unique_sid.append(line.strip())
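
The numeric uid/sid columns in the *.num.csv files loaded below presumably index into these two lists. A sketch of that mapping (an assumption about the preprocessing, which is not shown in this notebook):

In [ ]:
# hypothetical: recover the raw-ID <-> numeric-index correspondence
uid2idx = dict((uid, i) for i, uid in enumerate(unique_uid))
sid2idx = dict((sid, i) for i, sid in enumerate(unique_sid))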

In [4]:
n_songs = len(unique_sid)
n_users = len(unique_uid)

print n_users, n_songs


613682 97414

In [5]:
# the last 5% of the songs are in the out-matrix set
n_songs = int(0.95 * n_songs)
print n_songs


92543
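
Only these "in-matrix" songs enter the factorization; the held-out 5% never appear in the training matrix and are presumably reserved for out-of-matrix (cold-start) evaluation elsewhere.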

Load the data and train the model


In [6]:
def load_data(csv_file, shape=(n_users, n_songs)):
    # read (uid, sid, count) triplets and build a sparse user-by-song count matrix
    tp = pd.read_csv(csv_file)
    rows, cols = np.array(tp['uid'], dtype=np.int32), np.array(tp['sid'], dtype=np.int32)
    count = tp['count']
    return scipy.sparse.csr_matrix((count, (rows, cols)), dtype=np.int16, shape=shape), rows, cols

In [7]:
train_data, rows, cols = load_data('in.train.num.csv')
# binarize the data
train_data.data = np.ones_like(train_data.data)
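
The counts are collapsed to 0/1 because the model fits a binary preference p_ui; note that since the confidence weights below are computed from this binarized matrix, every observed play ends up with the same weight.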

In [8]:
print train_data.shape
print train_data.data.shape


(613682, 92543)
(26139017,)

In [9]:
vad_data, rows_vad, cols_vad = load_data('in.vad.num.csv')
# binarize the data
vad_data.data = np.ones_like(vad_data.data)
print vad_data.shape
print vad_data.data.shape


(613682, 92543)
(2904335,)

In [10]:
# package the validation triplets in the form passed to factorize() below
vad = dict(X_new=vad_data.data,
           rows_new=rows_vad,
           cols_new=cols_vad)

In [11]:
test_data, rows_test, cols_test = load_data('in.test.num.csv')
# binarize the data
test_data.data = np.ones_like(test_data.data)
print test_data.shape
print test_data.data.shape


(613682, 92543)
(7260837,)

In [12]:
num_factors = 100
num_iters = 10
batch_size = 10000

In [13]:
S = content_wmf.log_surplus_confidence_matrix(train_data, alpha=2.0, epsilon=1e-6)
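
The implementation lives in the content_wmf module, but judging by the name and Hu et al.'s log weighting scheme, it presumably computes the "surplus" confidence c_ui - 1 = alpha * log(1 + r_ui / epsilon) on the nonzero entries. A minimal sketch under that assumption:

In [ ]:
def log_surplus_confidence_sketch(X, alpha, epsilon):
    # surplus confidence c_ui - 1 = alpha * log(1 + r_ui / epsilon),
    # applied only to the stored (nonzero) entries of the sparse matrix
    S = X.copy().astype(np.float32)
    S.data = alpha * np.log(1 + S.data / epsilon)
    return S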

In [25]:
lambda_U_reg = 1e-1
lambda_V_reg = 1e-1

In [ ]:
U, V, _ = content_wmf.factorize(S, num_factors, vad=vad, num_iters=num_iters, 
                                init_std=0.01, lambda_U_reg=lambda_U_reg, lambda_V_reg=lambda_V_reg, 
                                dtype='float32', random_state=98765, verbose=True, 
                                recompute_factors=batched_inv_joblib.recompute_factors_batched, 
                                batch_size=batch_size, n_jobs=10)


precompute S^T and X^TX (if necessary)
  took 2.182 seconds
run ALS algorithm
Iteration 0:
	Updating item factors: time=18.61
	Updating user factors: time=47.06
	Pred likeli: 0.82856
Iteration 1:
	Updating item factors: time=23.62
	Updating user factors: time=58.10
	Pred likeli: 0.59027
Iteration 2:
	Updating item factors: time=24.31
	Updating user factors: time=60.97
	Pred likeli: 0.54315
Iteration 3:
	Updating item factors: time=24.77
	Updating user factors: time=61.77
	Pred likeli: 0.52661
Iteration 4:
	Updating item factors: time=22.60
	Updating user factors: time=61.54
	Pred likeli: 0.51891
Iteration 5:
	Updating item factors: time=20.99
	Updating user factors: time=62.41
	Pred likeli: 0.51467
Iteration 6:
	Updating item factors: time=21.17
	Updating user factors: time=59.48
	Pred likeli: 0.51208
Iteration 7:
	Updating item factors: time=22.45
	Updating user factors: time=59.69
	Pred likeli: 0.51037
Iteration 8:
	Updating item factors: time=24.11
	Updating user factors...
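
Once training finishes, the predicted preference for user u on song i is the inner product of U[u] and V[i]. A quick, purely illustrative sanity check on a few held-out validation pairs (these are observed plays, so they should tend to score high):

In [ ]:
# predicted scores for the first few held-out (user, song) pairs
preds = np.sum(U[rows_vad[:5]] * V[cols_vad[:5]], axis=1)
print preds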

In [ ]:
np.savez('params_wmf_K%d_U%1.E_V%1.E.unpop.npz' % (num_factors, lambda_U_reg, lambda_V_reg), U=U, V=V)
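
The saved factors can be loaded back later with standard numpy .npz handling:

In [ ]:
params = np.load('params_wmf_K%d_U%1.E_V%1.E.unpop.npz' % (num_factors, lambda_U_reg, lambda_V_reg))
U, V = params['U'], params['V']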