Fit content-aware matrix factorization to the binarized Taste Profile dataset


In [1]:
import os
# Cap OpenMP at one thread. This is set before numpy/scipy are imported below
# so the value is already in the environment when those libraries load.
# NOTE(review): presumably this avoids thread oversubscription with the
# n_jobs worker processes used in the factorization cell — confirm.
os.environ['OMP_NUM_THREADS'] = '1'

import numpy as np
import scipy.sparse
import pandas as pd

In [2]:
import content_wmf
import batched_inv_joblib

In [3]:
# Read the user and song identifiers: one identifier per line, with
# surrounding whitespace (including the newline) stripped.
with open('unique_uid.txt', 'r') as uid_file:
    unique_uid = [entry.strip() for entry in uid_file]

with open('unique_sid.txt', 'r') as sid_file:
    unique_sid = [entry.strip() for entry in sid_file]

In [4]:
# Total number of users and songs, taken from the identifier lists.
n_users, n_songs = len(unique_uid), len(unique_sid)

print('%d %d' % (n_users, n_songs))


613682 97414

In [5]:
# the last 5% of the songs are in the out-matrix set
# Compute the in-matrix song count from the full song list instead of from
# n_songs itself: the cell is then idempotent, so re-running it cannot shrink
# the value a second time and silently change the matrix shape.
n_songs = int(0.95 * len(unique_sid))
print(n_songs)


92543

Load the data and train the model


In [6]:
def load_data(csv_file, shape=(n_users, n_songs)):
    """Read a CSV of (uid, sid, count) records into a sparse matrix.

    Parameters
    ----------
    csv_file : str
        Path of a CSV file with at least the columns 'uid', 'sid' and 'count'.
    shape : tuple of int
        Shape of the returned matrix. The default is evaluated once, when this
        function is defined, from the n_users / n_songs values at that time.

    Returns
    -------
    (matrix, rows, cols)
        matrix is a scipy.sparse.csr_matrix of dtype int16 built from the
        'count' column at positions (uid, sid); rows and cols are the int32
        'uid' and 'sid' arrays in the order they appear in the file.
    """
    triplets = pd.read_csv(csv_file)
    rows = np.array(triplets['uid'], dtype=np.int32)
    cols = np.array(triplets['sid'], dtype=np.int32)
    count = triplets['count']
    matrix = scipy.sparse.csr_matrix((count, (rows, cols)), dtype=np.int16, shape=shape)
    return matrix, rows, cols

In [7]:
train_data, rows, cols = load_data('in.train.num.csv')
# Binarize: replace every stored count with 1, keeping shape and dtype.
train_data.data = np.ones(train_data.data.shape, dtype=train_data.data.dtype)

In [8]:
# Matrix dimensions, then the number of stored entries.
print(train_data.shape)
print(train_data.data.shape)


(613682, 92543)
(26139017,)

In [9]:
vad_data, rows_vad, cols_vad = load_data('in.vad.num.csv')
# Binarize: replace every stored count with 1, keeping shape and dtype.
vad_data.data = np.ones(vad_data.data.shape, dtype=vad_data.data.dtype)
# Matrix dimensions, then the number of stored entries.
print(vad_data.shape)
print(vad_data.data.shape)


(613682, 92543)
(2904335,)

In [10]:
# Validation triplets handed to content_wmf.factorize through its `vad` argument.
# Values and indices are all taken from one COO view of the matrix, so
# X_new[i] is guaranteed to be the entry at (rows_new[i], cols_new[i]).
# Pairing the CSR `.data` array with the CSV-order index arrays only lines up
# when every value is identical and no (uid, sid) pair is repeated in the file.
vad_coo = vad_data.tocoo()
vad = dict(X_new=vad_coo.data,
           rows_new=vad_coo.row,
           cols_new=vad_coo.col)

In [11]:
test_data, rows_test, cols_test = load_data('in.test.num.csv')
# Binarize: replace every stored count with 1, keeping shape and dtype.
test_data.data = np.ones(test_data.data.shape, dtype=test_data.data.dtype)
# Matrix dimensions, then the number of stored entries.
print(test_data.shape)
print(test_data.data.shape)


(613682, 92543)
(7260837,)

In [12]:
# the output of the neural network for both in and out-of-matrix songs
# np.load on an .npz file returns a dict-like archive; the individual arrays
# are pulled out by key in the next cell.
H_in_out = np.load('H_in_out.npz')

In [13]:
# Pull the two feature arrays out of the loaded archive by key.
H_in, H_out = H_in_out['H_in'], H_in_out['H_out']

In [14]:
# Build the confidence matrix S from the binarized training data; S is the
# matrix passed to content_wmf.factorize below.
# NOTE(review): the exact meaning of alpha and epsilon is defined inside
# content_wmf.log_surplus_confidence_matrix — confirm there before tuning.
S = content_wmf.log_surplus_confidence_matrix(train_data, alpha=2.0, epsilon=1e-6)

In [15]:
# Settings forwarded to content_wmf.factorize below.
# Passed as the second positional argument of factorize.
# NOTE(review): presumably the latent dimensionality of U and V — confirm.
num_factors = 100
# Passed as num_iters.
# NOTE(review): presumably the number of ALS iterations — confirm in content_wmf.
num_iters = 10
# Passed as batch_size.
# NOTE(review): presumably consumed by the batched recompute_factors routine — confirm.
batch_size = 10000

In [16]:
# Regularization weights forwarded to content_wmf.factorize as lambda_U_reg,
# lambda_V_reg and lambda_W_reg; they are also embedded in the name of the
# saved parameter file.
# NOTE(review): presumably these penalize U, V and W respectively — confirm in content_wmf.
lambda_U_reg = 1e-4 
lambda_V_reg = 1e-4
lambda_W_reg = 1e-4

In [17]:
# Echo the U and V regularization weights, separated by a space.
print('%s %s' % (lambda_U_reg, lambda_V_reg))


0.0001 0.0001

In [18]:
# Fit the model: factorize the confidence matrix S, with the in-matrix features
# H_in passed as X and the validation triplets passed as vad.
# The call returns three arrays, bound to U, V and W.
# random_state is fixed at 98765, and n_jobs=10 with the batched joblib routine
# is hard-coded here.
# NOTE(review): presumably n_jobs is the number of joblib worker processes and
# should match the cores available on the machine — confirm.
U, V, W = content_wmf.factorize(S, num_factors, X=H_in, vad=vad, num_iters=num_iters, init_std=0.01, 
                                lambda_U_reg=lambda_U_reg, lambda_V_reg=lambda_V_reg, lambda_W_reg=lambda_W_reg, 
                                dtype='float32', random_state=98765, verbose=True, 
                                recompute_factors=batched_inv_joblib.recompute_factors_batched, 
                                batch_size=batch_size, n_jobs=10)


precompute S^T and X^TX (if necessary)
  took 11.837 seconds
run ALS algorithm
Iteration 0:
	Updating item factors: time=20.47
	Updating user factors: time=40.14
	Pred likeli: 0.82855
	Updating projection matrix: time=1.11
Iteration 1:
	Updating item factors: time=22.05
	Updating user factors: time=44.46
	Pred likeli: 0.59025
	Updating projection matrix: time=1.10
Iteration 2:
	Updating item factors: time=21.81
	Updating user factors: time=43.88
	Pred likeli: 0.54313
	Updating projection matrix: time=1.14
Iteration 3:
	Updating item factors: time=22.18
	Updating user factors: time=41.79
	Pred likeli: 0.52659
	Updating projection matrix: time=1.19
Iteration 4:
	Updating item factors: time=22.24
	Updating user factors: time=44.61
	Pred likeli: 0.51889
	Updating projection matrix: time=1.15
Iteration 5:
	Updating item factors: time=21.17
	Updating user factors: time=45.89
	Pred likeli: 0.51465
	Updating projection matrix: time=1.19
Iteration 6:
	Updating item factors: time=21.84
	Updating user factors: time=44.20
	Pred likeli: 0.51206
	Updating projection matrix: time=1.15
Iteration 7:
	Updating item factors: time=21.74
	Updating user factors: time=44.33
	Pred likeli: 0.51035
	Updating projection matrix: time=1.19
Iteration 8:
	Updating item factors: time=21.27
	Updating user factors: time=43.69
	Pred likeli: 0.50916
	Updating projection matrix: time=1.20
Iteration 9:
	Updating item factors: time=21.79
	Updating user factors: time=44.81
	Pred likeli: 0.50828
	Updating projection matrix: time=1.14

In [19]:
# Shapes of the three returned arrays, separated by spaces.
print('%s %s %s' % (U.shape, V.shape, W.shape))


(613682, 100) (92543, 100) (1201, 100)

In [ ]:
# Save the learned arrays; the file name encodes num_factors and the three
# regularization weights.
params_file = 'params_deep_wmf_K%d_U%1.E_V%1.E_W%1.E.unpop.npz' % (
    num_factors, lambda_U_reg, lambda_V_reg, lambda_W_reg)
np.savez(params_file, U=U, V=V, W=W)