Fit content-aware matrix factorization to the binarized taste profile dataset



In [1]:

    
import os
os.environ['OMP_NUM_THREADS'] = '1'

import numpy as np
import scipy.sparse
import pandas as pd



In [2]:

    
import content_wmf
import batched_inv_joblib



In [3]:

    
# Read the user and song identifiers: one ID per line, with surrounding
# whitespace (including the trailing newline) stripped.
with open('unique_uid.txt', 'r') as f:
    unique_uid = [line.strip() for line in f]

with open('unique_sid.txt', 'r') as f:
    unique_sid = [line.strip() for line in f]



In [4]:

    
n_songs = len(unique_sid)
n_users = len(unique_uid)

print n_users, n_songs









    



613682 97414



In [5]:

    
# the last 5% of the songs are in the out-matrix set
n_songs = int(0.95 * n_songs)
print n_songs

Load the data and train the model



In [6]:

    
def load_data(csv_file, shape=(n_users, n_songs)):
    """Load a (uid, sid, count) CSV file into a sparse user-by-song matrix.

    Parameters
    ----------
    csv_file : path of a CSV file readable by ``pd.read_csv`` that has the
        columns ``uid``, ``sid`` and ``count``.
    shape : shape of the returned matrix. The default is the value of
        ``(n_users, n_songs)`` at the time this function is defined; later
        changes to those names do not affect it.

    Returns
    -------
    (matrix, rows, cols) where ``matrix`` is a ``scipy.sparse.csr_matrix`` of
    dtype int16 built from the counts, and ``rows`` / ``cols`` are the int32
    arrays taken from the ``uid`` / ``sid`` columns, in file order.

    NOTE(review): counts are stored as int16; values outside that range would
    not be representable. The callers in this notebook binarize the matrix
    right after loading, so this presumably does not matter -- confirm.
    """
    triplets = pd.read_csv(csv_file)
    user_idx = np.array(triplets['uid'], dtype=np.int32)
    song_idx = np.array(triplets['sid'], dtype=np.int32)
    matrix = scipy.sparse.csr_matrix((triplets['count'], (user_idx, song_idx)),
                                     dtype=np.int16, shape=shape)
    return matrix, user_idx, song_idx



In [7]:

    
train_data, rows, cols = load_data('in.train.num.csv')
# Binarize the data: every stored entry becomes 1, with the same length and
# dtype as the original array of stored values.
train_data.data = np.ones(train_data.data.shape, dtype=train_data.data.dtype)



In [8]:

    
# Sanity check: the matrix dimensions, then the shape of the array of stored
# (explicit) entries.
print train_data.shape
print train_data.data.shape









    



(613682, 92543)
(26139017,)



In [9]:

    
vad_data, rows_vad, cols_vad = load_data('in.vad.num.csv')
# binarize the data
vad_data.data = np.ones_like(vad_data.data)
print vad_data.shape
print vad_data.data.shape









    



(613682, 92543)
(2904335,)



In [10]:

    
# Validation triplets, passed to content_wmf.factorize through its `vad`
# argument.
# NOTE(review): X_new is taken from the CSR matrix while rows_new / cols_new
# come straight from the CSV. They line up here only because every stored
# value is 1 and, presumably, no (uid, sid) pair is repeated -- confirm.
vad = {
    'X_new': vad_data.data,
    'rows_new': rows_vad,
    'cols_new': cols_vad,
}



In [11]:

    
test_data, rows_test, cols_test = load_data('in.test.num.csv')
# binarize the data
test_data.data = np.ones_like(test_data.data)
print test_data.shape
print test_data.data.shape









    



(613682, 92543)
(7260837,)



In [12]:

    
# The output of the neural network for both in-matrix and out-of-matrix songs.
# Loading an .npz file gives an archive whose arrays are looked up by key.
H_in_out = np.load('H_in_out.npz')



In [13]:

    
# Pull the two arrays out of the archive by key. Only H_in is passed to the
# factorization in this notebook; H_out is not used in the cells shown here.
H_in, H_out = H_in_out['H_in'], H_in_out['H_out']



In [14]:

    
# Build the matrix S that is handed to content_wmf.factorize, from the
# binarized training data.
# NOTE(review): presumably a log-scaled confidence weighting controlled by
# alpha and epsilon -- confirm against content_wmf.
S = content_wmf.log_surplus_confidence_matrix(train_data, alpha=2.0, epsilon=1e-6)



In [15]:

    
# Settings forwarded to content_wmf.factorize: the number of latent factors
# (second positional argument), and the num_iters and batch_size keyword
# arguments.
num_factors = 100
num_iters = 10
batch_size = 10000



In [16]:

    
# Regularization weights forwarded to content_wmf.factorize under the keyword
# arguments of the same names; they also appear in the saved parameter file name.
lambda_U_reg = 1e-4 
lambda_V_reg = 1e-4
lambda_W_reg = 1e-4



In [17]:

    
# Echo the U and V regularization weights (lambda_W_reg is not printed here).
print lambda_U_reg, lambda_V_reg









    



0.0001 0.0001



In [18]:

    
# Fit the model on S with H_in as the content features X, using the validation
# triplets in `vad`. Three arrays come back: U, V and W.
# NOTE(review): n_jobs=10 is hard-coded alongside OMP_NUM_THREADS=1 set at the
# top of the notebook -- presumably to avoid oversubscribing cores; confirm for
# the target machine.
U, V, W = content_wmf.factorize(
    S,
    num_factors,
    X=H_in,
    vad=vad,
    num_iters=num_iters,
    init_std=0.01,
    lambda_U_reg=lambda_U_reg,
    lambda_V_reg=lambda_V_reg,
    lambda_W_reg=lambda_W_reg,
    dtype='float32',
    random_state=98765,
    verbose=True,
    recompute_factors=batched_inv_joblib.recompute_factors_batched,
    batch_size=batch_size,
    n_jobs=10)









    



precompute S^T and X^TX (if necessary)
  took 11.837 seconds
run ALS algorithm
Iteration 0:
	Updating item factors: time=20.47
	Updating user factors: time=40.14
	Pred likeli: 0.82855
	Updating projection matrix: time=1.11
Iteration 1:
	Updating item factors: time=22.05
	Updating user factors: time=44.46
	Pred likeli: 0.59025
	Updating projection matrix: time=1.10
Iteration 2:
	Updating item factors: time=21.81
	Updating user factors: time=43.88
	Pred likeli: 0.54313
	Updating projection matrix: time=1.14
Iteration 3:
	Updating item factors: time=22.18
	Updating user factors: time=41.79
	Pred likeli: 0.52659
	Updating projection matrix: time=1.19
Iteration 4:
	Updating item factors: time=22.24
	Updating user factors: time=44.61
	Pred likeli: 0.51889
	Updating projection matrix: time=1.15
Iteration 5:
	Updating item factors: time=21.17
	Updating user factors: time=45.89
	Pred likeli: 0.51465
	Updating projection matrix: time=1.19
Iteration 6:
	Updating item factors: time=21.84
	Updating user factors: time=44.20
	Pred likeli: 0.51206
	Updating projection matrix: time=1.15
Iteration 7:
	Updating item factors: time=21.74
	Updating user factors: time=44.33
	Pred likeli: 0.51035
	Updating projection matrix: time=1.19
Iteration 8:
	Updating item factors: time=21.27
	Updating user factors: time=43.69
	Pred likeli: 0.50916
	Updating projection matrix: time=1.20
Iteration 9:
	Updating item factors: time=21.79
	Updating user factors: time=44.81
	Pred likeli: 0.50828
	Updating projection matrix: time=1.14



In [19]:

    
# Shapes of the three arrays returned by content_wmf.factorize.
print U.shape, V.shape, W.shape









    



(613682, 100) (92543, 100) (1201, 100)



In [ ]:

    
# Persist the learned arrays under the keys U, V and W. The file name records
# the number of factors and the three regularization weights.
params_file = 'params_deep_wmf_K%d_U%1.E_V%1.E_W%1.E.unpop.npz' % (
    num_factors, lambda_U_reg, lambda_V_reg, lambda_W_reg)
np.savez(params_file, U=U, V=V, W=W)