In [1]:
import os
# Limit OpenMP to a single thread. This is set before numpy/scipy are
# imported in the lines that follow.
# NOTE(review): presumably this avoids oversubscribing cores when
# content_wmf.factorize is later run with n_jobs=10 -- confirm.
os.environ['OMP_NUM_THREADS'] = '1'
import numpy as np
import scipy.sparse
import pandas as pd
In [2]:
import content_wmf
import batched_inv_joblib
In [3]:
# Read the user-ID and song-ID vocabularies: one ID per line, with
# surrounding whitespace (including the newline) stripped.
with open('unique_uid.txt', 'r') as f:
    unique_uid = [line.strip() for line in f]

with open('unique_sid.txt', 'r') as f:
    unique_sid = [line.strip() for line in f]
In [4]:
# Matrix dimensions are the sizes of the two ID vocabularies.
n_users, n_songs = len(unique_uid), len(unique_sid)
print('%d %d' % (n_users, n_songs))
In [5]:
# The last 5% of the songs are in the out-of-matrix set, so only the first
# 95% count as in-matrix songs from here on.
# Derived from len(unique_sid) rather than from n_songs itself so that
# re-running this cell does not shrink n_songs a second time.
n_songs = int(0.95 * len(unique_sid))
print(n_songs)
In [6]:
def load_data(csv_file, shape=(n_users, n_songs)):
    """Read a (uid, sid, count) CSV into a sparse user-by-song matrix.

    Parameters
    ----------
    csv_file : path of a CSV file with 'uid', 'sid' and 'count' columns.
    shape : shape of the returned matrix. The default is bound to the values
        of n_users and n_songs at the time this function is defined.

    Returns
    -------
    (matrix, rows, cols) where matrix is a scipy.sparse.csr_matrix of dtype
    int16 built from the counts, and rows / cols are the int32 'uid' / 'sid'
    columns exactly as read from the file.

    NOTE(review): counts are stored as int16; values above 32767 would not
    fit. The callers in this notebook binarize the data right after loading.
    """
    triplets = pd.read_csv(csv_file)
    rows = np.array(triplets['uid'], dtype=np.int32)
    cols = np.array(triplets['sid'], dtype=np.int32)
    matrix = scipy.sparse.csr_matrix((triplets['count'], (rows, cols)),
                                     dtype=np.int16, shape=shape)
    return matrix, rows, cols
In [7]:
# Training split: sparse matrix plus the raw row/column index arrays.
train_data, rows, cols = load_data('in.train.num.csv')
# Binarize the data: every stored entry becomes 1 (same length and dtype).
train_data.data = np.ones(train_data.data.shape, dtype=train_data.data.dtype)
In [8]:
# Show the matrix shape, then the shape of its stored-values array.
for dims in (train_data.shape, train_data.data.shape):
    print(dims)
In [9]:
# Validation split, loaded the same way as the training split.
vad_data, rows_vad, cols_vad = load_data('in.vad.num.csv')
# Binarize the data: every stored entry becomes 1 (same length and dtype).
vad_data.data = np.ones(vad_data.data.shape, dtype=vad_data.data.dtype)
for dims in (vad_data.shape, vad_data.data.shape):
    print(dims)
In [10]:
# Validation triplets in the form passed to content_wmf.factorize(vad=...).
# NOTE(review): X_new is the matrix's stored-values array while rows_new /
# cols_new are in CSV order. They line up here because every value is 1,
# assuming the CSV has no duplicate (uid, sid) pairs -- confirm.
vad = {
    'X_new': vad_data.data,
    'rows_new': rows_vad,
    'cols_new': cols_vad,
}
In [11]:
# Test split, loaded the same way as the training split.
test_data, rows_test, cols_test = load_data('in.test.num.csv')
# Binarize the data: every stored entry becomes 1 (same length and dtype).
test_data.data = np.ones(test_data.data.shape, dtype=test_data.data.dtype)
for dims in (test_data.shape, test_data.data.shape):
    print(dims)
In [12]:
# The output of the neural network for both in-matrix and out-of-matrix
# songs, stored as one .npz archive. The individual arrays are pulled out
# by key in the next cell.
H_in_out = np.load('H_in_out.npz')
In [13]:
# Pull the in-matrix and out-of-matrix arrays out of the archive by key.
H_in, H_out = H_in_out['H_in'], H_in_out['H_out']
In [14]:
# Build the confidence matrix S from the binarized training data.
# The meaning of alpha and epsilon is defined in content_wmf (not shown here).
S = content_wmf.log_surplus_confidence_matrix(train_data, alpha=2.0, epsilon=1e-6)
In [15]:
# Factorization settings passed to content_wmf.factorize below.
num_factors, num_iters, batch_size = 100, 10, 10000
In [16]:
# The three regularization weights (U, V and W) share one value.
lambda_U_reg = lambda_V_reg = lambda_W_reg = 1e-4
In [17]:
# Echo the U and V regularization weights.
print('%s %s' % (lambda_U_reg, lambda_V_reg))
In [18]:
# Fit the model on the confidence matrix S with H_in as the content features
# and the validation triplets in `vad`. Returns the three factor arrays.
# NOTE(review): n_jobs=10 is hard-coded; presumably sized for the machine
# this was run on -- adjust to the available cores.
U, V, W = content_wmf.factorize(
    S,
    num_factors,
    X=H_in,
    vad=vad,
    num_iters=num_iters,
    init_std=0.01,
    # regularization weights
    lambda_U_reg=lambda_U_reg,
    lambda_V_reg=lambda_V_reg,
    lambda_W_reg=lambda_W_reg,
    # numerics / reproducibility / logging
    dtype='float32',
    random_state=98765,
    verbose=True,
    # batched, parallel factor updates
    recompute_factors=batched_inv_joblib.recompute_factors_batched,
    batch_size=batch_size,
    n_jobs=10,
)
In [19]:
# Show the shapes of the three learned arrays on one line.
print('%s %s %s' % (U.shape, V.shape, W.shape))
In [ ]:
# The file name encodes the number of factors and the three regularization
# weights; the arrays are stored under the keys U, V and W.
params_file = 'params_deep_wmf_K%d_U%1.E_V%1.E_W%1.E.unpop.npz' % (
    num_factors, lambda_U_reg, lambda_V_reg, lambda_W_reg)
np.savez(params_file, U=U, V=V, W=W)