In [1]:
import os
os.environ['OMP_NUM_THREADS'] = '1' # to not conflict with joblib
import numpy as np
import scipy.sparse
import pandas as pd
In [2]:
import content_wmf
import batched_inv_joblib
In [3]:
# Load the user and song identifier lists (one identifier per line); leading
# and trailing whitespace, including the newline, is stripped from each entry.
with open('unique_uid.txt', 'r') as uid_file:
    unique_uid = [entry.strip() for entry in uid_file]

with open('unique_sid.txt', 'r') as sid_file:
    unique_sid = [entry.strip() for entry in sid_file]
In [4]:
# Number of distinct users and songs read from the identifier files.
n_users = len(unique_uid)
n_songs = len(unique_sid)
# A single pre-formatted string prints the same text under Python 2 and 3.
print('%d %d' % (n_users, n_songs))
In [5]:
# The last 5% of the songs are in the out-matrix set, so n_songs is reduced to
# 95% of the full song count.  The value is derived from unique_sid rather
# than from n_songs itself so that re-running this cell is idempotent and does
# not shrink the count a second time.
n_songs = int(0.95 * len(unique_sid))
print(n_songs)
In [6]:
def load_data(csv_file, shape=(n_users, n_songs)):
    """Read a CSV of (uid, sid, count) triplets into a sparse matrix.

    Parameters
    ----------
    csv_file : str
        Path of a CSV file that has 'uid', 'sid' and 'count' columns.
    shape : tuple
        Shape of the returned matrix.  The default is bound to the values of
        n_users and n_songs at the moment this function is defined.

    Returns
    -------
    matrix : scipy.sparse.csr_matrix
        int16 matrix built from the 'count' values, indexed by (uid, sid).
    rows, cols : numpy.ndarray
        int32 arrays holding the 'uid' and 'sid' columns as read from the CSV.
    """
    triplets = pd.read_csv(csv_file)
    rows = np.array(triplets['uid'], dtype=np.int32)
    cols = np.array(triplets['sid'], dtype=np.int32)
    matrix = scipy.sparse.csr_matrix((triplets['count'], (rows, cols)),
                                     dtype=np.int16, shape=shape)
    return matrix, rows, cols
In [7]:
train_data, rows, cols = load_data('in.train.num.csv')
# Binarize: every stored entry becomes 1, keeping the array's shape and dtype.
train_data.data = np.ones(train_data.data.shape, dtype=train_data.data.dtype)
In [8]:
# Matrix dimensions, then the number of stored entries.
# The parenthesised single-argument form prints the same on Python 2 and 3.
print(train_data.shape)
print(train_data.data.shape)
In [9]:
vad_data, rows_vad, cols_vad = load_data('in.vad.num.csv')
# Binarize: every stored entry becomes 1, keeping the array's shape and dtype.
vad_data.data = np.ones(vad_data.data.shape, dtype=vad_data.data.dtype)
# Matrix dimensions, then the number of stored entries.
print(vad_data.shape)
print(vad_data.data.shape)
In [10]:
# Validation bundle passed to content_wmf.factorize through its `vad` keyword.
# NOTE(review): vad_data.data follows the sparse matrix's internal storage
# order while rows_vad / cols_vad follow the CSV row order.  This presumably
# only lines up because the values were all set to 1 beforehand and the CSV
# has one row per (uid, sid) pair -- confirm.
vad = {
    'X_new': vad_data.data,
    'rows_new': rows_vad,
    'cols_new': cols_vad,
}
In [11]:
test_data, rows_test, cols_test = load_data('in.test.num.csv')
# Binarize: every stored entry becomes 1, keeping the array's shape and dtype.
test_data.data = np.ones(test_data.data.shape, dtype=test_data.data.dtype)
# Matrix dimensions, then the number of stored entries.
print(test_data.shape)
print(test_data.data.shape)
In [12]:
# Settings forwarded to content_wmf.factorize further down.
# num_factors is also embedded in the saved parameter file name (the K field);
# presumably it is the latent dimensionality -- confirm against content_wmf.
num_factors = 100
# Passed as the `num_iters` keyword of content_wmf.factorize.
num_iters = 10
# Passed as the `batch_size` keyword of content_wmf.factorize.
batch_size = 10000
In [13]:
# Matrix derived from the binarized training data; it is the first argument
# handed to content_wmf.factorize.
S = content_wmf.log_surplus_confidence_matrix(
    train_data,
    alpha=2.0,
    epsilon=1e-6)
In [25]:
# Regularisation weights passed to content_wmf.factorize as lambda_U_reg and
# lambda_V_reg; both values are also embedded in the saved parameter file name.
lambda_U_reg = 1e-1
lambda_V_reg = 1e-1
In [ ]:
# Run the factorization on S.  Only the first two returned values (U and V)
# are kept; the third one is discarded.
U, V, _ = content_wmf.factorize(
    S,
    num_factors,
    vad=vad,
    num_iters=num_iters,
    init_std=0.01,
    lambda_U_reg=lambda_U_reg,
    lambda_V_reg=lambda_V_reg,
    dtype='float32',
    random_state=98765,
    verbose=True,
    recompute_factors=batched_inv_joblib.recompute_factors_batched,
    batch_size=batch_size,
    n_jobs=10)
In [ ]:
# Store U and V in an .npz archive whose name records the number of factors
# and the two regularisation weights.
params_path = 'params_wmf_K%d_U%1.E_V%1.E.unpop.npz' % (
    num_factors, lambda_U_reg, lambda_V_reg)
np.savez(params_path, U=U, V=V)