In [1]:
from itertools import count
from scipy import sparse
from scipy.sparse.linalg import svds
import numpy as np
import timeit
from sklearn.linear_model import Ridge
In [2]:
n = 300000  # number of (user, song, play_count) triplets to read from the dataset
In [3]:
# Load the first n taste-profile triplets as a structured array with fields
# (user_id: 40-byte string, song_id: 18-byte string, play_count: int64).
# NOTE(review): hardcoded absolute local path — consider a configurable DATA_DIR.
raw = np.genfromtxt('/media/plopd/home/src/mmds/02/train_triplets.txt', max_rows=n, dtype="S40,S18,i8")
In [4]:
'''
Construct the COOrdinate (COO) representation from raw:
row/col collect the user/song index of every triplet, data the play count.
'''
row = []
col = []
data = []
user_to_row = {}   # user id  -> matrix row index
song_to_col = {}   # song id  -> matrix column index
user_count = count()
song_count = count()
for uid, sid, play_count in raw:
    # Assign the next free row index the first time a user is seen.
    if uid not in user_to_row:
        user_to_row[uid] = next(user_count)
    row.append(user_to_row[uid])
    # Assign the next free column index the first time a song is seen.
    if sid not in song_to_col:
        song_to_col[sid] = next(song_count)
    col.append(song_to_col[sid])
    data.append(float(play_count))
arr = np.array(data)  # transform list to ndarray for convenience
In [5]:
'''
Preprocess the data by binning the play counts into b logarithmic bins
(counts in [2**(j-1), 2**j - 1] map to bin label j) and store the result
in a sparse COO matrix.
'''
b = 10
for j in range(1, b + 1):
    lo = 2 ** (j - 1)
    hi = 2 ** j - 1
    arr[(arr >= lo) & (arr <= hi)] = j
arr[arr >= 2 ** b] = b  # clamp everything beyond the last bin edge into bin b
M = sparse.coo_matrix((arr, (row, col)))
In [6]:
'''
Helper methods for removing users/songs
'''
def removeCol(M, min_count=5):
    """Drop every column (song) of sparse matrix M with too few non-zero entries.

    Parameters:
        M: any scipy sparse matrix.
        min_count: keep only columns with strictly more than this many
            non-zero values (default 5, matching the original behavior).
    Returns:
        A CSC matrix containing only the surviving columns.
    """
    M = M.tocsc()  # transform to CSC for fast column slicing
    # Vectorized per-column non-zero counts instead of an O(n_cols)
    # getcol()/count_nonzero() loop. (M != 0) ignores explicitly stored
    # zeros, matching count_nonzero semantics.
    nnz_per_col = np.asarray((M != 0).sum(axis=0)).ravel()
    return M[:, np.flatnonzero(nnz_per_col > min_count)]
def removeRow(M, min_count=5):
    """Drop every row (user) of sparse matrix M with too few non-zero entries.

    Parameters:
        M: any scipy sparse matrix.
        min_count: keep only rows with strictly more than this many
            non-zero values (default 5, matching the original behavior).
    Returns:
        A CSR matrix containing only the surviving rows.
    """
    M = M.tocsr()  # transform to CSR for fast row slicing
    # Vectorized per-row non-zero counts instead of an O(n_rows)
    # getrow()/count_nonzero() loop. (M != 0) ignores explicitly stored
    # zeros, matching count_nonzero semantics.
    nnz_per_row = np.asarray((M != 0).sum(axis=1)).ravel()
    return M[np.flatnonzero(nnz_per_row > min_count), :]
In [7]:
start_time = timeit.default_timer()
'''
Preprocess the data to avoid the cold-start issue: repeatedly remove all
songs (columns) and users (rows) with 5 or fewer entries until the matrix
shape stops changing, i.e. until M is a fixpoint of both filters.

Fix: the original ran removeCol once up front and entered the loop only if
that call changed the shape — so whenever the first removeCol was a no-op,
removeRow never ran and under-observed users were never pruned. Loop over
both filters from the start instead.
'''
prev_shape = None
while prev_shape != M.shape:
    prev_shape = M.shape
    M = removeCol(M)
    M = removeRow(M)
elapsed = timeit.default_timer() - start_time
In [8]:
elapsed  # wall-clock seconds spent on the cold-start pruning above
Out[8]:
In [9]:
M.shape # After cold-start preprocess we expect (5693, 10966)
Out[9]:
In [10]:
'''
Properly evaluate the approach by setting aside 200 randomly selected
non-zero entries as a test set: remember their values in M_test and zero
them out in M so the factorization never sees them.
NOTE(review): no RNG seed is set, so the held-out split differs per run.
'''
M = M.tocsr()
nnz_indices = np.vstack(M.nonzero())  # 2 x nnz array of (row, col) coordinates
np.random.shuffle(nnz_indices.T)      # shuffle the coordinate pairs in place
test_set_idx = nnz_indices[:, :200]   # keep the first 200 shuffled pairs
test_rows, test_cols = test_set_idx[0, :], test_set_idx[1, :]
M_test = M[test_rows, test_cols]      # 1 x 200 matrix of the held-out values
M[test_rows, test_cols] = 0           # mask the held-out entries in M
In [11]:
'''
Finding latent factors via alternating optimization.
1. Initialization of P and Q with a truncated SVD (k = 30 latent factors),
   where missing entries are treated as zero.
'''
M = M.T # transpose M to have users to columns and songs to rows
U, S, Vt = svds(M,k=30)  # U: (n_songs, 30), S: (30,), Vt: (30, n_users)
Q = U*S  # song-factor matrix: singular values folded into the left factors
P = Vt   # user-factor matrix, one column per user
In [22]:
def argmin_P(P, Q):
    """Ridge-solve for the user factors P with the song factors Q held fixed.

    For each user (column of the global songs-x-users matrix M), fit a ridge
    regression of the user's observed ratings on the corresponding rows of Q
    and write the coefficients into that user's column of P.

    Returns the updated P (modified in place as well).
    """
    reg = Ridge(fit_intercept=False)
    for user_idx in range(M.shape[1]):  # for each user
        nnz_row_idx, _ = M[:, user_idx].nonzero()  # songs this user rated
        # Fix: flatten to a 1-D (n_obs,) target. The original passed the raw
        # (1, n_obs) row matrix, whose sample count mismatches X; a 1-D y
        # also keeps reg.coef_ as (n_factors,) so the column write fits.
        y = np.asarray(M[nnz_row_idx, user_idx].todense()).ravel()  # response
        X = Q[nnz_row_idx, :]  # explanatory variables: factors of rated songs
        reg.fit(X, y)
        P[:, user_idx] = reg.coef_  # update this user's column of P
    return P
In [23]:
def argmin_Q(P, Q):
    """Ridge-solve for the song factors Q with the user factors P held fixed.

    For each song (row of the global songs-x-users matrix M), fit a ridge
    regression of the song's observed ratings on the factor columns of the
    users who rated it, and write the coefficients into that song's row of Q.

    Returns the updated Q (modified in place as well).
    """
    reg = Ridge(fit_intercept=False)
    for item_idx in range(M.shape[0]):  # for each song
        _, nnz_col_idx = M[item_idx, :].nonzero()  # users who rated this song
        # 1-D (n_obs,) target keeps reg.coef_ as (n_factors,), so the row
        # assignment below is shape-safe (a (n_obs, 1) target would make
        # coef_ (1, n_factors), which cannot broadcast into the row slice).
        y = np.asarray(M[item_idx, nnz_col_idx].todense()).ravel()  # response
        X = P[:, nnz_col_idx].T  # explanatory variables: raters' factors
        reg.fit(X, y)
        Q[item_idx, :] = reg.coef_  # update this song's row of Q
    return Q
In [24]:
'''
2. Performing alternating optimization for a fixed number of sweeps:
   each sweep re-solves P given Q, then Q given the new P.
'''
start_time = timeit.default_timer()
N_SWEEPS = 10  # hoisted magic number; tune here
for _ in range(N_SWEEPS):  # `_`: the original name `iter` shadowed the builtin
    P = argmin_P(P, Q)
    Q = argmin_Q(P, Q)
elapsed = timeit.default_timer() - start_time
In [25]:
elapsed  # wall-clock seconds for the alternating-optimization sweeps
Out[25]:
In [32]:
from sklearn.metrics import mean_squared_error
from math import sqrt
rows = test_set_idx[0, :]  # user indices of the held-out entries
cols = test_set_idx[1, :]  # song indices of the held-out entries
# Predicted rating for each held-out (user, song) pair: dot product of the
# song's factor row in Q with the user's factor column in P.
test_y = [np.dot(Q[c, :], P[:, r]) for r, c in zip(rows, cols)]
In [33]:
test_y = np.array(test_y)  # list of scalar predictions -> 1-D ndarray
In [35]:
test_y = np.expand_dims(test_y, axis=1)  # (200,) -> (200, 1) to match M_test.T
In [36]:
# Test RMSE between the held-out true ratings (M_test.T) and the predictions.
err = sqrt(mean_squared_error(M_test.T, test_y))
In [37]:
err  # display the held-out RMSE