In [1]:
from itertools import count
from scipy import sparse
from scipy.sparse.linalg import svds
import numpy as np
import timeit
from sklearn.linear_model import Ridge

In [2]:
n = 300000

In [3]:
raw = np.genfromtxt('/media/plopd/home/src/mmds/02/train_triplets.txt', max_rows=n, dtype="S40,S18,i8")
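
A quick sanity peek at what genfromtxt produced (a sketch; the field names f0/f1/f2 are genfromtxt's defaults when a compound dtype is given as a string):

# Each record is one (user id, song id, play count) triple.
raw.dtype.names  # ('f0', 'f1', 'f2')
raw.shape        # (n,), one record per triplet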

In [4]:
'''
Construct a COO (coordinate) sparse matrix from the raw triplets:
each user gets a row index, each song a column index.
'''
row = []
col = []
data = []

user_to_row = {}
song_to_col = {}
user_count = count()
song_count = count()

for uid, sid, play_count in raw:
    if uid not in user_to_row:
        user_to_row[uid] = next(user_count)
    row.append(user_to_row[uid])

    if sid not in song_to_col:
        song_to_col[sid] = next(song_count)
    col.append(song_to_col[sid])

    data.append(float(play_count))

arr = np.array(data)  # transform list to ndarray for convenience
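
For reference, coo_matrix infers its shape from the largest row/column indices, and duplicate (row, col) entries are summed on conversion to CSR/CSC; a toy sketch of the construction used above:

toy = sparse.coo_matrix(([1.0, 2.0, 3.0], ([0, 0, 1], [0, 2, 1])))
toy.shape              # (2, 3), inferred from the max row/col index
toy.tocsr().toarray()  # any duplicate (row, col) pairs would be summed here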

In [5]:
'''
Preprocess the data by binning the play counts into b logarithmic
bins (a count in [2^i, 2^(i+1)) gets bin i+1, capped at b)
and store the binned data in a sparse COO matrix.
'''
b = 10
for i in range(b):
    # safe to do in place: labels assigned so far (<= i) never reach the next lower bound 2^i
    arr[(arr >= (1 << i)) & (arr <= (1 << (i + 1)) - 1)] = i + 1
arr[arr >= (1 << b)] = b

M = sparse.coo_matrix((arr, (row, col)))
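
The loop is equivalent to min(floor(log2(count)) + 1, b), since the triplets only list songs that were actually played, so every count is at least 1. A minimal vectorized sketch of the same binning:

# Same binning in one shot: bin(x) = min(floor(log2(x)) + 1, b) for x >= 1.
binned = np.minimum(np.floor(np.log2(np.array(data))) + 1, b)
assert np.array_equal(binned, arr)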

In [6]:
'''
Helper methods for removing songs (columns) and users (rows)
that have 5 or fewer nonzero entries.
'''
def removeCol(M):
    M = M.tocsc()  # convert to CSC for fast column slicing
    cols = []
    for i in range(M.shape[1]):
        if M.getcol(i).count_nonzero() > 5:
            cols.append(i)
    return M[:, cols]

def removeRow(M):
    M = M.tocsr()  # convert to CSR for fast row slicing
    rows = []
    for i in range(M.shape[0]):
        if M.getrow(i).count_nonzero() > 5:
            rows.append(i)
    return M[rows, :]
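
Scanning one column or row at a time costs a sparse slice per index; getnnz can compute all the counts at once. A sketch of an equivalent vectorized filter (with the assumption, true at this stage, that no explicit zeros are stored, so getnnz's count of stored entries equals the nonzero count):

def removeColFast(M):
    M = M.tocsc()
    return M[:, np.flatnonzero(M.getnnz(axis=0) > 5)]  # keep columns with > 5 entries

def removeRowFast(M):
    M = M.tocsr()
    return M[np.flatnonzero(M.getnnz(axis=1) > 5), :]  # keep rows with > 5 entries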

In [7]:
start_time = timeit.default_timer()
'''
Preprocess the data to avoid the cold-start issue: repeatedly
remove all users/songs that have 5 or fewer nonzero entries in M,
until M no longer changes. One pass is not enough, because removing
a song can push a user below the threshold and vice versa.
'''
prevShape = M.shape
M = removeCol(M)
currShape = M.shape
while prevShape != currShape:
    prevShape = currShape
    M = removeCol(M)
    M = removeRow(M)
    currShape = M.shape
elapsed = timeit.default_timer() - start_time
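
A quick sanity check of the resulting fixed point, as a minimal sketch: every surviving user and song should now have more than 5 nonzero entries.

M = M.tocsr()
assert (M.getnnz(axis=0) > 5).all() and (M.getnnz(axis=1) > 5).all()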

In [8]:
elapsed


Out[8]:
7.958170175552368

In [9]:
M.shape # after cold-start preprocessing we expect (5693, 10966)


Out[9]:
(5693, 10966)

In [10]:
'''
To properly evaluate our approach, set aside 200 randomly selected
non-zero data points from the matrix as a test set. That is, store
their values separately and set them to 0 in M.
'''
M = M.tocsr()
nnz_row_idx, nnz_col_idx = M.nonzero()
nnz_indices = np.vstack((nnz_row_idx, nnz_col_idx))
np.random.shuffle(nnz_indices.T)  # shuffle (row, col) pairs together via the transposed view
test_set_idx = nnz_indices[:, :200]
M_test = M[test_set_idx[0, :], test_set_idx[1, :]]  # held-out true values, shape (1, 200)
M[test_set_idx[0, :], test_set_idx[1, :]] = 0  # set test instances to zero in M
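
Assigning zeros to a CSR matrix keeps those entries stored as explicit zeros (SciPy may also warn about changing the sparsity structure here). spmatrix.nonzero() filters explicit zeros out, so the ridge fits below are unaffected either way, but dropping them keeps later nonzero counts honest:

M.eliminate_zeros()  # discard the explicitly stored zeros left by the assignment above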

In [11]:
'''
Finding latent factors via alternating optimization.
1. Initialize P and Q with a truncated SVD, treating missing entries as zero.
'''
M = M.T  # transpose M so that songs are rows and users are columns
U, S, Vt = svds(M, k=30)
Q = U * S  # song factors, shape (songs, 30)
P = Vt     # user factors, shape (30, users); the model is M ~ Q @ P
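
As a baseline before the alternating updates, a minimal sketch that measures how well the rank-30 initialization reconstructs the observed entries; the einsum takes one dot product per observed (song, user) pair without forming a dense matrix:

obs_rows, obs_cols = M.nonzero()  # indices of observed (song, user) pairs
preds = np.einsum('ij,ji->i', Q[obs_rows, :], P[:, obs_cols])
obs = np.asarray(M[obs_rows, obs_cols]).ravel()
train_rmse = np.sqrt(np.mean((preds - obs) ** 2))  # RMSE over observed entries only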

In [22]:
def argmin_P(P, Q):
    reg = Ridge(fit_intercept=False)
    for user_idx in range(M.shape[1]):  # for each user
        nnz_row_idx, _ = M[:, user_idx].nonzero()
        y = np.asarray(M[nnz_row_idx, user_idx].todense()).ravel()  # observed plays (binned) for this user
        X = Q[nnz_row_idx, :]  # factors of the songs this user played
        reg.fit(X, y)
        P[:, user_idx] = reg.coef_  # update column user_idx of P
    return P

In [23]:
def argmin_Q(P, Q):
    reg = Ridge(fit_intercept=False)
    for item_idx in range(M.shape[0]):  # for each song
        _, nnz_col_idx = M[item_idx, :].nonzero()
        y = np.asarray(M[item_idx, nnz_col_idx].todense()).ravel()  # observed plays (binned) for this song
        X = P[:, nnz_col_idx].T  # factors of the users who played this song
        reg.fit(X, y)
        Q[item_idx, :] = reg.coef_  # update row item_idx of Q
    return Q
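
Each inner fit is a plain ridge regression without intercept, so with sklearn's default alpha=1.0 the user update has the closed form p_u = (X^T X + alpha*I)^(-1) X^T y, where X stacks the factors of the songs that user played (and symmetrically for the song update). A minimal sketch checking Ridge against that closed form on random toy data:

X_toy = np.random.rand(8, 3)
y_toy = np.random.rand(8)
w_closed = np.linalg.solve(X_toy.T @ X_toy + 1.0 * np.eye(3), X_toy.T @ y_toy)
w_ridge = Ridge(alpha=1.0, fit_intercept=False).fit(X_toy, y_toy).coef_
assert np.allclose(w_closed, w_ridge)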

In [24]:
'''
2. Perform the alternating optimization for a fixed number of iterations.
'''
start_time = timeit.default_timer()
for _ in range(10):
    P = argmin_P(P, Q)
    Q = argmin_Q(P, Q)
elapsed = timeit.default_timer() - start_time
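
Ten sweeps is a blind budget; a hedged variant of the same loop that prints the training RMSE after each sweep makes it easy to see when the alternating updates have plateaued:

obs_rows, obs_cols = M.nonzero()
obs = np.asarray(M[obs_rows, obs_cols]).ravel()
for sweep in range(10):
    P = argmin_P(P, Q)
    Q = argmin_Q(P, Q)
    preds = np.einsum('ij,ji->i', Q[obs_rows, :], P[:, obs_cols])
    print(sweep, np.sqrt(np.mean((preds - obs) ** 2)))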

In [25]:
elapsed


Out[25]:
291.4343271255493

In [32]:
from sklearn.metrics import mean_squared_error
from math import sqrt

'''
Predict each held-out entry: for a test pair (user u, song s),
the model's estimate is the dot product of Q[s, :] and P[:, u].
'''
rows = test_set_idx[0, :]  # user indices (rows of the pre-transpose matrix)
cols = test_set_idx[1, :]  # song indices
pred_y = []
for u, s in zip(rows, cols):
    pred_y.append(np.dot(Q[s, :], P[:, u]))
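
The same 200 predictions can be computed in one vectorized line; a sketch equivalent to the loop above:

pred_vec = np.einsum('ij,ji->i', Q[cols, :], P[:, rows])  # one dot product per test pair
assert np.allclose(pred_vec, pred_y)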

In [33]:
pred_y = np.array(pred_y)

In [35]:
pred_y = np.expand_dims(pred_y, axis=1)  # shape (200, 1) to line up with M_test.T

In [36]:
err = sqrt(mean_squared_error(M_test.T, pred_y))  # RMSE on the 200 held-out entries

In [37]:
err


Out[37]:
1.06898675043
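
For scale: the targets are binned play counts in 1..10, so an RMSE of about 1.07 means the held-out bins are recovered to within roughly one bin on average.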