In [3]:
import numpy as np
m = 6040 # users
n = 3952 # movies
Rnan = np.full((m, n), np.nan) # Ratings matrix with nans instead of 0s
We read the data from the ratings file. Each line of `ratings.dat` has the form `UserID::MovieID::Rating::Timestamp`.
In [4]:
# Read the data into the rating matrix
with open('ml-1m/ratings.dat', 'r') as fp:
    for line in fp:
        l = line.split('::')
        Rnan[int(l[0])-1, int(l[1])-1] = int(l[2])
We continue defining functions as per the paper: $O_{u_i}$ is the item (movie) set of user $u_i$, and $U_{o_j}$ is the user set of item (movie) $o_j$.
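In terms of the rating matrix $R$, where NaN marks a missing rating:
$$O_{u_i} = \{o_j : r_{i,j}\ \text{is not NaN}\}, \qquad U_{o_j} = \{u_k : r_{k,j}\ \text{is not NaN}\}.$$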
In [15]:
def O_u(u_i, R = Rnan): # item set of user u_i
    return np.nonzero(~np.isnan(R[u_i, :]))
def U_o(o_j, R = Rnan): # user set of item o_j
    return np.nonzero(~np.isnan(R[:, o_j]))
In [5]:
print(O_u(0))
print(U_o(0))
In [4]:
def calc_r_bar_v(R = Rnan): # mean rating of each user
    return np.nanmean(R, axis = 1)
def calc_r_bar(R = Rnan): # global mean rating
    return np.nanmean(R)
In [6]:
r_bar_v = calc_r_bar_v()
r_bar = calc_r_bar()
print('Average rating:',r_bar)
print('Average rating for user 0:',r_bar_v[0])
We define the prediction function.
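Concretely, for a target user $u_t$, a target movie $o_j$ and a similarity vector $c_t$, the function below computes
$$\hat{r}_{t,j} = \bar{r}_{u_t} - \bar{r} + \frac{\sum_{u_k \in U_{o_j}} c_{t,k}\, r_{k,j}}{\sum_{u_k \in U_{o_j}} c_{t,k}},$$
where $\bar{r}_{u_t}$ is the mean rating of $u_t$ and $\bar{r}$ is the global mean rating.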
In [7]:
def calc_r_hat(u_t, o_j, c_t, R = Rnan, r_bar_v = r_bar_v, r_bar = r_bar):
    """
    u_t -> target user
    o_j -> target movie
    c_t -> similarity vector of u_t to all users
    """
    U_oj = U_o(o_j, R = R)[0] # users who rated o_j
    return r_bar_v[u_t] - r_bar + (np.nansum(c_t[U_oj] * R[U_oj, o_j]) / np.nansum(c_t[U_oj]))
In [6]:
print('Rating of user 0 on movie 0:', Rnan[0,0])
print('Estimated rating (using uniform user similarity):', calc_r_hat(0,0, np.ones(m)))
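Next, we compute the Type 1 transition probabilities: from a user, the walk moves to one of that user's rated movies uniformly at random. Writing $P^{(1)}$ for these probabilities,
$$P^{(1)}(u_i \to o_j) = \begin{cases} 1/|O_{u_i}| & o_j \in O_{u_i}, \\ 0 & \text{otherwise.} \end{cases}$$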
In [8]:
def calc_P_uo(R = Rnan): # Type 1 walk, user to movie
    return (1 - np.isnan(R)) / np.sum(1 - np.isnan(R), axis = 1).reshape(R.shape[0], 1)
P_uo = calc_P_uo()
print(P_uo)
We also define the rating similarity matrix for the user $u_i$. The computed similarities are non-NaN only if $u_i$ and $u_k$ both have a rating for that item, where $k \in \{0, \dots, m-1\}$.
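With $R_{\max} = 5$ (MAXSCORE), the entries of the similarity matrix are
$$\mathrm{sim}(u_i)_{k,j} = R_{\max} - \lvert r_{i,j} - r_{k,j} \rvert.$$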
In [9]:
MAXSCORE = 5
def sim(u_i, R = Rnan): # similarity matrix over (u_k, o_j), given u_i
    return MAXSCORE - np.absolute(R[u_i, :] - R)
sim(0)
Using the rating similarity matrix, we can quickly compute the total similarity score for each item $o_j$ over all users, given $u_i$. By not including NaNs, we are calculating the denominators of the Type 2 probabilities.
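That is, for each item $o_j$ we compute the column total
$$S_j(u_i) = \sum_{u_k \in U_{o_j}} \mathrm{sim}(u_i)_{k,j},$$
where the sum runs over the non-NaN entries of column $j$.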
In [10]:
def sum_sim(u_i, R = Rnan): # Sum of similarities for each o_j, given u_i
    return np.nansum(sim(u_i, R = R), axis = 0)
sum_sim(0)
We can now define the probability function from item (movie) $o_j$ to user $u_k$, given that the previous transition was from $u_i$ to $o_j$. Again, we compute a whole transition probability matrix at once to reduce the number of computations. Note that we actually return the transpose of the transition probability matrix, to ease further calculations.
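Writing $P^{(2)}$ for these Type 2 probabilities, each similarity is normalized by the column total $S_j(u_i)$ from above:
$$P^{(2)}(o_j \to u_k \mid u_i) = \frac{\mathrm{sim}(u_i)_{k,j}}{S_j(u_i)}.$$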
In [11]:
def P_ou(u_i, R = Rnan):
    """
    Transition probability matrix from movie to user, given
    a base user u_i. Note that axis 0 is still the user and
    axis 1 is the movie.
    """
    with np.errstate(divide='ignore', invalid='ignore'):
        s = sim(u_i, R = R)
        return s / np.nansum(s, axis = 0)
P_ou(0)
We then calculate the transition probability from a user $u_i$ to any other user $u_k$.
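This combines one Type 1 step with one Type 2 step:
$$p(u_i \to u_k) = \sum_{o_j} P^{(1)}(u_i \to o_j)\, P^{(2)}(o_j \to u_k \mid u_i),$$
which the element-wise product and the `nansum` over the movie axis below implement.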
In [12]:
def p(u_i, R = Rnan, P_uo = P_uo):
    """ Transition probability from user u_i to each user. """
    return np.nansum(P_uo[u_i] * P_ou(u_i, R = R), axis = 1)
p(0)
The calculated transition probabilities are stacked on top of each other to build the transition probability matrix $P$.
In [13]:
def construct_P(R = Rnan):
    P_uo = calc_P_uo(R = R)
    l = [p(u_i, R = R, P_uo = P_uo) for u_i in range(R.shape[0])] # note: pass R through, not the global Rnan
    return np.vstack(l)
In [77]:
P = construct_P()
print(P)
Since constructing this matrix takes time, we save it on the local machine, so that we don't have to calculate it again.
In [78]:
np.save('P.npy', P)
In [18]:
P = np.load('P.npy')
print(P)
In [14]:
def get_P(R = Rnan, ind = None):
    P = None
    file_name = 'P.npy' if ind is None else 'P' + str(ind) + '.npy'
    try:
        P = np.load(file_name)
    except FileNotFoundError:
        P = construct_P(R = R)
        np.save(file_name, P)
    return P
We create a random test set of 5 users for our initial walk.
In [15]:
size_ts = 5
ts = np.random.randint(m, size=(size_ts,))
ts
We swap the rows (and columns) of the test users to the front of the transition probability matrix.
In [15]:
def front_swap(M, inds, dim = 1):
    """
    Swap the entries listed in `inds` to the front of M, along each of
    the first `dim` axes in turn. Note that the swap is only well-defined
    (and self-inverse, which we rely on later) when the pairs (i, inds[i])
    are disjoint.
    """
    M_new = np.copy(M)
    front = np.arange(inds.shape[0])
    for axis in range(dim):
        M_new = np.swapaxes(M_new, 0, axis) # Bring the target axis to the front
        temp = np.copy(M_new[front])
        M_new[front] = M_new[inds]
        M_new[inds] = temp
        M_new = np.swapaxes(M_new, 0, axis) # Restore the axis order
    return M_new
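As a quick sanity check, a minimal illustrative example on a small matrix (not part of the pipeline):
In [ ]:
A = np.arange(16).reshape(4, 4)
# Swap rows/columns 2 and 3 with rows/columns 0 and 1:
print(front_swap(A, np.array([2, 3]), dim = 2))
# The element A[2, 2] (= 10) should now sit at position (0, 0).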
In [19]:
P_new = front_swap(P, ts, dim = 2)
P_new.shape
We can now extract $P^*$ and $\pi_{\tau\cdot}$ from the reordered matrix.
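After the swap, the matrix has the block structure
$$P_{\text{new}} = \begin{pmatrix} \cdot & \pi_{\tau\cdot} \\ \cdot & P^{*} \end{pmatrix},$$
where the first `size_ts` rows and columns correspond to the test users: $P^{*}$ holds the training-to-training transition probabilities and $\pi_{\tau\cdot}$ the test-to-training ones.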
In [27]:
P_star = P_new[size_ts:, size_ts:]
P_star.shape
In [24]:
pi = P_new[:size_ts, size_ts:]
pi.shape
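The weight matrix $W$ is estimated by Monte Carlo simulation over the re-normalized $P^*$ (its rows no longer sum to 1 because the test-user columns were dropped): from every training user we start $N$ runs; at each step a run jumps to a new user according to $P^*$ and terminates with probability $\alpha$; $W_{i,k}$ is the average number of visits to user $u_k$ over the runs started from $u_i$.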
In [16]:
def walk(N, n_users, P_star, alpha = 0.9):
    W = np.zeros((n_users, n_users), dtype = float) # The weight matrix for the training set
    norm_P_star = P_star / np.sum(P_star, axis = 1).reshape(n_users, 1) # Re-normalize the rows, since the test-user columns were dropped
    for r in range(N): # Do N runs for each training user
        users = np.arange(n_users) # The users whose runs are still active
        cur_users = np.copy(users) # Current position of each run, starting from the user itself
        while users.shape[0] > 0: # While there are active runs
            for u in users: # Take one step for each active run
                u_new = np.random.choice(n_users, 1, p = norm_P_star[cur_users[u], :])[0] # Jump to a new user
                cur_users[u] = u_new
                W[u, u_new] += 1 # Count the visit to u_new on the run started from u
            cont = np.random.rand(users.shape[0]) > alpha # Each run terminates with probability alpha
            users = users[cont]
    return W / N # Average number of visits per run
In [13]:
W = walk(m, m - size_ts, P_star)
In [14]:
np.save('W.npy',W)
In [17]:
def get_W(N, n_users, P_star, alpha = 0.9, ind = None):
    W = None
    file_name = 'W.npy' if ind is None else 'W' + str(ind) + '.npy'
    try:
        W = np.load(file_name)
    except FileNotFoundError:
        W = walk(N, n_users, P_star, alpha = alpha)
        np.save(file_name, W)
    return W
In [36]:
ts[0]
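For the first test user (index `ts[0]`), the similarity vector to all users is $c = \alpha\, \pi_{0\cdot} W$: one step from the test user into the training set ($\pi$) followed by the average visit counts ($W$). The entries for the test users themselves are set to zero, and `front_swap` then restores the original user ordering.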
In [40]:
c = 0.9*np.sum(pi[0] * W.T, axis = 1)
print(c)
print(c.shape)
c = np.hstack((np.zeros(size_ts), c))
print(c)
print(c.shape)
In [41]:
c_new = front_swap(c, ts, dim = 1)
print(c_new)
print(c_new.shape)
In [19]:
def get_C(W, pi, test_set, alpha, ind = None):
    C = None
    file_name = 'C.npy' if ind is None else 'C' + str(ind) + '.npy'
    try:
        C = np.load(file_name)
    except FileNotFoundError:
        size_ts = test_set.shape[0]
        C = np.vstack([front_swap(np.hstack((np.zeros(size_ts), alpha * np.sum(pi[k] * W.T, axis = 1))),
                                  test_set, dim = 1) for k in range(size_ts)])
        np.save(file_name, C)
    return C
In [1]:
C = get_C(W, pi, ts, 0.9)
C
In [20]:
def mean_absolute_error(C, test_set, held_out, R = Rnan):
    maes = np.zeros(test_set.shape[0])
    r_bar_v = calc_r_bar_v(R = R)
    r_bar = calc_r_bar(R = R)
    for c_ind, u_i in enumerate(test_set):
        r_act = R[u_i, held_out[u_i]] # Actual ratings of the test user on the held-out movies
        ojs = np.nonzero(held_out[u_i])[0] # Indices of the held-out movies
        r_hat = np.array([calc_r_hat(u_i, o_j, C[c_ind], R = R, r_bar_v = r_bar_v, r_bar = r_bar) for o_j in ojs])
        maes[c_ind] = np.nanmean(np.absolute(r_act - r_hat))
    return maes
In [57]:
mean_absolute_error(C, ts, np.ones((m,n), dtype = np.bool_))
In [21]:
def get_MAE(C, test_set, held_out, R = Rnan, ind = None):
    MAE = None
    file_name = 'MAE.npy' if ind is None else 'MAE' + str(ind) + '.npy'
    try:
        MAE = np.load(file_name)
    except FileNotFoundError:
        MAE = mean_absolute_error(C, test_set, held_out, R = R)
        np.save(file_name, MAE)
    return MAE
In [8]:
from sklearn.model_selection import KFold
def generate_test_sets(n_folds = 4):
    kfold = KFold(n_splits = n_folds, shuffle = True)
    l = []
    for i, (_, test_set) in enumerate(kfold.split(np.arange(m))):
        np.save('TS' + str(i) + '.npy', test_set)
        l.append(test_set)
    return l
In [9]:
generate_test_sets(4)
In [10]:
def load_test_sets(n_folds = 4):
    return [np.load('TS' + str(i) + '.npy') for i in range(n_folds)]
In [11]:
test_sets = load_test_sets(4)
test_sets
In [12]:
def get_test_sets(n_folds = 4):
    test_sets = None
    try:
        test_sets = load_test_sets(n_folds)
    except FileNotFoundError:
        test_sets = generate_test_sets(n_folds)
    return test_sets
For each test user, we hold out 90% of their rated items.
In [19]:
def generate_held_outs(test_sets, R = Rnan, p = 0.9):
    l = []
    for i, test_set in enumerate(test_sets):
        ho = np.zeros((m, n), dtype = np.bool_)
        for test_user in test_set:
            items = O_u(test_user, R = R)[0]
            np.random.shuffle(items)
            items = items[:int(p * items.shape[0])]
            ho[test_user, items] = True
        l.append(ho)
        np.save('HO' + str(i) + '.npy', ho)
    return l
In [23]:
np.sum(generate_held_outs(test_sets)[0])
In [26]:
def load_held_outs(n_folds = 4):
    return [np.load('HO' + str(i) + '.npy') for i in range(n_folds)]
In [69]:
held_outs = load_held_outs(4)
held_outs
In [5]:
def get_held_outs(test_sets, R = Rnan, p = 0.9):
    held_outs = None
    try:
        held_outs = load_held_outs(len(test_sets))
    except FileNotFoundError:
        held_outs = generate_held_outs(test_sets, R = R, p = p)
    return held_outs
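We can now put the whole pipeline together: for each fold we mask the held-out ratings, rebuild the transition matrix from the masked data, split it into $P^*$ and $\pi$, simulate the walks to obtain $W$, assemble the similarity matrix $C$, and finally evaluate the MAE on the held-out ratings.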
In [39]:
def generate_maes(n_folds = 4, R = Rnan, held_ratio = 0.9, alpha = 0.9):
    test_sets = get_test_sets(n_folds)
    held_outs = get_held_outs(test_sets, R = R, p = held_ratio)
    maes = []
    for i, (test_set, held_out) in enumerate(zip(test_sets, held_outs)):
        R_test = np.copy(R)
        R_test[held_out] = np.nan # Mask the held-out ratings
        P = get_P(R = R_test, ind = i)
        size_ts = test_set.shape[0]
        P_new = front_swap(P, test_set, dim = 2)
        P_star = P_new[size_ts:, size_ts:]
        pi = P_new[:size_ts, size_ts:]
        W = get_W(m, m - size_ts, P_star, alpha = alpha, ind = i)
        C = get_C(W, pi, test_set, alpha, ind = i)
        MAE = get_MAE(C, test_set, held_out, R = R, ind = i)
        maes.append(MAE)
    return maes

with np.errstate(divide='raise', invalid='raise'):
    maes = generate_maes()
In [34]:
C0 = np.load('C0.npy')
print(C0)