In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from scipy.sparse import csr_matrix, coo_matrix
import numpy as np
%matplotlib inline
In [24]:
ratings = pd.read_csv('BX-Book-Ratings.csv', encoding='iso-8859-1', sep=';')
ratings.columns = ['user_id', 'isbn', 'book_rating']
books = pd.read_csv('BX-Books.csv', sep=';', encoding='iso-8859-1', dtype=str)

# The same title can appear under multiple ISBNs (different editions);
# map every title to a single canonical ISBN so titles and ISBNs are 1:1.
print(books["Book-Title"].nunique() == books["ISBN"].nunique())
book_dict = books[["Book-Title", "ISBN"]].set_index("Book-Title").to_dict()["ISBN"]
books['new_isbn'] = books["Book-Title"].apply(lambda x: book_dict[x])
print(books["Book-Title"].nunique() == books["new_isbn"].nunique())
books['isbn'] = books['new_isbn']

# Keep only the columns we need
books = books.drop(columns=['Image-URL-L', 'Image-URL-M', 'Image-URL-S',
                            'Book-Author', 'Publisher', 'ISBN', 'new_isbn'])

# Keep explicit ratings only (a rating of 0 means "no rating" in this dataset)
newdf = ratings[ratings.book_rating > 0]
joined = books.merge(newdf, on='isbn')
print(newdf.shape)
In [72]:
# Build progressively larger datasets: the j most-rated books,
# restricted to the 20,000 most active users.
datasets = []
for j in [100, 150, 200, 300, 500]:
    top_books = joined.groupby('isbn').count().sort_values('user_id', ascending=False)[0:j].index.values
    top_users = joined.groupby('user_id').count().sort_values('isbn', ascending=False)[:20000].index.values
    newdf = joined[joined.user_id.isin(top_users) & joined.isbn.isin(top_books)]
    # Drop users with only a single rating
    data = newdf[newdf['user_id'].isin(newdf['user_id'].value_counts()[newdf['user_id'].value_counts() > 1].index)]
    data = data[['user_id', 'Book-Title', 'book_rating', 'isbn']].drop_duplicates()
    print(data.user_id.nunique(), data.isbn.nunique())
    print(data.groupby('user_id').count().sort_values('isbn', ascending=False).mean())
    datasets.append(data)
    # data.to_csv('data' + str(j) + '.csv')
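As a quick look at what was just built, something like the following prints the size of each dataset (a small added sketch; the exact counts depend on the ratings file):
In [ ]:
# Added sketch: summarize the datasets built above
for j, d in zip([100, 150, 200, 300, 500], datasets):
    print(j, 'books:', d.user_id.nunique(), 'users,', len(d), 'ratings')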
We implemented NMF from scratch, following the weighted NMF formulation in this paper: http://www.siam.org/meetings/sdm06/proceedings/059zhangs2.pdf. The implementation also supports regularisation parameters.
There are two functions for training the NMF model: train_nmf() trains on a given train/test split, while CV_nmf() runs k-fold cross-validation.
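For reference, the multiplicative update rules we implement below, with $M$ denoting the binary train mask (`Wts_train`), $\odot$ elementwise multiplication, and division taken elementwise, are:

$$W \leftarrow W \odot \frac{(M \odot X)\,H^\top}{(M \odot (WH))\,H^\top + \lambda_1\,\mathrm{diag}(M\mathbf{1})\,W}$$

$$H \leftarrow H \odot \frac{W^\top (M \odot X)}{W^\top (M \odot (WH)) + \lambda_2\,H\,\mathrm{diag}(\mathbf{1}^\top M)}$$

This is our transcription of the code below rather than a quote from the paper: the $\lambda$ terms scale each row of $W$ (each column of $H$) by the number of observed training ratings for that user (that book).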
In [9]:
def train_nmf(X, Wts, Wts_train, Wts_test, factors=30, iterations=10, lambda1=0, lambda2=0):
    """Train NMF on the train entries and report metrics on the test entries.
    Args:
        X: Matrix with rows for users and columns for books (NaN where unrated)
        Wts: Matrix of the shape of X; 1 where a rating is present, 0 otherwise
        Wts_train: Matrix of the shape of X; 1 where a known rating is used for training, 0 otherwise
        Wts_test: Matrix of the shape of X; 1 where a known rating is held out for testing, 0 otherwise
        factors: Number of latent factors to train
        iterations: Number of times the latent factors are updated
        lambda1: Regularization parameter for W. Predicted ratings = W*H
        lambda2: Regularization parameter for H. Predicted ratings = W*H
    Returns:
        W, H: Latent factors of the desired size
        test_mae, test_rmse, train_mae, train_rmse: MAE & RMSE metrics on test/train
    """
    W = np.random.uniform(low=0, high=0.5, size=(X.shape[0], factors))
    H = np.random.uniform(low=0, high=0.5, size=(factors, X.shape[1]))
    for i in range(iterations):
        # Multiplicative update for W; the mask Wts_train restricts the fit
        # to known training ratings, and nan_to_num zeroes out the NaNs in X.
        num_w = np.dot(np.nan_to_num(np.multiply(Wts_train, X)), H.T)
        den_w = np.dot(Wts_train * np.dot(W, H), H.T) + lambda1 * (np.asarray([Wts_train.sum(axis=1)]).T) * W
        den_w[den_w == 0] = 1e-16  # guard against division by zero
        W = W * num_w / den_w
        # Multiplicative update for H
        num_h = np.dot(W.T, np.nan_to_num(Wts_train * X))
        den_h = np.dot(W.T, Wts_train * np.dot(W, H)) + lambda2 * (np.asarray([Wts_train.sum(axis=0)])) * H
        den_h[den_h == 0] = 1e-16
        H = H * num_h / den_h
    test_mae = ((np.abs(np.nan_to_num(X - np.dot(W, H))) * Wts_test).sum()) / Wts_test.sum()
    test_rmse = np.sqrt(((np.nan_to_num(X - np.dot(W, H)) ** 2) * Wts_test).sum() / Wts_test.sum())
    train_mae = ((np.abs(np.nan_to_num(X - np.dot(W, H))) * Wts_train).sum()) / Wts_train.sum()
    train_rmse = np.sqrt(((np.nan_to_num(X - np.dot(W, H)) ** 2) * Wts_train).sum() / Wts_train.sum())
    print("Test MAE", test_mae)
    print("Test RMSE", test_rmse)
    print("Train MAE", train_mae)
    print("Train RMSE", train_rmse)
    return [W, H, test_mae, test_rmse, train_mae, train_rmse]
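A hypothetical smoke test on a tiny synthetic matrix (not from the original run; the numbers are made up, and only the shapes and masking convention are exercised):
In [ ]:
# Hypothetical smoke test: 4 users x 3 books, one known rating held out
X_toy = np.array([[5., np.nan, 3.],
                  [4., 2., np.nan],
                  [np.nan, 1., 4.],
                  [5., 4., np.nan]])
Wts_toy = (~np.isnan(X_toy)).astype(int)       # 1 where a rating exists
Wts_test_toy = np.zeros_like(Wts_toy)
Wts_test_toy[0, 0] = 1                         # hold out one known rating
Wts_train_toy = Wts_toy * (1 - Wts_test_toy)   # train on the rest
train_nmf(X_toy, Wts_toy, Wts_train_toy, Wts_test_toy, factors=2, iterations=50)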
In [51]:
def CV_nmf(data, k_cv=5, factors=30, iterations=10, lambda1=0, lambda2=0):
    """Cross-validation for the NMF implementation.
    Args:
        data: DataFrame in long form; one row per user-book pair with its rating
        k_cv: Number of folds
        factors: Number of latent factors to train
        iterations: Number of times the latent factors are updated
        lambda1: Regularization parameter for W. Predicted ratings = W*H
        lambda2: Regularization parameter for H. Predicted ratings = W*H
    Returns:
        test_mae, test_rmse, train_mae, train_rmse: Mean MAE & RMSE metrics over the folds
    """
    test_mae_mean = 0
    test_rmse_mean = 0
    train_mae_mean = 0
    train_rmse_mean = 0
    # Assign every rating to one of k_cv folds once, so the folds are disjoint
    data['rn'] = np.random.uniform(low=0, high=1, size=(len(data),))
    cuts = list(np.arange(0, 1, 1 / k_cv))
    cuts.append(1)
    data['fold'] = pd.cut(data.rn, cuts, labels=False)
    for i in range(k_cv):
        data['test'] = (data['fold'] == i).astype(int)
        X_matrix = data.pivot(index='user_id', columns='Book-Title', values='book_rating')
        X = np.asarray(X_matrix)
        Wts = (~np.isnan(X)).astype(int)
        Wts_test = np.nan_to_num(np.asarray(data.pivot(index='user_id', columns='Book-Title', values='test')))
        Wts_train = Wts * (~Wts_test.astype(bool))
        print('Results of fold', i)
        W, H, test_mae, test_rmse, train_mae, train_rmse = train_nmf(X, Wts, Wts_train, Wts_test, factors, iterations, lambda1, lambda2)
        test_mae_mean = test_mae_mean + test_mae
        test_rmse_mean = test_rmse_mean + test_rmse
        train_mae_mean = train_mae_mean + train_mae
        train_rmse_mean = train_rmse_mean + train_rmse
    print("----------Average of metrics------------")
    test_mae_mean = test_mae_mean / k_cv
    test_rmse_mean = test_rmse_mean / k_cv
    train_mae_mean = train_mae_mean / k_cv
    train_rmse_mean = train_rmse_mean / k_cv
    print("Average Test MAE", test_mae_mean)
    print("Average Test RMSE", test_rmse_mean)
    print("Average Train MAE", train_mae_mean)
    print("Average Train RMSE", train_rmse_mean)
    return [test_mae_mean, test_rmse_mean, train_mae_mean, train_rmse_mean]
In [49]:
data = datasets[4]  # the 500-book dataset
rows = data.user_id.unique()
cols = data['Book-Title'].unique()
data = data[['user_id', 'Book-Title', 'book_rating']]
# Map user ids and book titles to contiguous integer indices
idict = dict(zip(cols, range(len(cols))))
udict = dict(zip(rows, range(len(rows))))
data.user_id = [udict[i] for i in data.user_id]
data['Book-Title'] = [idict[i] for i in data['Book-Title']]
data = data[['user_id', 'Book-Title', 'book_rating']].drop_duplicates()
nmat = data.to_numpy()  # as_matrix() was removed in pandas 1.0
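The csr_matrix/coo_matrix imports at the top suggest a sparse representation was considered; a minimal sketch of building one from nmat (an added illustration only, since train_nmf itself expects dense arrays):
In [ ]:
# Optional sketch: hold the ratings sparsely instead of as a dense pivot
X_sparse = coo_matrix(
    (nmat[:, 2], (nmat[:, 0].astype(int), nmat[:, 1].astype(int))),
    shape=(len(rows), len(cols))
).tocsr()
print(X_sparse.shape, X_sparse.nnz)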
In [53]:
# Hold out roughly 10% of the known ratings as a test set
data['test'] = (np.random.uniform(low=0, high=1, size=(len(data),)) > 0.9).astype(int)
X_matrix = data.pivot(index='user_id', columns='Book-Title', values='book_rating')
X = np.asarray(X_matrix)
Wts = (~np.isnan(X)).astype(int)  # 1 where a rating is known
Wts_test = np.nan_to_num(np.asarray(data.pivot(index='user_id', columns='Book-Title', values='test')))
Wts_train = Wts * (~Wts_test.astype(bool))
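A quick added sanity check: by construction the train and test masks should partition the known ratings exactly.
In [ ]:
# Added check: masks partition the known ratings
assert np.array_equal(Wts_train + Wts_test, Wts)
print(int(Wts_train.sum()), "train ratings,", int(Wts_test.sum()), "test ratings")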
In [54]:
W, H, test_mae, test_rmse, train_mae, train_rmse = train_nmf(X, Wts, Wts_train, Wts_test, factors=50, iterations=5, lambda1=0, lambda2=0)
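With the factors in hand, predicted ratings are just W·H. A hypothetical sketch of pulling top recommendations for one user (`user_idx`, `title_by_idx`, and `top10` are names we introduce here, not part of the original notebook):
In [ ]:
# Hypothetical: recommend unseen books to one user from the learned factors
title_by_idx = {v: k for k, v in idict.items()}  # invert the title -> index map
user_idx = 0
preds = np.dot(W, H)[user_idx]
preds[Wts[user_idx] == 1] = -np.inf              # hide books the user already rated
top10 = np.argsort(preds)[::-1][:10]             # column positions of the 10 best predictions
print([title_by_idx[X_matrix.columns[i]] for i in top10])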
In [52]:
CV_nmf(data, factors=20, iterations=5)  # observed test MAE: 2.047 on 300 books, 1.9 on 500, 2.1 on 200
In [16]:
# Grid search over the number of latent factors (lambda1/lambda2 left at 0)
factors = np.arange(1, 100, 20)
test_maes = []
train_maes = []
for factor in factors:
    print("Results for factor", factor)
    test_mae_mean, test_rmse_mean, train_mae_mean, train_rmse_mean = CV_nmf(data, factors=factor, iterations=10)
    test_maes.append(test_mae_mean)
    train_maes.append(train_mae_mean)
In [19]:
results_df = pd.DataFrame({'Factors': factors, 'test_mae': test_maes, 'train_mae': train_maes})
results_df.plot(x='Factors', y=['test_mae', 'train_mae'], ylim=(0,4))
In [21]:
# Grid search over the number of training iterations (epochs)
iterations = np.arange(1, 50, 10)
test_maes = []
train_maes = []
for iteration in iterations:
    test_mae_mean, test_rmse_mean, train_mae_mean, train_rmse_mean = CV_nmf(data, factors=10, iterations=iteration)
    test_maes.append(test_mae_mean)
    train_maes.append(train_mae_mean)
In [22]:
results_df = pd.DataFrame({'Iterations': iterations, 'test_mae': test_maes, 'train_mae': train_maes})
results_df.plot(x='Iterations', y=['test_mae', 'train_mae'], ylim=(0,3))
In [67]:
# Time taken for different dataset sizes
import time
Num_books = [100, 150, 200, 300, 500]  # the top-j book counts used to build `datasets`
times = []
test_maes = []
train_maes = []
for datas in datasets:
    time_start = time.time()
    test_mae_mean, test_rmse_mean, train_mae_mean, train_rmse_mean = CV_nmf(datas, factors=20, iterations=10)
    time_taken = time.time() - time_start
    times.append(time_taken)
    test_maes.append(test_mae_mean)
    train_maes.append(train_mae_mean)
In [71]:
results_df = pd.DataFrame({'Num_books': Num_books, 'test_mae': test_maes, 'train_mae': train_maes, 'time': times})
results_df.plot(x='Num_books', y=['test_mae', 'train_mae'], ylim=(0, 3))
results_df.plot(x='Num_books', y=['time'], ylim=(0, 30))