In [218]:
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
%matplotlib inline
In [219]:
ratings = pd.read_csv('../raw-data/BX-Book-Ratings.csv', encoding='iso-8859-1', sep = ';')
ratings.columns = ['user_id', 'isbn', 'book_rating']
In [220]:
print(ratings.dtypes)
print()
print(ratings.head())
print()
print("Data Points :", ratings.shape[0])
In [221]:
books = pd.read_csv('../raw-data/BX-Books.csv', sep=';', encoding = 'iso-8859-1', dtype =str)
del books['Image-URL-L']
del books['Image-URL-M']
del books['Image-URL-S']
del books['Book-Author']
del books['Publisher']
Some books do not have a unique ISBN (the same title can appear under multiple ISBNs), so we create a 1:1 mapping between book title and ISBN.
In [222]:
print('Number of Books == Number of ISBN ? ', books["Book-Title"].nunique() == books["ISBN"].nunique())
book_dict = books[["Book-Title","ISBN"]].set_index("Book-Title").to_dict()["ISBN"]
books['new_isbn'] = books["Book-Title"].apply(lambda x: book_dict[x])
print('Number of Books == Number of ISBN ? ', books["Book-Title"].nunique() == books["new_isbn"].nunique())
In [223]:
books['isbn'] = books['new_isbn']
del books['ISBN']
del books['new_isbn']
In [224]:
books.shape
Out[224]:
Removing ratings equal to zero, since the Book-Crossing dataset uses an explicit rating scale of 1-10 (a 0 denotes an implicit interaction). Taking an inner join with the books dataframe keeps only ratings of books whose details exist.
In [225]:
newdf = ratings[ratings.book_rating>0]
joined = books.merge(newdf, on ='isbn')
print(joined.shape)
In [226]:
rows = joined.user_id.unique()
cols = joined['Book-Title'].unique()
print(joined.user_id.nunique(), joined.isbn.nunique())
In [227]:
print("Sparsity :", 100 - (joined.shape[0]/(joined.user_id.nunique()* joined.isbn.nunique())))
In [228]:
data1 = pd.read_csv('../clean-data/ratings_Books.csv', )
In [229]:
data1.columns = ['user_id', 'isbn', 'book_rating', 'timestamp']
In [230]:
rows = data1.user_id.unique()
cols = data1.isbn.unique()
print(data1.user_id.nunique(), data1.isbn.nunique())
print("Sparsity :", 100 - (data1.shape[0]/(data1.user_id.nunique()* data1.isbn.nunique())))
In [232]:
data1 = data1[['user_id', 'isbn', 'book_rating']]
In [233]:
data1.shape
Out[233]:
In [234]:
data2 = joined[['user_id', 'isbn', 'book_rating']].copy()
In [235]:
# Rescale the 1-10 Book-Crossing ratings to the 1-5 scale used by the other ratings file
data2['book_rating'] = data2['book_rating'] / 2.0
In [236]:
data2.shape
Out[236]:
In [237]:
data2 = data2.drop_duplicates()
In [238]:
data2.shape
Out[238]:
In [239]:
data3 = pd.concat((data1, data2))
In [240]:
data3.shape
Out[240]:
In [335]:
# Densify the matrix: keep books with more than 50 ratings, then users with more than 49 ratings,
# then re-filter to books with more than 53 ratings after the user filter
temp = data3[data3['isbn'].isin(data3['isbn'].value_counts()[data3['isbn'].value_counts()>50].index)]
# print(len(temp.user_id.unique()))
# print(len(temp.isbn.unique()))
temp1 = temp[temp['user_id'].isin(temp['user_id'].value_counts()[temp['user_id'].value_counts()>49].index)]
# print(len(temp1.user_id.unique()))
# print(len(temp1.isbn.unique()))
temp2 = temp1[temp1['isbn'].isin(temp1['isbn'].value_counts()[temp1['isbn'].value_counts()>53].index)]
print(len(temp2.user_id.unique()))
print(len(temp2.isbn.unique()))
In [242]:
print(temp2.groupby(['user_id']).count()['book_rating'].mean())
print(temp2.groupby(['isbn']).count()['book_rating'].mean())
In [243]:
data = temp2
rows = data.user_id.unique()
cols = data.isbn.unique()
print(data.user_id.nunique(), data.isbn.nunique())
data = data[['user_id', 'isbn', 'book_rating']]
data.to_csv('Combine.csv')
In [244]:
print("Sparsity :", 100 - (data.shape[0]/(len(cols)*len(rows)) * 100))
In [245]:
idict = dict(zip(cols, range(len(cols))))
udict = dict(zip(rows, range(len(rows))))
data.user_id = [
udict[i] for i in data.user_id
]
data['isbn'] = [
idict[i] for i in data['isbn']
]
nmat = data.to_numpy()  # rows are (user_index, item_index, rating) triples
In [246]:
nmat = nmat.astype(int)
nmat.shape
Out[246]:
In [247]:
def rmse(ypred, ytrue):
ypred = ypred[ytrue.nonzero()].flatten()
ytrue = ytrue[ytrue.nonzero()].flatten()
return np.sqrt(mean_squared_error(ypred, ytrue))
def mae(ypred, ytrue):
ypred = ypred[ytrue.nonzero()].flatten()
ytrue = ytrue[ytrue.nonzero()].flatten()
return mean_absolute_error(ypred, ytrue)
Our naive baseline for any user i and item j is to predict the average rating over the entire training set (amean).
In [317]:
def predict_naive(user, item):
return amean1
In [318]:
x1, x2 = train_test_split(nmat, test_size = 0.2, random_state =42)
naive = np.zeros((len(rows),len(cols)))
for row in x1:
naive[row[0], row[1]] = row[2]
predictions = []
targets = []
amean1 = np.mean(naive[naive!=0])
umean1 = sum(naive.T) / sum((naive!=0).T)
imean1 = sum(naive) / sum((naive!=0))
umean1 = np.where(np.isnan(umean1), amean1, umean1)
imean1 = np.where(np.isnan(imean1), amean1, imean1)
print('Naive---')
for row in x2:
user, item, actual = row[0], row[1], row[2]
predictions.append(predict_naive(user, item))
targets.append(actual)
print('rmse %.4f' % rmse(np.array(predictions), np.array(targets)))
print('mae %.4f' % mae(np.array(predictions), np.array(targets)))
print()
Following are the functions to calculate the pairwise similarity between two items: cosine, adjusted cosine, Euclidean, and Pearson correlation.
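For items $a$ and $b$ with co-rated user set $C_{ab}$ (users who rated both), the measures implemented below are:

$$\text{cosine:}\quad \text{sim}(a,b)=\frac{\sum_{u\in C_{ab}} r_{ua}\,r_{ub}}{\sqrt{\sum_{u\in C_{ab}} r_{ua}^2}\,\sqrt{\sum_{u\in C_{ab}} r_{ub}^2}}$$

Adjusted cosine applies the same formula after replacing $r_{u\cdot}$ with $r_{u\cdot}-\bar{r}_u$ (each user's mean removed); Pearson correlation instead subtracts the item means $\bar{r}_a$ and $\bar{r}_b$ and requires at least two co-rated users. The Euclidean variant uses

$$\text{sim}(a,b)=\frac{1}{1+\lVert \mathbf{r}_a-\mathbf{r}_b\rVert_2}$$

over the co-rated entries. In itemsimilar below, the adjusted-cosine and Pearson values are rescaled from $[-1,1]$ to $[0,1]$.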
In [250]:
def cos(mat, a, b):
if a == b:
return 1
aval = mat.T[a].nonzero()
bval = mat.T[b].nonzero()
corated = np.intersect1d(aval, bval)
if len(corated) == 0:
return 0
avec = np.take(mat.T[a], corated)
bvec = np.take(mat.T[b], corated)
val = 1 - cosine(avec, bvec)
if np.isnan(val):
return 0
return val
In [251]:
def adjcos(mat, a, b, umean):
if a == b:
return 1
aval = mat.T[a].nonzero()
bval = mat.T[b].nonzero()
corated = np.intersect1d(aval, bval)
if len(corated) == 0:
return 0
avec = np.take(mat.T[a], corated)
bvec = np.take(mat.T[b], corated)
avec1 = avec - umean[corated]
bvec1 = bvec - umean[corated]
val = 1 - cosine(avec1, bvec1)
if np.isnan(val):
return 0
return val
In [252]:
def pr(mat, a, b, imean):
if a == b:
return 1
aval = mat.T[a].nonzero()
bval = mat.T[b].nonzero()
corated = np.intersect1d(aval, bval)
if len(corated) < 2:
return 0
avec = np.take(mat.T[a], corated)
bvec = np.take(mat.T[b], corated)
avec1 = avec - imean[a]
bvec1 = bvec - imean[b]
val = 1 - cosine(avec1, bvec1)
if np.isnan(val):
return 0
return val
In [253]:
def euc(mat, a, b):
if a == b:
return 1
aval = mat.T[a].nonzero()
bval = mat.T[b].nonzero()
corated = np.intersect1d(aval, bval)
if len(corated) == 0:
return 0
avec = np.take(mat.T[a], corated)
bvec = np.take(mat.T[b], corated)
    dist = np.sqrt(np.sum((avec - bvec)**2))
val = 1/(1+dist)
if np.isnan(val):
return 0
return val
The itemsimilar function returns a matrix of pairwise similarities between all items, based on the option provided. It also returns amean (global mean rating), umean (average rating of each user), and imean (average rating of each item).
In [254]:
def itemsimilar(mat, option):
amean = np.mean(mat[mat!=0])
umean = sum(mat.T) / sum((mat!=0).T)
imean = sum(mat) / sum((mat!=0))
umean = np.where(np.isnan(umean), amean, umean)
imean = np.where(np.isnan(imean), amean, imean)
n = mat.shape[1]
sim_mat = np.zeros((n, n))
if option == 'pr':
#print("PR")
for i in range(n):
for j in range(n):
sim_mat[i][j] = pr(mat, i, j, imean)
sim_mat = (sim_mat + 1)/2
elif option == 'cos':
#print("COS")
print(n)
for i in range(n):
if(i%100 == 0):
print(i)
for j in range(n):
sim_mat[i][j] = cos(mat, i, j)
elif option == 'adjcos':
#print("ADJCOS")
for i in range(n):
for j in range(n):
sim_mat[i][j] = adjcos(mat, i, j, umean)
sim_mat = (sim_mat + 1)/2
elif option == 'euc':
#print("EUCLIDEAN")
for i in range(n):
for j in range(n):
sim_mat[i][j] = euc(mat, i, j)
else:
#print("Hello")
sim_mat = cosine_similarity(mat.T)
return sim_mat, amean, umean, imean
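Because every measure above is symmetric (sim(a, b) == sim(b, a)), the double loop does roughly twice the necessary work. A minimal sketch of a symmetry-aware variant, assuming the cos function defined earlier (this helper is not part of the original notebook):

def itemsimilar_sym(mat, simfunc=cos):
    # Fill only the upper triangle and mirror it; the diagonal is 1 by definition.
    n = mat.shape[1]
    sim_mat = np.eye(n)
    for i in range(n):
        for j in range(i + 1, n):
            val = simfunc(mat, i, j)
            sim_mat[i][j] = val
            sim_mat[j][i] = val
    return sim_mat

Called as itemsimilar_sym(naive), this would reproduce the 'cos' option's similarity matrix while halving the number of pairwise calls.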
In [37]:
import time
start = time.time()
naive = np.zeros((len(rows),len(cols)))
for row in x1:
naive[row[0], row[1]] = row[2]
items, amean, umean, imean = itemsimilar(naive,'cos')
end = time.time()
print(end-start)
In [255]:
print(end - start)
In [256]:
items.shape
Out[256]:
The predict function estimates the rating user i would give item j as a baseline-adjusted weighted average over the k rated items most similar to j.
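Concretely, with baseline $b_{ui} = \bar{r}_i + \bar{r}_u - \bar{r}$ and $N_k(u;i)$ the $k$ items rated by user $u$ that are most similar to item $i$, the prediction computed below is

$$\hat{r}_{ui} = b_{ui} + \frac{\sum_{j \in N_k(u;i)} s_{ij}\,(r_{uj} - b_{uj})}{\sum_{j \in N_k(u;i)} s_{ij}},$$

clipped to the rating range $[1, 5]$.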
In [327]:
def predict(user, item, mat, item_similarity, amean, umean, imean, k=20):
nzero = mat[user].nonzero()[0]
if len(nzero) == 0:
return amean
baseline = imean + umean[user] - amean
choice = nzero[item_similarity[item, nzero].argsort()[::-1][1:k+1]]
prediction = ((mat[user, choice] - baseline[choice]).dot(item_similarity[item, choice])/ sum(item_similarity[item, choice])) + baseline[item]
if np.isnan(prediction):
prediction = imean[item] + umean[user] - amean
if prediction > 5:
prediction = 5
if prediction < 1:
prediction = 1
return prediction
In [328]:
predict(0,1, naive, items, amean, umean, imean,5)
Out[328]:
In [329]:
def get_results1(X, rows, cols, folds, k, item_similarity, amean, umean, imean):
kf = KFold(n_splits=folds, shuffle = True, random_state=95)
count = 1
rmse_list = []
mae_list = []
trmse_list = []
tmae_list = []
for train_index, test_index in kf.split(X):
print("---------- Fold ", count, "---------------")
train_data, test_data = X[train_index], X[test_index]
full_mat = np.zeros((rows, cols))
for row in train_data:
full_mat[row[0], row[1]] = row[2]
preds = []
real = []
for row in train_data:
user_id, isbn, rating = row[0], row[1], row[2]
preds.append(predict(user_id, isbn, full_mat, item_similarity, amean, umean, imean, k))
real.append(rating)
err1 = rmse(np.array(preds), np.array(real))
err2 = mae(np.array(preds), np.array(real))
trmse_list.append(err1)
tmae_list.append(err2)
print('Train Errors')
print('RMSE : %.4f' % err1)
print('MAE : %.4f' % err2)
preds = []
real = []
for row in test_data:
user_id, isbn, rating = row[0], row[1], row[2]
preds.append(predict(user_id, isbn, full_mat, item_similarity, amean, umean, imean, k))
real.append(rating)
err1 = rmse(np.array(preds), np.array(real))
err2 = mae(np.array(preds), np.array(real))
rmse_list.append(err1)
mae_list.append(err2)
print('Test Errors')
print('RMSE : %.4f' % err1)
print('MAE : %.4f' % err2)
count+=1
print("-------------------------------------")
print("Training Avg Error:")
print("AVG RMSE :", str(np.mean(trmse_list)))
print("AVG MAE :", str(np.mean(tmae_list)))
print()
print("Testing Avg Error:")
print("AVG RMSE :", str(np.mean(rmse_list)))
print("AVG MAE :", str(np.mean(mae_list)))
print(" ")
return np.mean(mae_list), np.mean(rmse_list)
In [344]:
s = time.time()
get_results1(nmat, len(rows), len(cols), 5, 20, items, amean, umean, imean)
e=time.time()
In [352]:
print("Time to test the recommendation over 5 fold cross validation of the data", (e-s)/5, "seconds")
get_results1 is our cross-validation setup; varying its k parameter lets us tune the nearest-neighbours hyperparameter.
In [331]:
each_sims = []
each_sims_rmse = []
for k in [5, 10, 15, 20, 25]:
print("Nearest Neighbors: ",k)
ans1, ans2 = get_results1(nmat, len(rows), len(cols), 5 ,k,items, amean,umean, imean)
each_sims.append(ans1)
each_sims_rmse.append(ans2)
print()
print("Best K Value for")
print()
print("Min MAE")
print(np.min(each_sims), np.argmin(each_sims))
print("Min RMSE")
print(np.min(each_sims_rmse), np.argmin(each_sims_rmse))
print()
In [ ]:
In [332]:
print(each_sims[2], each_sims_rmse[2])
In [333]:
results_df1 = pd.DataFrame({'Nearest Neighbors': [5, 10, 15, 20, 25], 'MAE': each_sims, 'RMSE': each_sims_rmse })
plot1 = results_df1.plot(x='Nearest Neighbors', y=['MAE', 'RMSE'], ylim=(0.5,0.85), title = 'Item-Item CF: Metrics over different K')
fig = plot1.get_figure()
fig.savefig('MetricsCFK.png')
The getmrec function returns the top m recommendations for a user_id, based on the chosen similarity matrix and k neighbours.
In [264]:
full_mat = np.zeros((len(rows),len(cols)))
for row in nmat:
full_mat[row[0], row[1]] = row[2]
#item_similarity, amean, umean, imean = itemsimilar(full_mat, 'euc')
In [265]:
def getmrec(full_mat, user_id, item_similarity, k, m, idict, cov = False):
n = item_similarity.shape[0]
nzero = full_mat[user_id].nonzero()[0]
preds = {}
for row in range(n):
preds[row] = predict(user_id, row, full_mat, item_similarity, amean, umean, imean, k)
flipped_dict = dict(zip(idict.values(), idict.keys()))
if not cov:
print("Books Read -----")
for i in nzero:
print(flipped_dict[i])
del preds[i]
res = sorted(preds.items(), key=lambda x: x[1], reverse = True)
ans = [flipped_dict[i[0]] for i in res[:m]]
return ans
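getmrec is not invoked later in the notebook; a minimal usage sketch (user index 0 is just an illustrative choice, and items is the collaborative similarity matrix computed earlier):

# Print the 5 highest-predicted unread books for the remapped user index 0,
# using the 10 nearest neighbours
print(getmrec(full_mat, 0, items, 10, 5, idict))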
In [266]:
flipped_dict = dict(zip(idict.values(), idict.keys()))
In [267]:
def coverage(full_mat, user_id, item_similarity, k, mlist, flipped_dict, cov = False):
n = item_similarity.shape[0]
nzero = full_mat[user_id].nonzero()[0]
preds = {}
for row in range(n):
preds[row] = predict(user_id, row, full_mat, item_similarity, amean, umean, imean, k)
if not cov:
print("Books Read -----")
for i in nzero:
print(flipped_dict[i])
del preds[i]
res = sorted(preds.items(), key=lambda x: x[1], reverse = True)
ret_tup = []
ans = [flipped_dict[i[0]] for i in res[:mlist[-1]]]
for i in mlist:
ret_tup.append(ans[:i])
return ret_tup
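The loops below use this helper to estimate catalog coverage: each user gets a top-m list, and coverage is the share of the catalog that appears in at least one list,

$$\text{coverage}(m) = \frac{\left|\bigcup_u \text{top}_m(u)\right|}{N_{\text{items}}} \times 100,$$

where the hardcoded 4959 in the cells further down is taken to be the number of distinct books in the filtered data.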
In [268]:
cov1 = []
cov2 = []
cov3 = []
cov4 = []
cov5 = []
mlist = [5,10,15,20,25]
for i in range(len(rows)):
if(i%100 == 0):
print(i)
ans = coverage(full_mat, i, items, 10, mlist, flipped_dict, True)
cov1.extend(ans[0])
cov2.extend(ans[1])
cov3.extend(ans[2])
cov4.extend(ans[3])
cov5.extend(ans[4])
In [269]:
print("Coverage with recommending 5 books", len(set(cov1))/4959 *100 ,"%")
In [270]:
print("Coverage with recommending 10 books", len(set(cov2))/4959 *100 ,"%")
In [271]:
print("Coverage with recommending 15 books", len(set(cov3))/4959 *100 ,"%")
In [272]:
print("Coverage with recommending 20 books", len(set(cov4))/4959 *100 ,"%")
In [273]:
print("Coverage with recommending 25 books", len(set(cov5))/4959 *100 ,"%")
In [ ]:
In [ ]:
In [ ]:
In [274]:
# Pre-computed book content features, used below to build a content-based item similarity matrix
feats = pd.read_csv('../book_features.csv')
In [275]:
feats.shape
Out[275]:
In [276]:
feats.head()
Out[276]:
In [277]:
scores = feats.iloc[:,1:15]
In [278]:
scores1 = scores.to_numpy()
In [279]:
scores1.shape
Out[279]:
In [280]:
inputscores = scores1.T
In [284]:
naive = np.zeros((len(rows),len(cols)))
for row in x1:
naive[row[0], row[1]] = row[2]
items_features, temple1, temple2, temple3 = itemsimilar(inputscores,'')
In [349]:
s1 = time.time()
get_results1(nmat, len(rows), len(cols), 5, 20, items_features, amean, umean, imean)
e1 = time.time()
In [351]:
print("Time to test the recommendation over 5 folds cross validation of the data", (e1-s1)/5, "seconds")
In [287]:
each_sims_con = []
each_sims_rmse_con = []
for k in [5, 10, 15, 20, 25]:
print("Nearest Neighbors: ",k)
ans1, ans2 = get_results1(nmat, len(rows), len(cols), 5 ,k,items_features, amean,umean, imean)
each_sims_con.append(ans1)
each_sims_rmse_con.append(ans2)
print()
print("Best K Value for")
print()
print("Min MAE")
print(np.min(each_sims_con), np.argmin(each_sims_con))
print("Min RMSE")
print(np.min(each_sims_rmse_con), np.argmin(each_sims_rmse_con))
print()
In [308]:
results_df2 = pd.DataFrame({'Nearest Neighbors': [5, 10, 15, 20, 25], 'MAE': each_sims_con, 'RMSE': each_sims_rmse_con })
plot2 = results_df2.plot(x='Nearest Neighbors', y=['MAE', 'RMSE'], ylim=(0.5,0.9), title = 'Content Based Item-Item CF: Metrics over different K')
fig = plot2.get_figure()
fig.savefig('MetricsContentCFK.png')
In [316]:
covcon1 = []
covcon2 = []
covcon3 = []
covcon4 = []
covcon5 = []
mlist = [5,10,15,20,25]
for i in range(len(rows)):
if(i%100 == 0):
print(i)
ans = coverage(full_mat, i, items_features, 10, mlist, flipped_dict, True)
covcon1.extend(ans[0])
covcon2.extend(ans[1])
covcon3.extend(ans[2])
covcon4.extend(ans[3])
covcon5.extend(ans[4])
In [338]:
print("Coverage with recommending 5 books", len(set(covcon1))/4959 *100 ,"%")
In [339]:
print("Coverage with recommending 10 books", len(set(covcon2))/4959 *100 ,"%")
In [340]:
print("Coverage with recommending 15 books", len(set(covcon3))/4959 *100 ,"%")
In [341]:
print("Coverage with recommending 20 books", len(set(covcon4))/4959 *100 ,"%")
In [342]:
print("Coverage with recommending 25 books", len(set(covcon5))/4959 *100 ,"%")
In [ ]: