In [14]:
import operator
import pickle
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import KFold
In [33]:
# Load the pickled item-similarity model (passed to predict2 as `item_similarity`).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
# The context manager closes the file even if unpickling raises.
with open('../models/item_model', 'rb') as f:
    item_model = pickle.load(f)
In [61]:
# Load the pickled LSH neighbour model (passed to predict as `model`).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
# The context manager closes the file even if unpickling raises.
with open('../models/lshbestmodel', 'rb') as f:
    lsh = pickle.load(f)
In [7]:
# Load the cleaned ratings and re-index users and books to dense 0..n-1 integer ids.
data = pd.read_csv('../clean-data/Combine.csv')
rows = data.user_id.unique()
cols = data['isbn'].unique()
print(data.user_id.nunique(), data.isbn.nunique())

# .copy() so the column assignments below write to an independent frame rather
# than to a slice of the original (avoids SettingWithCopyWarning).
data = data[['user_id', 'isbn', 'book_rating']].copy()
idict = dict(zip(cols, range(len(cols))))
udict = dict(zip(rows, range(len(rows))))
data['user_id'] = data['user_id'].map(udict)
data['isbn'] = data['isbn'].map(idict)

# DataFrame.as_matrix() was removed in pandas 1.0; .values works on old and new versions.
nmat = data.values.astype(int)
print(nmat.shape)

# Dense user x item rating matrix; cells that never receive a rating stay 0.
naive = np.zeros((len(rows), len(cols)))
for row in nmat:
    naive[row[0], row[1]] = row[2]
print(naive.T.shape)
In [12]:
# Global, per-user and per-item mean ratings, computed over non-zero cells only.
amean = np.mean(naive[naive != 0])

# Vectorised axis reductions replace builtin sum(), which iterated the matrix
# row by row in Python. Users/items with no non-zero rating produce 0/0 = NaN;
# the warning is silenced here because those entries are replaced just below.
with np.errstate(divide='ignore', invalid='ignore'):
    umean = naive.sum(axis=1) / (naive != 0).sum(axis=1)
    imean = naive.sum(axis=0) / (naive != 0).sum(axis=0)

# Fall back to the global mean where no rating exists.
umean = np.where(np.isnan(umean), amean, umean)
imean = np.where(np.isnan(imean), amean, imean)
In [17]:
def rmse(ypred, ytrue):
    """Root-mean-squared error over the positions where ``ytrue`` is non-zero.

    Entries whose true value is 0 are dropped from both arrays before the
    error is computed.
    """
    rated = ytrue.nonzero()
    predicted = ypred[rated].flatten()
    actual = ytrue[rated].flatten()
    squared_error = mean_squared_error(predicted, actual)
    return np.sqrt(squared_error)
def mae(ypred, ytrue):
    """Mean absolute error over the positions where ``ytrue`` is non-zero.

    Entries whose true value is 0 are dropped from both arrays before the
    error is computed.
    """
    rated = ytrue.nonzero()
    predicted = ypred[rated].flatten()
    actual = ytrue[rated].flatten()
    return mean_absolute_error(predicted, actual)
In [18]:
def predict(user, item, mat, model, amean, umean, imean, k=20):
    """Predict one rating from the precomputed neighbour lists in ``model``.

    The estimate is the baseline ``imean[item] + umean[user] - amean`` plus the
    similarity-weighted average of the user's baseline-adjusted ratings on up
    to ``k`` neighbours of ``item`` that the user has rated.

    Parameters
    ----------
    user, item : int
        Row / column index into ``mat``.
    mat : 2-D array
        Rating matrix to read the user's ratings from (0 = not rated). This is
        now actually used; previously the function read the global ``naive``
        matrix, which let held-out ratings leak into cross-validation.
    model : indexable
        ``model[item]`` must be convertible with ``dict()`` to a mapping
        ``neighbour item -> similarity``.
    amean : float
        Global mean rating.
    umean, imean : 1-D arrays
        Per-user and per-item mean ratings.
    k : int
        Maximum number of neighbours to use.

    Returns
    -------
    The prediction clipped to [1, 5]; ``amean`` when the user has no ratings
    or none of the item's neighbours were rated by the user.
    """
    rated = mat[user].nonzero()[0]
    if len(rated) == 0:
        return amean

    similarity = dict(model[item])
    # Keep only neighbours this user has rated, strongest similarity first.
    neighbours = sorted(
        ((j, similarity[j]) for j in set(rated).intersection(similarity)),
        key=lambda pair: pair[1],
        reverse=True,
    )[:k]
    if not neighbours:
        return amean

    def baseline(j):
        # Evaluated per index instead of building a full-length array per call.
        return imean[j] + umean[user] - amean

    num = 0
    denum = 0
    for j, weight in neighbours:
        num += (mat[user, j] - baseline(j)) * weight
        denum += weight

    if denum == 0:
        # Similarities sum to zero: no usable weighting, use the baseline alone.
        prediction = baseline(item)
    else:
        prediction = num / denum + baseline(item)
        if np.isnan(prediction):
            prediction = baseline(item)

    if prediction > 5:
        prediction = 5
    if prediction < 1:
        prediction = 1
    return prediction
In [34]:
def predict2(user, item, mat, item_similarity, amean, umean, imean, k=20):
    """Predict one rating from a dense item-item similarity matrix.

    The estimate is the baseline ``imean[item] + umean[user] - amean`` plus the
    similarity-weighted average of the user's baseline-adjusted ratings on the
    ``k`` rated items most similar to ``item``.

    Parameters
    ----------
    user, item : int
        Row / column index into ``mat``.
    mat : 2-D array
        Rating matrix (0 = not rated).
    item_similarity : 2-D array
        Indexed as ``item_similarity[item, other_items]``.
        NOTE(review): assumed to be a dense ndarray - confirm.
    amean : float
        Global mean rating.
    umean, imean : 1-D arrays
        Per-user and per-item mean ratings.
    k : int
        Maximum number of neighbours to use.

    Returns
    -------
    The prediction clipped to [1, 5]; ``amean`` when the user has no ratings.
    """
    rated = mat[user].nonzero()[0]
    if len(rated) == 0:
        return amean

    baseline = imean + umean[user] - amean

    # Exclude the target item explicitly. The previous `[1:k+1]` slice dropped
    # the top-ranked neighbour unconditionally, which discards the best real
    # neighbour whenever `item` is not among the user's rated items (always
    # the case for held-out test ratings).
    candidates = rated[rated != item]
    order = item_similarity[item, candidates].argsort()[::-1][:k]
    choice = candidates[order]

    if len(choice) == 0:
        prediction = baseline[item]
    else:
        weights = item_similarity[item, choice]
        total = weights.sum()
        if total == 0:
            # No usable weighting; use the baseline alone.
            prediction = baseline[item]
        else:
            prediction = (mat[user, choice] - baseline[choice]).dot(weights) / total + baseline[item]
            if np.isnan(prediction):
                prediction = baseline[item]

    if prediction > 5:
        prediction = 5
    if prediction < 1:
        prediction = 1
    return prediction
In [38]:
def get_results1(X, rows, cols, folds, k, item_sim1, item_sim2, weights, amean, umean, imean):
    """Cross-validate the weighted blend of ``predict`` and ``predict2``.

    ``X`` is split into ``folds`` shuffled folds (fixed seed 42). For each
    fold a ``rows`` x ``cols`` matrix is filled from the training triples
    (columns 0, 1, 2 of ``X`` are used as user index, item index, rating),
    every test triple is scored as
    ``weights[0] * predict(...) + weights[1] * predict2(...)``, and the
    fold's RMSE and MAE are printed.

    Returns
    -------
    (mean MAE, mean RMSE) across folds - note the order.
    """
    splitter = KFold(n_splits=folds, shuffle = True, random_state=42)
    rmse_list = []
    mae_list = []

    for count, (train_index, test_index) in enumerate(splitter.split(X), start=1):
        print("---------- Fold ", count, "---------------")
        train_data = X[train_index]
        test_data = X[test_index]

        # Rating matrix built from this fold's training triples only.
        full_mat = np.zeros((rows, cols))
        for triple in train_data:
            full_mat[triple[0], triple[1]] = triple[2]

        preds = []
        real = []
        for triple in test_data:
            user_id, isbn, rating = triple[0], triple[1], triple[2]
            first = predict(user_id, isbn, full_mat, item_sim1, amean, umean, imean, k)
            second = predict2(user_id, isbn, full_mat, item_sim2, amean, umean, imean, k)
            preds.append(weights[0] * first + weights[1] * second)
            real.append(rating)

        predicted = np.array(preds)
        actual = np.array(real)
        err1 = rmse(predicted, actual)
        err2 = mae(predicted, actual)
        rmse_list.append(err1)
        mae_list.append(err2)

        print('Test Errors')
        print('RMSE : %.4f' % err1)
        print('MAE : %.4f' % err2)

    print("-------------------------------------")
    print()
    print("Testing Avg Error:")
    avg_rmse = np.mean(rmse_list)
    avg_mae = np.mean(mae_list)
    print("AVG RMSE :", str(avg_rmse))
    print("AVG MAE :", str(avg_mae))
    print(" ")
    return np.mean(mae_list), np.mean(rmse_list)
In [70]:
# Compare blend weights (LSH share, item-similarity share) at k=15 with 5 folds.
# One loop replaces five copy-pasted cells that differed only in the weights;
# results are kept per weight pair as (avg MAE, avg RMSE) so they can be compared.
weight_sweep = {}
for blend in [(0.7, 0.3), (0.8, 0.2), (0.9, 0.1), (0.6, 0.4), (0.5, 0.5)]:
    print("Weights: ", blend)
    weight_sweep[blend] = get_results1(
        nmat, len(rows), len(cols), 5, 15, lsh, item_model, list(blend), amean, umean, imean
    )
weight_sweep
Out[66]:
In [72]:
# Sweep the neighbourhood size k for the 0.7 / 0.3 blend and record the
# cross-validated MAE (each_sims) and RMSE (each_sims_rmse) for every k.
each_sims = []
each_sims_rmse = []
k_values = [5, 10, 15, 20, 25]
for k in k_values:
    print("Nearest Neighbors: ",k)
    ans1, ans2 = get_results1(nmat, len(rows), len(cols), 5, k, lsh, item_model, [0.7, 0.3], amean, umean, imean)
    each_sims.append(ans1)
    each_sims_rmse.append(ans2)

print()
print("Best K Value for")
print()
# Report the K that achieved the minimum; previously the argmin list position
# (0-4) was printed under the "Best K Value" heading.
print("Min MAE")
print(np.min(each_sims), k_values[int(np.argmin(each_sims))])
print("Min RMSE")
print(np.min(each_sims_rmse), k_values[int(np.argmin(each_sims_rmse))])
print()
In [337]:
%matplotlib inline
In [348]:
# Tabulate the per-k cross-validation errors, plot both metrics against k,
# and save the chart next to the notebook.
results_df1 = pd.DataFrame(
    {
        'Nearest Neighbors': [5, 10, 15, 20, 25],
        'MAE': each_sims,
        'RMSE': each_sims_rmse,
    }
)
plot1 = results_df1.plot(
    x='Nearest Neighbors',
    y=['MAE', 'RMSE'],
    ylim=(0.5, 0.85),
    title='Hybrid Model: Metrics over different K',
)
fig = plot1.get_figure()
fig.savefig('MetricsHybrid.png')
In [46]:
# Reverse lookup: dense item index -> original ISBN (inverse of idict).
flipped_dict = {index: isbn for isbn, index in idict.items()}
In [74]:
# Dense user x item rating matrix built from every (user, item, rating) triple in nmat.
full_mat = np.zeros(shape=(len(rows), len(cols)))
for row in nmat:
    full_mat[tuple(row[:2])] = row[2]
In [47]:
def coverage(full_mat, user_id, item_sim1, item_sim2, weights, k, mlist, flipped_dict, cov=False):
    """Return the user's top-m recommendation lists for every m in ``mlist``.

    Every item (column of ``full_mat``) is scored with the weighted blend
    ``weights[0] * predict(...) + weights[1] * predict2(...)``. ``amean``,
    ``umean`` and ``imean`` are read from the notebook's global scope.

    Parameters
    ----------
    full_mat : 2-D array
        User x item rating matrix.
    user_id : int
        Row index of the user.
    item_sim1, item_sim2
        Models forwarded to ``predict`` and ``predict2`` respectively.
    weights : sequence of two floats
        Blend weights.
    k : int
        Neighbourhood size forwarded to both predictors.
    mlist : sequence of int
        Recommendation list sizes; any order is accepted.
    flipped_dict : mapping
        Item index -> label used in the returned lists.
    cov : bool
        When False, the user's already-rated items are printed and removed
        from the candidates; when True they are kept and nothing is printed.

    Returns
    -------
    A list with one entry per element of ``mlist``: the labels of the
    top-``m`` scored items. Empty list when ``mlist`` is empty.
    """
    n = full_mat.shape[1]
    nzero = full_mat[user_id].nonzero()[0]

    preds = {}
    for row in range(n):
        pred1 = predict(user_id, row, full_mat, item_sim1, amean, umean, imean, k)
        pred2 = predict2(user_id, row, full_mat, item_sim2, amean, umean, imean, k)
        preds[row] = weights[0] * pred1 + weights[1] * pred2

    if not cov:
        print("Books Read -----")
        for i in nzero:
            print(flipped_dict[i])
            del preds[i]

    if len(mlist) == 0:
        return []

    res = sorted(preds.items(), key=lambda x: x[1], reverse=True)
    # Size the shared ranking by the largest request rather than the last one,
    # so an unsorted mlist no longer truncates the longer lists.
    ans = [flipped_dict[i[0]] for i in res[:max(mlist)]]
    return [ans[:m] for m in mlist]
In [75]:
# Collect every user's top-5/10/15/20/25 recommendations (blend 0.7 / 0.3, k=15)
# into one flat list per list size; cov=True keeps already-rated items as candidates.
mlist = [5,10,15,20,25]
cov1, cov2, cov3, cov4, cov5 = [], [], [], [], []
for i in range(len(rows)):
    if i % 100 == 0:
        print(i)  # progress marker every 100 users
    ans = coverage(full_mat, i, lsh, item_model, [0.7, 0.3], 15, mlist, flipped_dict, cov=True)
    cov1 += ans[0]
    cov2 += ans[1]
    cov3 += ans[2]
    cov4 += ans[3]
    cov5 += ans[4]
In [143]:
# Catalogue coverage: percentage of all items that appear in at least one
# user's top-m list. The denominator is read from the rating matrix instead of
# the hard-coded 4959 (NOTE(review): 4959 is presumably the item count printed
# when the data was loaded - confirm the two agree).
n_books = full_mat.shape[1]
for m, recommended in zip(mlist, (cov1, cov2, cov3, cov4, cov5)):
    print("Coverage with recommending %d books" % m, len(set(recommended)) / n_books * 100, "%")
In [370]:
def getmrec(full_mat, user_id, item_similarity, k, m, flipped_dict, cov=False):
    """Recommend ``m`` items to a user and compare genre tags of read vs recommended.

    Every item (column of ``full_mat``) is scored with ``predict``; ``amean``,
    ``umean``, ``imean`` and ``ansdict`` are read from the notebook's global
    scope. ``ansdict[i]`` must be an iterable of genre labels for item ``i``.

    Parameters
    ----------
    full_mat : 2-D array
        User x item rating matrix.
    user_id : int
        Row index of the user.
    item_similarity
        Model forwarded to ``predict``.
    k : int
        Neighbourhood size forwarded to ``predict``.
    m : int
        Number of recommendations to return.
    flipped_dict : mapping
        Item index -> label used in the returned list.
    cov : bool
        When False, the user's rated items are removed from the candidates and
        their genre labels are counted as "Books Read". When True nothing is
        removed and the "Books Read" column is all zeros.

    Returns
    -------
    (df, ans): ``df`` is indexed by genre label with count columns
    'Books Read' and 'Books Recommended', sorted descending; ``ans`` is the
    list of labels of the ``m`` highest-scored items.
    """
    n = full_mat.shape[1]
    nzero = full_mat[user_id].nonzero()[0]

    preds = {}
    for row in range(n):
        preds[row] = predict(user_id, row, full_mat, item_similarity, amean, umean, imean, k)

    genre = []
    if not cov:
        for i in nzero:
            genre.extend(ansdict[i])
            del preds[i]

    newA = dict(sorted(preds.items(), key=operator.itemgetter(1), reverse=True)[:m])

    retgen = []
    for j in newA.keys():
        retgen.extend(ansdict[j])
    ans = [flipped_dict[i] for i in newA.keys()]

    # Build the two count columns as Series and align them on genre label.
    # The previous from_dict(...).sort_values(by=0) raised KeyError when either
    # label list was empty (e.g. cov=True), because an empty frame has no column 0.
    read_counts = pd.Series(dict(Counter(genre)), dtype='int64')
    rec_counts = pd.Series(dict(Counter(retgen)), dtype='int64')
    df = pd.DataFrame({'Books Read': read_counts, 'Books Recommended': rec_counts}).fillna(0)
    df = df.sort_values(['Books Read', 'Books Recommended'], ascending=False)
    return df, ans
In [371]:
# Genre profile and top-10 recommendations for user index 130 (LSH model, k=15);
# already-rated books are excluded from the candidates (cov=False).
df, ans = getmrec(
    full_mat,
    user_id=130,
    item_similarity=lsh,
    k=15,
    m=10,
    flipped_dict=flipped_dict,
    cov=False,
)
In [375]:
# Display the genre comparison table returned by getmrec
# (columns 'Books Read' and 'Books Recommended').
df
Out[375]:
In [387]:
# Show the recommended labels returned by getmrec as a one-column table
# (rich display replaces the earlier print loop).
df2 = pd.DataFrame(ans,columns=['Recommended Books'])
df2
Out[387]:
In [148]:
# Load a pickled table that exposes a `title` attribute (used in the next cells).
# NOTE(review): this rebinds `data`, discarding the ratings frame loaded earlier;
# re-running earlier cells after this one will see the wrong object.
# NOTE(review): read_pickle unpickles the file - only use with trusted artifacts.
data = pd.read_pickle('../created_datasets/ibsn_features_new_batch.pickle')
In [ ]:
In [149]:
# Take the `title` attribute of the freshly loaded table
# (presumably the book-title column - confirm).
name = data.title
In [150]:
# Materialise the titles as a plain list so they can be indexed by position below.
name = list(name)
In [151]:
# Overwrite the values of flipped_dict for keys 0..len-1 with name[i], mutating it in place.
# NOTE(review): assumes the i-th entry of `name` belongs to item index i and
# that `name` has at least len(flipped_dict) entries - confirm the row order
# of the pickle matches the item indexing.
for i in range(len(flipped_dict)):
    flipped_dict[i] = name[i]
In [158]:
# NOTE(review): `feats` is not defined anywhere in the visible notebook; it must
# come from a deleted or out-of-order cell. Define or load it before this cell
# so the notebook survives Restart & Run All.
feats.head()
Out[158]:
In [333]:
def get_top_two_indices(x):
    """Return the index labels of the two largest values in ``x``.

    ``x`` is sorted in descending order and the labels of the first two
    entries are returned as a new Series with a default 0..n-1 index.
    """
    ranked = x.sort_values(ascending=False)
    top_labels = ranked.iloc[:2].index.values
    return pd.Series(top_labels)
In [334]:
# For every row of `feats`, keep the names of its two highest-valued columns in
# the Science_Score..Fantasy_Score range, as {row label: [top column, runner-up]}.
ansdict = (
    feats.loc[:, 'Science_Score':'Fantasy_Score']
    .apply(get_top_two_indices, axis=1)
    .T
    .to_dict(orient='list')
)