In [14]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error

Load the precomputed Model for Hybrid Approach


In [33]:
f = open('../models/item_model', 'rb')
item_model = pickle.load(f)
f.close()

In [61]:
f = open('../models/lshbestmodel', 'rb')
lsh = pickle.load(f)
f.close()

In [7]:
data = pd.read_csv('../clean-data/Combine.csv')

rows = data.user_id.unique()
cols = data['isbn'].unique()
print(data.user_id.nunique(), data.isbn.nunique())
data = data[['user_id', 'isbn', 'book_rating']]

idict  = dict(zip(cols, range(len(cols))))
udict = dict(zip(rows, range(len(rows))))

data.user_id = [
    udict[i] for i in data.user_id
]
data['isbn'] = [
    idict[i] for i in data['isbn']
]

nmat = data.as_matrix()
nmat = nmat.astype(int)
print(nmat.shape)

naive = np.zeros((len(rows),len(cols)))
for row in nmat:
    naive[row[0], row[1]] = row[2]

print(naive.T.shape)


12053 4959
(470749, 3)
(4959, 12053)

In [12]:
amean = np.mean(naive[naive!=0])
umean = sum(naive.T) / sum((naive!=0).T)
imean = sum(naive) / sum((naive!=0))

umean = np.where(np.isnan(umean), amean, umean)
imean = np.where(np.isnan(imean), amean, imean)

In [17]:
def rmse(ypred, ytrue):
    ypred = ypred[ytrue.nonzero()].flatten()
    ytrue = ytrue[ytrue.nonzero()].flatten()
    return np.sqrt(mean_squared_error(ypred, ytrue))

def mae(ypred, ytrue):
    ypred = ypred[ytrue.nonzero()].flatten()
    ytrue = ytrue[ytrue.nonzero()].flatten()
    return mean_absolute_error(ypred, ytrue)

In [18]:
def predict(user, item, mat, model, amean, umean, imean,  k=20):
    nzero = naive[user].nonzero()[0]
    if len(nzero) == 0:
        return amean
    diction = dict(model[item])
    elems = set(nzero).intersection(diction.keys())
    baseline = imean + umean[user] - amean

    new_dict = sorted({k: diction[k] for k in elems if k in diction}.items(),  key=lambda x: x[1], reverse = True)[:k]
    if(not len(new_dict)):
        return amean
    num=0
    denum =0 
    for i in new_dict:
        num+= (naive[user,i[0]]-baseline[i[0]])*i[1]
        denum+=i[1]
    prediction = (num/denum + baseline[item])
    
    if np.isnan(prediction):
        prediction = imean[item] + umean[user] - amean
    if prediction > 5:
        prediction = 5
    if prediction < 1:
        prediction = 1
    return prediction

In [34]:
def predict2(user, item, mat, item_similarity, amean, umean, imean,  k=20):
    nzero = mat[user].nonzero()[0]
    if len(nzero) == 0:
        return amean
    baseline = imean + umean[user] - amean
    choice = nzero[item_similarity[item, nzero].argsort()[::-1][1:k+1]]

    prediction = ((mat[user, choice] - baseline[choice]).dot(item_similarity[item, choice])/ sum(item_similarity[item, choice])) + baseline[item]
        
    if np.isnan(prediction):
        prediction = imean[item] + umean[user] - amean
    if prediction > 5:
        prediction = 5
    if prediction < 1:
        prediction = 1
    return prediction

Weighted Average Hybrid Model


In [38]:
def get_results1(X, rows, cols, folds, k, item_sim1, item_sim2, weights, amean, umean, imean):
    kf =  KFold(n_splits=folds, shuffle = True, random_state=42)
    count = 1
    rmse_list = []
    mae_list = []
    trmse_list = []
    tmae_list = []
    for train_index, test_index in kf.split(X):
        print("----------   Fold ", count, "---------------")
        train_data, test_data = X[train_index], X[test_index]
        
        full_mat = np.zeros((rows, cols))
        
        for row in train_data:
            full_mat[row[0], row[1]] = row[2]
        
        
        preds = []
        real = []
        
        for row in test_data:
            user_id, isbn, rating = row[0], row[1], row[2]
            pred1 = predict(user_id, isbn, full_mat, item_sim1, amean, umean, imean, k)
            pred2 = predict2(user_id, isbn, full_mat, item_sim2, amean, umean, imean,  k)
            preds.append(weights[0] * pred1 + weights[1] * pred2)
            real.append(rating)
            
        err1 = rmse(np.array(preds), np.array(real))
        err2 = mae(np.array(preds), np.array(real))
        rmse_list.append(err1)
        mae_list.append(err2)
        
       
        
        print('Test Errors')
        print('RMSE : %.4f' % err1)
        print('MAE : %.4f' % err2)
        count+=1
        
    
    print("-------------------------------------")
#     print("Training Avg Error:")
#     print("AVG RMSE :", str(np.mean(trmse_list)))
#     print("AVG MAE :", str(np.mean(tmae_list)))
    print()
    print("Testing Avg Error:")
    print("AVG RMSE :", str(np.mean(rmse_list)))
    print("AVG MAE :", str(np.mean(mae_list)))
    print(" ")
        
    return np.mean(mae_list), np.mean(rmse_list)

Tuning the model weights for lower errors


In [70]:
get_results1(nmat, len(rows), len(cols), 5, 15, lsh, item_model, [0.7, 0.3], amean, umean, imean)


----------   Fold  1 ---------------
/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: RuntimeWarning: invalid value encountered in double_scalars
Test Errors
RMSE : 0.8035
MAE : 0.5857
----------   Fold  2 ---------------
Test Errors
RMSE : 0.8032
MAE : 0.5839
----------   Fold  3 ---------------
Test Errors
RMSE : 0.7975
MAE : 0.5822
----------   Fold  4 ---------------
Test Errors
RMSE : 0.7963
MAE : 0.5817
----------   Fold  5 ---------------
Test Errors
RMSE : 0.7992
MAE : 0.5839
-------------------------------------

Testing Avg Error:
AVG RMSE : 0.799918947078
AVG MAE : 0.583491996023
 
Out[70]:
(0.58349199602324886, 0.79991894707783184)

In [63]:
get_results1(nmat, len(rows), len(cols), 5, 15, lsh, item_model, [0.8, 0.2], amean, umean, imean)


----------   Fold  1 ---------------
/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: RuntimeWarning: invalid value encountered in double_scalars
Test Errors
RMSE : 0.8098
MAE : 0.5874
----------   Fold  2 ---------------
Test Errors
RMSE : 0.8096
MAE : 0.5856
----------   Fold  3 ---------------
Test Errors
RMSE : 0.8036
MAE : 0.5836
----------   Fold  4 ---------------
Test Errors
RMSE : 0.8021
MAE : 0.5829
----------   Fold  5 ---------------
Test Errors
RMSE : 0.8053
MAE : 0.5856
-------------------------------------

Testing Avg Error:
AVG RMSE : 0.806073740938
AVG MAE : 0.585022709427
 
Out[63]:
(0.58502270942728418, 0.8060737409378268)

In [68]:
get_results1(nmat, len(rows), len(cols), 5, 15, lsh, item_model, [0.9, 0.1], amean, umean, imean)


----------   Fold  1 ---------------
/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: RuntimeWarning: invalid value encountered in double_scalars
Test Errors
RMSE : 0.8178
MAE : 0.5898
----------   Fold  2 ---------------
Test Errors
RMSE : 0.8178
MAE : 0.5880
----------   Fold  3 ---------------
Test Errors
RMSE : 0.8114
MAE : 0.5858
----------   Fold  4 ---------------
Test Errors
RMSE : 0.8096
MAE : 0.5848
----------   Fold  5 ---------------
Test Errors
RMSE : 0.8131
MAE : 0.5878
-------------------------------------

Testing Avg Error:
AVG RMSE : 0.81394307864
AVG MAE : 0.587237204506
 
Out[68]:
(0.58723720450551642, 0.81394307864027571)

In [69]:
get_results1(nmat, len(rows), len(cols), 5, 15, lsh, item_model, [0.6, 0.4], amean, umean, imean)


----------   Fold  1 ---------------
/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: RuntimeWarning: invalid value encountered in double_scalars
Test Errors
RMSE : 0.7990
MAE : 0.5848
----------   Fold  2 ---------------
Test Errors
RMSE : 0.7985
MAE : 0.5830
----------   Fold  3 ---------------
Test Errors
RMSE : 0.7931
MAE : 0.5814
----------   Fold  4 ---------------
Test Errors
RMSE : 0.7922
MAE : 0.5813
----------   Fold  5 ---------------
Test Errors
RMSE : 0.7948
MAE : 0.5830
-------------------------------------

Testing Avg Error:
AVG RMSE : 0.795518491884
AVG MAE : 0.582695248907
 
Out[69]:
(0.58269524890672009, 0.79551849188400536)

In [66]:
get_results1(nmat, len(rows), len(cols), 5, 15, lsh, item_model, [0.5, 0.5], amean, umean, imean)


----------   Fold  1 ---------------
/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: RuntimeWarning: invalid value encountered in double_scalars
Test Errors
RMSE : 0.7962
MAE : 0.5846
----------   Fold  2 ---------------
Test Errors
RMSE : 0.7956
MAE : 0.5828
----------   Fold  3 ---------------
Test Errors
RMSE : 0.7905
MAE : 0.5815
----------   Fold  4 ---------------
Test Errors
RMSE : 0.7899
MAE : 0.5816
----------   Fold  5 ---------------
Test Errors
RMSE : 0.7923
MAE : 0.5829
-------------------------------------

Testing Avg Error:
AVG RMSE : 0.792901584014
AVG MAE : 0.58266482577
 
Out[66]:
(0.58266482577013645, 0.79290158401378918)

Best K Selection for Hybrid Model with LSH weight 0.7 and Content Based Weight 0.3


In [72]:
each_sims = []
each_sims_rmse = []
for k in [5, 10, 15, 20, 25]:
    print("Nearest Neighbors: ",k)
    ans1, ans2  = get_results1(nmat, len(rows), len(cols), 5, k, lsh, item_model, [0.7, 0.3], amean, umean, imean)
    each_sims.append(ans1)
    each_sims_rmse.append(ans2)

print()
print("Best K Value for")
print()
print("Min MAE")
print(np.min(each_sims), np.argmin(each_sims))
print("Min RMSE")
print(np.min(each_sims_rmse), np.argmin(each_sims_rmse))
print()


Nearest Neighbors:  5
----------   Fold  1 ---------------
/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: RuntimeWarning: invalid value encountered in double_scalars
Test Errors
RMSE : 0.8125
MAE : 0.5897
----------   Fold  2 ---------------
Test Errors
RMSE : 0.8128
MAE : 0.5879
----------   Fold  3 ---------------
Test Errors
RMSE : 0.8062
MAE : 0.5857
----------   Fold  4 ---------------
Test Errors
RMSE : 0.8057
MAE : 0.5855
----------   Fold  5 ---------------
Test Errors
RMSE : 0.8085
MAE : 0.5878
-------------------------------------

Testing Avg Error:
AVG RMSE : 0.809146188876
AVG MAE : 0.587324777311
 
Nearest Neighbors:  10
----------   Fold  1 ---------------
Test Errors
RMSE : 0.8048
MAE : 0.5858
----------   Fold  2 ---------------
Test Errors
RMSE : 0.8045
MAE : 0.5840
----------   Fold  3 ---------------
Test Errors
RMSE : 0.7986
MAE : 0.5821
----------   Fold  4 ---------------
Test Errors
RMSE : 0.7973
MAE : 0.5817
----------   Fold  5 ---------------
Test Errors
RMSE : 0.8003
MAE : 0.5840
-------------------------------------

Testing Avg Error:
AVG RMSE : 0.801096639609
AVG MAE : 0.583508451591
 
Nearest Neighbors:  15
----------   Fold  1 ---------------
Test Errors
RMSE : 0.8035
MAE : 0.5857
----------   Fold  2 ---------------
Test Errors
RMSE : 0.8032
MAE : 0.5839
----------   Fold  3 ---------------
Test Errors
RMSE : 0.7975
MAE : 0.5822
----------   Fold  4 ---------------
Test Errors
RMSE : 0.7963
MAE : 0.5817
----------   Fold  5 ---------------
Test Errors
RMSE : 0.7992
MAE : 0.5839
-------------------------------------

Testing Avg Error:
AVG RMSE : 0.799918947078
AVG MAE : 0.583491996023
 
Nearest Neighbors:  20
----------   Fold  1 ---------------
Test Errors
RMSE : 0.8032
MAE : 0.5860
----------   Fold  2 ---------------
Test Errors
RMSE : 0.8030
MAE : 0.5843
----------   Fold  3 ---------------
Test Errors
RMSE : 0.7971
MAE : 0.5823
----------   Fold  4 ---------------
Test Errors
RMSE : 0.7963
MAE : 0.5823
----------   Fold  5 ---------------
Test Errors
RMSE : 0.7987
MAE : 0.5839
-------------------------------------

Testing Avg Error:
AVG RMSE : 0.799635852661
AVG MAE : 0.583750301908
 
Nearest Neighbors:  25
----------   Fold  1 ---------------
Test Errors
RMSE : 0.8032
MAE : 0.5863
----------   Fold  2 ---------------
Test Errors
RMSE : 0.8030
MAE : 0.5844
----------   Fold  3 ---------------
Test Errors
RMSE : 0.7971
MAE : 0.5825
----------   Fold  4 ---------------
Test Errors
RMSE : 0.7961
MAE : 0.5825
----------   Fold  5 ---------------
Test Errors
RMSE : 0.7986
MAE : 0.5841
-------------------------------------

Testing Avg Error:
AVG RMSE : 0.799602667732
AVG MAE : 0.583960033462
 

Best K Value for

Min MAE
0.583491996023 2
Min RMSE
0.799602667732 4


In [337]:
%matplotlib inline

In [348]:
results_df1 = pd.DataFrame({'Nearest Neighbors': [5, 10, 15, 20, 25], 'MAE': each_sims, 'RMSE': each_sims_rmse })
plot1 = results_df1.plot(x='Nearest Neighbors', y=['MAE', 'RMSE'], ylim=(0.5,0.85), title = 'Hybrid Model: Metrics over different K')
fig = plot1.get_figure()
fig.savefig('MetricsHybrid.png')


/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/plotting/_core.py:1714: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access
  series.name = label

In [46]:
flipped_dict = dict(zip(idict.values(), idict.keys()))

In [74]:
full_mat = np.zeros((len(rows),len(cols)))
for row in nmat:
    full_mat[row[0], row[1]] = row[2]

Coverage for Hybrid Model


In [47]:
def coverage(full_mat, user_id, item_sim1, item_sim2, weights, k, mlist, flipped_dict,  cov = False):
    
    n = full_mat.shape[1]
    nzero = full_mat[user_id].nonzero()[0]
    
    preds = {}
    for row in range(n):
        pred1 = predict(user_id, row, full_mat, item_sim1, amean, umean, imean, k)
        pred2 = predict2(user_id, row, full_mat, item_sim2, amean, umean, imean,  k)

        preds[row] = weights[0] * pred1 + weights[1] * pred2
        
    if not cov:
        print("Books Read -----")
        for i in nzero:
            print(flipped_dict[i])
            del preds[i]
    
    
    res = sorted(preds.items(), key=lambda x: x[1], reverse = True)
    
    ret_tup = []
    ans = [flipped_dict[i[0]] for i in res[:mlist[-1]]]
    for i in mlist:
        ret_tup.append(ans[:i])
    
    return ret_tup

In [75]:
cov1 = []
cov2 = []
cov3 = []
cov4 = []
cov5 = []
mlist = [5,10,15,20,25]
for i in range(len(rows)):
    if(i%100 == 0):
        print(i)
    ans = coverage(full_mat, i, lsh, item_model, [0.7, 0.3], 15, mlist, flipped_dict, True)
    cov1.extend(ans[0])
    cov2.extend(ans[1])
    cov3.extend(ans[2])
    cov4.extend(ans[3])
    cov5.extend(ans[4])


0
/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: RuntimeWarning: invalid value encountered in double_scalars
/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:17: RuntimeWarning: invalid value encountered in double_scalars
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000

In [143]:
print("Coverage with recommending 5 books", len(set(cov1))/4959 *100 ,"%")


Coverage with recommending 5 books 27.808025811655572 %

In [144]:
print("Coverage with recommending 10 books", len(set(cov2))/4959 *100 ,"%")


Coverage with recommending 10 books 38.93930227868522 %

In [145]:
print("Coverage with recommending 15 books", len(set(cov3))/4959 *100 ,"%")


Coverage with recommending 15 books 46.541641459971764 %

In [146]:
print("Coverage with recommending 20 books", len(set(cov4))/4959 *100 ,"%")


Coverage with recommending 20 books 52.63157894736842 %

In [147]:
print("Coverage with recommending 25 books", len(set(cov5))/4959 *100 ,"%")


Coverage with recommending 25 books 57.8544061302682 %

Get M Recommendation for a user using LSH Model


In [370]:
def getmrec(full_mat, user_id, item_similarity, k, m, flipped_dict,  cov = False):
    
    n = full_mat.shape[1]
    nzero = full_mat[user_id].nonzero()[0]
    
    preds = {}
    for row in range(n):
        preds[row] = predict(user_id, row, full_mat, item_similarity, amean, umean, imean, k)
    
    genre = []
    if not cov:
#         print("Books Read -----")
        for i in nzero:
#             print(flipped_dict[i])
            genre.extend(ansdict[i])
            del preds[i]
    
    newA = dict(sorted(preds.items(),key=operator.itemgetter(1), reverse = True)[:m])
    #res = sorted(preds.items(), key=lambda x: x[1], reverse = True)
    retgen = []
    for j in newA.keys():
        retgen.extend(ansdict[j])
        
    ans = [flipped_dict[i] for i in newA.keys()]
#     print("Books Read")
    df1 = pd.DataFrame.from_dict(dict(Counter(genre)),orient='index').sort_values(by=0,ascending = False)
#     print(df1)
#     print()
#     print("Book Recommended")
    df2 = pd.DataFrame.from_dict(dict(Counter(retgen)),orient='index').sort_values(by=0,ascending = False)
#     print(df2)
    
    df = pd.merge(left=df1, right=df2, left_index=True, right_index=True, how = 'outer').fillna(0)
    df.columns = ['Books Read', 'Books Recommended']
    df = df.sort_values(['Books Read','Books Recommended'], ascending = False)
    
#     print(df)
    
#     print(df1)

    return df , ans

In [371]:
df, ans = getmrec(full_mat, 130, lsh, 15, 10, flipped_dict,  cov = False)


/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:17: RuntimeWarning: invalid value encountered in double_scalars

In [375]:
df


Out[375]:
Books Read Books Recommended
Horror_Score 19.0 5.0
Biography_Score 13.0 2.0
Mystery_Score 6.0 7.0
Romance_Score 4.0 1.0
Fantasy_Score 2.0 1.0
Children_Score 2.0 0.0
History_Score 1.0 3.0
Autobiography_Score 1.0 0.0
Drama_Score 0.0 1.0

In [387]:
# print("\n\n==========[ RECOMMENDED BOOKS]==========\n")
# for book in ans:
#     print("  " + book)
# print("\n\n")

df2 = pd.DataFrame(ans,columns=['Recommended Books'])

df2


Out[387]:
Recommended Books
0 For Whom the Bell Tolls
1 Open: An Autobiography
2 Night
3 Deeply Odd (Odd Thomas, #6)
4 Unbroken
5 Eye of the Needle
6 Don Quixote
7 The Graveyard Book
8 To Kill a Mockingbird
9 The Lion, the Witch and the Wardrobe (Chronicl...

Getting the right data structure; Not Important


In [148]:
data = pd.read_pickle('../created_datasets/ibsn_features_new_batch.pickle')

In [ ]:


In [149]:
name = data.title

In [150]:
name = list(name)

In [151]:
for i in range(len(flipped_dict)):
    flipped_dict[i] = name[i]

In [158]:
feats.head()


Out[158]:
Unnamed: 0 Science_Score Satire_Score Drama_Score Action_Score Romance_Score Mystery_Score Horror_Score Travel_Score Children_Score Religion_Score History_Score Biography_Score Autobiography_Score Fantasy_Score isbn title
0 0 0.038207 0.068504 0.069262 0.034280 0.067933 0.080699 0.093292 0.048437 0.067634 0.058378 0.065526 0.045191 0.044895 0.071272 0002007770 Water for Elephants
1 1 0.039468 0.075064 0.061319 0.055943 0.092886 0.093401 0.090650 0.063545 0.055523 0.064056 0.094257 0.073566 0.068922 0.071120 0002051850 For Whom the Bell Tolls
2 2 0.037345 0.061224 0.052370 0.034974 0.064670 0.076909 0.076948 0.040161 0.049419 0.057505 0.076696 0.031504 0.030831 0.060471 0002247399 A Dance with Dragons (A Song of Ice and Fire, #5)
3 3 0.040277 0.059965 0.056401 0.040376 0.068644 0.090155 0.089819 0.046808 0.054902 0.057387 0.068051 0.053922 0.046720 0.067391 0006476155 Along Came a Spider (Alex Cross, #1)
4 4 0.024843 0.050273 0.067219 0.023418 0.088785 0.081145 0.076581 0.041120 0.087698 0.046154 0.067406 0.054567 0.061165 0.064440 0006514006 The Other Boleyn Girl

In [333]:
def get_top_two_indices(x):
    
#     l = list(x.values)
    
#     return l.index(max(l))

    return pd.Series(x.sort_values(ascending = False).head(2).index.values)

In [334]:
ansdict = feats.loc[:,'Science_Score':'Fantasy_Score'].apply(get_top_two_indices,axis=1).T.to_dict(orient='list')