In [218]:
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error

%matplotlib inline

Loading the Book Ratings Dataset


In [219]:
ratings = pd.read_csv('../raw-data/BX-Book-Ratings.csv', encoding='iso-8859-1', sep = ';')
ratings.columns = ['user_id', 'isbn', 'book_rating']

In [220]:
print(ratings.dtypes)
print()
print(ratings.head())
print()
print("Data Points :", ratings.shape[0])


user_id         int64
isbn           object
book_rating     int64
dtype: object

   user_id        isbn  book_rating
0   276725  034545104X            0
1   276726  0155061224            5
2   276727  0446520802            0
3   276729  052165615X            3
4   276729  0521795028            6

Data Points : 1149780

Loading the Books Dataset


In [221]:
books = pd.read_csv('../raw-data/BX-Books.csv', sep=';', encoding = 'iso-8859-1', dtype =str)
del books['Image-URL-L']
del books['Image-URL-M']
del books['Image-URL-S']
del books['Book-Author']
del books['Publisher']

Some books don't have a unique ISBN, so we create a 1:1 mapping between book title and ISBN.


In [222]:
print('Number of Books == Number of ISBN ? ', books["Book-Title"].nunique() == books["ISBN"].nunique())
book_dict = books[["Book-Title","ISBN"]].set_index("Book-Title").to_dict()["ISBN"]
books['new_isbn'] = books["Book-Title"].apply(lambda x: book_dict[x])
print('Number of Books == Number of ISBN ? ', books["Book-Title"].nunique() == books["new_isbn"].nunique())


Number of Books == Number of ISBN ?  False
Number of Books == Number of ISBN ?  True

In [223]:
books['isbn'] = books['new_isbn']

del books['ISBN']
del books['new_isbn']

In [224]:
books.shape


Out[224]:
(271379, 3)

Data Preparation / Cleaning

Removing ratings equal to zero, since the Book-Crossing dataset uses a 1-10 rating scale (0 marks an implicit interaction rather than a rating). An inner join with the books dataframe keeps only the books whose details exist.


In [225]:
newdf = ratings[ratings.book_rating>0]
joined = books.merge(newdf, on ='isbn')
print(joined.shape)


(293111, 5)

Sparsity, Number of Users and Items in the Book-Crossing Dataset


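For reference, the sparsity figures below use the standard definition: the percentage of empty cells in the user-item rating matrix,

$$ \text{sparsity} = 100 \left( 1 - \frac{N_{\text{ratings}}}{N_{\text{users}} \cdot N_{\text{items}}} \right) $$
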
In [226]:
rows = joined.user_id.unique()
cols = joined['Book-Title'].unique()
print(joined.user_id.nunique(), joined.isbn.nunique())


52097 129168

In [227]:
print("Sparsity :", 100 - (joined.shape[0]/(joined.user_id.nunique()* joined.isbn.nunique())))


Sparsity : 99.9999564423474

Loading the Amazon Dataset


In [228]:
data1 = pd.read_csv('../clean-data/ratings_Books.csv')

In [229]:
data1.columns = ['user_id', 'isbn', 'book_rating', 'timestamp']

Sparsity, Number of Users and Items in the Amazon Dataset


In [230]:
rows = data1.user_id.unique()
cols = data1.isbn.unique()
print(data1.user_id.nunique(), data1.isbn.nunique())
print("Sparsity :", 100 - (data1.shape[0]/(data1.user_id.nunique()* data1.isbn.nunique())))


8026324 2330066
Sparsity : 99.99999879652889

In [232]:
data1 = data1[['user_id', 'isbn', 'book_rating']]

In [233]:
data1.shape


Out[233]:
(22507154, 3)

In [234]:
data2 = joined[['user_id', 'isbn', 'book_rating']]

In [235]:
data2 = data2.copy()  # take a real copy of the slice to avoid SettingWithCopyWarning
# rescale Book-Crossing's 1-10 ratings to 0.5-5.0, aligning with Amazon's 1-5 scale
data2['book_rating'] = data2['book_rating'] / 2.0

In [236]:
data2.shape


Out[236]:
(293111, 3)

In [237]:
data2 = data2.drop_duplicates()

In [238]:
data2.shape


Out[238]:
(261676, 3)

Combining the Datasets


In [239]:
data3 = pd.concat((data1, data2))

In [240]:
data3.shape


Out[240]:
(22768830, 3)

Sampling a dense subset: items rated more than 50 times, then users with more than 49 ratings, then items rated more than 53 times.


In [335]:
temp = data3[data3['isbn'].isin(data3['isbn'].value_counts()[data3['isbn'].value_counts()>50].index)]
# print(len(temp.user_id.unique()))
# print(len(temp.isbn.unique()))
temp1 = temp[temp['user_id'].isin(temp['user_id'].value_counts()[temp['user_id'].value_counts()>49].index)]
# print(len(temp1.user_id.unique()))
# print(len(temp1.isbn.unique()))
temp2 = temp1[temp1['isbn'].isin(temp1['isbn'].value_counts()[temp1['isbn'].value_counts()>53].index)]
print(len(temp2.user_id.unique()))
print(len(temp2.isbn.unique()))


12053
4959

In [242]:
print(temp2.groupby(['user_id']).count()['book_rating'].mean())
print(temp2.groupby(['isbn']).count()['book_rating'].mean())


39.0565834232
94.9282113329

Baseline Naive Algorithm and Benchmarking Against Traditional Collaborative Filtering


In [243]:
data = temp2
rows = data.user_id.unique()
cols = data.isbn.unique()
print(data.user_id.nunique(), data.isbn.nunique())
data = data[['user_id', 'isbn', 'book_rating']]
data.to_csv('Combine.csv')


12053 4959

In [244]:
print("Sparsity :", 100 - (data.shape[0]/(len(cols)*len(rows)) * 100))


Sparsity : 99.21241009430905

In [245]:
idict  = dict(zip(cols, range(len(cols))))
udict = dict(zip(rows, range(len(rows))))

data.user_id = [
    udict[i] for i in data.user_id
]
data['isbn'] = [
    idict[i] for i in data['isbn']
]

nmat = data.values

In [246]:
nmat = nmat.astype(int)
nmat.shape


Out[246]:
(470749, 3)

Functions for Evaluation Metrics: MAE and RMSE


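For reference, with predicted ratings $\hat{y}_i$ and true ratings $y_i$ over the $n$ held-out points with nonzero true ratings (standard definitions, matching the helpers below):

$$ \mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(\hat{y}_i - y_i\right)^2}, \qquad \mathrm{MAE} = \frac{1}{n}\sum_{i=1}^{n}\left|\hat{y}_i - y_i\right| $$
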
In [247]:
def rmse(ypred, ytrue):
    ypred = ypred[ytrue.nonzero()].flatten()
    ytrue = ytrue[ytrue.nonzero()].flatten()
    return np.sqrt(mean_squared_error(ypred, ytrue))

def mae(ypred, ytrue):
    ypred = ypred[ytrue.nonzero()].flatten()
    ytrue = ytrue[ytrue.nonzero()].flatten()
    return mean_absolute_error(ypred, ytrue)

-------------- Naive Baseline ---------------

Our naive baseline predicts, for any user i and item j, the average rating over the entire training set (amean).


In [317]:
def predict_naive(user, item):
    return amean1

In [318]:
x1, x2 = train_test_split(nmat, test_size = 0.2, random_state =42)
naive = np.zeros((len(rows),len(cols)))
for row in x1:
    naive[row[0], row[1]] = row[2]

predictions = []
targets = []

amean1 = np.mean(naive[naive!=0])
umean1 = sum(naive.T) / sum((naive!=0).T)
imean1 = sum(naive) / sum((naive!=0))

umean1 = np.where(np.isnan(umean1), amean1, umean1)
imean1 = np.where(np.isnan(imean1), amean1, imean1)


print('Naive---')
for row in x2:
    user, item, actual = row[0], row[1], row[2]
    predictions.append(predict_naive(user, item))
    targets.append(actual)

print('rmse %.4f' % rmse(np.array(predictions), np.array(targets)))
print('mae %.4f' % mae(np.array(predictions), np.array(targets)))
print()


/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:10: RuntimeWarning: invalid value encountered in true_divide
Naive---
rmse 0.9444
mae 0.7630

Following are the functions to calculate the pairwise similarity between two items: Cosine, Adjusted Cosine, Euclidean, and Pearson Correlation.

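For reference, writing $U_{ab}$ for the set of users who rated both items $a$ and $b$, the implementations below compute (standard definitions):

$$ \text{cos}(a,b) = \frac{\sum_{u \in U_{ab}} r_{ua}\, r_{ub}}{\sqrt{\sum_{u \in U_{ab}} r_{ua}^2}\, \sqrt{\sum_{u \in U_{ab}} r_{ub}^2}} $$

Adjusted cosine replaces $r_{ua}$ and $r_{ub}$ with $r_{ua} - \bar{r}_u$ and $r_{ub} - \bar{r}_u$ (subtracting each user's mean rating); Pearson correlation subtracts the item means $\bar{r}_a$ and $\bar{r}_b$ instead; the Euclidean variant maps the distance $d$ over the co-rated ratings to a similarity via $1/(1+d)$. (The Pearson and adjusted-cosine matrices are later rescaled from $[-1,1]$ to $[0,1]$.)
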

In [250]:
def cos(mat, a, b):
    if a == b:
        return 1
    aval = mat.T[a].nonzero()
    bval = mat.T[b].nonzero()
    corated = np.intersect1d(aval, bval)
    if len(corated) == 0:
        return 0
    avec = np.take(mat.T[a], corated)
    bvec = np.take(mat.T[b], corated)
    val = 1 - cosine(avec, bvec)
    if np.isnan(val):
        return 0
    return val

In [251]:
def adjcos(mat, a, b, umean):
    if a == b:
        return 1
    aval = mat.T[a].nonzero()
    bval = mat.T[b].nonzero()
    corated = np.intersect1d(aval, bval)
    if len(corated) == 0:
        return 0
    avec = np.take(mat.T[a], corated)
    bvec = np.take(mat.T[b], corated)
    avec1 = avec - umean[corated]
    bvec1 = bvec - umean[corated]
    val = 1 - cosine(avec1, bvec1)
    if np.isnan(val):
        return 0
    return val

In [252]:
def pr(mat, a, b, imean):
    if a == b:
        return 1
    aval = mat.T[a].nonzero()
    bval = mat.T[b].nonzero()
    corated = np.intersect1d(aval, bval)
    if len(corated) < 2:
        return 0
    avec = np.take(mat.T[a], corated)
    bvec = np.take(mat.T[b], corated)
    avec1 = avec - imean[a]
    bvec1 = bvec - imean[b]
    val = 1 - cosine(avec1, bvec1)
    if np.isnan(val):
        return 0
    return val

In [253]:
def euc(mat, a, b):
    if a == b:
        return 1
    aval = mat.T[a].nonzero()
    bval = mat.T[b].nonzero()
    corated = np.intersect1d(aval, bval)
    if len(corated) == 0:
        return 0
    avec = np.take(mat.T[a], corated)
    bvec = np.take(mat.T[b], corated)
    # Euclidean distance over the co-rated ratings (not the raw indices a, b),
    # mapped to a (0, 1] similarity
    dist = np.sqrt(np.sum((avec - bvec)**2))
    val = 1/(1+dist)
    if np.isnan(val):
        return 0
    return val

The function itemsimilar returns a matrix of pairwise similarities between all items, based on the option provided. It also returns amean (global mean rating), umean (average rating of each user), and imean (average rating of each item).


In [254]:
def itemsimilar(mat, option):
    amean = np.mean(mat[mat!=0])
    umean = sum(mat.T) / sum((mat!=0).T)
    imean = sum(mat) / sum((mat!=0))
    
    umean = np.where(np.isnan(umean), amean, umean)
    imean = np.where(np.isnan(imean), amean, imean)
    
    n = mat.shape[1]
    sim_mat = np.zeros((n, n))
    
    if option == 'pr':
        #print("PR")
        for i in range(n):
            for j in range(n):
                sim_mat[i][j] = pr(mat, i, j, imean)
        sim_mat = (sim_mat + 1)/2
    elif option == 'cos':
        #print("COS")
        print(n)
        for i in range(n):
            if(i%100 == 0):
                print(i)
            for j in range(n):
                sim_mat[i][j] = cos(mat, i, j)
    elif option == 'adjcos':
        #print("ADJCOS")
        for i in range(n):
            for j in range(n):
                sim_mat[i][j] = adjcos(mat, i, j, umean)
        sim_mat = (sim_mat + 1)/2
    elif option == 'euc':
        #print("EUCLIDEAN")
        for i in range(n):
            for j in range(n):
                sim_mat[i][j] = euc(mat, i, j)
    else:
        #print("Hello")
        sim_mat = cosine_similarity(mat.T)
    
    return sim_mat, amean, umean, imean

Benchmark Traditional Item-Item Collaborative Filtering


In [37]:
import time 
start = time.time()
naive = np.zeros((len(rows),len(cols)))
for row in x1:
    naive[row[0], row[1]] = row[2]
items, amean, umean, imean = itemsimilar(naive,'cos')
end = time.time()
print(end-start)


/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:3: RuntimeWarning: invalid value encountered in true_divide
  app.launch_new_instance()
4959
0
100
200
...
4800
4900
31105.438440084457

In [255]:
print(end - start)


31105.438440084457

In [256]:
items.shape


Out[256]:
(4959, 4959)

The predict function estimates the rating user i would give item j.

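In formula form (as implemented below): with global mean $\mu$, user mean $\bar{r}_u$, item mean $\bar{r}_j$, baseline $b_{uj} = \bar{r}_j + \bar{r}_u - \mu$, and $N_k(u;i)$ the $k$ neighbours of item $i$ selected from the items user $u$ has rated (the code drops the top-ranked neighbour, which is item $i$ itself whenever $u$ has rated it),

$$ \hat{r}_{ui} = b_{ui} + \frac{\sum_{j \in N_k(u;i)} s_{ij}\,(r_{uj} - b_{uj})}{\sum_{j \in N_k(u;i)} s_{ij}} $$

with the result clipped to the $[1, 5]$ rating range.
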

In [327]:
def predict(user, item, mat, item_similarity, amean, umean, imean,  k=20):
    nzero = mat[user].nonzero()[0]
    if len(nzero) == 0:
        return amean
    baseline = imean + umean[user] - amean
    choice = nzero[item_similarity[item, nzero].argsort()[::-1][1:k+1]]

    prediction = ((mat[user, choice] - baseline[choice]).dot(item_similarity[item, choice])/ sum(item_similarity[item, choice])) + baseline[item]
        
    if np.isnan(prediction):
        prediction = imean[item] + umean[user] - amean
    if prediction > 5:
        prediction = 5
    if prediction < 1:
        prediction = 1
    return prediction

In [328]:
predict(0,1, naive, items, amean, umean, imean,5)


Out[328]:
4.9711482118438033

In [329]:
def get_results1(X, rows, cols, folds, k, item_similarity, amean, umean, imean):
    kf =  KFold(n_splits=folds, shuffle = True, random_state=95)
    count = 1
    rmse_list = []
    mae_list = []
    trmse_list = []
    tmae_list = []
    for train_index, test_index in kf.split(X):
        print("----------   Fold ", count, "---------------")
        train_data, test_data = X[train_index], X[test_index]
        
        full_mat = np.zeros((rows, cols))
        
        for row in train_data:
            full_mat[row[0], row[1]] = row[2]
        
        
        preds = []
        real = []
        
       
        for row in train_data:
            user_id, isbn, rating = row[0], row[1], row[2]
            preds.append(predict(user_id, isbn, full_mat, item_similarity, amean, umean, imean, k))
            real.append(rating)
        
        
        err1 = rmse(np.array(preds), np.array(real))
        err2 = mae(np.array(preds), np.array(real))
        trmse_list.append(err1)
        tmae_list.append(err2)
        
        print('Train Errors')
        print('RMSE : %.4f' % err1)
        print('MAE : %.4f' % err2)
        
        preds = []
        real = []
        
        for row in test_data:
            user_id, isbn, rating = row[0], row[1], row[2]
            preds.append(predict(user_id, isbn, full_mat, item_similarity, amean, umean, imean, k))
            real.append(rating)
            
        err1 = rmse(np.array(preds), np.array(real))
        err2 = mae(np.array(preds), np.array(real))
        rmse_list.append(err1)
        mae_list.append(err2)
        
       
        
        print('Test Errors')
        print('RMSE : %.4f' % err1)
        print('MAE : %.4f' % err2)
        count+=1
    
    print("-------------------------------------")
    print("Training Avg Error:")
    print("AVG RMSE :", str(np.mean(trmse_list)))
    print("AVG MAE :", str(np.mean(tmae_list)))
    print()
    print("Testing Avg Error:")
    print("AVG RMSE :", str(np.mean(rmse_list)))
    print("AVG MAE :", str(np.mean(mae_list)))
    print(" ")
        
    return np.mean(mae_list), np.mean(rmse_list)

Time to test the recommendations


In [344]:
s = time.time()
get_results1(nmat, len(rows), len(cols), 5 ,20,items, amean,umean, imean)
e=time.time()


----------   Fold  1 ---------------
/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: RuntimeWarning: invalid value encountered in double_scalars
Train Errors
RMSE : 0.7258
MAE : 0.5300
Test Errors
RMSE : 0.7604
MAE : 0.5548
----------   Fold  2 ---------------
Train Errors
RMSE : 0.7250
MAE : 0.5295
Test Errors
RMSE : 0.7633
MAE : 0.5572
----------   Fold  3 ---------------
Train Errors
RMSE : 0.7265
MAE : 0.5305
Test Errors
RMSE : 0.7574
MAE : 0.5531
----------   Fold  4 ---------------
Train Errors
RMSE : 0.7257
MAE : 0.5299
Test Errors
RMSE : 0.7600
MAE : 0.5550
----------   Fold  5 ---------------
Train Errors
RMSE : 0.7255
MAE : 0.5301
Test Errors
RMSE : 0.7600
MAE : 0.5544
-------------------------------------
Training Avg Error:
AVG RMSE : 0.725690168911
AVG MAE : 0.530009929035

Testing Avg Error:
AVG RMSE : 0.760232634108
AVG MAE : 0.554906333271
 

In [352]:
print("Time to test the recommendation over 5 fold cross validation of the data", (e-s)/5, "seconds")


Time to test the recommendation over 5 fold cross validation of the data 41.3511257648468 seconds

get_results1 is our cross-validation setup; varying its parameters lets us tune the hyperparameter k (number of nearest neighbours).

Grid search for the best k for item-item CF, using the similarity matrix computed above.


In [331]:
each_sims = []
each_sims_rmse = []
for k in [5, 10, 15, 20, 25]:
    print("Nearest Neighbors: ",k)
    ans1, ans2  = get_results1(nmat, len(rows), len(cols), 5 ,k,items, amean,umean, imean)
    each_sims.append(ans1)
    each_sims_rmse.append(ans2)

print()
print("Best K Value for")
print()
print("Min MAE")
print(np.min(each_sims), np.argmin(each_sims))
print("Min RMSE")
print(np.min(each_sims_rmse), np.argmin(each_sims_rmse))
print()


Nearest Neighbors:  5
----------   Fold  1 ---------------
/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: RuntimeWarning: invalid value encountered in double_scalars
Train Errors
RMSE : 0.7393
MAE : 0.5276
Test Errors
RMSE : 0.8063
MAE : 0.5796
----------   Fold  2 ---------------
Train Errors
RMSE : 0.7383
MAE : 0.5271
Test Errors
RMSE : 0.8082
MAE : 0.5811
----------   Fold  3 ---------------
Train Errors
RMSE : 0.7405
MAE : 0.5284
Test Errors
RMSE : 0.8010
MAE : 0.5764
----------   Fold  4 ---------------
Train Errors
RMSE : 0.7396
MAE : 0.5281
Test Errors
RMSE : 0.8039
MAE : 0.5771
----------   Fold  5 ---------------
Train Errors
RMSE : 0.7384
MAE : 0.5277
Test Errors
RMSE : 0.8046
MAE : 0.5774
-------------------------------------
Training Avg Error:
AVG RMSE : 0.739217688741
AVG MAE : 0.527781424304

Testing Avg Error:
AVG RMSE : 0.804776845346
AVG MAE : 0.578315689842
 
Nearest Neighbors:  10
----------   Fold  1 ---------------
Train Errors
RMSE : 0.7194
MAE : 0.5209
Test Errors
RMSE : 0.7681
MAE : 0.5572
----------   Fold  2 ---------------
Train Errors
RMSE : 0.7185
MAE : 0.5202
Test Errors
RMSE : 0.7699
MAE : 0.5585
----------   Fold  3 ---------------
Train Errors
RMSE : 0.7212
MAE : 0.5221
Test Errors
RMSE : 0.7636
MAE : 0.5546
----------   Fold  4 ---------------
Train Errors
RMSE : 0.7198
MAE : 0.5214
Test Errors
RMSE : 0.7660
MAE : 0.5556
----------   Fold  5 ---------------
Train Errors
RMSE : 0.7187
MAE : 0.5209
Test Errors
RMSE : 0.7678
MAE : 0.5563
-------------------------------------
Training Avg Error:
AVG RMSE : 0.719514366051
AVG MAE : 0.521103404645

Testing Avg Error:
AVG RMSE : 0.767078566955
AVG MAE : 0.556444604709
 
Nearest Neighbors:  15
----------   Fold  1 ---------------
Train Errors
RMSE : 0.7203
MAE : 0.5243
Test Errors
RMSE : 0.7599
MAE : 0.5532
----------   Fold  2 ---------------
Train Errors
RMSE : 0.7193
MAE : 0.5237
Test Errors
RMSE : 0.7623
MAE : 0.5549
----------   Fold  3 ---------------
Train Errors
RMSE : 0.7215
MAE : 0.5251
Test Errors
RMSE : 0.7562
MAE : 0.5509
----------   Fold  4 ---------------
Train Errors
RMSE : 0.7202
MAE : 0.5242
Test Errors
RMSE : 0.7590
MAE : 0.5529
----------   Fold  5 ---------------
Train Errors
RMSE : 0.7199
MAE : 0.5245
Test Errors
RMSE : 0.7599
MAE : 0.5528
-------------------------------------
Training Avg Error:
AVG RMSE : 0.720240438059
AVG MAE : 0.524345970407

Testing Avg Error:
AVG RMSE : 0.759456803989
AVG MAE : 0.552945475038
 
Nearest Neighbors:  20
----------   Fold  1 ---------------
Train Errors
RMSE : 0.7258
MAE : 0.5300
Test Errors
RMSE : 0.7604
MAE : 0.5548
----------   Fold  2 ---------------
Train Errors
RMSE : 0.7250
MAE : 0.5295
Test Errors
RMSE : 0.7633
MAE : 0.5572
----------   Fold  3 ---------------
Train Errors
RMSE : 0.7265
MAE : 0.5305
Test Errors
RMSE : 0.7574
MAE : 0.5531
----------   Fold  4 ---------------
Train Errors
RMSE : 0.7257
MAE : 0.5299
Test Errors
RMSE : 0.7600
MAE : 0.5550
----------   Fold  5 ---------------
Train Errors
RMSE : 0.7255
MAE : 0.5301
Test Errors
RMSE : 0.7600
MAE : 0.5544
-------------------------------------
Training Avg Error:
AVG RMSE : 0.725690168911
AVG MAE : 0.530009929035

Testing Avg Error:
AVG RMSE : 0.760232634108
AVG MAE : 0.554906333271
 
Nearest Neighbors:  25
----------   Fold  1 ---------------
Train Errors
RMSE : 0.7323
MAE : 0.5359
Test Errors
RMSE : 0.7635
MAE : 0.5583
----------   Fold  2 ---------------
Train Errors
RMSE : 0.7312
MAE : 0.5352
Test Errors
RMSE : 0.7669
MAE : 0.5608
----------   Fold  3 ---------------
Train Errors
RMSE : 0.7329
MAE : 0.5364
Test Errors
RMSE : 0.7610
MAE : 0.5569
----------   Fold  4 ---------------
Train Errors
RMSE : 0.7322
MAE : 0.5359
Test Errors
RMSE : 0.7635
MAE : 0.5584
----------   Fold  5 ---------------
Train Errors
RMSE : 0.7317
MAE : 0.5358
Test Errors
RMSE : 0.7635
MAE : 0.5580
-------------------------------------
Training Avg Error:
AVG RMSE : 0.732057394852
AVG MAE : 0.535842702787

Testing Avg Error:
AVG RMSE : 0.76369366713
AVG MAE : 0.558468162082
 

Best K Value for

Min MAE
0.552945475038 2
Min RMSE
0.759456803989 2


In [332]:
print(each_sims[2], each_sims_rmse[2])


0.552945475038 0.759456803989

In [333]:
results_df1 = pd.DataFrame({'Nearest Neighbors': [5, 10, 15, 20, 25], 'MAE': each_sims, 'RMSE': each_sims_rmse })
plot1 = results_df1.plot(x='Nearest Neighbors', y=['MAE', 'RMSE'], ylim=(0.5,0.85), title = 'Item-Item CF: Metrics over different K')
fig = plot1.get_figure()
fig.savefig('MetricsCFK.png')


/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/plotting/_core.py:1714: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access
  series.name = label

The getmrec function returns the top m recommendations for a user_id, based on the item similarity matrix and k neighbours.


In [264]:
full_mat = np.zeros((len(rows),len(cols)))
for row in nmat:
    full_mat[row[0], row[1]] = row[2]
#item_similarity, amean, umean, imean = itemsimilar(full_mat, 'euc')

In [265]:
def getmrec(full_mat, user_id, item_similarity, k, m, idict,  cov = False):
    
    n = item_similarity.shape[0]
    nzero = full_mat[user_id].nonzero()[0]
    
    preds = {}
    for row in range(n):
        preds[row] = predict(user_id, row, full_mat, item_similarity, amean, umean, imean, k)
    
    flipped_dict = dict(zip(idict.values(), idict.keys()))
    
    if not cov:
        print("Books Read -----")
        for i in nzero:
            print(flipped_dict[i])
            del preds[i]
    
    
    res = sorted(preds.items(), key=lambda x: x[1], reverse = True)
    
    ans = [flipped_dict[i[0]] for i in res[:m]]
    return ans

In [266]:
flipped_dict = dict(zip(idict.values(), idict.keys()))

Coverage for Item-Item CF


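Coverage here means catalog coverage: the percentage of distinct items that appear in at least one user's top-m list. A minimal sketch of the final metric computed from the lists collected below (hypothetical helper, not part of the original notebook):

In [ ]:
def catalog_coverage(all_recommendations, n_items):
    # percentage of the catalog appearing in at least one top-m list
    distinct = set()
    for recs in all_recommendations:
        distinct.update(recs)
    return 100.0 * len(distinct) / n_items
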
In [267]:
def coverage(full_mat, user_id, item_similarity, k, mlist, flipped_dict,  cov = False):
    
    n = item_similarity.shape[0]
    nzero = full_mat[user_id].nonzero()[0]
    
    preds = {}
    for row in range(n):
        preds[row] = predict(user_id, row, full_mat, item_similarity, amean, umean, imean, k)
        
    if not cov:
        print("Books Read -----")
        for i in nzero:
            print(flipped_dict[i])
            del preds[i]
    
    
    res = sorted(preds.items(), key=lambda x: x[1], reverse = True)
    
    ret_tup = []
    ans = [flipped_dict[i[0]] for i in res[:mlist[-1]]]
    for i in mlist:
        ret_tup.append(ans[:i])
    
    return ret_tup

In [268]:
cov1 = []
cov2 = []
cov3 = []
cov4 = []
cov5 = []
mlist = [5,10,15,20,25]
for i in range(len(rows)):
    if(i%100 == 0):
        print(i)
    ans = coverage(full_mat, i, items, 10, mlist, flipped_dict, True)
    cov1.extend(ans[0])
    cov2.extend(ans[1])
    cov3.extend(ans[2])
    cov4.extend(ans[3])
    cov5.extend(ans[4])


0
/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: RuntimeWarning: invalid value encountered in double_scalars
100
200
...
11900
12000

Coverage Results


In [269]:
print("Coverage with recommending 5 books", len(set(cov1))/4959 *100 ,"%")


Coverage with recommending 5 books 47.48941318814277 %

In [270]:
print("Coverage with recommending 10 books", len(set(cov2))/4959 *100 ,"%")


Coverage with recommending 10 books 66.76749344625932 %

In [271]:
print("Coverage with recommending 15 books", len(set(cov3))/4959 *100 ,"%")


Coverage with recommending 15 books 76.40653357531761 %

In [272]:
print("Coverage with recommending 20 books", len(set(cov4))/4959 *100 ,"%")


Coverage with recommending 20 books 82.03266787658802 %

In [273]:
print("Coverage with recommending 25 books", len(set(cov5))/4959 *100 ,"%")


Coverage with recommending 25 books 85.88425085702762 %

Content-Based CF Using Book Features

Loading Book Features


In [274]:
feats = pd.read_csv('../book_features.csv')

In [275]:
feats.shape


Out[275]:
(4959, 17)

In [276]:
feats.head()


Out[276]:
Unnamed: 0 Science_Score Satire_Score Drama_Score Action_Score Romance_Score Mystery_Score Horror_Score Travel_Score Children_Score Religion_Score History_Score Biography_Score Autobiography_Score Fantasy_Score isbn title
0 0 0.038207 0.068504 0.069262 0.034280 0.067933 0.080699 0.093292 0.048437 0.067634 0.058378 0.065526 0.045191 0.044895 0.071272 0002007770 Water for Elephants
1 1 0.039468 0.075064 0.061319 0.055943 0.092886 0.093401 0.090650 0.063545 0.055523 0.064056 0.094257 0.073566 0.068922 0.071120 0002051850 For Whom the Bell Tolls
2 2 0.037345 0.061224 0.052370 0.034974 0.064670 0.076909 0.076948 0.040161 0.049419 0.057505 0.076696 0.031504 0.030831 0.060471 0002247399 A Dance with Dragons (A Song of Ice and Fire, #5)
3 3 0.040277 0.059965 0.056401 0.040376 0.068644 0.090155 0.089819 0.046808 0.054902 0.057387 0.068051 0.053922 0.046720 0.067391 0006476155 Along Came a Spider (Alex Cross, #1)
4 4 0.024843 0.050273 0.067219 0.023418 0.088785 0.081145 0.076581 0.041120 0.087698 0.046154 0.067406 0.054567 0.061165 0.064440 0006514006 The Other Boleyn Girl

In [277]:
scores = feats.iloc[:,1:15]

In [278]:
scores1 = scores.values

In [279]:
scores1.shape


Out[279]:
(4959, 14)

In [280]:
inputscores = scores1.T

Similarity Matrix Using Only Book Features


In [284]:
naive = np.zeros((len(rows),len(cols)))
for row in x1:
    naive[row[0], row[1]] = row[2]
items_features, temple1, temple2, temple3 = itemsimilar(inputscores,'')


/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:4: RuntimeWarning: invalid value encountered in true_divide

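Since the empty option string falls through to the final else branch of itemsimilar, the feature-based similarity is simply cosine similarity over each book's 14 genre scores. A minimal equivalent sketch:

In [ ]:
from sklearn.metrics.pairwise import cosine_similarity

# equivalent to itemsimilar(inputscores, ''): cosine similarity between
# the 14-dimensional genre-score vectors of the 4959 books
items_features_direct = cosine_similarity(scores1)  # shape (4959, 4959)
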
In [349]:
s1 = time.time()
get_results1(nmat, len(rows), len(cols), 5 ,20,items_features, amean,umean, imean)
e1 = time.time()


----------   Fold  1 ---------------
/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: RuntimeWarning: invalid value encountered in double_scalars
Train Errors
RMSE : 0.7901
MAE : 0.5836
Test Errors
RMSE : 0.8037
MAE : 0.5931
----------   Fold  2 ---------------
Train Errors
RMSE : 0.7892
MAE : 0.5832
Test Errors
RMSE : 0.8072
MAE : 0.5960
----------   Fold  3 ---------------
Train Errors
RMSE : 0.7908
MAE : 0.5845
Test Errors
RMSE : 0.8016
MAE : 0.5918
----------   Fold  4 ---------------
Train Errors
RMSE : 0.7903
MAE : 0.5840
Test Errors
RMSE : 0.8036
MAE : 0.5939
----------   Fold  5 ---------------
Train Errors
RMSE : 0.7902
MAE : 0.5841
Test Errors
RMSE : 0.8037
MAE : 0.5926
-------------------------------------
Training Avg Error:
AVG RMSE : 0.790119701713
AVG MAE : 0.583863348997

Testing Avg Error:
AVG RMSE : 0.803975533939
AVG MAE : 0.593486413251
 

In [351]:
print("Time to test the recommendation over 5 folds cross validation of the data", (e1-s1)/5, "seconds")


Time to test the recommendation over 5 folds cross validation of the data 45.11421179771423 seconds

In [287]:
each_sims_con = []
each_sims_rmse_con = []
for k in [5, 10, 15, 20, 25]:
    print("Nearest Neighbors: ",k)
    ans1, ans2  = get_results1(nmat, len(rows), len(cols), 5 ,k,items_features, amean,umean, imean)
    each_sims_con.append(ans1)
    each_sims_rmse_con.append(ans2)

print()
print("Best K Value for")
print()
print("Min MAE")
print(np.min(each_sims_con), np.argmin(each_sims_con))
print("Min RMSE")
print(np.min(each_sims_rmse_con), np.argmin(each_sims_rmse_con))
print()


Nearest Neighbors:  5
----------   Fold  1 ---------------
/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: RuntimeWarning: invalid value encountered in double_scalars
Train Errors
RMSE : 0.8301
MAE : 0.6046
Test Errors
RMSE : 0.8606
MAE : 0.6289
----------   Fold  2 ---------------
Train Errors
RMSE : 0.8317
MAE : 0.6063
Test Errors
RMSE : 0.8532
MAE : 0.6202
----------   Fold  3 ---------------
Train Errors
RMSE : 0.8336
MAE : 0.6076
Test Errors
RMSE : 0.8470
MAE : 0.6182
----------   Fold  4 ---------------
Train Errors
RMSE : 0.8332
MAE : 0.6069
Test Errors
RMSE : 0.8467
MAE : 0.6187
----------   Fold  5 ---------------
Train Errors
RMSE : 0.8351
MAE : 0.6085
Test Errors
RMSE : 0.8495
MAE : 0.6204
-------------------------------------
Training Avg Error:
AVG RMSE : 0.832729392494
AVG MAE : 0.606799104198

Testing Avg Error:
AVG RMSE : 0.851395324026
AVG MAE : 0.621298062042
 
Nearest Neighbors:  10
----------   Fold  1 ---------------
Train Errors
RMSE : 0.8011
MAE : 0.5893
Test Errors
RMSE : 0.8303
MAE : 0.6108
----------   Fold  2 ---------------
Train Errors
RMSE : 0.8037
MAE : 0.5912
Test Errors
RMSE : 0.8198
MAE : 0.6007
----------   Fold  3 ---------------
Train Errors
RMSE : 0.8048
MAE : 0.5920
Test Errors
RMSE : 0.8162
MAE : 0.6005
----------   Fold  4 ---------------
Train Errors
RMSE : 0.8046
MAE : 0.5914
Test Errors
RMSE : 0.8159
MAE : 0.6014
----------   Fold  5 ---------------
Train Errors
RMSE : 0.8043
MAE : 0.5917
Test Errors
RMSE : 0.8174
MAE : 0.6012
-------------------------------------
Training Avg Error:
AVG RMSE : 0.803683231936
AVG MAE : 0.591136356278

Testing Avg Error:
AVG RMSE : 0.819910470474
AVG MAE : 0.602929407931
 
Nearest Neighbors:  15
----------   Fold  1 ---------------
Train Errors
RMSE : 0.7928
MAE : 0.5851
Test Errors
RMSE : 0.8205
MAE : 0.6052
----------   Fold  2 ---------------
Train Errors
RMSE : 0.7954
MAE : 0.5871
Test Errors
RMSE : 0.8094
MAE : 0.5950
----------   Fold  3 ---------------
Train Errors
RMSE : 0.7964
MAE : 0.5878
Test Errors
RMSE : 0.8060
MAE : 0.5951
----------   Fold  4 ---------------
Train Errors
RMSE : 0.7965
MAE : 0.5875
Test Errors
RMSE : 0.8065
MAE : 0.5963
----------   Fold  5 ---------------
Train Errors
RMSE : 0.7958
MAE : 0.5874
Test Errors
RMSE : 0.8077
MAE : 0.5959
-------------------------------------
Training Avg Error:
AVG RMSE : 0.795360580389
AVG MAE : 0.586974450166

Testing Avg Error:
AVG RMSE : 0.810040310956
AVG MAE : 0.597481696425
 
Nearest Neighbors:  20
----------   Fold  1 ---------------
Train Errors
RMSE : 0.7892
MAE : 0.5834
Test Errors
RMSE : 0.8164
MAE : 0.6030
----------   Fold  2 ---------------
Train Errors
RMSE : 0.7919
MAE : 0.5855
Test Errors
RMSE : 0.8048
MAE : 0.5923
----------   Fold  3 ---------------
Train Errors
RMSE : 0.7929
MAE : 0.5861
Test Errors
RMSE : 0.8011
MAE : 0.5921
----------   Fold  4 ---------------
Train Errors
RMSE : 0.7929
MAE : 0.5859
Test Errors
RMSE : 0.8025
MAE : 0.5946
----------   Fold  5 ---------------
Train Errors
RMSE : 0.7926
MAE : 0.5860
Test Errors
RMSE : 0.8030
MAE : 0.5932
-------------------------------------
Training Avg Error:
AVG RMSE : 0.791927450548
AVG MAE : 0.585381915522

Testing Avg Error:
AVG RMSE : 0.805574805134
AVG MAE : 0.595045180631
 
Nearest Neighbors:  25
----------   Fold  1 ---------------
Train Errors
RMSE : 0.7876
MAE : 0.5826
Test Errors
RMSE : 0.8143
MAE : 0.6020
----------   Fold  2 ---------------
Train Errors
RMSE : 0.7903
MAE : 0.5847
Test Errors
RMSE : 0.8025
MAE : 0.5911
----------   Fold  3 ---------------
Train Errors
RMSE : 0.7913
MAE : 0.5855
Test Errors
RMSE : 0.7987
MAE : 0.5908
----------   Fold  4 ---------------
Train Errors
RMSE : 0.7911
MAE : 0.5850
Test Errors
RMSE : 0.7995
MAE : 0.5930
----------   Fold  5 ---------------
Train Errors
RMSE : 0.7909
MAE : 0.5853
Test Errors
RMSE : 0.8008
MAE : 0.5919
-------------------------------------
Training Avg Error:
AVG RMSE : 0.790240307876
AVG MAE : 0.584623208397

Testing Avg Error:
AVG RMSE : 0.803158875282
AVG MAE : 0.593777558237
 

Best K Value for

Min MAE
0.593777558237 4
Min RMSE
0.803158875282 4


In [308]:
results_df2 = pd.DataFrame({'Nearest Neighbors': [5, 10, 15, 20, 25], 'MAE': each_sims_con, 'RMSE': each_sims_rmse_con })
plot2 = results_df2.plot(x='Nearest Neighbors', y=['MAE', 'RMSE'], ylim=(0.5,0.9), title = 'Content Based Item-Item CF: Metrics over different K')
fig = plot2.get_figure()
fig.savefig('MetricsContentCFK.png')


/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/plotting/_core.py:1714: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access
  series.name = label

Coverage for Content-Based CF


In [316]:
covcon1 = []
covcon2 = []
covcon3 = []
covcon4 = []
covcon5 = []
mlist = [5,10,15,20,25]
for i in range(len(rows)):
    if(i%100 == 0):
        print(i)
    ans = coverage(full_mat, i, items_features, 10, mlist, flipped_dict, True)
    covcon1.extend(ans[0])
    covcon2.extend(ans[1])
    covcon3.extend(ans[2])
    covcon4.extend(ans[3])
    covcon5.extend(ans[4])


0
/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: RuntimeWarning: invalid value encountered in double_scalars
100
200
...
11900
12000

In [338]:
print("Coverage with recommending 5 books", len(set(covcon1))/4959 *100 ,"%")


Coverage with recommending 5 books 19.298245614035086 %

In [339]:
print("Coverage with recommending 10 books", len(set(covcon2))/4959 *100 ,"%")


Coverage with recommending 10 books 26.31578947368421 %

In [340]:
print("Coverage with recommending 15 books", len(set(covcon3))/4959 *100 ,"%")


Coverage with recommending 15 books 31.558782012502522 %

In [341]:
print("Coverage with recommending 20 books", len(set(covcon4))/4959 *100 ,"%")


Coverage with recommending 20 books 35.47086106069772 %

In [342]:
print("Coverage with recommending 25 books", len(set(covcon5))/4959 *100 ,"%")


Coverage with recommending 25 books 38.57632587215164 %
