In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from scipy.sparse import csr_matrix, coo_matrix
import numpy as np

%matplotlib inline

In [24]:
ratings = pd.read_csv('BX-Book-Ratings.csv', encoding='iso-8859-1', sep = ';')
ratings.columns = ['user_id', 'isbn', 'book_rating']
books = pd.read_csv('BX-Books.csv', sep=';', encoding = 'iso-8859-1', dtype =str)

books["Book-Title"].nunique() == books["ISBN"].nunique()
book_dict = books[["Book-Title","ISBN"]].set_index("Book-Title").to_dict()["ISBN"]
books['new_isbn'] = books["Book-Title"].apply(lambda x: book_dict[x])
books["Book-Title"].nunique() == books["new_isbn"].nunique()
books['isbn'] = books['new_isbn']

del books['Image-URL-L']
del books['Image-URL-M']
del books['Image-URL-S']
del books['Book-Author']
del books['Publisher']
del books['ISBN']
del books['new_isbn']

newdf = ratings[ratings.book_rating>0]
joined = books.merge(newdf, on ='isbn')
print(newdf.shape)


(433671, 3)

In [72]:
datasets = []
for j in [100, 150, 200, 300, 500]:
    df = joined.groupby('isbn').count().sort_values('user_id', ascending =False)[0:j].index.values
    test = joined.groupby('user_id').count().sort_values('isbn', ascending = False)[:20000].index.values
    newdf = joined[joined.user_id.isin(test) & joined.isbn.isin(df)]
    data  = newdf[newdf['user_id'].isin(newdf['user_id'].value_counts()[newdf['user_id'].value_counts()>1].index)]
    data = data[['user_id', 'Book-Title', 'book_rating', 'isbn']].drop_duplicates()
    print(data.user_id.nunique(), data.isbn.nunique())
    print(data.groupby('user_id').count().sort_values('isbn', ascending = False).mean())
    datasets.append(data)
    #data.to_csv('data' + str(j) + '.csv')


2517 100
Book-Title     2.617402
book_rating    2.617402
isbn           2.617402
dtype: float64
3086 150
Book-Title     2.851588
book_rating    2.851588
isbn           2.851588
dtype: float64
3543 200
Book-Title     3.003669
book_rating    3.003669
isbn           3.003669
dtype: float64
4280 300
Book-Title     3.180841
book_rating    3.180841
isbn           3.180841
dtype: float64
5282 500
Book-Title     3.444529
book_rating    3.444529
isbn           3.444529
dtype: float64

Implementation of Non-negative matrix factorization

We implemented NMF from scratch using this paper: http://www.siam.org/meetings/sdm06/proceedings/059zhangs2.pdf. This implementation has regularisation parameters as well.

There are two functions which can used to train NMF. train_nmf() can be used for training when you have a train/test split. CV_nmf() can be used for cross validation


In [9]:
def train_nmf(X, Wts, Wts_train, Wts_test, factors=30, iterations=10, lambda1=0, lambda2=0):
    """ Train NMF on train data and get results on test data
    Args:
        X: Matrix with rows for users and columns for books
        Wts: Matrix of shape of X. Entry should be 1 if rating is present, 0 otherwise
        Wts_train: Matrix of shape of X. Entry should be 1 if known rating should be used for training, 0 otherwise
        Wts_test: Matrix of shape of X. Entry should be 1 if known rating should be used for testing, 0 otherwise
        factors: Number of latent factors to train
        iterations: Number of times latent factors are updated
        lambda1: Regularization parameters for W. Predicted ratings = W*H
        lambda2: Regularization parameters for H. Predicted ratings = W*H
    Returns:
        W,H: Latent factors of desired size
        test_mae, test_rmse, train_mae, train_rmse: MAE & RMSE metrics on train/test
    """
    
    W=np.random.uniform(low=0, high=0.5, size=(X.shape[0], factors))
    H=np.random.uniform(low=0, high=0.5, size=(factors, X.shape[1]))
    # lambda1=0 #.01
    # lambda2=0 #.01
    for i in range(iterations):
        num_w = np.dot(np.nan_to_num(np.multiply(Wts_train, X)),H.T)
        den_w = np.dot(Wts_train * np.dot(W,H), H.T) + lambda1*(np.asarray([Wts_train.sum(axis=1)]).T)*W
        den_w[den_w==0] = 1e-16
        W = W*num_w/den_w

        num_h = np.dot(W.T, np.nan_to_num(Wts_train*X))
        den_h = np.dot(W.T, Wts_train*np.dot(W,H)) + lambda2*(np.asarray([Wts_train.sum(axis=0)]))*H
        den_h[den_h==0] = 1e-16
        H = H*num_h/den_h

    test_mae = ((np.abs(np.nan_to_num(X - np.dot(W,H)))*Wts_test).sum())/Wts_test.sum()
    test_rmse = np.sqrt(((np.nan_to_num(X - np.dot(W,H))**2)*Wts_test).sum()/Wts_test.sum())
    train_mae = ((np.abs(np.nan_to_num(X - np.dot(W,H)))*Wts_train).sum())/Wts_train.sum()
    train_rmse = np.sqrt(((np.nan_to_num(X - np.dot(W,H))**2)*Wts_train).sum()/Wts_train.sum())

    print("Test MAE", test_mae)
    print("Test RMSE", test_rmse)
    print("Train MAE", train_mae)
    print("Train RMSE", train_rmse)
    return([W, H, test_mae, test_rmse, train_mae, train_rmse])

In [51]:
def CV_nmf(data, k_cv=5, factors=30, iterations=10, lambda1=0, lambda2=0):
    """ Cross Validation for NMF implementation
    Args:
        data: DataFrame in long form. User-book pair and corresponding rating
        k_cv: Number of folds
        factors: Number of latent factors to train
        iterations: Number of times latent factors are updated
        lambda1: Regularization parameters for W. Predicted ratings = W*H
        lambda2: Regularization parameters for H. Predicted ratings = W*H
    Returns:
        test_mae, test_rmse, train_mae, train_rmse: Mean MAE & RMSE metrics on train/test
    """
    test_mae_mean=0
    test_rmse_mean=0
    train_mae_mean=0
    train_rmse_mean=0
    sample_threshold = 1 - 1/k_cv
    
    for i in range(k_cv):
        data['rn'] = np.random.uniform(low=0, high=1, size=(len(data),))
        cuts = list(np.arange(0,1,1/10))
        cuts.append(1)
        data['fold'] = pd.cut(data.rn, cuts, labels=False)
        data['test'] = (data['fold']==i).astype(int)
        X_matrix = data.pivot(index='user_id', columns='Book-Title', values='book_rating')
        X = np.asarray(X_matrix)
        Wts = pd.isnull(X)
        Wts = (~np.asarray(Wts)).astype(int)
        Wts_test = np.nan_to_num(np.asarray(data.pivot(index='user_id', columns='Book-Title', values='test')))
        Wts_train = Wts * (~Wts_test.astype(bool))
        print('Results of fold', i)
        W, H, test_mae, test_rmse, train_mae, train_rmse = train_nmf(X, Wts, Wts_train, Wts_test, factors, iterations, lambda1, lambda2)

        test_mae_mean = test_mae_mean + test_mae
        test_rmse_mean = test_rmse_mean + test_rmse
        train_mae_mean = train_mae_mean + train_mae
        train_rmse_mean = train_rmse_mean + train_rmse
    print("----------Average of metrics------------")
    test_mae_mean=test_mae_mean/k_cv
    test_rmse_mean=test_rmse_mean/k_cv
    train_mae_mean=train_mae_mean/k_cv
    train_rmse_mean=train_rmse_mean/k_cv
    print("Average Test MAE", test_mae_mean)
    print("Average Test RMSE", test_rmse_mean)
    print("Average Train MAE", train_mae_mean)
    print("Average Train RMSE", train_rmse_mean)
    return([test_mae_mean, test_rmse_mean, train_mae_mean, train_rmse_mean])

In [49]:
data = datasets[4]
rows = data.user_id.unique()
cols = data['Book-Title'].unique()
data = data[['user_id', 'Book-Title', 'book_rating']]
idict  = dict(zip(cols, range(len(cols))))
udict = dict(zip(rows, range(len(rows))))

data.user_id = [
    udict[i] for i in data.user_id
]
data['Book-Title'] = [
    idict[i] for i in data['Book-Title']
]
data = data[['user_id','Book-Title','book_rating']].drop_duplicates()
nmat = data.as_matrix()


G:\Anaconda\lib\site-packages\pandas\core\generic.py:3110: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
G:\Anaconda\lib\site-packages\ipykernel_launcher.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]

In [53]:
data['test'] = (np.random.uniform(low=0, high=1, size=(len(data),))>0.9).astype(int)
X_matrix = data.pivot(index='user_id', columns='Book-Title', values='book_rating')
X = np.asarray(X_matrix)
Wts = pd.isnull(X)
Wts = (~np.asarray(Wts)).astype(int)
Wts_test = np.nan_to_num(np.asarray(data.pivot(index='user_id', columns='Book-Title', values='test')))
Wts_train = Wts * (~Wts_test.astype(bool))

In [54]:
W, H, test_mae, test_rmse, train_mae, train_rmse = train_nmf(X, Wts, Wts_train, Wts_test, factors=50, iterations=5, lambda1=0, lambda2=0)


Test MAE 2.00005507841
Test RMSE 3.02227590744
Train MAE 0.569063511289
Train RMSE 0.841478229267

In [52]:
CV_nmf(data, factors=20, iterations=5) #2.047 on 300, 1.9 on 500, 2.1 on 200


Results of fold 0
Test MAE 1.97098442365
Test RMSE 2.91691641275
Train MAE 0.617075364151
Train RMSE 0.899852584143
Results of fold 1
Test MAE 2.01389533042
Test RMSE 2.97613825488
Train MAE 0.623882142343
Train RMSE 0.90791094516
Results of fold 2
Test MAE 1.90730924575
Test RMSE 2.82579881773
Train MAE 0.62520935424
Train RMSE 0.908456767493
Results of fold 3
Test MAE 2.02933728672
Test RMSE 3.02550021891
Train MAE 0.624846424292
Train RMSE 0.90981917014
Results of fold 4
Test MAE 2.01082283306
Test RMSE 2.92193387119
Train MAE 0.612090923212
Train RMSE 0.893164542018
----------Average of metrics------------
Average Test MAE 1.98646982392
Average Test RMSE 2.93325751509
Average Train MAE 0.620620841648
Average Train RMSE 0.903840801791
Out[52]:
[1.9864698239204119,
 2.9332575150916207,
 0.62062084164761677,
 0.90384080179084347]

In [16]:
# Grid searching for best factors: number of factors, lambda1, lambda2
factors = np.arange(1,100,20)
test_maes = []
train_maes = []
for factor in factors:
    print("Results for factor ",factor)
    test_mae_mean, test_rmse_mean, train_mae_mean, train_rmse_mean = CV_nmf(data, factors=factor, iterations=10)
    test_maes.append(test_mae_mean)
    train_maes.append(train_mae_mean)


Results for factor  1
Results of fold 0
Test MAE 6.8324813437
Test RMSE 103.229535954
Train MAE 0.877891161692
Train RMSE 1.32892383183
Results of fold 1
Test MAE 2.11184830115
Test RMSE 4.90572766277
Train MAE 0.885234217579
Train RMSE 1.31844534504
Results of fold 2
Test MAE 2.51108965693
Test RMSE 11.7000196997
Train MAE 0.924726610624
Train RMSE 1.43052851199
Results of fold 3
Test MAE 2.20032312093
Test RMSE 10.000793738
Train MAE 0.888369933705
Train RMSE 1.31460032958
Results of fold 4
Test MAE 1.97497969468
Test RMSE 3.36714492407
Train MAE 0.852446761609
Train RMSE 1.22307804565
----------Average of metrics------------
Average Test MAE 3.12614442348
Average Test RMSE 26.6406443957
Average Train MAE 0.885733737042
Average Train RMSE 1.32311521282
Results for factor  21
Results of fold 0
Test MAE 2.04966145531
Test RMSE 3.00209878882
Train MAE 0.445951789662
Train RMSE 0.701933771164
Results of fold 1
Test MAE 2.10937959773
Test RMSE 3.11415910584
Train MAE 0.453608649858
Train RMSE 0.714330909632
Results of fold 2
Test MAE 1.9669089985
Test RMSE 2.88010223139
Train MAE 0.441544465692
Train RMSE 0.699202654019
Results of fold 3
Test MAE 2.07019070114
Test RMSE 3.0356626445
Train MAE 0.445226035834
Train RMSE 0.701588926844
Results of fold 4
Test MAE 2.09530158529
Test RMSE 3.0989750405
Train MAE 0.449875051414
Train RMSE 0.709089257821
----------Average of metrics------------
Average Test MAE 2.05828846759
Average Test RMSE 3.02619956221
Average Train MAE 0.447241198492
Average Train RMSE 0.705229103896
Results for factor  41
Results of fold 0
Test MAE 2.00960833442
Test RMSE 3.03921288944
Train MAE 0.396001724784
Train RMSE 0.64071154261
Results of fold 1
Test MAE 1.97644039029
Test RMSE 2.97540330193
Train MAE 0.400157947525
Train RMSE 0.645725642001
Results of fold 2
Test MAE 1.95492576796
Test RMSE 2.95804565849
Train MAE 0.404780922007
Train RMSE 0.654040964695
Results of fold 3
Test MAE 2.04201250855
Test RMSE 3.07995279727
Train MAE 0.402234638628
Train RMSE 0.64874531991
Results of fold 4
Test MAE 2.01677025662
Test RMSE 3.06008027423
Train MAE 0.398941315598
Train RMSE 0.641622744943
----------Average of metrics------------
Average Test MAE 1.99995145157
Average Test RMSE 3.02253898427
Average Train MAE 0.400423309708
Average Train RMSE 0.646169242832
Results for factor  61
Results of fold 0
Test MAE 1.99767919169
Test RMSE 3.05126062648
Train MAE 0.385445911668
Train RMSE 0.625312502727
Results of fold 1
Test MAE 1.86737714261
Test RMSE 2.82547249974
Train MAE 0.385912302343
Train RMSE 0.627602076577
Results of fold 2
Test MAE 2.03068948776
Test RMSE 3.03229440996
Train MAE 0.378116450515
Train RMSE 0.617527358182
Results of fold 3
Test MAE 2.06667270985
Test RMSE 3.12206819396
Train MAE 0.377860319184
Train RMSE 0.614352181108
Results of fold 4
Test MAE 1.96314550674
Test RMSE 2.96600064306
Train MAE 0.378912495183
Train RMSE 0.618355418884
----------Average of metrics------------
Average Test MAE 1.98511280773
Average Test RMSE 2.99941927464
Average Train MAE 0.381249495779
Average Train RMSE 0.620629907496
Results for factor  81
Results of fold 0
Test MAE 1.8647879776
Test RMSE 2.8881132855
Train MAE 0.37514273847
Train RMSE 0.615461372489
Results of fold 1
Test MAE 1.89669886531
Test RMSE 2.89887695914
Train MAE 0.374720956665
Train RMSE 0.611690897478
Results of fold 2
Test MAE 1.98360156152
Test RMSE 3.00988634016
Train MAE 0.371966873442
Train RMSE 0.612101551286
Results of fold 3
Test MAE 1.93887154895
Test RMSE 2.91171490256
Train MAE 0.375809086274
Train RMSE 0.616669800622
Results of fold 4
Test MAE 1.92891052683
Test RMSE 2.94503766461
Train MAE 0.375562557455
Train RMSE 0.616010967485
----------Average of metrics------------
Average Test MAE 1.92257409604
Average Test RMSE 2.93072583039
Average Train MAE 0.374640442461
Average Train RMSE 0.614386917872

In [19]:
results_df = pd.DataFrame({'Factors': factors, 'test_mae': test_maes, 'train_mae': train_maes})
results_df.plot(x='Factors', y=['test_mae', 'train_mae'], ylim=(0,4))


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x18e705be7b8>

In [21]:
# Grid searching for best iterations/epochs
iterations = np.arange(1,50,10)
test_maes = []
train_maes = []
for iteration in iterations:
    test_mae_mean, test_rmse_mean, train_mae_mean, train_rmse_mean = CV_nmf(data, factors=10, iterations=iteration)
    test_maes.append(test_mae_mean)
    train_maes.append(train_mae_mean)


Results of fold 0
Test MAE 2.33847964536
Test RMSE 3.29466027828
Train MAE 1.0972159179
Train RMSE 1.43336751174
Results of fold 1
Test MAE 2.3502177285
Test RMSE 3.32007473642
Train MAE 1.0863888976
Train RMSE 1.41744325701
Results of fold 2
Test MAE 2.37975167056
Test RMSE 3.32123942958
Train MAE 1.08143049941
Train RMSE 1.41316637344
Results of fold 3
Test MAE 2.31903265159
Test RMSE 3.24690579237
Train MAE 1.08380263918
Train RMSE 1.4177775971
Results of fold 4
Test MAE 2.35194558236
Test RMSE 3.30019548408
Train MAE 1.0875343399
Train RMSE 1.41309233746
----------Average of metrics------------
Average Test MAE 2.34788545568
Average Test RMSE 3.29661514415
Average Train MAE 1.0872744588
Average Train RMSE 1.41896941535
Results of fold 0
Test MAE 2.14884466312
Test RMSE 3.09535836215
Train MAE 0.502180098472
Train RMSE 0.776466957197
Results of fold 1
Test MAE 2.1029972857
Test RMSE 3.09063694938
Train MAE 0.511150780588
Train RMSE 0.786407647508
Results of fold 2
Test MAE 2.17839712284
Test RMSE 3.1370323438
Train MAE 0.50359857763
Train RMSE 0.77843372593
Results of fold 3
Test MAE 2.15162293421
Test RMSE 3.09759924135
Train MAE 0.510585499313
Train RMSE 0.790359834119
Results of fold 4
Test MAE 2.30426926409
Test RMSE 3.34011079593
Train MAE 0.509489453916
Train RMSE 0.784807873938
----------Average of metrics------------
Average Test MAE 2.17722625399
Average Test RMSE 3.15214753852
Average Train MAE 0.507400881984
Average Train RMSE 0.783295207738
Results of fold 0
Test MAE 2.20935371531
Test RMSE 3.15581715663
Train MAE 0.370349289032
Train RMSE 0.617186381541
Results of fold 1
Test MAE 2.20018177379
Test RMSE 3.14589104785
Train MAE 0.373758113714
Train RMSE 0.621618877344
Results of fold 2
Test MAE 2.24380589486
Test RMSE 3.22576782791
Train MAE 0.372053234524
Train RMSE 0.621076128402
Results of fold 3
Test MAE 2.27991265517
Test RMSE 3.29940213405
Train MAE 0.373730828381
Train RMSE 0.628968644357
Results of fold 4
Test MAE 2.21600433215
Test RMSE 3.17705309608
Train MAE 0.366548633163
Train RMSE 0.615653182576
----------Average of metrics------------
Average Test MAE 2.22985167426
Average Test RMSE 3.2007862525
Average Train MAE 0.371288019763
Average Train RMSE 0.620900642844
Results of fold 0
Test MAE 2.2195697665
Test RMSE 3.16709531155
Train MAE 0.296161876581
Train RMSE 0.529604530008
Results of fold 1
Test MAE 2.18157401936
Test RMSE 3.13177580531
Train MAE 0.297604497435
Train RMSE 0.530652476763
Results of fold 2
Test MAE 2.23389508534
Test RMSE 3.20257854218
Train MAE 0.291644838572
Train RMSE 0.520000030108
Results of fold 3
Test MAE 2.24188561615
Test RMSE 3.20144945901
Train MAE 0.290716614446
Train RMSE 0.511541481319
Results of fold 4
Test MAE 2.2668539067
Test RMSE 3.23390378024
Train MAE 0.29261840392
Train RMSE 0.51279922542
----------Average of metrics------------
Average Test MAE 2.22875567881
Average Test RMSE 3.18736057966
Average Train MAE 0.293749246191
Average Train RMSE 0.520919548724
Results of fold 0
Test MAE 2.20227727797
Test RMSE 3.13983358249
Train MAE 0.256110658417
Train RMSE 0.471786259504
Results of fold 1
Test MAE 2.26469712578
Test RMSE 3.21964371802
Train MAE 0.246298273618
Train RMSE 0.457363377063
Results of fold 2
Test MAE 2.16439204791
Test RMSE 3.09098421869
Train MAE 0.247074179053
Train RMSE 0.451976873327
Results of fold 3
Test MAE 2.20268706967
Test RMSE 3.11544894933
Train MAE 0.249332141951
Train RMSE 0.462986482654
Results of fold 4
Test MAE 2.39892139634
Test RMSE 3.42968971138
Train MAE 0.24893809144
Train RMSE 0.459482087377
----------Average of metrics------------
Average Test MAE 2.24659498353
Average Test RMSE 3.19912003598
Average Train MAE 0.249550668896
Average Train RMSE 0.460719015985
Results of fold 0
Test MAE 2.29406021951
Test RMSE 3.29280755537
Train MAE 0.212939328646
Train RMSE 0.407859122129
Results of fold 1
Test MAE 2.2408244271
Test RMSE 3.18507670432
Train MAE 0.221717679393
Train RMSE 0.417456932913
Results of fold 2
Test MAE 2.32886125749
Test RMSE 3.30397435046
Train MAE 0.214123449078
Train RMSE 0.404221703778
Results of fold 3
Test MAE 2.40094737666
Test RMSE 3.39996925367
Train MAE 0.213427257385
Train RMSE 0.409393432584
Results of fold 4
Test MAE 2.22893560247
Test RMSE 3.16933632555
Train MAE 0.219648664788
Train RMSE 0.417350191873
----------Average of metrics------------
Average Test MAE 2.29872577665
Average Test RMSE 3.27023283787
Average Train MAE 0.216371275858
Average Train RMSE 0.411256276655
Results of fold 0
Test MAE 2.37263408064
Test RMSE 3.35533119187
Train MAE 0.194303494291
Train RMSE 0.387136654768
Results of fold 1
Test MAE 2.37802926857
Test RMSE 3.37220048598
Train MAE 0.193063115007
Train RMSE 0.373680417107
Results of fold 2
Test MAE 2.3356860492
Test RMSE 3.33462137813
Train MAE 0.191159028533
Train RMSE 0.37566419935
Results of fold 3
Test MAE 2.24281353926
Test RMSE 3.18347727255
Train MAE 0.191067134429
Train RMSE 0.376890511468
Results of fold 4
Test MAE 2.2806202729
Test RMSE 3.23442436856
Train MAE 0.202471513241
Train RMSE 0.398992748057
----------Average of metrics------------
Average Test MAE 2.32195664211
Average Test RMSE 3.29601093942
Average Train MAE 0.1944128571
Average Train RMSE 0.38247290615
Results of fold 0
Test MAE 2.29252684346
Test RMSE 3.2568803171
Train MAE 0.17584108779
Train RMSE 0.351642505781
Results of fold 1
Test MAE 2.34779815832
Test RMSE 3.27301188704
Train MAE 0.177691979744
Train RMSE 0.35150256385
Results of fold 2
Test MAE 2.35616854441
Test RMSE 3.30833125352
Train MAE 0.179186420044
Train RMSE 0.356603274385
Results of fold 3
Test MAE 2.29117086884
Test RMSE 3.23323728092
Train MAE 0.175957222984
Train RMSE 0.347456576747
Results of fold 4
Test MAE 2.42679112143
Test RMSE 3.48980895676
Train MAE 0.174679248246
Train RMSE 0.351269011581
----------Average of metrics------------
Average Test MAE 2.34289110729
Average Test RMSE 3.31225393907
Average Train MAE 0.176671191761
Average Train RMSE 0.351694786469
Results of fold 0
Test MAE 2.38399440443
Test RMSE 3.35878183573
Train MAE 0.157402007806
Train RMSE 0.323307727969
Results of fold 1
Test MAE 2.37499667104
Test RMSE 3.35853911791
Train MAE 0.161980924053
Train RMSE 0.325825985929
Results of fold 2
Test MAE 2.35894005324
Test RMSE 3.38018404429
Train MAE 0.163358879107
Train RMSE 0.332342047187
Results of fold 3
Test MAE 2.28709224395
Test RMSE 3.18912672644
Train MAE 0.161402496861
Train RMSE 0.329193413101
Results of fold 4
Test MAE 2.38557757757
Test RMSE 3.34949959815
Train MAE 0.165717637599
Train RMSE 0.340792429859
----------Average of metrics------------
Average Test MAE 2.35812019005
Average Test RMSE 3.3272262645
Average Train MAE 0.161972389085
Average Train RMSE 0.330292320809
Results of fold 0
Test MAE 2.35108443544
Test RMSE 3.3100935551
Train MAE 0.147579575092
Train RMSE 0.307638326077
Results of fold 1
Test MAE 2.36545198211
Test RMSE 3.2961436963
Train MAE 0.14902733829
Train RMSE 0.31015346807
Results of fold 2
Test MAE 2.42610894871
Test RMSE 3.42707942731
Train MAE 0.153621542777
Train RMSE 0.318887722622
Results of fold 3
Test MAE 2.38157871848
Test RMSE 3.40474285104
Train MAE 0.151798708283
Train RMSE 0.313470047979
Results of fold 4
Test MAE 2.40243225573
Test RMSE 3.40863421647
Train MAE 0.145303418459
Train RMSE 0.302013294236
----------Average of metrics------------
Average Test MAE 2.38533126809
Average Test RMSE 3.36933874924
Average Train MAE 0.14946611658
Average Train RMSE 0.310432571797

In [22]:
results_df = pd.DataFrame({'Iterations': iterations, 'test_mae': test_maes, 'train_mae': train_maes})
results_df.plot(x='Iterations', y=['test_mae', 'train_mae'], ylim=(0,3))


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x18e71e4dd30>

In [ ]:
test_maes = []
train_maes = []
for iteration in iterations:
    test_mae_mean, test_rmse_mean, train_mae_mean, train_rmse_mean = CV_nmf(data, factors=10, iterations=iteration)
    test_maes.append(test_mae_mean)
    train_maes.append(train_mae_mean)

In [67]:
# Time taken for different data sizes
import time
Num_users = [100, 150, 200, 300, 500]
times = []
test_maes = []
train_maes = []
for datas in datasets:
    time_start = time.time()
    test_mae_mean, test_rmse_mean, train_mae_mean, train_rmse_mean = CV_nmf(datas, factors=20, iterations=10)
    time_taken = time.time()-time_start
    times.append(time_taken)
    test_maes.append(test_mae_mean)
    train_maes.append(train_mae_mean)


Results of fold 0
Test MAE 2.2913284441
Test RMSE 3.33980632262
Train MAE 0.339019674761
Train RMSE 0.566084762553
Results of fold 1
Test MAE 2.3564721117
Test RMSE 3.42047137005
Train MAE 0.335932360844
Train RMSE 0.558467413512
Results of fold 2
Test MAE 2.38573934893
Test RMSE 3.48439505845
Train MAE 0.335394738499
Train RMSE 0.57137688815
Results of fold 3
Test MAE 2.35593729747
Test RMSE 3.44014032721
Train MAE 0.332041618681
Train RMSE 0.558050219915
Results of fold 4
Test MAE 2.2381612308
Test RMSE 3.30657784115
Train MAE 0.333881592091
Train RMSE 0.556898057055
----------Average of metrics------------
Average Test MAE 2.3255276866
Average Test RMSE 3.3982781839
Average Train MAE 0.335253996975
Average Train RMSE 0.562175468237
Results of fold 0
Test MAE 2.28555997439
Test RMSE 3.36332934329
Train MAE 0.381778394877
Train RMSE 0.618755159324
Results of fold 1
Test MAE 2.186172645
Test RMSE 3.21588837776
Train MAE 0.389358297163
Train RMSE 0.634204774503
Results of fold 2
Test MAE 2.39048891068
Test RMSE 3.51853811685
Train MAE 0.381894280167
Train RMSE 0.617766488181
Results of fold 3
Test MAE 2.32274789297
Test RMSE 3.3864123123
Train MAE 0.366816933936
Train RMSE 0.60329811119
Results of fold 4
Test MAE 2.35899186553
Test RMSE 3.4729801208
Train MAE 0.388196228013
Train RMSE 0.633547421668
----------Average of metrics------------
Average Test MAE 2.30879225771
Average Test RMSE 3.3914296542
Average Train MAE 0.381608826831
Average Train RMSE 0.621514390973
Results of fold 0
Test MAE 2.21240812845
Test RMSE 3.26708596664
Train MAE 0.393144148411
Train RMSE 0.640322709875
Results of fold 1
Test MAE 2.19764388188
Test RMSE 3.2417241974
Train MAE 0.386283361723
Train RMSE 0.625945850107
Results of fold 2
Test MAE 2.25009262657
Test RMSE 3.36813406568
Train MAE 0.396419804504
Train RMSE 0.640804138743
Results of fold 3
Test MAE 2.09789702134
Test RMSE 3.10838918857
Train MAE 0.405738683455
Train RMSE 0.652235641003
Results of fold 4
Test MAE 2.20273906583
Test RMSE 3.27330639005
Train MAE 0.395761283498
Train RMSE 0.643685601524
----------Average of metrics------------
Average Test MAE 2.19215614481
Average Test RMSE 3.25172796167
Average Train MAE 0.395469456318
Average Train RMSE 0.64059878825
Results of fold 0
Test MAE 2.03228017432
Test RMSE 3.05101203403
Train MAE 0.425958889639
Train RMSE 0.679194397348
Results of fold 1
Test MAE 2.08650600468
Test RMSE 3.09764129491
Train MAE 0.414409671289
Train RMSE 0.665908587323
Results of fold 2
Test MAE 2.17877279318
Test RMSE 3.2253835767
Train MAE 0.413483300788
Train RMSE 0.665729887904
Results of fold 3
Test MAE 2.08847231647
Test RMSE 3.07948055435
Train MAE 0.412848059709
Train RMSE 0.658760322173
Results of fold 4
Test MAE 2.13992970833
Test RMSE 3.14454935819
Train MAE 0.414971840174
Train RMSE 0.667761654233
----------Average of metrics------------
Average Test MAE 2.10519219939
Average Test RMSE 3.11961336364
Average Train MAE 0.41633435232
Average Train RMSE 0.667470969796
Results of fold 0
Test MAE 2.01598392663
Test RMSE 2.99104145333
Train MAE 0.458146454999
Train RMSE 0.718672879308
Results of fold 1
Test MAE 1.99564167207
Test RMSE 2.91367598539
Train MAE 0.450452373355
Train RMSE 0.712688604588
Results of fold 2
Test MAE 2.00764911148
Test RMSE 2.99677730883
Train MAE 0.454515077092
Train RMSE 0.711136967641
Results of fold 3
Test MAE 2.03776804862
Test RMSE 3.01418807557
Train MAE 0.450804126133
Train RMSE 0.714821249923
Results of fold 4
Test MAE 1.95584706607
Test RMSE 2.9388053185
Train MAE 0.449291131628
Train RMSE 0.704435429937
----------Average of metrics------------
Average Test MAE 2.00257796497
Average Test RMSE 2.97089762832
Average Train MAE 0.452641832641
Average Train RMSE 0.712351026279

In [71]:
results_df = pd.DataFrame({'Num_users': Num_users, 'test_mae': test_maes, 'train_mae': train_maes, 'time': times})
results_df.plot(x='Num_users', y=['test_mae', 'train_mae'], ylim=(0,3))
results_df.plot(x='Num_users', y=['time'], ylim=(0,30))


Out[71]:
<matplotlib.axes._subplots.AxesSubplot at 0x18e015913c8>