Latent Factor Models for Collaborative Filtering

Load Pandas, we are going to need it for manipulating data


In [1]:
import pandas as pd
import numpy as np
from IPython.display import Image
np.set_printoptions(precision = 3)

In [2]:
Image(filename='books.png')


Out[2]:

Now load the data


In [3]:
data = pd.read_csv("user_ratings.csv")
d = data.to_latex()
text_file = open("Output.txt", "w")
text_file.write(d)
text_file.close()

In [4]:
n_features = 2

user_ratings = data.values
latent_user_preferences = np.random.random((user_ratings.shape[0], n_features))
latent_item_features = np.random.random((user_ratings.shape[1],n_features))

In [5]:
latent_item_features


Out[5]:
array([[ 0.831,  0.2  ],
       [ 0.796,  0.164],
       [ 0.583,  0.344],
       [ 0.835,  0.92 ],
       [ 0.704,  0.233]])

In [6]:
latent_user_preferences


Out[6]:
array([[ 0.657,  0.661],
       [ 0.021,  0.671],
       [ 0.4  ,  0.321],
       [ 0.253,  0.163],
       [ 0.131,  0.932],
       [ 0.286,  0.727],
       [ 0.643,  0.919],
       [ 0.212,  0.851],
       [ 0.861,  0.675],
       [ 0.593,  0.284]])

In [7]:
def predict_rating(user_id,item_id):
    """ Predict a rating given a user_id and an item_id.
    """
    user_preference = latent_user_preferences[user_id]
    item_preference = latent_item_features[item_id]
    return user_preference.dot(item_preference)

def train(user_id, item_id, rating,alpha = 0.0001):
    
    #print item_id
    prediction_rating = predict_rating(user_id, item_id)
    err =  ( prediction_rating- rating );
    #print err
    user_pref_values = latent_user_preferences[user_id][:]
    latent_user_preferences[user_id] -= alpha * err * latent_item_features[item_id]
    latent_item_features[item_id] -= alpha * err * user_pref_values
    return err
    


def sgd(iterations = 300000):
    """ Iterate over all users and all items and train for 
        a certain number of iterations
    """
    for iteration in range(0,iterations):
        error = []
        for user_id in range(0,latent_user_preferences.shape[0]):
            for item_id in range(0,latent_item_features.shape[0]):
                rating = user_ratings[user_id][item_id]
                if(not np.isnan(rating)):
                    err = train(user_id,item_id,rating)
                    error.append(err)
        mse = (np.array(error) ** 2).mean()   
        if(iteration%10000 == 0 ):
            print mse

In [8]:
sgd()


25.5415388252
1.71953831229
1.64570912318
1.5874847222
1.54778222601
1.49989371477
1.40194779792
1.32476484266
1.3167462766
1.31522557497
1.31405251762
1.31309791797
1.31235360671
1.31179749208
1.31139744901
1.31111874367
1.31092951702
1.31080356822
1.31072095283
1.31066731368
1.31063272479
1.31061051954
1.31059631076
1.31058725115
1.31058150909
1.31057791202
1.31057571043
1.31057442455
1.31057374608
1.31057347562

In [9]:
predictions = latent_user_preferences.dot(latent_item_features.T)
predictions


Out[9]:
array([[  7.73 ,   1.311,   4.653,   4.298,   5.162],
       [  3.167,   1.874,   3.248,   7.498,   6.368],
       [  8.999,   1.906,   5.798,   6.634,   7.218],
       [ 11.016,   2.331,   7.095,   8.107,   8.826],
       [ 15.886,   1.393,   8.256,   3.245,   6.467],
       [  2.221,   3.516,   4.488,  14.71 ,  11.472],
       [  4.721,   1.165,   3.208,   4.189,   4.312],
       [  6.173,   1.999,   4.671,   7.517,   7.15 ],
       [  2.549,   1.47 ,   2.575,   5.869,   5.002],
       [  4.815,   0.912,   2.994,   3.085,   3.517]])

In [ ]:


In [10]:
values = [zip(user_ratings[i], predictions[i]) for i in range(0,predictions.shape[0])]
comparison_data = pd.DataFrame(values)
comparison_data.columns = data.columns
comparison_data.applymap(lambda (x,y): "(%2.3f|%2.3f)"%(x,y))


Out[10]:
The Call of Cthulhu Frankenstein Dracula Neuromancer Space Odyssey
0 (8.000|7.730) (2.000|1.311) (nan|4.653) (5.000|4.298) (4.000|5.162)
1 (3.000|3.167) (2.000|1.874) (nan|3.248) (7.000|7.498) (7.000|6.368)
2 (9.000|8.999) (nan|1.906) (7.000|5.798) (8.000|6.634) (5.000|7.218)
3 (nan|11.016) (nan|2.331) (7.000|7.095) (8.000|8.107) (9.000|8.826)
4 (nan|15.886) (1.000|1.393) (8.000|8.256) (3.000|3.245) (7.000|6.467)
5 (2.000|2.221) (3.000|3.516) (5.000|4.488) (nan|14.710) (nan|11.472)
6 (4.000|4.721) (2.000|1.165) (nan|3.208) (2.000|4.189) (7.000|4.312)
7 (7.000|6.173) (1.000|1.999) (2.000|4.671) (7.000|7.517) (9.000|7.150)
8 (3.000|2.549) (3.000|1.470) (nan|2.575) (7.000|5.869) (3.000|5.002)
9 (4.000|4.815) (nan|0.912) (5.000|2.994) (3.000|3.085) (3.000|3.517)

In [11]:
comparison_data


Out[11]:
The Call of Cthulhu Frankenstein Dracula Neuromancer Space Odyssey
0 (8.0, 7.73025755984) (2.0, 1.31128122933) (nan, 4.65308346989) (5.0, 4.29763508292) (4.0, 5.16210530463)
1 (3.0, 3.16678708028) (2.0, 1.87401027353) (nan, 3.2481017053) (7.0, 7.49843957036) (7.0, 6.36815973682)
2 (9.0, 8.99943553249) (nan, 1.90649716785) (7.0, 5.79841201063) (8.0, 6.63393047892) (5.0, 7.21846133857)
3 (nan, 11.0159127447) (nan, 2.33061937257) (7.0, 7.09457255476) (8.0, 8.10723744944) (9.0, 8.82614142249)
4 (nan, 15.8864780566) (1.0, 1.39306965374) (8.0, 8.25586748992) (3.0, 3.24478931003) (7.0, 6.46683136079)
5 (2.0, 2.22064914497) (3.0, 3.5162859393) (5.0, 4.48821714565) (nan, 14.7101798287) (nan, 11.4722931602)
6 (4.0, 4.7213607158) (2.0, 1.16534349548) (nan, 3.20778119051) (2.0, 4.18915701359) (7.0, 4.31244580086)
7 (7.0, 6.17261410836) (1.0, 1.99883455876) (2.0, 4.67088512745) (7.0, 7.51682404393) (9.0, 7.15024857426)
8 (3.0, 2.54869219237) (3.0, 1.46965049361) (nan, 2.57539938649) (7.0, 5.86925853438) (3.0, 5.00243727975)
9 (4.0, 4.81479892139) (nan, 0.911735286485) (5.0, 2.99354160322) (3.0, 3.0845506317) (3.0, 3.51749578677)

In [12]:
d = comparison_data.to_latex()
text_file = open("comparison.txt", "w")
text_file.write(d)
text_file.close()

In [ ]:


In [ ]: