In [1]:
import pandas as pd
import numpy as np
from IPython.display import Image
from sklearn.model_selection import train_test_split
np.set_printoptions(precision = 3)
In [2]:
# Load the ratings table. The CSV has no header row, so pandas assigns
# integer column labels starting at 0.
_ratings_with_first_col = pd.read_csv("jester-data-1.csv", header=None)

# Discard the first column and keep the rest as the ratings matrix.
# NOTE(review): in the Jester dataset the first column is presumably the
# per-user count of rated jokes - confirm against the dataset README.
# Entries equal to 99 are skipped during training (see `sgd`).
joke_data_df = _ratings_with_first_col.drop(
    _ratings_with_first_col.columns[[0]], axis=1
)
joke_data_df.head()
Out[2]:
In [3]:
#d = joke_data_df.to_latex()
#text_file = open("Output.txt", "w")
#text_file.write(d)
#text_file.close()
In [4]:
# Fix the seed so that the data split and the random factor initialisation
# are reproducible under "Restart Kernel -> Run All".
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# 80% of the users go to training; the remaining 20% is halved into
# test and validation sets.
# NOTE: the name `train` is rebound to a function in a later cell, so the
# training ratings are captured in `user_ratings` right here.
train, testandvalidation = train_test_split(
    joke_data_df, test_size=0.2, random_state=RANDOM_SEED
)
test, validation = train_test_split(
    testandvalidation, test_size=0.5, random_state=RANDOM_SEED
)

n_features = 2  # number of latent factors per user and per item
user_ratings = train.values

# One row of latent factors per user (rows of user_ratings) and per item
# (columns of user_ratings), initialised uniformly in [0, 1).
latent_user_preferences = np.random.random((user_ratings.shape[0], n_features))
latent_item_features = np.random.random((user_ratings.shape[1], n_features))
In [5]:
# Sanity check: the item-factor matrix was created with shape
# (user_ratings.shape[1], n_features).
print(latent_item_features.shape)
In [6]:
# Sanity check: the user-factor matrix was created with shape
# (user_ratings.shape[0], n_features).
print(latent_user_preferences.shape)
In [ ]:
def predict_rating(user_id, item_id):
    """Return the model's predicted rating for one (user, item) pair.

    The prediction is the dot product of row ``user_id`` of the global
    ``latent_user_preferences`` matrix with row ``item_id`` of the global
    ``latent_item_features`` matrix.
    """
    return np.dot(latent_user_preferences[user_id], latent_item_features[item_id])
def train(user_id, item_id, rating, alpha=0.001):
    """Take one SGD step on a single (user, item, rating) observation.

    Updates row ``user_id`` of the global ``latent_user_preferences`` and
    row ``item_id`` of the global ``latent_item_features`` in place.

    Parameters
    ----------
    user_id : int
        Row index into ``latent_user_preferences``.
    item_id : int
        Row index into ``latent_item_features``.
    rating : float
        Observed rating for this (user, item) pair.
    alpha : float, optional
        Learning rate (step size) of the update.

    Returns
    -------
    float
        The prediction error ``prediction - rating`` computed *before*
        the update.
    """
    err = predict_rating(user_id, item_id) - rating

    # Snapshot the user's factors before they are modified. Basic slicing
    # (``arr[:]``) on an ndarray returns a view, not a copy, so an explicit
    # ``.copy()`` is needed; otherwise the item update below would use the
    # already-updated user vector.
    old_user_pref = latent_user_preferences[user_id].copy()

    latent_user_preferences[user_id] -= alpha * err * latent_item_features[item_id]
    latent_item_features[item_id] -= alpha * err * old_user_pref
    return err
def sgd(iterations=30, missing_value=99):
    """Run stochastic gradient descent over every observed rating.

    For each iteration, visits every (user, item) cell of the global
    ``user_ratings`` matrix, calls ``train`` on the cells that hold an
    observed rating, and prints the mean squared error of that pass.

    Parameters
    ----------
    iterations : int, optional
        Number of full passes over the rating matrix.
    missing_value : number, optional
        Sentinel marking an unrated cell; such cells are skipped.
    """
    n_users = latent_user_preferences.shape[0]
    n_items = latent_item_features.shape[0]

    for iteration in range(iterations):
        errors = []
        for user_id in range(n_users):
            for item_id in range(n_items):
                rating = user_ratings[user_id][item_id]
                if rating != missing_value:
                    errors.append(train(user_id, item_id, rating))

        # With no observed ratings there is nothing to average; report NaN
        # explicitly instead of triggering NumPy's empty-mean warning.
        if errors:
            mse = (np.array(errors) ** 2).mean()
        else:
            mse = float("nan")

        # Function-call form works on both Python 2 and Python 3.
        print(mse)
In [ ]:
# Train the latent factors using sgd's default number of iterations.
sgd()
In [ ]:
# Full prediction matrix: entry [u, i] is the dot product of user u's
# latent factors with item i's latent factors.
predictions = np.dot(latent_user_preferences, latent_item_features.T)
predictions
In [ ]:
# Pair every training rating with the model's prediction for the same
# (user, item) cell. `list(...)` materialises the pairs so this also works
# on Python 3, where `zip` returns a lazy iterator.
values = [
    list(zip(user_ratings[i], predictions[i]))
    for i in range(predictions.shape[0])
]
comparison_data = pd.DataFrame(values)

# `user_ratings` is taken from a row subset of `joke_data_df`, so its column
# labels are the matching item labels (the previously referenced name `data`
# is not defined anywhere in this notebook).
comparison_data.columns = joke_data_df.columns

# Render each pair as "(actual|predicted)". A single-argument lambda is used
# because tuple-unpacking parameters (`lambda (x, y): ...`) are Python 2 only.
comparison_data.applymap(lambda pair: "(%2.3f|%2.3f)" % (pair[0], pair[1]))
In [ ]:
# Display the raw comparison frame; each cell holds a (rating, prediction) pair.
comparison_data
In [ ]:
In [ ]: