In [1]:
import pandas as pd
import numpy as np
from IPython.display import Image
from sklearn.model_selection import train_test_split
np.set_printoptions(precision = 3)

In [2]:
joke_data_df = pd.read_csv("jester-data-1.csv", header=None)
#joke_data_df.shape
# The first column is the number of jokes each user rated, not a rating itself, so drop it.
joke_data_df.drop(joke_data_df.columns[[0]], axis=1, inplace=True)
joke_data_df.shape
joke_data_df.head()
#max(joke_data_df)
#joke_data_df.replace([99,100], np.nan)


Out[2]:
1 2 3 4 5 6 7 8 9 10 ... 91 92 93 94 95 96 97 98 99 100
0 -7.82 8.79 -9.66 -8.16 -7.52 -8.50 -9.85 4.17 -8.98 -4.76 ... 2.82 99.00 99.00 99.00 99.00 99.00 -5.63 99.00 99.00 99.00
1 4.08 -0.29 6.36 4.37 -2.38 -9.66 -0.73 -5.34 8.88 9.22 ... 2.82 -4.95 -0.29 7.86 -0.19 -2.14 3.06 0.34 -4.32 1.07
2 99.00 99.00 99.00 99.00 9.03 9.27 9.03 9.27 99.00 99.00 ... 99.00 99.00 99.00 9.08 99.00 99.00 99.00 99.00 99.00 99.00
3 99.00 8.35 99.00 99.00 1.80 8.16 -2.82 6.21 99.00 1.84 ... 99.00 99.00 99.00 0.53 99.00 99.00 99.00 99.00 99.00 99.00
4 8.50 4.61 -4.17 -5.39 1.36 1.60 7.04 4.61 -0.44 5.73 ... 5.19 5.58 4.27 5.19 5.73 1.55 3.11 6.55 1.80 1.60

5 rows × 100 columns
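
In the Jester data a value of 99 means the joke was not rated, so it is a missing-value sentinel rather than a real score. The commented-out `replace` call above hints at this; a minimal sketch of masking those entries as NaN (working on a copy so the raw ratings stay intact; this is an alternative to the approach the notebook actually takes, which is to filter 99s inside the SGD loop) could look like:

# Sketch: treat the sentinel 99 as "not rated" by masking it to NaN on a copy.
masked_ratings = joke_data_df.replace(99, np.nan)
print(masked_ratings.notna().sum().sum(), "observed ratings out of",
      masked_ratings.size, "cells")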


In [3]:
#d = joke_data_df.to_latex()
#text_file = open("Output.txt", "w")
#text_file.write(d)
#text_file.close()

In [4]:
# Split users 80/10/10 into train, validation, and test sets.
train, testandvalidation = train_test_split(joke_data_df, test_size=0.2)

test, validation = train_test_split(testandvalidation, test_size=0.5)


# Number of latent features per user and per joke.
n_features = 2

user_ratings = train.values
latent_user_preferences = np.random.random((user_ratings.shape[0], n_features))
latent_item_features = np.random.random((user_ratings.shape[1], n_features))

In [5]:
print(latent_item_features.shape)


(100, 2)

In [6]:
print(latent_user_preferences.shape)


(19986, 2)

In [ ]:
def predict_rating(user_id, item_id):
    """ Predict a rating given a user_id and an item_id
        as the dot product of their latent vectors.
    """
    user_preference = latent_user_preferences[user_id]
    item_preference = latent_item_features[item_id]
    return user_preference.dot(item_preference)

def train_step(user_id, item_id, rating, alpha=0.001):
    """ One SGD update of the latent factors for a single observed rating.
        (Named train_step rather than train so it does not shadow the train DataFrame.)
    """
    prediction_rating = predict_rating(user_id, item_id)
    err = prediction_rating - rating
    user_pref_values = latent_user_preferences[user_id][:]
    latent_user_preferences[user_id] -= alpha * err * latent_item_features[item_id]
    latent_item_features[item_id] -= alpha * err * user_pref_values
    return err


def sgd(iterations=30):
    """ Iterate over all users and all items and train for
        a certain number of iterations.
    """
    for iteration in range(iterations):
        error = []
        for user_id in range(latent_user_preferences.shape[0]):
            for item_id in range(latent_item_features.shape[0]):
                rating = user_ratings[user_id][item_id]
                if rating != 99:  # 99 encodes "not rated"
                    err = train_step(user_id, item_id, rating)
                    error.append(err)
        mse = (np.array(error) ** 2).mean()
        if iteration % 1 == 0:  # report after every sweep
            print(mse)

In [ ]:
sgd()


24.4313180103
22.2173312358
20.6392687444
18.797368001
17.8239733082
17.6703184492
17.671873432
17.7162633361
17.7822045135
17.8693072428
17.9867705883
18.1534988068
18.4055079633
18.8256612848
19.756399858
/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:16: RuntimeWarning: overflow encountered in multiply
/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:15: RuntimeWarning: invalid value encountered in subtract
/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:15: RuntimeWarning: overflow encountered in multiply
/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:33: RuntimeWarning: overflow encountered in square
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
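
The MSE falls for a few sweeps, then climbs and finally overflows to nan: with a fixed learning rate and no regularization the factors diverge. A minimal sketch of a more stable update, adding an L2 penalty `lam` and a smaller step size (both values are illustrative assumptions, not tuned for this dataset), might look like:

def train_step_regularized(user_id, item_id, rating, alpha=0.0005, lam=0.05):
    # Same SGD update as above, but shrink both factor vectors toward zero
    # each step (L2 regularization) so they cannot blow up.
    err = predict_rating(user_id, item_id) - rating
    user_vec = latent_user_preferences[user_id].copy()
    item_vec = latent_item_features[item_id].copy()
    latent_user_preferences[user_id] -= alpha * (err * item_vec + lam * user_vec)
    latent_item_features[item_id] -= alpha * (err * user_vec + lam * item_vec)
    return err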

In [ ]:
predictions = latent_user_preferences.dot(latent_item_features.T)

predictions
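
`predictions` is the full reconstructed ratings matrix (users × jokes). One way to sanity-check it, not part of the original notebook, is to compute the mean squared error over the observed (non-99) training entries only; a short sketch:

# Sketch: MSE of the reconstruction on observed training ratings only.
observed = user_ratings != 99            # mask of entries that were actually rated
errors = predictions[observed] - user_ratings[observed]
print("train MSE on observed ratings:", (errors ** 2).mean())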

In [ ]:
values = [list(zip(user_ratings[i], predictions[i])) for i in range(predictions.shape[0])]
comparison_data = pd.DataFrame(values)
comparison_data.columns = joke_data_df.columns
comparison_data.applymap(lambda xy: "(%2.3f|%2.3f)" % xy)

In [ ]:
comparison_data

In [ ]:


In [ ]: