In [1]:
import pandas as pd
import numpy as np
from IPython.display import Image
from sklearn.model_selection import train_test_split
np.set_printoptions(precision = 3)

In [2]:
joke_data_df = pd.read_csv("jester-data-1.csv", header=None)
#joke_data_df.shape
# The first column is the number of jokes each user rated, not a rating itself, so drop it.
joke_data_df.drop(joke_data_df.columns[[0]], axis=1, inplace=True)
joke_data_df.shape
joke_data_df.head()
#max(joke_data_df)
#joke_data_df.replace([99,100], np.nan)


Out[2]:
1 2 3 4 5 6 7 8 9 10 ... 91 92 93 94 95 96 97 98 99 100
0 -7.82 8.79 -9.66 -8.16 -7.52 -8.50 -9.85 4.17 -8.98 -4.76 ... 2.82 99.00 99.00 99.00 99.00 99.00 -5.63 99.00 99.00 99.00
1 4.08 -0.29 6.36 4.37 -2.38 -9.66 -0.73 -5.34 8.88 9.22 ... 2.82 -4.95 -0.29 7.86 -0.19 -2.14 3.06 0.34 -4.32 1.07
2 99.00 99.00 99.00 99.00 9.03 9.27 9.03 9.27 99.00 99.00 ... 99.00 99.00 99.00 9.08 99.00 99.00 99.00 99.00 99.00 99.00
3 99.00 8.35 99.00 99.00 1.80 8.16 -2.82 6.21 99.00 1.84 ... 99.00 99.00 99.00 0.53 99.00 99.00 99.00 99.00 99.00 99.00
4 8.50 4.61 -4.17 -5.39 1.36 1.60 7.04 4.61 -0.44 5.73 ... 5.19 5.58 4.27 5.19 5.73 1.55 3.11 6.55 1.80 1.60

5 rows × 100 columns
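
In the Jester data a value of 99 means the joke was not rated, so it is a missing-value sentinel rather than a real score. The commented-out `replace` call above hints at this; a minimal sketch of masking those entries as NaN (working on a copy so the raw ratings stay intact; this is an alternative to the approach the notebook actually takes, which is to filter 99s inside the SGD loop) could look like:

# Sketch: treat the sentinel 99 as "not rated" by masking it to NaN on a copy.
masked_ratings = joke_data_df.replace(99, np.nan)
print(masked_ratings.notna().sum().sum(), "observed ratings out of",
      masked_ratings.size, "cells")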


In [3]:
#d = joke_data_df.to_latex()
#text_file = open("Output.txt", "w")
#text_file.write(d)
#text_file.close()

In [4]:
# Split users 80/10/10 into train, validation, and test sets.
train, testandvalidation = train_test_split(joke_data_df, test_size=0.2)

test, validation = train_test_split(testandvalidation, test_size=0.5)


# Number of latent features per user and per joke.
n_features = 2

user_ratings = train.values
latent_user_preferences = np.random.random((user_ratings.shape[0], n_features))
latent_item_features = np.random.random((user_ratings.shape[1], n_features))

In [5]:
print(latent_item_features.shape)


(100, 2)

In [6]:
print(latent_user_preferences.shape)


(19986, 2)

In [ ]:
def predict_rating(user_id, item_id):
    """ Predict a rating given a user_id and an item_id
        as the dot product of their latent vectors.
    """
    user_preference = latent_user_preferences[user_id]
    item_preference = latent_item_features[item_id]
    return user_preference.dot(item_preference)

def train_step(user_id, item_id, rating, alpha=0.001):
    """ One SGD update of the latent factors for a single observed rating.
        (Named train_step rather than train so it does not shadow the train DataFrame.)
    """
    prediction_rating = predict_rating(user_id, item_id)
    err = prediction_rating - rating
    user_pref_values = latent_user_preferences[user_id][:]
    latent_user_preferences[user_id] -= alpha * err * latent_item_features[item_id]
    latent_item_features[item_id] -= alpha * err * user_pref_values
    return err


def sgd(iterations=30):
    """ Iterate over all users and all items and train for
        a certain number of iterations.
    """
    for iteration in range(iterations):
        error = []
        for user_id in range(latent_user_preferences.shape[0]):
            for item_id in range(latent_item_features.shape[0]):
                rating = user_ratings[user_id][item_id]
                if rating != 99:  # 99 encodes "not rated"
                    err = train_step(user_id, item_id, rating)
                    error.append(err)
        mse = (np.array(error) ** 2).mean()
        if iteration % 1 == 0:  # report after every sweep
            print(mse)

In [ ]:
sgd()


24.4313180103
22.2173312358
20.6392687444
18.797368001
17.8239733082
17.6703184492
17.671873432
17.7162633361
17.7822045135
17.8693072428
17.9867705883
18.1534988068
18.4055079633
18.8256612848
19.756399858
/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:16: RuntimeWarning: overflow encountered in multiply
/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:15: RuntimeWarning: invalid value encountered in subtract
/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:15: RuntimeWarning: overflow encountered in multiply
/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:33: RuntimeWarning: overflow encountered in square
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
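
The MSE falls for a few sweeps, then climbs and finally overflows to nan: with a fixed learning rate and no regularization the factors diverge. A minimal sketch of a more stable update, adding an L2 penalty `lam` and a smaller step size (both values are illustrative assumptions, not tuned for this dataset), might look like:

def train_step_regularized(user_id, item_id, rating, alpha=0.0005, lam=0.05):
    # Same SGD update as above, but shrink both factor vectors toward zero
    # each step (L2 regularization) so they cannot blow up.
    err = predict_rating(user_id, item_id) - rating
    user_vec = latent_user_preferences[user_id].copy()
    item_vec = latent_item_features[item_id].copy()
    latent_user_preferences[user_id] -= alpha * (err * item_vec + lam * user_vec)
    latent_item_features[item_id] -= alpha * (err * user_vec + lam * item_vec)
    return err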

In [ ]:
predictions = latent_user_preferences.dot(latent_item_features.T)

predictions
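
`predictions` is the full reconstructed ratings matrix (users × jokes). One way to sanity-check it, not part of the original notebook, is to compute the mean squared error over the observed (non-99) training entries only; a short sketch:

# Sketch: MSE of the reconstruction on observed training ratings only.
observed = user_ratings != 99            # mask of entries that were actually rated
errors = predictions[observed] - user_ratings[observed]
print("train MSE on observed ratings:", (errors ** 2).mean())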

In [ ]:
values = [list(zip(user_ratings[i], predictions[i])) for i in range(predictions.shape[0])]
comparison_data = pd.DataFrame(values)
comparison_data.columns = joke_data_df.columns
comparison_data.applymap(lambda xy: "(%2.3f|%2.3f)" % xy)

In [ ]:
comparison_data

In [ ]:


In [ ]: