In [ ]:
import pandas as pd
import numpy as np
import math
import pickle
import matplotlib.pyplot as plt
%matplotlib inline

In [ ]:
def loading_data(filepath):
    # load the headerless ratings file and name its columns
    ml = pd.read_csv(filepath, header=None)
    ml.columns = ['User', 'Item', 'ItemRating']
    return ml
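
The loader assumes a headerless, comma-separated file of User, Item, ItemRating rows. A quick toy check of that assumed format (the file path is hypothetical):

In [ ]:
# write and load a tiny ratings file in the assumed format (path is hypothetical)
with open("data/toy-ratings.csv", "w") as f:
    f.write("1,10,4\n1,20,5\n2,10,3\n")
loading_data("data/toy-ratings.csv")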

In [ ]:
def create_interaction_cov(ml):
    # pivot the transactions into a user x item ratings matrix
    ml_user_item_matrix = ml.pivot(index='User', columns='Item', values='ItemRating')
    ml_user_item_matrix = ml_user_item_matrix.fillna(0)
    ml_user_item_matrix = ml_user_item_matrix.reindex(
        index=range(ml_user_item_matrix.index.max() + 1),
        columns=range(ml_user_item_matrix.columns.max() + 1),
        fill_value=0)

    # user-user "covariance" similarity matrix: the inner products X.X^T
    # (the rows are not mean-centered, so strictly this is a Gram matrix)
    cov_ml = np.dot(ml_user_item_matrix.values, ml_user_item_matrix.T.values)

    return ml_user_item_matrix, cov_ml
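
A minimal sketch with an assumed 3x2 toy interaction matrix, showing that X.X^T produces user-user similarities (the diagonal holds each user's self-similarity):

In [ ]:
# toy users-by-items ratings (values assumed)
X = np.array([[5, 0],
              [4, 1],
              [0, 3]])
np.dot(X, X.T)
# array([[25, 20,  0],
#        [20, 17,  3],
#        [ 0,  3,  9]])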

In [ ]:
def neighbors(cov_ml, user):
    # 'cov_ml' is the user-user similarity matrix
    # sort all users by similarity to 'user', descending; the first entry
    # is the user itself (largest self-similarity), which 'prediction'
    # filters out because the target item is unrated by that user
    nn = np.argsort(-cov_ml[user, :])
    return nn
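
A quick check, with an assumed toy similarity row, that argsort on the negated row really sorts neighbors from most to least similar:

In [ ]:
row = np.array([2.0, 9.0, 5.0])  # toy similarity row (values assumed)
np.argsort(-row)                 # array([1, 2, 0]): most similar user first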

In [ ]:
def prediction(ml_user_item_matrix, cov_ml, nn, user, item, number_of_n=10):
    neighbors = []

    # collect the 'number_of_n' most similar users who have rated the item
    for n in nn:
        if len(neighbors) >= number_of_n:
            break
        if n in ml_user_item_matrix.index:
            # enforcing that neighbors have rated the item
            if ml_user_item_matrix.loc[n, item] != 0:
                neighbors.append(n)

    # total similarity weight of the N neighbors
    # (assumes at least one neighbor rated the item, so the total is > 0)
    total_distance = cov_ml[user, neighbors].sum()

    # proportion of the total weight carried by each neighbor
    weighted_input = cov_ml[user, neighbors] / total_distance

    # ratings of the item to be predicted, from each of the neighbors
    neighbors_ratings = np.array([ml_user_item_matrix.loc[e, item] for e in neighbors])

    # prediction is the similarity-weighted average of the neighbors' ratings
    weighted_rate = neighbors_ratings * weighted_input
    prediction_rate = weighted_rate.sum()

    return weighted_input, neighbors_ratings, prediction_rate
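
A sanity check of the weighted-average step, using assumed similarities and ratings: the weights are the similarities normalized to sum to 1, and the prediction is their dot product with the neighbors' ratings.

In [ ]:
# toy values (assumed): three neighbors with these similarities and ratings
sims = np.array([0.5, 0.3, 0.2])
rates = np.array([4.0, 3.0, 5.0])
weights = sims / sims.sum()   # already sums to 1 here: [0.5, 0.3, 0.2]
(weights * rates).sum()       # 0.5*4 + 0.3*3 + 0.2*5 = 3.9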

Load training set and create interaction matrix


In [ ]:
ml2 = loading_data("data/ml1m-train-clean")
ml_user_item_matrix, cov_ml = create_interaction_cov(ml2)

In [ ]:
ml_user_item_matrix.shape, cov_ml.shape

The reindexed matrix has 6041 user rows and 3953 item columns (IDs start at 1; the reindex pads from index 0 up to the maximum ID)

  • When we create the user covariance similarity matrix we therefore end up with a 6041x6041 U/U matrix

In [ ]:
ml_user_item_matrix.head()

In [ ]:
# find neighbors of user "1"
u = 1
nn = neighbors(cov_ml, u)

In [ ]:
nn

In [ ]:
i = 2018

In [ ]:
# make a prediction for user 1 and item 2018
weighted_input, ratings_for_item, prediction_rate = prediction(ml_user_item_matrix, cov_ml, nn, u, i, number_of_n=10)

In [ ]:
prediction_rate

Load validation set


In [ ]:
val = pd.read_csv("data/ml1m-validation-clean", header=None)
val.columns = ['User','Item','ItemRating']

In [ ]:
val.head()

Make Predictions

To make predictions, we need to:

  • find the neighbors of user "u"
  • calculate a weighted average of the ratings of movie "i" for the neighbors of "u"

Let's calculate the neighbors for each user in the validation set and add them to the dataframe


In [ ]:
val["Neighbors"] = val["User"].apply(lambda x: neighbors(cov_ml, x))

In [ ]:
val.head()

In [ ]:
users = val["User"].values
items = val["Item"].values
nn = val["Neighbors"].values

Let's test the prediction process on the first 5 rows of the validation set, using 10 nearest neighbors


In [ ]:
prediction_rates = []
for i in range(5):
    prediction_rates.append(prediction(ml_user_item_matrix, cov_ml, nn[i], users[i], items[i], number_of_n=10)[2])

In [ ]:
prediction_rates

In [ ]:
# Example for the 200-neighbor run: 'prediction_rates200' comes from running
# the prediction loop over the full validation set with number_of_n=200
pred200 = pd.Series(prediction_rates200)
val200['Prediction'] = pred200.values

Calculate RMSE

The prediction loop was run for a range of neighbor counts. Each output was added to a column named "Prediction", and the resulting dataframes were pickled under the following names (a sketch of the loop follows the list):

  • val200.pickle
  • val300.pickle
  • val400.pickle
  • val500.pickle
  • val600.pickle
  • val700.pickle
  • val800.pickle
  • val900.pickle
  • val1000.pickle
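
A hedged sketch of how those files could have been produced (loop bounds and file names are assumed from the list above):

In [ ]:
# one full pass over the validation set per neighbor count (a sketch)
for n in range(200, 1001, 100):
    preds = [prediction(ml_user_item_matrix, cov_ml, nn[i], users[i], items[i],
                        number_of_n=n)[2]
             for i in range(len(val))]
    out = val.copy()
    out["Prediction"] = preds
    out.to_pickle("data/val%d.pickle" % n)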

In [ ]:
def calculate_RMSE(pickle_path):
    # Load pickled file with "Prediction"
    nn = pd.read_pickle(pickle_path)
    # Calculate "Error" for each prediction
    nn["Error"] = (nn["ItemRating"] - nn["Prediction"])**2
    number_of_preds = nn.shape[0]
    # Root mean square error
    rmse = math.sqrt(nn["Error"].sum() / number_of_preds)
    return rmse
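
A quick sanity check of the RMSE computation on a two-row toy frame (the path is hypothetical):

In [ ]:
toy = pd.DataFrame({"ItemRating": [4.0, 2.0], "Prediction": [3.0, 2.0]})
toy.to_pickle("data/toy.pickle")   # hypothetical path
calculate_RMSE("data/toy.pickle")  # sqrt((1 + 0) / 2) ~= 0.7071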

In [ ]:
rmse200 = calculate_RMSE("data/val200.pickle")

In [ ]:
rmse200

In [ ]:
rmse300 = calculate_RMSE("data/val300.pickle")
rmse400 = calculate_RMSE("data/val400.pickle")
rmse500 = calculate_RMSE("data/val500.pickle")
rmse600 = calculate_RMSE("data/val600.pickle")
rmse700 = calculate_RMSE("data/val700.pickle")
rmse800 = calculate_RMSE("data/val800.pickle")
rmse900 = calculate_RMSE("data/val900.pickle")
rmse1000 = calculate_RMSE("data/val1000.pickle")

In [ ]:
x = np.linspace(200, 1000, 9)

In [ ]:
x

In [ ]:
rmse_values = [rmse200, rmse300, rmse400, rmse500, rmse600, rmse700, rmse800, rmse900, rmse1000]

In [ ]:
data = { "neighbors": x, "COV-RMSE": rmse_values}

In [ ]:
data

In [ ]:
# named neighbors_df so it does not shadow the neighbors() function above
neighbors_df = pd.DataFrame(data=data, columns=["neighbors", "COV-RMSE"])

In [ ]:
neighbors_df.head()

In [ ]:
ax = neighbors_df.plot(x="neighbors", style=['rx'], figsize=(8,4));
ax.set_ylabel("RMSE");
ax.set_xlabel("Neighbors");
ax.set_title("Evaluation Scores");

Comparing results


In [ ]:
l2v_exp = pd.read_pickle("data/l2v-neighbors.pickle")
l2v_neighbors = l2v_exp[["pred_neighbors","eval_score"]]
l2v_neighbors.columns = ["pred_neighbors","L2V-RMSE"]
l2v_neighbors.head()

In [ ]:
# adding COV-RMSE to the dataframe containing L2V-RMSE, one neighbor count at a time
# l2v_neighbors.loc[l2v_neighbors.pred_neighbors == 200, "COV-RMSE"] = rmse200
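
One hedged way to fill that column for every neighbor count at once is to merge in the neighbors_df frame built above (cast to int so the keys match, assuming pred_neighbors is integer-typed):

In [ ]:
# merge all COV-RMSE values into the L2V frame in one step (a sketch)
cov_df = neighbors_df.rename(columns={"neighbors": "pred_neighbors"})
cov_df["pred_neighbors"] = cov_df["pred_neighbors"].astype(int)
l2v_neighbors = l2v_neighbors.merge(cov_df, on="pred_neighbors", how="left")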

In [84]:
ax = l2v_neighbors.plot( x="pred_neighbors",style=['o','rx'], figsize=(8,4));
ax.set_ylabel("RMSE");
ax.set_xlabel("Neighbors");
ax.set_title("Evaluation Scores");


The L2V hyperparameters used in the comparison come from the experiments listed below (their configs are in the "/CONFIG" folder in S3)


In [85]:
import sqlalchemy
import psycopg2

In [86]:
db_string = "postgresql://localhost:5433/jaimealmeida"
engine = sqlalchemy.create_engine(db_string)

In [92]:
the_frame = pd.read_sql_query(
    "SELECT experimentid, pred_neighbors, eval_score "
    "FROM experiments "
    "WHERE pred_neighbors > 100 AND pred_neighbors < 1100;",
    engine)
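
If the bounds ever need to vary, bound parameters are safer than string interpolation; a sketch using sqlalchemy.text:

In [ ]:
from sqlalchemy import text
query = text("SELECT experimentid, pred_neighbors, eval_score "
             "FROM experiments "
             "WHERE pred_neighbors > :lo AND pred_neighbors < :hi;")
the_frame = pd.read_sql_query(query, engine, params={"lo": 100, "hi": 1100})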

In [94]:
the_frame.sort_values(by="pred_neighbors")


Out[94]:
   experimentid                            pred_neighbors  eval_score
4  du05-d100w10l80n10d30-p5q1-200-072717              200    1.527724
1  du05-d100w10l80n10d30-p5q1-300-072717              300    1.449006
2  du05-d100w10l80n10d30-p5q1-400-072717              400    1.409381
3  du05-d100w10l80n10d30-p5q1-500-072717              500    1.383879
5  du05-d100w10l80n10d30-p5q1-600-072717              600    1.368604
6  du05-d100w10l80n10d30-p5q1-700-072717              700    1.356443
7  du05-d100w10l80n10d30-p5q1-800-072717              800    1.348276
8  du05-d100w10l80n10d30-p5q1-900-072717              900    1.342677
0  du05-d100w10l80n10d30-p5q1-1000-072717            1000    1.337528
