In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Import data
path = "../data/petdata_binary_1000_100.csv"
raw_data = pd.read_csv(path, index_col="doc_uri")
assert raw_data.shape == (1000, 100), "Import error, df has unexpected shape"
In [3]:
# Convert df
data = raw_data.unstack().to_frame().reset_index()
data.columns = ["user", "doc_uri", "rating"]
# Missing value handling: unrated user/doc combinations become 0
data.fillna(0, inplace=True)
assert data.shape == (raw_data.shape[0] * raw_data.shape[1], 3), "Conversion error, df has unexpected shape"
assert data.rating.max() <= 1., "Value error, max rating over upper bound"
assert data.rating.min() >= -1., "Value error, min rating under lower bound"
data.head()
Out[3]:
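As a quick sanity check, pivoting the long frame back should reproduce the original wide matrix. A sketch (the .loc reindex only restores the original row/column order, since pivot sorts labels):
In [ ]:
# Round-trip check (sketch): long -> wide should match the raw matrix
# with NaNs replaced by 0.
wide = data.pivot(index="doc_uri", columns="user", values="rating")
wide = wide.loc[raw_data.index, raw_data.columns]
assert (wide == raw_data.fillna(0)).all().all(), "Round trip failed"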
In [4]:
data.rating.describe().to_frame().T
Out[4]:
In [5]:
data.rating.value_counts(normalize=True).to_frame().T
Out[5]:
In [6]:
# Plot distribution of the (random) ratings
hist = data.rating.plot(kind="hist", grid=True,
                        bins=[-1.1, -0.9, -0.1, 0.1, 0.9, 1.1])
hist.set(xlabel="rating")
plt.tight_layout()
plt.savefig("plots/ratings_binary.png", orientation="landscape", dpi=120)
In [7]:
from surprise import KNNWithMeans, SVD, NMF, Dataset, Reader, accuracy
from surprise.prediction_algorithms.random_pred import NormalPredictor
from surprise.model_selection import cross_validate, GridSearchCV
reader = Reader(rating_scale=(-1, 1))
ds = Dataset.load_from_df(data[["user", "doc_uri", "rating"]], reader)
baseline_model = NormalPredictor() # Baseline model, samples predictions from a normal distribution fitted to the training ratings
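NormalPredictor ignores users and items entirely: it fits a normal distribution to the training ratings (maximum likelihood estimates $\hat{\mu}$, $\hat{\sigma}$) and samples each prediction from it. A minimal sketch of the idea on our ratings column (illustration only, not the library's actual code):
In [ ]:
# Sketch of the baseline's logic: draw "predictions" from a normal
# distribution fitted to the observed ratings.
mu_hat, sigma_hat = data.rating.mean(), data.rating.std()
print("mu: {:0.3f}, sigma: {:0.3f}".format(mu_hat, sigma_hat))
print("Sampled predictions:", np.round(np.random.normal(mu_hat, sigma_hat, size=5), 3))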
In [8]:
sim_options = {"name": "cosine", # cosine similarity
"user_based": True, # user-based
"min_support": 10 # min number of common items, else pred 0
}
user_knn = KNNWithMeans(sim_options=sim_options)
In [9]:
sim_options = {"name": "cosine", # cosine similarity
"user_based": False, # item-based
"min_support": 5 # min number of common users, else pred 0
}
item_knn = KNNWithMeans(sim_options=sim_options)
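For reference, KNNWithMeans predicts by adjusting the target's mean rating with similarity-weighted deviations of the $k$ nearest neighbours; in the user-based case: $\hat r_{ui} = \mu_{u} + \frac{\sum_{v \in N^{k}_{i}(u)} \text{sim}(u, v) \cdot (r_{vi} - \mu_{v})}{\sum_{v \in N^{k}_{i}(u)} \text{sim}(u, v)}$. The item-based variant swaps the roles of users and items.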
In [10]:
for algo_name, algo in zip(["Baseline", "User-based CF", "Item-based CF"],
[baseline_model, user_knn, item_knn]):
history = cross_validate(algo, ds, measures=["RMSE", "MAE"], cv=5, verbose=False)
print("***", algo_name, "***")
print("RMSE: {:0.3f} (std {:0.4f}) <- {}".format(history["test_rmse"].mean(),
history["test_rmse"].std(),
history["test_rmse"]))
print("MAE: {:0.3f} (std {:0.4f}) <- {}".format(history["test_mae"].mean(),
history["test_mae"].std(),
history["test_mae"]))
print("Avg fit time: {:0.5f}s".format(np.array(history["fit_time"]).mean()))
In [11]:
# Models - tune parameters, if you'd like ;)
svd = SVD() # Singular value decomposition
pmf = SVD(biased=False) # Probabilistic matrix factorization
nmf = NMF() # Non-negative matrix factorization
Predictions
SVD: $\hat r_{ui} = \mu + b_{u} + b_{i} + q^{\mathrm{T}}_{i} p_{u}$
Probabilistic MF: $\hat r_{ui} = q^{\mathrm{T}}_{i} p_{u}$
Non-negative MF: $\hat r_{ui} = q^{\mathrm{T}}_{i} p_{u}$ with $p_{u}, q_{i} \in \mathbb{R}_{+}$
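To make the SVD formula concrete, here is a sketch that reconstructs $\hat r_{ui}$ by hand from a fitted model's factors; pu, qi, bu and bi are documented attributes of a fitted SVD, and trainset.global_mean is $\mu$. Note that model.predict() additionally clips its estimate to the rating scale, so the two values can differ at the boundaries. The throwaway fit below is for illustration only:
In [ ]:
# Reconstruct r_ui = mu + b_u + b_i + q_i^T p_u by hand (sketch).
demo = SVD()
demo_trainset = ds.build_full_trainset()
demo.fit(demo_trainset)
raw_uid, raw_iid = data.user.iloc[0], data.doc_uri.iloc[0]
u = demo_trainset.to_inner_uid(raw_uid)   # map raw ids to inner ids
i = demo_trainset.to_inner_iid(raw_iid)
manual_est = (demo_trainset.global_mean + demo.bu[u] + demo.bi[i]
              + np.dot(demo.qi[i], demo.pu[u]))
print("By hand:        ", manual_est)
print("model.predict():", demo.predict(raw_uid, raw_iid).est)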
In [12]:
for algo_name, algo in zip(["SVD", "Probabilistic MF", "Non-negative MF"],
[svd, pmf, nmf]):
history = cross_validate(algo, ds, measures=["RMSE", "MAE"], cv=5, verbose=False)
print("***", algo_name, "***")
print("RMSE: {:0.3f} (std {:0.4f}) <- {}".format(history["test_rmse"].mean(),
history["test_rmse"].std(),
history["test_rmse"]))
print("MAE: {:0.3f} (std {:0.4f}) <- {}".format(history["test_mae"].mean(),
history["test_mae"].std(),
history["test_mae"]))
print("Avg fit time: {:0.5f}s".format(np.array(history["fit_time"]).mean()))
Nope, there isn't much of an improvement. But maybe fine-tuning the two most promising models will help.
In [13]:
# SVD
param_svd = {"n_factors": [1, 100],
"n_epochs": [5, 20],
"reg_all": [0.02, 0.08], # regularization term for all param
"lr_all": [0.001, 0.005]} # learning rate for all param
gs_svd = GridSearchCV(SVD, param_svd, measures=["rmse", "mae"], cv=5)
gs_svd.fit(ds)
print("Best RMSE:", gs_svd.best_score["rmse"])
best_params_svd = gs_svd.best_params["rmse"]
for param in best_params_svd:
print(param, ":", best_params_svd[param])
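With two candidate values for each of the four parameters, the grid spans $2^4 = 16$ combinations, i.e. $16 \cdot 5 = 80$ fits at 5-fold CV. The per-combination results can be inspected via cv_results (a documented GridSearchCV attribute):
In [ ]:
# All 16 parameter combinations, best RMSE first.
pd.DataFrame(gs_svd.cv_results).sort_values("mean_test_rmse").head()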
In [14]:
# NMF
param_nmf = {"n_factors": [15, 100],
"n_epochs": [50, 60],
#"biased": [True, False],
#"reg_pu": [0.04, 0.06, 0.08], # regularization term for users
#"reg_qi": [0.04, 0.06, 0.08], # regularization term for items
"lr_bu": [0.001, 0.005], # learning rate for user bias term
"lr_bi": [0.001, 0.005]} # learning rate for item bias term
gs_nmf = GridSearchCV(NMF, param_nmf, measures=["rmse"], cv=5)
gs_nmf.fit(ds)
print("Best RMSE:", gs_nmf.best_score["rmse"])
best_params_nmf = gs_nmf.best_params["rmse"]
for param in best_params_nmf:
print(param, ":", best_params_nmf[param])
In [15]:
# Train final model
trainset = ds.build_full_trainset()
model = gs_svd.best_estimator["rmse"]
model.fit(trainset)
# RMSE of final model, evaluated on its own training data
testset = trainset.build_testset()
test_pred = model.test(testset)
accuracy.rmse(test_pred, verbose=True) # overly optimistic, we test on the training set ;)
Out[15]:
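Computing the RMSE on the very data the model was trained on is overly optimistic. A more honest sketch holds out a test split first (train_test_split comes from surprise.model_selection; the 80/20 split and the seed are arbitrary choices):
In [ ]:
# Sketch: evaluate the tuned parameters on a proper held-out split.
from surprise.model_selection import train_test_split
ho_train, ho_test = train_test_split(ds, test_size=0.2, random_state=42)
ho_model = SVD(**best_params_svd)
ho_model.fit(ho_train)
accuracy.rmse(ho_model.test(ho_test), verbose=True)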
In [20]:
combinations_to_predict = [("Aaron Keith III", "http://www.bell.com/main.php"),
                           ("Linda Torres", "http://www.martin-harris.org/main/"),
                           ("Veronica Jackson", "https://www.carter.com/"),
                           ("Cindy Jones", "https://www.garcia.com/homepage/")]
In [21]:
# Predictions
for user, doc in combinations_to_predict:
    pred = model.predict(user, doc)
    pred_string = "like" if pred.est > 0 else "dislike"  # estimated rating > 0 => "like", else "dislike"
    print(pred.uid, "should **>", pred_string, "<**", pred.iid)
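For reference, predict() returns a Prediction namedtuple with the fields uid, iid, r_ui, est and details; est is clipped to the rating scale unless clip=False is passed:
In [ ]:
# One full Prediction object, for inspection.
example = model.predict("Aaron Keith III", "http://www.bell.com/main.php")
print(example)
print("Estimated rating:", example.est)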
In [ ]: