In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import os.path as op
from zipfile import ZipFile
try:
    from urllib.request import urlretrieve
except ImportError:  # Python 2 compat
    from urllib import urlretrieve
ML_100K_URL = "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
ML_100K_FILENAME = ML_100K_URL.rsplit('/', 1)[1]
ML_100K_FOLDER = 'ml-100k'
if not op.exists(ML_100K_FILENAME):
    print('Downloading %s to %s...' % (ML_100K_URL, ML_100K_FILENAME))
    urlretrieve(ML_100K_URL, ML_100K_FILENAME)

if not op.exists(ML_100K_FOLDER):
    print('Extracting %s to %s...' % (ML_100K_FILENAME, ML_100K_FOLDER))
    ZipFile(ML_100K_FILENAME).extractall('.')
In [ ]:
import pandas as pd
raw_ratings = pd.read_csv(op.join(ML_100K_FOLDER, 'u.data'), sep='\t',
                          names=["user_id", "item_id", "rating", "timestamp"])
raw_ratings.head()
In [ ]:
m_cols = ['item_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
items = pd.read_csv(op.join(ML_100K_FOLDER, 'u.item'), sep='|',
                    names=m_cols, usecols=range(5), encoding='latin-1')
items.head()
Let's write a bit of Python preprocessing code to extract the release year as an integer value:
In [ ]:
def extract_year(release_date):
    if hasattr(release_date, 'split'):
        components = release_date.split('-')
        if len(components) == 3:
            # Dates are formatted like '01-Jan-1995': keep the year part.
            return int(components[2])
    # Missing value marker
    return 1920


items['release_year'] = items['release_date'].map(extract_year)
items.hist('release_year', bins=50);
Enrich the raw ratings data with the collected items metadata:
In [ ]:
all_ratings = pd.merge(items, raw_ratings)
In [ ]:
all_ratings.head()
In [ ]:
min_user_id = all_ratings['user_id'].min()
min_user_id
In [ ]:
max_user_id = all_ratings['user_id'].max()
max_user_id
In [ ]:
min_item_id = all_ratings['item_id'].min()
min_item_id
In [ ]:
max_item_id = all_ratings['item_id'].max()
max_item_id
In [ ]:
all_ratings['rating'].describe()
Let's do a bit more pandas magic to compute the popularity of each movie (number of ratings):
In [ ]:
popularity = all_ratings.groupby('item_id').size().reset_index(name='popularity')
items = pd.merge(popularity, items)
items.nlargest(10, 'popularity')
In [ ]:
items["title"][181]
In [ ]:
indexed_items = items.set_index('item_id')
indexed_items["title"][181]
In [ ]:
all_ratings = pd.merge(popularity, all_ratings)
all_ratings.describe()
In [ ]:
all_ratings.head()
Later in the analysis we will assume that this popularity does not come from the ratings themselves but from an external metadata, e.g. box office numbers in the month after the release in movie theaters.
Let's split the enriched data in a train / test split to make it possible to do predictive modeling:
In [ ]:
from sklearn.model_selection import train_test_split
ratings_train, ratings_test = train_test_split(
    all_ratings, test_size=0.2, random_state=0)
user_id_train = np.array(ratings_train['user_id'])
item_id_train = np.array(ratings_train['item_id'])
rating_train = np.array(ratings_train['rating'])
user_id_test = np.array(ratings_test['user_id'])
item_id_test = np.array(ratings_test['item_id'])
rating_test = np.array(ratings_test['rating'])
In [ ]:
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout
from tensorflow.keras.layers import Dot
from tensorflow.keras.models import Model
In [ ]:
# For each sample we input the integer identifiers
# of a single user and a single item
class RegressionModel(Model):
    def __init__(self, embedding_size, max_user_id, max_item_id):
        super().__init__()
        self.user_embedding = Embedding(output_dim=embedding_size,
                                        input_dim=max_user_id + 1,
                                        input_length=1,
                                        name='user_embedding')
        self.item_embedding = Embedding(output_dim=embedding_size,
                                        input_dim=max_item_id + 1,
                                        input_length=1,
                                        name='item_embedding')
        # The following two layers don't have parameters.
        self.flatten = Flatten()
        self.dot = Dot(axes=1)

    def call(self, inputs):
        user_inputs = inputs[0]
        item_inputs = inputs[1]
        user_vecs = self.flatten(self.user_embedding(user_inputs))
        item_vecs = self.flatten(self.item_embedding(item_inputs))
        y = self.dot([user_vecs, item_vecs])
        return y


model = RegressionModel(64, max_user_id, max_item_id)
model.compile(optimizer="adam", loss='mae')
In [ ]:
# Useful for debugging the output shape of the model
initial_train_preds = model.predict([user_id_train, item_id_train])
initial_train_preds.shape
Using initial_train_preds, compute the model errors. Converting a pandas Series to a numpy array is usually implicit, but you may use rating_train.values to do so explicitly. Be sure to monitor the shapes of each object you deal with by using object.shape.
In [ ]:
# %load solutions/compute_errors.py
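A possible solution sketch follows (the loaded solutions/compute_errors.py may differ); with randomly initialized embeddings, both errors should be large:
In [ ]:
# A possible sketch; solutions/compute_errors.py may differ.
# initial_train_preds has shape (n_samples, 1): take the first column
# so that it aligns with the 1D rating_train array before subtracting.
initial_errors = initial_train_preds[:, 0] - rating_train
print("Initial train MSE: %0.3f" % np.mean(initial_errors ** 2))
print("Initial train MAE: %0.3f" % np.mean(np.abs(initial_errors)))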
In [ ]:
%%time
# Training the model
history = model.fit([user_id_train, item_id_train], rating_train,
                    batch_size=64, epochs=10, validation_split=0.1,
                    shuffle=True)
In [ ]:
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.ylim(0, 2)
plt.legend(loc='best')
plt.title('Loss');
Now that the model is trained, its MSE and MAE look nicer:
In [ ]:
def plot_predictions(y_true, y_pred):
    plt.figure(figsize=(4, 4))
    plt.xlim(-1, 6)
    plt.xlabel("True rating")
    plt.ylim(-1, 6)
    plt.ylabel("Predicted rating")
    plt.scatter(y_true, y_pred, s=60, alpha=0.01)
In [ ]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
test_preds = model.predict([user_id_test, item_id_test])
print("Final test MSE: %0.3f" % mean_squared_error(test_preds, rating_test))
print("Final test MAE: %0.3f" % mean_absolute_error(test_preds, rating_test))
plot_predictions(rating_test, test_preds)
In [ ]:
train_preds = model.predict([user_id_train, item_id_train])
print("Final train MSE: %0.3f" % mean_squared_error(train_preds, rating_train))
print("Final train MAE: %0.3f" % mean_absolute_error(train_preds, rating_train))
plot_predictions(rating_train, train_preds)
In [ ]:
# weights and shape
weights = model.get_weights()
[w.shape for w in weights]
In [ ]:
# Solution:
# model.summary()
In [ ]:
user_embeddings = weights[0]
item_embeddings = weights[1]
In [ ]:
item_id = 181
print(f"Title for item_id={item_id}: {indexed_items['title'][item_id]}")
In [ ]:
print(f"Embedding vector for item_id={item_id}")
print(item_embeddings[item_id])
print("shape:", item_embeddings[item_id].shape)
Finding k most similar items to a point in embedding space
Notes:
- np.linalg.norm computes the norm of a vector; you may specify the axis= argument to compute one norm per row of a matrix.
- np.argsort(...) computes the indices that sort a vector.
- indexed_items["title"][idxs] returns the titles of the items indexed by the array idxs.
In [ ]:
EPSILON = 1e-07  # to avoid division by 0.


def cosine(x, y):
    # TODO: implement me!
    return 0.
In [ ]:
# %load solutions/similarity.py
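A possible implementation sketch (the actual solutions/similarity.py may differ), mirroring the vectorized cosine_similarities function used later in this notebook:
In [ ]:
# A possible sketch; solutions/similarity.py may differ.
def cosine(x, y):
    dot_product = np.dot(x, y)
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    # EPSILON avoids dividing by zero for all-zero embedding vectors.
    return dot_product / (norm_product + EPSILON)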
In [ ]:
def print_similarity(item_a, item_b, item_embeddings, titles):
    print(titles[item_a])
    print(titles[item_b])
    similarity = cosine(item_embeddings[item_a],
                        item_embeddings[item_b])
    print(f"Cosine similarity: {similarity:.3}")


print_similarity(50, 181, item_embeddings, indexed_items["title"])
In [ ]:
print_similarity(181, 288, item_embeddings, indexed_items["title"])
In [ ]:
print_similarity(181, 1, item_embeddings, indexed_items["title"])
In [ ]:
print_similarity(181, 181, item_embeddings, indexed_items["title"])
In [ ]:
def cosine_similarities(item_id, item_embeddings):
    """Compute similarities between item_id and all item embeddings."""
    query_vector = item_embeddings[item_id]
    dot_products = item_embeddings @ query_vector
    query_vector_norm = np.linalg.norm(query_vector)
    all_item_norms = np.linalg.norm(item_embeddings, axis=1)
    norm_products = query_vector_norm * all_item_norms
    return dot_products / (norm_products + EPSILON)


similarities = cosine_similarities(181, item_embeddings)
similarities
In [ ]:
plt.hist(similarities, bins=30);
In [ ]:
def most_similar(item_id, item_embeddings, titles, top_n=30):
    sims = cosine_similarities(item_id, item_embeddings)
    # np.argsort sorts in ascending order; [::-1] reverses the result
    # so that the most similar items (largest cosine similarity) come first.
    sorted_indexes = np.argsort(sims)[::-1]
    idxs = sorted_indexes[0:top_n]
    return list(zip(idxs, titles[idxs], sims[idxs]))


most_similar(50, item_embeddings, indexed_items["title"], top_n=10)
In [ ]:
# items[items['title'].str.contains("Star Trek")]
In [ ]:
most_similar(227, item_embeddings, indexed_items["title"], top_n=10)
The similarities do not always make sense: the number of ratings is low and the embeddings do not automatically capture semantic relationships in that context. Better representations arise with a larger number of ratings, less overfitting, or perhaps a better loss function, such as one based on implicit feedback.
In [ ]:
from sklearn.manifold import TSNE
item_tsne = TSNE(perplexity=30).fit_transform(item_embeddings)
In [ ]:
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 10))
plt.scatter(item_tsne[:, 0], item_tsne[:, 1]);
plt.xticks(()); plt.yticks(());
plt.show()
In [ ]:
%pip install -q plotly
In [ ]:
import plotly.express as px
tsne_df = pd.DataFrame(item_tsne, columns=["tsne_1", "tsne_2"])
tsne_df["item_id"] = np.arange(item_tsne.shape[0])
tsne_df = tsne_df.merge(items.reset_index())
px.scatter(tsne_df, x="tsne_1", y="tsne_2",
           color="popularity",
           hover_data=["item_id", "title",
                       "release_year", "popularity"])
Alternatively, with Uniform Manifold Approximation and Projection (UMAP):
In [ ]:
# %pip install umap-learn
In [ ]:
# import umap
# item_umap = umap.UMAP().fit_transform(item_embeddings)
# plt.figure(figsize=(10, 10))
# plt.scatter(item_umap[:, 0], item_umap[:, 1]);
# plt.xticks(()); plt.yticks(());
# plt.show()
In [ ]:
from tensorflow.keras.layers import Concatenate
Note that the deep model below has several issues (a dropout rate of 0.99, a 2-unit tanh output for a scalar rating target, and a binary cross-entropy loss on 1-5 ratings): try to spot and fix them before loading the solution a few cells down.
In [ ]:
class DeepRegressionModel(Model):
    def __init__(self, embedding_size, max_user_id, max_item_id):
        super().__init__()
        self.user_embedding = Embedding(output_dim=embedding_size,
                                        input_dim=max_user_id + 1,
                                        input_length=1,
                                        name='user_embedding')
        self.item_embedding = Embedding(output_dim=embedding_size,
                                        input_dim=max_item_id + 1,
                                        input_length=1,
                                        name='item_embedding')
        # The following two layers don't have parameters.
        self.flatten = Flatten()
        self.concat = Concatenate()
        self.dropout = Dropout(0.99)
        self.dense1 = Dense(64, activation="relu")
        self.dense2 = Dense(2, activation="tanh")

    def call(self, inputs, training=False):
        user_inputs = inputs[0]
        item_inputs = inputs[1]
        user_vecs = self.flatten(self.user_embedding(user_inputs))
        item_vecs = self.flatten(self.item_embedding(item_inputs))
        input_vecs = self.concat([user_vecs, item_vecs])
        y = self.dropout(input_vecs, training=training)
        y = self.dense1(y)
        y = self.dense2(y)
        return y


model = DeepRegressionModel(64, max_user_id, max_item_id)
model.compile(optimizer='adam', loss='binary_crossentropy')
initial_train_preds = model.predict([user_id_train, item_id_train])
In [ ]:
# %load solutions/deep_explicit_feedback_recsys.py
In [ ]:
%%time
history = model.fit([user_id_train, item_id_train], rating_train,
                    batch_size=64, epochs=10, validation_split=0.1,
                    shuffle=True)
In [ ]:
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.ylim(0, 2)
plt.legend(loc='best')
plt.title('Loss');
In [ ]:
train_preds = model.predict([user_id_train, item_id_train])
print("Final train MSE: %0.3f" % mean_squared_error(train_preds, rating_train))
print("Final train MAE: %0.3f" % mean_absolute_error(train_preds, rating_train))
In [ ]:
test_preds = model.predict([user_id_test, item_id_test])
print("Final test MSE: %0.3f" % mean_squared_error(test_preds, rating_test))
print("Final test MAE: %0.3f" % mean_absolute_error(test_preds, rating_test))
The performance of this model is not necessarily significantly better than that of the previous model, but you can notice that the gap between train and test errors is smaller, probably thanks to the use of dropout.
Furthermore, this model is more flexible in the sense that we can extend it to include metadata for a hybrid recommender system, as we will see in the following.
Manual tuning of so many hyperparameters is tedious. In practice it's better to automate the design of the model using a hyperparameter search tool such as Keras Tuner or Optuna.
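As an illustration, here is a hypothetical sketch of such a search using Keras Tuner (assuming keras-tuner is installed; the searched hyperparameter and the trial budget below are arbitrary choices, not part of this lab):
In [ ]:
# Hypothetical sketch using Keras Tuner (pip install keras-tuner);
# the searched hyperparameter and trial budget are arbitrary choices.
import keras_tuner as kt

def build_model(hp):
    # Search over the embedding size of the deep regression model.
    embedding_size = hp.Choice("embedding_size", [16, 32, 64])
    model = DeepRegressionModel(embedding_size, max_user_id, max_item_id)
    model.compile(optimizer="adam", loss="mae")
    return model

tuner = kt.RandomSearch(build_model, objective="val_loss", max_trials=5)
tuner.search([user_id_train, item_id_train], rating_train,
             validation_split=0.1, epochs=5)
best_model = tuner.get_best_models(num_models=1)[0]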
In [ ]:
from sklearn.preprocessing import QuantileTransformer
meta_columns = ['popularity', 'release_year']
scaler = QuantileTransformer()
item_meta_train = scaler.fit_transform(ratings_train[meta_columns])
item_meta_test = scaler.transform(ratings_test[meta_columns])
In [ ]:
class HybridModel(Model):
    def __init__(self, embedding_size, max_user_id, max_item_id):
        super().__init__()
        self.user_embedding = Embedding(output_dim=embedding_size,
                                        input_dim=max_user_id + 1,
                                        input_length=1,
                                        name='user_embedding')
        self.item_embedding = Embedding(output_dim=embedding_size,
                                        input_dim=max_item_id + 1,
                                        input_length=1,
                                        name='item_embedding')
        # The following two layers don't have parameters.
        self.flatten = Flatten()
        self.concat = Concatenate()
        self.dense1 = Dense(64, activation="relu")
        self.dropout = Dropout(0.3)
        self.dense2 = Dense(64, activation='relu')
        self.dense3 = Dense(1)

    def call(self, inputs, training=False):
        user_inputs = inputs[0]
        item_inputs = inputs[1]
        meta_inputs = inputs[2]
        user_vecs = self.flatten(self.user_embedding(user_inputs))
        user_vecs = self.dropout(user_vecs, training=training)
        item_vecs = self.flatten(self.item_embedding(item_inputs))
        item_vecs = self.dropout(item_vecs, training=training)
        input_vecs = self.concat([user_vecs, item_vecs, meta_inputs])
        y = self.dense1(input_vecs)
        y = self.dropout(y, training=training)
        y = self.dense2(y)
        y = self.dropout(y, training=training)
        y = self.dense3(y)
        return y


model = HybridModel(64, max_user_id, max_item_id)
model.compile(optimizer='adam', loss='mae')
initial_train_preds = model.predict([user_id_train,
                                     item_id_train,
                                     item_meta_train])
In [ ]:
%%time
history = model.fit([user_id_train, item_id_train, item_meta_train],
                    rating_train,
                    batch_size=64, epochs=10, validation_split=0.1,
                    shuffle=True)
In [ ]:
test_preds = model.predict([user_id_test, item_id_test, item_meta_test])
print("Final test MSE: %0.3f" % mean_squared_error(test_preds, rating_test))
print("Final test MAE: %0.3f" % mean_absolute_error(test_preds, rating_test))
The additional metadata seems to improve the predictive power of the model a bit, but this should be re-run several times to assess the impact of the random initialization of the model.
Once the model is trained, the system can be used to recommend a few items that a user hasn't already seen: use model.predict to compute the ratings the user would have given to all candidate items.
In [ ]:
def recommend(user_id, top_n=10):
    item_ids = range(1, max_item_id + 1)
    seen_mask = all_ratings["user_id"] == user_id
    seen_movies = set(all_ratings[seen_mask]["item_id"])
    item_ids = list(filter(lambda x: x not in seen_movies, item_ids))
    print("User %d has seen %d movies, including:" % (user_id, len(seen_movies)))
    for title in all_ratings[seen_mask].nlargest(20, 'popularity')['title']:
        print("  ", title)
    print("Computing ratings for %d other movies:" % len(item_ids))
    item_ids = np.array(item_ids)
    user_ids = np.full_like(item_ids, user_id)
    items_meta = scaler.transform(indexed_items[meta_columns].loc[item_ids])
    rating_preds = model.predict([user_ids, item_ids, items_meta])
    # np.argsort returns positions within the candidate array, not item ids:
    # map the best positions back to the corresponding ids and titles.
    best_positions = np.argsort(rating_preds[:, 0])[::-1][:top_n]
    return [(indexed_items["title"][item_ids[pos]], rating_preds[pos, 0])
            for pos in best_positions]
In [ ]:
for title, pred_rating in recommend(5):
    print(" %0.1f: %s" % (pred_rating, title))
In [ ]:
import numpy as np
np.unique(rating_train)
Maybe we can help the model by forcing it to predict those discrete values, treating the problem as a multiclass classification problem. The main required changes are: a 5-unit softmax output layer (one class per rating value), a (sparse) categorical cross-entropy loss, and shifting the ratings from the 1-5 range to 0-4 class indices, as sketched after the solution placeholder below.
In [ ]:
# %load solutions/classification.py
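A possible sketch of these changes (the actual solutions/classification.py may differ; the sketch keeps the layer sizes of the earlier models and omits dropout for brevity):
In [ ]:
# A possible sketch; solutions/classification.py may differ.
class ClassificationModel(Model):
    def __init__(self, embedding_size, max_user_id, max_item_id):
        super().__init__()
        self.user_embedding = Embedding(output_dim=embedding_size,
                                        input_dim=max_user_id + 1,
                                        input_length=1,
                                        name='user_embedding')
        self.item_embedding = Embedding(output_dim=embedding_size,
                                        input_dim=max_item_id + 1,
                                        input_length=1,
                                        name='item_embedding')
        self.flatten = Flatten()
        self.concat = Concatenate()
        self.dense1 = Dense(64, activation='relu')
        # One output per possible rating value (1 to 5).
        self.dense2 = Dense(5, activation='softmax')

    def call(self, inputs):
        user_vecs = self.flatten(self.user_embedding(inputs[0]))
        item_vecs = self.flatten(self.item_embedding(inputs[1]))
        y = self.dense1(self.concat([user_vecs, item_vecs]))
        return self.dense2(y)


model = ClassificationModel(64, max_user_id, max_item_id)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
# Shift ratings from 1-5 to the 0-4 class indices expected by the loss.
history = model.fit([user_id_train, item_id_train], rating_train - 1,
                    batch_size=64, epochs=10, validation_split=0.1)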
In [ ]: