In this notebook we are going to explain how the model presented in the paper 'Multi-Criteria Service Recommendation Based on User Criteria Preferences' was implemented.
To test this implementation a dataset containing reviews from TripAdvisor was chosen. This dataset consists of 878,561 reviews (1.3GB) from 4,333 hotels crawled from TripAdvisor.
The implementation is maninly composed of two big steps, which are data preprocessing and rating prediction.
In this step the reviews are filtered in order to obtain only the reviews that are useful for the recommendation purposes. The steps followed to clean the data and get it ready for the recommendation process are:
In [ ]:
def remove_empty_user_reviews(reviews):
"""
Returns a copy of the original reviews list without the reviews made by
users who have an empty ID
:param reviews: a list of reviews
:return: a copy of the original reviews list without the reviews made by
users who have an empty ID
"""
filtered_reviews = [review for review in reviews if
review['user_id'] != '']
return filtered_reviews
def extract_fields(reviews):
ratings_criteria = [
'cleanliness',
'location',
'rooms',
'service',
# 'sleep_quality',
'value'
]
for review in reviews:
# review['user_id'] = get_dictionary_subfield(review, ['author', 'id'])
review['user_id'] = review['author']['id']
review['overall_rating'] = review['ratings']['overall']
ratings = review['ratings']
for criterion in ratings_criteria:
if criterion in ratings:
review[criterion + '_rating'] = review['ratings'][criterion]
def remove_users_with_low_reviews(reviews, min_reviews):
"""
Returns a copy of the original reviews list without the reviews made by
users who have made less than min_reviews reviews
:param reviews: a list of reviews
:param min_reviews: the minimum number of reviews a user must have in order
not to be removed from the reviews list
:return: a copy of the original reviews list without the reviews made by
users who have made less than min_reviews reviews
"""
users = get_user_list(reviews, min_reviews)
return ETLUtils.filter_records(reviews, 'user_id', users)
def remove_single_review_hotels(reviews):
"""
TODO: Missing, this method is just a stub
Returns a copy of the original reviews list without the reviews of hotels
that just have been reviewed once
:param reviews: a list of reviews
:return: a copy of the original reviews list without the reviews of hotels
that just have been reviewed once
"""
items = get_item_list(reviews, 2)
return ETLUtils.filter_records(reviews, 'offering_id', items)
def remove_missing_ratings_reviews(reviews):
"""
Returns a copy of the original reviews list without the reviews that have
missing ratings
:param reviews: a list of reviews
:return: a copy of the original reviews list without the reviews that have
missing ratings
"""
filtered_reviews = [review for review in reviews if
verify_rating_criteria(review)]
return filtered_reviews
def verify_rating_criteria(review):
"""
Verifies if the given review contains all the ratings criteria required.
Returns True in case the review contains all the necessary keys. Returns
False otherwise. For example, if a review contains
{'ratings': {'cleanliness':3, 'location':4, 'rooms': 5, 'service':3, 'value': 4}, ...}
It will return True because all the desired criteria is present. But if a
review contains {'ratings': {'cleanliness':3, 'value': 4}, ...} it will
return False because it doesn't contain values for 'location', 'rooms' and
'service'
:param review: a dictionary with the review information
:return: True in case all the desired ratings criteria are present in the
review, False otherwise
"""
expected_criteria = [
'cleanliness',
'location',
'rooms',
'service',
# 'sleep_quality',
'value'
]
expected_criteria = set(expected_criteria)
actual_criteria = set(review['ratings'])
return expected_criteria.issubset(actual_criteria)
def clean_reviews(reviews):
"""
Returns a copy of the original reviews list with only that are useful for
recommendation purposes
:param reviews: a list of reviews
:return: a copy of the original reviews list with only that are useful for
recommendation purposes
"""
filtered_reviews = remove_empty_user_reviews(reviews)
filtered_reviews = remove_missing_ratings_reviews(filtered_reviews)
print('Finished remove_missing_ratings_reviews')
filtered_reviews = remove_users_with_low_reviews(filtered_reviews, 10)
print('Finished remove_users_with_low_reviews')
filtered_reviews = remove_single_review_hotels(filtered_reviews)
print('Finished remove_single_review_hotels')
return filtered_reviews
def pre_process_reviews():
"""
Returns a list of preprocessed reviews, where the reviews have been filtered
to obtain only relevant data, have dropped any fields that are not useful,
and also have additional fields that are handy to make calculations
:return: a list of preprocessed reviews
"""
data_folder = '../../../../../../datasets/TripAdvisor/Four-City/'
review_file_path = data_folder + 'review.txt'
# review_file_path = data_folder + 'review-short.json'
reviews = ETLUtils.load_json_file(review_file_path)
select_fields = ['ratings', 'author', 'offering_id']
reviews = ETLUtils.select_fields(select_fields, reviews)
extract_fields(reviews)
ETLUtils.drop_fields(['author'], reviews)
reviews = clean_reviews(reviews)
return reviews
def create_ratings_matrix(reviews):
"""
Returns (ratings_matrix, overall_ratings_list), where ratings_matrix is a
list of lists containing the values for all the rating criteria (except
the overall rating), so for each review a list of rating values is returned.
overall_ratings_list is a list containing the overall rating for each of the
reviews. This function verifies that reviews don't have any missing ratings,
in case there are ratings missing, those are not included in the returning
values
:param reviews: a list of reviews
:return: (ratings_matrix, overall_ratings_list), where ratings_matrix is a
list of lists containing the values for all the rating criteria (except
the overall rating), and overall_ratings_list is a list containing the
overall rating for each of the reviews
"""
ratings_matrix = []
overall_ratings_list = []
missing_count = 0
for review in reviews:
ratings = review['ratings']
ratings_list = []
rating_criteria = [
'cleanliness',
'location',
'rooms',
'service',
# 'sleep_quality',
'value'
]
contains_missing_rating = 0
for criterion in rating_criteria:
if criterion in ratings:
ratings_list.append(ratings[criterion])
missing_count += contains_missing_rating
# If there are not missing ratings, we add the ratings to the matrix
# In other words, we are ignoring reviews with missing ratings
if not contains_missing_rating:
ratings_matrix.append(ratings_list)
overall_ratings_list.append(ratings['overall'])
review['ratings_list'] = ratings_list
# review['overall_rating'] = ratings['overall']
# print(missing_count)
return ratings_matrix, overall_ratings_list
def get_user_list(reviews, min_reviews):
"""
Returns the list of users that have reviewed at least min_reviews hotels
:param reviews: the list of reviews
:param min_reviews: the minimum number of reviews
:return: a list of user IDs
"""
data_frame = DataFrame(reviews)
column = 'user_id'
counts = data_frame.groupby(column).size()
filtered_counts = counts[counts >= min_reviews]
# print(filtered_counts)
num_users = len(filtered_counts)
num_reviews = filtered_counts.sum()
print('Number of users: %i' % num_users)
print('Number of reviews: %i' % num_reviews)
users = filtered_counts.index.get_level_values(1).tolist()
return users
def get_groupby_list(reviews, column):
"""
Groups the reviews by the given column and then returns all the distinct
column values in a list
:param reviews: the list of reviews
:param column: the column which is going to be used to group the data
:return: a list of all the distinct values of the given column in the
reviews
"""
data_frame = DataFrame(reviews)
counts = data_frame.groupby(column).size()
users = counts.index.get_level_values(1).tolist()
return users
def get_item_list(reviews, min_reviews):
"""
Returns the list of items that have at least min_reviews
:param reviews: the list of reviews
:param min_reviews: the minimum number of reviews
:return: a list of item IDs
"""
data_frame = DataFrame(reviews)
column = 'offering_id'
counts = data_frame.groupby(column).size()
filtered_counts = counts[counts >= min_reviews]
# print(filtered_counts)
num_items = len(filtered_counts)
num_reviews = filtered_counts.sum()
print('Number of items: %i' % num_items)
print('Number of reviews: %i' % num_reviews)
items = filtered_counts.index.get_level_values(1).tolist()
return items
def get_user_average_overall_rating(reviews, user_id, apply_filter=True):
"""
Returns the average of the overall ratings that this user has given to
every item he/she has reviewed
:param reviews: a list of reviews
:param user_id: the ID of the user
:return: the average (or mean) of all the overall ratings that this has
given to all the items he/she has reviewed
"""
if apply_filter:
user_reviews = ETLUtils.filter_records(reviews, 'user_id', [user_id])
else:
user_reviews = reviews
ratings_sum = 0.
ratings_count = len(user_reviews)
for review in user_reviews:
ratings_sum += review['overall_rating']
average_rating = float(ratings_sum) / float(ratings_count)
return average_rating
def get_criteria_weights(reviews, user_id, apply_filter=True):
"""
Obtains the weights for each of the criterion of the given user
:param reviews: a list of all the available reviews
:param user_id: the ID of the user
:return: a list with the weights for each of the criterion of the given user
"""
# filtered_reviews = ETLUtils.filter_records(reviews, 'user_id', [user_id])
if apply_filter:
user_reviews = ETLUtils.filter_records(reviews, 'user_id', [user_id])
else:
user_reviews = reviews
ratings_matrix, overall_ratings_list = create_ratings_matrix(user_reviews)
overall_ratings_matrix = numpy.vstack(
[overall_ratings_list, numpy.ones(len(overall_ratings_list))]).T
m, c = numpy.linalg.lstsq(overall_ratings_matrix, ratings_matrix)[0]
return m
def get_significant_criteria(criteria_weights):
"""
Returns (significant_criteria, cluster_name) where significant_criteria is a
dictionary with the criteria that are significant and their values.
cluster_name is the name of the cluster in which a user with the obtained
significant criteria must belong
:param criteria_weights: a list with the weights for each criterion
:return: (significant_criteria, cluster_name) where significant_criteria is
a dictionary with the criteria that are significant and their values.
cluster_name is the name of the cluster in which a user with the obtained
significant criteria must belong
"""
rating_criteria = [
'cleanliness',
'location',
'rooms',
'service',
# 'sleep_quality',
'value'
]
cluster_name = ''
significant_criteria = {}
for index, value in enumerate(criteria_weights):
# if (0.8 < value < 1.2) or (-1.2 < value < -0.8):
# if (0.7 < value < 1.3) or (-1.3 < value < -0.7):
# if (0.5 < value < 1.5) or (-1.5 < value < -0.5):
# if (0.1 < value < 1.9) or (-1.1 < value < -0.9):
if True:
significant_criteria[rating_criteria[index]] = value
cluster_name += '1'
else:
cluster_name += '0'
return significant_criteria, cluster_name
def initialize_users(reviews):
"""
Builds a dictionary containing all the users in the reviews. Each user
contains information about its average overall rating, the list of reviews
that user has made, and the cluster the user belongs to
:param reviews: the list of reviews
:return: a dictionary with the users initialized, the keys of the
dictionaries are the users' ID
"""
user_ids = get_groupby_list(reviews, 'user_id')
user_dictionary = {}
for user_id in user_ids:
user = User(user_id)
user_reviews = ETLUtils.filter_records(reviews, 'user_id', [user_id])
user.average_overall_rating = get_user_average_overall_rating(
user_reviews, user_id, apply_filter=False)
user.criteria_weights = get_criteria_weights(
user_reviews, user_id, apply_filter=False)
_, user.cluster = get_significant_criteria(user.criteria_weights)
user_dictionary[user_id] = user
print('Total users: %i' % len(user_ids))
return user_dictionary
def get_five_star_hotels_from_user(user_reviews, min_value):
"""
Returns the list of hotels that this user has reviewed with an average
overall rating higher than min_value
:param user_reviews: the reviews the user has made
:param min_value: the minimum value for the average overall rating that this
user has given to a hotel
:return: the list of hotels that this user has reviewed with an average
overall rating higher than min_value
"""
data_frame = DataFrame(user_reviews)
column = 'offering_id'
counts = data_frame.groupby(column).mean()
filtered_counts = counts[counts['overall_rating'] >= min_value]
# print(filtered_counts)
items = filtered_counts.index.get_level_values(1).tolist()
return items
We also create some auxiliary functions to help us in the recommendation process. The first function will help us to obtain the list of users that have made for than N reviews. The second function will extract the multi-criteria ratings and overall ratings of each review in a matrix-like form and a list.
In [ ]:
def create_ratings_matrix(reviews):
"""
Returns (ratings_matrix, overall_ratings_list), where ratings_matrix is a
list of lists containing the values for all the rating criteria (except
the overall rating), so for each review a list of rating values is returned.
overall_ratings_list is a list containing the overall rating for each of the
reviews. This function verifies that reviews don't have any missing ratings,
in case there are ratings missing, those are not included in the returning
values
:param reviews: a list of reviews
:return: (ratings_matrix, overall_ratings_list), where ratings_matrix is a
list of lists containing the values for all the rating criteria (except
the overall rating), and overall_ratings_list is a list containing the
overall rating for each of the reviews
"""
ratings_matrix = []
overall_ratings_list = []
missing_count = 0
for review in reviews:
ratings = review['ratings']
ratings_list = []
rating_criteria = [
'cleanliness',
'location',
'rooms',
'service',
# 'sleep_quality',
'value'
]
contains_missing_rating = 0
for criterion in rating_criteria:
if criterion in ratings:
ratings_list.append(ratings[criterion])
missing_count += contains_missing_rating
# If there are not missing ratings, we add the ratings to the matrix
# In other words, we are ignoring reviews with missing ratings
if not contains_missing_rating:
ratings_matrix.append(ratings_list)
overall_ratings_list.append(ratings['overall'])
review['ratings_list'] = ratings_list
review['overall_rating'] = ratings['overall']
# print(missing_count)
return ratings_matrix, overall_ratings_list
def get_user_list(reviews, min_reviews):
"""
Returns the list of users that have reviewed at least min_reviews hotels
:param reviews: the list of reviews
:param min_reviews: the minimum number of reviews
:return: a list of user IDs
"""
data_frame = DataFrame(reviews)
column = 'user_id'
counts = data_frame.groupby(column).size()
filtered_counts = counts[counts >= min_reviews]
# print(filtered_counts)
num_users = len(filtered_counts)
num_reviews = filtered_counts.sum()
print('Number of users: %i' % num_users)
print('Number of reviews: %i' % num_reviews)
users = filtered_counts.index.get_level_values(1).tolist()
return users
In order to predict the overall rating that a user would give to a hotel, we must first determine which criteria are relevant for that user, to do so, we start by performing a linear regression between the overall (y) rating of a user and its multi-criteria ratings (x). This will help us to see how much each criterion influences the overall rating.
To do so, we execute the following function that will return us the weight of each criterion over the overall rating.
In [ ]:
def get_criteria_weights(reviews, user_id):
"""
Obtains the weights for each of the criterion of the given user
:param reviews: a list of all the available reviews
:param user_id: the ID of the user
:return: a list with the weights for each of the criterion of the given user
"""
filtered_reviews = ETLUtils.filter_records(reviews, 'user_id', [user_id])
ratings_matrix, overall_ratings_list = extractor.create_ratings_matrix(
filtered_reviews)
overall_ratings_matrix = numpy.vstack(
[overall_ratings_list, numpy.ones(len(overall_ratings_list))]).T
m, c = numpy.linalg.lstsq(overall_ratings_matrix, ratings_matrix)[0]
return m
Now that we have the weights for each of the criterion, we must determine, which of this criteria are significant and which are not. We can use the following function to do that. We have set some dummy values for the range.
In [ ]:
def get_significant_criteria(criteria_weights):
"""
Returns (significant_criteria, cluster_name) where significant_criteria is a
dictionary with the criteria that are significant and their values.
cluster_name is the name of the cluster in which a user with the obtained
significant criteria must belong
:param criteria_weights: a list with the weights for each criterion
:return: (significant_criteria, cluster_name) where significant_criteria is
a dictionary with the criteria that are significant and their values.
cluster_name is the name of the cluster in which a user with the obtained
significant criteria must belong
"""
rating_criteria = [
'cleanliness',
'location',
'rooms',
'service',
# 'sleep_quality',
'value'
]
cluster_name = ''
significant_criteria = {}
for index, value in enumerate(criteria_weights):
if (0.7 < value < 1.3) or (-1.3 < value < -0.7):
significant_criteria[rating_criteria[index]] = value
cluster_name += '1'
else:
cluster_name += '0'
return significant_criteria, cluster_name
Having determine which criteria is significant for each user, we can cluster together the users that share exactly the same significant criteria. The next function will do that for us
In [ ]:
def build_user_clusters(reviews):
"""
Builds a series of clusters for users according to their significant
criteria. Users that have exactly the same significant criteria will belong
to the same cluster.
:param reviews: the list of reviews
:return: a dictionary where all the keys are the cluster names and the
values for those keys are list of users that belong to that cluster
"""
min_reviews = 10
user_list = extractor.get_user_list(reviews, min_reviews)
user_cluster_dictionary = {}
for user in user_list:
weights = get_criteria_weights(reviews, user)
significant_criteria, cluster_name = get_significant_criteria(weights)
if cluster_name in user_cluster_dictionary:
user_cluster_dictionary[cluster_name].append(user)
else:
user_cluster_dictionary[cluster_name] = [user]
for key in user_cluster_dictionary.keys():
print(key, len(user_cluster_dictionary[key]))
return user_cluster_dictionary
Finally, we implement the function that will predict the overall rating of a user for a hotel.
In [ ]:
def clu_overall(reviews, user_id, user_cluster_dictionary, hotel_id):
single_review = ETLUtils.filter_records(reviews, 'user_id', [user_id])
single_review = ETLUtils.filter_records(single_review, 'offering_id',
[hotel_id])
actual_overall_rating = single_review[0]['ratings']['overall']
# TODO: In case a user has more than one review for the same hotel
# TODO: take the average of the overall ratings in those reviews
weights = get_criteria_weights(reviews, user_id)
significant_criteria, cluster_name = get_significant_criteria(weights)
# We remove the given user from the cluster in order to avoid bias
cluster_users = list(user_cluster_dictionary[cluster_name])
cluster_users.remove(user_id)
filtered_reviews = ETLUtils.filter_records(reviews, 'offering_id',
[hotel_id])
filtered_reviews = ETLUtils.filter_out_records(filtered_reviews, 'user_id',
[user_id])
ratings_sum = 0
ratings_count = 0
for user in cluster_users:
user_reviews = ETLUtils.filter_records(filtered_reviews, 'user_id',
[user])
for review in user_reviews:
ratings_sum += review['overall_rating']
ratings_count += 1
average_rating = 0.
error = 0.
if ratings_count > 0:
average_rating = float(ratings_sum) / float(ratings_count)
error = abs(average_rating - actual_overall_rating)
return average_rating, error
To test the accuracy of the prediction we use the mean average error metric.
In [ ]:
def calculate_mean_average_error(reviews, user_cluster_dictionary):
"""
Calculates the mean average error for the predicted rating
:param reviews: the list of all reviews
:param user_cluster_dictionary: a dictionary where all the keys are the
cluster names and the values for those keys are list of users that belong to
that cluster
:return: the mean average error after predicting all the overall ratings
"""
num_ratings = 0.
total_error = 0.
for review in reviews:
average_rating, error = clu_overall(
reviews, review['user_id'], user_cluster_dictionary,
review['offering_id'])
if average_rating != 0:
total_error += error
num_ratings += 1
mean_absolute_error = total_error / num_ratings
return mean_absolute_error
def main():
reviews = extractor.pre_process_reviews()
user_cluster_dictionary = build_user_clusters(reviews)
mean_absolute_error = calculate_mean_average_error(reviews, user_cluster_dictionary)
print('Mean Absolute error: %f' % mean_absolute_error)
In [ ]:
class CluCFEuc:
def __init__(self):
self.reviews = extractor.pre_process_reviews()
self.user_dictionary = extractor.initialize_users(self.reviews)
self.user_cluster_dictionary = fourcity_clusterer.build_user_clusters(self.reviews)
self.users = extractor.get_groupby_list(self.reviews, 'user_id')
self.items = extractor.get_groupby_list(self.reviews, 'offering_id')
self.user_reviews_dictionary = fourcity_clusterer.build_user_reviews_dictionary(self.reviews, self.users)
def clu_cf_euc(self, user_id, hotel_id):
average_overall_rating = self.user_dictionary[
user_id].average_overall_rating
cluster_name = self.user_dictionary[user_id].cluster
# We remove the given user from the cluster in order to avoid bias
cluster_users = list(self.user_cluster_dictionary[cluster_name])
cluster_users.remove(user_id)
similarities_sum = 0.
similarities_ratings_sum = 0.
for cluster_user in cluster_users:
cluster_user_overall_rating = self.user_dictionary[cluster_user].average_overall_rating
users_similarity = self.calculate_users_similarity(cluster_user, user_id)
user_item_rating = fourcity_clusterer.get_user_item_overall_rating(
self.user_reviews_dictionary[user_id], user_id, hotel_id)
if user_item_rating is not None:
similarities_sum += users_similarity
similarities_ratings_sum += users_similarity * (user_item_rating - cluster_user_overall_rating)
predicted_rating = None
error = None
if similarities_sum > 0:
predicted_rating = \
average_overall_rating + similarities_ratings_sum / similarities_sum
if predicted_rating > 5:
predicted_rating = 5
elif predicted_rating < 1:
predicted_rating = 1
error = abs(predicted_rating - average_overall_rating)
# print('Predicted rating: %f' % predicted_rating)
# print('Average overall rating: %f' % average_overall_rating)
return predicted_rating, error
def clu_cf_euc_list(self):
predicted_ratings = []
errors = []
index = 0
print('CluCFEuc')
print('Total reviews: %i' % len(self.reviews))
for review in self.reviews:
print('Index: %i' % index)
index += 1
predicted_rating, error = self.clu_cf_euc(
review['user_id'], review['offering_id'])
predicted_ratings.append(predicted_rating)
errors.append(error)
return predicted_ratings, errors
def clu_cf_euc_items(self):
predicted_ratings = {}
errors = []
index = 0
print('CluCFEuc Items')
print('Total reviews: %i' % len(self.reviews))
for user in self.users:
print('Index: %i' % index)
index += 1
predicted_ratings[user] = {}
five_star_hotels = extractor.get_five_star_hotels_from_user(self.user_reviews_dictionary[user], 4.5)
print(five_star_hotels)
for item in self.items:
predicted_rating, _ = self.clu_cf_euc(user, item)
# predicted_ratings[user][item] = self.clu_cf_euc(self.reviews, user, item)
if predicted_rating is not None:
predicted_ratings[user][item] = predicted_rating
print(predicted_ratings[user])
print(predicted_ratings)
return predicted_ratings, errors
def calculate_users_similarity(self, user_id1, user_id2):
"""
Calculates the similarity between two users based on how similar are their
ratings in the reviews
:param user_id1: the ID of user 1
:param user_id2: the ID of user 2
:return: a float with the similarity between the two users. Since this
function is based on euclidean distance to calculate the similarity, a
similarity of 0 indicates that the users share exactly the same tastes
"""
user_weights1 = self.user_dictionary[user_id1].criteria_weights
user_weights2 = self.user_dictionary[user_id2].criteria_weights
return fourcity_clusterer.calculate_euclidean_distance(user_weights1, user_weights2)
CluCFEuc
Mean Absolute error: 0.614083
Root mean square error: 0.842096
--- 327.711871862 seconds ---
Mean Absolute error: 0.621729
Root mean square error: 0.841825
--- 127.151209831 seconds ---
Mean Absolute error: 0.626290
Root mean square error: 0.839320
--- 119.938344002 seconds ---
Mean Absolute error: 0.631229
Root mean square error: 0.839463
--- 230.939920902 seconds ---
Mean Absolute error: 0.632573
Root mean square error: 0.837022
--- 133.390977859 seconds ---