Our dataset is provided by GroupLens and can be found here. We will be using the latest small and full datasets. The task at hand is to fill those missing ratings:
One simple technique is to use the user rating matrix and find similar users who might rated those movies. First, we need a similarity metric to tell us how similar two users are. We will use Pearson correlation for that:
where:
Possible similarity values are between $-1$ and $1$.
How to make predictions using the similarity function? Here is one option:
where $U$ is the set of all users.
In [78]:
library(tidyverse)
library(ggthemes)
library(lubridate)
library(stringr)
library(wordcloud)
library(recommenderlab)
library(reshape2)
theme_set(theme_bw())
theme_update(plot.title = element_text(hjust = 0.5))
In [2]:
dataset_files <- c("movies", "ratings", "links", "tags")
dataset_small <- "ml-latest-small"
dataset_full <- "ml-latest"
dataset <- dataset_full
data_folder <- "data"
suffix <- ".csv"
for (f in dataset_files) {
path <- file.path(data_folder, dataset, paste0(f, suffix))
assign(f, read_csv(path))
print(paste(f, "object size is", format(object.size(get(f)),units="Mb")))
}
In [3]:
glimpse(ratings)
In [4]:
ratings <- ratings %>%
mutate(timestamp = as_datetime(timestamp))
In [5]:
glimpse(ratings)
In [6]:
glimpse(movies)
In [7]:
movies <- movies %>%
# trim whitespaces
mutate(title = str_trim(title)) %>%
# split title to title, year
extract(title, c("title_tmp", "year"), regex = "^(.*) \\(([0-9 \\-]*)\\)$", remove = F) %>%
# for series take debut date
mutate(year = if_else(str_length(year) > 4, as.integer(str_split(year, "-", simplify = T)[1]), as.integer(year))) %>%
# replace title NA's with original title
mutate(title = if_else(is.na(title_tmp), title, title_tmp)) %>%
# drop title_tmp column
select(-title_tmp) %>%
# generic function to turn (no genres listed) to NA
mutate(genres = if_else(genres == "(no genres listed)", `is.na<-`(genres), genres))
In [8]:
head(movies)
In [9]:
nrow(movies %>%
filter(is.na(title) | is.na(year)))
In [10]:
glimpse(tags)
In [11]:
tags <- tags %>%
mutate(timestamp = as_datetime(timestamp))
In [12]:
glimpse(tags)
In [13]:
movies_per_year <- movies %>%
na.omit() %>% # omit missing values
select(movieId, year) %>% # select columns we need
group_by(year) %>% # group by year
summarise(count = n()) %>% # count movies per year
arrange(year)
In [14]:
head(movies_per_year)
In [15]:
# fill missing years
movies_per_year <- movies_per_year %>%
complete(year = full_seq(year, 1), fill = list(count = 0))
In [16]:
head(movies_per_year)
In [17]:
movies_per_year %>%
ggplot(aes(x = year, y = count)) +
geom_line() +
ggtitle("Movies produced by year")
In [18]:
genres <- movies %>%
separate_rows(genres, sep = "\\|") %>%
group_by(genres) %>%
summarise(number = n()) %>%
arrange(desc(number))
In [19]:
head(genres, 10)
In [20]:
genres_popularity_per_year <- movies %>%
na.omit() %>% # omit missing values
select(movieId, year, genres) %>% # select columns we are interested in
separate_rows(genres, sep = "\\|") %>% # separate genres into rows
mutate(genres = as.factor(genres)) %>% # turn genres in factors
group_by(year, genres) %>% # group data by year and genre
summarise(number = n()) %>% # count
complete(year = full_seq(year, 1), genres, fill = list(number = 0)) # add missing years/genres
In [21]:
head(genres_popularity_per_year)
In [22]:
genres_popularity_per_year %>%
filter(year > 1930) %>%
filter(genres %in% c("Drama", "Comedy", "Western", "Sci-Fi", "Documentary")) %>%
ggplot(aes(x = year, y = number)) +
geom_line(aes(color=genres)) +
ggtitle("Movies (of genre) produced by year")
In [23]:
genres_tags <- movies %>%
na.omit() %>%
select(movieId, year, genres) %>%
separate_rows(genres, sep = "\\|") %>%
inner_join(tags, by = "movieId") %>%
select(genres, tag) %>%
group_by(genres) %>%
nest()
In [24]:
genre <- "Drama"
genre_words <- genres_tags %>%
filter(genres == genre) %>%
unnest() %>%
mutate(tag = str_to_lower(tag, "en")) %>%
anti_join(tibble(tag=c(tolower(genre)))) %>%
count(tag)
In [25]:
wordcloud(genre_words$tag, genre_words$n, max.words = 30, colors=brewer.pal(8, "Dark2"))
In [26]:
avg_rating <- ratings %>%
inner_join(movies, by = "movieId") %>%
na.omit() %>%
select(movieId, title, rating, year) %>%
group_by(movieId, title, year) %>%
summarise(count = n(), mean = mean(rating), min = min(rating), max = max(rating)) %>%
ungroup() %>%
arrange(desc(mean))
In [27]:
head(avg_rating, 10)
That doesn't seem right. Let's try again using the IMDB weighted average rating:
In [28]:
# R = average for the movie (mean) = (Rating)
# v = number of votes for the movie = (votes)
# m = minimum votes required to be listed in the Top 250
# C = the mean vote across the whole report
weighted_rating <- function(R, v, m, C) {
return (v/(v+m))*R + (m/(v+m))*C
}
In [29]:
avg_rating <- avg_rating %>%
mutate(wr = weighted_rating(mean, count, 500, mean(mean))) %>%
arrange(desc(wr))
In [30]:
head(avg_rating)
In [31]:
best_per_decade <- avg_rating %>%
mutate(decade = year %/% 10 * 10) %>%
arrange(year, desc(wr)) %>%
group_by(decade) %>%
summarise(
title = first(title),
year = first(year),
wr = first(wr),
mean = first(mean),
count = first(count)
)
In [32]:
best_per_decade
In [33]:
avg_rating %>%
ggplot(aes(mean)) +
geom_histogram(binwidth = 0.5) +
ggtitle("Average rating per movie")
In [34]:
head(ratings)
In [35]:
ratings_per_user <- ratings %>%
group_by(userId) %>%
summarise(count = n()) %>%
arrange(desc(count))
In [36]:
head(ratings_per_user)
In [37]:
ratings_per_user %>%
ggplot(aes(count)) +
geom_histogram(bins=100) +
xlab("# rated movies") +
ggtitle("# rated movies per user")
In [38]:
ratings_per_user %>%
filter(count < 500) %>%
ggplot(aes(count)) +
geom_histogram(bins=100) +
xlab("# rated movies") +
ggtitle("# rated movies per user")
In [39]:
movies_small <- read_csv("data/ml-latest-small/movies.csv")
ratings_small <- read_csv("data/ml-latest-small/ratings.csv")
In [40]:
ratings_matrix <-
dcast(ratings_small,
userId ~ movieId,
value.var = "rating",
na.rm = FALSE)
In [41]:
head(ratings_matrix)
In [42]:
ratings_matrix <- as.matrix(ratings_matrix[, -1])
In [43]:
head(ratings_matrix)
In [44]:
ratings_matrix <- as(ratings_matrix, "realRatingMatrix")
In [45]:
hist(getRatings(ratings_matrix), breaks="FD")
Some users consistently give high (or low) ratings to all movies they watch. We can try to remove this effect by normalizing our data in such a way that the average rating of each user is 0. This seems to be
In [46]:
ratings_norm <- normalize(ratings_matrix)
In [47]:
hist(getRatings(ratings_norm), breaks="FD")
In [48]:
evaluation_scheme <- evaluationScheme(
ratings_matrix,
method = "cross-validation",
k = 5,
given = 3,
goodRating = 5
)
algorithms <- list(
"random items" = list(name = "RANDOM", param = NULL),
"popular items" = list(name = "POPULAR", param = NULL),
"user-based CF (cosine)" = list(
name = "UBCF",
param = list(
method = "Cosine",
nn = 40
)
),
"user-based CF (pearson)" = list(
name = "UBCF",
param = list(
method = "Pearson",
nn = 40
)
)
)
In [49]:
eval_results <- evaluate(
evaluation_scheme,
algorithms,
n = c(1, 3, 5, 10, 15, 20) # top 1, 3, 5 etc recommendations
)
In [50]:
plot(eval_results, legend = "topleft")
In [51]:
plot(eval_results, "prec/rec")
In [52]:
avg(eval_results[[3]])
In [85]:
model <- Recommender(
ratings_matrix,
method = "UBCF",
param = list(
normalize = "center",
method = "Pearson",
nn = 30
)
)
Let's recommend new movies for a user:
In [101]:
user_ratings <- as(ratings_matrix[3, ], "list")[[1]]
In [102]:
d <- as_tibble(user_ratings)
d$movieId = as.integer(rownames(d))
colnames(d) <- c("rating", "movieId")
rownames(d) <- NULL
In [103]:
d %>%
inner_join(movies_small, by = "movieId")
In [104]:
predictions <- predict(model, ratings_matrix[3], n = 10)
In [105]:
recommendations <- predictions@items[[1]]
In [106]:
recom_result <- as.data.frame(movies_small[recommendations, ])
In [107]:
recom_result
In [108]:
user_ratings <- data.frame(movieId = colnames(ratings_matrix))
user_ratings$rating <- NA
In [109]:
movies_with_ratings <- movies_small[-which(
(movies_small$movieId %in% ratings_small$movieId) == FALSE
),]
In [110]:
movies_with_ratings %>%
filter(str_detect(title, 'Alien'))
In [111]:
change_rating <- function(data, movie_id, new_rating = 5) {
return(
data %>%
mutate(rating = replace(rating, movieId == movie_id, new_rating))
)
}
In [112]:
user_ratings <- change_rating(user_ratings, 1214)
user_ratings <- change_rating(user_ratings, 1200)
In [113]:
movies_with_ratings %>%
filter(str_detect(title, 'Inception'))
In [114]:
user_ratings <- change_rating(user_ratings, 79132)
In [115]:
movies_with_ratings %>%
filter(str_detect(title, 'The Dark Knight'))
In [116]:
user_ratings <- change_rating(user_ratings, 98124)
user_ratings <- change_rating(user_ratings, 99813, 4.5)
In [117]:
movies_with_ratings %>%
filter(str_detect(title, 'Matrix'))
In [118]:
user_ratings <- change_rating(user_ratings, 2571)
user_ratings <- change_rating(user_ratings, 6365, 5)
user_ratings <- change_rating(user_ratings, 6934, 4.5)
In [119]:
rownames(user_ratings) <- user_ratings$movieId
In [120]:
user_pref = as(
t(as.matrix(user_ratings['rating'])),
"realRatingMatrix"
)
In [121]:
user_pred <- predict(model, user_pref, n = 10)
In [122]:
user_rec <- user_pred@items[[1]]
In [123]:
user_recom_result <- as.data.frame(movies_small[user_rec, ])
In [124]:
user_recom_result