Our dataset is provided by GroupLens and can be found here. We will be using the latest small and full datasets. The task at hand is to fill those missing ratings:

One simple technique is to use the user rating matrix and find similar users who might rated those movies. First, we need a similarity metric to tell us how similar two users are. We will use Pearson correlation for that:

$$\mathrm{sim}(a, b) = \frac{\sum_{m \in M}(r_{a, m} - \overline{r}_a)(r_{b, m} - \overline{r}_b)}{\sqrt{\sum_{m \in M}(r_{a, m} - \overline{r}_a)^2} \sqrt{\sum_{m \in M}(r_{b, m} - \overline{r}_b)^2}} $$

where:

  • $a, b$ are users
  • $r_{a, m}$: rating of user $a$ for movie $m$
  • $M$: set of movies rated by both $a$ and $b$

Possible similarity values are between $-1$ and $1$.

How to make predictions using the similarity function? Here is one option:

$$pred(a, m) = \overline{r}_a + \frac{\sum_{b \in U}\mathrm{sim}(a, b) * (r_{b, m} - \overline{r}_b)}{\sum_{b \in U}\mathrm{sim}(a, b)}$$

where $U$ is the set of all users.


In [78]:
library(tidyverse)
library(ggthemes)
library(lubridate)
library(stringr)
library(wordcloud)
library(recommenderlab)
library(reshape2)

theme_set(theme_bw())
theme_update(plot.title = element_text(hjust = 0.5))

In [2]:
dataset_files <- c("movies", "ratings", "links", "tags")
dataset_small <- "ml-latest-small"
dataset_full <- "ml-latest"
dataset <- dataset_full
data_folder <- "data"
suffix <- ".csv"

for (f in dataset_files) {
  path <- file.path(data_folder, dataset, paste0(f, suffix))
  assign(f, read_csv(path))
  print(paste(f, "object size is", format(object.size(get(f)),units="Mb")))
}


Parsed with column specification:
cols(
  movieId = col_integer(),
  title = col_character(),
  genres = col_character()
)
[1] "movies object size is 4.3 Mb"
Parsed with column specification:
cols(
  userId = col_integer(),
  movieId = col_integer(),
  rating = col_double(),
  timestamp = col_integer()
)
[1] "ratings object size is 496.4 Mb"
Parsed with column specification:
cols(
  movieId = col_integer(),
  imdbId = col_character(),
  tmdbId = col_integer()
)
[1] "links object size is 2.8 Mb"
Parsed with column specification:
cols(
  userId = col_integer(),
  movieId = col_integer(),
  tag = col_character(),
  timestamp = col_integer()
)
[1] "tags object size is 17.5 Mb"

In [3]:
glimpse(ratings)


Observations: 26,024,289
Variables: 4
$ userId    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ movieId   <int> 110, 147, 858, 1221, 1246, 1968, 2762, 2918, 2959, 4226, ...
$ rating    <dbl> 1.0, 4.5, 5.0, 5.0, 5.0, 4.0, 4.5, 5.0, 4.0, 4.0, 5.0, 5....
$ timestamp <int> 1425941529, 1425942435, 1425941523, 1425941546, 142594155...

In [4]:
ratings <- ratings %>%
  mutate(timestamp = as_datetime(timestamp))

In [5]:
glimpse(ratings)


Observations: 26,024,289
Variables: 4
$ userId    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ movieId   <int> 110, 147, 858, 1221, 1246, 1968, 2762, 2918, 2959, 4226, ...
$ rating    <dbl> 1.0, 4.5, 5.0, 5.0, 5.0, 4.0, 4.5, 5.0, 4.0, 4.0, 5.0, 5....
$ timestamp <dttm> 2015-03-09 22:52:09, 2015-03-09 23:07:15, 2015-03-09 22:...

In [6]:
glimpse(movies)


Observations: 45,843
Variables: 3
$ movieId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, ...
$ title   <chr> "Toy Story (1995)", "Jumanji (1995)", "Grumpier Old Men (19...
$ genres  <chr> "Adventure|Animation|Children|Comedy|Fantasy", "Adventure|C...

In [7]:
movies <- movies %>%
  # trim whitespaces
  mutate(title = str_trim(title)) %>%
  # split title to title, year
  extract(title, c("title_tmp", "year"), regex = "^(.*) \\(([0-9 \\-]*)\\)$", remove = F) %>%
  # for series take debut date
  mutate(year = if_else(str_length(year) > 4, as.integer(str_split(year, "-", simplify = T)[1]), as.integer(year))) %>%
  # replace title NA's with original title
  mutate(title = if_else(is.na(title_tmp), title, title_tmp)) %>%
  # drop title_tmp column
  select(-title_tmp)  %>%
  # generic function to turn (no genres listed) to NA
  mutate(genres = if_else(genres == "(no genres listed)", `is.na<-`(genres), genres))

In [8]:
head(movies)


movieIdtitleyeargenres
1 Toy Story 1995 Adventure|Animation|Children|Comedy|Fantasy
2 Jumanji 1995 Adventure|Children|Fantasy
3 Grumpier Old Men 1995 Comedy|Romance
4 Waiting to Exhale 1995 Comedy|Drama|Romance
5 Father of the Bride Part II 1995 Comedy
6 Heat 1995 Action|Crime|Thriller

In [9]:
nrow(movies %>%
  filter(is.na(title) | is.na(year)))


196

In [10]:
glimpse(tags)


Observations: 753,170
Variables: 4
$ userId    <int> 1, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 49, 49...
$ movieId   <int> 318, 4306, 89302, 89302, 89302, 89302, 96079, 113315, 113...
$ tag       <chr> "narrated", "Dreamworks", "England", "espionage", "jazz",...
$ timestamp <int> 1425942391, 1459855607, 1400778834, 1400778836, 140077884...

In [11]:
tags <- tags %>%
  mutate(timestamp = as_datetime(timestamp))

In [12]:
glimpse(tags)


Observations: 753,170
Variables: 4
$ userId    <int> 1, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 49, 49...
$ movieId   <int> 318, 4306, 89302, 89302, 89302, 89302, 96079, 113315, 113...
$ tag       <chr> "narrated", "Dreamworks", "England", "espionage", "jazz",...
$ timestamp <dttm> 2015-03-09 23:06:31, 2016-04-05 11:26:47, 2014-05-22 17:...

How many movies were produced per year?


In [13]:
movies_per_year <- movies %>%
  na.omit() %>% # omit missing values
  select(movieId, year) %>% # select columns we need
  group_by(year) %>% # group by year
  summarise(count = n())  %>% # count movies per year
  arrange(year)

In [14]:
head(movies_per_year)


yearcount
1874 1
1888 2
1890 1
1891 4
1892 1
189412

In [15]:
# fill missing years

movies_per_year <- movies_per_year %>%
  complete(year = full_seq(year, 1), fill = list(count = 0))

In [16]:
head(movies_per_year)


yearcount
18741
18750
18760
18770
18780
18790

In [17]:
movies_per_year %>%
    ggplot(aes(x = year, y = count)) +
    geom_line() +
    ggtitle("Movies produced by year")


What genres were popular during the years?


In [18]:
genres <- movies %>%
  separate_rows(genres, sep = "\\|") %>%
  group_by(genres) %>%
  summarise(number = n()) %>%
  arrange(desc(number))

In [19]:
head(genres, 10)


genresnumber
Drama 19806
Comedy 13002
Thriller 6761
Romance 6069
Action 5775
Horror 4448
Crime 4247
Documentary 4122
Adventure 3369
Sci-Fi 2847

In [20]:
genres_popularity_per_year <- movies %>%
  na.omit() %>% # omit missing values
  select(movieId, year, genres) %>% # select columns we are interested in
  separate_rows(genres, sep = "\\|") %>% # separate genres into rows
  mutate(genres = as.factor(genres)) %>% # turn genres in factors
  group_by(year, genres) %>% # group data by year and genre
  summarise(number = n()) %>% # count
  complete(year = full_seq(year, 1), genres, fill = list(number = 0)) # add missing years/genres

In [21]:
head(genres_popularity_per_year)


yeargenresnumber
1874 Action 0
1874 Adventure0
1874 Animation0
1874 Children 0
1874 Comedy 0
1874 Crime 0

In [22]:
genres_popularity_per_year %>%
  filter(year > 1930) %>%
  filter(genres %in% c("Drama", "Comedy", "Western", "Sci-Fi", "Documentary")) %>%
  ggplot(aes(x = year, y = number)) +
    geom_line(aes(color=genres)) +
    ggtitle("Movies (of genre) produced by year")


What tags summarize a specific movie genre?


In [23]:
genres_tags <- movies %>%
  na.omit() %>%
  select(movieId, year, genres) %>%
  separate_rows(genres, sep = "\\|") %>%
  inner_join(tags, by = "movieId") %>%
  select(genres, tag) %>%
  group_by(genres) %>%
  nest()

In [24]:
genre <- "Drama"

genre_words <- genres_tags %>%
  filter(genres == genre) %>%
  unnest() %>%
  mutate(tag = str_to_lower(tag, "en")) %>%
  anti_join(tibble(tag=c(tolower(genre)))) %>%
  count(tag)


Joining, by = "tag"

In [25]:
wordcloud(genre_words$tag, genre_words$n, max.words = 30, colors=brewer.pal(8, "Dark2"))


Warning message in wordcloud(genre_words$tag, genre_words$n, max.words = 30, colors = brewer.pal(8, :
“atmospheric could not be fit on page. It will not be plotted.”

What are the highest rated movies for every decade?


In [26]:
avg_rating <- ratings %>%
  inner_join(movies, by = "movieId") %>%
  na.omit() %>%
  select(movieId, title, rating, year) %>%
  group_by(movieId, title, year) %>%
  summarise(count = n(), mean = mean(rating), min = min(rating), max = max(rating)) %>%
  ungroup() %>%
  arrange(desc(mean))

In [27]:
head(avg_rating, 10)


movieIdtitleyearcountmeanminmax
27396 Gentleman's Game, A 2002 1 5 5 5
27914 Hijacking Catastrophe: 9/11, Fear & the Selling of American Empire2004 1 5 5 5
66389 AmericanEast 2008 1 5 5 5
90464 Frozen North, The 2006 2 5 5 5
92783 Latin Music USA 2009 1 5 5 5
93967 Keeping the Promise (Sign of the Beaver, The) 1997 1 5 5 5
94972 Best of Ernie and Bert, The 1988 1 5 5 5
95517 Barchester Chronicles, The 1982 3 5 5 5
95977 Junior Prom 1946 1 5 5 5
98437 Bed of Roses 1933 1 5 5 5

That doesn't seem right. Let's try again using the IMDB weighted average rating:


In [28]:
# R = average for the movie (mean) = (Rating)
# v = number of votes for the movie = (votes)
# m = minimum votes required to be listed in the Top 250
# C = the mean vote across the whole report
weighted_rating <- function(R, v, m, C) {
  return (v/(v+m))*R + (m/(v+m))*C
}

In [29]:
avg_rating <- avg_rating %>%
  mutate(wr = weighted_rating(mean, count, 500, mean(mean))) %>%
  arrange(desc(wr))

In [30]:
head(avg_rating)


movieIdtitleyearcountmeanminmaxwr
356 Forrest Gump 1994 91921 4.052926 0.5 5 0.9945900
318 Shawshank Redemption, The 1994 91082 4.429015 0.5 5 0.9945404
296 Pulp Fiction 1994 87901 4.169975 0.5 5 0.9943440
593 Silence of the Lambs, The 1991 84078 4.152246 0.5 5 0.9940883
2571 Matrix, The 1999 77960 4.154098 0.5 5 0.9936273
260 Star Wars: Episode IV - A New Hope1977 77045 4.132299 0.5 5 0.9935521

In [31]:
best_per_decade <- avg_rating %>%
  mutate(decade = year  %/% 10 * 10) %>%
  arrange(year, desc(wr)) %>%
  group_by(decade) %>%
  summarise(
      title = first(title), 
      year = first(year), 
      wr = first(wr), 
      mean = first(mean), 
      count = first(count)
  )

In [32]:
best_per_decade


decadetitleyearwrmeancount
1870 Passage de Venus 1874 0.01768173 2.388889 9
1880 Traffic Crossing Leeds Bridge 1888 0.01768173 2.333333 9
1890 Monkeyshines, No. 1 1890 0.01185771 1.583333 6
1900 The Kiss 1900 0.02534113 3.153846 13
1910 Frankenstein 1910 0.04761905 3.180000 25
1920 Cabinet of Dr. Caligari, The (Cabinet des Dr. Caligari., Das)1920 0.77283053 3.897707 1701
1930 All Quiet on the Western Front 1930 0.85043374 3.932642 2843
1940 Pinocchio 1940 0.96872068 3.447594 15485
1950 Cinderella 1950 0.95640042 3.542715 10968
1960 Psycho 1960 0.97889227 4.065206 23188
1970 M*A*S*H (a.k.a. MASH) 1970 0.96543620 3.875770 13966
1980 Star Wars: Episode V - The Empire Strikes Back 1980 0.99195779 4.142536 61672
1990 Dances with Wolves 1990 0.99035457 3.740689 51338
2000 Gladiator 2000 0.98910984 3.954815 45413
2010 Inception 2010 0.98603235 4.161756 35297

What is the average rating for a movie?


In [33]:
avg_rating %>% 
    ggplot(aes(mean)) + 
    geom_histogram(binwidth = 0.5) +
    ggtitle("Average rating per movie")


How many movies users are rating?


In [34]:
head(ratings)


userIdmovieIdratingtimestamp
1 110 1.0 2015-03-09 22:52:09
1 147 4.5 2015-03-09 23:07:15
1 858 5.0 2015-03-09 22:52:03
1 1221 5.0 2015-03-09 22:52:26
1 1246 5.0 2015-03-09 22:52:36
1 1968 4.0 2015-03-09 23:02:28

In [35]:
ratings_per_user <- ratings %>%
    group_by(userId) %>%
    summarise(count = n()) %>%
    arrange(desc(count))

In [36]:
head(ratings_per_user)


userIdcount
4581118276
8659 9279
270123 7638
179792 7515
228291 7410
243443 6320

In [37]:
ratings_per_user %>% 
    ggplot(aes(count)) + 
    geom_histogram(bins=100) +
    xlab("# rated movies") +
    ggtitle("# rated movies per user")



In [38]:
ratings_per_user %>%
    filter(count < 500) %>%
    ggplot(aes(count)) + 
    geom_histogram(bins=100) +
    xlab("# rated movies") +
    ggtitle("# rated movies per user")


Building a Movie Recommender


In [39]:
movies_small <- read_csv("data/ml-latest-small/movies.csv")
ratings_small <- read_csv("data/ml-latest-small/ratings.csv")


Parsed with column specification:
cols(
  movieId = col_integer(),
  title = col_character(),
  genres = col_character()
)
Parsed with column specification:
cols(
  userId = col_integer(),
  movieId = col_integer(),
  rating = col_double(),
  timestamp = col_integer()
)

In [40]:
ratings_matrix <-
  dcast(ratings_small,
        userId ~ movieId,
        value.var = "rating",
        na.rm = FALSE)

In [41]:
head(ratings_matrix)


userId123456789161084161155161594161830161918161944162376162542162672163949
1 NANANANANANANANANANANANANANANANANANANA
2 NANANANANANANANANANANANANANANANANANANA
3 NANANANANANANANANANANANANANANANANANANA
4 NANANANANANANANANANANANANANANANANANANA
5 NANA 4NANANANANANANANANANANANANANANANA
6 NANANANANANANANANANANANANANANANANANANA

In [42]:
ratings_matrix <- as.matrix(ratings_matrix[, -1])

In [43]:
head(ratings_matrix)


12345678910161084161155161594161830161918161944162376162542162672163949
NANANANANANANANANANANANANANANANANANANANA
NANANANANANANANANA 4NANANANANANANANANANA
NANANANANANANANANANANANANANANANANANANANA
NANANANANANANANANA 4NANANANANANANANANANA
NANA 4NANANANANANANANANANANANANANANANANA
NANANANANANANANANANANANANANANANANANANANA

In [44]:
ratings_matrix <- as(ratings_matrix, "realRatingMatrix")

In [45]:
hist(getRatings(ratings_matrix), breaks="FD")


Some users consistently give high (or low) ratings to all movies they watch. We can try to remove this effect by normalizing our data in such a way that the average rating of each user is 0. This seems to be


In [46]:
ratings_norm <- normalize(ratings_matrix)

In [47]:
hist(getRatings(ratings_norm), breaks="FD")


Making a model & evaluation


In [48]:
evaluation_scheme <- evaluationScheme(
    ratings_matrix,
    method = "cross-validation",
    k = 5,
    given = 3,
    goodRating = 5
)

algorithms <- list(
    "random items" = list(name = "RANDOM", param = NULL),
    "popular items" = list(name = "POPULAR", param = NULL),
    "user-based CF (cosine)" = list(
        name = "UBCF", 
        param = list(
            method = "Cosine",
            nn = 40
        )
    ),
    "user-based CF (pearson)" = list(
        name = "UBCF", 
        param = list(
            method = "Pearson",
            nn = 40
        )
    )
)

In [49]:
eval_results <- evaluate(
    evaluation_scheme, 
    algorithms, 
    n = c(1, 3, 5, 10, 15, 20) # top 1, 3, 5 etc recommendations
)


RANDOM run fold/sample [model time/prediction time]
	 1  [0.004sec/1.324sec] 
	 2  [0.018sec/1.303sec] 
	 3  [0.001sec/1.207sec] 
	 4  [0.002sec/1.2sec] 
	 5  [0.001sec/1.135sec] 
POPULAR run fold/sample [model time/prediction time]
	 1  [0.011sec/1.126sec] 
	 2  [0.044sec/1.147sec] 
	 3  [0.01sec/1.128sec] 
	 4  [0.015sec/1.184sec] 
	 5  [0.01sec/1.187sec] 
UBCF run fold/sample [model time/prediction time]
	 1  [0.007sec/3.223sec] 
	 2  [0.032sec/3.563sec] 
	 3  [0.007sec/3.24sec] 
	 4  [0.007sec/3.272sec] 
	 5  [0.005sec/3.4sec] 
UBCF run fold/sample [model time/prediction time]
	 1  [0.005sec/1.276sec] 
	 2  [0.007sec/1.429sec] 
	 3  [0.005sec/1.119sec] 
	 4  [0.095sec/1.259sec] 
	 5  [0.008sec/1.274sec] 

In [50]:
plot(eval_results, legend = "topleft")



In [51]:
plot(eval_results, "prec/rec")



In [52]:
avg(eval_results[[3]])


TPFPFNTNprecisionrecallTPRFPR
10.2000000 0.7125926 21.64889 9040.439 0.2191350 0.01525558 0.01525558 7.877017e-05
30.5096296 2.2281481 21.33926 9038.923 0.1860303 0.04298963 0.04298963 2.463152e-04
50.8029630 3.7600000 21.04593 9037.391 0.1759057 0.06295357 0.06295357 4.156582e-04
101.4296296 7.6962963 20.41926 9033.455 0.1565942 0.10662260 0.10662260 8.507980e-04
151.8577778 11.8311111 19.99111 9029.320 0.1357149 0.13706452 0.13706452 1.307929e-03
202.2622222 15.9896296 19.58667 9025.161 0.1239303 0.16458516 0.16458516 1.767679e-03

Making predictions


In [85]:
model <- Recommender(
    ratings_matrix, 
    method = "UBCF", 
    param = list(
        normalize = "center",
        method = "Pearson", 
        nn = 30
    )
)

Let's recommend new movies for a user:


In [101]:
user_ratings <- as(ratings_matrix[3, ], "list")[[1]]

In [102]:
d <- as_tibble(user_ratings)
d$movieId = as.integer(rownames(d))
colnames(d) <- c("rating", "movieId")
rownames(d) <- NULL

In [103]:
d %>%
    inner_join(movies_small, by = "movieId")


ratingmovieIdtitlegenres
3.0 60 Indian in the Cupboard, The (1995) Adventure|Children|Fantasy
4.0 110 Braveheart (1995) Action|Drama|War
3.5 247 Heavenly Creatures (1994) Crime|Drama
3.0 267 Major Payne (1995) Comedy
4.5 296 Pulp Fiction (1994) Comedy|Crime|Drama|Thriller
5.0 318 Shawshank Redemption, The (1994) Crime|Drama
2.5 355 Flintstones, The (1994) Children|Comedy|Fantasy
5.0 356 Forrest Gump (1994) Comedy|Drama|Romance|War
2.5 377 Speed (1994) Action|Romance|Thriller
3.0 527 Schindler's List (1993) Drama|War
3.0 588 Aladdin (1992) Adventure|Animation|Children|Comedy|Musical
3.0 592 Batman (1989) Action|Crime|Thriller
3.0 593 Silence of the Lambs, The (1991) Crime|Horror|Thriller
2.0 595 Beauty and the Beast (1991) Animation|Children|Fantasy|Musical|Romance|IMAX
3.5 736 Twister (1996) Action|Adventure|Romance|Thriller
4.0 778 Trainspotting (1996) Comedy|Crime|Drama
3.0 866 Bound (1996) Crime|Drama|Romance|Thriller
5.0 1197 Princess Bride, The (1987) Action|Adventure|Comedy|Fantasy|Romance
3.0 1210 Star Wars: Episode VI - Return of the Jedi (1983) Action|Adventure|Sci-Fi
4.0 1235 Harold and Maude (1971) Comedy|Drama|Romance
3.0 1271 Fried Green Tomatoes (1991) Comedy|Crime|Drama
4.0 1378 Young Guns (1988) Action|Comedy|Western
3.5 1580 Men in Black (a.k.a. MIB) (1997) Action|Comedy|Sci-Fi
4.5 1721 Titanic (1997) Drama|Romance
4.0 1884 Fear and Loathing in Las Vegas (1998) Adventure|Comedy|Drama
4.0 2028 Saving Private Ryan (1998) Action|Drama|War
4.0 2318 Happiness (1998) Comedy|Drama
3.0 2513 Pet Sematary (1989) Horror
3.0 2694 Big Daddy (1999) Comedy
3.5 2702 Summer of Sam (1999) Drama
3.0 2716 Ghostbusters (a.k.a. Ghost Busters) (1984) Action|Comedy|Sci-Fi
3.5 2762 Sixth Sense, The (1999) Drama|Horror|Mystery
4.0 2841 Stir of Echoes (1999) Horror|Mystery|Thriller
4.0 2858 American Beauty (1999) Drama|Romance
5.0 2959 Fight Club (1999) Action|Crime|Drama|Thriller
3.0 3243 Encino Man (1992) Comedy
4.0 3510 Frequency (2000) Drama|Thriller
5.0 3949 Requiem for a Dream (2000) Drama
3.0 5349 Spider-Man (2002) Action|Adventure|Sci-Fi|Thriller
3.5 5669 Bowling for Columbine (2002) Documentary
3.0 6377 Finding Nemo (2003) Adventure|Animation|Children|Comedy
2.5 7153 Lord of the Rings: The Return of the King, The (2003) Action|Adventure|Drama|Fantasy
3.0 7361 Eternal Sunshine of the Spotless Mind (2004) Drama|Romance|Sci-Fi
3.5 8622 Fahrenheit 9/11 (2004) Documentary
3.0 8636 Spider-Man 2 (2004) Action|Adventure|Sci-Fi|IMAX
3.5 27369 Daria: Is It Fall Yet? (2000) Animation|Comedy
3.5 44191 V for Vendetta (2006) Action|Sci-Fi|Thriller|IMAX
4.5 48783 Flags of Our Fathers (2006) Drama|War
4.5 50068 Letters from Iwo Jima (2006) Drama|War
3.0 58559 Dark Knight, The (2008) Action|Crime|Drama|IMAX
4.0 84236 White Stripes Under Great White Northern Lights, The (2009)Documentary

In [104]:
predictions <- predict(model, ratings_matrix[3], n = 10)

In [105]:
recommendations <- predictions@items[[1]]

In [106]:
recom_result <- as.data.frame(movies_small[recommendations, ])

In [107]:
recom_result


movieIdtitlegenres
858 Godfather, The (1972) Crime|Drama
1221 Godfather: Part II, The (1974) Crime|Drama
5893 Last Seduction, The (1994) Crime|Drama|Thriller
85 Angels and Insects (1995) Drama|Romance
4971 Moscow on the Hudson (1984) Comedy|Drama
3671 Blazing Saddles (1974) Comedy|Western
1270 Back to the Future (1985) Adventure|Comedy|Sci-Fi
108548 Big Bang Theory, The (2007-) Comedy
673 Space Jam (1996) Adventure|Animation|Children|Comedy|Fantasy|Sci-Fi
6862 Out of Time (2003) Crime|Drama|Thriller

Predict movies to watch based on your preferences


In [108]:
user_ratings <- data.frame(movieId = colnames(ratings_matrix))
user_ratings$rating <- NA

In [109]:
movies_with_ratings <- movies_small[-which(
    (movies_small$movieId %in% ratings_small$movieId) == FALSE
),]

In [110]:
movies_with_ratings %>%
  filter(str_detect(title, 'Alien'))


movieIdtitlegenres
1200 Aliens (1986) Action|Adventure|Horror|Sci-Fi
1214 Alien (1979) Horror|Sci-Fi
1320 Alien³ (a.k.a. Alien 3) (1992) Action|Horror|Sci-Fi|Thriller
1690 Alien: Resurrection (1997) Action|Horror|Sci-Fi
1692 Alien Escape (1995) Horror|Sci-Fi
3701 Alien Nation (1988) Crime|Drama|Sci-Fi|Thriller
4526 My Stepmother Is an Alien (1988) Comedy|Romance|Sci-Fi
6835 Alien Contamination (1980) Action|Horror|Sci-Fi
8810 AVP: Alien vs. Predator (2004) Action|Horror|Sci-Fi|Thriller
56801 AVPR: Aliens vs. Predator - Requiem (2007) Action|Horror|Sci-Fi
62113 How to Lose Friends & Alienate People (2008) Comedy
67408 Monsters vs. Aliens (2009) Animation|Sci-Fi|IMAX
70282 Aliens in the Attic (2009) Adventure|Children|Fantasy|Sci-Fi
72043 Monsters vs Aliens: Mutant Pumpkins from Outer Space (2009)Animation|Comedy|Sci-Fi
83613 Cowboys & Aliens (2011) Action|Sci-Fi|Thriller|Western|IMAX

In [111]:
change_rating <- function(data, movie_id, new_rating = 5) {
    return(
        data %>%
            mutate(rating = replace(rating, movieId == movie_id, new_rating))
    )
}

In [112]:
user_ratings <- change_rating(user_ratings, 1214)
user_ratings <- change_rating(user_ratings, 1200)

In [113]:
movies_with_ratings %>%
  filter(str_detect(title, 'Inception'))


movieIdtitlegenres
79132 Inception (2010) Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX

In [114]:
user_ratings <- change_rating(user_ratings, 79132)

In [115]:
movies_with_ratings %>%
  filter(str_detect(title, 'The Dark Knight'))


movieIdtitlegenres
98124 Batman: The Dark Knight Returns, Part 1 (2012)Action|Animation|Sci-Fi
99813 Batman: The Dark Knight Returns, Part 2 (2013)Action|Animation

In [116]:
user_ratings <- change_rating(user_ratings, 98124)
user_ratings <- change_rating(user_ratings, 99813, 4.5)

In [117]:
movies_with_ratings %>%
  filter(str_detect(title, 'Matrix'))


movieIdtitlegenres
2571 Matrix, The (1999) Action|Sci-Fi|Thriller
6365 Matrix Reloaded, The (2003) Action|Adventure|Sci-Fi|Thriller|IMAX
6934 Matrix Revolutions, The (2003) Action|Adventure|Sci-Fi|Thriller|IMAX

In [118]:
user_ratings <- change_rating(user_ratings, 2571)
user_ratings <- change_rating(user_ratings, 6365, 5)
user_ratings <- change_rating(user_ratings, 6934, 4.5)

In [119]:
rownames(user_ratings) <- user_ratings$movieId

In [120]:
user_pref = as(
    t(as.matrix(user_ratings['rating'])), 
    "realRatingMatrix"
)

In [121]:
user_pred <- predict(model, user_pref, n = 10)

In [122]:
user_rec <- user_pred@items[[1]]

In [123]:
user_recom_result <- as.data.frame(movies_small[user_rec, ])

In [124]:
user_recom_result


movieIdtitlegenres
4990 Jimmy Neutron: Boy Genius (2001) Adventure|Animation|Children|Comedy
296 Pulp Fiction (1994) Comedy|Crime|Drama|Thriller
2959 Fight Club (1999) Action|Crime|Drama|Thriller
858 Godfather, The (1972) Crime|Drama
1196 Star Wars: Episode V - The Empire Strikes Back (1980)Action|Adventure|Sci-Fi
5944 Star Trek: Nemesis (2002) Action|Drama|Sci-Fi|Thriller
7137 Cooler, The (2003) Comedy|Drama|Romance
260 Star Wars: Episode IV - A New Hope (1977) Action|Adventure|Sci-Fi
4226 Memento (2000) Mystery|Thriller
1210 Star Wars: Episode VI - Return of the Jedi (1983) Action|Adventure|Sci-Fi