In [ ]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
from sklearn.utils import shuffle
from torchsample.modules import ModuleTrainer

%matplotlib inline

In [ ]:
#path = "data/ml-20m/"
path = "data/ml-latest-small/"
model_path = 'data/lesson-4-models/'
if not os.path.exists(model_path): os.mkdir(model_path)
batch_size = 64
use_cuda = False
retrain = True  # set to False to skip training and just load the saved weights below

Set up data

We're working with the MovieLens data, which contains one rating per row, like this:


In [ ]:
ratings = pd.read_csv(path+'ratings.csv')
# ratings = shuffle(ratings)
ratings.tail()

In [ ]:
len(ratings)

Just for display purposes, let's read in the movie names too.


In [ ]:
movies_table = pd.read_csv(path+'movies.csv').set_index('movieId')
movies_table.tail()

We re-map the movie and user ids so that they are contiguous integers starting from zero, which is what the embedding layers expect.


In [ ]:
users = ratings.userId.unique()
movies = ratings.movieId.unique()

userid2idx = {o:i for i,o in enumerate(users)}
movieid2idx = {o:i for i,o in enumerate(movies)}

ratings.movieId = ratings.movieId.apply(lambda x: movieid2idx[x])
ratings.userId = ratings.userId.apply(lambda x: userid2idx[x])
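
The users and movies arrays created above also serve as the reverse mapping, since movies[i] is the original movieId that index i was assigned to. A quick illustration of the round trip (not needed for training):


In [ ]:
movies[:5], movieid2idx[movies[0]]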

This is the number of latent factors in each embedding.


In [ ]:
num_dimensions = 50
num_users = ratings.userId.nunique()
num_movies = ratings.movieId.nunique()
num_users, num_movies

Randomly split into training and validation.


In [ ]:
mask = np.random.rand(len(ratings)) < 0.8
ratings_train = ratings[mask]
ratings_valid = ratings[~mask]
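
As a quick sanity check, we can confirm the split is roughly 80/20:


In [ ]:
len(ratings_train), len(ratings_valid)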

Dot product

The most basic model is a dot product of a movie embedding and a user embedding. Let's see how well that works:


In [ ]:
import torch.nn as nn
import torch.nn.functional as F

class UserMovieModule(nn.Module):
    def __init__(self, num_users, num_movies, num_dimensions):
        super(UserMovieModule, self).__init__()
        self.users_embedding = nn.Embedding(num_users, num_dimensions)
        self.movies_embedding = nn.Embedding(num_movies, num_dimensions)
        self.init()

    def forward(self, users, movies):
        u = self.users_embedding(users)
        m = self.movies_embedding(movies)
        # element-wise product summed over the embedding dimension:
        # one dot product per (user, movie) pair in the batch
        dot_product = torch.sum(torch.mul(u, m), dim=1)
        return dot_product

    def init(self):
        torch.nn.init.uniform(self.users_embedding.weight, a=-0.05, b=0.05)
        torch.nn.init.uniform(self.movies_embedding.weight, a=-0.05, b=0.05)

def tensorl(ids):
    # ids (pandas Series, list, or numpy array) -> LongTensor, for embedding lookups
    return torch.from_numpy(np.array(ids)).long()

def tensorf(values):
    # values (pandas Series, list, or numpy array) -> FloatTensor, for the MSE targets
    return torch.from_numpy(np.array(values)).float()
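
Just to illustrate what these helpers produce (the slices below are arbitrary): tensorl turns ids into a LongTensor, which is what the embedding layers expect, and tensorf produces the FloatTensor targets for the MSE loss.


In [ ]:
tensorl(ratings_train.userId.values[:5]), tensorf(ratings_train.rating.values[:5])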

In [ ]:
criterion = nn.MSELoss()
model = UserMovieModule(num_users, num_movies, num_dimensions)
if(use_cuda):
    model.cuda()
    criterion.cuda()
trainer = ModuleTrainer(model)

In [ ]:
if retrain:
    trainer.set_optimizer(optim.Adam, lr=1e-3)
    trainer.set_loss(criterion)
    trainer.fit([tensorl(ratings_train.userId), tensorl(ratings_train.movieId)], tensorf(ratings_train.rating), 
                batch_size=batch_size,
                val_data=([tensorl(ratings_valid.userId), tensorl(ratings_valid.movieId)], tensorf(ratings_valid.rating)),
                nb_epoch=5, shuffle=True)
    trainer.save_state_dict(model_path + '/model-no-bias.pth')
    
model.load_state_dict(torch.load(model_path + '/model-no-bias.pth'))

The best benchmarks are a bit over 0.9, so this model doesn't seem to be working that well...
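
If you want to double-check that number outside of torchsample's logging, here's a minimal sketch that recomputes the validation MSE by hand (assuming everything stays on the CPU and reusing the criterion defined above):


In [ ]:
users_v = Variable(tensorl(ratings_valid.userId), volatile=True)
movies_v = Variable(tensorl(ratings_valid.movieId), volatile=True)
targets_v = Variable(tensorf(ratings_valid.rating), volatile=True)
criterion(model(users_v, movies_v), targets_v)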

Bias

The problem is likely that we don't have bias terms - that is, a single number per user capturing how positive or negative that user tends to be, and a single number per movie capturing how good it is. We can add these easily by creating an embedding with one output for each user and each movie, and adding them to our output.


In [ ]:
import torch.nn as nn
import torch.nn.functional as F

class UserMovieModuleWithBias(nn.Module):
    def __init__(self, num_users, num_movies, num_dimensions):
        super(UserMovieModuleWithBias, self).__init__()
        self.users_embedding = nn.Embedding(num_users, num_dimensions)
        self.movies_embedding = nn.Embedding(num_movies, num_dimensions)
        self.user_bias = nn.Embedding(num_users, 1)
        self.movie_bias = nn.Embedding(num_movies, 1)
        self.init()

    def forward(self, users, movies):
        u = self.users_embedding(users)
        bu = self.user_bias(users)
        m = self.movies_embedding(movies)
        bm = self.movie_bias(movies)
        dot_product = torch.sum(torch.mul(u, m), dim=1)
        # squeeze the (batch, 1) bias columns so everything is a flat (batch,) vector
        result = dot_product + bu.squeeze(1) + bm.squeeze(1)
        return result
    
    def init(self):
        torch.nn.init.uniform(self.users_embedding.weight, a=-0.05, b=0.05)
        torch.nn.init.uniform(self.movies_embedding.weight, a=-0.05, b=0.05)
        torch.nn.init.uniform(self.user_bias.weight, a=-0.05, b=0.05)
        torch.nn.init.uniform(self.movie_bias.weight, a=-0.05, b=0.05)

In [ ]:
criterion = nn.MSELoss()
model = UserMovieModuleWithBias(num_users, num_movies, num_dimensions)
if(use_cuda):
    model.cuda()
    criterion.cuda()
trainer = ModuleTrainer(model)

In [ ]:
from torchsample.regularizers import L2Regularizer

if retrain:
    trainer.set_optimizer(optim.Adam, lr=1e-2)
    trainer.set_loss(criterion)

    regularizers = [L2Regularizer(scale=1e-2, module_filter='*embedding*')]
    trainer.set_regularizers(regularizers)
    
    trainer.fit([tensorl(ratings_train.userId), tensorl(ratings_train.movieId)], tensorf(ratings_train.rating), 
                batch_size=batch_size,
                val_data=([tensorl(ratings_valid.userId), tensorl(ratings_valid.movieId)], tensorf(ratings_valid.rating)),
                nb_epoch=10, shuffle=True)
    trainer.save_state_dict(model_path + '/model-bias.pth')
    
model.load_state_dict(torch.load(model_path + '/model-bias.pth'))

In [ ]:
if retrain:
    trainer.set_optimizer(optim.Adam, lr=1e-3)
    trainer.fit([tensorl(ratings_train.userId), tensorl(ratings_train.movieId)], tensorf(ratings_train.rating), 
                    batch_size=batch_size,
                    val_data=([tensorl(ratings_valid.userId), tensorl(ratings_valid.movieId)], tensorf(ratings_valid.rating)),
                    nb_epoch=3, shuffle=True)

This result is quite a bit better than the best benchmarks we could find with a quick Google search - so it looks like a great approach!

We can use the model to generate predictions by passing a user index and a movie index, each wrapped into a tensor with our helpers. For instance, this predicts that user #3 would really enjoy movie #6.


In [ ]:
trainer.predict([tensorl([3]), tensorl([6])])
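
Keep in mind that 3 and 6 here are the remapped contiguous indices, not the raw MovieLens ids. To see which film index 6 actually refers to, we can map it back through the movies array (purely illustrative):


In [ ]:
movies_table.loc[movies[6]]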

Analyze results

To make the analysis of the factors more interesting, we'll restrict it to the top 2000 most popular movies.


In [ ]:
# count ratings per movie (movieId here is already the contiguous index)
g = ratings.groupby('movieId')['rating'].count()
top_movies = g.sort_values(ascending=False)[:2000]
top_movies = torch.LongTensor(np.array(top_movies.index))
top_movies[:5]
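
As a quick check that these indices look sensible, we can map the first few back to their titles (again, just illustrating the index-to-movieId mapping):


In [ ]:
movies_table.loc[[movies[i] for i in top_movies[:5]]]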

First, we'll look at the movie bias term. In PyTorch we don't need to build a separate model for this - we simply call the movie_bias embedding from our trained model directly. Here, the input is a movie index, and the output is that movie's bias (a single float).


In [ ]:
movie_bias = model.movie_bias(Variable(top_movies, volatile=True))
movie_names = movies_table['title'].to_dict()
# (learned bias, title) for each popular movie; movies[i] maps the contiguous
# index back to the original movieId that movies_table is keyed by
movie_ratings = [(b[0].data.cpu()[0], movie_names[movies[i]]) for i,b in zip(top_movies, movie_bias)]

Now we can look at the bottom and top rated movies by this learned bias. These scores are corrected for different levels of reviewer sentiment, as well as for the different types of movies that different reviewers watch.


In [ ]:
from operator import itemgetter
sorted(movie_ratings, key=itemgetter(0))[:15]   # lowest movie bias

In [ ]:
from operator import itemgetter
sorted(movie_ratings, key=itemgetter(0), reverse=True)[:15]   # highest movie bias

We can now do the same thing for the embeddings.


In [ ]:
movies_embedding = model.movies_embedding(Variable(top_movies, volatile=True))
movies_embedding.size()

Because it's hard to interpret 50 embedding dimensions, we use PCA to reduce them to just three components.


In [ ]:
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
# transpose so each movie is a column; components_ then has one value per movie
movies_pca = pca.fit(movies_embedding.data.cpu().numpy().T).components_
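
To get a sense of how much of the embedding structure these three components capture, we can inspect the explained variance ratio (a quick diagnostic, not used further below):


In [ ]:
pca.explained_variance_ratio_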

In [ ]:
fac0 = movies_pca[0]
fac0

In [ ]:
movie_comp = [(f, movie_names[movies[i]]) for f,i in zip(fac0, top_movies)]

Here's the 1st component. It seems to be 'critically acclaimed' or 'classic'.


In [ ]:
sorted(movie_comp, key=itemgetter(0), reverse=True)[:10]

In [ ]:
sorted(movie_comp, key=itemgetter(0))[:10]

In [ ]:
fac1 = movies_pca[1]

In [ ]:
movie_comp = [(f, movie_names[movies[i]]) for f,i in zip(fac1, top_movies)]

The 2nd is 'Hollywood blockbuster'.


In [ ]:
sorted(movie_comp, key=itemgetter(0), reverse=True)[:10]

In [ ]:
sorted(movie_comp, key=itemgetter(0))[:10]

In [ ]:
fac2 = movies_pca[2]

The 3rd is 'violent vs happy'.


In [ ]:
movie_comp = [(f, movie_names[movies[i]]) for f,i in zip(fac2, top_movies)]

In [ ]:
sorted(movie_comp, key=itemgetter(0), reverse=True)[:10]

In [ ]:
sorted(movie_comp, key=itemgetter(0))[:10]

We can draw a picture to see where various movies sit on the map of these components; this one plots the 1st component against the 3rd.


In [ ]:
from matplotlib import pyplot as plt

start=50; end=100
X = fac0[start:end]
Y = fac2[start:end]
plt.figure(figsize=(15,15))
plt.scatter(X, Y)
for i, x, y in zip(top_movies[start:end], X, Y):
    plt.text(x,y,movie_names[movies[i]], color=np.random.rand(3)*0.7, fontsize=14)
plt.show()

Neural net

Rather than creating a special purpose architecture (like our dot-product with bias earlier), it's often both easier and more accurate to use a standard neural network. Let's try it! Here, we simply concatenate the user and movie embeddings into a single vector, which we feed into the neural net.


In [ ]:
class UserMovieModuleNN(nn.Module):
    def __init__(self, num_users, num_movies, num_dimensions):
        super().__init__()
        self.users_embedding = nn.Embedding(num_users, num_dimensions)
        self.movies_embedding = nn.Embedding(num_movies, num_dimensions)
        self.predictor = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(2 * num_dimensions, 70),
            nn.ReLU(True),
            nn.Dropout(0.75),
            nn.Linear(70, 1),
        )
        self.init()

    def forward(self, users, movies):
        u = self.users_embedding(users)
        m = self.movies_embedding(movies)

        # concatenate the user and movie embeddings into one (batch, 2 * num_dimensions) input
        x = torch.cat([u, m], 1)
        r = self.predictor(x)
        # squeeze to a flat (batch,) vector so it matches the shape of the rating targets
        return r.squeeze(1)
    
    def init(self):
        torch.nn.init.uniform(self.users_embedding.weight, a=-0.05, b=0.05)
        torch.nn.init.uniform(self.movies_embedding.weight, a=-0.05, b=0.05)
        torch.nn.init.xavier_uniform(self.predictor[1].weight)  # first Linear
        torch.nn.init.constant(self.predictor[1].bias, val=0.0)
        torch.nn.init.xavier_uniform(self.predictor[4].weight)  # second Linear
        torch.nn.init.constant(self.predictor[4].bias, val=0.0)

In [ ]:
model_nn = UserMovieModuleNN(num_users, num_movies, num_dimensions)
if(use_cuda):
    model_nn.cuda()
trainer_nn = ModuleTrainer(model_nn)

In [ ]:
trainer_nn.set_optimizer(optim.Adam, lr=1e-2)
trainer_nn.set_loss(criterion)

regularizers = [L2Regularizer(scale=1e-2, module_filter='*embedding*')]
trainer_nn.set_regularizers(regularizers)

trainer_nn.fit([tensorl(ratings_train.userId), tensorl(ratings_train.movieId)], tensorf(ratings_train.rating), 
                batch_size=batch_size,
                val_data=([tensorl(ratings_valid.userId), tensorl(ratings_valid.movieId)], tensorf(ratings_valid.rating)),
                nb_epoch=1, shuffle=True)

In [ ]:
trainer_nn.set_optimizer(optim.Adam, lr=1e-3)
trainer_nn.fit([tensorl(ratings_train.userId), tensorl(ratings_train.movieId)], tensorf(ratings_train.rating), 
                batch_size=batch_size,
                val_data=([tensorl(ratings_valid.userId), tensorl(ratings_valid.movieId)], tensorf(ratings_valid.rating)),
                nb_epoch=4, shuffle=True)

This improves on our already impressive accuracy even further!
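
Finally, we can save this model's weights too, mirroring the earlier save cells (the filename is just a suggestion):


In [ ]:
trainer_nn.save_state_dict(model_path + '/model-nn.pth')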