In [ ]:
from fastai.collab import *
from fastai.tabular import *
Collab models use data in a DataFrame of users, items, and ratings.
In [ ]:
# Column-name constants used throughout the notebook.
user = 'userId'
item = 'movieId'
title = 'title'
In [ ]:
# Download (if needed) and extract fastai's small MovieLens sample;
# untar_data returns the local Path of the extracted folder.
path = untar_data(URLs.ML_SAMPLE)
path
Out[ ]:
In [ ]:
# Load the sample ratings CSV from the downloaded folder.
# NOTE(review): columns are presumably userId/movieId/rating — confirm
# against the file contents.
ratings = pd.read_csv(path/'ratings.csv')
ratings.head()
Out[ ]:
That's all we need to create and train a model:
In [ ]:
# Wrap the ratings DataFrame for collaborative filtering; seed=42 makes
# the random train/validation split reproducible.
data = CollabDataBunch.from_df(ratings, seed=42)
In [ ]:
# Target range passed to the learner; 5.5 sits slightly above the 0-5
# rating scale (presumably so a rating of 5 stays reachable after the
# output squashing — confirm against collab_learner's y_range handling).
y_range = [0,5.5]
In [ ]:
# Embedding dot-product model with 50 latent factors per user and item,
# outputs constrained to y_range.
learn = collab_learner(data, n_factors=50, y_range=y_range)
In [ ]:
# Quick sanity-check training: 3 epochs, one-cycle schedule, max LR 5e-3.
learn.fit_one_cycle(3, 5e-3)
Let's try with the full MovieLens 100k dataset, available from http://files.grouplens.org/datasets/movielens/ml-100k.zip
In [ ]:
# The full ML-100k dataset is NOT auto-downloaded: it must be unzipped
# manually into <fastai data dir>/ml-100k before the cells below run.
path=Config.data_path()/'ml-100k'
In [ ]:
# u.data is tab-separated with no header row; column names are supplied
# explicitly (user/item come from the constants defined earlier).
ratings = pd.read_csv(
    path / 'u.data',
    sep='\t',
    header=None,
    names=[user, item, 'rating', 'timestamp'],
)
ratings.head()
Out[ ]:
In [ ]:
# u.item is pipe-separated, latin-1 encoded, with no header row.
# Columns: movieId, title, release date, a blank field, IMDb URL, and
# 19 one-hot genre flags (named g0..g18 here).
movies = pd.read_csv(
    path / 'u.item',
    sep='|',
    encoding='latin-1',
    header=None,
    names=[item, 'title', 'date', 'N', 'url', *(f'g{i}' for i in range(19))],
)
movies.head()
Out[ ]:
In [ ]:
# Sanity check: ML-100k should contain ~100,000 rating rows.
len(ratings)
Out[ ]:
In [ ]:
# Attach the human-readable title to each rating row. The join key is
# made explicit (on=item, i.e. movieId) rather than relying on merge's
# default of joining on every shared column, which would silently change
# if the two frames ever gained another column name in common.
rating_movie = ratings.merge(movies[[item, title]], on=item)
rating_movie.head()
Out[ ]:
In [ ]:
# 10% validation split; item_name=title keys items by movie title
# instead of the raw movieId column.
data = CollabDataBunch.from_df(rating_movie, seed=42, valid_pct=0.1, item_name=title)
In [ ]:
# Peek at a batch of (user, title, rating) rows as the model will see them.
data.show_batch()
In [ ]:
# Same output range as before: slightly above the 0-5 rating scale.
y_range = [0,5.5]
In [ ]:
# 40 latent factors; weight decay 1e-1 adds regularization for the
# larger dataset.
learn = collab_learner(data, n_factors=40, y_range=y_range, wd=1e-1)
In [ ]:
# Run the LR range test and plot loss vs. learning rate, dropping the
# last 15 (diverging) points from the plot.
learn.lr_find()
learn.recorder.plot(skip_end=15)
In [ ]:
# Full training run: 5 epochs, one-cycle, max LR 5e-3 (picked from the
# LR-finder plot above).
learn.fit_one_cycle(5, 5e-3)
In [ ]:
# Persist the trained weights under the learner's model directory.
learn.save('dotprod')
Here are some benchmarks on the same dataset for the popular Librec system for collaborative filtering. They show best results based on an RMSE of 0.91, which corresponds to an MSE of 0.91**2 ≈ 0.83.
In [ ]:
# Reload the saved weights; the trailing ';' suppresses notebook echo.
learn.load('dotprod');
In [ ]:
# Inspect the model architecture (embedding weights and bias terms are
# probed individually below via learn.weight / learn.bias).
learn.model
Out[ ]:
In [ ]:
# Rank titles by how many ratings they received and keep the 1,000
# most-rated ones (as a plain array of title strings).
rating_counts = rating_movie.groupby(title)['rating'].count()
top_movies = rating_counts.sort_values(ascending=False).index.values[:1000]
top_movies[:10]
Out[ ]:
In [ ]:
# Learned per-item bias for each of the 1,000 most-rated titles
# (is_item=True selects item biases rather than user biases).
movie_bias = learn.bias(top_movies, is_item=True)
movie_bias.shape
Out[ ]:
In [ ]:
# Pair each top title with its learned bias and its raw mean rating:
# tuples of (bias, title, mean rating).
mean_ratings = rating_movie.groupby(title)['rating'].mean()
movie_ratings = [(bias, name, mean_ratings.loc[name])
                 for name, bias in zip(top_movies, movie_bias)]
In [ ]:
# Sort key extracting the first tuple element (the bias / factor score).
# operator.itemgetter is the idiomatic, named replacement for the
# original `lambda o: o[0]` (PEP 8 discourages assigning lambdas) and
# matches the itemgetter(0) usage later in this notebook.
item0 = itemgetter(0)
In [ ]:
# 15 titles with the lowest learned bias, shown with their mean rating.
sorted(movie_ratings, key=item0)[:15]
Out[ ]:
In [ ]:
# Same ranking, descending: the 15 highest-bias titles (item0 is the
# first-element key defined earlier).
sorted(movie_ratings, key=item0, reverse=True)[:15]
Out[ ]:
In [ ]:
# Learned embedding vectors (latent factors) for the same top titles.
movie_w = learn.weight(top_movies, is_item=True)
movie_w.shape
Out[ ]:
In [ ]:
# Reduce the 40-dim embeddings to their first 3 principal components
# (fastai patches a .pca method onto tensors).
movie_pca = movie_w.pca(3)
movie_pca.shape
Out[ ]:
In [ ]:
# Split out the three principal components, then pair component-0
# scores with their titles.
fac0, fac1, fac2 = movie_pca.t()
movie_comp = [(score, name) for score, name in zip(fac0, top_movies)]
In [ ]:
# Titles scoring highest on the first principal component.
sorted(movie_comp, key=itemgetter(0), reverse=True)[:10]
Out[ ]:
In [ ]:
# Titles scoring lowest on the first principal component.
sorted(movie_comp, key=itemgetter(0))[:10]
Out[ ]:
In [ ]:
# Re-pair titles with the second principal component's scores.
movie_comp = [(score, name) for score, name in zip(fac1, top_movies)]
In [ ]:
# Titles scoring highest on the second principal component.
sorted(movie_comp, key=itemgetter(0), reverse=True)[:10]
Out[ ]:
In [ ]:
# Titles scoring lowest on the second principal component.
sorted(movie_comp, key=itemgetter(0))[:10]
Out[ ]:
In [ ]:
# Scatter the first 50 top movies on principal components 0 and 2,
# labelling each point with its title in a random muted color.
# NOTE(review): the original first drew a random sample of indices and
# then immediately overwrote it with list(range(50)); the dead
# np.random.choice call has been removed.
idxs = list(range(50))
X = fac0[idxs]
Y = fac2[idxs]
plt.figure(figsize=(15, 15))
plt.scatter(X, Y)
for i, x, y in zip(top_movies[idxs], X, Y):
    plt.text(x, y, i, color=np.random.rand(3)*0.7, fontsize=11)
plt.show()
In [ ]: