In [118]:
from __future__ import division

import pandas as pd

from polara.recommender.data import RecommenderData
from polara.recommender.models import RecommenderModel
from polara.tools.movielens import get_movielens_data
from polara.tools.printing import print_frames
In [4]:
# Fetch the MovieLens dataset. With get_genres=True the helper returns two
# objects, unpacked here as `data` (its `userid` / `movieid` columns are used
# below) and `genres` (its `movieid` / `movienm` / `genreid` columns are used below).
data, genres = get_movielens_data(get_genres=True)
In [9]:
# Preview the first rows of both tables with polara's print_frames helper
# (presumably renders the frames side by side -- confirm against polara docs).
print_frames((data.head(), genres.head()))
Out[9]:
In [84]:
# Rows per movie id, most frequent first (value_counts sorts by descending
# count); keep the first 200 entries. The index of the result holds movie ids.
top_movies = data['movieid'].value_counts().iloc[:200]
In [83]:
# Binary movie-by-genre indicator table restricted to the top movies:
# rows are movie names, columns are genre ids, 1 where the movie has the genre.
top_genres = (
    genres.query('movieid in @top_movies.index')
          # Keyword arguments: pandas >= 2.0 rejects positional index/columns.
          # Passing values= directly avoids building a multi-level frame and
          # then selecting the 'movieid' level from it.
          .pivot(index='movienm', columns='genreid', values='movieid')
          # (movie, genre) pairs absent from the table are NaN after the
          # pivot; map present -> 1 and absent -> 0.
          .notnull()
          .astype(int)
)
top_genres
Out[83]:
In [81]:
# Column sums of the 0/1 indicator table, i.e. how many of the top movies carry
# each genre, shown in ascending order (most common genres last).
top_genres.sum().sort_values()
Out[81]:
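Maybe we should promote Action and Sci-Fi movies more than others?
Possible catches (e.g. these genres may simply be more common in the catalog as a whole; compare with the overall genre counts below):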
In [82]:
# Row count per genre id over the full `genres` table, not restricted to the
# top movies (presumably one row per movie-genre pair -- confirm), for
# comparison with the top-movie genre totals.
genres.genreid.value_counts()
Out[82]:
In [87]:
# Keep only the rows of `data` whose movie id is among the top movies
# (`@top_movies.index` refers to the Python variable from inside the query string).
cross = data.query('movieid in @top_movies.index')
In [111]:
import numpy as np
import scipy as sp
from scipy import sparse
import matplotlib.pyplot as plt
In [112]:
%matplotlib inline
In [106]:
# Encode user and movie ids as contiguous 0-based integer codes. The arrays of
# unique ids are kept under real names (`userid`, `movieid`) rather than `_`,
# so IPython's `_` last-output shortcut is not shadowed.
useridx, userid = pd.factorize(cross.userid)
movieidx, movieid = pd.factorize(cross.movieid)
# User-by-movie interaction matrix with a 1 for every row of `cross`
# (duplicate (user, movie) pairs, if any, are summed on conversion to CSC).
# The shape is stated explicitly so it is fixed by the unique ids rather than
# inferred from the largest index.
mat = sparse.coo_matrix(
    (np.ones(len(movieidx)), (useridx, movieidx)),
    shape=(len(userid), len(movieid)),
).tocsc()
In [107]:
# (number of distinct users, number of distinct movies) present in `cross`.
mat.shape
Out[107]:
In [119]:
# Density of the interaction matrix: stored non-zeros divided by the total
# number of cells. True (non-integer) division is guaranteed by the
# `from __future__ import division` at the top of the notebook.
mat.nnz / np.prod(mat.shape)
Out[119]:
In [108]:
# Movie-by-movie co-occurrence matrix (sparse). With 0/1 entries in `mat`,
# entry (i, j) counts the users who have both movie i and movie j, and the
# diagonal holds the per-movie user counts.
cooc = mat.T.dot(mat)
In [121]:
# Display the sparse matrix summary (type, shape, number of stored elements).
cooc
Out[121]:
In [167]:
# Genre-by-genre co-occurrence among the top movies: entry (g, h) counts the
# movies tagged with both genre g and genre h; the diagonal holds per-genre
# movie counts. DataFrame.dot gives the same numbers as multiplying the raw
# .values arrays but keeps the genre labels on both axes, so the table is
# readable without manual tick labelling.
top_genres.T.dot(top_genres)
Out[167]:
In [ ]: