In [118]:
from __future__ import division

import pandas as pd

from polara.recommender.data import RecommenderData
from polara.recommender.models import RecommenderModel
from polara.tools.movielens import get_movielens_data
from polara.tools.printing import print_frames

In [4]:
# Load the MovieLens ratings and, because get_genres=True, a second frame of movie genres.
data, genres = get_movielens_data(get_genres=True)

In [9]:
# Preview the first rows of the ratings frame and of the genres frame in one output.
print_frames((data.head(), genres.head()))


Out[9]:
userid movieid rating
0 1 1193 5
1 1 661 3
2 1 914 3
3 1 3408 4
4 1 2355 5
movieid movienm genreid
0 1 Toy Story (1995) Animation
1 1 Toy Story (1995) Children's
2 1 Toy Story (1995) Comedy
3 2 Jumanji (1995) Adventure
4 2 Jumanji (1995) Children's

In [84]:
# Number of ratings per movie, most-rated first, cut to the 200 most-rated titles.
# NOTE(review): the genre tables displayed further down list only 10 titles, so they
# appear to come from an earlier run with a smaller cut-off -- re-run top to bottom.
top_movies = data['movieid'].value_counts().head(200)

In [83]:
# One-hot genre table for the most-rated movies: rows are titles, columns are genres,
# 1 where the (title, genre) pair occurs in `genres`, 0 otherwise.
# pivot() is called with keyword arguments because pandas >= 2.0 rejects positional
# ones; passing values= directly replaces the former ['movieid'] selection on the result.
top_genres = (genres.query('movieid in @top_movies.index')
              .pivot(index='movienm', columns='genreid', values='movieid')
              .notnull().astype(int))
top_genres


Out[83]:
genreid Action Adventure Comedy Drama Fantasy Romance Sci-Fi Thriller War
movienm
American Beauty (1999) 0 0 1 1 0 0 0 0 0
Back to the Future (1985) 0 0 1 0 0 0 1 0 0
Jurassic Park (1993) 1 1 0 0 0 0 1 0 0
Matrix, The (1999) 1 0 0 0 0 0 1 1 0
Saving Private Ryan (1998) 1 0 0 1 0 0 0 0 1
Silence of the Lambs, The (1991) 0 0 0 1 0 0 0 1 0
Star Wars: Episode IV - A New Hope (1977) 1 1 0 0 1 0 1 0 0
Star Wars: Episode V - The Empire Strikes Back (1980) 1 1 0 1 0 0 1 0 1
Star Wars: Episode VI - Return of the Jedi (1983) 1 1 0 0 0 1 1 0 1
Terminator 2: Judgment Day (1991) 1 0 0 0 0 0 1 1 0

In [81]:
# Number of top movies carrying each genre tag, rarest genre first.
top_genres.sum(axis=0).sort_values(ascending=True)


Out[81]:
genreid
Fantasy      1
Romance      1
Comedy       2
Thriller     3
War          3
Adventure    4
Drama        4
Action       7
Sci-Fi       7
dtype: int64

Maybe we should promote Action and Sci-Fi movies more than others?

Possible caveats:

  • a movie may have a dramatic flavor without being tagged as Drama
  • the genre distribution among the top movies may be down to chance, i.e. not statistically significant

In [82]:
# Genre frequencies over the whole catalogue (one count per row of `genres`),
# as a baseline for the genre totals of the top movies.
genres['genreid'].value_counts()


Out[82]:
Drama          1603
Comedy         1200
Action          503
Thriller        492
Romance         471
Horror          343
Adventure       283
Sci-Fi          276
Children's      251
Crime           211
War             143
Documentary     127
Musical         114
Mystery         106
Animation       105
Fantasy          68
Western          68
Film-Noir        44
Name: genreid, dtype: int64

In [87]:
# Keep only the ratings whose movie is among the most-rated ones in `top_movies`.
cross = data.query('movieid in @top_movies.index')

In [111]:
import numpy as np
import scipy as sp
from scipy import sparse
import matplotlib.pyplot as plt

In [112]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

In [106]:
# Map raw user / movie ids onto contiguous 0-based integer codes so they can be used
# as row / column indices; `movieid` keeps the original ids in code order.
# Requires pandas to be imported as `pd` (see the import cell at the top).
useridx, _ = pd.factorize(cross.userid)
movieidx, movieid = pd.factorize(cross.movieid)

# User-by-movie interaction matrix with a 1.0 for every rating row of `cross`
# (repeated (user, movie) pairs, if any, are summed on conversion from COO).
# CSC layout is chosen here; the matrix is later multiplied by its transpose.
mat = sparse.coo_matrix((np.ones(len(movieidx)), (useridx, movieidx))).tocsc()

In [107]:
# (rows, columns) of the interaction matrix: one row per user code, one column per movie code.
mat.shape


Out[107]:
(6039, 200)

In [119]:
# Density of the interaction matrix: stored entries divided by the total number of cells.
# True (float) division here relies on `from __future__ import division` under Python 2.
mat.nnz / np.prod(mat.shape)


Out[119]:
0.24700529889054479

In [108]:
# Movie-by-movie co-occurrence matrix: entry (i, j) sums, over all users,
# the product of columns i and j of `mat`.
cooc = mat.transpose().dot(mat)

In [121]:
# Show the sparse-matrix summary (shape, dtype, number of stored elements, storage format).
cooc


Out[121]:
<200x200 sparse matrix of type '<type 'numpy.float64'>'
	with 40000 stored elements in Compressed Sparse Row format>

In [167]:
# Genre-by-genre co-occurrence counts among the top movies: entry (i, j) is the number
# of titles tagged with both genre i and genre j; the diagonal holds per-genre totals.
# Rows and columns follow the order of top_genres.columns.
# TODO: draw this as a heatmap (plt.pcolormesh) with top_genres.columns as the tick
# labels on both axes, x labels rotated by 90 degrees.
np.dot(top_genres.values.T, top_genres.values)


Out[167]:
array([[7, 4, 0, 2, 1, 1, 6, 2, 3],
       [4, 4, 0, 1, 1, 1, 4, 0, 2],
       [0, 0, 2, 1, 0, 0, 1, 0, 0],
       [2, 1, 1, 4, 0, 0, 1, 1, 2],
       [1, 1, 0, 0, 1, 0, 1, 0, 0],
       [1, 1, 0, 0, 0, 1, 1, 0, 1],
       [6, 4, 1, 1, 1, 1, 7, 2, 2],
       [2, 0, 0, 1, 0, 0, 2, 3, 0],
       [3, 2, 0, 2, 0, 1, 2, 0, 3]])

In [ ]: