notebook.community

Edit and run



In [118]:

    
from __future__ import division

from polara.recommender.data import RecommenderData
from polara.recommender.models import RecommenderModel
from polara.tools.movielens import get_movielens_data
from polara.tools.printing import print_frames



In [4]:

    
data, genres = get_movielens_data(get_genres=True)



In [9]:

    
print_frames((data.head(), genres.head()))









    Out[9]:





     

  
    
      
      userid
      movieid
      rating
    
  
  
    
      0
      1
      1193
      5
    
    
      1
      1
      661
      3
    
    
      2
      1
      914
      3
    
    
      3
      1
      3408
      4
    
    
      4
      1
      2355
      5
    
  

 
 

  
    
      
      movieid
      movienm
      genreid
    
  
  
    
      0
      1
      Toy Story (1995)
      Animation
    
    
      1
      1
      Toy Story (1995)
      Children's
    
    
      2
      1
      Toy Story (1995)
      Comedy
    
    
      3
      2
      Jumanji (1995)
      Adventure
    
    
      4
      2
      Jumanji (1995)
      Children's



In [84]:

    
top_movies = data.movieid.value_counts().head(200)



In [83]:

    
top_genres = (genres.query('movieid in @top_movies.index')
              .pivot('movienm', 'genreid')['movieid']
              .isnull().eq(False).astype(int))
top_genres









    Out[83]:






  
    
      genreid
      Action
      Adventure
      Comedy
      Drama
      Fantasy
      Romance
      Sci-Fi
      Thriller
      War
    
    
      movienm
      
      
      
      
      
      
      
      
      
    
  
  
    
      American Beauty (1999)
      0
      0
      1
      1
      0
      0
      0
      0
      0
    
    
      Back to the Future (1985)
      0
      0
      1
      0
      0
      0
      1
      0
      0
    
    
      Jurassic Park (1993)
      1
      1
      0
      0
      0
      0
      1
      0
      0
    
    
      Matrix, The (1999)
      1
      0
      0
      0
      0
      0
      1
      1
      0
    
    
      Saving Private Ryan (1998)
      1
      0
      0
      1
      0
      0
      0
      0
      1
    
    
      Silence of the Lambs, The (1991)
      0
      0
      0
      1
      0
      0
      0
      1
      0
    
    
      Star Wars: Episode IV - A New Hope (1977)
      1
      1
      0
      0
      1
      0
      1
      0
      0
    
    
      Star Wars: Episode V - The Empire Strikes Back (1980)
      1
      1
      0
      1
      0
      0
      1
      0
      1
    
    
      Star Wars: Episode VI - Return of the Jedi (1983)
      1
      1
      0
      0
      0
      1
      1
      0
      1
    
    
      Terminator 2: Judgment Day (1991)
      1
      0
      0
      0
      0
      0
      1
      1
      0



In [81]:

    
top_genres.sum().sort_values()









    Out[81]:





genreid
Fantasy      1
Romance      1
Comedy       2
Thriller     3
War          3
Adventure    4
Drama        4
Action       7
Sci-Fi       7
dtype: int64

Maybe we should promote movies with Action and Sci-Fi more than others?

Possible catches:

movie has dramatic flavor but is not marked as drama
genre distribution is random, or not statistically significant



In [82]:

    
genres.genreid.value_counts()









    Out[82]:





Drama          1603
Comedy         1200
Action          503
Thriller        492
Romance         471
Horror          343
Adventure       283
Sci-Fi          276
Children's      251
Crime           211
War             143
Documentary     127
Musical         114
Mystery         106
Animation       105
Fantasy          68
Western          68
Film-Noir        44
Name: genreid, dtype: int64



In [87]:

    
cross = data.query('movieid in @top_movies.index')



In [111]:

    
import numpy as np
import scipy as sp
from scipy import sparse
import matplotlib.pyplot as plt



In [112]:

    
%matplotlib inline



In [106]:

    
useridx, _ = pd.factorize(cross.userid)
movieidx, movieid = pd.factorize(cross.movieid)

mat = sp.sparse.coo_matrix((np.ones(len(movieidx),), (useridx, movieidx))).tocsc()



In [107]:

    
mat.shape









    Out[107]:





(6039, 200)



In [119]:

    
mat.nnz / np.prod(mat.shape)









    Out[119]:





0.24700529889054479



In [108]:

    
cooc = mat.T.dot(mat)



In [121]:

    
cooc









    Out[121]:





<200x200 sparse matrix of type '<type 'numpy.float64'>'
	with 40000 stored elements in Compressed Sparse Row format>



In [167]:

    
top_genres.values.T.dot(top_genres.values)
# plt.pcolormesh()
# ax = plt.gca()
# ax.set_xticklabels(top_genres.columns);
# plt.xticks(rotation=90)
# ax.set_yticklabels(top_genres.columns);









    Out[167]:





array([[7, 4, 0, 2, 1, 1, 6, 2, 3],
       [4, 4, 0, 1, 1, 1, 4, 0, 2],
       [0, 0, 2, 1, 0, 0, 1, 0, 0],
       [2, 1, 1, 4, 0, 0, 1, 1, 2],
       [1, 1, 0, 0, 1, 0, 1, 0, 0],
       [1, 1, 0, 0, 0, 1, 1, 0, 1],
       [6, 4, 1, 1, 1, 1, 7, 2, 2],
       [2, 0, 0, 1, 0, 0, 2, 3, 0],
       [3, 2, 0, 2, 0, 1, 2, 0, 3]])



In [ ]:

genreid	Action	Adventure	Comedy	Drama	Fantasy	Romance	Sci-Fi	Thriller	War
movienm
American Beauty (1999)	0	0	1	1	0	0	0	0	0
Back to the Future (1985)	0	0	1	0	0	0	1	0	0
Jurassic Park (1993)	1	1	0	0	0	0	1	0	0
Matrix, The (1999)	1	0	0	0	0	0	1	1	0
Saving Private Ryan (1998)	1	0	0	1	0	0	0	0	1
Silence of the Lambs, The (1991)	0	0	0	1	0	0	0	1	0
Star Wars: Episode IV - A New Hope (1977)	1	1	0	0	1	0	1	0	0
Star Wars: Episode V - The Empire Strikes Back (1980)	1	1	0	1	0	0	1	0	1
Star Wars: Episode VI - Return of the Jedi (1983)	1	1	0	0	0	1	1	0	1
Terminator 2: Judgment Day (1991)	1	0	0	0	0	0	1	1	0