notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity



In [2]:

    
# Load the per-user ratings table and preview its first row
# (the preview shows the columns user_id, movie_id and rating).
ratings = pd.read_csv('ratings.csv')
ratings.head(n=1)









    Out[2]:







  
    
      
      user_id
      movie_id
      rating
    
  
  
    
      0
      1
      31
      2.5



In [3]:

    
# Load the movie metadata table and preview its first row
# (the preview shows the columns movie_id, title and genres).
movies = pd.read_csv('movies.csv')
movies.head(n=1)









    Out[3]:







  
    
      
      movie_id
      title
      genres
    
  
  
    
      0
      1
      Toy Story (1995)
      Adventure|Animation|Children|Comedy|Fantasy



In [4]:

    
# Attach title/genres to every rating row.
# The join key is spelled out: without `on`, pd.merge joins on *every* column
# name the two frames share, so an extra shared column in either CSV would
# silently change (and usually shrink) the result. The default inner join is
# kept, so ratings without a matching movie row are dropped as before.
ratings_df = pd.merge(movies, ratings, on='movie_id')
ratings_df.head()









    Out[4]:







  
    
      
      movie_id
      title
      genres
      user_id
      rating
    
  
  
    
      0
      1
      Toy Story (1995)
      Adventure|Animation|Children|Comedy|Fantasy
      7
      3.0
    
    
      1
      1
      Toy Story (1995)
      Adventure|Animation|Children|Comedy|Fantasy
      9
      4.0
    
    
      2
      1
      Toy Story (1995)
      Adventure|Animation|Children|Comedy|Fantasy
      13
      5.0
    
    
      3
      1
      Toy Story (1995)
      Adventure|Animation|Children|Comedy|Fantasy
      15
      2.0
    
    
      4
      1
      Toy Story (1995)
      Adventure|Animation|Children|Comedy|Fantasy
      19
      3.0



In [5]:

    
# Build the user x title rating matrix: one row per user_id, one column per
# movie title, cell = rating. pivot_table uses its default aggregation (mean)
# when a (user_id, title) pair occurs more than once.
# Movies a user has not rated come out as NaN and are replaced with 0.
ratings_mtx_df = (
    ratings_df
    .pivot_table(values='rating', index='user_id', columns='title')
    .fillna(0)
)



In [6]:

    
# Preview the first five rows of the user x title rating matrix
# (unrated movies appear as 0.0 because the NaNs were filled with 0).
ratings_mtx_df.head()









    Out[6]:







  
    
      title
      "Great Performances" Cats (1998)
      $9.99 (2008)
      'Hellboy': The Seeds of Creation (2004)
      'Neath the Arizona Skies (1934)
      'Round Midnight (1986)
      'Salem's Lot (2004)
      'Til There Was You (1997)
      'burbs, The (1989)
      'night Mother (1986)
      (500) Days of Summer (2009)
      ...
      Zulu (1964)
      Zulu (2013)
      [REC] (2007)
      eXistenZ (1999)
      loudQUIETloud: A Film About the Pixies (2006)
      xXx (2002)
      xXx: State of the Union (2005)
      ¡Three Amigos! (1986)
      À nous la liberté (Freedom for Us) (1931)
      İtirazım Var (2014)
    
    
      user_id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      2
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      3
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      4
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      5
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
  

5 rows × 9064 columns



In [7]:

    
# Column labels of the rating matrix (the movie titles). Used later to map a
# title to its position in the similarity matrix and positions back to titles.
movie_index = ratings_mtx_df.columns



In [8]:

    
# Item-item cosine similarity. The rating matrix is transposed so that each row
# is one movie's vector of user ratings; entry [i, j] of the result is the
# similarity between the movies at positions i and j of the pivot's columns.
# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
# ndarray and works on both old and current pandas versions.
corr_matrix = cosine_similarity(ratings_mtx_df.T.values)
# Zero the diagonal (self-similarity) so that a movie is never picked as its
# own most similar item when the scores are later ranked or thresholded.
np.fill_diagonal(corr_matrix, 0)
# Integer-labelled DataFrame view of the matrix, kept for inspection.
corr = pd.DataFrame(corr_matrix)



In [9]:

    
# Preview the first five rows of the similarity frame. Its row/column labels
# are positional integers (no index was passed to pd.DataFrame), not titles.
corr.head()









    Out[9]:







  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      9054
      9055
      9056
      9057
      9058
      9059
      9060
      9061
      9062
      9063
    
  
  
    
      0
      0.000000
      0.0
      0.0
      0.164399
      0.020391
      0.0
      0.014046
      0.000000
      0.0
      0.003166
      ...
      0.0
      0.0
      0.0
      0.000000
      0.0
      0.000000
      0.0
      0.000000
      0.0
      0.0
    
    
      1
      0.000000
      0.0
      0.0
      0.000000
      0.000000
      0.0
      0.000000
      0.079474
      0.0
      0.156330
      ...
      0.0
      0.0
      0.0
      0.000000
      0.0
      0.013899
      0.0
      0.058218
      0.0
      0.0
    
    
      2
      0.000000
      0.0
      0.0
      0.000000
      0.000000
      1.0
      0.000000
      0.217357
      0.0
      0.000000
      ...
      0.0
      0.0
      0.0
      0.000000
      0.0
      0.000000
      0.0
      0.000000
      0.0
      0.0
    
    
      3
      0.164399
      0.0
      0.0
      0.000000
      0.124035
      0.0
      0.085436
      0.000000
      0.0
      0.019259
      ...
      0.0
      0.0
      0.0
      0.000000
      0.0
      0.000000
      0.0
      0.000000
      0.0
      0.0
    
    
      4
      0.020391
      0.0
      0.0
      0.124035
      0.000000
      0.0
      0.010597
      0.143786
      0.0
      0.136163
      ...
      0.0
      0.0
      0.0
      0.121567
      0.0
      0.000000
      0.0
      0.000000
      0.0
      0.0
    
  

5 rows × 9064 columns



In [10]:

    
# Position of the query movie among the rating-matrix columns; the same
# position addresses its row in the similarity matrix.
# list.index raises ValueError if the title is not present.
inp = movie_index.tolist().index('Inception (2010)')
inp









    Out[10]:





4013



In [11]:

    
# Similarity row of the query movie: P[j] is its similarity to the movie at
# position j of movie_index (P[inp] itself is 0 because the diagonal was zeroed).
P = corr_matrix[inp]



In [12]:

    
# Highest similarity any other movie has to the query movie; used to choose
# the threshold band in the next cell.
P.max()









    Out[12]:





0.65699917684681852



In [13]:

    
# Titles whose similarity to the query movie lies strictly between 0.5 and 0.7
# (both bounds exclusive).
movie_index[np.logical_and(P > 0.5, P < 0.7)].tolist()









    Out[13]:





['Avatar (2009)',
 'Avengers, The (2012)',
 'Dark Knight Rises, The (2012)',
 'Dark Knight, The (2008)',
 'District 9 (2009)',
 'Django Unchained (2012)',
 'Inglourious Basterds (2009)',
 'Interstellar (2014)',
 'Iron Man (2008)',
 'Sherlock Holmes (2009)',
 'Shutter Island (2010)',
 'Social Network, The (2010)',
 'Star Trek (2009)',
 'Up (2009)',
 'WALL·E (2008)']

Pearson Similarity



In [14]:

    
# Pearson correlation between movies. np.corrcoef treats each row of its input
# as one variable, hence the transpose (one row per movie title).
# NOTE: this rebinds corr_matrix and corr, replacing the cosine-similarity
# results computed earlier; cells below now read the Pearson values.
corr_matrix = np.corrcoef(ratings_mtx_df.T)
np.fill_diagonal(corr_matrix, 0 )  # zero the self-correlation entries, as was done for the cosine matrix
corr = pd.DataFrame(corr_matrix)  # integer-labelled view kept for inspection



In [15]:

    
# Preview the first five rows of the Pearson correlation frame
# (rows/columns are labelled by position, not by title).
corr.head()









    Out[15]:







  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      9054
      9055
      9056
      9057
      9058
      9059
      9060
      9061
      9062
      9063
    
  
  
    
      0
      0.000000
      -0.002894
      -0.001718
      0.162966
      0.018512
      -0.001718
      0.011006
      -0.007311
      -0.002980
      -0.008271
      ...
      -0.002431
      -0.001718
      -0.002929
      -0.008014
      -0.001718
      -0.007546
      -0.001718
      -0.009406
      -0.001718
      -0.001718
    
    
      1
      -0.002894
      0.000000
      -0.002514
      -0.002514
      -0.002807
      -0.002514
      -0.004518
      0.070014
      -0.004360
      0.144961
      ...
      -0.003558
      -0.002514
      -0.004286
      -0.011726
      -0.002514
      0.003087
      -0.002514
      0.045869
      -0.002514
      -0.002514
    
    
      2
      -0.001718
      -0.002514
      0.000000
      -0.001493
      -0.001666
      1.000000
      -0.002682
      0.214088
      -0.002589
      -0.010031
      ...
      -0.002112
      -0.001493
      -0.002545
      -0.006962
      -0.001493
      -0.006556
      -0.001493
      -0.008172
      -0.001493
      -0.001493
    
    
      3
      0.162966
      -0.002514
      -0.001493
      0.000000
      0.122576
      -0.001493
      0.083023
      -0.006351
      -0.002589
      0.009882
      ...
      -0.002112
      -0.001493
      -0.002545
      -0.006962
      -0.001493
      -0.006556
      -0.001493
      -0.008172
      -0.001493
      -0.001493
    
    
      4
      0.018512
      -0.002807
      -0.001666
      0.122576
      0.000000
      -0.001666
      0.007638
      0.138760
      -0.002891
      0.129609
      ...
      -0.002358
      -0.001666
      -0.002841
      0.115866
      -0.001666
      -0.007320
      -0.001666
      -0.009124
      -0.001666
      -0.001666
    
  

5 rows × 9064 columns



In [16]:

    
# Row for the same query position `inp` (set when the title was looked up),
# now taken from the Pearson matrix that corr_matrix was rebound to.
P = corr_matrix[inp]



In [17]:

    
# Highest Pearson correlation any other movie has with the query movie; used
# to choose the threshold band in the next cell.
P.max()









    Out[17]:





0.58951497276264542



In [18]:

    
# Titles whose Pearson correlation with the query movie lies strictly between
# 0.45 and 0.6 (both bounds exclusive).
movie_index[np.logical_and(P > 0.45, P < 0.6)].tolist()









    Out[18]:





['Avatar (2009)',
 'Avengers, The (2012)',
 'Dark Knight Rises, The (2012)',
 'Dark Knight, The (2008)',
 'District 9 (2009)',
 'Django Unchained (2012)',
 'Inglourious Basterds (2009)',
 'Interstellar (2014)',
 'Iron Man (2008)',
 'Sherlock Holmes (2009)',
 'Sherlock Holmes: A Game of Shadows (2011)',
 'Shutter Island (2010)',
 'Social Network, The (2010)',
 'Star Trek (2009)',
 'Up (2009)',
 'WALL·E (2008)']

	movie_id	title	genres	user_id	rating
0	1	Toy Story (1995)	Adventure\|Animation\|Children\|Comedy\|Fantasy	7	3.0
1	1	Toy Story (1995)	Adventure\|Animation\|Children\|Comedy\|Fantasy	9	4.0
2	1	Toy Story (1995)	Adventure\|Animation\|Children\|Comedy\|Fantasy	13	5.0
3	1	Toy Story (1995)	Adventure\|Animation\|Children\|Comedy\|Fantasy	15	2.0
4	1	Toy Story (1995)	Adventure\|Animation\|Children\|Comedy\|Fantasy	19	3.0

title	"Great Performances" Cats (1998)	$9.99 (2008)	'Hellboy': The Seeds of Creation (2004)	'Neath the Arizona Skies (1934)	'Round Midnight (1986)	'Salem's Lot (2004)	'Til There Was You (1997)	'burbs, The (1989)	'night Mother (1986)	(500) Days of Summer (2009)	...	Zulu (1964)	Zulu (2013)	[REC] (2007)	eXistenZ (1999)	loudQUIETloud: A Film About the Pixies (2006)	xXx (2002)	xXx: State of the Union (2005)	¡Three Amigos! (1986)	À nous la liberté (Freedom for Us) (1931)	İtirazım Var (2014)
user_id
1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
4	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
5	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

	0	3	4	5	6	7	9	...	9057	9059	9061
0	0.000000	0.164399	0.020391	0.0	0.014046	0.000000	0.003166	...	0.000000	0.000000	0.000000
1	0.000000	0.000000	0.000000	0.0	0.000000	0.079474	0.156330	...	0.000000	0.013899	0.058218
2	0.000000	0.000000	0.000000	1.0	0.000000	0.217357	0.000000	...	0.000000	0.000000	0.000000
3	0.164399	0.000000	0.124035	0.0	0.085436	0.000000	0.019259	...	0.000000	0.000000	0.000000
4	0.020391	0.124035	0.000000	0.0	0.010597	0.143786	0.136163	...	0.121567	0.000000	0.000000

	0	1	2	3	4	5	6	7	8	9	...	9054	9055	9056	9057	9058	9059	9060	9061	9062	9063
0	0.000000	-0.002894	-0.001718	0.162966	0.018512	-0.001718	0.011006	-0.007311	-0.002980	-0.008271	...	-0.002431	-0.001718	-0.002929	-0.008014	-0.001718	-0.007546	-0.001718	-0.009406	-0.001718	-0.001718
1	-0.002894	0.000000	-0.002514	-0.002514	-0.002807	-0.002514	-0.004518	0.070014	-0.004360	0.144961	...	-0.003558	-0.002514	-0.004286	-0.011726	-0.002514	0.003087	-0.002514	0.045869	-0.002514	-0.002514
2	-0.001718	-0.002514	0.000000	-0.001493	-0.001666	1.000000	-0.002682	0.214088	-0.002589	-0.010031	...	-0.002112	-0.001493	-0.002545	-0.006962	-0.001493	-0.006556	-0.001493	-0.008172	-0.001493	-0.001493
3	0.162966	-0.002514	-0.001493	0.000000	0.122576	-0.001493	0.083023	-0.006351	-0.002589	0.009882	...	-0.002112	-0.001493	-0.002545	-0.006962	-0.001493	-0.006556	-0.001493	-0.008172	-0.001493	-0.001493
4	0.018512	-0.002807	-0.001666	0.122576	0.000000	-0.001666	0.007638	0.138760	-0.002891	0.129609	...	-0.002358	-0.001666	-0.002841	0.115866	-0.001666	-0.007320	-0.001666	-0.009124	-0.001666	-0.001666