notebook.community

Edit and run



In [38]:

    
import numpy as np 
import pandas as pd 
from sklearn.metrics.pairwise import cosine_similarity



In [39]:

    
u_cols = ['user_id','sex','age','occupation', 'zip_code']
r_cols = ['user_id', 'movie_id', 'rating']
m_cols = ['movie_id', 'title', 'genres']

users=pd.read_csv("users.dat",sep='::',names=u_cols, engine='python')
ratings = pd.read_csv('ratings.dat',sep='::', names=r_cols, engine='python', usecols=range(3))
movies = pd.read_csv('movies.dat', sep='::', names=m_cols, engine='python')
#combined=pd.merge(movies,ratings)
#df=pd.merge(combined,users)



In [40]:

    
ratings.head(1)









    Out[40]:







  
    
      
      user_id
      movie_id
      rating
    
  
  
    
      0
      1
      1193
      5



In [41]:

    
movies.head(1)









    Out[41]:







  
    
      
      movie_id
      title
      genres
    
  
  
    
      0
      1
      Toy Story (1995)
      Animation|Children's|Comedy



In [42]:

    
ratings_df = pd.merge(movies,ratings)
ratings_df.head()









    Out[42]:







  
    
      
      movie_id
      title
      genres
      user_id
      rating
    
  
  
    
      0
      1
      Toy Story (1995)
      Animation|Children's|Comedy
      1
      5
    
    
      1
      1
      Toy Story (1995)
      Animation|Children's|Comedy
      6
      4
    
    
      2
      1
      Toy Story (1995)
      Animation|Children's|Comedy
      8
      4
    
    
      3
      1
      Toy Story (1995)
      Animation|Children's|Comedy
      9
      5
    
    
      4
      1
      Toy Story (1995)
      Animation|Children's|Comedy
      10
      5



In [43]:

    
ratings_mtx_df = ratings_df.pivot_table(index='user_id', columns='title',values='rating')  
ratings_mtx_df.fillna(0, inplace=True)



In [44]:

    
ratings_mtx_df.head()









    Out[44]:







  
    
      title
      $1,000,000 Duck (1971)
      'Night Mother (1986)
      'Til There Was You (1997)
      'burbs, The (1989)
      ...And Justice for All (1979)
      1-900 (1994)
      10 Things I Hate About You (1999)
      101 Dalmatians (1961)
      101 Dalmatians (1996)
      12 Angry Men (1957)
      ...
      Young Poisoner's Handbook, The (1995)
      Young Sherlock Holmes (1985)
      Young and Innocent (1937)
      Your Friends and Neighbors (1998)
      Zachariah (1971)
      Zed & Two Noughts, A (1985)
      Zero Effect (1998)
      Zero Kelvin (Kjærlighetens kjøtere) (1995)
      Zeus and Roxanne (1997)
      eXistenZ (1999)
    
    
      user_id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      2
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      3
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      4
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      5
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
  

5 rows × 3706 columns



In [45]:

    
movie_index = ratings_mtx_df.columns



In [46]:

    
corr_matrix = cosine_similarity(ratings_mtx_df.T.as_matrix())
np.fill_diagonal(corr_matrix, 0 )#Filling diagonals with 0s for future use when sorting is done
corr= pd.DataFrame(corr_matrix)



In [47]:

    
corr.head()









    Out[47]:







  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      3696
      3697
      3698
      3699
      3700
      3701
      3702
      3703
      3704
      3705
    
  
  
    
      0
      0.000000
      0.072357
      0.037011
      0.079291
      0.060838
      0.00000
      0.058619
      0.189965
      0.172254
      0.094785
      ...
      0.038725
      0.076474
      0.000000
      0.044074
      0.0
      0.045280
      0.039395
      0.000000
      0.120242
      0.027003
    
    
      1
      0.072357
      0.000000
      0.115290
      0.115545
      0.159526
      0.00000
      0.076798
      0.147437
      0.095922
      0.111413
      ...
      0.053010
      0.087828
      0.063758
      0.135962
      0.0
      0.091150
      0.074787
      0.000000
      0.000000
      0.077807
    
    
      2
      0.037011
      0.115290
      0.000000
      0.098756
      0.066301
      0.08025
      0.127895
      0.112654
      0.125670
      0.079115
      ...
      0.029200
      0.062893
      0.000000
      0.079187
      0.0
      0.022594
      0.079261
      0.000000
      0.047526
      0.063284
    
    
      3
      0.079291
      0.115545
      0.098756
      0.000000
      0.143620
      0.00000
      0.192191
      0.246927
      0.175885
      0.170719
      ...
      0.113386
      0.207897
      0.019962
      0.138064
      0.0
      0.055704
      0.161174
      0.000000
      0.033567
      0.110525
    
    
      4
      0.060838
      0.159526
      0.066301
      0.143620
      0.000000
      0.00000
      0.075093
      0.194154
      0.116379
      0.205486
      ...
      0.089998
      0.153006
      0.067009
      0.109029
      0.0
      0.086080
      0.110867
      0.074317
      0.000000
      0.111040
    
  

5 rows × 3706 columns



In [48]:

    
inp = list(movie_index).index('Final Destination (2000)')
inp









    Out[48]:





1158



In [49]:

    
P = corr_matrix[inp]



In [50]:

    
max(P)









    Out[50]:





0.40850620631042028



In [51]:

    
list(movie_index[(P>0.34) & (P<0.41)])









    Out[51]:





['American Psycho (2000)',
 'Frequency (2000)',
 'Pitch Black (2000)',
 'Scream 3 (2000)',
 'Skulls, The (2000)']

Pearson Similarity



In [52]:

    
corr_matrix = np.corrcoef(ratings_mtx_df.T)
np.fill_diagonal(corr_matrix, 0 )
corr = pd.DataFrame(corr_matrix)



In [53]:

    
corr.head()









    Out[53]:







  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      3696
      3697
      3698
      3699
      3700
      3701
      3702
      3703
      3704
      3705
    
  
  
    
      0
      0.000000
      0.065338
      0.030805
      0.065478
      0.048708
      -0.001319
      0.036612
      0.176528
      0.159973
      0.075665
      ...
      0.030758
      0.060574
      -0.002833
      0.035056
      -0.001237
      0.040590
      0.024165
      -0.001332
      0.116574
      0.009243
    
    
      1
      0.065338
      0.000000
      0.107374
      0.096778
      0.144480
      -0.001834
      0.046122
      0.123378
      0.074706
      0.083988
      ...
      0.042061
      0.065333
      0.060202
      0.124593
      -0.001719
      0.085001
      0.054343
      -0.001852
      -0.005825
      0.054699
    
    
      2
      0.030805
      0.107374
      0.000000
      0.082706
      0.051965
      0.079011
      0.105672
      0.091422
      0.108952
      0.054821
      ...
      0.019685
      0.043294
      -0.003341
      0.068935
      -0.001459
      0.016935
      0.062262
      -0.001571
      0.042842
      0.043483
    
    
      3
      0.065478
      0.096778
      0.082706
      0.000000
      0.110790
      -0.003821
      0.133881
      0.198168
      0.134041
      0.113112
      ...
      0.092597
      0.165666
      0.012226
      0.114837
      -0.003582
      0.042862
      0.121617
      -0.003858
      0.022249
      0.062471
    
    
      4
      0.048708
      0.144480
      0.051965
      0.110790
      0.000000
      -0.003203
      0.018614
      0.151020
      0.078917
      0.160556
      ...
      0.071819
      0.115402
      0.061253
      0.088616
      -0.003002
      0.075720
      0.075804
      0.072283
      -0.010171
      0.071000
    
  

5 rows × 3706 columns



In [54]:

    
P = corr_matrix[inp]



In [55]:

    
max(P)









    Out[55]:





0.35259154120260761



In [56]:

    
list(movie_index[(P>0.28) & (P<0.35)])









    Out[56]:





['American Psycho (2000)',
 'Frequency (2000)',
 'Scream 3 (2000)',
 'Skulls, The (2000)']

	movie_id	title	genres	user_id	rating
0	1	Toy Story (1995)	Animation\|Children's\|Comedy	1	5
1	1	Toy Story (1995)	Animation\|Children's\|Comedy	6	4
2	1	Toy Story (1995)	Animation\|Children's\|Comedy	8	4
3	1	Toy Story (1995)	Animation\|Children's\|Comedy	9	5
4	1	Toy Story (1995)	Animation\|Children's\|Comedy	10	5

title	$1,000,000 Duck (1971)	'Night Mother (1986)	'Til There Was You (1997)	'burbs, The (1989)	...And Justice for All (1979)	1-900 (1994)	10 Things I Hate About You (1999)	101 Dalmatians (1961)	101 Dalmatians (1996)	12 Angry Men (1957)	...	Young Poisoner's Handbook, The (1995)	Young Sherlock Holmes (1985)	Young and Innocent (1937)	Your Friends and Neighbors (1998)	Zachariah (1971)	Zed & Two Noughts, A (1985)	Zero Effect (1998)	Zero Kelvin (Kjærlighetens kjøtere) (1995)	Zeus and Roxanne (1997)	eXistenZ (1999)
user_id
1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
4	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
5	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

	0	1	2	3	4	5	6	7	8	9	...	3696	3697	3698	3699	3701	3702	3703	3704	3705
0	0.000000	0.072357	0.037011	0.079291	0.060838	0.00000	0.058619	0.189965	0.172254	0.094785	...	0.038725	0.076474	0.000000	0.044074	0.045280	0.039395	0.000000	0.120242	0.027003
1	0.072357	0.000000	0.115290	0.115545	0.159526	0.00000	0.076798	0.147437	0.095922	0.111413	...	0.053010	0.087828	0.063758	0.135962	0.091150	0.074787	0.000000	0.000000	0.077807
2	0.037011	0.115290	0.000000	0.098756	0.066301	0.08025	0.127895	0.112654	0.125670	0.079115	...	0.029200	0.062893	0.000000	0.079187	0.022594	0.079261	0.000000	0.047526	0.063284
3	0.079291	0.115545	0.098756	0.000000	0.143620	0.00000	0.192191	0.246927	0.175885	0.170719	...	0.113386	0.207897	0.019962	0.138064	0.055704	0.161174	0.000000	0.033567	0.110525
4	0.060838	0.159526	0.066301	0.143620	0.000000	0.00000	0.075093	0.194154	0.116379	0.205486	...	0.089998	0.153006	0.067009	0.109029	0.086080	0.110867	0.074317	0.000000	0.111040

	0	1	2	3	4	5	6	7	8	9	...	3696	3697	3698	3699	3700	3701	3702	3703	3704	3705
0	0.000000	0.065338	0.030805	0.065478	0.048708	-0.001319	0.036612	0.176528	0.159973	0.075665	...	0.030758	0.060574	-0.002833	0.035056	-0.001237	0.040590	0.024165	-0.001332	0.116574	0.009243
1	0.065338	0.000000	0.107374	0.096778	0.144480	-0.001834	0.046122	0.123378	0.074706	0.083988	...	0.042061	0.065333	0.060202	0.124593	-0.001719	0.085001	0.054343	-0.001852	-0.005825	0.054699
2	0.030805	0.107374	0.000000	0.082706	0.051965	0.079011	0.105672	0.091422	0.108952	0.054821	...	0.019685	0.043294	-0.003341	0.068935	-0.001459	0.016935	0.062262	-0.001571	0.042842	0.043483
3	0.065478	0.096778	0.082706	0.000000	0.110790	-0.003821	0.133881	0.198168	0.134041	0.113112	...	0.092597	0.165666	0.012226	0.114837	-0.003582	0.042862	0.121617	-0.003858	0.022249	0.062471
4	0.048708	0.144480	0.051965	0.110790	0.000000	-0.003203	0.018614	0.151020	0.078917	0.160556	...	0.071819	0.115402	0.061253	0.088616	-0.003002	0.075720	0.075804	0.072283	-0.010171	0.071000