notebook.community

Edit and run



In [103]:

    
import numpy as np 
import pandas as pd 
from sklearn.metrics.pairwise import cosine_similarity



In [104]:

    
# Column names for the two input files; only the leading columns of each file are used.
r_cols = ['user_id', 'movie_id', 'rating']
m_cols = ['movie_id', 'title']

# u.data is tab-separated; keep its first three columns.
ratings = pd.read_csv('u.data', sep='\t', names=r_cols, usecols=range(3))
# u.item is pipe-separated; keep its first two columns.
# The titles contain non-ASCII characters (e.g. "Á köldum klaka"); the file is
# presumably the ISO-8859-1 MovieLens release, so decode it explicitly instead of
# relying on the default codec (UTF-8 in Python 3), which would fail on those bytes.
movies = pd.read_csv('u.item', sep='|', names=m_cols, usecols=range(2), encoding='latin-1')



In [105]:

    
# Show the first rating record to confirm the column layout.
ratings.head(n=1)









    Out[105]:







  
    
      
      user_id
      movie_id
      rating
    
  
  
    
      0
      196
      242
      3



In [106]:

    
# Show the first movie record to confirm the column layout.
movies.head(n=1)









    Out[106]:







  
    
      
      movie_id
      title
    
  
  
    
      0
      1
      Toy Story (1995)



In [107]:

    
# Attach each rating to its movie title; 'movie_id' is the only column the two frames share.
ratings_df = movies.merge(ratings, on='movie_id')
ratings_df.head(n=5)









    Out[107]:







  
    
      
      movie_id
      title
      user_id
      rating
    
  
  
    
      0
      1
      Toy Story (1995)
      308
      4
    
    
      1
      1
      Toy Story (1995)
      287
      5
    
    
      2
      1
      Toy Story (1995)
      148
      4
    
    
      3
      1
      Toy Story (1995)
      280
      4
    
    
      4
      1
      Toy Story (1995)
      66
      3



In [108]:

    
# Build the user x title rating matrix; user/title pairs with no rating become 0.
ratings_mtx_df = (
    ratings_df
    .pivot_table(index='user_id', columns='title', values='rating')
    .fillna(0)
)



In [109]:

    
# Preview the first five users of the user x title matrix.
ratings_mtx_df.head(n=5)









    Out[109]:







  
    
      title
      'Til There Was You (1997)
      1-900 (1994)
      101 Dalmatians (1996)
      12 Angry Men (1957)
      187 (1997)
      2 Days in the Valley (1996)
      20,000 Leagues Under the Sea (1954)
      2001: A Space Odyssey (1968)
      3 Ninjas: High Noon At Mega Mountain (1998)
      39 Steps, The (1935)
      ...
      Yankee Zulu (1994)
      Year of the Horse (1997)
      You So Crazy (1994)
      Young Frankenstein (1974)
      Young Guns (1988)
      Young Guns II (1990)
      Young Poisoner's Handbook, The (1995)
      Zeus and Roxanne (1997)
      unknown
      Á köldum klaka (Cold Fever) (1994)
    
    
      user_id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1
      0.0
      0.0
      2.0
      5.0
      0.0
      0.0
      3.0
      4.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      5.0
      3.0
      0.0
      0.0
      0.0
      4.0
      0.0
    
    
      2
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      1.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      3
      0.0
      0.0
      0.0
      0.0
      2.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      4
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      5
      0.0
      0.0
      2.0
      0.0
      0.0
      0.0
      0.0
      4.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      4.0
      0.0
      0.0
      0.0
      0.0
      4.0
      0.0
    
  

5 rows × 1664 columns



In [110]:

    
# Keep the title labels of the rating matrix: the similarity matrices built below are
# plain arrays indexed by position, so this maps a position back to a movie title.
movie_index = ratings_mtx_df.columns

Cosine Similarity

https://stackoverflow.com/questions/35281691/scikit-cosine-similarity-vs-pairwise-distances



In [111]:

    
# Item-item cosine similarity: transpose so that each row is one title's vector of
# user ratings. `.values` replaces `DataFrame.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in pandas 1.0.
corr_matrix = cosine_similarity(ratings_mtx_df.T.values)
# Zero the diagonal so a movie's similarity to itself does not dominate later ranking/sorting.
np.fill_diagonal(corr_matrix, 0)
corr = pd.DataFrame(corr_matrix)



In [112]:

    
# Preview the first five rows of the cosine similarity matrix.
corr.head(n=5)









    Out[112]:







  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      1654
      1655
      1656
      1657
      1658
      1659
      1660
      1661
      1662
      1663
    
  
  
    
      0
      0.000000
      0.000000
      0.024561
      0.099561
      0.185236
      0.159265
      0.000000
      0.052203
      0.000000
      0.033326
      ...
      0.000000
      0.000000
      0.000000
      0.027774
      0.118840
      0.142315
      0.029070
      0.000000
      0.110208
      0.000000
    
    
      1
      0.000000
      0.000000
      0.014139
      0.009294
      0.007354
      0.004702
      0.010055
      0.067038
      0.000000
      0.000000
      ...
      0.152499
      0.015484
      0.000000
      0.069284
      0.018243
      0.023408
      0.006694
      0.079640
      0.042295
      0.000000
    
    
      2
      0.024561
      0.014139
      0.000000
      0.167006
      0.061105
      0.143878
      0.203781
      0.225803
      0.027642
      0.092337
      ...
      0.000000
      0.021965
      0.030905
      0.274877
      0.204267
      0.101199
      0.056976
      0.172155
      0.045714
      0.000000
    
    
      3
      0.099561
      0.009294
      0.167006
      0.000000
      0.056822
      0.167235
      0.304078
      0.422506
      0.072682
      0.394854
      ...
      0.060946
      0.016502
      0.000000
      0.403270
      0.259436
      0.145519
      0.105226
      0.038901
      0.060101
      0.081261
    
    
      4
      0.185236
      0.007354
      0.061105
      0.056822
      0.000000
      0.132327
      0.042928
      0.065060
      0.043133
      0.027300
      ...
      0.000000
      0.141997
      0.000000
      0.068257
      0.067786
      0.091293
      0.099490
      0.025184
      0.142667
      0.096449
    
  

5 rows × 1664 columns



In [113]:

    
# Positional index of the reference movie among the matrix columns.
# Index.get_loc does the label lookup directly instead of materialising the whole
# index as a Python list; the titles are unique pivot columns, so it returns one integer.
inp = movie_index.get_loc('Shawshank Redemption, The (1994)')
inp









    Out[113]:





1317



In [114]:

    
# Row of similarities between the reference movie and every other title.
P = corr_matrix[inp, :]
P









    Out[114]:





array([ 0.06982111,  0.02813572,  0.24355218, ...,  0.03441109,
        0.05848069,  0.05271378])



In [115]:

    
# Highest similarity to the reference movie. ndarray.max() is vectorised and
# propagates NaN, whereas the builtin max() loops in Python and is order-dependent with NaN.
P.max()









    Out[115]:





0.67020385185286324



In [116]:

    
# Titles whose cosine similarity to the reference movie lies strictly between 0.6 and 0.7.
in_band = (P > 0.6) & (P < 0.7)
movie_index[in_band].tolist()









    Out[116]:





['Apollo 13 (1995)',
 'Back to the Future (1985)',
 'Braveheart (1995)',
 'Dead Poets Society (1989)',
 'Empire Strikes Back, The (1980)',
 'Forrest Gump (1994)',
 'Fugitive, The (1993)',
 "One Flew Over the Cuckoo's Nest (1975)",
 'Pulp Fiction (1994)',
 'Raiders of the Lost Ark (1981)',
 "Schindler's List (1993)",
 'Silence of the Lambs, The (1991)',
 'Usual Suspects, The (1995)',
 'When Harry Met Sally... (1989)']

Pearson Similarity



In [117]:

    
# Pearson correlation between titles: np.corrcoef treats each row as one variable,
# so feed it the transposed (title x user) matrix.
# Note: this rebinds `corr_matrix` and `corr`, replacing the cosine results.
title_by_user = ratings_mtx_df.T
corr_matrix = np.corrcoef(title_by_user)
# Zero out each title's correlation with itself.
corr_matrix[np.diag_indices_from(corr_matrix)] = 0
corr = pd.DataFrame(data=corr_matrix)



In [118]:

    
# Preview the first five rows of the Pearson correlation matrix.
corr.head(n=5)









    Out[118]:







  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      1654
      1655
      1656
      1657
      1658
      1659
      1660
      1661
      1662
      1663
    
  
  
    
      0
      0.000000
      -0.005884
      -0.004595
      0.072113
      0.171574
      0.138981
      -0.025561
      0.007270
      -0.006640
      0.011552
      ...
      -0.002963
      -0.006937
      -0.002963
      -0.014489
      0.095510
      0.127067
      0.011604
      -0.006721
      0.102622
      -0.002963
    
    
      1
      -0.005884
      0.000000
      -0.006827
      -0.014922
      -0.005330
      -0.015494
      -0.007714
      0.039863
      -0.004723
      -0.016332
      ...
      0.150791
      0.010627
      -0.002108
      0.045195
      -0.002247
      0.010526
      -0.006079
      0.075243
      0.036506
      -0.002108
    
    
      2
      -0.004595
      -0.006827
      0.000000
      0.059246
      -0.001097
      0.052877
      0.128745
      0.078260
      0.004667
      0.015504
      ...
      -0.010968
      -0.002430
      0.021658
      0.155450
      0.115195
      0.039172
      -0.005917
      0.157268
      0.016893
      -0.010968
    
    
      3
      0.072113
      -0.014922
      0.059246
      0.000000
      -0.014343
      0.066339
      0.230274
      0.298716
      0.049985
      0.339195
      ...
      0.052798
      -0.011624
      -0.012541
      0.290188
      0.164959
      0.079341
      0.038111
      0.013353
      0.028600
      0.074578
    
    
      4
      0.171574
      -0.005330
      -0.001097
      -0.014343
      0.000000
      0.078770
      -0.010333
      -0.039939
      0.029597
      -0.021414
      ...
      -0.006468
      0.130047
      -0.006468
      -0.021873
      0.006811
      0.053843
      0.063789
      0.011076
      0.127479
      0.091915
    
  

5 rows × 1664 columns



In [119]:

    
# Row of Pearson correlations between the reference movie and every other title.
P = corr_matrix[inp]
# Echo the vector: the recorded output of this cell is the array itself, which a bare
# assignment would not display.
P









    Out[119]:





array([ 0.02495611, -0.00799703,  0.08939762, ..., -0.00640164,
        0.00965978,  0.04176051])



In [120]:

    
# Highest Pearson correlation with the reference movie. ndarray.max() is vectorised and
# propagates NaN, whereas the builtin max() loops in Python and is order-dependent with NaN.
P.max()









    Out[120]:





0.52681432410411766



In [121]:

    
# Titles whose Pearson correlation with the reference movie lies strictly between 0.45 and 0.7.
in_band = (P > 0.45) & (P < 0.7)
movie_index[in_band].tolist()









    Out[121]:





['Apollo 13 (1995)',
 'Braveheart (1995)',
 'Dead Poets Society (1989)',
 'Forrest Gump (1994)',
 "One Flew Over the Cuckoo's Nest (1975)",
 'Pulp Fiction (1994)',
 'Quiz Show (1994)',
 'Raiders of the Lost Ark (1981)',
 "Schindler's List (1993)",
 'Seven (Se7en) (1995)',
 'Silence of the Lambs, The (1991)',
 'Usual Suspects, The (1995)']

	movie_id	title	user_id	rating
0	1	Toy Story (1995)	308	4
1	1	Toy Story (1995)	287	5
2	1	Toy Story (1995)	148	4
3	1	Toy Story (1995)	280	4
4	1	Toy Story (1995)	66	3

title	'Til There Was You (1997)	1-900 (1994)	101 Dalmatians (1996)	12 Angry Men (1957)	187 (1997)	2 Days in the Valley (1996)	20,000 Leagues Under the Sea (1954)	2001: A Space Odyssey (1968)	3 Ninjas: High Noon At Mega Mountain (1998)	39 Steps, The (1935)	...	Yankee Zulu (1994)	Year of the Horse (1997)	You So Crazy (1994)	Young Frankenstein (1974)	Young Guns (1988)	Young Guns II (1990)	Young Poisoner's Handbook, The (1995)	Zeus and Roxanne (1997)	unknown	Á köldum klaka (Cold Fever) (1994)
user_id
1	0.0	0.0	2.0	5.0	0.0	0.0	3.0	4.0	0.0	0.0	...	0.0	0.0	0.0	5.0	3.0	0.0	0.0	0.0	4.0	0.0
2	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3	0.0	0.0	0.0	0.0	2.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
4	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
5	0.0	0.0	2.0	0.0	0.0	0.0	0.0	4.0	0.0	0.0	...	0.0	0.0	0.0	4.0	0.0	0.0	0.0	0.0	4.0	0.0

	0	1	2	3	4	5	6	7	8	9	...	1654	1655	1656	1657	1658	1659	1660	1661	1662	1663
0	0.000000	0.000000	0.024561	0.099561	0.185236	0.159265	0.000000	0.052203	0.000000	0.033326	...	0.000000	0.000000	0.000000	0.027774	0.118840	0.142315	0.029070	0.000000	0.110208	0.000000
1	0.000000	0.000000	0.014139	0.009294	0.007354	0.004702	0.010055	0.067038	0.000000	0.000000	...	0.152499	0.015484	0.000000	0.069284	0.018243	0.023408	0.006694	0.079640	0.042295	0.000000
2	0.024561	0.014139	0.000000	0.167006	0.061105	0.143878	0.203781	0.225803	0.027642	0.092337	...	0.000000	0.021965	0.030905	0.274877	0.204267	0.101199	0.056976	0.172155	0.045714	0.000000
3	0.099561	0.009294	0.167006	0.000000	0.056822	0.167235	0.304078	0.422506	0.072682	0.394854	...	0.060946	0.016502	0.000000	0.403270	0.259436	0.145519	0.105226	0.038901	0.060101	0.081261
4	0.185236	0.007354	0.061105	0.056822	0.000000	0.132327	0.042928	0.065060	0.043133	0.027300	...	0.000000	0.141997	0.000000	0.068257	0.067786	0.091293	0.099490	0.025184	0.142667	0.096449

	0	1	2	3	4	5	6	7	8	9	...	1654	1655	1656	1657	1658	1659	1660	1661	1662	1663
0	0.000000	-0.005884	-0.004595	0.072113	0.171574	0.138981	-0.025561	0.007270	-0.006640	0.011552	...	-0.002963	-0.006937	-0.002963	-0.014489	0.095510	0.127067	0.011604	-0.006721	0.102622	-0.002963
1	-0.005884	0.000000	-0.006827	-0.014922	-0.005330	-0.015494	-0.007714	0.039863	-0.004723	-0.016332	...	0.150791	0.010627	-0.002108	0.045195	-0.002247	0.010526	-0.006079	0.075243	0.036506	-0.002108
2	-0.004595	-0.006827	0.000000	0.059246	-0.001097	0.052877	0.128745	0.078260	0.004667	0.015504	...	-0.010968	-0.002430	0.021658	0.155450	0.115195	0.039172	-0.005917	0.157268	0.016893	-0.010968
3	0.072113	-0.014922	0.059246	0.000000	-0.014343	0.066339	0.230274	0.298716	0.049985	0.339195	...	0.052798	-0.011624	-0.012541	0.290188	0.164959	0.079341	0.038111	0.013353	0.028600	0.074578
4	0.171574	-0.005330	-0.001097	-0.014343	0.000000	0.078770	-0.010333	-0.039939	0.029597	-0.021414	...	-0.006468	0.130047	-0.006468	-0.021873	0.006811	0.053843	0.063789	0.011076	0.127479	0.091915