In [103]:
import numpy as np 
import pandas as pd 
from sklearn.metrics.pairwise import cosine_similarity

In [104]:
# Column labels applied to the fields that are read from each file.
r_cols = ['user_id', 'movie_id', 'rating']
m_cols = ['movie_id', 'title']

# Ratings file: tab-separated; only the first three fields are kept.
ratings = pd.read_csv(
    'u.data',
    sep='\t',
    names=r_cols,
    usecols=[0, 1, 2],
)
# Movies file: pipe-separated; only the first two fields are kept.
movies = pd.read_csv(
    'u.item',
    sep='|',
    names=m_cols,
    usecols=[0, 1],
)

In [105]:
# Show the first ratings row to check that the three columns were parsed.
ratings.head(n=1)


Out[105]:
user_id movie_id rating
0 196 242 3

In [106]:
# Show the first movies row to check that id and title were parsed.
movies.head(n=1)


Out[106]:
movie_id title
0 1 Toy Story (1995)

In [107]:
# Attach a title to every rating. 'movie_id' is the only column the two frames
# share, so it is the join key; the join type is left at the default (inner).
ratings_df = movies.merge(ratings, on='movie_id')
ratings_df.head(5)


Out[107]:
movie_id title user_id rating
0 1 Toy Story (1995) 308 4
1 1 Toy Story (1995) 287 5
2 1 Toy Story (1995) 148 4
3 1 Toy Story (1995) 280 4
4 1 Toy Story (1995) 66 3

In [108]:
# Build the user x title rating matrix. pivot_table uses its default aggregation
# (mean) if a user has more than one row for the same title. User/title pairs
# without a rating come out as NaN and are replaced by 0, so downstream
# similarity computations see "not rated" as a rating of 0.
ratings_mtx_df = (
    ratings_df
    .pivot_table(values='rating', index='user_id', columns='title')
    .fillna(0)
)

In [109]:
# Preview the first five users of the zero-filled rating matrix.
ratings_mtx_df.head(5)


Out[109]:
title 'Til There Was You (1997) 1-900 (1994) 101 Dalmatians (1996) 12 Angry Men (1957) 187 (1997) 2 Days in the Valley (1996) 20,000 Leagues Under the Sea (1954) 2001: A Space Odyssey (1968) 3 Ninjas: High Noon At Mega Mountain (1998) 39 Steps, The (1935) ... Yankee Zulu (1994) Year of the Horse (1997) You So Crazy (1994) Young Frankenstein (1974) Young Guns (1988) Young Guns II (1990) Young Poisoner's Handbook, The (1995) Zeus and Roxanne (1997) unknown Á köldum klaka (Cold Fever) (1994)
user_id
1 0.0 0.0 2.0 5.0 0.0 0.0 3.0 4.0 0.0 0.0 ... 0.0 0.0 0.0 5.0 3.0 0.0 0.0 0.0 4.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5 0.0 0.0 2.0 0.0 0.0 0.0 0.0 4.0 0.0 0.0 ... 0.0 0.0 0.0 4.0 0.0 0.0 0.0 0.0 4.0 0.0

5 rows × 1664 columns


In [110]:
# Title labels of the rating-matrix columns. Later cells look up a title's
# position in this index and use boolean masks on it to map similarity scores
# back to titles.
movie_index = ratings_mtx_df.columns

In [111]:
# Item-item cosine similarity: transpose so each row is one title's vector of
# user ratings. `.values` replaces `DataFrame.as_matrix()`, which was deprecated
# in pandas 0.23 and removed in pandas 1.0 (AttributeError on current versions).
corr_matrix = cosine_similarity(ratings_mtx_df.T.values)
# Zero the diagonal so a title is never reported as most similar to itself when
# the scores are later sorted or thresholded.
np.fill_diagonal(corr_matrix, 0)
# DataFrame wrapper (default integer labels) used for display.
corr = pd.DataFrame(corr_matrix)

In [112]:
# Preview the first five rows of the title-by-title similarity table.
corr.head(5)


Out[112]:
0 1 2 3 4 5 6 7 8 9 ... 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663
0 0.000000 0.000000 0.024561 0.099561 0.185236 0.159265 0.000000 0.052203 0.000000 0.033326 ... 0.000000 0.000000 0.000000 0.027774 0.118840 0.142315 0.029070 0.000000 0.110208 0.000000
1 0.000000 0.000000 0.014139 0.009294 0.007354 0.004702 0.010055 0.067038 0.000000 0.000000 ... 0.152499 0.015484 0.000000 0.069284 0.018243 0.023408 0.006694 0.079640 0.042295 0.000000
2 0.024561 0.014139 0.000000 0.167006 0.061105 0.143878 0.203781 0.225803 0.027642 0.092337 ... 0.000000 0.021965 0.030905 0.274877 0.204267 0.101199 0.056976 0.172155 0.045714 0.000000
3 0.099561 0.009294 0.167006 0.000000 0.056822 0.167235 0.304078 0.422506 0.072682 0.394854 ... 0.060946 0.016502 0.000000 0.403270 0.259436 0.145519 0.105226 0.038901 0.060101 0.081261
4 0.185236 0.007354 0.061105 0.056822 0.000000 0.132327 0.042928 0.065060 0.043133 0.027300 ... 0.000000 0.141997 0.000000 0.068257 0.067786 0.091293 0.099490 0.025184 0.142667 0.096449

5 rows × 1664 columns


In [113]:
# Position of the query title among the rating-matrix columns; the similarity
# matrix rows follow the same order, so this is also the row to read from it.
inp = movie_index.tolist().index('Shawshank Redemption, The (1994)')
inp


Out[113]:
1317

In [114]:
# One row of the similarity matrix: the query title's score against every title.
P = corr_matrix[inp, :]
P


Out[114]:
array([ 0.06982111,  0.02813572,  0.24355218, ...,  0.03441109,
        0.05848069,  0.05271378])

In [115]:
# Highest similarity to the query title. ndarray.max() is vectorized and
# propagates NaN, whereas the builtin max() walks the array in Python and gives
# an order-dependent answer if a NaN is present.
P.max()


Out[115]:
0.67020385185286324

In [116]:
# Titles whose cosine similarity to the query lies strictly between 0.6 and 0.7.
movie_index[(P > 0.6) & (P < 0.7)].tolist()


Out[116]:
['Apollo 13 (1995)',
 'Back to the Future (1985)',
 'Braveheart (1995)',
 'Dead Poets Society (1989)',
 'Empire Strikes Back, The (1980)',
 'Forrest Gump (1994)',
 'Fugitive, The (1993)',
 "One Flew Over the Cuckoo's Nest (1975)",
 'Pulp Fiction (1994)',
 'Raiders of the Lost Ark (1981)',
 "Schindler's List (1993)",
 'Silence of the Lambs, The (1991)',
 'Usual Suspects, The (1995)',
 'When Harry Met Sally... (1989)']

Pearson Similarity


In [117]:
# Pearson correlation between titles. After the transpose each row is one
# title's ratings across users, and np.corrcoef treats rows as variables by
# default. This rebinds corr_matrix/corr, replacing the cosine results.
corr_matrix = np.corrcoef(ratings_mtx_df.transpose())
# Zero the self-correlations so a title does not match itself in later filters.
np.fill_diagonal(corr_matrix, 0)
corr = pd.DataFrame(data=corr_matrix)

In [118]:
# Preview the first five rows of the title-by-title Pearson correlation table.
corr.head(5)


Out[118]:
0 1 2 3 4 5 6 7 8 9 ... 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663
0 0.000000 -0.005884 -0.004595 0.072113 0.171574 0.138981 -0.025561 0.007270 -0.006640 0.011552 ... -0.002963 -0.006937 -0.002963 -0.014489 0.095510 0.127067 0.011604 -0.006721 0.102622 -0.002963
1 -0.005884 0.000000 -0.006827 -0.014922 -0.005330 -0.015494 -0.007714 0.039863 -0.004723 -0.016332 ... 0.150791 0.010627 -0.002108 0.045195 -0.002247 0.010526 -0.006079 0.075243 0.036506 -0.002108
2 -0.004595 -0.006827 0.000000 0.059246 -0.001097 0.052877 0.128745 0.078260 0.004667 0.015504 ... -0.010968 -0.002430 0.021658 0.155450 0.115195 0.039172 -0.005917 0.157268 0.016893 -0.010968
3 0.072113 -0.014922 0.059246 0.000000 -0.014343 0.066339 0.230274 0.298716 0.049985 0.339195 ... 0.052798 -0.011624 -0.012541 0.290188 0.164959 0.079341 0.038111 0.013353 0.028600 0.074578
4 0.171574 -0.005330 -0.001097 -0.014343 0.000000 0.078770 -0.010333 -0.039939 0.029597 -0.021414 ... -0.006468 0.130047 -0.006468 -0.021873 0.006811 0.053843 0.063789 0.011076 0.127479 0.091915

5 rows × 1664 columns


In [119]:
# Row of Pearson correlations between the query title and every title.
P = corr_matrix[inp]
# Trailing expression so the cell actually displays the array shown in its
# recorded output; an assignment alone produces no output.
P


Out[119]:
array([ 0.02495611, -0.00799703,  0.08939762, ..., -0.00640164,
        0.00965978,  0.04176051])

In [120]:
# Highest Pearson correlation with the query title. ndarray.max() is vectorized
# and propagates NaN (np.corrcoef yields NaN for zero-variance columns), whereas
# the builtin max() gives an order-dependent answer when NaN is present.
P.max()


Out[120]:
0.52681432410411766

In [121]:
# Titles whose Pearson correlation with the query lies strictly between 0.45 and 0.7.
movie_index[(P > 0.45) & (P < 0.7)].tolist()


Out[121]:
['Apollo 13 (1995)',
 'Braveheart (1995)',
 'Dead Poets Society (1989)',
 'Forrest Gump (1994)',
 "One Flew Over the Cuckoo's Nest (1975)",
 'Pulp Fiction (1994)',
 'Quiz Show (1994)',
 'Raiders of the Lost Ark (1981)',
 "Schindler's List (1993)",
 'Seven (Se7en) (1995)',
 'Silence of the Lambs, The (1991)',
 'Usual Suspects, The (1995)']