In [1]:
import pandas as pd
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the ratings table from disk and show its first record.
ratings = pd.read_csv('ratings.csv')
ratings.head(n=1)


Out[2]:
user_id movie_id rating
0 1 31 2.5

In [3]:
# Read the movie metadata table from disk and show its first record.
movies = pd.read_csv('movies.csv')
movies.head(n=1)


Out[3]:
movie_id title genres
0 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy

In [4]:
# Attach movie metadata to every rating row (inner join).
# The join key is spelled out: without `on=`, pd.merge joins on ALL column
# names the two frames share, so an extra shared column appearing in either
# CSV would silently change which rows match.
ratings_df = pd.merge(movies, ratings, on='movie_id')
ratings_df.head()


Out[4]:
movie_id title genres user_id rating
0 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 7 3.0
1 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 9 4.0
2 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 13 5.0
3 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 15 2.0
4 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 19 3.0

In [5]:
# Reshape the long ratings table into a user × title matrix.
# pivot_table's default aggregation (mean) applies when one user has several
# rows for the same title; user/title pairs with no rating are set to 0.
ratings_mtx_df = (
    ratings_df
    .pivot_table(values='rating', index='user_id', columns='title')
    .fillna(0)
)

In [6]:
# Preview the first rows of the user × title rating matrix.
ratings_mtx_df.head()


Out[6]:
title "Great Performances" Cats (1998) $9.99 (2008) 'Hellboy': The Seeds of Creation (2004) 'Neath the Arizona Skies (1934) 'Round Midnight (1986) 'Salem's Lot (2004) 'Til There Was You (1997) 'burbs, The (1989) 'night Mother (1986) (500) Days of Summer (2009) ... Zulu (1964) Zulu (2013) [REC] (2007) eXistenZ (1999) loudQUIETloud: A Film About the Pixies (2006) xXx (2002) xXx: State of the Union (2005) ¡Three Amigos! (1986) À nous la liberté (Freedom for Us) (1931) İtirazım Var (2014)
user_id
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 9064 columns


In [7]:
# Column labels of the rating matrix (the movie titles); later cells use this
# to translate positions in the similarity matrix back into titles.
movie_index = ratings_mtx_df.columns

In [8]:
# Item-item cosine similarity. The rating matrix is transposed so that each row
# passed to cosine_similarity is one movie's vector of ratings across users.
# `.values` replaces DataFrame.as_matrix(), which was deprecated in pandas 0.23
# and removed in pandas 1.0 (calling it there raises AttributeError).
corr_matrix = cosine_similarity(ratings_mtx_df.T.values)
# Zero the diagonal so a movie is never reported as its own best match when
# the scores are thresholded or sorted later.
np.fill_diagonal(corr_matrix, 0)
# Wrapped in a DataFrame for display only; labels are positional (0..n-1) and
# follow the column order of ratings_mtx_df.
corr = pd.DataFrame(corr_matrix)

In [9]:
# Preview the first rows of the movie × movie cosine-similarity matrix.
corr.head()


Out[9]:
0 1 2 3 4 5 6 7 8 9 ... 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063
0 0.000000 0.0 0.0 0.164399 0.020391 0.0 0.014046 0.000000 0.0 0.003166 ... 0.0 0.0 0.0 0.000000 0.0 0.000000 0.0 0.000000 0.0 0.0
1 0.000000 0.0 0.0 0.000000 0.000000 0.0 0.000000 0.079474 0.0 0.156330 ... 0.0 0.0 0.0 0.000000 0.0 0.013899 0.0 0.058218 0.0 0.0
2 0.000000 0.0 0.0 0.000000 0.000000 1.0 0.000000 0.217357 0.0 0.000000 ... 0.0 0.0 0.0 0.000000 0.0 0.000000 0.0 0.000000 0.0 0.0
3 0.164399 0.0 0.0 0.000000 0.124035 0.0 0.085436 0.000000 0.0 0.019259 ... 0.0 0.0 0.0 0.000000 0.0 0.000000 0.0 0.000000 0.0 0.0
4 0.020391 0.0 0.0 0.124035 0.000000 0.0 0.010597 0.143786 0.0 0.136163 ... 0.0 0.0 0.0 0.121567 0.0 0.000000 0.0 0.000000 0.0 0.0

5 rows × 9064 columns


In [10]:
# Position of the query movie among the title columns; the same position
# addresses its row in the similarity matrix.
inp = movie_index.tolist().index('Inception (2010)')
inp


Out[10]:
4013

In [11]:
# Similarity scores of the query movie against every title
# (row `inp` of the current similarity matrix).
P = corr_matrix[inp]

In [12]:
# Highest similarity score for the query movie. ndarray.max() is vectorised and
# propagates NaN, whereas the builtin max() iterates the array element by
# element in Python and returns an order-dependent result if a NaN is present.
P.max()


Out[12]:
0.65699917684681852

In [13]:
# Titles whose similarity to the query movie lies strictly between 0.5 and 0.7.
movie_index[(P > 0.5) & (P < 0.7)].tolist()


Out[13]:
['Avatar (2009)',
 'Avengers, The (2012)',
 'Dark Knight Rises, The (2012)',
 'Dark Knight, The (2008)',
 'District 9 (2009)',
 'Django Unchained (2012)',
 'Inglourious Basterds (2009)',
 'Interstellar (2014)',
 'Iron Man (2008)',
 'Sherlock Holmes (2009)',
 'Shutter Island (2010)',
 'Social Network, The (2010)',
 'Star Trek (2009)',
 'Up (2009)',
 'WALL·E (2008)']

Pearson Similarity


In [14]:
# Pearson correlation between movies: after the transpose each row handed to
# np.corrcoef is one movie's rating vector. This rebinds `corr_matrix` and
# `corr`, replacing whatever they held before.
corr_matrix = np.corrcoef(ratings_mtx_df.T)
# Blank out the self-correlations on the diagonal so a movie cannot match itself.
np.fill_diagonal(corr_matrix, val=0)
corr = pd.DataFrame(data=corr_matrix)

In [15]:
# Preview the first rows of the movie × movie Pearson-correlation matrix.
corr.head()


Out[15]:
0 1 2 3 4 5 6 7 8 9 ... 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063
0 0.000000 -0.002894 -0.001718 0.162966 0.018512 -0.001718 0.011006 -0.007311 -0.002980 -0.008271 ... -0.002431 -0.001718 -0.002929 -0.008014 -0.001718 -0.007546 -0.001718 -0.009406 -0.001718 -0.001718
1 -0.002894 0.000000 -0.002514 -0.002514 -0.002807 -0.002514 -0.004518 0.070014 -0.004360 0.144961 ... -0.003558 -0.002514 -0.004286 -0.011726 -0.002514 0.003087 -0.002514 0.045869 -0.002514 -0.002514
2 -0.001718 -0.002514 0.000000 -0.001493 -0.001666 1.000000 -0.002682 0.214088 -0.002589 -0.010031 ... -0.002112 -0.001493 -0.002545 -0.006962 -0.001493 -0.006556 -0.001493 -0.008172 -0.001493 -0.001493
3 0.162966 -0.002514 -0.001493 0.000000 0.122576 -0.001493 0.083023 -0.006351 -0.002589 0.009882 ... -0.002112 -0.001493 -0.002545 -0.006962 -0.001493 -0.006556 -0.001493 -0.008172 -0.001493 -0.001493
4 0.018512 -0.002807 -0.001666 0.122576 0.000000 -0.001666 0.007638 0.138760 -0.002891 0.129609 ... -0.002358 -0.001666 -0.002841 0.115866 -0.001666 -0.007320 -0.001666 -0.009124 -0.001666 -0.001666

5 rows × 9064 columns


In [16]:
# Scores of the query movie against every title, re-read from the current
# (recomputed) similarity matrix; this overwrites the previous `P`.
P = corr_matrix[inp]

In [17]:
# Highest correlation for the query movie. ndarray.max() is vectorised and
# propagates NaN, whereas the builtin max() iterates the array element by
# element in Python and returns an order-dependent result if a NaN is present.
P.max()


Out[17]:
0.58951497276264542

In [18]:
# Titles whose correlation with the query movie lies strictly between 0.45 and 0.6.
movie_index[(P > 0.45) & (P < 0.6)].tolist()


Out[18]:
['Avatar (2009)',
 'Avengers, The (2012)',
 'Dark Knight Rises, The (2012)',
 'Dark Knight, The (2008)',
 'District 9 (2009)',
 'Django Unchained (2012)',
 'Inglourious Basterds (2009)',
 'Interstellar (2014)',
 'Iron Man (2008)',
 'Sherlock Holmes (2009)',
 'Sherlock Holmes: A Game of Shadows (2011)',
 'Shutter Island (2010)',
 'Social Network, The (2010)',
 'Star Trek (2009)',
 'Up (2009)',
 'WALL·E (2008)']