In [38]:
import numpy as np 
import pandas as pd 
from sklearn.metrics.pairwise import cosine_similarity

In [39]:
u_cols = ['user_id','sex','age','occupation', 'zip_code']
r_cols = ['user_id', 'movie_id', 'rating']
m_cols = ['movie_id', 'title', 'genres']

users=pd.read_csv("users.dat",sep='::',names=u_cols, engine='python')
ratings = pd.read_csv('ratings.dat',sep='::', names=r_cols, engine='python', usecols=range(3))
movies = pd.read_csv('movies.dat', sep='::', names=m_cols, engine='python')
#combined=pd.merge(movies,ratings)
#df=pd.merge(combined,users)

In [40]:
ratings.head(1)


Out[40]:
user_id movie_id rating
0 1 1193 5

In [41]:
movies.head(1)


Out[41]:
movie_id title genres
0 1 Toy Story (1995) Animation|Children's|Comedy

In [42]:
ratings_df = pd.merge(movies,ratings)
ratings_df.head()


Out[42]:
movie_id title genres user_id rating
0 1 Toy Story (1995) Animation|Children's|Comedy 1 5
1 1 Toy Story (1995) Animation|Children's|Comedy 6 4
2 1 Toy Story (1995) Animation|Children's|Comedy 8 4
3 1 Toy Story (1995) Animation|Children's|Comedy 9 5
4 1 Toy Story (1995) Animation|Children's|Comedy 10 5

In [43]:
ratings_mtx_df = ratings_df.pivot_table(index='user_id', columns='title',values='rating')  
ratings_mtx_df.fillna(0, inplace=True)

In [44]:
ratings_mtx_df.head()


Out[44]:
title $1,000,000 Duck (1971) 'Night Mother (1986) 'Til There Was You (1997) 'burbs, The (1989) ...And Justice for All (1979) 1-900 (1994) 10 Things I Hate About You (1999) 101 Dalmatians (1961) 101 Dalmatians (1996) 12 Angry Men (1957) ... Young Poisoner's Handbook, The (1995) Young Sherlock Holmes (1985) Young and Innocent (1937) Your Friends and Neighbors (1998) Zachariah (1971) Zed & Two Noughts, A (1985) Zero Effect (1998) Zero Kelvin (Kjærlighetens kjøtere) (1995) Zeus and Roxanne (1997) eXistenZ (1999)
user_id
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 3706 columns


In [45]:
movie_index = ratings_mtx_df.columns

In [46]:
corr_matrix = cosine_similarity(ratings_mtx_df.T.as_matrix())
np.fill_diagonal(corr_matrix, 0 )#Filling diagonals with 0s for future use when sorting is done
corr= pd.DataFrame(corr_matrix)

In [47]:
corr.head()


Out[47]:
0 1 2 3 4 5 6 7 8 9 ... 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705
0 0.000000 0.072357 0.037011 0.079291 0.060838 0.00000 0.058619 0.189965 0.172254 0.094785 ... 0.038725 0.076474 0.000000 0.044074 0.0 0.045280 0.039395 0.000000 0.120242 0.027003
1 0.072357 0.000000 0.115290 0.115545 0.159526 0.00000 0.076798 0.147437 0.095922 0.111413 ... 0.053010 0.087828 0.063758 0.135962 0.0 0.091150 0.074787 0.000000 0.000000 0.077807
2 0.037011 0.115290 0.000000 0.098756 0.066301 0.08025 0.127895 0.112654 0.125670 0.079115 ... 0.029200 0.062893 0.000000 0.079187 0.0 0.022594 0.079261 0.000000 0.047526 0.063284
3 0.079291 0.115545 0.098756 0.000000 0.143620 0.00000 0.192191 0.246927 0.175885 0.170719 ... 0.113386 0.207897 0.019962 0.138064 0.0 0.055704 0.161174 0.000000 0.033567 0.110525
4 0.060838 0.159526 0.066301 0.143620 0.000000 0.00000 0.075093 0.194154 0.116379 0.205486 ... 0.089998 0.153006 0.067009 0.109029 0.0 0.086080 0.110867 0.074317 0.000000 0.111040

5 rows × 3706 columns


In [48]:
inp = list(movie_index).index('Final Destination (2000)')
inp


Out[48]:
1158

In [49]:
P = corr_matrix[inp]

In [50]:
max(P)


Out[50]:
0.40850620631042028

In [51]:
list(movie_index[(P>0.34) & (P<0.41)])


Out[51]:
['American Psycho (2000)',
 'Frequency (2000)',
 'Pitch Black (2000)',
 'Scream 3 (2000)',
 'Skulls, The (2000)']

Pearson Similarity


In [52]:
corr_matrix = np.corrcoef(ratings_mtx_df.T)
np.fill_diagonal(corr_matrix, 0 )
corr = pd.DataFrame(corr_matrix)

In [53]:
corr.head()


Out[53]:
0 1 2 3 4 5 6 7 8 9 ... 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705
0 0.000000 0.065338 0.030805 0.065478 0.048708 -0.001319 0.036612 0.176528 0.159973 0.075665 ... 0.030758 0.060574 -0.002833 0.035056 -0.001237 0.040590 0.024165 -0.001332 0.116574 0.009243
1 0.065338 0.000000 0.107374 0.096778 0.144480 -0.001834 0.046122 0.123378 0.074706 0.083988 ... 0.042061 0.065333 0.060202 0.124593 -0.001719 0.085001 0.054343 -0.001852 -0.005825 0.054699
2 0.030805 0.107374 0.000000 0.082706 0.051965 0.079011 0.105672 0.091422 0.108952 0.054821 ... 0.019685 0.043294 -0.003341 0.068935 -0.001459 0.016935 0.062262 -0.001571 0.042842 0.043483
3 0.065478 0.096778 0.082706 0.000000 0.110790 -0.003821 0.133881 0.198168 0.134041 0.113112 ... 0.092597 0.165666 0.012226 0.114837 -0.003582 0.042862 0.121617 -0.003858 0.022249 0.062471
4 0.048708 0.144480 0.051965 0.110790 0.000000 -0.003203 0.018614 0.151020 0.078917 0.160556 ... 0.071819 0.115402 0.061253 0.088616 -0.003002 0.075720 0.075804 0.072283 -0.010171 0.071000

5 rows × 3706 columns


In [54]:
P = corr_matrix[inp]

In [55]:
max(P)


Out[55]:
0.35259154120260761

In [56]:
list(movie_index[(P>0.28) & (P<0.35)])


Out[56]:
['American Psycho (2000)',
 'Frequency (2000)',
 'Scream 3 (2000)',
 'Skulls, The (2000)']