In [103]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
In [104]:
# Load the ratings and movie-title tables from the MovieLens-style flat files.
# Column names are passed via `names`, so pandas treats the first line of each
# file as data, and `usecols` keeps only the leading columns that are used.
r_cols = ['user_id', 'movie_id', 'rating']
m_cols = ['movie_id', 'title']
# u.data is tab-separated; keep the first three fields.
ratings = pd.read_csv('u.data', sep='\t', names=r_cols, usecols=range(3))
# u.item is pipe-separated; keep the first two fields.
# NOTE: the MovieLens 100k u.item file is not valid UTF-8 (some titles contain
# Latin-1 accented characters), so the default decoder raises
# UnicodeDecodeError on Python 3. Decode it explicitly as Latin-1.
movies = pd.read_csv('u.item', sep='|', names=m_cols, usecols=range(2),
                     encoding='latin-1')
In [105]:
# Show the first row of `ratings` to confirm the columns loaded as expected.
ratings.head(1)
Out[105]:
In [106]:
# Show the first row of `movies` to confirm the id/title columns loaded as expected.
movies.head(1)
Out[106]:
In [107]:
# Attach each rating to its movie title with an inner join on `movie_id`,
# the only column the two tables share, then preview the combined table.
ratings_df = movies.merge(ratings, on='movie_id', how='inner')
ratings_df.head()
Out[107]:
In [108]:
# Build the user x movie rating matrix: one row per user_id, one column per
# title, cells holding the rating. pivot_table's default mean aggregation
# applies if a user/title pair occurs more than once. Pairs with no rating
# are filled with 0.
ratings_mtx_df = ratings_df.pivot_table(
    values='rating',
    index='user_id',
    columns='title',
).fillna(0)
In [109]:
# Preview the first rows (users) of the zero-filled rating matrix.
ratings_mtx_df.head()
Out[109]:
In [110]:
# Column labels of the rating matrix (the movie titles). Kept so that
# positions in the similarity arrays can be mapped back to titles.
movie_index = ratings_mtx_df.columns
In [111]:
# Item-item cosine similarity: transpose so that each row is one movie's
# vector of user ratings. `DataFrame.as_matrix()` was deprecated in pandas
# 0.23 and removed in 1.0; `.values` is its documented replacement and
# returns the same ndarray.
corr_matrix = cosine_similarity(ratings_mtx_df.T.values)
# Zero the diagonal so a movie's similarity with itself never appears as its
# own best match when the scores are thresholded or sorted later.
np.fill_diagonal(corr_matrix, 0)
# Wrap in a DataFrame (default integer labels) for tabular display.
corr = pd.DataFrame(corr_matrix)
In [112]:
# Preview the first rows of the similarity table; rows and columns carry
# default integer labels, not titles.
corr.head()
Out[112]:
In [113]:
# Position of the query movie among the matrix columns; the same position
# indexes its row in the similarity array. Raises ValueError if the title
# is not present.
inp = movie_index.tolist().index('Shawshank Redemption, The (1994)')
inp
Out[113]:
In [114]:
# Similarity scores between the query movie and every title (row `inp`).
P = corr_matrix[inp, :]
P
Out[114]:
In [115]:
# Highest similarity to the query movie (the diagonal was zeroed, so this is
# never the movie itself).
P.max()
Out[115]:
In [116]:
# Titles whose similarity to the query movie lies strictly between 0.6 and 0.7.
movie_index[np.logical_and(P > 0.6, P < 0.7)].tolist()
Out[116]:
In [117]:
# Second similarity measure: Pearson correlation between movies. The matrix
# is transposed so that each row (one movie) is treated as a variable.
# This rebinds `corr_matrix` and `corr`, replacing the cosine-similarity
# results computed in the earlier cell.
corr_matrix = np.corrcoef(ratings_mtx_df.T.values)
# Zero the diagonal so a movie is never reported as its own best match.
np.fill_diagonal(corr_matrix, val=0)
corr = pd.DataFrame(data=corr_matrix)
In [118]:
# Preview the first rows of the correlation table; rows and columns carry
# default integer labels, not titles.
corr.head()
Out[118]:
In [119]:
# Pearson correlations between the query movie and every title (row `inp`
# of the matrix computed by np.corrcoef).
P = corr_matrix[inp]
# Echo the row: this cell has a recorded output, which a bare assignment
# cannot produce, and the trailing expression mirrors the cosine cell.
P
Out[119]:
In [120]:
# Highest correlation with the query movie (the diagonal was zeroed, so this
# is never the movie itself).
# NOTE(review): builtin max() gives order-dependent results if P contains
# NaN (np.corrcoef yields NaN for zero-variance rows) — confirm none occur.
max(P)
Out[120]:
In [121]:
# Titles whose correlation with the query movie lies strictly between 0.45 and 0.7.
movie_index[np.logical_and(P > 0.45, P < 0.7)].tolist()
Out[121]: