In [29]:
import numpy as np
import pandas as pd

In [30]:
# Column labels handed to read_csv through `names=`; they become the header
# of the resulting frame.
m_cols = ['movie_id', 'title', 'genres']

# '::' is a multi-character separator, so the python parsing engine is
# requested explicitly.
movies_df = pd.read_csv(
    'movies.dat',
    names=m_cols,
    sep='::',
    engine='python',
)

# Cast titles to str so the `.str` accessor used in later cells works on
# every row.
movies_df['title'] = movies_df['title'].astype(str)

In [31]:
# One-hot encode the pipe-separated `genres` string into one 0/1 column per
# genre and attach those columns to the three base columns.
#
# The left-hand side is rebuilt from `movies_df[m_cols]` rather than from the
# whole frame so that the cell is idempotent: concatenating onto `movies_df`
# itself would append a second copy of every genre column each time the cell
# is re-run. Any derived column added later (e.g. `score`) is dropped here and
# must be recomputed by its own cell.
movies_df = pd.concat(
    [movies_df[m_cols], movies_df.genres.str.get_dummies(sep='|')],
    axis=1,
)
movies_df.head(3)


Out[31]:
movie_id title genres Action Adventure Animation Children's Comedy Crime Documentary ... Fantasy Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War Western
0 1 Toy Story (1995) Animation|Children's|Comedy 0 0 1 1 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 2 Jumanji (1995) Adventure|Children's|Fantasy 0 1 0 1 0 0 0 ... 1 0 0 0 0 0 0 0 0 0
2 3 Grumpier Old Men (1995) Comedy|Romance 0 0 0 0 1 0 0 ... 0 0 0 0 0 1 0 0 0 0

3 rows × 21 columns


In [32]:
# Genre labels, in the same order as the one-hot columns of `movies_df`.
#
# They are derived from the `genres` column itself instead of slicing
# `movies_df.columns[3:]`: a positional slice also picks up anything appended
# after the genre columns (such as the `score` column added further down), so
# re-running this cell after scoring would silently treat `score` as a genre.
movies_category = movies_df.genres.str.get_dummies(sep='|').columns
movies_category


Out[32]:
Index(['Action', 'Adventure', 'Animation', 'Children's', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
       'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'],
      dtype='object')

In [33]:
def dot_product(vector1,vector2):
    """Return the sum of element-wise products of two iterables.

    Elements are paired positionally with ``zip``, so pairing stops at the
    end of the shorter iterable. Two empty iterables give ``0``.
    """
    total = 0
    for left, right in zip(vector1, vector2):
        # Rebind rather than `+=` so the accumulation is plain addition,
        # exactly as `sum` performs it.
        total = total + left * right
    return total

def movie_score(movie_features,user_preferences):
    """Score one movie for one user.

    Both arguments are passed straight to ``dot_product``, which pairs their
    elements positionally; the result of that call is returned unchanged.
    """
    score = dot_product(movie_features, user_preferences)
    return score

In [34]:
# Show every row whose title contains 'Shawshank'.
movies_df.loc[movies_df['title'].str.contains('Shawshank')]


Out[34]:
movie_id title genres Action Adventure Animation Children's Comedy Crime Documentary ... Fantasy Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War Western
315 318 Shawshank Redemption, The (1994) Drama 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

1 rows × 21 columns


In [35]:
# Genre-indicator vector for the row labelled 315 (the Shawshank Redemption
# row returned by the title search above).
#
# The genre columns are selected by label through `movies_category` in a
# single `.loc` call. The previous chained `.loc[315][3:]` relied on column
# position, so it also returned the trailing `score` entry whenever this cell
# was re-run after the scoring cell.
eternal = movies_df.loc[315, movies_category]
eternal


Out[35]:
Action         0
Adventure      0
Animation      0
Children's     0
Comedy         0
Crime          0
Documentary    0
Drama          1
Fantasy        0
Film-Noir      0
Horror         0
Musical        0
Mystery        0
Romance        0
Sci-Fi         0
Thriller       0
War            0
Western        0
Name: 315, dtype: object

In [44]:
from collections import OrderedDict

# Genres this user likes; each gets weight 1 and every other genre weight 0.
liked_genres = ['Horror', 'Mystery', 'Thriller']

# Fail loudly on a misspelled or missing genre instead of silently producing
# an all-zero preference for it.
unknown_genres = sorted(set(liked_genres) - set(movies_category))
if unknown_genres:
    raise KeyError('genres not present in movies_category: {}'.format(unknown_genres))

# Build the preferences by iterating `movies_category` itself, so there is
# exactly one entry per genre column and the entries are in column order.
# This matters because `dot_product` pairs preference values with movie
# features purely by position. (Zipping `movies_category` with an empty list
# yields nothing, so the mapping used to start empty and its order depended
# entirely on hand-typed assignments matching the column order.)
user_preferences = OrderedDict(
    (genre, 1 if genre in liked_genres else 0) for genre in movies_category
)

In [40]:
# Score the single example movie: its genre indicators against the user's
# genre weights, paired by position.
eternal_user_predicted_score = dot_product(
    vector1=eternal,
    vector2=user_preferences.values(),
)
eternal_user_predicted_score


Out[40]:
0

In [45]:
# Score every movie: apply `movie_score` to each row of genre indicators,
# passing the user's weights as the extra positional argument.
movies_df['score'] = movies_df[movies_category].apply(
    movie_score,
    axis=1,
    args=(user_preferences.values(),),
)

# Titles of the ten highest-scoring movies.
movies_df.sort_values(by='score', ascending=False)['title'].head(10)


Out[45]:
3407                           Jacob's Ladder (1990)
2269    I Still Know What You Did Last Summer (1998)
1598          I Know What You Did Last Summer (1997)
3204                                 Scream 3 (2000)
1599                    Devil's Advocate, The (1997)
1070                        Dial M for Murder (1954)
1201                                   Psycho (1960)
3592                         Puppet Master II (1990)
2057                               Snake Eyes (1998)
2198                          Mortal Thoughts (1991)
Name: title, dtype: object