In [1]:
import io

import numpy as np
import pandas as pd

def _read_dat(path):
    """Return the '::'-separated fields of every line of a .dat file as a list of lists."""
    # The with-block closes the file handle, which the bare open(...).readlines() never did.
    # io.open accepts `encoding` on both Python 2 and Python 3.
    # NOTE(review): latin-1 can decode any byte sequence, so this never raises on
    # non-ASCII titles; movies.dat is presumably ISO-8859-1 -- confirm against the dataset README.
    with io.open(path, 'r', encoding='latin-1') as dat_file:
        return [line.strip().split("::") for line in dat_file]


ratings_list = _read_dat('ml-1m/ratings.dat')
users_list = _read_dat('ml-1m/users.dat')
movies_list = _read_dat('ml-1m/movies.dat')

# Every field is parsed as text, so convert explicitly; astype(int) raises if a field
# is not an integer literal.
ratings_df = pd.DataFrame(ratings_list, columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']).astype(int)
movies_df = pd.DataFrame(movies_list, columns = ['MovieID', 'Title', 'Genres'])
# Vectorised conversion of the whole column instead of one pd.to_numeric call per row.
movies_df['MovieID'] = pd.to_numeric(movies_df['MovieID'])

In [2]:
# Preview the first rows of the movies table (MovieID, Title, Genres).
movies_df.head()


Out[2]:
MovieID Title Genres
0 1 Toy Story (1995) Animation|Children's|Comedy
1 2 Jumanji (1995) Adventure|Children's|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama
4 5 Father of the Bride Part II (1995) Comedy

In [3]:
# Preview the first rows of the ratings table (UserID, MovieID, Rating, Timestamp).
ratings_df.head()


Out[3]:
UserID MovieID Rating Timestamp
0 1 1193 5 978300760
1 1 661 3 978302109
2 1 914 3 978301968
3 1 3408 4 978300275
4 1 2355 5 978824291

In [4]:
# Reshape the long ratings table into a wide matrix: one row per UserID, one column
# per MovieID, each cell holding the Rating. User/movie pairs without a rating
# come out of the pivot as NaN and are replaced with 0.
R_df = (
    ratings_df
    .pivot(index='UserID', columns='MovieID', values='Rating')
    .fillna(0)
)
R_df.head()


Out[4]:
MovieID 1 2 3 4 5 6 7 8 9 10 ... 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952
UserID
1 5.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 3706 columns


In [5]:
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same ndarray
# and is available on old and new pandas alike.
R = R_df.values
# Mean of each row taken over every column of R.
user_ratings_mean = np.mean(R, axis = 1)
# reshape(-1, 1) turns the means into a column so each row is centred by its own mean.
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

In [6]:
from scipy.sparse.linalg import svds
# Truncated SVD of the demeaned ratings matrix, keeping k = 50 singular triplets.
# sigma is returned as a 1-D array of singular values; a later cell expands it
# with np.diag before the factors are multiplied back together.
U, sigma, Vt = svds(R_demeaned, k = 50)

In [7]:
# Expand the 1-D array of singular values into a diagonal matrix for the matrix
# products that follow. The ndim guard keeps this cell idempotent: np.diag applied
# to an already-expanded 2-D matrix would extract its diagonal back into a vector,
# so re-running the cell used to undo the conversion.
if np.ndim(sigma) == 1:
    sigma = np.diag(sigma)

In [8]:
# Multiply the three factors back together to get the low-rank approximation of the
# demeaned matrix, then add each user's mean back onto that user's row.
all_user_predicted_ratings = U.dot(sigma).dot(Vt) + user_ratings_mean.reshape(-1, 1)
# Label the columns with the MovieIDs of the ratings matrix; rows keep a default 0-based index.
preds_df = pd.DataFrame(data=all_user_predicted_ratings, columns=R_df.columns)

In [9]:
def recommend_movies(predictions_df, userID, movies_df, original_ratings_df, num_recommendations=5):
    """Recommend the highest-predicted movies that a user has not rated yet.

    Parameters
    ----------
    predictions_df : pandas.DataFrame
        Predicted ratings, one row per user and one column per MovieID.
        Rows are addressed by position: row ``userID - 1`` is used for ``userID``.
        The row labels themselves are not used.
    userID : int
        1-based user id.
    movies_df : pandas.DataFrame
        Movie metadata with a 'MovieID' column; all of its columns are carried
        into both returned frames.
    original_ratings_df : pandas.DataFrame
        Ratings with at least 'UserID', 'MovieID' and 'Rating' columns.
    num_recommendations : int, optional
        Maximum number of recommended rows to return (default 5).

    Returns
    -------
    (user_full, recommendations) : tuple of pandas.DataFrame
        ``user_full`` holds the user's existing ratings left-joined with
        ``movies_df`` and sorted by 'Rating', highest first.
        ``recommendations`` holds the rows of ``movies_df`` the user has not
        rated, ordered by predicted rating (highest first) and truncated to
        ``num_recommendations`` rows; the prediction column itself is dropped.

    Raises
    ------
    IndexError
        If ``userID`` does not map to a row of ``predictions_df``.
    """
    user_row_number = userID - 1 # UserID starts at 1, not 0
    # Reject ids below 1 explicitly: a negative position would silently wrap around
    # in iloc and return the predictions of a different user.
    if not 1 <= userID <= len(predictions_df):
        raise IndexError('userID {0} is outside the range 1..{1} covered by predictions_df.'
                         .format(userID, len(predictions_df)))

    # Build the (MovieID, Predictions) table from positions and column labels only,
    # so the result does not depend on how the rows of predictions_df are labelled.
    user_predictions = pd.DataFrame({
        'MovieID': predictions_df.columns,
        'Predictions': predictions_df.iloc[user_row_number].values,
    }, columns=['MovieID', 'Predictions'])

    # Get the user's data and merge in the movie information.
    user_data = original_ratings_df[original_ratings_df.UserID == userID]
    user_full = (user_data.merge(movies_df, how='left', on='MovieID')
                          .sort_values(['Rating'], ascending=False))

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    print('Recommending the highest {0} predicted ratings movies not already rated.'.format(num_recommendations))

    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    unseen_movies = movies_df[~movies_df['MovieID'].isin(user_full['MovieID'])]
    recommendations = (unseen_movies.merge(user_predictions, how='left', on='MovieID')
                                    .sort_values('Predictions', ascending=False)
                                    # 'Predictions' is the last column after the merge; ':-1' drops it.
                                    .iloc[:num_recommendations, :-1])

    return user_full, recommendations

# Ask for the 10 best unseen movies for user 837; already_rated receives that user's
# existing ratings and predictions receives the recommended movies.
already_rated, predictions = recommend_movies(preds_df, 837, movies_df, ratings_df, 10)


User 837 has already rated 69 movies.
Recommending the highest 10 predicted ratings movies not already rated.

In [10]:
# First 10 rows of the user's existing ratings, which recommend_movies sorted by Rating (highest first).
already_rated.head(10)


Out[10]:
UserID MovieID Rating Timestamp Title Genres
36 837 858 5 975360036 Godfather, The (1972) Action|Crime|Drama
35 837 1387 5 975360036 Jaws (1975) Action|Horror
65 837 2028 5 975360089 Saving Private Ryan (1998) Action|Drama|War
63 837 1221 5 975360036 Godfather: Part II, The (1974) Action|Crime|Drama
11 837 913 5 975359921 Maltese Falcon, The (1941) Film-Noir|Mystery
20 837 3417 5 975360893 Crimson Pirate, The (1952) Adventure|Comedy|Sci-Fi
34 837 2186 4 975359955 Strangers on a Train (1951) Film-Noir|Thriller
55 837 2791 4 975360893 Airplane! (1980) Comedy
31 837 1188 4 975360920 Strictly Ballroom (1992) Comedy|Romance
28 837 1304 4 975360058 Butch Cassidy and the Sundance Kid (1969) Action|Comedy|Western

In [11]:
# Display the recommended movies returned by recommend_movies for user 837.
predictions


Out[11]:
MovieID Title Genres
516 527 Schindler's List (1993) Drama|War
1848 1953 French Connection, The (1971) Action|Crime|Drama|Thriller
596 608 Fargo (1996) Crime|Drama|Thriller
1235 1284 Big Sleep, The (1946) Film-Noir|Mystery
2085 2194 Untouchables, The (1987) Action|Crime|Drama
1188 1230 Annie Hall (1977) Comedy|Romance
1198 1242 Glory (1989) Action|Drama|War
897 922 Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) Film-Noir
1849 1954 Rocky (1976) Action|Drama
581 593 Silence of the Lambs, The (1991) Drama|Thriller

In [ ]: