notebook.community

Edit and run



In [ ]:

    
import numpy as np
import pandas as pd
# Silence NumPy's divide-by-zero and invalid-value warnings for the whole
# notebook. NOTE(review): presumably this targets the division by row norms in
# top_cosine_similarity, where an all-zero row yields nan/inf — confirm.
np.seterr(divide='ignore', invalid='ignore')



In [ ]:

    
# Both files are semicolon-separated. Column names are supplied through
# `names`, so the first line of each file is read as data, not as a header.
data = pd.read_csv(
    'data/final-new-ratings.csv',
    sep=';',
    names=['user_id', 'movie_id', 'rating', 'time'],
    engine='python',
)
movie_data = pd.read_csv(
    'data/final-new-movies.csv',
    sep=';',
    names=['movie_id', 'title', 'genre'],
    engine='python',
)



In [ ]:

    
# Movie-by-user rating matrix: row = movie_id - 1, column = user_id - 1
# (the code treats both ids as 1-based).
#
# The matrix is created with np.zeros rather than np.ndarray(...): the bare
# ndarray constructor returns *uninitialised* memory, so every (movie, user)
# pair without a rating would contain arbitrary bytes that then leak into the
# means, the covariance and the final similarities. With zeros, 0 consistently
# means "no rating".
# NOTE(review): dtype uint8 truncates fractional ratings and cannot hold values
# above 255 — confirm ratings are small whole numbers.
ratings_mat = np.zeros(
    shape=(np.max(data.movie_id.values), np.max(data.user_id.values)),
    dtype=np.uint8)
ratings_mat[data.movie_id.values - 1, data.user_id.values - 1] = data.rating.values



In [ ]:

    
# Centre every movie (row) on its own mean rating. keepdims=True keeps the mean
# as a column vector so it broadcasts across users without needing np.matrix.
normalised_mat = ratings_mat - np.mean(ratings_mat, axis=1, keepdims=True)

# Movie-by-movie covariance (np.cov treats each row as one variable).
cov_mat = np.cov(normalised_mat)

# cov_mat is symmetric, so use eigh: it returns real eigenvalues/eigenvectors,
# whereas the general-purpose eig may return complex output and gives no
# ordering guarantee. eigh returns eigenvalues in ascending order, so reorder
# them descending; code that slices the first k columns of `evecs` then really
# gets the k components with the largest variance.
evals, evecs = np.linalg.eigh(cov_mat)
descending = np.argsort(evals)[::-1]
evals = evals[descending]
evecs = evecs[:, descending]



In [ ]:

    
def top_cosine_similarity(data, movie_id, top_n=10):
    """Rank the rows of ``data`` by cosine similarity to one movie's row.

    Parameters
    ----------
    data : 2-D array
        One row per movie; row ``i`` belongs to movie id ``i + 1``.
    movie_id : int
        1-based id of the query movie.
    top_n : int, optional
        Number of best-matching row indexes to return (default 10).

    Returns
    -------
    tuple
        ``(indexes, similarity)`` where ``indexes`` holds the 0-based row
        indexes of the ``top_n`` most similar rows, best first, and
        ``similarity`` holds the cosine similarity of *every* row to the
        query row, in row order.
    """
    row = movie_id - 1  # ids are 1-based, rows are 0-based
    query_vector = data[row, :]

    # Euclidean norm of each row: sqrt of the row-wise sum of squares.
    norms = np.sqrt(np.einsum('ij, ij -> i', data, data))

    dot_products = np.dot(query_vector, data.T)
    similarity = dot_products / (norms[row] * norms)

    # Negating before argsort gives a descending ranking.
    ranking = np.argsort(-similarity)
    return (ranking[:top_n], similarity)


# Maps "movie id used in this notebook" -> TMDB movie id.
# Keys are the notebook's movie ids written as strings, so callers must convert
# an integer id with str() before indexing.
lookup = {
    "71": 862,
    "66": 197,
    "67": 278,
    "68": 3049,
    "69": 8587,
    "62": 78,
    "63": 771,
    "3": 114,
    "64": 627,
    "65": 238,
    "70": 872,
    "4": 567,
    "5": 770,
    "6": 62,
    "7": 88,
    "8": 601,
    "9": 85,
    "10": 348,
    "11": 703,
    "12": 694,
    "13": 914,
    "14": 621,
    "15": 578,
    "2": 816,
    "16": 18,
    "17": 597,
    "18": 1725,
    "19": 11252,
    "20": 8741,
    "21": 11167,
    "22": 603,
    "23": 509,
    "1": 2105,
    "24": 550,
    "25": 10784,
    "26": 392,
    "27": 77,
    "28": 808,
    "29": 676,
    "30": 585,
    "31": 120,
    "32": 453,
    "33": 855,
    "34": 425,
    "35": 672,
    "36": 423,
    "37": 12,
    "38": 22,
    "39": 24,
    "40": 11846,
    "41": 38,
    "42": 11036,
    "43": 6947,
    "44": 9806,
    "45": 477433,
    "46": 591,
    "47": 920,
    "48": 350,
    "49": 1858,
    "50": 7326,
    "51": 155,
    "52": 8966,
    "53": 13223,
    "54": 19995,
    "55": 50014,
    "56": 84892,
    "57": 157336,
    "58": 207703,
    "59": 140607,
    "60": 286217,
    "61": 259693,
}


# Helper that announces the query movie and collects its most similar movies
def get_similar_movies(movie_data, movie_id, top_indexes):
    """Build the recommendation payload for one query movie.

    Prints a "Recommendations for <title>" header for ``movie_id`` and returns
    one entry per ranked movie.

    Parameters
    ----------
    movie_data : DataFrame
        Must provide ``movie_id`` and ``title`` columns.
    movie_id : int
        Id of the query movie (used only for the printed header).
    top_indexes : sequence
        ``top_indexes[0]`` is an array of 0-based row indexes of the ranked
        movies; ``top_indexes[1]`` is indexable by those row indexes and gives
        each movie's similarity score.

    Returns
    -------
    dict
        ``{"result": [...]}`` where every item has the keys ``"tmdb_id"``
        (taken from the module-level ``lookup``), ``"similarity"`` and
        ``"title"``.
    """
    query_title = movie_data[movie_data.movie_id == movie_id].title.values[0]
    print('Recommendations for {0}: \n'.format(query_title))

    result = []
    # Row indexes are 0-based while movie ids are 1-based, hence the + 1.
    for current_id in top_indexes[0] + 1:
        matches = movie_data[movie_data.movie_id == current_id]
        tmdb_id = lookup[str(matches.movie_id.values[0])]
        score = top_indexes[1][current_id - 1]
        title = matches.title.values[0]
        result.append({
            "tmdb_id": tmdb_id,
            "similarity": score,
            "title": title
        })
    return {"result": result}


# Number of leading eigenvector columns kept as the reduced representation
k = 25
# How many movies to return similarities for
top_n = 70

# One row per movie, restricted to the first k columns of evecs.
# NOTE(review): relies on the columns of evecs being ordered by decreasing
# eigenvalue so that the first k are the most informative — confirm.
sliced = evecs[:, :k]  # representative data


def getData(movie_id):
    """Return the recommendation payload for ``movie_id``.

    ``movie_id`` may be an int or a numeric string; it is converted with
    ``int()`` first. Similarities are computed on the module-level ``sliced``
    matrix for ``top_n`` movies, and the result is formatted by
    ``get_similar_movies`` using the module-level ``movie_data``.
    """
    numeric_id = int(movie_id)
    return get_similar_movies(
        movie_data,
        numeric_id,
        top_cosine_similarity(sliced, numeric_id, top_n),
    )

# Change the id below to look up similar movies for a different film
print(getData("54"))