In [1]:
import os
# Work from the project root so that the `src` package imported below resolves.
# Only move up when `src` is not already visible from the current directory:
# an unconditional chdir climbs one more level every time this cell is re-run.
if not os.path.isdir('src'):
    os.chdir('..')

In [2]:
# Import all the packages we need to generate recommendations
import pandas as pd
import src.utils as utils
import src.recommenders as recommenders
import src.similarity as similarity

# Enable logging on Jupyter notebook
import logging
# Fetch the root logger and set its threshold to DEBUG, the lowest standard level.
logger = logging.getLogger(name=None)
logger.setLevel(level=logging.DEBUG)

Similarity metrics: When both customers rate two movies exactly the same


In [3]:
# Toy example built by hand: Ana and Bob give both movies identical ratings.
ratings = pd.DataFrame(columns = ["customer", "movie", "rating"], 
                       data=[
                           ['Ana','movie_1',1],
                           ['Ana', 'movie_2', 5],
                           ['Bob','movie_1',1],
                           ['Bob', 'movie_2', 5]])
# One row per customer, one column per movie; a missing (customer, movie) pair is filled with 0.
ratings_matrix = ratings.pivot_table(index='customer', columns='movie', values='rating', fill_value=0)

# Label-based row selection. `.loc` replaces `.ix`, which was removed in pandas 1.0.
rating_1 = ratings_matrix.loc['Ana']
rating_2 = ratings_matrix.loc['Bob']

ratings_matrix


Out[3]:
movie movie_1 movie_2
customer
Ana 1 5
Bob 1 5

In [4]:
# Compare the two customers' rating vectors under each metric, in a fixed order.
s_intersection, s_cosine, s_pearson, s_jaccard = (
    similarity.calculate_distance(rating_1, rating_2, metric)
    for metric in ('intersection', 'cosine', 'pearson', 'jaccard')
)

# Report each value on its own line, prefixed by its label.
for label, score in (
    ("similarity intersection: ", s_intersection),
    ("similarity cosine: ", s_cosine),
    ("similarity pearson: ", s_pearson),
    ("similarity jaccard: ", s_jaccard),
):
    print(label, score)


similarity intersection:  2
similarity cosine:  1.0
similarity pearson:  1.0
similarity jaccard:  1.0

When two customers rate the same movies very differently


In [5]:
# Toy example built by hand: Ana and Bob rate the same two movies with opposite scores.
ratings = pd.DataFrame(columns = ["customer", "movie", "rating"], 
                       data=[
                           ['Ana','movie_1',5],
                           ['Ana', 'movie_2', 1],
                           ['Bob','movie_1',1],
                           ['Bob', 'movie_2', 5]])
# One row per customer, one column per movie; a missing (customer, movie) pair is filled with 0.
ratings_matrix = ratings.pivot_table(index='customer', columns='movie', values='rating', fill_value=0)

# Label-based row selection. `.loc` replaces `.ix`, which was removed in pandas 1.0.
rating_1 = ratings_matrix.loc['Ana']
rating_2 = ratings_matrix.loc['Bob']

ratings_matrix


Out[5]:
movie movie_1 movie_2
customer
Ana 5 1
Bob 1 5

In [6]:
# Compare the two customers' rating vectors under each metric, in a fixed order.
s_intersection, s_cosine, s_pearson, s_jaccard = (
    similarity.calculate_distance(rating_1, rating_2, metric)
    for metric in ('intersection', 'cosine', 'pearson', 'jaccard')
)

# Report each value on its own line, prefixed by its label.
for label, score in (
    ("similarity intersection: ", s_intersection),
    ("similarity cosine: ", s_cosine),
    ("similarity pearson: ", s_pearson),
    ("similarity jaccard: ", s_jaccard),
):
    print(label, score)


similarity intersection:  2
similarity cosine:  0.384615384615
similarity pearson:  -1.0
similarity jaccard:  1.0

When two customers rate different movies


In [7]:
# Toy example built by hand: Ana and Bob rate completely different movies.
data=[['Ana','movie_1',5],['Ana', 'movie_2', 1],['Bob','movie_3',5],['Bob', 'movie_4', 5]]
ratings = pd.DataFrame(columns = ["customer", "movie", "rating"], data=data)
# One row per customer, one column per movie. No movie is rated by both customers,
# so every row gets 0 (the fill value) for the other customer's movies.
ratings_matrix = ratings.pivot_table(index='customer', columns='movie', values='rating', fill_value=0)

# Label-based row selection. `.loc` replaces `.ix`, which was removed in pandas 1.0.
rating_1 = ratings_matrix.loc['Ana']
rating_2 = ratings_matrix.loc['Bob']

ratings_matrix


Out[7]:
movie movie_1 movie_2 movie_3 movie_4
customer
Ana 5 1 0 0
Bob 0 0 5 5

In [8]:
# Compare the two customers' rating vectors under each metric, in a fixed order.
s_intersection, s_cosine, s_pearson, s_jaccard = (
    similarity.calculate_distance(rating_1, rating_2, metric)
    for metric in ('intersection', 'cosine', 'pearson', 'jaccard')
)

# Report each value on its own line, prefixed by its label.
for label, score in (
    ("similarity intersection: ", s_intersection),
    ("similarity cosine: ", s_cosine),
    ("similarity pearson: ", s_pearson),
    ("similarity jaccard: ", s_jaccard),
):
    print(label, score)


similarity intersection:  0
similarity cosine:  0.0
similarity pearson:  -0.727606875109
similarity jaccard:  0.0

Positive people vs. negative people


In [9]:
# Toy example built by hand: both customers prefer movie_1 over movie_2,
# but Ana's scores (5, 4) sit two points above Bob's (3, 2).
data=[['Ana','movie_1',5],['Ana', 'movie_2', 4],['Bob','movie_1',3],['Bob', 'movie_2', 2]]
ratings = pd.DataFrame(columns = ["customer", "movie", "rating"], data=data)
# One row per customer, one column per movie; a missing (customer, movie) pair is filled with 0.
ratings_matrix = ratings.pivot_table(index='customer', columns='movie', values='rating', fill_value=0)

# Label-based row selection. `.loc` replaces `.ix`, which was removed in pandas 1.0.
rating_1 = ratings_matrix.loc['Ana']
rating_2 = ratings_matrix.loc['Bob']

ratings_matrix


Out[9]:
movie movie_1 movie_2
customer
Ana 5 4
Bob 3 2

In [10]:
# Compare the two customers' rating vectors under each metric, in a fixed order.
s_intersection, s_cosine, s_pearson, s_jaccard = (
    similarity.calculate_distance(rating_1, rating_2, metric)
    for metric in ('intersection', 'cosine', 'pearson', 'jaccard')
)

# Report each value on its own line, prefixed by its label.
for label, score in (
    ("similarity intersection: ", s_intersection),
    ("similarity cosine: ", s_cosine),
    ("similarity pearson: ", s_pearson),
    ("similarity jaccard: ", s_jaccard),
):
    print(label, score)


similarity intersection:  2
similarity cosine:  0.996240588196
similarity pearson:  1.0
similarity jaccard:  1.0

People who rate a lot of movies vs. people who don't rate a lot of movies


In [11]:
# Toy example built by hand: Ana rates three movies, Bob only the first two.
data=[['Ana','movie_1',5],['Ana', 'movie_2', 4],['Ana', 'movie_3', 4],['Bob','movie_1',3],['Bob', 'movie_2', 2]]
ratings = pd.DataFrame(columns = ["customer", "movie", "rating"], data=data)
# One row per customer, one column per movie. Bob has no rating for movie_3,
# so that entry is filled with 0.
ratings_matrix = ratings.pivot_table(index='customer', columns='movie', values='rating', fill_value=0)

# Label-based row selection. `.loc` replaces `.ix`, which was removed in pandas 1.0.
rating_1 = ratings_matrix.loc['Ana']
rating_2 = ratings_matrix.loc['Bob']

ratings_matrix


Out[11]:
movie movie_1 movie_2 movie_3
customer
Ana 5 4 4
Bob 3 2 0

In [12]:
# Compare the two customers' rating vectors under each metric, in a fixed order.
s_intersection, s_cosine, s_pearson, s_jaccard = (
    similarity.calculate_distance(rating_1, rating_2, metric)
    for metric in ('intersection', 'cosine', 'pearson', 'jaccard')
)

# Report each value on its own line, prefixed by its label.
for label, score in (
    ("similarity intersection: ", s_intersection),
    ("similarity cosine: ", s_cosine),
    ("similarity pearson: ", s_pearson),
    ("similarity jaccard: ", s_jaccard),
):
    print(label, score)


similarity intersection:  2
similarity cosine:  0.844926112126
similarity pearson:  0.755928946018
similarity jaccard:  0.6666666666666666