In [147]:
# Carregando tabelas users, reviews, places
# users: Alan, Barbara, Carlos, Denis, Edgar
# reviews: uid, pid, value
# places: 3 sushis, 3 pizzarias, 3 cantinas, 2 fast-food
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math
In [148]:
# Load the three input tables (users, reviews, places) from the data directory.
users, reviews, places = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/users.csv",
        "../data/reviews.csv",
        "../data/places.csv",
    )
)
In [149]:
# Preview the first rows of the users table.
users.head()
Out[149]:
In [150]:
# Preview the first 10 rows of the places table.
places.head(n=10)
Out[150]:
Considere, para cada loja, um vetor $v$ onde cada usuário $uid$ representa um eixo e a nota atribuída pelo usuário representa a intensidade do vetor naquela direção. Para calcular a similaridade entre duas lojas, vamos calcular o cosseno entre os vetores das duas lojas por meio da seguinte fórmula:
$$\cos(a, b) = \frac{\sum_i a_i b_i}{\sqrt{\sum_i a_i^2}\,\sqrt{\sum_i b_i^2}}$$
Esta fórmula nos permite medir a similaridade pelo valor do cosseno do ângulo entre os dois vetores. Quando os vetores apontam na mesma direção, o ângulo entre eles é igual a 0. Ao calcularmos o cosseno deste ângulo temos que $\cos 0 = 1$,
portanto a similaridade é máxima. De forma análoga, quando os vetores forem ortogonais (isto é, não tiverem nenhuma direção em comum), o valor será igual a 0.
Como estamos trabalhando com notas não negativas (notas de 0 a 5), é esperado que o resultado esteja sempre entre 0 e 1.
In [151]:
def cosine(r1=None, r2=None):
    """
    Cosine similarity between two places, computed over their common reviewers.

    Each argument is the slice of the reviews table belonging to one place and
    must expose the columns "uid" (reviewer id) and "review" (rating). Only
    users that rated both places contribute; each such user is one axis of the
    two rating vectors being compared.

    The formula for the cosine similarity is:
    > SUM(a*b)/(SQRT(SUM(a^2)) * SQRT(SUM(b^2)))

    Parameters:
        r1: DataFrame with the reviews of the first place (or None).
        r2: DataFrame with the reviews of the second place (or None).

    Returns:
        The similarity as a float. Returns 0.0 when the similarity is
        undefined: a missing table, no user in common, or co-rated ratings
        that are all zero on either side.
    """
    # Without both rating tables there is nothing to compare.
    if r1 is None or r2 is None:
        return 0.0

    # Map reviewer -> rating. Reading the columns directly (instead of
    # iterrows) keeps each column's dtype; a repeated uid keeps its last rating.
    v1 = dict(zip(r1["uid"], r1["review"]))
    v2 = dict(zip(r2["uid"], r2["review"]))

    # Only users that rated both places define comparable axes.
    uids = v1.keys() & v2.keys()

    dot = sum(v1[uid] * v2[uid] for uid in uids)
    len1 = sum(v1[uid] * v1[uid] for uid in uids)
    len2 = sum(v2[uid] * v2[uid] for uid in uids)

    # A zero norm (no common users, or only zero ratings) makes the cosine
    # undefined; report "no similarity" instead of dividing by zero.
    norm = math.sqrt(len1) * math.sqrt(len2)
    if norm == 0:
        return 0.0
    return float(dot / norm)
In [139]:
# Item-item similarity matrix, addressed directly by place id on both axes.
# It has one extra row/column because the ids themselves are used as indices;
# this assumes the ids are the integers 1..len(places) -- TODO confirm against places.csv.
sims = np.zeros((len(places) + 1, len(places) + 1))

# Select each place's reviews once up front instead of re-filtering the whole
# reviews table for every pair inside the nested loop.
place_ids = [int(place_id) for place_id in places["id"]]
reviews_by_place = {
    place_id: reviews[reviews["pid"] == place_id] for place_id in place_ids
}

# Compare every place against every other place (including itself); cosine()
# restricts each comparison to the users that rated both places.
for pid1 in place_ids:
    for pid2 in place_ids:
        sims[pid1][pid2] = cosine(reviews_by_place[pid1], reviews_by_place[pid2])

# Total number of relationships computed
print("Total: {0} relacionamentos calculados.".format((sims.shape[0] - 1) * (sims.shape[1] - 1)))
In [181]:
def predict(pid, reviews):
    """
    Predict a user's rating for a place from the user's previous ratings.

    The prediction is the similarity-weighted sum of the user's ratings,
    where each weight is the similarity (read from the module-level `sims`
    matrix) between the place already rated and the target place, divided by
    one plus the sum of those weights.

    Parameters:
        pid: id of the place whose rating is being predicted; used as an
            index into `sims`.
        reviews: DataFrame with the reviews of a single user; must expose the
            columns "pid" and "review".

    Returns:
        The predicted rating; 0.0 when `reviews` is empty.

    NOTE(review): the constant 1 in the denominator keeps the division defined
    when the weights sum to zero, but it also pulls every prediction toward 0.
    Confirm that this damping is intended.
    """
    # Place ids index the similarity matrix, so they must be plain integers
    # even when the caller's frame stores them with a float dtype.
    target = int(pid)

    weighted_sum = 0.0
    weight_total = 0.0
    # Read the two columns directly: iterrows() would upcast "pid" to float
    # whenever "review" holds floats, and a float cannot index `sims`.
    for rated_pid, rating in zip(reviews["pid"], reviews["review"]):
        weight = sims[int(rated_pid)][target]
        weighted_sum += weight * rating
        weight_total += weight

    return weighted_sum / (1 + weight_total)
# Predict place ratings for every user.
# For each (user, place) pair the check below tests whether the user has no
# review for that place; "* " is printed (without a newline) for such pairs.
# NOTE(review): indentation was lost in this export, so it is not visible here
# whether the two "Pred(...)" prints belong to the `if` body (predictions shown
# only for unreviewed places) or run for every pair with "* " acting as a
# marker -- confirm against the original notebook.
# NOTE(review): both loops bind the same name `i`; neither value is used.
for i, user in users.iterrows():
for i, place in places.iterrows():
if not len(reviews[(reviews.uid == user["id"]) & (reviews.pid == place["id"])]):
print("* ", end="")
print("Pred('{0}', '{1}') = ".format(place["name"], user["name"]), end="")
# predict() receives only the reviews written by the current user.
print(predict(place["id"], reviews[reviews["uid"] == user["id"]]))
# Disabled duplicate of the two prints above, kept from the original notebook:
#print("Pred('{0}', '{1}') = ".format(place["name"], user["name"]), end="")
#print(predict(place["id"], reviews[reviews["uid"] == user["id"]]))