Recommender systems are one of today's flagship applications of machine learning: it is estimated that up to **60%** of Amazon's revenue comes from recommendations [I don't have a citation for this yet, so treat it as a guess].
On October 2, 2006, Netflix offered a **US\$ 1,000,000** prize to any person or team that could improve its recommendation algorithm by 10%... the grand prize was finally won on September 21, 2009, almost three years later.
These examples alone show the real importance of good recommendations... and the impact that algorithms that deliver good results can have.
Collaborative Filtering is the workhorse of recommender systems... but what exactly is a _collaborative filter_?
OVER TO THE SLIDES! WE'LL COME BACK AFTERWARDS
In [59]:
from math import sqrt
from critics import critics
# Sample set of movie ratings: {person: {movie title: rating}}
movies = {
    'Bernadette': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 1.0,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        #'Star Wars': 1.0,
        'Love Story': 5.0,
        'The Notebook': 5.0},
    'Amy': {
        'Lady in the Water': 2.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        #'Star Wars': 5.0,
        'Love Story': 3.5,
        'The Notebook': 5.0},
    'Howard': {
        'Lady in the Water': 4.5,
        'Snakes on a Plane': 1.0,
        # no Just My Luck
        'Superman Returns': 5.0,
        #'Star Wars': 5.0,
        # no Love Story
        'The Notebook': 1.0},
    'Raj': {
        # no Lady in the Water
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 4.0,
        #'Star Wars': 5.0,
        'Love Story': 5.0,
        'The Notebook': 4.5},
    'Leonard': {
        'Lady in the Water': 2.0,
        'Snakes on a Plane': 5.0,
        'Just My Luck': 2.0,
        'Superman Returns': 5.0,
        #'Star Wars': 5.0,
        'Love Story': 2.5,
        'The Notebook': 2.5},
    'Sheldon': {
        'Lady in the Water': 1.5,
        'Snakes on a Plane': 5.0,
        # no Just My Luck
        'Superman Returns': 5.0,
        #'Star Wars': 1.0,
        'Love Story': 1.0,
        'The Notebook': 1.0},
    'Penny': {
        # no Lady in the Water
        'Snakes on a Plane': 1.0,
        # no Just My Luck
        'Superman Returns': 4.0,
        #'Star Wars': 2.5,
        'Love Story': 5.0,
        # no The Notebook
    },
    'Daniel': {
        'Superman Returns': 1.0,
        'The Notebook': 5.0,
        #'Star Wars'
    }
}
#################################################################
def loadDataset(path=""):
    """Load the Book-Crossing dataset.
    Parameter: the folder where the data files are stored
    Return: a dictionary with the data, {user: {book title: rating}}
    """
    # Recover the titles of the books
    books = {}
    for line in open(path + "BX-Books.csv"):
        line = line.replace('"', "")
        (id, title) = line.split(";")[0:2]
        books[id] = title
    # Load the ratings
    prefs = {}
    count = 0
    for line in open(path + "BX-Book-Ratings.csv"):
        line = line.replace('"', "")
        line = line.replace("\\", "")
        (user, bookid, rating) = line.split(";")
        try:
            if float(rating) > 0.0:
                prefs.setdefault(user, {})
                prefs[user][books[bookid]] = float(rating)
        except ValueError:
            count += 1
            print("value error found! " + user + " " + bookid + " " + rating)
        except KeyError:
            count += 1
            print("key error found! " + user + " " + bookid)
    return prefs
#################################################################
# Transform {person: {item: rating}} into {item: {person: rating}}
def transformPrefs(prefs):
    results = {}
    for person in prefs:
        for item in prefs[person]:
            results.setdefault(item, {})
            # flip item and person
            results[item][person] = prefs[person][item]
    return results
#################################################################
# SIMILARITY based on Euclidean distance -- the one we saw in the slides
def sim_euclidean(prefs, person1, person2):
    # get the list of items rated by both people
    si = {}
    for item in prefs[person1]:
        if item in prefs[person2]:
            si[item] = 1
    # if there are no items in common, return zero
    if len(si) == 0: return 0
    # sum of the squared differences over the shared items
    sum_of_squares = sum([pow(prefs[person1][item] - prefs[person2][item], 2)
                          for item in prefs[person1] if item in prefs[person2]])
    # map the distance into (0, 1]: identical ratings give 1
    return 1 / (1 + sum_of_squares)
#################################################################
# Returns the Pearson correlation coefficient for p1 and p2
def sim_pearson(prefs, p1, p2):
    # get the list of items rated by both people
    si = {}
    for item in prefs[p1]:
        if item in prefs[p2]:
            si[item] = 1
    # if there are no items in common, return zero
    if len(si) == 0: return 0
    n = len(si)
    # sum of all the shared preferences
    sum1 = sum([prefs[p1][it] for it in si])
    sum2 = sum([prefs[p2][it] for it in si])
    # sum of the squares
    sum1Sq = sum([pow(prefs[p1][it], 2) for it in si])
    sum2Sq = sum([pow(prefs[p2][it], 2) for it in si])
    # sum of the products
    pSum = sum([prefs[p1][it] * prefs[p2][it] for it in si])
    # Pearson score:
    # r = (sum(xy) - sum(x)*sum(y)/n) /
    #     sqrt((sum(x^2) - sum(x)^2/n) * (sum(y^2) - sum(y)^2/n))
    num = pSum - (sum1 * sum2 / n)
    den = sqrt((sum1Sq - pow(sum1, 2) / n) * (sum2Sq - pow(sum2, 2) / n))
    if den == 0:
        return 0
    r = num / den
    return r
#################################################################
# Returns the best matches for person from the prefs dictionary.
# Number of results and the similarity function are optional params.
def topMatches(prefs, person, n=5, similarity=sim_pearson):
    # score everyone else against this person and keep the top n
    scores = [(similarity(prefs, person, other), other)
              for other in prefs if other != person]
    scores.sort()
    scores.reverse()
    return scores[0:n]
#################################################################
# Gets recommendations for a person by using a weighted average
# of every other user's rankings
def getRecommendations(prefs, person, similarity=sim_pearson):
    totals = {}
    simSums = {}
    for other in prefs:
        # don't compare the person to themselves
        if other == person: continue
        # use the similarity function that was passed in
        sim = similarity(prefs, person, other)
        # ignore similarity scores of zero or below
        if sim <= 0: continue
        for item in prefs[other]:
            # only score items this person hasn't rated yet
            if item not in prefs[person] or prefs[person][item] == 0:
                # similarity * score
                totals.setdefault(item, 0)
                totals[item] += prefs[other][item] * sim
                # sum of similarities
                simSums.setdefault(item, 0)
                simSums[item] += sim
    # build the normalized list of (predicted score, item)
    rankings = [(total / simSums[item], item) for item, total in totals.items()]
    # return it sorted from highest to lowest predicted score
    rankings.sort()
    rankings.reverse()
    return rankings
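The `critics` dictionary imported at the top of the cell holds book ratings in the same `{user: {title: rating}}` shape as `movies`, and was presumably built with `loadDataset` from the Book-Crossing CSV files. A minimal sketch of that call, assuming the files sit in a local `BX-CSV-Dump/` folder (the folder name is an assumption, not something the notebook defines):
In [ ]:
# Hypothetical rebuild of a critics-style dict from the raw CSV files.
# "BX-CSV-Dump/" is an assumed location for BX-Books.csv and
# BX-Book-Ratings.csv -- adjust the path to wherever they actually live.
book_prefs = loadDataset("BX-CSV-Dump/")
print(len(book_prefs))  # number of users with at least one non-zero rating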
In [60]:
sim_euclidean(critics,'98556', '180727')
Out[60]:
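To keep the arithmetic hand-checkable, the same function can be run on the small `movies` sample: Sheldon and Penny share three rated movies, the squared differences are (5.0-1.0)² + (5.0-4.0)² + (1.0-5.0)² = 33, so the similarity is 1/(1+33) ≈ 0.029.
In [ ]:
# Hand-checkable case: Sheldon and Penny share 3 rated movies,
# squared differences 16 + 1 + 16 = 33, so similarity = 1/(1 + 33).
sim_euclidean(movies, 'Sheldon', 'Penny')  # ~0.0294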
In [61]:
sim_pearson(critics,'180727', '177432')
Out[61]:
In [62]:
topMatches(critics,'98556',10,sim_euclidean)
Out[62]:
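Before switching to the small `movies` sample, the same weighted-average routine can produce book recommendations straight from `critics`; the user id is the one already queried above. This is only an illustrative call: scanning every other user of the Book-Crossing data may take a while.
In [ ]:
# Top 5 book recommendations for user '98556' using the default
# Pearson similarity (may be slow on the full Book-Crossing data).
getRecommendations(critics, '98556')[:5]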
In [64]:
getRecommendations(movies,'Daniel', sim_euclidean)[:5]
Out[64]:
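`transformPrefs` was defined above but never exercised. Flipping `movies` into an item-centric view lets `topMatches` answer a different question: which movies are rated most like a given one. A quick sketch using only functions already defined in this notebook:
In [ ]:
# Flip {person: {movie: rating}} into {movie: {person: rating}} and
# find the movies rated most similarly to 'Superman Returns'.
movie_view = transformPrefs(movies)
topMatches(movie_view, 'Superman Returns', n=5, similarity=sim_euclidean)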
In [ ]: