Recommender Systems & Collaborative Filtering

Sistemas de recomendação são uma das principais aplicações de machine learning na atualidade: estima-se que até **60%** da receita da Amazon venha de recomendações [não tenho uma citação ainda, então isso é achismo].

Em 2 de outubro de 2006, a Netflix ofereceu um prêmio de **US\$ 1.000.000,00** a qualquer pessoa ou time que melhorasse seu algoritmo de recomendação em 10%... o prêmio final foi conquistado em 21 de setembro de 2009, quase 3 anos depois.

Só estes exemplos já demonstram a importância real de boas recomendações... e o impacto que algoritmos que obtêm bons resultados podem ter.

Collaborative Filtering é o 'carro-chefe' quando se pensa em sistemas de recomendação... mas o que é um _filtro colaborativo_?

VAMOS PRO PPT! DEPOIS VOLTAMOS


In [59]:
from math import sqrt
from critics import critics

# Sample data set: viewer name -> {movie title: rating}.
# A title missing from a viewer's dict means that viewer has not rated it;
# the gaps are what the recommender is asked to fill in.
#
# 'Star Wars' ratings are intentionally left out of the active data. The
# values kept around for experimenting were: Bernadette 1.0, Amy 5.0,
# Howard 5.0, Raj 5.0, Leonard 5.0, Sheldon 1.0, Penny 2.5 (Daniel: none).
movies = {
    'Bernadette': {
        'Lady in the Water': 2.5, 'Snakes on a Plane': 1.0,
        'Just My Luck': 3.0, 'Superman Returns': 3.5,
        'Love Story': 5.0, 'The Notebook': 5.0,
    },
    'Amy': {
        'Lady in the Water': 2.0, 'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5, 'Superman Returns': 5.0,
        'Love Story': 3.5, 'The Notebook': 5.0,
    },
    # Howard has not rated 'Just My Luck' or 'Love Story'.
    'Howard': {
        'Lady in the Water': 4.5, 'Snakes on a Plane': 1.0,
        'Superman Returns': 5.0, 'The Notebook': 1.0,
    },
    # Raj has not rated 'Lady in the Water'.
    'Raj': {
        'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
        'Superman Returns': 4.0, 'Love Story': 5.0,
        'The Notebook': 4.5,
    },
    'Leonard': {
        'Lady in the Water': 2.0, 'Snakes on a Plane': 5.0,
        'Just My Luck': 2.0, 'Superman Returns': 5.0,
        'Love Story': 2.5, 'The Notebook': 2.5,
    },
    # Sheldon has not rated 'Just My Luck'.
    'Sheldon': {
        'Lady in the Water': 1.5, 'Snakes on a Plane': 5.0,
        'Superman Returns': 5.0, 'Love Story': 1.0,
        'The Notebook': 1.0,
    },
    # Penny has not rated 'Lady in the Water', 'Just My Luck' or
    # 'The Notebook'.
    'Penny': {
        'Snakes on a Plane': 1.0, 'Superman Returns': 4.0,
        'Love Story': 5.0,
    },
    # Daniel rated only two titles, so most movies are candidates for him.
    'Daniel': {
        'Superman Returns': 1.0, 'The Notebook': 5.0,
    },
}
#################################################################
def loadDataset(path=""):
    """Load book ratings from the BX-* CSV files into a preference dict.

    Two ';'-separated files are read from ``path`` (which is prepended
    verbatim, so it must end with a path separator when it is a folder):

    * ``BX-Books.csv``        -- first two fields are book id and title
    * ``BX-Book-Ratings.csv`` -- fields are user, book id, rating

    Double quotes are stripped from both files and backslashes from the
    ratings file before splitting.

    Parameter: the folder (prefix) where the data files are stored.
    Return: ``{user: {book title: rating as float}}`` containing only
        ratings greater than zero whose book id is present in the books file.

    Lines that cannot be used (wrong number of fields, non-numeric rating,
    unknown book id) are reported on stdout and skipped; they never abort
    the load and never leave an empty entry for the user behind.
    """
    # Map book id -> title.
    books = {}
    with open(path + "BX-Books.csv") as books_file:
        for line in books_file:
            fields = line.replace('"', "").split(";")
            if len(fields) < 2:
                # Blank or truncated line: nothing to index.
                continue
            book_id, title = fields[0:2]
            books[book_id] = title

    # Build user -> {title: rating}.
    prefs = {}
    with open(path + "BX-Book-Ratings.csv") as ratings_file:
        for line in ratings_file:
            line = line.replace('"', "").replace("\\", "")
            fields = line.split(";")
            if len(fields) != 3:
                print("value error found! " + line.strip())
                continue
            user, bookid, rating = fields

            try:
                score = float(rating)
            except ValueError:
                # Typically the header row or a corrupted rating field.
                print("value error found! " + user + bookid + rating)
                continue

            # Only ratings strictly greater than zero are kept; written as
            # "not >" so a NaN rating is dropped too.
            if not score > 0.0:
                continue

            # Resolve the title before touching prefs so that an unknown
            # book id does not create an empty dict for the user.
            try:
                title = books[bookid]
            except KeyError:
                print("key error found! " + user + " " + bookid)
                continue

            prefs.setdefault(user, {})[title] = score
    return prefs
#################################################################
# Flip the preference matrix: {person: {item: score}} becomes
# {item: {person: score}}.
def transformPrefs(prefs):
    """Return a new dict keyed by item, then by person, with the same scores."""
    results = {}
    for person, ratings in prefs.items():
        for item, score in ratings.items():
            if item not in results:
                results[item] = {}
            results[item][person] = score
    return results
#################################################################
# Similarity score based on the (squared) Euclidean distance.
def sim_euclidean(prefs, person1, person2):
    """Return a distance-based similarity between two people.

    Only items rated by both people are considered. The score is
    ``1 / (1 + sum of squared rating differences)``, so identical ratings
    give 1.0 and the score approaches 0 as the ratings diverge.

    Parameters:
        prefs: ``{person: {item: rating}}``.
        person1, person2: keys of ``prefs`` (a missing key raises KeyError).
    Return: 0 when the two people share no items, otherwise a float in
        (0, 1].
    """
    ratings1 = prefs[person1]
    ratings2 = prefs[person2]

    # Items present in both rating dicts.
    shared = [item for item in ratings1 if item in ratings2]

    # Nothing in common: no basis for a similarity.
    if not shared:
        return 0

    sum_of_squares = sum((ratings1[item] - ratings2[item]) ** 2
                         for item in shared)

    # 1.0 forces true division: with integer ratings under Python 2,
    # "1 / (1 + n)" would truncate to 0 for every non-identical pair.
    return 1.0 / (1 + sum_of_squares)
#################################################################
# Returns the Pearson correlation coefficient for p1 and p2
def sim_pearson(prefs, p1, p2):
    """Return the Pearson correlation of two people's shared ratings.

    Only items rated by both people are considered.

    Parameters:
        prefs: ``{person: {item: rating}}``.
        p1, p2: keys of ``prefs`` (a missing key raises KeyError).
    Return: 0 when there are no shared items or when either person's
        shared ratings have no spread (correlation undefined); otherwise
        the correlation coefficient r.
    """
    ratings1 = prefs[p1]
    ratings2 = prefs[p2]

    # Items present in both rating dicts.
    shared = [it for it in ratings1 if it in ratings2]

    # Nothing in common: no basis for a correlation.
    if not shared:
        return 0

    # Float count so every "/ n" below is true division, even with integer
    # ratings under Python 2.
    n = float(len(shared))

    # Sums of the ratings
    sum1 = sum(ratings1[it] for it in shared)
    sum2 = sum(ratings2[it] for it in shared)

    # Sums of the squares
    sum1Sq = sum(pow(ratings1[it], 2) for it in shared)
    sum2Sq = sum(pow(ratings2[it], 2) for it in shared)

    # Sum of the products
    pSum = sum(ratings1[it] * ratings2[it] for it in shared)

    # Pearson score: covariance term over the product of the spread terms.
    num = pSum - (sum1 * sum2 / n)
    spread1 = sum1Sq - pow(sum1, 2) / n
    spread2 = sum2Sq - pow(sum2, 2) / n

    # A spread term is mathematically >= 0, but rounding can push a
    # constant rating vector slightly below zero; taking sqrt of the
    # resulting negative product would raise ValueError. Treat "no spread"
    # as "no correlation".
    if spread1 <= 0 or spread2 <= 0:
        return 0

    den = sqrt(spread1 * spread2)
    if den == 0:
        # Product underflowed to zero.
        return 0

    return num / den
#################################################################
# Best matches for `person` among everyone else in `prefs`.
# The number of results and the similarity function are optional.
def topMatches(prefs, person, n=5, similarity=sim_pearson):
    """Return up to ``n`` ``(score, other)`` tuples, highest score first.

    Every key of ``prefs`` other than ``person`` is scored with
    ``similarity(prefs, person, other)``. Tuples are ordered by plain tuple
    comparison, so equal scores are ordered by the other person's key.
    """
    ranked = []
    for candidate in prefs:
        if candidate != person:
            ranked.append((similarity(prefs, person, candidate), candidate))

    # Ascending sort, then walk it backwards for best-first order.
    ranked.sort()
    return ranked[::-1][:n]
#################################################################
# Recommendations for a person, computed as a similarity-weighted average
# of every other user's ratings.
def getRecommendations(prefs, person, similarity=sim_pearson):
    """Return ``(predicted score, item)`` tuples, highest prediction first.

    For each other user with a positive similarity to ``person``, every
    item that ``person`` has not rated (or rated 0) contributes
    ``rating * similarity``. The prediction for an item is that weighted
    sum divided by the sum of the similarities that contributed to it.
    """
    weighted = {}   # item -> sum of rating * similarity
    weights = {}    # item -> sum of similarity

    for other in prefs:
        # Never compare the person with themselves.
        if other == person:
            continue

        sim = similarity(prefs, person, other)

        # Users with zero or negative similarity do not contribute.
        if sim <= 0:
            continue

        for item in prefs[other]:
            # Skip items the person already has a non-zero rating for.
            if item in prefs[person] and prefs[person][item] != 0:
                continue
            weighted[item] = weighted.get(item, 0) + prefs[other][item] * sim
            weights[item] = weights.get(item, 0) + sim

    # Normalise each weighted sum by the total similarity behind it.
    rankings = []
    for item, total in weighted.items():
        rankings.append((total / weights[item], item))

    # Ascending sort, returned back-to-front so the best comes first.
    rankings.sort()
    return rankings[::-1]

In [60]:
sim_euclidean(critics,'98556', '180727')


Out[60]:
0.058823529411764705

In [61]:
sim_pearson(critics,'180727', '177432')


Out[61]:
0.6622661785325219

In [62]:
topMatches(critics,'98556',10,sim_euclidean)


Out[62]:
[(1.0, '69721'),
 (1.0, '28667'),
 (1.0, '224646'),
 (1.0, '182212'),
 (1.0, '11676'),
 (0.5, '4157'),
 (0.5, '28729'),
 (0.5, '224650'),
 (0.5, '199616'),
 (0.5, '189139')]

In [64]:
getRecommendations(movies,'Daniel', sim_euclidean)[:5]


Out[64]:
[(4.322167619090522, 'Love Story'),
 (2.6109225700944156, 'Just My Luck'),
 (2.4313712546785573, 'Lady in the Water'),
 (2.3741151597664496, 'Snakes on a Plane')]

In [ ]: