张艺馨 15210130100

1.UserCF&ItemCF代码


In [1]:
critics={'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,
      'Just My Luck': 3.0, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5,
      'The Night Listener': 3.0},
     'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5,
      'Just My Luck': 1.5, 'Superman Returns': 5.0, 'The Night Listener': 3.0,
      'You, Me and Dupree': 3.5},
     'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,
      'Superman Returns': 3.5, 'The Night Listener': 4.0},
     'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
      'The Night Listener': 4.5, 'Superman Returns': 4.0,
      'You, Me and Dupree': 2.5},
     'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
      'Just My Luck': 2.0, 'Superman Returns': 3.0, 'The Night Listener': 3.0,
      'You, Me and Dupree': 2.0},
     'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
      'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5},
     'Toby': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0,'Superman Returns':4.0}}

In [2]:
critics['Lisa Rose']['Lady in the Water']


Out[2]:
2.5

In [3]:
critics['Toby']['Snakes on a Plane']=4.5

In [4]:
critics['Toby']


Out[4]:
{'Snakes on a Plane': 4.5, 'Superman Returns': 4.0, 'You, Me and Dupree': 1.0}

1.1 User-based filtering


In [5]:
import numpy as np
np.sqrt(np.power(5-4, 2) + np.power(4-1, 2))


Out[5]:
3.1622776601683795

In [7]:
1.0 /(1 + np.sqrt(np.power(5-4, 2) + np.power(4-1, 2)) )


Out[7]:
0.2402530733520421

In [8]:
def sim_distance(prefs,person1,person2):
    si={}
    for item in prefs[person1]:
        if item in prefs[person2]:
            si[item]=1
    if len(si)==0: return 0
    sum_of_squares=np.sum([np.power(prefs[person1][item]-prefs[person2][item],2)
                      for item in prefs[person1] if item in prefs[person2]])
    return 1/(1+sum_of_squares)

In [9]:
sim_distance(critics, 'Lisa Rose','Gene Seymour')


Out[9]:
0.14814814814814814

In [10]:
def sim_pearson(prefs,p1,p2):
    si={}
    for item in prefs[p1]:
        if item in prefs[p2]: si[item]=1
    n=len(si)
    if n==0: return 0
    sum1=np.sum([prefs[p1][it] for it in si])
    sum2=np.sum([prefs[p2][it] for it in si])
    sum1Sq=np.sum([np.power(prefs[p1][it],2) for it in si])
    sum2Sq=np.sum([np.power(prefs[p2][it],2) for it in si])
    pSum=np.sum([prefs[p1][it]*prefs[p2][it] for it in si])
    num=pSum-(sum1*sum2/n)
    den=np.sqrt((sum1Sq-np.power(sum1,2)/n)*(sum2Sq-np.power(sum2,2)/n))
    if den==0: return 0
    return num/den

In [11]:
sim_pearson(critics, 'Lisa Rose','Gene Seymour')


Out[11]:
0.39605901719066977

In [12]:
def topMatches(prefs,person,n=5,similarity=sim_pearson):
    scores=[(similarity(prefs,person,other),other)
        for other in prefs if other!=person]
    scores.sort( )
    scores.reverse( )
    return scores[0:n]

In [13]:
topMatches(critics,'Toby',n=3)


Out[13]:
[(0.99124070716192991, 'Lisa Rose'),
 (0.92447345164190486, 'Mick LaSalle'),
 (0.89340514744156474, 'Claudia Puig')]

In [14]:
def getRecommendations(prefs,person,similarity=sim_pearson):
    totals={}
    simSums={}
    for other in prefs:
        if other==person: continue
        sim=similarity(prefs,person,other)
        if sim<=0: continue
        for item in prefs[other]:   
            if item not in prefs[person] or prefs[person][item]==0:
                totals.setdefault(item,0)
                totals[item]+=prefs[other][item]*sim
                simSums.setdefault(item,0)
                simSums[item]+=sim
    rankings=[(total/simSums[item],item) for item,total in totals.items()]
    rankings.sort()
    rankings.reverse()
    return rankings

In [15]:
getRecommendations(critics,'Toby')


Out[15]:
[(3.3477895267131013, 'The Night Listener'),
 (2.8325499182641614, 'Lady in the Water'),
 (2.5309807037655645, 'Just My Luck')]

In [16]:
getRecommendations(critics,'Toby',similarity=sim_distance)


Out[16]:
[(3.5002478401415877, 'The Night Listener'),
 (2.7561242939959363, 'Lady in the Water'),
 (2.4619884860743739, 'Just My Luck')]

1.2 Item-based filtering


In [17]:
def transformPrefs(prefs):
    result={}
    for person in prefs:
        for item in prefs[person]:
            result.setdefault(item,{})
            result[item][person]=prefs[person][item]
    return result

movies = transformPrefs(critics)

In [18]:
topMatches(movies,'Superman Returns')


Out[18]:
[(0.65795169495976946, 'You, Me and Dupree'),
 (0.48795003647426888, 'Lady in the Water'),
 (0.11180339887498941, 'Snakes on a Plane'),
 (-0.17984719479905439, 'The Night Listener'),
 (-0.42289003161103106, 'Just My Luck')]

In [19]:
getRecommendations(movies,'Just My Luck')


Out[19]:
[(4.0, 'Michael Phillips'), (3.0, 'Jack Matthews')]

In [20]:
getRecommendations(movies, 'You, Me and Dupree')


Out[20]:
[(3.1637361366111816, 'Michael Phillips')]

In [21]:
def calculateSimilarItems(prefs,n=10):
    result={}
    itemPrefs=transformPrefs(prefs)
    c=0
    for item in itemPrefs:
        c+=1
        if c%100==0: print "%d / %d" % (c,len(itemPrefs))
        scores=topMatches(itemPrefs,item,n=n,similarity=sim_distance)
        result[item]=scores
    return result

itemsim=calculateSimilarItems(critics) 
itemsim


Out[21]:
{'Just My Luck': [(0.22222222222222221, 'Lady in the Water'),
  (0.18181818181818182, 'You, Me and Dupree'),
  (0.15384615384615385, 'The Night Listener'),
  (0.10526315789473684, 'Snakes on a Plane'),
  (0.064516129032258063, 'Superman Returns')],
 'Lady in the Water': [(0.40000000000000002, 'You, Me and Dupree'),
  (0.2857142857142857, 'The Night Listener'),
  (0.22222222222222221, 'Snakes on a Plane'),
  (0.22222222222222221, 'Just My Luck'),
  (0.090909090909090912, 'Superman Returns')],
 'Snakes on a Plane': [(0.22222222222222221, 'Lady in the Water'),
  (0.18181818181818182, 'The Night Listener'),
  (0.16666666666666666, 'Superman Returns'),
  (0.10526315789473684, 'Just My Luck'),
  (0.05128205128205128, 'You, Me and Dupree')],
 'Superman Returns': [(0.16666666666666666, 'Snakes on a Plane'),
  (0.10256410256410256, 'The Night Listener'),
  (0.090909090909090912, 'Lady in the Water'),
  (0.064516129032258063, 'Just My Luck'),
  (0.053333333333333337, 'You, Me and Dupree')],
 'The Night Listener': [(0.2857142857142857, 'Lady in the Water'),
  (0.18181818181818182, 'Snakes on a Plane'),
  (0.15384615384615385, 'Just My Luck'),
  (0.14814814814814814, 'You, Me and Dupree'),
  (0.10256410256410256, 'Superman Returns')],
 'You, Me and Dupree': [(0.40000000000000002, 'Lady in the Water'),
  (0.18181818181818182, 'Just My Luck'),
  (0.14814814814814814, 'The Night Listener'),
  (0.053333333333333337, 'Superman Returns'),
  (0.05128205128205128, 'Snakes on a Plane')]}

In [24]:
def getRecommendedItems(prefs,itemMatch,user):
    userRatings=prefs[user]
    scores={}
    totalSim={}
    # Loop over items rated by this user
    for (item,rating) in userRatings.items( ):
        # Loop over items similar to this one
        for (similarity,item2) in itemMatch[item]:
            # Ignore if this user has already rated this item
            if item2 in userRatings: continue
            # Weighted sum of rating times similarity
            scores.setdefault(item2,0)
            scores[item2]+=similarity*rating
            # Sum of all the similarities
            totalSim.setdefault(item2,0)
            totalSim[item2]+=similarity
    # Divide each total score by total weighting to get an average
    rankings=[(score/totalSim[item],item) for item,score in scores.items( )]
    # Return the rankings from highest to lowest
    rankings.sort( )
    rankings.reverse( )
    return rankings

getRecommendedItems(critics,itemsim,'Toby')


Out[24]:
[(3.182634730538922, 'The Night Listener'),
 (2.5983318700614575, 'Just My Luck'),
 (2.4730878186968837, 'Lady in the Water')]

In [25]:
import os
import random

class Graph:
    def __init__(self):
        self.G = dict()
    
    def addEdge(self, p, q):
        if p not in self.G: self.G[p] = dict()
        if q not in self.G: self.G[q] = dict()
        self.G[p][q] = 1
        self.G[q][p] = 1

    def getGraphMatrix(self):
        return self.G

In [26]:
graph = Graph()
graph.addEdge('A', 'a')
graph.addEdge('A', 'c')
graph.addEdge('B', 'a')
graph.addEdge('B', 'b')
graph.addEdge('B', 'c')
graph.addEdge('B', 'd')
graph.addEdge('C', 'c')
graph.addEdge('C', 'd')
G = graph.getGraphMatrix()
print(G.keys())


['A', 'a', 'c', 'B', 'd', 'C', 'b']

In [27]:
G


Out[27]:
{'A': {'a': 1, 'c': 1},
 'B': {'a': 1, 'b': 1, 'c': 1, 'd': 1},
 'C': {'c': 1, 'd': 1},
 'a': {'A': 1, 'B': 1},
 'b': {'B': 1},
 'c': {'A': 1, 'B': 1, 'C': 1},
 'd': {'B': 1, 'C': 1}}

In [28]:
def PersonalRank(G, alpha, root, max_step):
    rank = dict()
    rank = {x:0.0 for x in G.keys()}
    rank[root] = 1.0
    for k in range(max_step):
        tmp = {x:0.0 for x in G.keys()}
        for i,ri in G.items():
            for j,wij in ri.items():
                if j not in tmp: tmp[j] = 0.0
                tmp[j] += alpha * rank[i] / (len(ri)*1.0)
                if j == root: tmp[j] += 1.0 - alpha
        rank = tmp
        print(k, rank)
    return rank

In [29]:
print(PersonalRank(G, 0.8, 'A', 20))


(0, {'A': 0.3999999999999999, 'a': 0.4, 'c': 0.4, 'B': 0.0, 'd': 0.0, 'C': 0.0, 'b': 0.0})
(1, {'A': 0.6666666666666666, 'a': 0.15999999999999998, 'c': 0.15999999999999998, 'B': 0.2666666666666667, 'd': 0.0, 'C': 0.10666666666666669, 'b': 0.0})
(2, {'A': 0.5066666666666666, 'a': 0.32, 'c': 0.3626666666666667, 'B': 0.10666666666666665, 'd': 0.09600000000000003, 'C': 0.04266666666666666, 'b': 0.053333333333333344})
(3, {'A': 0.624711111111111, 'a': 0.22399999999999998, 'c': 0.24106666666666665, 'B': 0.30577777777777787, 'd': 0.03839999999999999, 'C': 0.13511111111111113, 'b': 0.02133333333333333})
(4, {'A': 0.5538844444444444, 'a': 0.31104, 'c': 0.36508444444444443, 'B': 0.1863111111111111, 'd': 0.11520000000000002, 'C': 0.07964444444444443, 'b': 0.061155555555555574})
(5, {'A': 0.6217718518518518, 'a': 0.258816, 'c': 0.29067377777777775, 'B': 0.31677629629629633, 'd': 0.06911999999999999, 'C': 0.14343585185185187, 'b': 0.03726222222222222})
(6, {'A': 0.5810394074074073, 'a': 0.312064, 'c': 0.3694383407407408, 'B': 0.2384971851851852, 'd': 0.12072960000000002, 'C': 0.1051610074074074, 'b': 0.06335525925925926})
(7, {'A': 0.6233424908641975, 'a': 0.2801152, 'c': 0.322179602962963, 'B': 0.322318538271605, 'd': 0.08976384000000001, 'C': 0.14680873086419757, 'b': 0.047699437037037044})
(8, {'A': 0.5979606407901235, 'a': 0.313800704, 'c': 0.372524196345679, 'B': 0.27202572641975314, 'd': 0.12318720000000004, 'C': 0.12182009679012348, 'b': 0.06446370765432101})
(9, {'A': 0.6248600672921809, 'a': 0.2935894016, 'c': 0.34231744031604944, 'B': 0.32570591341563787, 'd': 0.10313318400000002, 'C': 0.14861466569218107, 'b': 0.05440514528395063})
(10, {'A': 0.6087204113909463, 'a': 0.31508520959999997, 'c': 0.3745310758768724, 'B': 0.29349780121810704, 'd': 0.12458704896, 'C': 0.13253792435094652, 'b': 0.06514118268312757})
(11, {'A': 0.6259090374071659, 'a': 0.3021877248, 'c': 0.3552028945403786, 'B': 0.3278568031376681, 'd': 0.11171472998400003, 'C': 0.14970977315116596, 'b': 0.05869956024362141})
(12, {'A': 0.6155958617974342, 'a': 0.31593497559039996, 'c': 0.37581888485086634, 'B': 0.3072414019859315, 'd': 0.125455269888, 'C': 0.13940666387103434, 'b': 0.06557136062753362})
(13, {'A': 0.6265923595297243, 'a': 0.30768662511615996, 'c': 0.3634492906645737, 'B': 0.32923155598695125, 'd': 0.11721094594560004, 'C': 0.15040047724876437, 'b': 0.0614482803971863})
(14, {'A': 0.6199944608903503, 'a': 0.31648325500928, 'c': 0.37664344590878573, 'B': 0.3160374635863394, 'd': 0.126006502096896, 'C': 0.14380418922212634, 'b': 0.06584631119739025})
(15, {'A': 0.6270315542460547, 'a': 0.311205277073408, 'c': 0.36872695276225853, 'B': 0.3301112040427255, 'd': 0.12072916840611841, 'C': 0.1508408530811013, 'b': 0.06320749271726787})
(16, {'A': 0.6228092982326321, 'a': 0.316834862506967, 'c': 0.37717120373940755, 'B': 0.3216669597688938, 'd': 0.12635858204098563, 'C': 0.14661885476571632, 'b': 0.0660222408085451})
(17, {'A': 0.6273129326666287, 'a': 0.3134571112468316, 'c': 0.37210465315311814, 'B': 0.3306741581298591, 'd': 0.1229809338600653, 'C': 0.15112242048023627, 'b': 0.06433339195377877})
(18, {'A': 0.6246107520062307, 'a': 0.3170600046926233, 'c': 0.3775089728847178, 'B': 0.32526983911327995, 'd': 0.12658379981806633, 'C': 0.1484202810515243, 'b': 0.06613483162597182})
(19, {'A': 0.627493061312974, 'a': 0.3148982686251483, 'c': 0.37426638104575805, 'B': 0.3310344465409781, 'd': 0.1244220802432657, 'C': 0.15130257936315128, 'b': 0.06505396782265599})
{'A': 0.627493061312974, 'a': 0.3148982686251483, 'c': 0.37426638104575805, 'B': 0.3310344465409781, 'd': 0.1244220802432657, 'C': 0.15130257936315128, 'b': 0.06505396782265599}

2. 使用graphlab对于音乐数据或电影数据构建推荐系统


In [30]:
import graphlab as gl
gl.canvas.set_target('ipynb')
import matplotlib.pyplot as plt
%matplotlib inline


A newer version of GraphLab Create (v1.9) is available! Your current version is v1.8.5.

You can use pip to upgrade the graphlab-create package. For more information see https://dato.com/products/create/upgrade.
2016-05-21 10:54:24,093 [INFO] graphlab.cython.cy_server, 176: GraphLab Create v1.8.5 started. Logging: /tmp/graphlab_server_1463799260.log
This non-commercial license of GraphLab Create is assigned to 421901797@qq.com and will expire on May 20, 2017. For commercial licensing options, visit https://dato.com/buy/.

In [31]:
train_file = '/Users/zhangyixin/Desktop/cjc2016-gh-pages/10000.txt'
sf = gl.SFrame.read_csv(train_file, header=False, delimiter='\t', verbose=False)
sf.rename({'X1':'user_id', 'X2':'music_id', 'X3':'rating'}).show()



In [32]:
(train_set, test_set) = sf.random_split(0.8, seed=1)

In [33]:
popularity_model = gl.popularity_recommender.create(train_set, 'user_id', 'music_id', target = 'rating')


Recsys training: model = popularity
Preparing data set.
    Data has 1599753 observations with 76085 users and 10000 items.
    Data prepared in: 2.29914s
1599753 observations to process; with 10000 unique items.

In [34]:
item_sim_model = gl.item_similarity_recommender.create(train_set, 'user_id', 'music_id', target = 'rating', 
                                                       similarity_type='cosine')


Recsys training: model = item_similarity
Preparing data set.
    Data has 1599753 observations with 76085 users and 10000 items.
    Data prepared in: 2.51362s
Computing item similarity statistics:
Computing most similar items for 10000 items:
+-----------------+-----------------+
| Number of items | Elapsed Time    |
+-----------------+-----------------+
| 1000            | 3.24894         |
| 2000            | 3.39865         |
| 3000            | 3.55544         |
| 4000            | 3.67706         |
| 5000            | 3.78196         |
| 6000            | 3.89232         |
| 7000            | 4.00233         |
| 8000            | 4.09852         |
| 9000            | 4.21172         |
| 10000           | 4.39043         |
+-----------------+-----------------+
Finished training in 4.85052s
Finished prediction in 1.6265s

In [35]:
factorization_machine_model = gl.recommender.factorization_recommender.create(train_set, 'user_id', 'music_id',
                                                                              target='rating')


Recsys training: model = factorization_recommender
Preparing data set.
    Data has 1599753 observations with 76085 users and 10000 items.
    Data prepared in: 3.07978s
Training factorization_recommender for recommendations.
+--------------------------------+--------------------------------------------------+----------+
| Parameter                      | Description                                      | Value    |
+--------------------------------+--------------------------------------------------+----------+
| num_factors                    | Factor Dimension                                 | 8        |
| regularization                 | L2 Regularization on Factors                     | 1e-08    |
| solver                         | Solver used for training                         | sgd      |
| linear_regularization          | L2 Regularization on Linear Coefficients         | 1e-10    |
| max_iterations                 | Maximum Number of Iterations                     | 50       |
+--------------------------------+--------------------------------------------------+----------+
  Optimizing model using SGD; tuning step size.
  Using 199969 / 1599753 points for tuning the step size.
+---------+-------------------+------------------------------------------+
| Attempt | Initial Step Size | Estimated Objective Value                |
+---------+-------------------+------------------------------------------+
| 0       | 25                | No Decrease (223.075 >= 36.5392)         |
| 1       | 6.25              | No Decrease (215.831 >= 36.5392)         |
| 2       | 1.5625            | No Decrease (186.436 >= 36.5392)         |
| 3       | 0.390625          | No Decrease (84.5942 >= 36.5392)         |
| 4       | 0.0976562         | 12.0584                                  |
| 5       | 0.0488281         | 8.67429                                  |
| 6       | 0.0244141         | 20.4528                                  |
+---------+-------------------+------------------------------------------+
| Final   | 0.0488281         | 8.67429                                  |
+---------+-------------------+------------------------------------------+
Starting Optimization.
+---------+--------------+-------------------+-----------------------+-------------+
| Iter.   | Elapsed Time | Approx. Objective | Approx. Training RMSE | Step Size   |
+---------+--------------+-------------------+-----------------------+-------------+
| Initial | 134us        | 43.795            | 6.61778               |             |
+---------+--------------+-------------------+-----------------------+-------------+
| 1       | 428.752ms    | 43.517            | 6.59634               | 0.0488281   |
| 2       | 710.758ms    | 40.8118           | 6.38805               | 0.0290334   |
| 3       | 1.00s        | 37.8168           | 6.14917               | 0.0214205   |
| 4       | 1.28s        | 35.2992           | 5.94092               | 0.0172633   |
| 5       | 1.54s        | 32.8415           | 5.73032               | 0.014603    |
| 6       | 1.82s        | 30.6943           | 5.53976               | 0.0127367   |
| 10      | 2.97s        | 24.8223           | 4.98155               | 0.008683    |
| 11      | 3.24s        | 23.7317           | 4.87084               | 0.00808399  |
| 15      | 4.80s        | 20.4674           | 4.52329               | 0.00640622  |
| 20      | 6.53s        | 17.8296           | 4.22159               | 0.00516295  |
| 25      | 8.39s        | 15.9942           | 3.99825               | 0.00436732  |
| 30      | 9.84s        | 14.5492           | 3.81322               | 0.00380916  |
| 35      | 11.18s       | 13.4488           | 3.66606               | 0.00339327  |
| 40      | 12.52s       | 12.6294           | 3.55252               | 0.00306991  |
| 45      | 13.80s       | 11.9692           | 3.45831               | 0.00281035  |
| 50      | 15.14s       | 10.6591           | 3.26338               | 0.00183623  |
+---------+--------------+-------------------+-----------------------+-------------+
Optimization Complete: Maximum number of passes through the data reached.
Computing final objective value and training RMSE.
       Final objective value: 9.46694
       Final training RMSE: 3.0753

In [36]:
result = gl.recommender.util.compare_models(test_set, [popularity_model, item_sim_model, factorization_machine_model],
                                            user_sample=.1, skip_set=train_set)


compare_models: using 6871 users to estimate model performance
PROGRESS: Evaluate model M0
recommendations finished on 1000/6871 queries. users per second: 2266.21
recommendations finished on 2000/6871 queries. users per second: 2103.55
recommendations finished on 3000/6871 queries. users per second: 1950.33
recommendations finished on 4000/6871 queries. users per second: 1929.12
recommendations finished on 5000/6871 queries. users per second: 1855.33
recommendations finished on 6000/6871 queries. users per second: 1745
Precision and recall summary statistics by cutoff
+--------+-------------------+-------------------+
| cutoff |   mean_precision  |    mean_recall    |
+--------+-------------------+-------------------+
|   1    | 0.000436617668462 | 5.40574256191e-05 |
|   2    | 0.000291078445641 | 0.000126827037029 |
|   3    | 0.000339591519915 | 0.000245684068999 |
|   4    | 0.000363848057051 | 0.000273252556399 |
|   5    |  0.00049483335759 | 0.000412897271069 |
|   6    | 0.000436617668462 | 0.000449282076774 |
|   7    |  0.00039503503337 | 0.000522051688185 |
|   8    | 0.000363848057051 | 0.000551159532749 |
|   9    | 0.000420446643704 | 0.000856277368066 |
|   10   | 0.000378401979333 | 0.000856277368066 |
+--------+-------------------+-------------------+
[10 rows x 3 columns]

('\nOverall RMSE: ', 5.720865259296598)

Per User RMSE (best)
+-------------------------------+-------+-----------------+
|            user_id            | count |       rmse      |
+-------------------------------+-------+-----------------+
| 83629183d2b25913d3aeed7668... |   1   | 0.0160714285714 |
+-------------------------------+-------+-----------------+
[1 rows x 3 columns]


Per User RMSE (worst)
+-------------------------------+-------+---------------+
|            user_id            | count |      rmse     |
+-------------------------------+-------+---------------+
| e48e5aeb5a3d9e1425ea541d65... |   3   | 151.583917343 |
+-------------------------------+-------+---------------+
[1 rows x 3 columns]


Per Item RMSE (best)
+--------------------+-------+------+
|      music_id      | count | rmse |
+--------------------+-------+------+
| SOAQJRX12A6701F999 |   2   | 0.0  |
+--------------------+-------+------+
[1 rows x 3 columns]


Per Item RMSE (worst)
+--------------------+-------+---------------+
|      music_id      | count |      rmse     |
+--------------------+-------+---------------+
| SOXRHKP12A58A7F404 |   1   | 249.654320988 |
+--------------------+-------+---------------+
[1 rows x 3 columns]

PROGRESS: Evaluate model M1
recommendations finished on 1000/6871 queries. users per second: 528.71
recommendations finished on 2000/6871 queries. users per second: 540.911
recommendations finished on 3000/6871 queries. users per second: 566.361
recommendations finished on 4000/6871 queries. users per second: 599.767
recommendations finished on 5000/6871 queries. users per second: 633.891
recommendations finished on 6000/6871 queries. users per second: 649.185
Precision and recall summary statistics by cutoff
+--------+-------------------+-------------------+
| cutoff |   mean_precision  |    mean_recall    |
+--------+-------------------+-------------------+
|   1    | 0.000436617668462 | 7.40537810234e-05 |
|   2    | 0.000654926502692 | 0.000259672757767 |
|   3    | 0.000630669965556 | 0.000343221025905 |
|   4    | 0.000509387279872 | 0.000364012343451 |
|   5    | 0.000465725513026 | 0.000389864705399 |
|   6    | 0.000436617668462 | 0.000412979758435 |
|   7    | 0.000415826350916 | 0.000460280005852 |
|   8    | 0.000400232862757 | 0.000542752232117 |
|   9    | 0.000388104594188 | 0.000724676260642 |
|   10   | 0.000436617668462 | 0.000978330334701 |
+--------+-------------------+-------------------+
[10 rows x 3 columns]

Finished prediction in 0.282154s
('\nOverall RMSE: ', 6.658554396106699)

Per User RMSE (best)
+-------------------------------+-------+------+
|            user_id            | count | rmse |
+-------------------------------+-------+------+
| a9be282eaf672c22949799d898... |   1   | 0.0  |
+-------------------------------+-------+------+
[1 rows x 3 columns]


Per User RMSE (worst)
+-------------------------------+-------+---------------+
|            user_id            | count |      rmse     |
+-------------------------------+-------+---------------+
| 972cce803aa7beceaa7d0039e4... |   19  | 152.416108472 |
+-------------------------------+-------+---------------+
[1 rows x 3 columns]


Per Item RMSE (best)
+--------------------+-------+------+
|      music_id      | count | rmse |
+--------------------+-------+------+
| SOURPHM12A67021876 |   1   | 0.0  |
+--------------------+-------+------+
[1 rows x 3 columns]


Per Item RMSE (worst)
+--------------------+-------+---------------+
|      music_id      | count |      rmse     |
+--------------------+-------+---------------+
| SOZTVXP12AF72A760B |   2   | 468.813602358 |
+--------------------+-------+---------------+
[1 rows x 3 columns]

PROGRESS: Evaluate model M2
recommendations finished on 1000/6871 queries. users per second: 1777.53
recommendations finished on 2000/6871 queries. users per second: 1859.51
recommendations finished on 3000/6871 queries. users per second: 1911.33
recommendations finished on 4000/6871 queries. users per second: 1954.23
recommendations finished on 5000/6871 queries. users per second: 2001.15
recommendations finished on 6000/6871 queries. users per second: 2013.4
Precision and recall summary statistics by cutoff
+--------+-------------------+-------------------+
| cutoff |   mean_precision  |    mean_recall    |
+--------+-------------------+-------------------+
|   1    | 0.000436617668462 | 4.60874205598e-05 |
|   2    | 0.000509387279872 | 0.000169795759957 |
|   3    | 0.000630669965556 | 0.000400637138375 |
|   4    | 0.000654926502692 | 0.000473686632907 |
|   5    | 0.000785911803231 | 0.000650759354005 |
|   6    | 0.000679183039829 | 0.000656823488289 |
|   7    | 0.000727696114103 | 0.000915218612788 |
|   8    |  0.00070950371125 | 0.000979411805711 |
|   9    | 0.000711525089345 |  0.00109168492046 |
|   10   | 0.000713142191821 |  0.00116742144715 |
+--------+-------------------+-------------------+
[10 rows x 3 columns]

('\nOverall RMSE: ', 7.555521250194041)

Per User RMSE (best)
+-------------------------------+-------+------------------+
|            user_id            | count |       rmse       |
+-------------------------------+-------+------------------+
| 85d4cca3251f1c9b466d346cbe... |   1   | 0.00368622415654 |
+-------------------------------+-------+------------------+
[1 rows x 3 columns]


Per User RMSE (worst)
+-------------------------------+-------+---------------+
|            user_id            | count |      rmse     |
+-------------------------------+-------+---------------+
| e48e5aeb5a3d9e1425ea541d65... |   3   | 145.876884342 |
+-------------------------------+-------+---------------+
[1 rows x 3 columns]


Per Item RMSE (best)
+--------------------+-------+-------------------+
|      music_id      | count |        rmse       |
+--------------------+-------+-------------------+
| SOLUHDM12A6701BEDA |   1   | 5.07580577498e-05 |
+--------------------+-------+-------------------+
[1 rows x 3 columns]


Per Item RMSE (worst)
+--------------------+-------+---------------+
|      music_id      | count |      rmse     |
+--------------------+-------+---------------+
| SOXRHKP12A58A7F404 |   1   | 238.802020677 |
+--------------------+-------+---------------+
[1 rows x 3 columns]


In [37]:
K = 10
users = gl.SArray(sf['user_id'].unique().head(100))

In [38]:
recs = item_sim_model.recommend(users=users, k=K)
recs.head()


Out[38]:
user_id music_id score rank
c66c10a9567f0d82ff31441a9
fd5063e5cd9dfe8 ...
SOFCGSE12AF72A674F 20.686440678 1
c66c10a9567f0d82ff31441a9
fd5063e5cd9dfe8 ...
SOACBLB12AB01871C7 17.8421052632 2
c66c10a9567f0d82ff31441a9
fd5063e5cd9dfe8 ...
SOZPMJT12AAF3B40D1 16.4245283019 3
c66c10a9567f0d82ff31441a9
fd5063e5cd9dfe8 ...
SOGSDHY12AB017BF39 14.7 4
c66c10a9567f0d82ff31441a9
fd5063e5cd9dfe8 ...
SOJSXJY12A8C13E32E 14.07960199 5
c66c10a9567f0d82ff31441a9
fd5063e5cd9dfe8 ...
SOANOQW12A58A793D2 13.8804347826 6
c66c10a9567f0d82ff31441a9
fd5063e5cd9dfe8 ...
SOFWKCI12A8C13A22A 13.495049505 7
c66c10a9567f0d82ff31441a9
fd5063e5cd9dfe8 ...
SOAFPSO12AF72A4521 12.5774647887 8
c66c10a9567f0d82ff31441a9
fd5063e5cd9dfe8 ...
SONDKOF12A6D4F7D70 12.1647398844 9
c66c10a9567f0d82ff31441a9
fd5063e5cd9dfe8 ...
SOQBUFQ12A6D4F7F4C 11.8518518519 10
[10 rows x 4 columns]


In [ ]: