In [1]:
import graphlab
In [2]:
# Load the song dataset from the local 'song_data.gl/' directory into an SFrame.
song_data = graphlab.SFrame('song_data.gl/')
In [3]:
# Preview the first few rows to see which columns the dataset has.
song_data.head()
Out[3]:
In [5]:
# Point GraphLab Canvas at the 'ipynb' target so .show() output is rendered
# inside this notebook.
graphlab.canvas.set_target('ipynb')
In [6]:
# Visual summary of the 'song' column.
song_data['song'].show()
In [7]:
# Total number of rows in the dataset.
len(song_data)
Out[7]:
In [16]:
# Distinct user ids in the full dataset; the recommendation cells index into
# this (users[0], users[1]) to pick example users.
users = song_data['user_id'].unique()
In [6]:
# For each artist of interest, report how many distinct users have at least one
# row for that artist.
artists_of_interest = ['Kanye West', 'Foo Fighters', 'Taylor Swift', 'Lady GaGa']
for artist in artists_of_interest:
    artist_rows = song_data[song_data['artist'] == artist]
    listener_count = len(artist_rows['user_id'].unique())
    # One pre-formatted argument produces the same "name count" line as the
    # two-item print statement.
    print('%s %s' % (artist, listener_count))
In [12]:
# Summed listen_count for a few hand-picked artists.
for artist in ['Kings Of Leon', 'Coldplay', 'Taylor Swift', 'Lady GaGa']:
    artist_rows = song_data[song_data['artist'] == artist]
    print('%s %s' % (artist, artist_rows['listen_count'].sum()))

# Same aggregate for every artist at once: one row per artist, with the summed
# listen_count stored in 'total_count'.
pop = song_data.groupby(key_columns='artist',
                        operations={'total_count': graphlab.aggregate.SUM('listen_count')})

# sort() returns a new SFrame and a cell only displays its final expression, so
# the descending ranking must be printed explicitly or it is never shown.
print(pop.sort('total_count', ascending=False))

# Ascending ranking (smallest totals first) remains the cell's displayed result.
pop.sort('total_count', ascending=True)
Out[12]:
In [9]:
# Number of distinct users in the dataset.
len(users)
Out[9]:
In [13]:
# Random split with fraction 0.8 going to train_data and the rest to test_data;
# the fixed seed keeps the split repeatable across runs.
train_data,test_data = song_data.random_split(.8,seed=0)
In [11]:
# Popularity-based recommender fitted on the training split, using 'user_id'
# as the user column and 'song' as the item column.
popularity_model = graphlab.popularity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='song')
In [12]:
# Popularity-model recommendations for the first user in `users`.
popularity_model.recommend(users=[users[0]])
Out[12]:
In [13]:
# Popularity-model recommendations for the second user in `users`, for
# comparison with the first user's list.
popularity_model.recommend(users=[users[1]])
Out[13]:
In [14]:
# Item-similarity recommender fitted on the same training split and the same
# user/item columns as the popularity model.
personalized_model = graphlab.item_similarity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='song')
In [20]:
# Recommend songs for a sample of test users, then count how often each song
# appears as a user's rank-1 recommendation.

# Keep at most the first 10,000 distinct user ids from the test split.
subset_test_users = test_data['user_id'].unique()[0:10000]
rec_songs = personalized_model.recommend(users=subset_test_users)

# Total number of recommendation rows produced (printed once; rec_songs is not
# modified below, so repeating this print would only duplicate the output).
print(len(rec_songs))

# Keep only each user's rank-1 recommendation.
rec_1song = rec_songs[rec_songs['rank'] == 1]

# Count how many users received each song at rank 1, most frequent first.
res = rec_1song.groupby(key_columns='song',
                        operations={'count': graphlab.aggregate.COUNT()})
print(res.sort('count', ascending=False))
In [15]:
# Personalized-model recommendations for the first user in `users`.
personalized_model.recommend(users=[users[0]])
Out[15]:
In [16]:
# Personalized-model recommendations for the second user in `users`, for
# comparison with the first user's list.
personalized_model.recommend(users=[users[1]])
Out[16]:
In [17]:
# Items the personalized model rates as similar to this U2 song.
personalized_model.get_similar_items(['With Or Without You - U2'])
Out[17]:
In [18]:
# Items the personalized model rates as similar to this Buena Vista Social Club song.
personalized_model.get_similar_items(['Chan Chan (Live) - Buena Vista Social Club'])
Out[18]:
In [19]:
%matplotlib inline
model_performance = graphlab.recommender.util.compare_models(test_data,
[popularity_model,personalized_model],
user_sample=0.05)
The precision-recall curve shows that the personalized model performs much better than the popularity model.
In [ ]: