In [25]:
import graphlab as gl
gl.canvas.set_target("ipynb")
In [26]:
# Load the pre-saved SFrames from disk (paths relative to the notebook's
# working directory). Judging by the names: `implicit` holds implicit-feedback
# interactions, `explicit` holds explicit ratings, `items` holds item
# metadata, and `ratings` is only visualized below.
# NOTE(review): `explicit` is loaded but never used in the visible cells —
# confirm it is needed, otherwise drop the load.
implicit = gl.SFrame('implicit')
explicit = gl.SFrame('explicit')
items = gl.SFrame('items')
ratings = gl.SFrame('ratings')
In [5]:
# Visualize the ratings data in GraphLab Canvas (output target was set to
# "ipynb" in the first cell).
# NOTE(review): execution count In[5] is out of order relative to the
# surrounding In[26]/In[27] cells — restart the kernel and run top-to-bottom
# so the notebook is reproducible.
ratings.show()
Split the data into training and validation sets by holding out observations for a sample of users. This allows us to evaluate generalization ability on held-out data.
In [27]:
# Hold out a validation set: sample a subset of users and split off a portion
# of each sampled user's observations.
# NOTE(review): no explicit random seed is passed — confirm the split is
# reproducible across re-runs if results must be comparable between sessions.
train, valid = gl.recommender.util.random_split_by_user(implicit)
Compute the number of times each item has been rated.
In [28]:
num_ratings_per_item = train.groupby('item_id', {'num_users': gl.aggregate.COUNT})
items = items.join(num_ratings_per_item, on='item_id')
Transform the count into a categorical variable using the feature_engineering module.
In [29]:
binner = gl.feature_engineering.FeatureBinner(features=['num_users'], strategy='logarithmic', num_bins=5)
items = binner.fit_transform(items)
Convert each genre element into a dictionary and each year to an integer.
In [30]:
items['genres'] = items['genres'].apply(lambda x: {k:1 for k in x})
items['year'] = items['year'].astype(int)
In [31]:
# Inspect the transformed item metadata; the bare trailing expression renders
# the SFrame inline.
items
Out[31]:
In [32]:
# Baseline: item-item similarity recommender on the interactions alone,
# default options.
m0 = gl.item_similarity_recommender.create(train)
In [33]:
# Ranking factorization model on interactions only.
m1 = gl.ranking_factorization_recommender.create(train, max_iterations=10)
In [34]:
# Same model plus item side data: release year only.
m2 = gl.ranking_factorization_recommender.create(train,
item_data=items[['item_id', 'year']],
max_iterations=10)
In [35]:
# Same model plus item side data: release year and the genre indicator
# dictionaries built above.
m3 = gl.ranking_factorization_recommender.create(train,
item_data=items[['item_id', 'year', 'genres']],
max_iterations=10)
Create a nearest neighbor model that uses the genres in common and the year of the movie.
In [36]:
dist = [[['genres'], 'jaccard', 1.0],
[['year'], 'euclidean', 1.0]]
nn_model = gl.nearest_neighbors.create(items, 'item_id', distance=dist)
In [37]:
gl.nearest_neighbors.create?
Compute a nearest neighbor graph.
In [38]:
similar = nn_model.query(items, 'item_id', k=100)\
.rename({'query_label': 'item_id', 'reference_label': 'similar', 'distance': 'score'})\
.join(items[['item_id', 'title']], on='item_id')\
.join(items[['item_id', 'title']], on={'similar': 'item_id'})
similar['score'] = 1 - similar['score']
similar.print_rows(100, max_row_width=200)
Use this similarity data as the basis for a recommender.
In [39]:
# Item-similarity recommender seeded with the precomputed content-based
# neighbor table (`nearest_items=similar`) rather than similarities learned
# from the interaction data.
m5 = gl.item_similarity_recommender.create(train, nearest_items=similar)
Create a precision/recall plot to compare the recommendation quality of the above models given our heldout data.
In [40]:
# Evaluate all five models on the held-out data, sampling 30% of the
# validation users to keep the comparison fast.
model_comparison = gl.compare(valid, [m0, m1, m2, m3, m5], user_sample=.3)
In [24]:
# Render the precision/recall comparison plot for the five models.
# NOTE(review): execution count In[24] predates the In[40] cell above — the
# saved notebook was not run top-to-bottom; restart the kernel and Run All
# before sharing so every output reflects the code shown.
gl.show_comparison(model_comparison, [m0, m1, m2, m3, m5])
In [ ]: