In [3]:
import graphlab
graphlab.canvas.set_target("ipynb")
# set canvas to show sframes and sgraphs in ipython notebook
import matplotlib.pyplot as plt
%matplotlib inline
In [7]:
# download data from: http://files.grouplens.org/datasets/movielens/ml-1m.zip
In [5]:
data = graphlab.SFrame.read_csv('/Users/chengjun/bigdata/ml-1m/ratings.dat', delimiter='\n',
header=False)['X1'].apply(lambda x: x.split('::')).unpack()
for col in data.column_names():
data[col] = data[col].astype(int)
data.rename({'X.0': 'user_id', 'X.1': 'movie_id', 'X.2': 'rating', 'X.3': 'timestamp'})
data.save('ratings')
users = graphlab.SFrame.read_csv('/Users/chengjun/bigdata/ml-1m/users.dat', delimiter='\n',
header=False)['X1'].apply(lambda x: x.split('::')).unpack()
users.rename({'X.0': 'user_id', 'X.1': 'gender', 'X.2': 'age', 'X.3': 'occupation', 'X.4': 'zip-code'})
users['user_id'] = users['user_id'].astype(int)
users.save('users')
items = graphlab.SFrame.read_csv('/Users/chengjun/bigdata/ml-1m/movies.dat', delimiter='\n',
header=False)['X1'].apply(lambda x: x.split('::')).unpack()
items.rename({'X.0': 'movie_id', 'X.1': 'title', 'X.2': 'genre'})
items['movie_id'] = items['movie_id'].astype(int)
items.save('items')
PROGRESS: Finished parsing file /Users/chengjun/bigdata/ml-1m/ratings.dat
PROGRESS: Parsing completed. Parsed 100 lines in 0.419473 secs.
------------------------------------------------------
Inferred types from first line of file as
column_type_hints=[str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
PROGRESS: Finished parsing file /Users/chengjun/bigdata/ml-1m/ratings.dat
PROGRESS: Parsing completed. Parsed 1000209 lines in 0.516456 secs.
PROGRESS: Finished parsing file /Users/chengjun/bigdata/ml-1m/users.dat
PROGRESS: Parsing completed. Parsed 100 lines in 0.029414 secs.
------------------------------------------------------
Inferred types from first line of file as
column_type_hints=[str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
PROGRESS: Finished parsing file /Users/chengjun/bigdata/ml-1m/users.dat
PROGRESS: Parsing completed. Parsed 6040 lines in 0.013402 secs.
PROGRESS: Finished parsing file /Users/chengjun/bigdata/ml-1m/movies.dat
PROGRESS: Parsing completed. Parsed 100 lines in 0.025157 secs.
------------------------------------------------------
Inferred types from first line of file as
column_type_hints=[str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
PROGRESS: Finished parsing file /Users/chengjun/bigdata/ml-1m/movies.dat
PROGRESS: Parsing completed. Parsed 3883 lines in 0.011876 secs.
In [8]:
data.show()
In [9]:
items.head()
Out[9]:
movie_id
title
genre
1
Toy Story (1995)
Animation|Children's|Come
dy ...
2
Jumanji (1995)
Adventure|Children's|Fant
asy ...
3
Grumpier Old Men (1995)
Comedy|Romance
4
Waiting to Exhale (1995)
Comedy|Drama
5
Father of the Bride Part
II (1995) ...
Comedy
6
Heat (1995)
Action|Crime|Thriller
7
Sabrina (1995)
Comedy|Romance
8
Tom and Huck (1995)
Adventure|Children's
9
Sudden Death (1995)
Action
10
GoldenEye (1995)
Action|Adventure|Thriller
[10 rows x 3 columns]
In [10]:
data = data.join(items, on='movie_id')
In [11]:
data
Out[11]:
user_id
movie_id
rating
timestamp
title
genre
1
1193
5
978300760
One Flew Over the
Cuckoo's Nest (1975) ...
Drama
1
661
3
978302109
James and the Giant Peach
(1996) ...
Animation|Children's|Musi
cal ...
1
914
3
978301968
My Fair Lady (1964)
Musical|Romance
1
3408
4
978300275
Erin Brockovich (2000)
Drama
1
2355
5
978824291
Bug's Life, A (1998)
Animation|Children's|Come
dy ...
1
1197
3
978302268
Princess Bride, The
(1987) ...
Action|Adventure|Comedy|R
omance ...
1
1287
5
978302039
Ben-Hur (1959)
Action|Adventure|Drama
1
2804
5
978300719
Christmas Story, A (1983)
Comedy|Drama
1
594
4
978302268
Snow White and the Seven
Dwarfs (1937) ...
Animation|Children's|Musi
cal ...
1
919
4
978301368
Wizard of Oz, The (1939)
Adventure|Children's|Dram
a|Musical ...
[1000209 rows x 6 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
In [27]:
(train_set, test_set) = data.random_split(0.95, seed=1)
In [33]:
m = graphlab.recommender.create(train_set, 'user_id', 'movie_id', 'rating')
PROGRESS: Recsys training: model = ranking_factorization_recommender
PROGRESS: Preparing data set.
PROGRESS: Data has 949852 observations with 6040 users and 3701 items.
PROGRESS: Data prepared in: 1.38442s
PROGRESS: Training ranking_factorization_recommender for recommendations.
PROGRESS: +--------------------------------+--------------------------------------------------+----------+
PROGRESS: | Parameter | Description | Value |
PROGRESS: +--------------------------------+--------------------------------------------------+----------+
PROGRESS: | num_factors | Factor Dimension | 32 |
PROGRESS: | regularization | L2 Regularization on Factors | 1e-09 |
PROGRESS: | solver | Solver used for training | adagrad |
PROGRESS: | linear_regularization | L2 Regularization on Linear Coefficients | 1e-09 |
PROGRESS: | ranking_regularization | Rank-based Regularization Weight | 0.25 |
PROGRESS: | max_iterations | Maximum Number of Iterations | 25 |
PROGRESS: +--------------------------------+--------------------------------------------------+----------+
PROGRESS: Optimizing model using SGD; tuning step size.
PROGRESS: Using 118731 / 949852 points for tuning the step size.
PROGRESS: +---------+-------------------+------------------------------------------+
PROGRESS: | Attempt | Initial Step Size | Estimated Objective Value |
PROGRESS: +---------+-------------------+------------------------------------------+
PROGRESS: | 0 | 10 | Not Viable |
PROGRESS: | 1 | 2.5 | Not Viable |
PROGRESS: | 2 | 0.625 | Not Viable |
PROGRESS: | 3 | 0.15625 | 0.38911 |
PROGRESS: | 4 | 0.078125 | 0.54968 |
PROGRESS: | 5 | 0.0390625 | 0.658223 |
PROGRESS: | 6 | 0.0195312 | 1.23822 |
PROGRESS: +---------+-------------------+------------------------------------------+
PROGRESS: | Final | 0.15625 | 0.38911 |
PROGRESS: +---------+-------------------+------------------------------------------+
PROGRESS: Starting Optimization.
PROGRESS: +---------+--------------+-------------------+-----------------------+-------------+
PROGRESS: | Iter. | Elapsed Time | Approx. Objective | Approx. Training RMSE | Step Size |
PROGRESS: +---------+--------------+-------------------+-----------------------+-------------+
PROGRESS: | Initial | 307us | 2.44719 | 1.1172 | |
PROGRESS: +---------+--------------+-------------------+-----------------------+-------------+
PROGRESS: | 1 | 1.58s | DIVERGED | DIVERGED | 0.15625 |
PROGRESS: | RESET | 2.09s | 2.44725 | 1.11722 | |
PROGRESS: | 1 | 3.77s | DIVERGED | DIVERGED | 0.078125 |
PROGRESS: | RESET | 4.39s | 2.44716 | 1.11722 | |
PROGRESS: | 1 | 5.67s | 1.55168 | 1.0281 | 0.0390625 |
PROGRESS: | 2 | 6.96s | 1.14185 | 0.933767 | 0.0390625 |
PROGRESS: | 3 | 8.10s | 1.02045 | 0.90251 | 0.0390625 |
PROGRESS: | 4 | 9.36s | 0.95975 | 0.887199 | 0.0390625 |
PROGRESS: | 5 | 10.51s | 0.917016 | 0.875537 | 0.0390625 |
PROGRESS: | 6 | 11.64s | 0.88812 | 0.867364 | 0.0390625 |
PROGRESS: | 7 | 12.84s | 0.865568 | 0.860687 | 0.0390625 |
PROGRESS: | 8 | 13.92s | 0.846648 | 0.854981 | 0.0390625 |
PROGRESS: | 9 | 15.06s | 0.831916 | 0.850301 | 0.0390625 |
PROGRESS: | 10 | 16.12s | 0.817915 | 0.846041 | 0.0390625 |
PROGRESS: | 11 | 17.48s | 0.806827 | 0.84242 | 0.0390625 |
PROGRESS: | 12 | 18.59s | 0.796439 | 0.838696 | 0.0390625 |
PROGRESS: | 13 | 19.70s | 0.787774 | 0.83584 | 0.0390625 |
PROGRESS: | 14 | 20.94s | 0.779347 | 0.83306 | 0.0390625 |
PROGRESS: | 15 | 22.23s | 0.772255 | 0.830361 | 0.0390625 |
PROGRESS: | 16 | 23.43s | 0.765821 | 0.828197 | 0.0390625 |
PROGRESS: | 17 | 24.54s | 0.75912 | 0.825862 | 0.0390625 |
PROGRESS: | 18 | 25.60s | 0.753293 | 0.823827 | 0.0390625 |
PROGRESS: | 19 | 26.71s | 0.748413 | 0.821838 | 0.0390625 |
PROGRESS: | 20 | 27.74s | 0.743186 | 0.819837 | 0.0390625 |
PROGRESS: | 21 | 28.85s | 0.738489 | 0.8181 | 0.0390625 |
PROGRESS: | 22 | 29.92s | 0.734265 | 0.816403 | 0.0390625 |
PROGRESS: | 23 | 31.00s | 0.72996 | 0.814651 | 0.0390625 |
PROGRESS: | 24 | 32.30s | 0.725537 | 0.813209 | 0.0390625 |
PROGRESS: | 25 | 33.47s | 0.722203 | 0.811926 | 0.0390625 |
PROGRESS: +---------+--------------+-------------------+-----------------------+-------------+
PROGRESS: Optimization Complete: Maximum number of passes through the data reached.
PROGRESS: Computing final objective value and training RMSE.
PROGRESS: Final objective value: 0.708338
PROGRESS: Final training RMSE: 0.802685
In [29]:
m
Out[29]:
Class : ItemSimilarityRecommender
Schema
------
User ID : user_id
Item ID : movie_id
Target : None
Additional observation features : 0
Number of user side features : 0
Number of item side features : 0
Statistics
----------
Number of observations : 949852
Number of users : 6040
Number of items : 3701
Training summary
----------------
Training time : 0.7314
Settings
--------
only_top_k : 100
similarity_type : jaccard
threshold : 0.001
training_method : auto
In [38]:
m2 = graphlab.item_similarity_recommender.create(train_set, 'user_id', 'movie_id', 'rating',
similarity_type='pearson')
PROGRESS: Recsys training: model = item_similarity
PROGRESS: Warning: Ignoring columns timestamp, title, genre;
PROGRESS: To use these columns in scoring predictions, use a model that allows the use of additional features.
PROGRESS: Preparing data set.
PROGRESS: Data has 949852 observations with 6040 users and 3701 items.
PROGRESS: Data prepared in: 0.741166s
PROGRESS: Computing item similarity statistics:
PROGRESS: Computing most similar items for 3701 items:
PROGRESS: +-----------------+-----------------+
PROGRESS: | Number of items | Elapsed Time |
PROGRESS: +-----------------+-----------------+
PROGRESS: | 1000 | 0.502444 |
PROGRESS: | 2000 | 0.525984 |
PROGRESS: | 3000 | 0.547989 |
PROGRESS: +-----------------+-----------------+
PROGRESS: Finished training in 0.782624s
PROGRESS: Finished prediction in 0.688922s
In [39]:
m2
Out[39]:
Class : ItemSimilarityRecommender
Schema
------
User ID : user_id
Item ID : movie_id
Target : rating
Additional observation features : 0
Number of user side features : 0
Number of item side features : 0
Statistics
----------
Number of observations : 949852
Number of users : 6040
Number of items : 3701
Training summary
----------------
Training time : 0.7828
Settings
--------
only_top_k : 100
similarity_type : pearson
threshold : 0.001
training_method : auto
In [40]:
result = graphlab.recommender.util.compare_models(test_set, [m, m2],
user_sample=.1, skip_set=train_set)
compare_models: using 562 users to estimate model performance
PROGRESS: Evaluate model M0
Precision and recall summary statistics by cutoff
+--------+-----------------+------------------+
| cutoff | mean_precision | mean_recall |
+--------+-----------------+------------------+
| 2 | 0.0435943060498 | 0.00956472275563 |
| 4 | 0.0333629893238 | 0.0148154269344 |
| 6 | 0.0308422301305 | 0.0200992907447 |
| 8 | 0.0289145907473 | 0.0259425986711 |
| 10 | 0.0274021352313 | 0.0287214600249 |
| 12 | 0.0260972716489 | 0.0337773113572 |
| 14 | 0.0263091001525 | 0.0394111159869 |
| 16 | 0.0256895017794 | 0.0462196778187 |
| 18 | 0.0250098853302 | 0.050977761984 |
| 20 | 0.0248220640569 | 0.0552180941837 |
+--------+-----------------+------------------+
[10 rows x 3 columns]
Overall RMSE: 0.906418088677
Per User RMSE (best)
+---------+-------+-----------------+
| user_id | count | rmse |
+---------+-------+-----------------+
| 5909 | 1 | 0.0473437604915 |
+---------+-------+-----------------+
[1 rows x 3 columns]
Per User RMSE (worst)
+---------+-------+---------------+
| user_id | count | rmse |
+---------+-------+---------------+
| 2379 | 1 | 3.30603390451 |
+---------+-------+---------------+
[1 rows x 3 columns]
Per Item RMSE (best)
+----------+-------+-------------------+
| movie_id | count | rmse |
+----------+-------+-------------------+
| 3407 | 1 | 0.000624169056996 |
+----------+-------+-------------------+
[1 rows x 3 columns]
Per Item RMSE (worst)
+----------+-------+---------------+
| movie_id | count | rmse |
+----------+-------+---------------+
| 3747 | 1 | 3.91489813071 |
+----------+-------+---------------+
[1 rows x 3 columns]
PROGRESS: Evaluate model M1
Precision and recall summary statistics by cutoff
+--------+-------------------+-------------------+
| cutoff | mean_precision | mean_recall |
+--------+-------------------+-------------------+
| 2 | 0.000889679715302 | 0.000296559905101 |
| 4 | 0.000444839857651 | 0.000296559905101 |
| 6 | 0.000593119810202 | 0.000889679715302 |
| 8 | 0.000667259786477 | 0.00133451957295 |
| 10 | 0.000711743772242 | 0.00169039145907 |
| 12 | 0.000593119810202 | 0.00169039145907 |
| 14 | 0.000762582613116 | 0.00215747330961 |
| 16 | 0.000667259786477 | 0.00215747330961 |
| 18 | 0.000691973111902 | 0.00230575326216 |
| 20 | 0.000800711743772 | 0.00236830044214 |
+--------+-------------------+-------------------+
[10 rows x 3 columns]
PROGRESS: Finished prediction in 0.09301s
Overall RMSE: 0.869846693134
Per User RMSE (best)
+---------+-------+-----------------+
| user_id | count | rmse |
+---------+-------+-----------------+
| 3350 | 1 | 0.0357205929343 |
+---------+-------+-----------------+
[1 rows x 3 columns]
Per User RMSE (worst)
+---------+-------+---------------+
| user_id | count | rmse |
+---------+-------+---------------+
| 200 | 1 | 3.72375859435 |
+---------+-------+---------------+
[1 rows x 3 columns]
Per Item RMSE (best)
+----------+-------+------------------+
| movie_id | count | rmse |
+----------+-------+------------------+
| 2273 | 1 | 0.00162381395374 |
+----------+-------+------------------+
[1 rows x 3 columns]
Per Item RMSE (worst)
+----------+-------+---------------+
| movie_id | count | rmse |
+----------+-------+---------------+
| 627 | 1 | 4.12012186276 |
+----------+-------+---------------+
[1 rows x 3 columns]
In [41]:
m.get_similar_items([1287]) # movie_id is Ben-Hur
PROGRESS: Getting similar items completed in 0.002226
Out[41]:
movie_id
similar
distance
rank
1287
3087
1.65043520927
1
1287
54
1.64714694023
2
1287
3473
1.64681369066
3
1287
2690
1.64273333549
4
1287
2014
1.63784432411
5
1287
1950
1.63471919298
6
1287
585
1.63432335854
7
1287
1265
1.62446278334
8
1287
1919
1.62326407433
9
1287
566
1.61960405111
10
[10 rows x 4 columns]
In [42]:
m.get_similar_items([1287]).join(items, on={'similar': 'movie_id'}).sort('rank')
PROGRESS: Getting similar items completed in 0.001121
Out[42]:
movie_id
similar
distance
rank
title
genre
1287
3087
1.65043520927
1
Scrooged (1988)
Comedy
1287
54
1.64714694023
2
Big Green, The (1995)
Children's|Comedy
1287
3473
1.64681369066
3
Jonah Who Will Be 25 in
the Year 2000 (1976) ...
Comedy
1287
2690
1.64273333549
4
Ideal Husband, An (1999)
Comedy
1287
2014
1.63784432411
5
Freaky Friday (1977)
Children's|Comedy
1287
1950
1.63471919298
6
In the Heat of the Night
(1967) ...
Drama|Mystery
1287
585
1.63432335854
7
Brady Bunch Movie, The
(1995) ...
Comedy
1287
1265
1.62446278334
8
Groundhog Day (1993)
Comedy|Romance
1287
1919
1.62326407433
9
Madeline (1998)
Children's|Comedy
1287
566
1.61960405111
10
Naked in New York (1994)
Comedy|Romance
[10 rows x 6 columns]
In [43]:
recs = m.recommend()
PROGRESS: recommendations finished on 1000/6040 queries. users per second: 7602.42
PROGRESS: recommendations finished on 2000/6040 queries. users per second: 8142.83
PROGRESS: recommendations finished on 3000/6040 queries. users per second: 8330.83
PROGRESS: recommendations finished on 4000/6040 queries. users per second: 8446.43
PROGRESS: recommendations finished on 5000/6040 queries. users per second: 8504.6
PROGRESS: recommendations finished on 6000/6040 queries. users per second: 8163.45
In [44]:
recs
Out[44]:
user_id
movie_id
score
rank
1
356
4.04209059737
1
1
34
4.03408827148
2
1
480
4.00319579504
3
1
2081
3.94718419276
4
1
377
3.92856887243
5
1
2987
3.927424426
6
1
590
3.89587930105
7
1
1387
3.88266849778
8
1
741
3.87735148034
9
1
2006
3.87439003847
10
[60400 rows x 4 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
In [45]:
data[data['user_id'] == 4].join(items, on='movie_id')
Out[45]:
user_id
movie_id
rating
timestamp
title
genre
4
260
5
978294199
Star Wars: Episode IV - A
New Hope (1977) ...
Action|Adventure|Fantasy
|Sci-Fi ...
4
480
4
978294008
Jurassic Park (1993)
Action|Adventure|Sci-Fi
4
1036
4
978294282
Die Hard (1988)
Action|Thriller
4
1097
4
978293964
E.T. the Extra-
Terrestrial (1982) ...
Children's|Drama|Fantasy
|Sci-Fi ...
4
1196
2
978294199
Star Wars: Episode V -
The Empire Strikes Back ...
Action|Adventure|Drama
|Sci-Fi|War ...
4
1198
5
978294199
Raiders of the Lost Ark
(1981) ...
Action|Adventure
4
1201
5
978294230
Good, The Bad and The
Ugly, The (1966) ...
Action|Western
4
1210
3
978293924
Star Wars: Episode VI -
Return of the Jedi (1 ...
Action|Adventure|Romance
|Sci-Fi|War ...
4
1214
4
978294260
Alien (1979)
Action|Horror|Sci-
Fi|Thriller ...
4
1240
5
978294260
Terminator, The (1984)
Action|Sci-Fi|Thriller
title.1
genre.1
Star Wars: Episode IV - A
New Hope (1977) ...
Action|Adventure|Fantasy
|Sci-Fi ...
Jurassic Park (1993)
Action|Adventure|Sci-Fi
Die Hard (1988)
Action|Thriller
E.T. the Extra-
Terrestrial (1982) ...
Children's|Drama|Fantasy
|Sci-Fi ...
Star Wars: Episode V -
The Empire Strikes Back ...
Action|Adventure|Drama
|Sci-Fi|War ...
Raiders of the Lost Ark
(1981) ...
Action|Adventure
Good, The Bad and The
Ugly, The (1966) ...
Action|Western
Star Wars: Episode VI -
Return of the Jedi (1 ...
Action|Adventure|Romance
|Sci-Fi|War ...
Alien (1979)
Action|Horror|Sci-
Fi|Thriller ...
Terminator, The (1984)
Action|Sci-Fi|Thriller
[21 rows x 8 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
In [46]:
m.recommend(users=[4], k=20).join(items, on='movie_id')
Out[46]:
user_id
movie_id
score
rank
title
genre
4
34
4.14169768132
1
Babe (1995)
Children's|Comedy|Drama
4
317
3.88899993039
15
Santa Clause, The (1994)
Children's|Comedy|Fantasy
4
531
3.86152254678
18
Secret Garden, The (1993)
Children's|Drama
4
590
4.12568046785
2
Dances with Wolves (1990)
Adventure|Drama|Western
4
741
3.90859223045
12
Ghost in the Shell
(Kokaku kidotai) (1995) ...
Animation|Sci-Fi
4
969
3.92384021617
9
African Queen, The (1951)
Action|Adventure|Romance|
War ...
4
1012
3.91079562045
11
Old Yeller (1957)
Children's|Drama
4
1013
3.84667891897
19
Parent Trap, The (1961)
Children's|Drama
4
1017
3.86540279425
17
Swiss Family Robinson
(1960) ...
Adventure|Children's
4
1204
3.98675022162
5
Lawrence of Arabia (1962)
Adventure|War
[20 rows x 6 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
In [47]:
m.recommend?
In [48]:
recent_data = graphlab.SFrame()
recent_data['movie_id'] = [1291]
recent_data['user_id'] = 99999
In [51]:
m2.recommend(users=[99999], new_observation_data=recent_data).join(items, on='movie_id').sort('rank')
Out[51]:
user_id
movie_id
score
rank
title
genre
99999
1830
5.0
1
Follow the Bitch (1998)
Comedy
99999
572
5.0
2
Foreign Student (1994)
Drama
99999
3607
5.0
3
One Little Indian (1973)
Comedy|Drama|Western
99999
989
5.0
4
Schlafes Bruder (Brother
of Sleep) (1995) ...
Drama
99999
3172
5.0
5
Ulysses (Ulisse) (1954)
Adventure
99999
3233
5.0
6
Smashing Time (1967)
Comedy
99999
3382
5.0
7
Song of Freedom (1936)
Drama
99999
787
5.0
8
Gate of Heavenly Peace,
The (1995) ...
Documentary
99999
3656
5.0
9
Lured (1947)
Crime
99999
3280
5.0
10
Baby, The (1973)
Horror
[10 rows x 6 columns]
In [ ]:
m.save('my_model')
In [ ]:
m_again = graphlab.load_model('my_model')
In [ ]:
m_again
In [ ]:
Content source: Aggieyixin/cjc2016
Similar notebooks: