In [2]:
import graphlab as gl
# set canvas to show sframes and sgraphs in ipython notebook
gl.canvas.set_target('ipynb')
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
import os

# Song-usage data: tab-separated, no header row, three columns per line
# (read_csv infers them as str, str, int).
# NOTE(review): the remote file is presumably the same data as the local copy
# (it was the commented-out alternative in the original cell) -- confirm.
LOCAL_TRAIN_FILE = '/Users/chengjun/bigdata/millionsong/song_usage_10000.txt'
REMOTE_TRAIN_FILE = 'http://s3.amazonaws.com/dato-datasets/millionsong/10000.txt'

# Use the local copy when it is present; otherwise fall back to the public URL
# so the notebook still runs on a machine without that directory.
if os.path.exists(LOCAL_TRAIN_FILE):
    train_file = LOCAL_TRAIN_FILE
else:
    train_file = REMOTE_TRAIN_FILE

sf = gl.SFrame.read_csv(train_file, header=False, delimiter='\t', verbose=False)

# read_csv names header-less columns X1..X3; give them meaningful names.
# rename() is called for its effect on `sf` itself: the cells below split `sf`
# and pass 'user_id' / 'music_id' / 'rating' to the recommenders.
sf.rename({'X1':'user_id', 'X2':'music_id', 'X3':'rating'})
sf.show()
------------------------------------------------------
Inferred types from first line of file as
column_type_hints=[str,str,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
PROGRESS: Read 844838 lines. Lines per second: 810295
PROGRESS: Finished parsing file /Users/chengjun/bigdata/millionsong/song_usage_10000.txt
PROGRESS: Parsing completed. Parsed 2000000 lines in 1.59616 secs.
In [4]:
# Random 80/20 row split of the usage data; the fixed seed makes the split
# repeatable across runs.
train_set, test_set = sf.random_split(fraction=0.8, seed=1)
In [5]:
# Baseline recommender trained on the training split only.
popularity_model = gl.popularity_recommender.create(
    train_set,
    user_id='user_id',
    item_id='music_id',
    target='rating')
PROGRESS: Recsys training: model = popularity
PROGRESS: Preparing data set.
PROGRESS: Data has 1599753 observations with 76085 users and 10000 items.
PROGRESS: Data prepared in: 1.23558s
PROGRESS: 1599753 observations to process; with 10000 unique items.
In [7]:
# Item-item similarity recommender using cosine similarity, trained on the
# same training split and columns as the popularity baseline.
item_sim_model = gl.item_similarity_recommender.create(
    train_set,
    user_id='user_id',
    item_id='music_id',
    target='rating',
    similarity_type='cosine')
PROGRESS: Recsys training: model = item_similarity
PROGRESS: Preparing data set.
PROGRESS: Data has 1599753 observations with 76085 users and 10000 items.
PROGRESS: Data prepared in: 1.34152s
PROGRESS: Computing item similarity statistics:
PROGRESS: Computing most similar items for 10000 items:
PROGRESS: +-----------------+-----------------+
PROGRESS: | Number of items | Elapsed Time |
PROGRESS: +-----------------+-----------------+
PROGRESS: | 1000 | 1.67234 |
PROGRESS: | 2000 | 1.70878 |
PROGRESS: | 3000 | 1.74289 |
PROGRESS: | 4000 | 1.77751 |
PROGRESS: | 5000 | 1.81794 |
PROGRESS: | 6000 | 1.85361 |
PROGRESS: | 7000 | 1.88976 |
PROGRESS: | 8000 | 1.92744 |
PROGRESS: | 9000 | 1.96709 |
PROGRESS: | 10000 | 2.08439 |
PROGRESS: +-----------------+-----------------+
PROGRESS: Finished training in 2.50669s
PROGRESS: Finished prediction in 0.734376s
In [8]:
# Factorization recommender with library-default hyperparameters (the training
# log reports 8 factors, SGD solver, 50 iterations).
# NOTE: despite the variable name, this is created via `factorization_recommender`.
factorization_machine_model = gl.recommender.factorization_recommender.create(
    train_set,
    user_id='user_id',
    item_id='music_id',
    target='rating')
PROGRESS: Recsys training: model = factorization_recommender
PROGRESS: Preparing data set.
PROGRESS: Data has 1599753 observations with 76085 users and 10000 items.
PROGRESS: Data prepared in: 1.31298s
PROGRESS: Training factorization_recommender for recommendations.
PROGRESS: +--------------------------------+--------------------------------------------------+----------+
PROGRESS: | Parameter | Description | Value |
PROGRESS: +--------------------------------+--------------------------------------------------+----------+
PROGRESS: | num_factors | Factor Dimension | 8 |
PROGRESS: | regularization | L2 Regularization on Factors | 1e-08 |
PROGRESS: | solver | Solver used for training | sgd |
PROGRESS: | linear_regularization | L2 Regularization on Linear Coefficients | 1e-10 |
PROGRESS: | max_iterations | Maximum Number of Iterations | 50 |
PROGRESS: +--------------------------------+--------------------------------------------------+----------+
PROGRESS: Optimizing model using SGD; tuning step size.
PROGRESS: Using 199969 / 1599753 points for tuning the step size.
PROGRESS: +---------+-------------------+------------------------------------------+
PROGRESS: | Attempt | Initial Step Size | Estimated Objective Value |
PROGRESS: +---------+-------------------+------------------------------------------+
PROGRESS: | 0 | 25 | No Decrease (234.956 >= 45.6461) |
PROGRESS: | 1 | 6.25 | No Decrease (222.818 >= 45.6461) |
PROGRESS: | 2 | 1.5625 | No Decrease (193.879 >= 45.6461) |
PROGRESS: | 3 | 0.390625 | No Decrease (93.6001 >= 45.6461) |
PROGRESS: | 4 | 0.0976562 | 18.1929 |
PROGRESS: | 5 | 0.0488281 | 12.7349 |
PROGRESS: | 6 | 0.0244141 | 27.6064 |
PROGRESS: +---------+-------------------+------------------------------------------+
PROGRESS: | Final | 0.0488281 | 12.7349 |
PROGRESS: +---------+-------------------+------------------------------------------+
PROGRESS: Starting Optimization.
PROGRESS: +---------+--------------+-------------------+-----------------------+-------------+
PROGRESS: | Iter. | Elapsed Time | Approx. Objective | Approx. Training RMSE | Step Size |
PROGRESS: +---------+--------------+-------------------+-----------------------+-------------+
PROGRESS: | Initial | 388us | 43.795 | 6.61778 | |
PROGRESS: +---------+--------------+-------------------+-----------------------+-------------+
PROGRESS: | 1 | 242.781ms | 43.525 | 6.59695 | 0.0488281 |
PROGRESS: | 2 | 369.391ms | 40.9211 | 6.3966 | 0.0290334 |
PROGRESS: | 3 | 491.657ms | 37.9834 | 6.1627 | 0.0214205 |
PROGRESS: | 4 | 603.858ms | 35.2255 | 5.93471 | 0.0172633 |
PROGRESS: | 5 | 743.824ms | 32.7566 | 5.7229 | 0.014603 |
PROGRESS: | 6 | 861.2ms | 30.8412 | 5.553 | 0.0127367 |
PROGRESS: | 10 | 1.39s | 24.7548 | 4.97477 | 0.008683 |
PROGRESS: | 11 | 1.53s | 23.5887 | 4.85613 | 0.00808399 |
PROGRESS: | 20 | 2.76s | 17.6337 | 4.19832 | 0.00516295 |
PROGRESS: | 30 | 3.96s | 14.4135 | 3.79539 | 0.00380916 |
PROGRESS: | 40 | 5.17s | 12.5212 | 3.53725 | 0.00306991 |
PROGRESS: | 50 | 6.39s | 9.83216 | 3.13412 | 0.00154408 |
PROGRESS: +---------+--------------+-------------------+-----------------------+-------------+
PROGRESS: Optimization Complete: Maximum number of passes through the data reached.
PROGRESS: Computing final objective value and training RMSE.
PROGRESS: Final objective value: 8.86198
PROGRESS: Final training RMSE: 2.97532
In [9]:
# Models are evaluated in list order and reported as M0, M1, M2 in the output:
# M0 = popularity, M1 = item similarity, M2 = factorization.
models_to_compare = [popularity_model, item_sim_model, factorization_machine_model]

# Evaluate on the held-out split using a 10% sample of users; the training
# rows are passed as `skip_set`.
result = gl.recommender.util.compare_models(
    test_set,
    models_to_compare,
    user_sample=.1,
    skip_set=train_set)
compare_models: using 6871 users to estimate model performance
PROGRESS: Evaluate model M0
PROGRESS: recommendations finished on 1000/6871 queries. users per second: 12410
PROGRESS: recommendations finished on 2000/6871 queries. users per second: 14958.4
PROGRESS: recommendations finished on 3000/6871 queries. users per second: 15825.3
PROGRESS: recommendations finished on 4000/6871 queries. users per second: 16808.7
PROGRESS: recommendations finished on 5000/6871 queries. users per second: 17280.5
PROGRESS: recommendations finished on 6000/6871 queries. users per second: 17228
Precision and recall summary statistics by cutoff
+--------+-------------------+-------------------+
| cutoff | mean_precision | mean_recall |
+--------+-------------------+-------------------+
| 2 | 0.000363848057051 | 0.000222530101733 |
| 4 | 0.000509387279872 | 0.000644168294629 |
| 6 | 0.000460874205598 | 0.000838220591723 |
| 8 | 0.000418425265609 | 0.000983759814544 |
| 10 | 0.000465725513026 | 0.00128720279373 |
| 12 | 0.000412361131325 | 0.00132237477257 |
| 14 | 0.000457408986007 | 0.00161781917277 |
| 16 | 0.000491194877019 | 0.00189451695711 |
| 18 | 0.000468959717977 | 0.00196078928179 |
| 20 | 0.000480279435308 | 0.00211815339109 |
+--------+-------------------+-------------------+
[10 rows x 3 columns]
Overall RMSE: 5.79840126177
Per User RMSE (best)
+-------------------------------+-------+-----------------+
| user_id | count | rmse |
+-------------------------------+-------+-----------------+
| 907f83008d1b7a7958766544a0... | 1 | 0.0160085378869 |
+-------------------------------+-------+-----------------+
[1 rows x 3 columns]
Per User RMSE (worst)
+-------------------------------+-------+---------------+
| user_id | count | rmse |
+-------------------------------+-------+---------------+
| 2c263b458bb317ee91c346ae90... | 4 | 172.795342779 |
+-------------------------------+-------+---------------+
[1 rows x 3 columns]
Per Item RMSE (best)
+--------------------+-------+------+
| music_id | count | rmse |
+--------------------+-------+------+
| SOZWCBD12AB01848DD | 1 | 0.0 |
+--------------------+-------+------+
[1 rows x 3 columns]
Per Item RMSE (worst)
+--------------------+-------+---------------+
| music_id | count | rmse |
+--------------------+-------+---------------+
| SOTGIKV12AB0182176 | 1 | 173.804878049 |
+--------------------+-------+---------------+
[1 rows x 3 columns]
PROGRESS: Evaluate model M1
PROGRESS: recommendations finished on 1000/6871 queries. users per second: 1291.91
PROGRESS: recommendations finished on 2000/6871 queries. users per second: 1303.24
PROGRESS: recommendations finished on 3000/6871 queries. users per second: 1303.16
PROGRESS: recommendations finished on 4000/6871 queries. users per second: 1309.34
PROGRESS: recommendations finished on 5000/6871 queries. users per second: 1309.57
PROGRESS: recommendations finished on 6000/6871 queries. users per second: 1318.84
Precision and recall summary statistics by cutoff
+--------+-------------------+-------------------+
| cutoff | mean_precision | mean_recall |
+--------+-------------------+-------------------+
| 2 | 0.000509387279872 | 0.000106894612147 |
| 4 | 0.000509387279872 | 0.00041709974353 |
| 6 | 0.000557900354145 | 0.000741928943341 |
| 8 | 0.000491194877019 | 0.000896131215139 |
| 10 | 0.000509387279872 | 0.0010508900222 |
| 12 | 0.000533643817009 | 0.00124220214402 |
| 14 | 0.000509387279872 | 0.00144984014027 |
| 16 | 0.000491194877019 | 0.00164649135206 |
| 18 | 0.000517472792251 | 0.00213570216873 |
| 20 | 0.000509387279872 | 0.00236702556808 |
+--------+-------------------+-------------------+
[10 rows x 3 columns]
PROGRESS: Finished prediction in 0.226961s
Overall RMSE: 6.09897586494
Per User RMSE (best)
+-------------------------------+-------+------+
| user_id | count | rmse |
+-------------------------------+-------+------+
| 91e5266cafbdd11964d70fb1d8... | 1 | 0.0 |
+-------------------------------+-------+------+
[1 rows x 3 columns]
Per User RMSE (worst)
+-------------------------------+-------+---------------+
| user_id | count | rmse |
+-------------------------------+-------+---------------+
| 2c263b458bb317ee91c346ae90... | 4 | 161.518485703 |
+-------------------------------+-------+---------------+
[1 rows x 3 columns]
Per Item RMSE (best)
+--------------------+-------+------+
| music_id | count | rmse |
+--------------------+-------+------+
| SOOIQZC12A6701FEA1 | 2 | 0.0 |
+--------------------+-------+------+
[1 rows x 3 columns]
Per Item RMSE (worst)
+--------------------+-------+-------+
| music_id | count | rmse |
+--------------------+-------+-------+
| SOTGIKV12AB0182176 | 1 | 172.0 |
+--------------------+-------+-------+
[1 rows x 3 columns]
PROGRESS: Evaluate model M2
PROGRESS: recommendations finished on 1000/6871 queries. users per second: 10178.8
PROGRESS: recommendations finished on 2000/6871 queries. users per second: 11326.8
PROGRESS: recommendations finished on 3000/6871 queries. users per second: 12072.2
PROGRESS: recommendations finished on 4000/6871 queries. users per second: 12724.1
PROGRESS: recommendations finished on 5000/6871 queries. users per second: 12371.5
PROGRESS: recommendations finished on 6000/6871 queries. users per second: 12328.1
Precision and recall summary statistics by cutoff
+--------+-------------------+-------------------+
| cutoff | mean_precision | mean_recall |
+--------+-------------------+-------------------+
| 2 | 0.000291078445641 | 0.000204967738806 |
| 4 | 0.000291078445641 | 0.000321852261162 |
| 6 | 0.000315334982778 | 0.000400252854408 |
| 8 | 0.000382040459904 | 0.000704556888155 |
| 10 | 0.000480279435308 | 0.00107200975662 |
| 12 | 0.000533643817009 | 0.00138531272247 |
| 14 | 0.000550969914964 | 0.0016547624225 |
| 16 | 0.000591253092708 | 0.00185731432805 |
| 18 | 0.000582156891282 | 0.00207921438986 |
| 20 | 0.000574879930141 | 0.00223034316254 |
+--------+-------------------+-------------------+
[10 rows x 3 columns]
Overall RMSE: 7.66449264849
Per User RMSE (best)
+-------------------------------+-------+-------------------+
| user_id | count | rmse |
+-------------------------------+-------+-------------------+
| ac810151e32857e9f4200e8fa7... | 1 | 0.000812624627255 |
+-------------------------------+-------+-------------------+
[1 rows x 3 columns]
Per User RMSE (worst)
+-------------------------------+-------+---------------+
| user_id | count | rmse |
+-------------------------------+-------+---------------+
| 2c263b458bb317ee91c346ae90... | 4 | 182.725743431 |
+-------------------------------+-------+---------------+
[1 rows x 3 columns]
Per Item RMSE (best)
+--------------------+-------+-------------------+
| music_id | count | rmse |
+--------------------+-------+-------------------+
| SOJWIJT12A8C136100 | 1 | 0.000881766015195 |
+--------------------+-------+-------------------+
[1 rows x 3 columns]
Per Item RMSE (worst)
+--------------------+-------+--------------+
| music_id | count | rmse |
+--------------------+-------+--------------+
| SOTGIKV12AB0182176 | 1 | 236.91250578 |
+--------------------+-------+--------------+
[1 rows x 3 columns]
/Users/chengjun/anaconda/lib/python2.7/site-packages/matplotlib/figure.py:387: UserWarning: matplotlib is currently using a non-GUI backend, so cannot show the figure
"matplotlib is currently using a non-GUI backend, "
In [10]:
# Number of recommendations to request per user.
K = 10

# Query set: the first 100 distinct user ids taken from the full data set
# (`sf`, i.e. before the train/test split).
unique_user_ids = sf['user_id'].unique()
users = gl.SArray(unique_user_ids.head(100))
In [11]:
# Top-K recommendations from the item-similarity model for the selected users;
# the result has one row per (user_id, music_id) with its score and rank.
recs = item_sim_model.recommend(k=K, users=users)
recs.head()
Out[11]:
user_id
music_id
score
rank
279292bb36dbfc7f505e36ebf
038c81eb1d1d63e ...
SOFCGSE12AF72A674F
20.686440678
1
279292bb36dbfc7f505e36ebf
038c81eb1d1d63e ...
SOQYHJW12AB0182AA6
20.0
2
279292bb36dbfc7f505e36ebf
038c81eb1d1d63e ...
SOELDGL12A8C135ED7
20.0
3
279292bb36dbfc7f505e36ebf
038c81eb1d1d63e ...
SOUWZPO12A6D4F83E3
20.0
4
279292bb36dbfc7f505e36ebf
038c81eb1d1d63e ...
SONAQRQ12AB017FD0B
20.0
5
279292bb36dbfc7f505e36ebf
038c81eb1d1d63e ...
SOMVIOV12A6D4F719A
20.0
6
279292bb36dbfc7f505e36ebf
038c81eb1d1d63e ...
SOHJWLZ12A6D4F7756
20.0
7
279292bb36dbfc7f505e36ebf
038c81eb1d1d63e ...
SOIRUXQ12A8C133060
20.0
8
279292bb36dbfc7f505e36ebf
038c81eb1d1d63e ...
SOLVHIW12A8C13BA03
20.0
9
279292bb36dbfc7f505e36ebf
038c81eb1d1d63e ...
SOPWZGK12A67020744
20.0
10
[10 rows x 4 columns]
In [ ]:
Content source: Aggieyixin/cjc2016
Similar notebooks: