In [1]:
# The data is using movie lens here, from 
## F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. 
## ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4, Article 19 (December 2015), 19 pages. 
## DOI=http://dx.doi.org/10.1145/2827872

import pandas as pd
import numpy as np

In [2]:
root_folder = 'ml-100k/'
rating_data = root_folder + 'u.data'
user_data = root_folder + 'u.user'
item_data = root_folder + 'u.item'

rating_train = root_folder + 'ua.base'
rating_test = root_folder + 'ua.test'

In [13]:
# The original data do not have column names in the csv file, you need to check their ReadMe, and add column names
## the data here can contain duplicated userid or itemid
rating_data_cols = ['userid', 'itemid', 'rating', 'timestamp']
rating_data_df = pd.read_csv(rating_data, sep='\t', names = rating_data_cols, encoding='latin-1')
print rating_data_df.shape
rating_data_df.head()


(100000, 4)
Out[13]:
userid itemid rating timestamp
0 196 242 3 881250949
1 186 302 3 891717742
2 22 377 1 878887116
3 244 51 2 880606923
4 166 346 1 886397596

In [8]:
# Method 1 - DIY collaborative filtering

user_ct = rating_data_df['userid'].unique().shape[0]
item_ct = rating_data_df['itemid'].unique().shape[0]
print(user_ct, item_ct)


(943, 1682)

In [9]:
for line in rating_data_df.itertuples():
    print line
    print line[1]
    break


Pandas(Index=0, userid=196, itemid=242, rating=3, timestamp=881250949)
196

In [10]:
data_matrix = np.zeros((user_ct, item_ct))

for r in rating_data_df.itertuples():
    data_matrix[r[1]-1, r[2]-1] = r[3]
    
data_matrix


Out[10]:
array([[ 5.,  3.,  4., ...,  0.,  0.,  0.],
       [ 4.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 5.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  5.,  0., ...,  0.,  0.,  0.]])

In [11]:
# calculate similarity
from sklearn.metrics.pairwise import pairwise_distances 

user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

user_similarity


Out[11]:
array([[ 0.        ,  0.83306902,  0.95254046, ...,  0.85138306,
         0.82049212,  0.60182526],
       [ 0.83306902,  0.        ,  0.88940868, ...,  0.83851522,
         0.82773219,  0.89420212],
       [ 0.95254046,  0.88940868,  0.        , ...,  0.89875744,
         0.86658385,  0.97344413],
       ..., 
       [ 0.85138306,  0.83851522,  0.89875744, ...,  0.        ,
         0.8983582 ,  0.90488042],
       [ 0.82049212,  0.82773219,  0.86658385, ...,  0.8983582 ,
         0.        ,  0.81753534],
       [ 0.60182526,  0.89420212,  0.97344413, ...,  0.90488042,
         0.81753534,  0.        ]])

In [12]:
print(data_matrix.shape)
print(user_similarity.shape, item_similarity.shape)


(943, 1682)
((943, 943), (1682, 1682))

In [14]:
# user-user recommendation: predict the score that each item can be recommended to each user
rating_mean = data_matrix.mean(axis=1)
ratings_diff = data_matrix - rating_mean[:, np.newaxis]
user_recommendation = rating_mean[:, np.newaxis] \
            + user_similarity.dot(ratings_diff)/np.array([np.abs(user_similarity).sum(axis=1)]).T
print(user_recommendation.shape)
user_recommendation


(943, 1682)
Out[14]:
array([[ 2.06532606,  0.73430275,  0.62992381, ...,  0.39359041,
         0.39304874,  0.3927712 ],
       [ 1.76308836,  0.38404019,  0.19617889, ..., -0.08837789,
        -0.0869183 , -0.08671183],
       [ 1.79590398,  0.32904733,  0.15882885, ..., -0.13699223,
        -0.13496852, -0.13476488],
       ..., 
       [ 1.59151513,  0.27526889,  0.10219534, ..., -0.16735162,
        -0.16657451, -0.16641377],
       [ 1.81036267,  0.40479877,  0.27545013, ..., -0.00907358,
        -0.00846587, -0.00804858],
       [ 1.8384313 ,  0.47964837,  0.38496292, ...,  0.14686675,
         0.14629808,  0.14641455]])

In [15]:
# item-item prediction: predict the score that each item can be recommended to each user
item_recommendation = data_matrix.dot(item_similarity)/np.array([np.abs(item_similarity).sum(axis=1)])
print(item_recommendation.shape)
item_recommendation


(943, 1682)
Out[15]:
array([[ 0.44627765,  0.475473  ,  0.50593755, ...,  0.58815455,
         0.5731069 ,  0.56669645],
       [ 0.10854432,  0.13295661,  0.12558851, ...,  0.13445801,
         0.13657587,  0.13711081],
       [ 0.08568497,  0.09169006,  0.08764343, ...,  0.08465892,
         0.08976784,  0.09084451],
       ..., 
       [ 0.03230047,  0.0450241 ,  0.04292449, ...,  0.05302764,
         0.0519099 ,  0.05228033],
       [ 0.15777917,  0.17409459,  0.18900003, ...,  0.19979296,
         0.19739388,  0.20003117],
       [ 0.24767207,  0.24489212,  0.28263031, ...,  0.34410424,
         0.33051406,  0.33102478]])

In [16]:
# Method 2 - using turicreate collaborative filtering
import turicreate

ua_cols = ['userid', 'movieid', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv(rating_train, sep='\t', names=ua_cols, encoding='latin-1')
ratings_test = pd.read_csv(rating_test, sep='\t', names=ua_cols, encoding='latin-1')
print ratings_train.shape, ratings_test.shape


(90570, 4) (9430, 4)

In [18]:
train_data = turicreate.SFrame(ratings_train)
test_data = turicreate.SFrame(ratings_test)
train_data.head()


Out[18]:
userid movieid rating unix_timestamp
1 1 5 874965758
1 2 3 876893171
1 3 4 878542960
1 4 3 876893119
1 5 3 889751712
1 6 5 887431973
1 7 4 875071561
1 8 1 875072484
1 9 5 878543541
1 10 3 875693118
[10 rows x 4 columns]

In [20]:
## Turicreate - recommend most popular items (in fact this will recommend the same thing to everyone)
popularity_model = turicreate.popularity_recommender\
                   .create(train_data, user_id='userid', item_id='movieid', target='rating')


Recsys training: model = popularity
Warning: Ignoring columns unix_timestamp;
    To use these columns in scoring predictions, use a model that allows the use of additional features.
Preparing data set.
    Data has 90570 observations with 943 users and 1680 items.
    Data prepared in: 0.130378s
90570 observations to process; with 1680 unique items.

In [24]:
# see, everyone gets the same recommendation
popularity_recomm = popularity_model.recommend(users=[4,10,7,9],k=3)
popularity_recomm.print_rows(num_rows=12)


+--------+---------+-------+------+
| userid | movieid | score | rank |
+--------+---------+-------+------+
|   4    |   1189  |  5.0  |  1   |
|   4    |   1122  |  5.0  |  2   |
|   4    |   814   |  5.0  |  3   |
|   10   |   1189  |  5.0  |  1   |
|   10   |   1122  |  5.0  |  2   |
|   10   |   814   |  5.0  |  3   |
|   7    |   1189  |  5.0  |  1   |
|   7    |   1122  |  5.0  |  2   |
|   7    |   814   |  5.0  |  3   |
|   9    |   1189  |  5.0  |  1   |
|   9    |   1122  |  5.0  |  2   |
|   9    |   814   |  5.0  |  3   |
+--------+---------+-------+------+
[12 rows x 4 columns]


In [26]:
## Turicreate - collaborative filtering
#Training the model
item_sim_model = turicreate.item_similarity_recommender\
            .create(train_data, user_id='userid', item_id='movieid', target='rating', similarity_type='cosine')


Recsys training: model = item_similarity
Warning: Ignoring columns unix_timestamp;
    To use these columns in scoring predictions, use a model that allows the use of additional features.
Preparing data set.
    Data has 90570 observations with 943 users and 1680 items.
    Data prepared in: 0.12825s
Training model from provided data.
Gathering per-item and per-user statistics.
+--------------------------------+------------+
| Elapsed Time (Item Statistics) | % Complete |
+--------------------------------+------------+
| 7.317ms                        | 100        |
+--------------------------------+------------+
Setting up lookup tables.
Processing data in one pass using dense lookup tables.
+-------------------------------------+------------------+-----------------+
| Elapsed Time (Constructing Lookups) | Total % Complete | Items Processed |
+-------------------------------------+------------------+-----------------+
| 19.633ms                            | 0.25             | 6               |
| 167.108ms                           | 100              | 1680            |
+-------------------------------------+------------------+-----------------+
Finalizing lookup tables.
Generating candidate set for working with new users.
Finished training in 0.174317s

In [27]:
# Making recommendations
item_sim_recomm = item_sim_model.recommend(users=[4,10,7,9],k=3)
item_sim_recomm.print_rows(num_rows=12)


+--------+---------+----------------+------+
| userid | movieid |     score      | rank |
+--------+---------+----------------+------+
|   4    |    50   | 1.13114770821  |  1   |
|   4    |   288   | 1.04871511459  |  2   |
|   4    |    56   | 0.996869802475 |  3   |
|   10   |   204   | 1.30251427115  |  1   |
|   10   |   423   | 1.22470301081  |  2   |
|   10   |   172   | 1.20107427445  |  3   |
|   7    |    88   | 0.48335224936  |  1   |
|   7    |    95   | 0.470542855845 |  2   |
|   7    |   209   | 0.422433135012 |  3   |
|   9    |   172   | 1.51975569129  |  1   |
|   9    |   204   | 1.46488795678  |  2   |
|   9    |   174   |  1.4442871958  |  3   |
+--------+---------+----------------+------+
[12 rows x 4 columns]


In [53]:
# movie data
item_data_cols = ['item_id', 'item_title ', 'release_date', 'video_release_date', 
                  'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children', 
                  'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film_Noir', 'Horror',
                  'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
item_data_df = pd.read_csv(item_data, sep='|', names = item_data_cols, encoding='latin-1')
print item_data_df.shape
item_data_df.head()


(1682, 24)
Out[53]:
item_id item_title release_date video_release_date IMDb_URL unknown Action Adventure Animation Children ... Fantasy Film_Noir Horror Musical Mystery Romance Sci-Fi Thriller War Western
0 1 Toy Story (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Toy%20Story%2... 0 0 0 1 1 ... 0 0 0 0 0 0 0 0 0 0
1 2 GoldenEye (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?GoldenEye%20(... 0 1 1 0 0 ... 0 0 0 0 0 0 0 1 0 0
2 3 Four Rooms (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Four%20Rooms%... 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
3 4 Get Shorty (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Get%20Shorty%... 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 5 Copycat (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Copycat%20(1995) 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0

5 rows × 24 columns


In [54]:
# user profile
user_data_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
user_data_df = pd.read_csv(user_data, sep='|', names = user_data_cols, encoding='latin-1')
print user_data_df.shape
user_data_df.head()


(943, 5)
Out[54]:
user_id age gender occupation zip_code
0 1 24 M technician 85711
1 2 53 F other 94043
2 3 23 M writer 32067
3 4 24 M technician 43537
4 5 33 F other 15213

In [55]:
ua_cols = ['user_id', 'item_id', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv(rating_train, sep='\t', names=ua_cols, encoding='latin-1')
ratings_test = pd.read_csv(rating_test, sep='\t', names=ua_cols, encoding='latin-1')
print ratings_train.shape, ratings_test.shape

train_data = turicreate.SFrame(ratings_train)
test_data = turicreate.SFrame(ratings_test)
train_data.head()


(90570, 4) (9430, 4)
Out[55]:
user_id item_id rating unix_timestamp
1 1 5 874965758
1 2 3 876893171
1 3 4 878542960
1 4 3 876893119
1 5 3 889751712
1 6 5 887431973
1 7 4 875071561
1 8 1 875072484
1 9 5 878543541
1 10 3 875693118
[10 rows x 4 columns]

In [56]:
## Turicreate - Factorization Recommender, predict missing ratings
### since a user won't rate all the items, this method is to predict those missing ratings
user_sf = turicreate.SFrame(user_data_df)

item_data_df = item_data_df.drop(['video_release_date', 'IMDb_URL', 'release_date'], axis=1)
item_sf = turicreate.SFrame(item_data_df)

item_sf


Out[56]:
item_id item_title unknown Action Adventure Animation Children Comedy Crime Documentary
1 Toy Story (1995) 0 0 0 1 1 1 0 0
2 GoldenEye (1995) 0 1 1 0 0 0 0 0
3 Four Rooms (1995) 0 0 0 0 0 0 0 0
4 Get Shorty (1995) 0 1 0 0 0 1 0 0
5 Copycat (1995) 0 0 0 0 0 0 1 0
6 Shanghai Triad (Yao a yao
yao dao waipo qiao) ...
0 0 0 0 0 0 0 0
7 Twelve Monkeys (1995) 0 0 0 0 0 0 0 0
8 Babe (1995) 0 0 0 0 1 1 0 0
9 Dead Man Walking (1995) 0 0 0 0 0 0 0 0
10 Richard III (1995) 0 0 0 0 0 0 0 0
Drama Fantasy Film_Noir Horror Musical Mystery Romance Sci-Fi Thriller War Western
0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 1 0 0
0 0 0 0 0 0 0 0 1 0 0
1 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 1 0 0
1 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 1 0 0 0
1 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 1 0
[1682 rows x 21 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

In [57]:
fac_rem_model = turicreate.factorization_recommender.create(train_data, target='rating',
                                                user_data=user_sf,
                                                item_data=item_sf)


Recsys training: model = factorization_recommender
Preparing data set.
    Data has 90570 observations with 943 users and 1682 items.
    Data prepared in: 0.212253s
Training factorization_recommender for recommendations.
+--------------------------------+--------------------------------------------------+----------+
| Parameter                      | Description                                      | Value    |
+--------------------------------+--------------------------------------------------+----------+
| num_factors                    | Factor Dimension                                 | 8        |
| regularization                 | L2 Regularization on Factors                     | 1e-08    |
| solver                         | Solver used for training                         | adagrad  |
| linear_regularization          | L2 Regularization on Linear Coefficients         | 1e-10    |
| side_data_factorization        | Assign Factors for Side Data                     | True     |
| max_iterations                 | Maximum Number of Iterations                     | 50       |
+--------------------------------+--------------------------------------------------+----------+
  Optimizing model using SGD; tuning step size.
  Using 11321 / 90570 points for tuning the step size.
+---------+-------------------+------------------------------------------+
| Attempt | Initial Step Size | Estimated Objective Value                |
+---------+-------------------+------------------------------------------+
| 0       | 1.85185           | Not Viable                               |
| 1       | 0.462963          | Not Viable                               |
| 2       | 0.115741          | 0.256957                                 |
| 3       | 0.0578704         | 0.441567                                 |
| 4       | 0.0289352         | 0.668015                                 |
+---------+-------------------+------------------------------------------+
| Final   | 0.115741          | 0.256957                                 |
+---------+-------------------+------------------------------------------+
Starting Optimization.
+---------+--------------+-------------------+-----------------------+-------------+
| Iter.   | Elapsed Time | Approx. Objective | Approx. Training RMSE | Step Size   |
+---------+--------------+-------------------+-----------------------+-------------+
| Initial | 113us        | 1.26801           | 1.12606               |             |
+---------+--------------+-------------------+-----------------------+-------------+
| 1       | 220.043ms    | 1.02212           | 1.011                 | 0.115741    |
| 2       | 541.238ms    | 0.789752          | 0.888674              | 0.115741    |
| 3       | 870.654ms    | 0.726265          | 0.852206              | 0.115741    |
| 4       | 1.20s        | 0.690086          | 0.830707              | 0.115741    |
| 5       | 1.52s        | 0.667022          | 0.816707              | 0.115741    |
| 6       | 1.88s        | 0.649894          | 0.806152              | 0.115741    |
| 10      | 2.90s        | 0.612406          | 0.782555              | 0.115741    |
| 11      | 3.28s        | 0.607195          | 0.779218              | 0.115741    |
| 15      | 4.06s        | 0.590597          | 0.768493              | 0.115741    |
| 20      | 5.18s        | 0.577918          | 0.760198              | 0.115741    |
| 25      | 6.25s        | 0.569356          | 0.754546              | 0.115741    |
| 30      | 7.09s        | 0.563028          | 0.75034               | 0.115741    |
| 35      | 8.50s        | 0.557705          | 0.746784              | 0.115741    |
| 40      | 9.48s        | 0.553874          | 0.744215              | 0.115741    |
| 45      | 10.41s       | 0.550585          | 0.742001              | 0.115741    |
| 50      | 11.32s       | 0.547802          | 0.740124              | 0.115741    |
+---------+--------------+-------------------+-----------------------+-------------+
Optimization Complete: Maximum number of passes through the data reached.
Computing final objective value and training RMSE.
       Final objective value: 0.535105
       Final training RMSE: 0.731495

In [46]:
train_data


Out[46]:
userid movieid rating unix_timestamp
1 1 5 874965758
1 2 3 876893171
1 3 4 878542960
1 4 3 876893119
1 5 3 889751712
1 6 5 887431973
1 7 4 875071561
1 8 1 875072484
1 9 5 878543541
1 10 3 875693118
[90570 rows x 4 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

In [58]:
fac_rem_model


Out[58]:
Class                            : FactorizationRecommender

Schema
------
User ID                          : user_id
Item ID                          : item_id
Target                           : rating
Additional observation features  : 1
User side features               : ['user_id', 'age', 'gender', 'occupation', 'zip_code']
Item side features               : ['item_id', 'item_title ', 'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film_Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

Statistics
----------
Number of observations           : 90570
Number of users                  : 943
Number of items                  : 1682

Training summary
----------------
Training time                    : 12.2277

Model Parameters
----------------
Model class                      : FactorizationRecommender
num_factors                      : 8
binary_target                    : 0
side_data_factorization          : 1
solver                           : auto
nmf                              : 0
max_iterations                   : 50

Regularization Settings
-----------------------
regularization                   : 0.0
regularization_type              : normal
linear_regularization            : 0.0

Optimization Settings
---------------------
init_random_sigma                : 0.01
sgd_convergence_interval         : 4
sgd_convergence_threshold        : 0.0
sgd_max_trial_iterations         : 5
sgd_sampling_block_size          : 131072
sgd_step_adjustment_interval     : 4
sgd_step_size                    : 0.0
sgd_trial_sample_minimum_size    : 10000
sgd_trial_sample_proportion      : 0.125
step_size_decrease_rate          : 0.75
additional_iterations_if_unhealthy : 5
adagrad_momentum_weighting       : 0.9
num_tempering_iterations         : 4
tempering_regularization_start_value : 0.0
track_exact_loss                 : 0

In [59]:
fac_rem_model.evaluate_precision_recall(test_data)


Out[59]:
{'precision_recall_by_user': Columns:
 	user_id	int
 	cutoff	int
 	precision	float
 	recall	float
 	count	int
 
 Rows: 16974
 
 Data:
 +---------+--------+-----------+--------+-------+
 | user_id | cutoff | precision | recall | count |
 +---------+--------+-----------+--------+-------+
 |    1    |   1    |    0.0    |  0.0   |   10  |
 |    1    |   2    |    0.0    |  0.0   |   10  |
 |    1    |   3    |    0.0    |  0.0   |   10  |
 |    1    |   4    |    0.0    |  0.0   |   10  |
 |    1    |   5    |    0.0    |  0.0   |   10  |
 |    1    |   6    |    0.0    |  0.0   |   10  |
 |    1    |   7    |    0.0    |  0.0   |   10  |
 |    1    |   8    |    0.0    |  0.0   |   10  |
 |    1    |   9    |    0.0    |  0.0   |   10  |
 |    1    |   10   |    0.0    |  0.0   |   10  |
 +---------+--------+-----------+--------+-------+
 [16974 rows x 5 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.,
 'precision_recall_overall': Columns:
 	cutoff	int
 	precision	float
 	recall	float
 
 Rows: 18
 
 Data:
 +--------+-----------------+------------------+
 | cutoff |    precision    |      recall      |
 +--------+-----------------+------------------+
 |   1    | 0.0084835630965 | 0.00084835630965 |
 |   2    | 0.0084835630965 | 0.0016967126193  |
 |   3    | 0.0134323082361 | 0.00402969247084 |
 |   4    | 0.0172322375398 | 0.00689289501591 |
 |   5    | 0.0190880169671 | 0.00954400848356 |
 |   6    | 0.0190880169671 | 0.0114528101803  |
 |   7    | 0.0192395091653 | 0.0134676564157  |
 |   8    | 0.0182926829268 | 0.0146341463415  |
 |   9    | 0.0183810533758 | 0.0165429480382  |
 |   10   | 0.0185577942736 | 0.0185577942736  |
 +--------+-----------------+------------------+
 [18 rows x 3 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}