In [1]:
# This notebook uses the MovieLens data set, from
## F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context.
## ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4, Article 19 (December 2015), 19 pages.
## DOI=http://dx.doi.org/10.1145/2827872
import pandas as pd
import numpy as np
In [2]:
# Paths of the MovieLens files used below, all built from the dataset folder prefix.
root_folder = 'ml-100k/'
rating_data, user_data, item_data, rating_train, rating_test = (
    root_folder + file_name
    for file_name in ('u.data', 'u.user', 'u.item', 'ua.base', 'ua.test')
)
In [13]:
# The raw file has no header row, so the column names (see the dataset's
# ReadMe) are supplied explicitly through `names`.
## A given userid or itemid can appear on many rows.
rating_data_cols = ['userid', 'itemid', 'rating', 'timestamp']
rating_data_df = pd.read_csv(rating_data, sep='\t', names=rating_data_cols, encoding='latin-1')
# print() call form: valid on both Python 2 and Python 3.
print(rating_data_df.shape)
rating_data_df.head()
Out[13]:
In [8]:
# Method 1 - DIY collaborative filtering
# Count the distinct user ids and item ids present in the ratings table.
user_ct = len(rating_data_df['userid'].unique())
item_ct = len(rating_data_df['itemid'].unique())
print(user_ct, item_ct)
In [9]:
# Peek at the first tuple produced by itertuples(). With the default
# index=True, position 0 is the row index, so position 1 is the first
# column (userid).
for line in rating_data_df.itertuples():
    # print() call form: valid on both Python 2 and Python 3.
    print(line)
    print(line[1])
    break
In [10]:
# Dense (user x item) rating matrix; cells that never receive a rating stay 0.
# NOTE(review): ids are used as 1-based positions, which assumes they run
# contiguously from 1 up to user_ct / item_ct -- confirm against the dataset.
data_matrix = np.zeros((user_ct, item_ct))
for row in rating_data_df.itertuples(index=False):
    data_matrix[row.userid - 1, row.itemid - 1] = row.rating
data_matrix
Out[10]:
In [11]:
# calculate similarity
from sklearn.metrics.pairwise import pairwise_distances

# pairwise_distances(metric='cosine') returns the cosine *distance*, i.e.
# 1 - cosine similarity. These matrices are used as similarity weights,
# so the distance is converted back to a similarity here.
# Rows of data_matrix are users; rows of data_matrix.T are items.
user_similarity = 1 - pairwise_distances(data_matrix, metric='cosine')
item_similarity = 1 - pairwise_distances(data_matrix.T, metric='cosine')
user_similarity
Out[11]:
In [12]:
# Print the shape of the rating matrix, then the shapes of the user-user
# and item-item similarity matrices.
print(data_matrix.shape)
print(user_similarity.shape, item_similarity.shape)
In [14]:
# user-user recommendation: score every item for every user.
# Prediction = the user's mean rating plus the similarity-weighted average of
# all users' deviations from their own mean.
# NOTE: the mean is taken over the full row of data_matrix.
rating_mean = data_matrix.mean(axis=1)
ratings_diff = data_matrix - rating_mean.reshape(-1, 1)
# One normaliser per user: the sum of absolute similarities in that user's row.
similarity_totals = np.abs(user_similarity).sum(axis=1).reshape(-1, 1)
weighted_diff = np.dot(user_similarity, ratings_diff) / similarity_totals
user_recommendation = rating_mean.reshape(-1, 1) + weighted_diff
print(user_recommendation.shape)
user_recommendation
Out[14]:
In [15]:
# item-item prediction: score every item for every user.
# Ratings are combined through item_similarity, then each item column is
# divided by the sum of absolute similarities in that item's row.
item_similarity_totals = np.abs(item_similarity).sum(axis=1).reshape(1, -1)
item_recommendation = np.dot(data_matrix, item_similarity) / item_similarity_totals
print(item_recommendation.shape)
item_recommendation
Out[15]:
In [16]:
# Method 2 - using turicreate collaborative filtering
import turicreate

# Load the 'ua' train/test rating files; they have no header row, so column
# names are supplied explicitly.
ua_cols = ['userid', 'movieid', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv(rating_train, sep='\t', names=ua_cols, encoding='latin-1')
ratings_test = pd.read_csv(rating_test, sep='\t', names=ua_cols, encoding='latin-1')
# Explicit formatting gives the same "shape shape" output on Python 2 and 3.
print('{} {}'.format(ratings_train.shape, ratings_test.shape))
In [18]:
# Wrap both pandas frames in turicreate SFrames and preview the training set.
train_data, test_data = (
    turicreate.SFrame(ratings_df) for ratings_df in (ratings_train, ratings_test)
)
train_data.head()
Out[18]:
In [20]:
## Turicreate - popularity recommender: suggests the most popular items
## (so in practice every user is offered the same list)
popularity_model = turicreate.popularity_recommender.create(
    train_data,
    user_id='userid',
    item_id='movieid',
    target='rating',
)
In [24]:
# Ask for the top 3 items for four users and print them all
# (up to 12 rows): every user gets the same recommendation.
popularity_recomm = popularity_model.recommend(
    users=[4, 10, 7, 9],
    k=3,
)
popularity_recomm.print_rows(num_rows=12)
In [26]:
## Turicreate - collaborative filtering
# Train an item-similarity recommender using cosine similarity.
item_sim_model = turicreate.item_similarity_recommender.create(
    train_data,
    user_id='userid',
    item_id='movieid',
    target='rating',
    similarity_type='cosine',
)
In [27]:
# Recommend the top 3 items for the same four users and print up to 12 rows.
item_sim_recomm = item_sim_model.recommend(
    users=[4, 10, 7, 9],
    k=3,
)
item_sim_recomm.print_rows(num_rows=12)
In [53]:
# movie data
# Column names for the header-less, '|'-separated item file: five descriptive
# fields followed by the genre columns.
# 'item_title' previously carried a stray trailing space ('item_title '),
# which made the column awkward to reference by name.
item_data_cols = [
    'item_id', 'item_title', 'release_date', 'video_release_date', 'IMDb_URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children',
    'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film_Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
item_data_df = pd.read_csv(item_data, sep='|', names=item_data_cols, encoding='latin-1')
# print() call form: valid on both Python 2 and Python 3.
print(item_data_df.shape)
item_data_df.head()
Out[53]:
In [54]:
# user profile
# The user file is '|'-separated with no header row, so column names are
# supplied explicitly.
user_data_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
user_data_df = pd.read_csv(user_data, sep='|', names=user_data_cols, encoding='latin-1')
# print() call form: valid on both Python 2 and Python 3.
print(user_data_df.shape)
user_data_df.head()
Out[54]:
In [55]:
# Reload the 'ua' train/test split, this time naming the key columns
# 'user_id' / 'item_id' so they match the key columns of the user and item
# tables. This replaces the earlier ratings_train / ratings_test /
# train_data / test_data.
ua_cols = ['user_id', 'item_id', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv(rating_train, sep='\t', names=ua_cols, encoding='latin-1')
ratings_test = pd.read_csv(rating_test, sep='\t', names=ua_cols, encoding='latin-1')
# Explicit formatting gives the same "shape shape" output on Python 2 and 3.
print('{} {}'.format(ratings_train.shape, ratings_test.shape))
train_data = turicreate.SFrame(ratings_train)
test_data = turicreate.SFrame(ratings_test)
train_data.head()
Out[55]:
In [56]:
## Turicreate - Factorization Recommender, predict missing ratings
### since a user won't rate all the items, this method is to predict those missing ratings
user_sf = turicreate.SFrame(user_data_df)
item_data_df = item_data_df.drop(['video_release_date', 'IMDb_URL', 'release_date'], axis=1)
item_sf = turicreate.SFrame(item_data_df)
item_sf
Out[56]:
In [57]:
# Train a factorization recommender on the ratings, passing the user and
# item tables as side data.
fac_rem_model = turicreate.factorization_recommender.create(
    train_data,
    target='rating',
    user_data=user_sf,
    item_data=item_sf,
)
In [46]:
# Display the training SFrame as the cell output.
# NOTE(review): this cell's execution count (46) is lower than the cells
# around it, so it was run out of order -- confirm the notebook still works
# with Restart & Run All.
train_data
Out[46]:
In [58]:
# Display the trained factorization model object as the cell output.
fac_rem_model
Out[58]:
In [59]:
# Run the model's precision/recall evaluation against the held-out test
# ratings; the result is shown as the cell output.
fac_rem_model.evaluate_precision_recall(test_data)
Out[59]: