In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from reco.datasets import loadMovieLens100k
from reco.recommender import FM
In [3]:
# Load MovieLens 100k already split into train/test frames. The loader
# returns four values here; the last two are not used and are bound to `_`.
movielens_splits = loadMovieLens100k(train_test_split=True)
train, test, _, _ = movielens_splits

# Show the first rows of the training frame as plain text.
train_preview = train.head()
print(train_preview)
So we have the user ids, item ids and the respective ratings in the 3 columns. Next we need to separate the rating column since we are going to predict that. Also we need to explicitly set the column data type to string for userId and itemId so that the model treats them as categorical variables, not integers. We'll do this for both the train and test sets.
In [4]:
# Keep a handle on the target column of each split before it is removed
# from the feature frames.
y_train, y_test = train['rating'], test['rating']

# Apply the same preparation to both splits, in place:
#   1. remove the target from the features,
#   2. cast the id columns to strings (the notebook text states this is what
#      makes the model treat them as categorical rather than numeric).
for split in (train, test):
    split.drop(columns=['rating'], inplace=True)
    for id_col in ('userId', 'itemId'):
        split[id_col] = split[id_col].astype('str')
Next we'll train the model. We choose 60 iterations here. Tweak the hyperparameters to get the best performance.
In [5]:
# Hyperparameters for this run, gathered in one place so they are easy to tweak.
fm_params = {
    'k': 10,
    'iterations': 60,
    'learning_rate': 0.003,
    'regularizer': 0.005,
}

# Build the model and fit it on the training features and ratings.
f = FM(**fm_params)
f.fit(X=train, y=y_train)
In [7]:
# Predict ratings for the held-out rows and report the mean squared error
# against the true test ratings.
y_pred = f.predict(test)
test_mse = mean_squared_error(y_test, y_pred)
print("MSE: {}".format(test_mse))
In [8]:
# Reload the train/test split, this time asking the loader for all columns.
# The last two return values are not used and are bound to `_`.
movielens_splits = loadMovieLens100k(train_test_split=True, all_columns=True)
train, test, _, _ = movielens_splits

# Show the first rows of the training frame as plain text.
train_preview = train.head()
print(train_preview)
This time, we also need to change the data type of the columns gender and occupation to string so that they are treated as categorical variables and hence one-hot encoded.
In [9]:
# Keep a handle on the target column of each split before it is removed
# from the feature frames.
y_train, y_test = train['rating'], test['rating']

# Apply the same preparation to both splits, in place:
#   1. remove the target from the features,
#   2. cast every categorical column to strings (the notebook text states
#      this is what causes them to be one-hot encoded by the model).
for split in (train, test):
    split.drop(columns=['rating'], inplace=True)
    for cat_col in ('userId', 'itemId', 'gender', 'occupation'):
        split[cat_col] = split[cat_col].astype('str')
Before training, we also need to normalize the age column, since its values are on a very different scale from the other columns and would otherwise hamper the performance of the model. We choose min-max normalization here.
In [11]:
# Min-max scale `age` using statistics taken from the TRAINING split only,
# and reuse those same statistics for the test split.
#
# Scaling the test split with its own min/max would map the same raw age to
# different feature values in train and test, and would let test-set
# statistics influence preprocessing. With the shared transform, test values
# may fall slightly outside [0, 1] if the test split contains ages outside
# the training range; that is expected.
#
# The statistics are captured before `train['age']` is overwritten.
age_min = train['age'].min()
age_range = train['age'].max() - age_min

train['age'] = (train['age'] - age_min) / age_range
test['age'] = (test['age'] - age_min) / age_range
In [17]:
# Same hyperparameters as the ratings-only run, so the two results are
# comparable.
fm_params = {
    'k': 10,
    'iterations': 60,
    'learning_rate': 0.003,
    'regularizer': 0.005,
}

# Build a fresh model and fit it on the training features and ratings.
f = FM(**fm_params)
f.fit(X=train, y=y_train)
In [18]:
# Predict ratings for the held-out rows and report the mean squared error
# against the true test ratings.
y_pred = f.predict(test)
test_mse = mean_squared_error(y_test, y_pred)
print("MSE: {}".format(test_mse))
In [ ]: