In [1]:
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from ntflib import betantf
%matplotlib inline
sns.set(style="white")
In [2]:
def mapper(array):
    """Re-encode the values of `array` as contiguous integer codes 0..K-1.

    The smallest original value gets code 0, the largest gets K-1.

    Parameters
    ----------
    array : array-like of hashable values (e.g. a pandas Series or ndarray).

    Returns
    -------
    (codes, inv_dict_map) : codes is a numpy array aligned with the INPUT
        order; inv_dict_map maps each integer code back to its original value.
    """
    # np.unique already returns sorted unique values, so codes are assigned
    # in ascending value order.
    unique_vals = np.unique(array)
    int_codes = np.arange(len(unique_vals)).astype(int)
    dict_map = dict(zip(unique_vals, int_codes))
    # BUG FIX: the original did `array = np.sort(array)` before mapping, so
    # the returned codes were in sorted order, not input order — assigning
    # them back into a DataFrame column scrambled row alignment with the
    # other columns. Map in the original order instead.
    res = pd.Series(array).map(lambda x: dict_map[x])
    inv_dict_map = {v: k for k, v in dict_map.items()}
    return res.values, inv_dict_map
def rmse(x, y):
    """Root-mean-square error between two equal-length arrays.

    BUG FIX: the original returned np.sqrt((x - y)**2.0).sum(), i.e. the
    element-wise sqrt of squares summed — the total absolute error (L1),
    not RMSE. Compute sqrt(mean(squared error)) instead.
    """
    diff = np.asarray(x) - np.asarray(y)
    return np.sqrt(np.mean(diff ** 2.0))
In [2]:
# Download and unpack the MovieLens 1M dataset (shell commands via IPython "!").
# Provenance: GroupLens, http://files.grouplens.org/datasets/movielens/
!wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip ml-1m.zip
In [3]:
# Load the 1M ratings. '::' is a multi-character separator, which the C
# parser cannot handle — pass engine='python' explicitly instead of relying
# on the warning-and-fallback behavior.
ratings = pd.read_table('ml-1m/ratings.dat', sep='::', engine='python',
                        names=['UserID', 'MovieID', 'Rating', 'Timestamp'])
# Bucket raw unix timestamps into year-month strings.
# NOTE(review): fromtimestamp uses the LOCAL timezone, so month buckets can
# differ across machines — consider utcfromtimestamp for reproducibility.
ratings.Timestamp = ratings.Timestamp.map(lambda x: datetime.datetime.fromtimestamp(x).strftime('%Y-%m'))
# movies = pd.read_table('ml-1m/movies.dat', sep='::', names=['MovieID', 'Title', 'Genres'])
# users = pd.read_table('ml-1m/users.dat', sep='::', names=['UserID' ,'Gender', 'Age', 'Occupation::Zip-code'])
In [4]:
# Re-encode users, movies and year-month buckets as contiguous 0-based
# integer IDs; keep the inverse maps so factor indices can be translated
# back to raw IDs later.
ratings['UserID'], inv_uid_dict = mapper(ratings['UserID'])
ratings['MovieID'], inv_mid_dict = mapper(ratings['MovieID'])
ratings['Timestamp'], inv_ts_dict = mapper(ratings['Timestamp'])
In [5]:
# Build the (user, movie, month) index triples and the rating values for the
# sparse tensor. Shifting each column by its minimum guarantees 0-based
# indices (a no-op if mapper() already produced them).
index_cols = ['UserID', 'MovieID', 'Timestamp']
x_indices = ratings[index_cols].copy()
for col in index_cols:
    x_indices[col] = x_indices[col] - x_indices[col].min()
print(x_indices.min())  # sanity check: every column should start at 0
x_indices = x_indices.values
x_vals = ratings['Rating'].values
In [6]:
# Sanity check: after mapper() the movie IDs should be dense, so the max ID
# should equal (number of unique IDs - 1).
unique_movie_count = len(ratings['MovieID'].unique())
print('Number of unique movie IDs: {0}'.format(unique_movie_count))
print('Max movie ID: {0}'.format(ratings['MovieID'].max()))
In [7]:
# Hold out 40% of the observed ratings for evaluation; fixed seed for
# reproducibility.
indices_train, indices_test, val_train, val_test = train_test_split(
    x_indices, x_vals, test_size=0.40, random_state=42)
# Tensor dimensions = number of distinct values along each mode, computed
# over ALL observations (train + test) so held-out indices stay in range.
shape_uid, shape_mid, shape_ts = (
    len(np.unique(x_indices[:, mode])) for mode in range(3))
shape = [shape_uid, shape_mid, shape_ts]
shape
Out[7]:
In [8]:
# Peek at the training (user, movie, month) triples.
indices_train
Out[8]:
In [9]:
# shape = [len(np.unique(ratings[x])) for x in ['UserID', 'MovieID', 'Timestamp']]
# Rank-5 non-negative tensor factorization over (user, movie, month).
bnf = betantf.BetaNTF(shape, n_components=5, n_iters=10)
# Baseline score of the randomly-initialized factors, before fitting.
before = bnf.score(indices_train, val_train)
initial = bnf.impute(x_indices)
reconstructed = bnf.fit(indices_train, val_train)
# NOTE(review): score() is called with no arguments here, but with
# (indices, values) above — confirm the ntflib API caches the training data
# after fit(); otherwise this raises or scores the wrong thing.
after = bnf.score()
# Fitting should strictly improve (lower) the score.
assert(after < before)
In [ ]:
# Removed leftover `debug` post-mortem magic (drops into ipdb on the last
# traceback) — interactive-only, and it breaks Restart-&-Run-All.
In [83]:
# Impute predicted ratings for the held-out (user, movie, month) triples.
prediction = bnf.impute(indices_test)
In [85]:
# Average the error over the held-out examples.
rmse(prediction, val_test) / float(prediction.shape[0])
Out[85]:
In [11]:
# Dataset documentation and usage-license terms.
!cat ml-1m/README
In [ ]:
In [ ]: