Twitter: https://twitter.com/fredayala
LinkedIn: https://linkedin.com/in/frederickayala
GitHub: https://github.com/frederickayala
A few notes:
This notebook walks through the RSC15 (RecSys Challenge 2015) example that ships with GRU4Rec. It assumes the notebook lives in GRU4Rec/examples/rsc15/, so the gru4rec and evaluation modules two directories up can be imported.
Installation steps:
Install Theano (configured for the GPU), numpy, pandas, matplotlib and seaborn, clone the GRU4Rec repository, and prepare the RecSys Challenge 2015 click data as tab-separated rsc15_train_full.txt and rsc15_test.txt files with SessionId, ItemId and Time columns.
In [1]:
# -*- coding: utf-8 -*-
import theano
import pickle
import sys
import os
sys.path.append('../..')
import numpy as np
import pandas as pd
import gru4rec  # If this import fails, the notebook is probably not located in GRU4Rec/examples/rsc15/
import evaluation
In [2]:
# Validate that the following assert makes sense on your platform.
# It works on Windows with an NVIDIA GPU.
# On other platforms, theano.config.device may report something other than 'cuda' even when the GPU is in use.
assert 'cuda' in theano.config.device, ("Theano is not configured to use the GPU. Please check .theanorc. "
                                        "Check http://deeplearning.net/software/theano/tutorial/using_gpu.html")
In [3]:
PATH_TO_TRAIN = 'C:/Users/frede/datasets/recsys2015/rsc15_train_full.txt'
PATH_TO_TEST = 'C:/Users/frede/datasets/recsys2015/rsc15_test.txt'
data = pd.read_csv(PATH_TO_TRAIN, sep='\t', dtype={'ItemId':np.int64})
valid = pd.read_csv(PATH_TO_TEST, sep='\t', dtype={'ItemId':np.int64})
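As a quick sanity check, both files should expose the three columns the rest of the notebook relies on; a minimal sketch:

for name, df in (("training", data), ("testing", valid)):
    missing = {"SessionId", "ItemId", "Time"} - set(df.columns)
    assert not missing, "%s set is missing columns: %s" % (name, missing)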
In [4]:
%matplotlib inline
import numpy as np
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
In [5]:
data.head()
Out[5]:
In [6]:
valid.head()
Out[6]:
In [7]:
sessions_training = set(data.SessionId)
print("There are %i sessions in the training dataset" % len(sessions_training))
sessions_testing = set(valid.SessionId)
print("There are %i sessions in the testing dataset" % len(sessions_testing))
assert len(sessions_testing.intersection(sessions_training)) == 0, ("Huhu! "
                                                                    "there are sessions from the testing set in "
                                                                    "the training set")
print("Sessions in the testing set do not appear in the training set")
In [8]:
items_training = set(data.ItemId)
print("There are %i items in the training dataset" % len(items_training))
items_testing = set(valid.ItemId)
print("There are %i items in the testing dataset" % len(items_testing))
assert items_testing.issubset(items_training), ("Huhu! "
                                                "there are items from the testing set "
                                                "that are not in the training set")
print("All items in the testing set exist in the training set")
In [9]:
df_visualization = data.copy()
df_visualization["value"] = 1
# Count the number of clicks per item
df_item_count = df_visualization[["ItemId","value"]].groupby("ItemId").sum()
In [10]:
# Most of the items are infrequent
df_item_count.describe().transpose()
Out[10]:
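To put a number on the long tail, one can measure the share of items at or below an arbitrary frequency threshold (10 here, purely for illustration):

rare_share = (df_item_count["value"] <= 10).mean()
print("%.1f%% of items occur 10 times or fewer" % (100 * rare_share))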
In [11]:
fig = plt.figure(figsize=[15,8])
ax = fig.add_subplot(111)
ax = sns.kdeplot(df_item_count["value"], ax=ax)
ax.set(xlabel='Item Frequency', ylabel='Kernel Density Estimation')
plt.show()
fig = plt.figure(figsize=[15,8])
ax = fig.add_subplot(111)
ax = sns.distplot(df_item_count["value"],
                  hist_kws=dict(cumulative=True),
                  kde_kws=dict(cumulative=True))
ax.set(xlabel='Item Frequency', ylabel='Cumulative Probability')
plt.show()
In [12]:
# Let's analyze the co-occurrence: pair each click with the next click in the same session
df_cooccurrence = data.copy()
df_cooccurrence["next_SessionId"] = df_cooccurrence["SessionId"].shift(-1)
df_cooccurrence["next_ItemId"] = df_cooccurrence["ItemId"].shift(-1)
df_cooccurrence["next_Time"] = df_cooccurrence["Time"].shift(-1)
# Keep only rows where the next event belongs to the same session
df_cooccurrence = df_cooccurrence.query("SessionId == next_SessionId").dropna()
df_cooccurrence["next_ItemId"] = df_cooccurrence["next_ItemId"].astype(int)
df_cooccurrence["next_SessionId"] = df_cooccurrence["next_SessionId"].astype(int)
In [13]:
df_cooccurrence.head()
Out[13]:
In [14]:
df_cooccurrence["time_difference_minutes"] = np.round((df_cooccurrence["next_Time"] - df_cooccurrence["Time"]) / 60, 2)
df_cooccurrence[["time_difference_minutes"]].describe().transpose()
Out[14]:
In [15]:
df_cooccurrence["value"] = 1
df_cooccurrence_sum = df_cooccurrence[["ItemId","next_ItemId","value"]].groupby(["ItemId","next_ItemId"]).sum().reset_index()
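Sorting the pair counts shows which item-to-item transitions dominate the click streams, for example:

df_cooccurrence_sum.sort_values("value", ascending=False).head(10)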
In [16]:
df_cooccurrence_sum[["value"]].describe().transpose()
Out[16]:
In [17]:
n_layers = 100  # number of hidden units in the single GRU layer
save_to = os.path.join(os.path.dirname(PATH_TO_TEST), "gru_" + str(n_layers) + ".pickle")
In [18]:
if not os.path.exists(save_to):
    print('Training GRU4Rec with ' + str(n_layers) + ' hidden units')
    gru = gru4rec.GRU4Rec(layers=[n_layers], loss='top1', batch_size=50,
                          dropout_p_hidden=0.5, learning_rate=0.01, momentum=0.0)
    gru.fit(data)
    pickle.dump(gru, open(save_to, "wb"))
else:
    print('Loading existing GRU4Rec model with ' + str(n_layers) + ' hidden units')
    gru = pickle.load(open(save_to, "rb"))
In [19]:
res = evaluation.evaluate_sessions_batch(gru, valid, None, cut_off=20)
In [20]:
print('The proportion of cases having the desired item within the top 20 (i.e. Recall@20): {}'.format(res[0]))
In [21]:
batch_size = 500
print("Now let's try to predict over the first %i items of our testint dataset" % batch_size)
In [22]:
df_valid = valid.head(batch_size).copy()  # copy to avoid pandas SettingWithCopyWarning
df_valid["next_ItemId"] = df_valid["ItemId"].shift(-1)
df_valid["next_SessionId"] = df_valid["SessionId"].shift(-1)
In [23]:
session_ids = valid.head(batch_size)["SessionId"].values
input_item_ids = valid.head(batch_size)["ItemId"].values
predict_for_item_ids = None
In [24]:
%timeit gru.predict_next_batch(session_ids=session_ids, input_item_ids=input_item_ids, predict_for_item_ids=None, batch=batch_size)
In [25]:
df_preds = gru.predict_next_batch(session_ids=session_ids,
                                  input_item_ids=input_item_ids,
                                  predict_for_item_ids=None,
                                  batch=batch_size)
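df_preds comes back with one row per item and one column per position in the batch, so the highest-scoring candidates for, say, the first test event can be inspected directly (assuming that layout, which the ranking step below also relies on):

df_preds.iloc[:, 0].nlargest(20)  # 20 highest-scoring items for the first event in the batch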
In [26]:
df_valid.shape
Out[26]:
In [27]:
df_preds.shape
Out[27]:
In [28]:
df_preds.columns = df_valid.index.values  # align each score column with its row in df_valid
In [29]:
len(items_training)
Out[29]:
In [30]:
df_preds
Out[30]:
In [31]:
# Convert raw scores to ranks per event (rank 1 = highest score)
for c in df_preds:
    df_preds[c] = df_preds[c].rank(ascending=False)
In [32]:
df_valid_preds = df_valid.join(df_preds.transpose())
df_valid_preds = df_valid_preds.query("SessionId == next_SessionId").dropna()
df_valid_preds["next_ItemId"] = df_valid_preds["next_ItemId"].astype(int)
df_valid_preds["next_SessionId"] = df_valid_preds["next_SessionId"].astype(int)
# For each event, look up the rank the model assigned to the item that was actually clicked next
df_valid_preds["next_ItemId_at"] = df_valid_preds.apply(lambda x: x[int(x["next_ItemId"])], axis=1)
df_valid_preds_summary = df_valid_preds[["SessionId","ItemId","Time","next_ItemId","next_ItemId_at"]]
df_valid_preds_summary.head(20)
Out[32]:
In [33]:
cutoff = 20
df_valid_preds_summary_ok = df_valid_preds_summary.query("next_ItemId_at <= @cutoff")
df_valid_preds_summary_ok.head(20)
Out[33]:
In [34]:
recall_at_k = df_valid_preds_summary_ok.shape[0] / df_valid_preds_summary.shape[0]
print("The recall@%i for this batch is %f"%(cutoff,recall_at_k))
In [35]:
fig = plt.figure(figsize=[15,8])
ax = fig.add_subplot(111)
ax = sns.kdeplot(df_valid_preds_summary["next_ItemId_at"], ax=ax)
ax.set(xlabel='Next Desired Item @K', ylabel='Kernel Density Estimation')
plt.show()
fig = plt.figure(figsize=[15,8])
ax = fig.add_subplot(111)
ax = sns.distplot(df_valid_preds_summary["next_ItemId_at"],
                  hist_kws=dict(cumulative=True),
                  kde_kws=dict(cumulative=True))
ax.set(xlabel='Next Desired Item @K', ylabel='Cumulative Probability')
plt.show()
In [36]:
print("Statistics for the rank of the next desired item (Lower the best)")
df_valid_preds_summary[["next_ItemId_at"]].describe()
Out[36]: