Offline:
Online:
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
%matplotlib inline
In [3]:
# Load the smaller user-behavior dataset (one pickled pandas object).
# Later cells read its `user_id`, `buy_spu`, `buy_sn`, `view_spu` and
# `view_sn` columns.
# NOTE(review): the file name suggests a "20 mins / 5 views" filter was
# applied upstream -- confirm against the script that produced it.
# NOTE: `pd.read_pickle` can execute arbitrary code; only load trusted files.
user_profile = pd.read_pickle('../data_user_view_buy/user_profile_items_nonnull_features_20_mins_5_views.pkl')
In [4]:
# Preview the first rows to sanity-check the loaded columns.
user_profile.head()
Out[4]:
In [5]:
# Load the item feature table. Later cells look items up through its
# `spu_id` column and read one feature vector per item from `features`.
# NOTE: `pd.read_pickle` can execute arbitrary code; only load trusted files.
spu_fea = pd.read_pickle("../data_nn_features/spu_fea.pkl") # slow: the original author notes this "takes forever to load"
In [6]:
# Number of rows in the user-behavior table.
len(user_profile)
Out[6]:
In [7]:
# Distinct user ids present in the dataset, followed by how many there are.
users = user_profile['user_id'].unique()
len(users)
Out[7]:
In [8]:
# Count of distinct `buy_sn` values (used later as the bought item's category).
len(user_profile['buy_sn'].unique())
Out[8]:
In [11]:
# Sample up to 100 *distinct* users.
# - `replace=False`: np.random.choice samples with replacement by default,
#   which can pick the same user several times and double-count them in the
#   per-user loops and plots further down.
# - A dedicated, seeded RandomState makes the sample reproducible on
#   Restart & Run All without touching numpy's global random state.
# - `min(...)` keeps the cell working on datasets with fewer than 100 users.
sample_rng = np.random.RandomState(0)
users_sample = sample_rng.choice(users, size=min(100, len(users)), replace=False)

# Keep only the rows that belong to the sampled users.
user_profile_sample = user_profile.loc[user_profile.user_id.isin(users_sample)]
len(user_profile_sample)
Out[11]:
In [16]:
# Show the sampled user ids.
users_sample
Out[16]:
In [12]:
# For every sampled user, record the item they bought and the mean feature
# vector of the other items they viewed.
# TODO (from original author): make a function for each user??
user_buy_dict = {}                 # user_id -> bought spu
average_viewed_features_dict = {}  # user_id -> mean feature vector of viewed items

# Index the feature vectors by spu_id once, so each lookup below is a hash
# probe instead of a full scan of `spu_fea`. When an spu_id occurs more than
# once, the first row is kept (the same row a positional `[0]` would pick).
spu_features = pd.Series(spu_fea['features'].values, index=spu_fea['spu_id'].values)
spu_features = spu_features[~spu_features.index.duplicated(keep='first')]

# Feature dimensionality, taken from the first item; loop-invariant.
n_features = len(spu_features.iloc[0])

for user_id in users_sample:
    # All rows (views) of this user's trajectory.
    trajectory = user_profile_sample.loc[user_profile_sample.user_id == user_id]

    # The bought item is read from the first row of the trajectory.
    buy_spu = trajectory.buy_spu.iloc[0]
    user_buy_dict[user_id] = buy_spu

    # Drop views of the bought item itself so it cannot leak into the average.
    viewed = trajectory.loc[trajectory.view_spu != buy_spu]

    # One column per remaining view.
    features_items = np.empty((n_features, len(viewed)))
    for vi, view_spu in enumerate(viewed.view_spu):
        if view_spu not in spu_features.index:
            # This should not happen for this dataset.
            raise ValueError('all items should have features')
        features_items[:, vi] = spu_features.loc[view_spu]  # 1-D feature array

    # Mean over views. If the user has no remaining views this is all-NaN.
    average_viewed_features_dict[user_id] = np.mean(features_items, axis=1)
In [14]:
#average_viewed_features_dict
In [15]:
def dot(K, L):
    """Return the dot product of two 1-D sequences of equal length.

    Sequences of different lengths are treated as incomparable and give 0
    instead of raising, so `similarity` scores such a pair as 0.
    """
    if len(K) != len(L):
        return 0
    # Vectorised; much faster than a Python-level sum on long feature vectors.
    return np.dot(K, L)


def similarity(item_1, item_2):
    """Return the cosine similarity of two feature vectors.

    Returns 0.0 when the denominator is zero (an all-zero vector) rather than
    dividing 0 by 0, which would yield NaN and a RuntimeWarning and would
    leave the item unranked downstream.
    """
    denominator = np.sqrt(dot(item_1, item_1) * dot(item_2, item_2))
    if denominator == 0:
        return 0.0
    return dot(item_1, item_2) / denominator
In [17]:
# For each sampled user, rank the item they bought among all items of the
# same category by similarity to the user's average viewed features.
user_buy_ranks = np.empty(len(users_sample))  # rank of the bought item, per user
no_ranks = np.empty(len(users_sample))        # highest rank in the category, per user

# Index the feature vectors by spu_id once, so each lookup below is a hash
# probe instead of a full scan of `spu_fea`. When an spu_id occurs more than
# once, the first row is kept (the same row a positional `[0]` would pick).
spu_features = pd.Series(spu_fea['features'].values, index=spu_fea['spu_id'].values)
spu_features = spu_features[~spu_features.index.duplicated(keep='first')]

for ui, user_id in enumerate(users_sample):
    print(ui)  # progress indicator

    # Average feature vector of the items this user viewed.
    average_features = average_viewed_features_dict[user_id]

    # Item the user bought.
    buy_spu = user_buy_dict[user_id]

    # Category of the bought item, taken from the first matching row.
    # TODO (from original author): should assert they are all the same.
    buy_sn = user_profile_sample.loc[user_profile_sample['buy_spu'] == buy_spu, 'buy_sn'].iloc[0]

    # Every item seen in that category, whether bought or viewed.
    spus_in_category_b = user_profile.loc[user_profile.buy_sn == buy_sn, 'buy_spu'].unique()
    spus_in_category_v = user_profile.loc[user_profile.view_sn == buy_sn, 'view_spu'].unique()

    # An item can be both bought and viewed. De-duplicate (order-preserving)
    # so it is ranked once and the category size is not inflated.
    spus_in_category = list(dict.fromkeys(list(spus_in_category_b) + list(spus_in_category_v)))

    # Make sure the bought item is among the candidates.
    assert buy_spu in spus_in_category

    # Open question (from original author): does it make sense to
    # pre-calculate this matrix of similarities?
    # Similarity between the user's average features and every candidate,
    # built in one pass instead of one masked assignment per item.
    similarities = []
    for spu in spus_in_category:
        if spu not in spu_features.index:
            raise ValueError('all items should have features')
        similarities.append(similarity(average_features, spu_features.loc[spu]))

    item_sim_in_category = pd.DataFrame(
        {'spu': spus_in_category, 'similarity': similarities},
        columns=['spu', 'similarity'],
    )

    # Default ranking is ascending, so the most similar item gets the highest
    # rank; a bought-item ratio close to 1 means "most similar in category".
    item_sim_in_category['rank'] = item_sim_in_category['similarity'].rank()
    user_buy_ranks[ui] = item_sim_in_category.loc[item_sim_in_category.spu == buy_spu, 'rank'].iloc[0]
    no_ranks[ui] = item_sim_in_category['rank'].max()
In [18]:
# Candidate table left over from the last loop iteration (i.e. the last
# sampled user), ordered from lowest to highest rank.
item_sim_in_category.sort_values('rank')
Out[18]:
In [19]:
# Rank of the bought item for the last processed user; `ui` is the loop
# index left over from the ranking loop.
user_buy_ranks[ui]
Out[19]:
In [20]:
# Inspect the rank column of the last user's candidate table. A notebook cell
# only displays its final expression, so both values are returned together as
# a tuple instead of silently discarding the maximum.
item_sim_in_category['rank'].max(), item_sim_in_category['rank'].unique()
Out[20]:
In [33]:
# Plot, per sampled user, the bought item's rank divided by the highest rank
# in its category. Ranks are ascending in similarity, so values near 1 mean
# the bought item was among the most similar to what the user viewed.
rank_ratio = user_buy_ranks / no_ranks  # computed once, used for points and mean

fig, ax = plt.subplots()
ax.scatter(np.arange(len(users_sample)), rank_ratio)
sns.despine(ax=ax)
ax.axhline(y=0.5, label='chance', c='k', linestyle='--')
ax.axhline(y=np.mean(rank_ratio), label='mean')
ax.legend()
ax.set_xlabel('user (chosen randomly)')
ax.set_ylabel('ratio: buy rank / items in buy category');  # ';' hides the Text repr
Out[33]:
In [35]:
%%bash
# Export this notebook as slides and as standalone HTML, then move each export
# into its versioned location. Each `mv` runs only if its `nbconvert` call
# succeeded (`&&`); the commands on separate lines run regardless of earlier
# failures.
jupyter nbconvert --to slides Recommendor_Method_Nathans.ipynb && mv Recommendor_Method_Nathans.slides.html ../notebook_slides/Recommendor_Method_Nathans_v2.slides.html
jupyter nbconvert --to html Recommendor_Method_Nathans.ipynb && mv Recommendor_Method_Nathans.html ../notebook_htmls/Recommendor_Method_Nathans_v2.html
# Keep a versioned copy of the notebook itself.
cp Recommendor_Method_Nathans.ipynb ../notebook_versions/Recommendor_Method_Nathans_v2.ipynb