Offline:
Online:
In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
%matplotlib inline
In [28]:
# Make the project root (one level up) importable, then show where we are running from.
import sys
import os

sys.path.append('../')
os.getcwd()
Out[28]:
In [39]:
import importlib

import src
import src.recommendation

# `reload` is not a builtin in Python 3 — use importlib.reload so edits to
# src/recommendation.py are picked up without restarting the kernel.
importlib.reload(src.recommendation)
# Star import kept intentionally: downstream cells call rank_candidates(),
# get_user_average_features(), etc. without a module prefix.
from src.recommendation import *
See the notes in `creating_dataset_for_evaluation.ipynb` for how this evaluation sample was constructed.
Data: a 1000-user sample drawn from the full dataset.
In [18]:
# Load the 1000-user sample of view/buy behavior (built in the
# dataset-creation notebook) and list the unique users it covers.
profile_path = '../data_user_view_buy/user_profile_items_nonnull_features_20_mins_5_views_v2_sample1000.pkl'
user_profile = pd.read_pickle(profile_path)
user_sample = user_profile.user_id.unique()
print(len(user_profile))
print(len(user_sample))
user_profile.head()
Out[18]:
In [19]:
# requires nn features
spu_fea = pd.read_pickle("../data_nn_features/spu_fea_sample1000.pkl")
In [20]:
# Sanity check: every viewed or bought item should have an NN feature vector.
# (The two counts below differ by one — one item is missing features.)
all_spus = set(user_profile.buy_spu.unique()) | set(user_profile.view_spu.unique())
print(len(all_spus))
print(len(spu_fea.spu_id.unique()))
In [10]:
# NOTE(review): disabled optimization — precompute each user's average
# viewed-item feature vector once up front, instead of recomputing it
# inside rank_candidates on every call. Left commented out; enable if
# the evaluation loop below becomes too slow.
# this might be faster #
# ## Precalculate average feature per user
# average_viewed_features_dict = {}
# for user_id in user_profile.user_id.unique():
# average_viewed_features_dict[user_id] = get_user_average_features(user_id,user_profile,spu_fea)
In [21]:
def get_user_buy_ranks(users_sample, user_profile, spu_fea, method, randomize_scores=False):
    """Rank candidate items for each user and record the rank of the bought item.

    Parameters
    ----------
    users_sample : array-like
        User ids to evaluate.
    user_profile : pandas.DataFrame
        User view/buy events (one row per event).
    spu_fea : pandas.DataFrame
        Item (spu) feature vectors.
    method : str
        Scoring method passed through to ``rank_candidates``.
    randomize_scores : bool, default False
        If True, scores are shuffled (random-guess baseline).

    Returns
    -------
    tuple of (user_buy_ranks, no_ranks, item_score_in_category)
        ``user_buy_ranks[i]`` — rank of the item user ``i`` actually bought;
        ``no_ranks[i]`` — number of ranked candidates in that user's category;
        ``item_score_in_category`` — the ranking DataFrame of the LAST user
        only (kept for inspection), or None if ``users_sample`` is empty.
    """
    user_buy_ranks = np.empty(len(users_sample))
    no_ranks = np.empty(len(users_sample))
    item_score_in_category = None  # stays None when users_sample is empty
    for ui, user_id in enumerate(users_sample):
        print(ui)  # progress indicator
        # rank all candidate items in the bought item's category
        item_score_in_category = rank_candidates(
            user_id, user_profile, spu_fea, method=method,
            extra_inputs={}, randomize_scores=randomize_scores)
        # rank of the item the user actually bought
        # (.to_numpy() replaces DataFrame.as_matrix(), removed in pandas 1.0)
        user_buy_ranks[ui] = item_score_in_category.loc[
            item_score_in_category.buy == 1, 'rank'].to_numpy()[0]
        # total number of ranked candidates for this user's category
        no_ranks[ui] = item_score_in_category['rank'].max()
    return (user_buy_ranks, no_ranks, item_score_in_category)
In [22]:
users_sample = np.random.choice(user_sample,size=50)
In [23]:
# Method 1 (Nathan's): score candidates by similarity to the user's
# average viewed-item feature vector.
user_buy_ranks1, no_ranks1, item_score_in_category = get_user_buy_ranks(
    users_sample, user_profile, spu_fea, method='AverageFeatureSim')
In [40]:
# Method 2: score candidates by similarity to the last item viewed only.
user_buy_ranks2, no_ranks2, _ = get_user_buy_ranks(
    users_sample, user_profile, spu_fea, method='LastItemSim')
In [43]:
# Method 3 (baseline): randomized scores — chance-level ranking.
user_buy_ranks3, no_ranks3, _ = get_user_buy_ranks(
    users_sample, user_profile, spu_fea, method='Randomize',
    randomize_scores=True)
In [44]:
# Normalize each bought item's rank by its category size, then stack:
# rows = methods (avg-feature, last-item, random), columns = users.
rank_percent = np.vstack((
    user_buy_ranks1 / no_ranks1,
    user_buy_ranks2 / no_ranks2,
    user_buy_ranks3 / no_ranks3,
))
print(rank_percent.shape)
In [47]:
# Compare the three recommenders: mean normalized buy-rank per method
# (lower is better), with standard-error bars; 0.5 is chance level.
mean = rank_percent.mean(axis=1)
m, n = rank_percent.shape  # m = number of methods, n = number of users
print(n)
print(m)
sem = rank_percent.std(axis=1) / np.sqrt(n)  # standard error of the mean

# Explicit figure/axes interface instead of the pyplot state machine.
fig, ax = plt.subplots()
ax.errorbar(np.arange(m), y=mean, yerr=sem, linestyle='None', marker='o')
ax.set_xticks(np.arange(m))
ax.set_xticklabels(['AvgFeatures', 'LastFeat', 'Random \n Guess'])
ax.set_xlim([-1, m + 1])
ax.set_ylim(0, 1)
sns.despine()
ax.set_title('Recommender Comparison')  # fixed typo: was "Recommendor"
ax.set_ylabel('Average (Buy Rank / # in Buy Category)')
ax.axhline(y=0.5, linestyle='--')
savefile = '../figures/recommender_comparison_sample_1000_subsample50_v1.png'
fig.savefig(savefile, dpi=300)
In [49]:
# Upload the saved comparison figure to S3 so results are shareable.
# (push_results_to_s3 takes the destination key and the local file path.)
from src import s3_data_management
s3_data_management.push_results_to_s3(os.path.basename(savefile),savefile)
In [ ]:
%%bash
# Export this notebook as versioned slides and HTML, and archive a copy of
# the .ipynb itself. The `&& mv` renames nbconvert's default output into the
# versioned sibling directories.
jupyter nbconvert --to slides Recommendation_Compare_Methods.ipynb && mv Recommendation_Compare_Methods.slides.html ../notebook_slides/Recommendation_Compare_Methods_v1.slides.html
jupyter nbconvert --to html Recommendation_Compare_Methods.ipynb && mv Recommendation_Compare_Methods.html ../notebook_htmls/Recommendation_Compare_Methods_v1.html
cp Recommendation_Compare_Methods.ipynb ../notebook_versions/Recommendation_Compare_Methods_v1.ipynb