Offline:
Online:
In [1]:
    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
%matplotlib inline
    
In [3]:
    
# load smaller user behavior dataset
# NOTE(review): pd.read_pickle can execute arbitrary code from the file — only
# load pickles from trusted sources; relative path assumes the repo's data layout
user_profile = pd.read_pickle('../data_user_view_buy/user_profile_items_nonnull_features_20_mins_5_views.pkl')
    
In [4]:
    
# peek at the first rows to check the table loaded with the expected columns
user_profile.head()
    
    Out[4]:
In [5]:
    
# load item features (indexed by spu)
# NOTE(review): pickle loading runs arbitrary code — trusted sources only;
# a parquet/feather intermediate would load faster and be safer
spu_fea = pd.read_pickle("../data_nn_features/spu_fea.pkl") #takes forever to load
    
In [6]:
    
# number of rows (user behavior records) in the table
len(user_profile)
    
    Out[6]:
In [7]:
    
# distinct user ids in the dataset
users = user_profile.user_id.unique()
len(users)
    
    Out[7]:
In [8]:
    
# number of distinct purchase categories (buy_sn)
len(user_profile.buy_sn.unique())
    
    Out[8]:
In [11]:
    
# sample 100 users
# BUG FIX: np.random.choice defaults to replace=True, which draws duplicate
# users — downstream dicts keyed by user_id would silently collapse them and
# the cohort would hold fewer than 100 distinct users. Sample without
# replacement (assumes len(users) >= 100 — TODO confirm) and seed the RNG so
# the sampled cohort is reproducible under Restart & Run All.
np.random.seed(42)
users_sample = np.random.choice(users, size=100, replace=False)
user_profile_sample = user_profile.loc[user_profile.user_id.isin(users_sample), :]
len(user_profile_sample)
    
    Out[11]:
In [16]:
    
# inspect the sampled user ids
users_sample
    
    Out[16]:
In [12]:
    
# For each sampled user, record the item they bought and the average feature
# vector of the items they viewed (excluding the bought item itself).
# FIX: .as_matrix() was removed in pandas 1.0 — replaced with .to_numpy();
# removed the unreachable line after the raise; hoisted loop-invariant
# n_features out of the loop.
user_buy_dict = {}                 # user_id -> spu of the purchased item
average_viewed_features_dict = {}  # user_id -> mean feature vector over viewed items

# feature dimensionality is the same for every item, so compute it once
n_features = len(spu_fea['features'].to_numpy()[0])

# loop through users
for user_id in users_sample:

    # this user's trajectory of events
    trajectory = user_profile_sample.loc[user_profile_sample.user_id == user_id, :]

    # save buy item (assumes a single buy_spu per user — TODO confirm)
    user_buy_dict[user_id] = trajectory.buy_spu.to_numpy()[0]

    # remove the bought item so it does not leak into the viewed-feature average
    trajectory = trajectory.loc[trajectory.view_spu != user_buy_dict[user_id]]

    n_views = len(trajectory)

    # one column per viewed item
    features_items = np.empty((n_features, n_views))
    for vi, view_spu in enumerate(trajectory.view_spu):

        # load features for image (1-D np array stored in the 'features' column)
        if view_spu in spu_fea.spu_id.values:
            features_items[:, vi] = spu_fea.loc[spu_fea.spu_id == view_spu, 'features'].to_numpy()[0]
        else:
            # every item is expected to have features; fail loudly if not
            raise ValueError('all items should have features')

    # average features across viewed items (axis=1 = across views)
    average_viewed_features_dict[user_id] = np.mean(features_items, axis=1)
    
In [14]:
    
#average_viewed_features_dict
    
In [15]:
    
def dot(K, L):
    """Dot product of two equal-length vectors.

    Returns 0 when the lengths differ (behavior kept from the original so
    existing callers — e.g. mixed buy/view feature lists — still work).
    Uses np.dot instead of a Python-level generator sum for speed.
    """
    if len(K) != len(L):
        return 0
    return np.dot(K, L)

def similarity(item_1, item_2):
    """Cosine similarity between two feature vectors.

    NOTE(review): a zero vector makes the denominator 0 and yields nan/inf —
    original behavior preserved; confirm callers never pass all-zero features.
    """
    return dot(item_1, item_2) / np.sqrt(dot(item_1, item_1) * dot(item_2, item_2))
    
In [17]:
    
# For each user, rank the bought item against every other item in its
# category by cosine similarity to the user's average viewed-feature vector.
# FIX: .as_matrix() was removed in pandas 1.0 — replaced with .to_numpy();
# the similarity column is now built in one pass instead of quadratic
# row-by-row boolean-mask .loc writes.
user_buy_ranks = np.empty(len(users_sample))  # rank of the bought item, per user
no_ranks = np.empty(len(users_sample))        # number of candidate items, per user
for ui, user_id in enumerate(users_sample):
    print(ui)  # progress indicator
    # load average trajectory features (computed in the earlier cell)
    average_features = average_viewed_features_dict[user_id]

    # get bought item
    buy_spu = user_buy_dict[user_id]

    # find buy item category (should assert all rows agree — TODO)
    buy_sn = user_profile_sample.loc[user_profile_sample['buy_spu'] == buy_spu, 'buy_sn'].to_numpy()[0]

    # candidate set: all items bought or viewed in that category
    spus_in_category_b = user_profile.loc[user_profile.buy_sn == buy_sn, 'buy_spu'].unique()
    spus_in_category_v = user_profile.loc[user_profile.view_sn == buy_sn, 'view_spu'].unique()
    spus_in_category = list(spus_in_category_b) + list(spus_in_category_v)

    # make sure buy item is in list
    assert buy_spu in spus_in_category

    # similarity of each candidate to the user's average viewed features
    # (pre-calculating a full item-similarity matrix could avoid rework — TODO)
    similarities = [
        similarity(average_features,
                   spu_fea.loc[spu_fea.spu_id == spu, 'features'].to_numpy()[0])
        for spu in spus_in_category
    ]
    item_sim_in_category = pd.DataFrame({'spu': spus_in_category,
                                         'similarity': similarities})

    item_sim_in_category['rank'] = item_sim_in_category['similarity'].rank()
    user_buy_ranks[ui] = item_sim_in_category.loc[item_sim_in_category.spu == buy_spu, 'rank'].to_numpy()[0]
    no_ranks[ui] = item_sim_in_category['rank'].max()
    
    
In [18]:
    
# candidates for the last user processed, ordered by similarity rank
item_sim_in_category.sort_values(by='rank')
    
    Out[18]:
In [19]:
    
# rank of the bought item for the last user in the loop
user_buy_ranks[ui]
    
    Out[19]:
In [20]:
    
# NOTE(review): only the last expression of a cell is displayed, so the
# .max() on the first line is computed but never shown — split into two
# cells (or display both) if the max is actually wanted
item_sim_in_category['rank'].max()
item_sim_in_category['rank'].unique()
    
    Out[20]:
In [33]:
    
# Plot the ratio (buy rank / number of candidates) per user: values near 1
# mean the bought item ranks highly among its category; 0.5 is chance.
# CLEANUP: removed dead commented-out subplot code and the redundant
# plt.subplot(1,1,1); switched to the explicit fig/ax interface.
fig, ax = plt.subplots()
ax.scatter(np.arange(len(users_sample)), user_buy_ranks / no_ranks)
sns.despine(ax=ax)
ax.axhline(y=0.5, label='chance', c='k', linestyle='--')
ax.axhline(y=np.mean(user_buy_ranks / no_ranks), label='mean')
ax.legend()
ax.set_xlabel('user (chosen randomly)')
ax.set_ylabel('ratio: buy rank / items in buy category')
    
    Out[33]:
    
In [35]:
    
%%bash 
# export this notebook as reveal.js slides and static HTML into the shared
# output folders, then archive a versioned copy of the .ipynb itself
# (the %%bash magic must stay on the first line of the cell)
jupyter nbconvert --to slides Recommendor_Method_Nathans.ipynb && mv Recommendor_Method_Nathans.slides.html ../notebook_slides/Recommendor_Method_Nathans_v2.slides.html
jupyter nbconvert --to html Recommendor_Method_Nathans.ipynb && mv Recommendor_Method_Nathans.html ../notebook_htmls/Recommendor_Method_Nathans_v2.html
cp Recommendor_Method_Nathans.ipynb ../notebook_versions/Recommendor_Method_Nathans_v2.ipynb