Save out dataset for Evaluation: Dataset Eval 1

  • this saves out a smaller dataset to compare different recommendation algorithms on
  • it removes rows with viewed items that do not have features
  • it removes items viewed less 20 minutes before buying
  • it then removes users with <5 viewed items before buying.

Versions of Dataset:

  • v1: starting point
  • v2: removing rows for second items bought by user - I only want want trajectory per user so that I don't mess things up later (calculating similarity etc).

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# get data
user_profile = pd.read_csv('../data_user_view_buy/user_profile.csv',sep='\t',header=None)
user_profile.columns = ['user_id','buy_spu','buy_sn','buy_ct3','view_spu','view_sn','view_ct3','time_interval','view_cnt','view_seconds']

In [3]:
user_profile.head()


Out[3]:
user_id buy_spu buy_sn buy_ct3 view_spu view_sn view_ct3 time_interval view_cnt view_seconds
0 3125745546 3454147345092617 10020201 334 100281596405534762 10029251 334 311066 3 50
1 3125745546 3454147345092617 10020201 334 104785174046949392 10014206 334 499624 1 7
2 3125745546 3454147345092617 10020201 334 10491053651988480 10010280 334 409509 2 9
3 3125745546 3454147345092617 10020201 334 20342683550576642 10015063 334 313623 1 2
4 3125745546 3454147345092617 10020201 334 224975005672079387 10020201 334 451607 4 12

In [4]:
spu_fea = pd.read_pickle("../data_nn_features/spu_fea.pkl") #takes forever to load

In [5]:
spu_fea.head()


Out[5]:
spu_id features
1 100000087145246721 [0.009, 0.839, 0.439, 1.673, 0.226, 0.055, 0.1...
2 100000090992795708 [0.0, 1.878, 0.353, 1.505, 0.047, 0.003, 0.0, ...
4 100281553839460375 [0.661, 1.84, 0.028, 1.128, 0.172, 1.097, 1.45...
8 100844539282133007 [0.444, 0.428, 0.0, 0.342, 0.314, 0.015, 0.529...
9 101688901872443404 [0.058, 2.361, 0.0, 1.037, 0.013, 1.009, 1.543...

In [6]:
spu_fea = spu_fea.reset_index()

Merge user data with feature data


In [7]:
spu_fea['view_spu']=spu_fea['spu_id']

In [8]:
spu_fea['view_spu']=spu_fea['spu_id']
user_profile_w_features = user_profile.merge(spu_fea,on='view_spu',how='left')
print('before merge nrow: {0}').format(len(user_profile))
print('after merge nrows:{0}').format(len(user_profile_w_features))


before merge nrow: 6538474
after merge nrows:6538474

In [9]:
user_profile_w_features.head(20)


Out[9]:
user_id buy_spu buy_sn buy_ct3 view_spu view_sn view_ct3 time_interval view_cnt view_seconds index spu_id features
0 3125745546 3454147345092617 10020201 334 100281596405534762 10029251 334 311066 3 50 74770.0 1.002816e+17 [0.0, 0.442, 0.0, 3.436, 0.061, 0.328, 0.262, ...
1 3125745546 3454147345092617 10020201 334 104785174046949392 10014206 334 499624 1 7 NaN NaN NaN
2 3125745546 3454147345092617 10020201 334 10491053651988480 10010280 334 409509 2 9 NaN NaN NaN
3 3125745546 3454147345092617 10020201 334 20342683550576642 10015063 334 313623 1 2 NaN NaN NaN
4 3125745546 3454147345092617 10020201 334 224975005672079387 10020201 334 451607 4 12 78708.0 2.249750e+17 [0.133, 0.082, 0.776, 0.481, 0.077, 0.256, 0.0...
5 3125745546 3454147345092617 10020201 334 232293356547952640 10015864 334 228407 1 2 39362.0 2.322934e+17 [0.083, 0.985, 0.501, 0.353, 0.097, 0.0, 0.914...
6 3125745546 3454147345092617 10020201 334 243833829111693330 10020201 334 499375 1 10 44426.0 2.438338e+17 [0.074, 0.114, 0.055, 0.153, 0.117, 0.0, 0.424...
7 3125745546 3454147345092617 10020201 334 245804153948667933 10020201 334 499347 1 2 62393.0 2.458042e+17 [0.098, 0.685, 0.072, 0.203, 0.009, 0.009, 0.7...
8 3125745546 3454147345092617 10020201 334 25127771918368778 10014206 334 499525 1 5 1554.0 2.512777e+16 [0.063, 0.121, 0.028, 0.0, 0.0, 0.054, 0.006, ...
9 3125745546 3454147345092617 10020201 334 299284399610007558 10024895 334 273848 1 4 70129.0 2.992844e+17 [0.556, 0.275, 0.349, 0.796, 1.001, 0.334, 0.0...
10 3125745546 3454147345092617 10020201 334 4017128505085958 10003609 334 21229 1 4 NaN NaN NaN
11 3125745546 3454147345092617 10020201 334 445369919971500040 10029139 334 54618 1 4 9577.0 4.453699e+17 [0.248, 0.657, 0.136, 0.57, 0.164, 0.0, 0.104,...
12 3125745546 3454147345092617 10020201 334 448466135488008195 10020201 334 282023 1 4 34846.0 4.484661e+17 [0.102, 0.629, 0.264, 0.145, 0.067, 0.118, 0.1...
13 3125745546 3454147345092617 10020201 334 464510191750230038 10000646 334 437330 1 10 59116.0 4.645102e+17 [0.272, 0.945, 0.384, 0.135, 0.289, 0.017, 0.0...
14 3125745546 3454147345092617 10020201 334 5006108822169149442 10003609 334 21355 2 6 NaN NaN NaN
15 3125745546 3454147345092617 10020201 334 5142995081465887 10020201 334 263745 5 37 24881.0 5.142995e+15 [0.148, 1.524, 0.0, 0.068, 0.072, 0.0, 0.292, ...
16 3125745546 3454147345092617 10020201 334 79170908361687055 10020201 334 281834 1 4 NaN NaN NaN
17 3125745546 3454147345092617 10020201 334 83955982965768208 10020201 334 281817 1 9 NaN NaN NaN
18 3125745546 3454147345092617 10020201 334 94370557104062470 10020201 334 71695 27 127 NaN NaN NaN
19 3125745546 3454147345092617 10020201 334 97466840982310922 10028697 334 101524 1 3 7393.0 9.746684e+16 [0.131, 0.41, 0.0, 0.702, 0.052, 0.018, 0.843,...

In [10]:
# takes too long
# user_profile_w_features.to_csv('../../data_user_view_buy/user_profile_items_with_features.csv') # I think this takes to long to save.

Eliminate Rows with viewed items that don't have features

  • this may break up some trajectories (view1,view2,view3-removed, view4,buy).

In [11]:
len(user_profile_w_features)


Out[11]:
6538474

In [12]:
user_profile_w_features_nonnull = user_profile_w_features.loc[~user_profile_w_features.features.isnull(),]
len(user_profile_w_features_nonnull)


Out[12]:
4784587

Eliminate Rows with bought items that don't have features

  • this will eliminate whole trajectories (view1,view2,view3,buy), because each of these rows is labeled with the buy id

In [13]:
spus_with_features =user_profile_w_features_nonnull.spu_id.unique() #
user_profile_w_features_nonnull = user_profile_w_features_nonnull[user_profile_w_features_nonnull['buy_spu'].isin(spus_with_features)]

In [14]:
len(user_profile_w_features_nonnull)


Out[14]:
614560

Eliminate Rows <20 minutes before buy


In [15]:
# remove rows <20 minutes before
user_profile_w_features_nonnull_20  = user_profile_w_features_nonnull.loc[(user_profile_w_features_nonnull.time_interval/60.0)>20.0]
len(user_profile_w_features_nonnull_20)


Out[15]:
550834

Eliminate Users with <5 previously viewed items


In [16]:
view_counts_per_user = user_profile_w_features_nonnull_20[['user_id','view_spu']].groupby(['user_id']).agg(['count'])
view_counts_per_user.head()


Out[16]:
view_spu
count
user_id
149036 28
814009 102
827915 77
1064602 2
1097992 5

In [17]:
user_profile_w_features_nonnull_20_5 = user_profile_w_features_nonnull_20.join(view_counts_per_user, on='user_id', rsuffix='_r')
columns = user_profile_w_features_nonnull_20_5.columns.values
columns[-1]='view_spu_count'
user_profile_w_features_nonnull_20_5.columns=columns
user_profile_w_features_nonnull_20_5.head()


/Users/chris/anaconda/envs/virtenv/lib/python2.7/site-packages/pandas/core/reshape/merge.py:551: UserWarning: merging between different levels can give an unintended result (1 levels on the left, 2 on the right)
  warnings.warn(msg, UserWarning)
Out[17]:
user_id buy_spu buy_sn buy_ct3 view_spu view_sn view_ct3 time_interval view_cnt view_seconds index spu_id features view_spu_count
89 960980006 2463545257346113536 10015294 334 107318456632717316 10015294 334 17345 1 5 69776.0 1.073185e+17 [0.31, 0.741, 0.066, 0.868, 0.266, 0.0, 0.086,... 139
90 960980006 2463545257346113536 10015294 334 107318460592529411 10007066 334 193314 1 5 17665.0 1.073185e+17 [0.0, 2.812, 0.284, 0.462, 0.002, 0.0, 0.417, ... 139
91 960980006 2463545257346113536 10015294 334 1376488951765147648 10015294 334 449577 2 43 51881.0 1.376489e+18 [0.563, 0.661, 0.162, 0.813, 0.149, 0.899, 0.5... 139
92 960980006 2463545257346113536 10015294 334 1600824511410274307 10015294 334 450245 1 7 53203.0 1.600825e+18 [0.421, 0.454, 0.0, 0.295, 1.546, 0.36, 0.102,... 139
93 960980006 2463545257346113536 10015294 334 1683296679586492416 10015294 334 16038 1 9 66087.0 1.683297e+18 [0.144, 1.025, 0.064, 1.208, 0.117, 0.205, 0.3... 139

In [18]:
user_profile_w_features_nonnull_20_5 = user_profile_w_features_nonnull_20_5.loc[user_profile_w_features_nonnull_20_5.view_spu_count>5,]

In [19]:
len(user_profile_w_features_nonnull_20_5)


Out[19]:
544913

Only use First Buy per User


In [20]:
user_profile_w_features_nonnull_20_5.user_id.unique()


Out[20]:
array([ 960980006, 4085513489, 3787002243, ..., 2818983094, 1272821118,
       2992930768])

In [21]:
# (super slow way of doing it)
user_profile_w_features_nonnull_20_5['drop']=0
for user_id in user_profile_w_features_nonnull_20_5.user_id.unique(): 
    
    # get bought items per user
    buy_spus = user_profile_w_features_nonnull_20_5.loc[user_profile_w_features_nonnull_20_5.user_id==user_id,'buy_spu'].unique()
    
    # eliminate second, third .. purchases
    if len(buy_spus)>1:
        for buy_spu in buy_spus[1::]:
            user_profile_w_features_nonnull_20_5.loc[(user_profile_w_features_nonnull_20_5.user_id==user_id)&(user_profile_w_features_nonnull_20_5.buy_spu==buy_spu),'drop']=1

In [26]:
print(len(user_profile_w_features_nonnull_20_5))
user_profile_w_features_nonnull_20_5 = user_profile_w_features_nonnull_20_5.loc[user_profile_w_features_nonnull_20_5['drop']!=1]
print(len(user_profile_w_features_nonnull_20_5))


Out[26]:
505723

Remove Features from DF before Saving


In [32]:
user_profile_w_features_nonnull_20_5_nofeatures = user_profile_w_features_nonnull_20_5.drop('features',axis=1)

Save Out


In [34]:
user_profile_w_features_nonnull_20_5_nofeatures.to_pickle('../data_user_view_buy/user_profile_items_nonnull_features_20_mins_5_views_v2.pkl')

Sub-Sample (save out v1000)


In [49]:
# sample 1000 users 
np.random.seed(1000)
users_sample = np.random.choice(user_profile_w_features_nonnull_20_5_nofeatures.user_id.unique(),size=1000)
print(users_sample[0:10])
user_profile_sample = user_profile_w_features_nonnull_20_5_nofeatures.loc[user_profile_w_features_nonnull_20_5_nofeatures.user_id.isin(users_sample),]
print(len(user_profile_sample))
print(len(user_profile_sample.user_id.unique()))


[1773431893 2634598191 1635456877 1053049783 2941097388 3039276910
 4064302909 2969332315  243745077 1530120165]
40141
961

In [37]:
user_profile_sample.to_pickle('../data_user_view_buy/user_profile_items_nonnull_features_20_mins_5_views_v2_sample1000.pkl')

Create Smaller spu_fea for subsample


In [44]:
intersection_of_spus = set(list(user_profile_sample.view_spu.unique())+list(user_profile_sample.buy_spu.unique()))
spu_fea_sample = spu_fea.loc[spu_fea['spu_id'].isin(list(intersection_of_spus))]

In [47]:
len(spu_fea)


Out[47]:
58820

In [46]:
len(spu_fea_sample)


Out[46]:
17987

In [45]:
spu_fea_sample.to_pickle('../data_nn_features/spu_fea_sample1000.pkl')