Versions of Dataset:
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Load the tab-separated user view/buy log. The file has no header row,
# so the column names are attached right after reading.
user_profile = pd.read_csv(
    '../data_user_view_buy/user_profile.csv',
    sep='\t',
    header=None,
)
user_profile.columns = [
    'user_id',
    'buy_spu', 'buy_sn', 'buy_ct3',
    'view_spu', 'view_sn', 'view_ct3',
    'time_interval', 'view_cnt', 'view_seconds',
]
In [3]:
# Preview the first rows of the freshly loaded log with its assigned column names.
user_profile.head()
Out[3]:
In [4]:
# Load the per-item feature table.
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source.
spu_fea = pd.read_pickle("../data_nn_features/spu_fea.pkl") #takes forever to load
In [5]:
# Preview the feature table as loaded (before its index is reset in the next step).
spu_fea.head()
Out[5]:
In [6]:
# Move the index into a regular column so it can be used as a merge key.
# NOTE(review): presumably the index is `spu_id` (it is read as a column right after) -- confirm.
# This cell is not idempotent: re-running it on the same kernel adds an extra index column.
spu_fea = spu_fea.reset_index()
In [7]:
# Duplicate the item id under the name `view_spu` so it lines up with the
# `view_spu` column of the user log when the two frames are merged.
spu_fea['view_spu']=spu_fea['spu_id']
In [8]:
# Attach item features to every viewed item in the user log.
# The key column is (re)created here so this cell also works when run on its
# own; the assignment is idempotent.
spu_fea['view_spu'] = spu_fea['spu_id']
user_profile_w_features = user_profile.merge(spu_fea, on='view_spu', how='left')

# Report row counts before and after the merge. A left merge keeps every row
# of the log; the count only grows if `view_spu` is duplicated in spu_fea.
# The .format() call must sit inside print(): in Python 3 print() returns
# None, so `print(...).format(...)` raises AttributeError.
print('before merge nrow: {0}'.format(len(user_profile)))
print('after merge nrows:{0}'.format(len(user_profile_w_features)))
In [9]:
# Inspect the first 20 merged rows; views without a match in spu_fea show nulls
# in the columns that came from spu_fea (left merge).
user_profile_w_features.head(20)
Out[9]:
In [10]:
# Skipped: writing the merged frame to CSV takes too long.
# user_profile_w_features.to_csv('../../data_user_view_buy/user_profile_items_with_features.csv')
In [11]:
# Row count of the merged frame, as a baseline for the filters that follow.
len(user_profile_w_features)
Out[11]:
In [12]:
# Keep only the views whose item picked up a `features` value in the merge.
user_profile_w_features_nonnull = user_profile_w_features.loc[
    lambda frame: ~frame.features.isnull()
]
# Number of rows that survive the filter.
len(user_profile_w_features_nonnull)
Out[12]:
In [13]:
# Item ids that are known to have features (taken from the rows kept so far).
spus_with_features = user_profile_w_features_nonnull.spu_id.unique()

# Keep only rows whose *bought* item is among those ids, so that both the
# viewed and the bought item of every remaining row have features.
user_profile_w_features_nonnull = user_profile_w_features_nonnull.loc[
    lambda frame: frame['buy_spu'].isin(spus_with_features)
]
In [14]:
# Row count after also requiring that the bought item has features.
len(user_profile_w_features_nonnull)
Out[14]:
In [15]:
# Keep only views with time_interval / 60 strictly greater than 20.
# NOTE(review): assumes `time_interval` is in seconds, so this is "more than
# 20 minutes before the purchase" -- confirm the unit.
user_profile_w_features_nonnull_20 = user_profile_w_features_nonnull.loc[
    lambda frame: (frame.time_interval / 60.0) > 20.0
]
# Number of rows left after the time filter.
len(user_profile_w_features_nonnull_20)
Out[15]:
In [16]:
# Count the remaining view rows per user. Passing a list to .agg() yields
# two-level columns: ('view_spu', 'count'), indexed by user_id.
view_counts_per_user = (
    user_profile_w_features_nonnull_20
    .groupby('user_id')[['view_spu']]
    .agg(['count'])
)
view_counts_per_user.head()
Out[16]:
In [17]:
# Attach each user's view count to every row as a trailing `view_spu_count` column.
#
# view_counts_per_user has two-level columns (it was built with .agg(['count'])).
# Joining it onto the single-level profile frame is warned about or rejected
# by newer pandas, and the previous rename worked by mutating `columns.values`
# in place. Instead, take the single count column as a Series indexed by
# user_id and map it onto the rows.
views_per_user = view_counts_per_user.iloc[:, 0]
user_profile_w_features_nonnull_20_5 = user_profile_w_features_nonnull_20.assign(
    view_spu_count=user_profile_w_features_nonnull_20['user_id'].map(views_per_user)
)
user_profile_w_features_nonnull_20_5.head()
Out[17]:
In [18]:
# Keep only users with strictly more than 5 remaining views.
user_profile_w_features_nonnull_20_5 = user_profile_w_features_nonnull_20_5.loc[
    lambda frame: frame.view_spu_count > 5
]
In [19]:
# Row count after the minimum-views filter.
len(user_profile_w_features_nonnull_20_5)
Out[19]:
In [20]:
# Distinct user ids that remain after all filters so far (displayed as an array).
user_profile_w_features_nonnull_20_5.user_id.unique()
Out[20]:
In [21]:
# Flag every row that belongs to a user's second, third, ... purchase.
#
# For each user the first `buy_spu` in row order is treated as the purchase to
# keep; rows with any other `buy_spu` get drop=1, the rest drop=0. This is the
# vectorized equivalent of looping over users and over buy_spus[1:], which
# rescanned the whole frame once per user.
# NOTE(review): assumes `user_id` and `buy_spu` contain no nulls at this point
# (buy_spu was filtered with .isin() earlier) -- confirm.
first_buy_spu = (
    user_profile_w_features_nonnull_20_5
    .groupby('user_id')['buy_spu']
    .transform('first')
)
# .assign() returns a new frame, so the flag is not written onto a filtered
# slice of an earlier frame (avoids SettingWithCopyWarning).
user_profile_w_features_nonnull_20_5 = user_profile_w_features_nonnull_20_5.assign(
    drop=(user_profile_w_features_nonnull_20_5['buy_spu'] != first_buy_spu).astype(int)
)
In [26]:
# Remove the rows flagged with drop == 1, printing the row count before and after.
print(len(user_profile_w_features_nonnull_20_5))
user_profile_w_features_nonnull_20_5 = user_profile_w_features_nonnull_20_5.loc[
    lambda frame: frame['drop'] != 1
]
print(len(user_profile_w_features_nonnull_20_5))
Out[26]:
In [32]:
# Drop the `features` column; only the user/item bookkeeping columns are kept.
user_profile_w_features_nonnull_20_5_nofeatures = user_profile_w_features_nonnull_20_5.drop(
    labels='features', axis='columns'
)
In [34]:
# Persist the filtered log (v2). Only `features` was removed above, so the
# helper columns `view_spu_count` and `drop` are still part of the saved frame.
user_profile_w_features_nonnull_20_5_nofeatures.to_pickle('../data_user_view_buy/user_profile_items_nonnull_features_20_mins_5_views_v2.pkl')
In [49]:
# Sample 1000 distinct users (fixed seed for reproducibility).
np.random.seed(1000)
unique_user_ids = user_profile_w_features_nonnull_20_5_nofeatures.user_id.unique()
# np.random.choice draws WITH replacement by default, which picks some users
# more than once and leaves fewer than 1000 distinct users in the sample.
# replace=False guarantees distinct users; the size is capped so the draw
# cannot fail when fewer than 1000 users are available.
# NOTE(review): this selects different users than the with-replacement draw,
# so sample files written earlier will not match a re-run.
users_sample = np.random.choice(
    unique_user_ids,
    size=min(1000, len(unique_user_ids)),
    replace=False,
)
print(users_sample[0:10])

# Keep every row that belongs to one of the sampled users.
user_profile_sample = user_profile_w_features_nonnull_20_5_nofeatures.loc[
    user_profile_w_features_nonnull_20_5_nofeatures.user_id.isin(users_sample)
]
print(len(user_profile_sample))
# Number of distinct users actually present in the sample.
print(len(user_profile_sample.user_id.unique()))
In [37]:
# Persist the sampled-user subset of the filtered log.
user_profile_sample.to_pickle('../data_user_view_buy/user_profile_items_nonnull_features_20_mins_5_views_v2_sample1000.pkl')
In [44]:
# Collect every item id that the sampled users either viewed or bought.
# Despite its name, this is the UNION of the two id sets, not an intersection.
viewed_spus = set(user_profile_sample.view_spu.unique())
bought_spus = set(user_profile_sample.buy_spu.unique())
intersection_of_spus = viewed_spus | bought_spus

# Restrict the feature table to the items the sample actually references.
spu_fea_sample = spu_fea.loc[spu_fea['spu_id'].isin(intersection_of_spus)]
In [47]:
# Size of the full feature table, for comparison with the sampled subset.
len(spu_fea)
Out[47]:
In [46]:
# Size of the feature table restricted to items referenced by the user sample.
len(spu_fea_sample)
Out[46]:
In [45]:
# Persist the reduced feature table that accompanies the sampled-user log.
spu_fea_sample.to_pickle('../data_nn_features/spu_fea_sample1000.pkl')