Data Exploration



In [2]:

    
import sys 
import os
sys.path.append(os.getcwd()+'/../')

# other
import numpy as np
import glob
import pandas as pd
import ntpath

#keras
from keras.preprocessing import image

# plotting
import seaborn as sns
sns.set_style('white')
import matplotlib.pyplot as plt
%matplotlib inline

# debuggin
from IPython.core.debugger import Tracer

#stats
import scipy.stats as stats

import bqplot.pyplot as bqplt









    



Using TensorFlow backend.

Data File



In [10]:

    
user_profile = pd.read_csv('../data_user_view_buy/user_profile.csv',sep='\t',header=None)



In [11]:

    
user_profile.columns = ['user_id','buy_spu','buy_sn','buy_ct3','view_spu','view_sn','view_ct3','time_interval','view_cnt','view_seconds']



In [11]:

    
string =str(user_profile.buy_spu.as_matrix()[3002])
print(string)
print(string[0:7]+'-'+string[7::])
#print(str(user_profile.buy_spu.as_matrix()[0])[7::])









    



293936355866378295
2939363-55866378295



In [12]:

    
user_profile.head(10)









    Out[12]:







  
    
      
      user_id
      buy_spu
      buy_sn
      buy_ct3
      view_spu
      view_sn
      view_ct3
      time_interval
      view_cnt
      view_seconds
    
  
  
    
      0
      3125745546
      3454147345092617
      10020201
      334
      100281596405534762
      10029251
      334
      311066
      3
      50
    
    
      1
      3125745546
      3454147345092617
      10020201
      334
      104785174046949392
      10014206
      334
      499624
      1
      7
    
    
      2
      3125745546
      3454147345092617
      10020201
      334
      10491053651988480
      10010280
      334
      409509
      2
      9
    
    
      3
      3125745546
      3454147345092617
      10020201
      334
      20342683550576642
      10015063
      334
      313623
      1
      2
    
    
      4
      3125745546
      3454147345092617
      10020201
      334
      224975005672079387
      10020201
      334
      451607
      4
      12
    
    
      5
      3125745546
      3454147345092617
      10020201
      334
      232293356547952640
      10015864
      334
      228407
      1
      2
    
    
      6
      3125745546
      3454147345092617
      10020201
      334
      243833829111693330
      10020201
      334
      499375
      1
      10
    
    
      7
      3125745546
      3454147345092617
      10020201
      334
      245804153948667933
      10020201
      334
      499347
      1
      2
    
    
      8
      3125745546
      3454147345092617
      10020201
      334
      25127771918368778
      10014206
      334
      499525
      1
      5
    
    
      9
      3125745546
      3454147345092617
      10020201
      334
      299284399610007558
      10024895
      334
      273848
      1
      4



In [14]:

    
print('n rows: {0}').format(len(user_profile))









    



n rows: 6538474

Plotting Functions



In [20]:

    
def plot_trajectory_scatter(user_profile,scatter_color_col=None,samplesize=50,size=10,savedir=None):
    plt.figure(figsize=(12,1*samplesize/10))

    for ui,user_id in enumerate(np.random.choice(user_profile.user_id.unique(),samplesize)):
        trajectory = user_profile.loc[user_profile.user_id==user_id,]

        time = 0-trajectory.time_interval.as_matrix()/60.0/60.0/24.0
        
        # add image or not
        if scatter_color_col is not None:
            c = trajectory[scatter_color_col].as_matrix()
        else:
            c = np.ones(len(trajectory))
        
        plt.scatter(time,np.ones(len(time))*ui,s=size,c=c,edgecolors="none",cmap="jet")
        plt.axvline(x=0,linewidth=1)
        sns.despine()
        plt.title('example user trajectories')
        plt.xlabel('days to purchase')
        if savedir is not None:
            plt.savefig(savedir,dpi=100)

Descriptions of Data



In [13]:

    
user_profile.describe()









    Out[13]:







  
    
      
      user_id
      buy_spu
      buy_sn
      buy_ct3
      view_spu
      view_sn
      view_ct3
      time_interval
      view_cnt
      view_seconds
    
  
  
    
      count
      6.538474e+06
      6.538474e+06
      6.538474e+06
      6538474.0
      6.538474e+06
      6.538474e+06
      6538474.0
      6.538474e+06
      6.538474e+06
      6.538474e+06
    
    
      mean
      2.157486e+09
      7.705014e+17
      1.001342e+07
      334.0
      7.804850e+17
      1.001284e+07
      334.0
      1.695374e+05
      2.124666e+00
      2.282485e+01
    
    
      std
      1.235368e+09
      1.776139e+18
      8.024860e+03
      0.0
      1.785675e+18
      8.012593e+03
      0.0
      1.727738e+05
      3.494220e+00
      5.179874e+01
    
    
      min
      4.079800e+04
      3.578723e+14
      1.000001e+07
      334.0
      3.578723e+14
      1.000001e+07
      334.0
      0.000000e+00
      1.000000e+00
      0.000000e+00
    
    
      25%
      1.087342e+09
      8.536339e+16
      1.000595e+07
      334.0
      8.620783e+16
      1.000571e+07
      334.0
      1.368525e+04
      1.000000e+00
      4.000000e+00
    
    
      50%
      2.161190e+09
      2.500263e+17
      1.001327e+07
      334.0
      2.511522e+17
      1.001232e+07
      334.0
      9.995600e+04
      1.000000e+00
      8.000000e+00
    
    
      75%
      3.224951e+09
      4.403034e+17
      1.002064e+07
      334.0
      4.408663e+17
      1.002042e+07
      334.0
      2.967050e+05
      2.000000e+00
      2.100000e+01
    
    
      max
      4.294950e+09
      9.187062e+18
      1.002953e+07
      334.0
      9.187062e+18
      1.002953e+07
      334.0
      6.047920e+05
      9.590000e+02
      6.369000e+03



In [14]:

    
print('unique users:{0}').format(len(user_profile.user_id.unique()))

print('unique items viewed:{0}').format(len(user_profile.view_spu.unique()))
print('unique items bought:{0}').format(len(user_profile.buy_spu.unique()))

print('unique categories viewed:{0}').format(len(user_profile.view_ct3.unique()))
print('unique categories bought:{0}').format(len(user_profile.buy_ct3.unique()))
print('unique brands viewed:{0}').format(len(user_profile.view_sn.unique()))
print('unique brands bought:{0}').format(len(user_profile.buy_sn.unique()))









    



unique users:99999
unique items viewed:96999
unique items bought:32342
unique categories viewed:1
unique categories bought:1
unique brands viewed:727
unique brands bought:557



In [15]:

    
samplesize = 2000
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
plt.hist(np.random.choice(user_profile.time_interval.as_matrix()/60.0/60.0,samplesize))
sns.despine()
plt.title('sample histogram from "time interval"')
plt.xlabel('hours from view to buy')
plt.ylabel('counts of items')

plt.subplot(1,3,2)
plt.hist(np.random.choice(user_profile.view_cnt.as_matrix(),samplesize))
sns.despine()
plt.title('sample histogram from "view count"')
plt.xlabel('view counts')
plt.ylabel('counts of items')

plt.subplot(1,3,3)
plt.hist(np.random.choice(user_profile.view_seconds.as_matrix(),samplesize))
sns.despine()
plt.title('sample histogram from "view lengths"')
plt.xlabel('view lengths (seconds)')
plt.ylabel('counts of items')









    Out[15]:





<matplotlib.text.Text at 0x11a316210>

there are many items that are viewed more than a day before buying
most items are viewed less than 10 times and for less than a couple minutes (though need to zoom in)



In [16]:

    
print('longest time interval')
print(user_profile.time_interval.min())

print('longest time interval')
print(user_profile.time_interval.max()/60.0/60.0/24)









    



longest time interval
0
longest time interval
6.99990740741

longest span from viewing to buying is 6 days

Average Time for Items Viewed before Being Bought



In [17]:

    
mean_time_interval = np.array([])
samplesize =1000
for user_id in np.random.choice(user_profile.user_id.unique(),samplesize):
    mean_time_interval = np.append(mean_time_interval, user_profile.loc[user_profile.user_id==user_id,'time_interval'].mean())



In [18]:

    
plt.figure(figsize=(12,3))
plt.hist(mean_time_interval/60.0,bins=200)
sns.despine()
plt.title('sample histogram of average length for user trajectories"')
plt.xlabel('minutes')
plt.ylabel('counts of items out of '+str(samplesize))









    Out[18]:





<matplotlib.text.Text at 0x119017f90>

5% look like they have relatively short sessions (maybe within one sitting)



In [19]:

    
plt.figure(figsize=(12,3))
plt.hist(mean_time_interval/60.0,bins=1000)
plt.xlim(0,100)
sns.despine()
plt.title('sample histogram of average length for user trajectories"')
plt.xlabel('minutes')
plt.ylabel('counts of items out of '+str(samplesize))









    Out[19]:





<matplotlib.text.Text at 0x11a9a2550>

zooming in to look at the shortest sessions.
about 7% have sessions <10 minutes



In [20]:

    
plt.figure(figsize=(8,3))
plt.hist(mean_time_interval/60.0,bins=200,cumulative=True,normed=True)
plt.xlim(0,2000)
sns.despine()
plt.title('sample cdf of average length for user trajectories"')
plt.xlabel('minutes')
plt.ylabel('counts of items out of '+str(samplesize))









    Out[20]:





<matplotlib.text.Text at 0x1272da510>

20% has sessions less <100 minutes

Example Trajectories



In [21]:

    
user_id = 1606682799
trajectory = user_profile.loc[user_profile.user_id==user_id,]
trajectory= trajectory.sort_values(by='time_interval',ascending=False)
trajectory









    Out[21]:







  
    
      
      user_id
      buy_spu
      buy_sn
      buy_ct3
      view_spu
      view_sn
      view_ct3
      time_interval
      view_cnt
      view_seconds
    
  
  
    
      941700
      1606682799
      31038742178611202
      10021072
      334
      1483874171215895
      10011806
      334
      1722
      1
      8
    
    
      3122676
      1606682799
      31038742178611202
      10021072
      334
      244396773139058688
      10023064
      334
      1612
      1
      7
    
    
      3122680
      1606682799
      31038742178611202
      10021072
      334
      98592737342341127
      10020640
      334
      1599
      1
      5
    
    
      941703
      1606682799
      31038742178611202
      10021072
      334
      460569523062440054
      10013861
      334
      1568
      2
      34
    
    
      941701
      1606682799
      31038742178611202
      10021072
      334
      292247513449336850
      10013861
      334
      1533
      1
      3
    
    
      941699
      1606682799
      31038742178611202
      10021072
      334
      102533392945205253
      10020640
      334
      1507
      1
      11
    
    
      5300612
      1606682799
      31038742178611202
      10021072
      334
      457754773298602023
      10013861
      334
      1428
      1
      8
    
    
      3122678
      1606682799
      31038742178611202
      10021072
      334
      295625232722276353
      10021072
      334
      1393
      3
      82
    
    
      941702
      1606682799
      31038742178611202
      10021072
      334
      323209779688951808
      10021072
      334
      1346
      1
      1
    
    
      3122677
      1606682799
      31038742178611202
      10021072
      334
      292810475522134093
      10028876
      334
      415
      1
      6
    
    
      3122679
      1606682799
      31038742178611202
      10021072
      334
      467043461724684336
      10022273
      334
      318
      1
      29
    
    
      5300611
      1606682799
      31038742178611202
      10021072
      334
      31038742178611202
      10021072
      334
      0
      6
      229

this is an example trajectory of someone who browsed a few items and then bought item 31.. within the same session.



In [115]:

    
plot_trajectory_scatter(user_profile)

here are 50 random subjects and when they view items (could make into an interactive plot)

What's the distribution of items that are bought? Are there some items that are much more popular than others?



In [20]:

    
samplesize =1000
number_of_times_item_bought = np.empty(samplesize)
number_of_times_item_viewed = np.empty(samplesize)
for ii,item_id in enumerate(np.random.choice(user_profile.view_spu.unique(),samplesize)):
    number_of_times_item_bought[ii] = len(user_profile.loc[user_profile.buy_spu==item_id,'user_id'].unique()) # assume the same user would not buy the same product 
    number_of_times_item_viewed[ii] = len(user_profile.loc[user_profile.view_spu==item_id]) # same user can view the same image more than once for this count



In [39]:

    
plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
plt.bar(np.arange(len(number_of_times_item_bought)),number_of_times_item_bought)
sns.despine()
plt.title('item popularity (purchases)')
plt.xlabel('item')
plt.ylabel('# of times items were bought')

plt.subplot(1,2,2)
plt.hist(number_of_times_item_bought,bins=100)
sns.despine()
plt.title('item popularity (purchases)')
plt.xlabel('# of times items were bought sample size='+str(samplesize))
plt.ylabel('# of items')









    Out[39]:





<matplotlib.text.Text at 0x12617ed50>



In [38]:

    
plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
plt.bar(np.arange(len(number_of_times_item_viewed)),number_of_times_item_viewed)
sns.despine()
plt.title('item popularity (views)')
plt.xlabel('item')
plt.ylabel('# of times items were viewed')

plt.subplot(1,2,2)
plt.hist(number_of_times_item_bought,bins=100)
sns.despine()
plt.title('item popularity (views) sample size='+str(samplesize))
plt.xlabel('# of times items were viewed')
plt.ylabel('# of items')









    Out[38]:





<matplotlib.text.Text at 0x124742bd0>



In [37]:

    
plt.figure(figsize=(6,4))
plt.subplot(1,1,1)
thresh =30
include = number_of_times_item_bought<thresh
plt.scatter(number_of_times_item_viewed[include],number_of_times_item_bought[include],)
(r,p) = stats.pearsonr(number_of_times_item_viewed[include],number_of_times_item_bought[include])
sns.despine()
plt.xlabel('number of times viewed')
plt.ylabel('number of times bought')
plt.title('r='+str(np.round(r,2))+' data truncated buys<'+str(thresh))









    Out[37]:





<matplotlib.text.Text at 0x12324c110>

Items bought and viewed per user?



In [9]:

    
samplesize =1000
items_bought_per_user = np.empty(samplesize)
items_viewed_per_user = np.empty(samplesize)
for ui,user_id in enumerate(np.random.choice(user_profile.user_id.unique(),samplesize)):
    items_bought_per_user[ui] = len(user_profile.loc[user_profile.user_id==user_id,'buy_spu'].unique())
    items_viewed_per_user[ui] = len(user_profile.loc[user_profile.user_id==user_id,'view_spu'].unique())



In [11]:

    
plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
plt.hist(items_bought_per_user)
sns.despine()
plt.title('number of items bought per user (sample of 1000)')
plt.xlabel('# items bought')
plt.ylabel('# users')

plt.subplot(1,2,2)
plt.hist(items_viewed_per_user)
sns.despine()
plt.title('number of items viewed per user (sample of 1000)')
plt.xlabel('# items viewed')
plt.ylabel('# users')









    Out[11]:





<matplotlib.text.Text at 0x10d6bd290>

How many times did the user buy an item he/she already looked at?



In [ ]:

Image URLs

How many of the SPUs in our dataset (smaller) have urls in our url.csv?



In [65]:

    
urls = pd.read_csv('../../deep-learning-models-master/img/eval_img_url.csv',header=None)
urls.columns = ['spu','url']
print(len(urls))
urls.head(10)









    



33662






    Out[65]:







  
    
      
      spu
      url
    
  
  
    
      0
      100000000317431808
      http://a.vpimg2.com/upload/merchandise/pdc/808...
    
    
      1
      100000001066491904
      http://a.vpimg2.com/upload/merchandise/pdc/904...
    
    
      2
      100000005622976512
      http://a.vpimg2.com/upload/merchandise/pdc/512...
    
    
      3
      100000009136676865
      http://a.vpimg2.com/upload/merchandise/pdcvis/...
    
    
      4
      100000009494401198
      http://a.vpimg2.com/upload/merchandise/pdcvis/...
    
    
      5
      100000019080622267
      http://a.vpimg2.com/upload/merchandise/pdcvis/...
    
    
      6
      100000020837699950
      http://a.vpimg2.com/upload/merchandise/pdcvis/...
    
    
      7
      100000021928063132
      http://a.vpimg2.com/upload/merchandise/pdcvis/...
    
    
      8
      100000030105853978
      http://a.vpimg2.com/upload/merchandise/pdcvis/...
    
    
      9
      100000030491275422
      http://a.vpimg2.com/upload/merchandise/pdcvis/...



In [77]:

    
urls[['spu','url']].groupby(['spu']).agg(['count']).head()









    Out[77]:







  
    
      
      url
    
    
      
      count
    
    
      spu
      
    
  
  
    
      357870273655002
      2
    
    
      357870995513345
      1
    
    
      357872333107204
      1
    
    
      357875526676843
      1
    
    
      357875526680651
      1

items with more than one url?



In [73]:

    
urls.loc[urls.spu==357870273655002,'url'].as_matrix()









    Out[73]:





array([ 'http://a.vpimg2.com/upload/merchandise/pdcvis/2016/08/19/84/4686fee8-e13c-4f2b-afe5-47fd95d81d06.jpg',
       'http://a.vpimg2.com/upload/merchandise/pdcvis/2016/10/12/58/81934e84-b886-40a1-b311-dde38077b19a.jpg'], dtype=object)



In [76]:

    
urls.loc[urls.spu==357889732772303,'url'].as_matrix()









    Out[76]:





array([ 'http://a.vpimg2.com/upload/merchandise/pdcvis/2016/10/28/97/77e274ae-96d5-42ec-a8f7-a0d3e9612bd4.jpg',
       'http://a.vpimg2.com/upload/merchandise/pdcvis/2016/10/19/171/823f738b-0f0c-49f6-851c-a1c60a4fdd03.jpg',
       'http://a.vpimg2.com/upload/merchandise/pdcvis/2016/12/05/37/cdd764f9-48ae-4c57-a715-f9b6763cfc0d.jpg',
       'http://a.vpimg2.com/upload/merchandise/pdcvis/2016/11/09/116/d58df0b7-27a2-48ac-b657-1e85f4e863e3.jpg'], dtype=object)

these are the same item, just different images.



In [82]:

    
#urls.loc[urls.spu==1016200950427238422,'url']



In [84]:

    
tmp_urls = urls.loc[urls.spu==1016200950427238422,'url'].as_matrix()
tmp_urls









    Out[84]:





array([ 'http://a.vpimg2.com/upload/merchandise/pdc/422/238/1016200950427238422/0/2014138502-1554-5.jpg',
       'http://a.vpimg2.com/upload/merchandise/pdc/413/238/1016200950427238413/0/2014138517-9834-5.jpg'], dtype=object)



In [83]:

    
from urllib import urlretrieve
import time



In [88]:

    
# scrape images   
for i,tmp_url in enumerate(tmp_urls):
    urlretrieve(tmp_url, '../data_img_tmp/{}.jpg'.format(i))
    #time.sleep(3)



In [96]:

    
# plot them. 
print('two images from url with same spu (ugh)')
plt.figure(figsize=(8,3))
for i,tmp_url in enumerate(tmp_urls):
    img_path= '../data_img_tmp/{}.jpg'.format(i)
    img = image.load_img(img_path, target_size=(224, 224))
    plt.subplot(1,len(tmp_urls),i+1)
    plt.imshow(img)
    plt.grid(b=False)









    



two images from url with same spu (ugh)

These are different thought!!



In [78]:

    
urls.spu[0]









    Out[78]:





100000000317431808



In [54]:

    
urls.url[0]









    Out[54]:





'http://a.vpimg2.com/upload/merchandise/pdc/808/431/100000000317431808/0/2283211-5.jpg'

the url contains the spu, but I'm not sure what the other numbers are. The goods_num? The category etc?



In [51]:

    
view_spus = user_profile.view_spu.unique()
contained = 0
spus_with_url = list(urls.spu.as_matrix())
for view_spu in view_spus: 
    if view_spu in spus_with_url:
        contained+=1
print(contained/np.float(len(view_spus)))









    



0.090299899999



In [53]:

    
buy_spus = user_profile.buy_spu.unique()
contained = 0
spus_with_url = list(urls.spu.as_matrix())
for buy_spu in buy_spus: 
    if buy_spu in spus_with_url:
        contained+=1
print(contained/np.float(len(buy_spus)))









    



0.0732793271906

we only have the url for 7% of the bought items and 9% of the viewed items



In [64]:

    
buy_spu in spus_with_url









    Out[64]:





False



In [58]:

    
len(urls.spu.unique())
len(user_profile.view_spu.unique())









    Out[58]:





96999

Are the images we have in this new dataset?

at the moment, I don't know how to find the spu of the images we have.

Viewing DataSet with Feature Data in



In [3]:

    
spu_fea = pd.read_pickle("../data_nn_features/spu_fea.pkl") #takes forever to load



In [4]:

    
spu_fea['view_spu']=spu_fea['spu_id']



In [12]:

    
spu_fea['view_spu']=spu_fea['spu_id']
user_profile_w_features = user_profile.merge(spu_fea,on='view_spu',how='left')
print('before merge nrow: {0}').format(len(user_profile))
print('after merge nrows:{0}').format(len(user_profile_w_features))









    



before merge nrow: 6538474
after merge nrows:6538474



In [13]:

    
print('number of items with features: {0}').format(len(spu_fea))









    



number of items with features: 58820



In [14]:

    
spu_fea.head()









    Out[14]:







  
    
      
      spu_id
      features
      view_spu
    
  
  
    
      1
      100000087145246721
      [0.009, 0.839, 0.439, 1.673, 0.226, 0.055, 0.1...
      100000087145246721
    
    
      2
      100000090992795708
      [0.0, 1.878, 0.353, 1.505, 0.047, 0.003, 0.0, ...
      100000090992795708
    
    
      4
      100281553839460375
      [0.661, 1.84, 0.028, 1.128, 0.172, 1.097, 1.45...
      100281553839460375
    
    
      8
      100844539282133007
      [0.444, 0.428, 0.0, 0.342, 0.314, 0.015, 0.529...
      100844539282133007
    
    
      9
      101688901872443404
      [0.058, 2.361, 0.0, 1.037, 0.013, 1.009, 1.543...
      101688901872443404



In [15]:

    
# merge with userdata
spu_fea['view_spu']=spu_fea['spu_id']
user_profile_w_features = user_profile.merge(spu_fea,on='view_spu',how='left')
print('before merge nrow: {0}').format(len(user_profile))
print('after merge nrows:{0}').format(len(user_profile_w_features))









    



before merge nrow: 6538474
after merge nrows:6538474



In [16]:

    
user_profile_w_features['has_features']=user_profile_w_features.groupby(['view_spu'])['spu_id'].apply(lambda x: np.isnan(x))



In [130]:

    
user_profile_w_features.has_features= user_profile_w_features.has_features.astype('int')



In [131]:

    
user_profile_w_features.head()









    Out[131]:







  
    
      
      user_id
      buy_spu
      buy_sn
      buy_ct3
      view_spu
      view_sn
      view_ct3
      time_interval
      view_cnt
      view_seconds
      spu_id
      features
      has_features
    
  
  
    
      0
      3125745546
      3454147345092617
      10020201
      334
      100281596405534762
      10029251
      334
      311066
      3
      50
      1.002816e+17
      [0.0, 0.442, 0.0, 3.436, 0.061, 0.328, 0.262, ...
      0
    
    
      1
      3125745546
      3454147345092617
      10020201
      334
      104785174046949392
      10014206
      334
      499624
      1
      7
      NaN
      NaN
      1
    
    
      2
      3125745546
      3454147345092617
      10020201
      334
      10491053651988480
      10010280
      334
      409509
      2
      9
      NaN
      NaN
      1
    
    
      3
      3125745546
      3454147345092617
      10020201
      334
      20342683550576642
      10015063
      334
      313623
      1
      2
      NaN
      NaN
      1
    
    
      4
      3125745546
      3454147345092617
      10020201
      334
      224975005672079387
      10020201
      334
      451607
      4
      12
      2.249750e+17
      [0.133, 0.082, 0.776, 0.481, 0.077, 0.256, 0.0...
      0

Plotting Trajectories and Seeing How many features we have



In [154]:

    
plot_trajectory_scatter(user_profile_w_features,scatter_color_col='has_features',samplesize=100,size=10,savedir='../../test.png')

What percent of rows have features?



In [108]:

    
1-(user_profile_w_features['features'].isnull()).mean()









    Out[108]:





0.73175897005937474

What percent of bought items are in the feature list?



In [116]:

    
1-user_profile_w_features.groupby(['view_spu'])['spu_id'].apply(lambda x: np.isnan(x)).mean()









    Out[116]:





0.26824102994062526



In [27]:

    
buy_spus = user_profile.buy_spu.unique()
contained = 0
spus_with_features = list(spu_fea.spu_id.as_matrix())
for buy_spu in buy_spus: 
    if buy_spu in spus_with_features:
        contained+=1
print(contained/np.float(len(buy_spus)))









    



0.678931420444



In [28]:

    
contained









    Out[28]:





21958



In [29]:

    
len(buy_spus)









    Out[29]:





32342



In [30]:

    
view_spus = user_profile.view_spu.unique()
contained = 0
spus_with_features = list(spu_fea.spu_id.as_matrix())
for view_spu in view_spus: 
    if view_spu in spus_with_features:
        contained+=1
print(contained/np.float(len(view_spus)))









    



0.606398004103



In [31]:

    
len(view_spus)









    Out[31]:





96999

Evaluation Dataset



In [3]:

    
user_profile = pd.read_pickle('../data_user_view_buy/user_profile_items_nonnull_features_20_mins_5_views.pkl')



In [4]:

    
len(user_profile)









    Out[4]:





544913



In [6]:

    
print('unique users:{0}').format(len(user_profile.user_id.unique()))

print('unique items viewed:{0}').format(len(user_profile.view_spu.unique()))
print('unique items bought:{0}').format(len(user_profile.buy_spu.unique()))

print('unique categories viewed:{0}').format(len(user_profile.view_ct3.unique()))
print('unique categories bought:{0}').format(len(user_profile.buy_ct3.unique()))
print('unique brands viewed:{0}').format(len(user_profile.view_sn.unique()))
print('unique brands bought:{0}').format(len(user_profile.buy_sn.unique()))









    



unique users:11903
unique items viewed:45668
unique items bought:3782
unique categories viewed:1
unique categories bought:1
unique brands viewed:547
unique brands bought:354



In [16]:

    
#user_profile.groupby(['user_id'])['buy_spu'].nunique()



In [10]:

    
# how many items bought per user in this dataset? 
plt.figure(figsize=(8,3))
plt.hist(user_profile.groupby(['user_id'])['buy_spu'].nunique(),bins=20,normed=False)
sns.despine()
plt.xlabel('number of items bought per user')
plt.ylabel('number of user')









    Out[10]:





<matplotlib.text.Text at 0x124d72890>



In [15]:

    
user_profile.loc[user_profile.user_id==4283991208,]









    Out[15]:







  
    
      
      user_id
      buy_spu
      buy_sn
      buy_ct3
      view_spu
      view_sn
      view_ct3
      time_interval
      view_cnt
      view_seconds
      index
      spu_id
      features
      view_spu_count
    
  
  
    
      1717617
      4283991208
      3172717375315987
      10026622
      334
      103940738865885244
      10011540
      334
      93823
      1
      3
      1291.0
      1.039407e+17
      [0.0, 0.525, 0.101, 0.28, 0.0, 0.0, 0.018, 0.2...
      294
    
    
      1717621
      4283991208
      3172717375315987
      10026622
      334
      13305822896521234
      10014206
      334
      75794
      1
      4
      66061.0
      1.330582e+16
      [0.052, 0.05, 0.002, 0.0, 0.021, 0.0, 0.039, 0...
      294
    
    
      1717623
      4283991208
      3172717375315987
      10026622
      334
      1483775526015157
      10026622
      334
      21742
      1
      26
      10175.0
      1.483776e+15
      [0.267, 0.28, 0.003, 1.297, 0.28, 0.013, 0.162...
      294
    
    
      1717624
      4283991208
      3172717375315987
      10026622
      334
      17527905451290635
      10014936
      334
      94170
      5
      28
      7548.0
      1.752791e+16
      [0.175, 3.27, 0.225, 1.059, 0.561, 0.019, 0.03...
      294
    
    
      1717625
      4283991208
      3172717375315987
      10026622
      334
      225537933784404046
      10005711
      334
      103261
      1
      5
      80000.0
      2.255379e+17
      [0.651, 0.679, 0.0, 0.598, 0.215, 0.001, 0.04,...
      294
    
    
      1717627
      4283991208
      3172717375315987
      10026622
      334
      226945303689633846
      10011540
      334
      93076
      2
      13
      83804.0
      2.269453e+17
      [0.078, 0.471, 0.0, 0.562, 0.04, 0.0, 0.169, 0...
      294
    
    
      1717628
      4283991208
      3172717375315987
      10026622
      334
      23157348732739802
      10026622
      334
      23232
      2
      23
      15309.0
      2.315735e+16
      [0.123, 1.018, 0.0, 0.241, 0.037, 0.0, 0.794, ...
      294
    
    
      1717629
      4283991208
      3172717375315987
      10026622
      334
      23157423020531740
      10010458
      334
      95105
      1
      6
      1458.0
      2.315742e+16
      [0.313, 0.473, 0.205, 0.15, 0.36, 0.041, 0.167...
      294
    
    
      1717631
      4283991208
      3172717375315987
      10026622
      334
      243270852316745762
      10011540
      334
      85515
      4
      81
      2731.0
      2.432709e+17
      [0.064, 2.257, 0.057, 0.246, 0.024, 0.0, 0.27,...
      294
    
    
      1717632
      4283991208
      3172717375315987
      10026622
      334
      247774475044470785
      10026622
      334
      83427
      1
      3
      61120.0
      2.477745e+17
      [0.016, 2.397, 0.092, 0.378, 0.223, 0.006, 0.5...
      294
    
    
      1717633
      4283991208
      3172717375315987
      10026622
      334
      250307726756618325
      10011540
      334
      93012
      2
      22
      45725.0
      2.503077e+17
      [0.53, 0.019, 0.024, 0.297, 0.033, 0.0, 0.561,...
      294
    
    
      1717634
      4283991208
      3172717375315987
      10026622
      334
      250870682408312845
      10010782
      334
      100583
      1
      3
      55919.0
      2.508707e+17
      [0.179, 0.112, 0.129, 3.025, 0.0, 0.277, 0.057...
      294
    
    
      1717635
      4283991208
      3172717375315987
      10026622
      334
      27379572116287490
      10026622
      334
      29001
      2
      28
      9125.0
      2.737957e+16
      [0.128, 0.42, 0.208, 0.254, 0.288, 0.0, 0.622,...
      294
    
    
      1717636
      4283991208
      3172717375315987
      10026622
      334
      290277175168847890
      10014206
      334
      75201
      7
      132
      23007.0
      2.902772e+17
      [0.109, 0.75, 0.0, 0.181, 0.045, 0.045, 0.0, 0...
      294
    
    
      1717637
      4283991208
      3172717375315987
      10026622
      334
      295906672964997127
      10011540
      334
      93126
      1
      3
      61225.0
      2.959067e+17
      [0.047, 0.259, 0.0, 0.234, 0.0, 0.0, 0.264, 0....
      294
    
    
      1717638
      4283991208
      3172717375315987
      10026622
      334
      298158464936345758
      10008016
      334
      100428
      2
      42
      10511.0
      2.981585e+17
      [0.0, 0.163, 0.901, 0.43, 0.351, 0.016, 1.724,...
      294
    
    
      1717639
      4283991208
      3172717375315987
      10026622
      334
      299284391731855366
      10014206
      334
      75661
      1
      4
      32011.0
      2.992844e+17
      [0.102, 0.857, 0.46, 0.488, 0.001, 0.027, 0.0,...
      294
    
    
      1717640
      4283991208
      3172717375315987
      10026622
      334
      301817662738743306
      10026622
      334
      25487
      1
      3
      87865.0
      3.018177e+17
      [0.208, 1.086, 0.011, 1.154, 0.195, 0.0, 0.03,...
      294
    
    
      1717641
      4283991208
      3172717375315987
      10026622
      334
      309698873616236948
      10021212
      334
      100076
      4
      17
      19302.0
      3.096989e+17
      [0.097, 1.086, 0.233, 2.249, 0.264, 0.155, 0.4...
      294
    
    
      1717642
      4283991208
      3172717375315987
      10026622
      334
      31038746813526032
      10026622
      334
      14686
      1
      3
      42189.0
      3.103875e+16
      [0.0, 0.87, 0.0, 0.427, 0.175, 0.068, 0.492, 0...
      294
    
    
      1717643
      4283991208
      3172717375315987
      10026622
      334
      310543387016773641
      10026622
      334
      25300
      7
      96
      21854.0
      3.105434e+17
      [0.0, 0.995, 0.0, 0.565, 0.152, 0.006, 0.236, ...
      294
    
    
      1717644
      4283991208
      3172717375315987
      10026622
      334
      310543401807400960
      10012454
      334
      94139
      5
      73
      60043.0
      3.105434e+17
      [0.11, 0.677, 0.149, 0.154, 0.0, 0.019, 0.204,...
      294
    
    
      1717645
      4283991208
      3172717375315987
      10026622
      334
      310824846744035366
      10011540
      334
      93271
      1
      3
      52322.0





    




limit_output extension: Maximum message size of 10000 exceeded with 25694 characters

some people have longer viewing trajectories. first item was viewed 28hours ahead of time.



In [14]:

    
user_profile.loc[user_profile.user_id==6539296,]









    Out[14]:







  
    
      
      user_id
      buy_spu
      buy_sn
      buy_ct3
      view_spu
      view_sn
      view_ct3
      time_interval
      view_cnt
      view_seconds
      index
      spu_id
      features
      view_spu_count
    
  
  
    
      1909703
      6539296
      4714500743357079552
      10015678
      334
      230604513463791633
      10028297
      334
      393376
      2
      38
      17816.0
      2.306045e+17
      [0.09, 0.061, 0.0, 0.109, 0.486, 0.034, 0.108,...
      67
    
    
      1909704
      6539296
      4714500743357079552
      10015678
      334
      238204332051542018
      10028297
      334
      393455
      1
      4
      1497.0
      2.382043e+17
      [0.048, 0.326, 0.063, 0.302, 0.178, 0.319, 0.8...
      67
    
    
      1909707
      6539296
      4714500743357079552
      10015678
      334
      292529005121761410
      10016791
      334
      280246
      1
      5
      11737.0
      2.925290e+17
      [0.0, 1.925, 0.281, 0.025, 0.476, 0.0, 0.204, ...
      67
    
    
      1909708
      6539296
      4714500743357079552
      10015678
      334
      296188153247072271
      10001351
      334
      576605
      1
      20
      38219.0
      2.961882e+17
      [0.234, 0.398, 0.178, 2.219, 0.321, 0.048, 0.0...
      67
    
    
      1909709
      6539296
      4714500743357079552
      10015678
      334
      302662074473369605
      10015636
      334
      264693
      2
      33
      58662.0
      3.026621e+17
      [0.474, 0.197, 0.0, 0.978, 0.06, 0.0, 0.011, 0...
      67
    
    
      1909710
      6539296
      4714500743357079552
      10015678
      334
      307165668905115683
      10004555
      334
      430709
      1
      21
      86666.0
      3.071657e+17
      [0.173, 0.031, 0.013, 2.349, 0.376, 0.106, 0.1...
      67
    
    
      1909711
      6539296
      4714500743357079552
      10015678
      334
      307728624054161408
      10015636
      334
      392614
      1
      11
      53553.0
      3.077286e+17
      [0.044, 1.257, 0.0, 0.449, 0.333, 0.0, 0.649, ...
      67
    
    
      1909712
      6539296
      4714500743357079552
      10015678
      334
      321239449850023952
      10020640
      334
      479555
      1
      5
      44680.0
      3.212394e+17
      [0.0, 0.0, 0.0, 0.005, 0.263, 0.0, 0.22, 0.174...
      67
    
    
      1909714
      6539296
      4714500743357079552
      10015678
      334
      438051570214809601
      10020640
      334
      479535
      1
      5
      42373.0
      4.380516e+17
      [0.026, 0.954, 0.11, 0.632, 0.117, 0.113, 0.00...
      67
    
    
      1909715
      6539296
      4714500743357079552
      10015678
      334
      446777294492839942
      10020640
      334
      479581
      1
      5
      770.0
      4.467773e+17
      [0.188, 0.451, 0.02, 0.174, 0.106, 0.024, 0.20...
      67
    
    
      1909719
      6539296
      4714500743357079552
      10015678
      334
      6867784324568485888
      10005149
      334
      264581
      2
      25
      22329.0
      6.867784e+18
      [0.063, 0.309, 0.0, 0.36, 0.04, 0.044, 0.189, ...
      67
    
    
      1909723
      6539296
      4714500743357079552
      10015678
      334
      89585515455168564
      10015678
      334
      129900
      1
      4
      25143.0
      8.958552e+16
      [0.114, 0.313, 0.025, 0.679, 0.127, 0.042, 1.1...
      67
    
    
      1909725
      6539296
      4714500743357079552
      10015678
      334
      96622389872934948
      10015678
      334
      39642
      1
      6
      30247.0
      9.662239e+16
      [0.254, 0.25, 0.021, 1.455, 0.137, 0.005, 1.77...
      67
    
    
      1909727
      6539296
      88741090525036576
      10015678
      334
      18935283870048281
      10000351
      334
      576751
      1
      10
      55708.0
      1.893528e+16
      [0.036, 0.25, 0.0, 1.639, 0.0, 0.061, 0.499, 0...
      67
    
    
      1909729
      6539296
      88741090525036576
      10015678
      334
      23438889895616669
      10004555
      334
      430998
      2
      27
      72492.0
      2.343889e+16
      [0.13, 0.409, 0.595, 1.369, 0.103, 0.024, 0.24...
      67
    
    
      1909730
      6539296
      88741090525036576
      10015678
      334
      23438905780764691
      10015678
      334
      129763
      2
      7
      19058.0
      2.343891e+16
      [0.074, 1.258, 0.029, 0.001, 0.567, 0.014, 0.8...
      67
    
    
      1909731
      6539296
      88741090525036576
      10015678
      334
      238204332051542018
      10028297
      334
      393467
      1
      4
      1497.0
      2.382043e+17
      [0.048, 0.326, 0.063, 0.302, 0.178, 0.319, 0.8...
      67
    
    
      1909732
      6539296
      88741090525036576
      10015678
      334
      29068390742786093
      10012635
      334
      576642
      1
      12
      33210.0
      2.906839e+16
      [0.666, 1.926, 0.006, 0.425, 0.864, 0.063, 0.3...
      67
    
    
      1909733
      6539296
      88741090525036576
      10015678
      334
      314765487213334554
      10004555
      334
      430617
      1
      21
      87956.0
      3.147655e+17
      [0.177, 0.294, 0.635, 1.745, 0.026, 0.25, 0.0,...
      67
    
    
      1909735
      6539296
      88741090525036576
      10015678
      334
      437770078308651009
      10000683
      334
      576671
      1
      12
      72995.0
      4.377701e+17
      [0.068, 0.641, 0.0, 2.952, 0.499, 0.134, 0.0, ...
      67
    
    
      1909736
      6539296
      88741090525036576
      10015678
      334
      450436452539129891
      10003719
      334
      576825
      2
      33
      52617.0
      4.504365e+17
      [0.43, 0.137, 0.0, 0.01, 0.006, 0.0, 0.039, 0....
      67
    
    
      1909737
      6539296
      88741090525036576
      10015678
      334
      453532675442028551
      10020640
      334
      479560
      1
      3
      4512.0
      4.535327e+17
      [0.489, 0.432, 0.059, 0.128, 0.097, 0.298, 0.0...
      67
    
    
      1909738
      6539296
      88741090525036576
      10015678
      334
      468450827688092121
      10003301
      334
      576541
      2
      20
      21012.0
      4.684508e






    




limit_output extension: Maximum message size of 10000 exceeded with 25594 characters

this person bought two items.



In [21]:

    
plot_trajectory_scatter(user_profile,samplesize=100,size=10,savedir='../figures/trajectories_evaluation_dataset.png')

I'd like to make this figure better - easier to tell which rows people are on

Save Notebook



In [ ]:

    
%%bash 
jupyter nbconvert --to slides Exploring_Data.ipynb && mv Exploring_Data.slides.html ../notebook_slides/Exploring_Data_v1.slides.html
jupyter nbconvert --to html Exploring_Data.ipynb && mv Exploring_Data.html ../notebook_htmls/Exploring_Data_v1.html
cp Exploring_Data.ipynb ../notebook_versions/Exploring_Data_v1.ipynb



In [38]:

    
# push to s3 
import sys
import os
sys.path.append(os.getcwd()+'/../')
from src import s3_data_management
s3_data_management.push_results_to_s3('Exploring_Data_v1.html','../notebook_htmls/Exploring_Data_v1.html')
s3_data_management.push_results_to_s3('Exporing_Data_v1.slides.html','../notebook_slides/Exploring_Data_v1.slides.html')









    



..................

	user_id	buy_spu	buy_sn	buy_ct3	view_spu	view_sn	view_ct3	time_interval	view_cnt	view_seconds
0	3125745546	3454147345092617	10020201	334	100281596405534762	10029251	334	311066	3	50
1	3125745546	3454147345092617	10020201	334	104785174046949392	10014206	334	499624	1	7
2	3125745546	3454147345092617	10020201	334	10491053651988480	10010280	334	409509	2	9
3	3125745546	3454147345092617	10020201	334	20342683550576642	10015063	334	313623	1	2
4	3125745546	3454147345092617	10020201	334	224975005672079387	10020201	334	451607	4	12
5	3125745546	3454147345092617	10020201	334	232293356547952640	10015864	334	228407	1	2
6	3125745546	3454147345092617	10020201	334	243833829111693330	10020201	334	499375	1	10
7	3125745546	3454147345092617	10020201	334	245804153948667933	10020201	334	499347	1	2
8	3125745546	3454147345092617	10020201	334	25127771918368778	10014206	334	499525	1	5
9	3125745546	3454147345092617	10020201	334	299284399610007558	10024895	334	273848	1	4

	user_id	buy_spu	buy_sn	buy_ct3	view_spu	view_sn	view_ct3	time_interval	view_cnt	view_seconds
count	6.538474e+06	6.538474e+06	6.538474e+06	6538474.0	6.538474e+06	6.538474e+06	6538474.0	6.538474e+06	6.538474e+06	6.538474e+06
mean	2.157486e+09	7.705014e+17	1.001342e+07	334.0	7.804850e+17	1.001284e+07	334.0	1.695374e+05	2.124666e+00	2.282485e+01
std	1.235368e+09	1.776139e+18	8.024860e+03	0.0	1.785675e+18	8.012593e+03	0.0	1.727738e+05	3.494220e+00	5.179874e+01
min	4.079800e+04	3.578723e+14	1.000001e+07	334.0	3.578723e+14	1.000001e+07	334.0	0.000000e+00	1.000000e+00	0.000000e+00
25%	1.087342e+09	8.536339e+16	1.000595e+07	334.0	8.620783e+16	1.000571e+07	334.0	1.368525e+04	1.000000e+00	4.000000e+00
50%	2.161190e+09	2.500263e+17	1.001327e+07	334.0	2.511522e+17	1.001232e+07	334.0	9.995600e+04	1.000000e+00	8.000000e+00
75%	3.224951e+09	4.403034e+17	1.002064e+07	334.0	4.408663e+17	1.002042e+07	334.0	2.967050e+05	2.000000e+00	2.100000e+01
max	4.294950e+09	9.187062e+18	1.002953e+07	334.0	9.187062e+18	1.002953e+07	334.0	6.047920e+05	9.590000e+02	6.369000e+03

	user_id	buy_spu	buy_sn	buy_ct3	view_spu	view_sn	view_ct3	time_interval	view_cnt	view_seconds
941700	1606682799	31038742178611202	10021072	334	1483874171215895	10011806	334	1722	1	8
3122676	1606682799	31038742178611202	10021072	334	244396773139058688	10023064	334	1612	1	7
3122680	1606682799	31038742178611202	10021072	334	98592737342341127	10020640	334	1599	1	5
941703	1606682799	31038742178611202	10021072	334	460569523062440054	10013861	334	1568	2	34
941701	1606682799	31038742178611202	10021072	334	292247513449336850	10013861	334	1533	1	3
941699	1606682799	31038742178611202	10021072	334	102533392945205253	10020640	334	1507	1	11
5300612	1606682799	31038742178611202	10021072	334	457754773298602023	10013861	334	1428	1	8
3122678	1606682799	31038742178611202	10021072	334	295625232722276353	10021072	334	1393	3	82
941702	1606682799	31038742178611202	10021072	334	323209779688951808	10021072	334	1346	1	1
3122677	1606682799	31038742178611202	10021072	334	292810475522134093	10028876	334	415	1	6
3122679	1606682799	31038742178611202	10021072	334	467043461724684336	10022273	334	318	1	29
5300611	1606682799	31038742178611202	10021072	334	31038742178611202	10021072	334	0	6	229

	spu	url
0	100000000317431808	http://a.vpimg2.com/upload/merchandise/pdc/808...
1	100000001066491904	http://a.vpimg2.com/upload/merchandise/pdc/904...
2	100000005622976512	http://a.vpimg2.com/upload/merchandise/pdc/512...
3	100000009136676865	http://a.vpimg2.com/upload/merchandise/pdcvis/...
4	100000009494401198	http://a.vpimg2.com/upload/merchandise/pdcvis/...
5	100000019080622267	http://a.vpimg2.com/upload/merchandise/pdcvis/...
6	100000020837699950	http://a.vpimg2.com/upload/merchandise/pdcvis/...
7	100000021928063132	http://a.vpimg2.com/upload/merchandise/pdcvis/...
8	100000030105853978	http://a.vpimg2.com/upload/merchandise/pdcvis/...
9	100000030491275422	http://a.vpimg2.com/upload/merchandise/pdcvis/...

	url
	count
spu
357870273655002	2
357870995513345	1
357872333107204	1
357875526676843	1
357875526680651	1

	spu_id	features	view_spu
1	100000087145246721	[0.009, 0.839, 0.439, 1.673, 0.226, 0.055, 0.1...	100000087145246721
2	100000090992795708	[0.0, 1.878, 0.353, 1.505, 0.047, 0.003, 0.0, ...	100000090992795708
4	100281553839460375	[0.661, 1.84, 0.028, 1.128, 0.172, 1.097, 1.45...	100281553839460375
8	100844539282133007	[0.444, 0.428, 0.0, 0.342, 0.314, 0.015, 0.529...	100844539282133007
9	101688901872443404	[0.058, 2.361, 0.0, 1.037, 0.013, 1.009, 1.543...	101688901872443404

	user_id	buy_spu	buy_sn	buy_ct3	view_spu	view_sn	view_ct3	time_interval	view_cnt	view_seconds	index	spu_id	features	view_spu_count
1717617	4283991208	3172717375315987	10026622	334	103940738865885244	10011540	334	93823	1	3	1291.0	1.039407e+17	[0.0, 0.525, 0.101, 0.28, 0.0, 0.0, 0.018, 0.2...	294
1717621	4283991208	3172717375315987	10026622	334	13305822896521234	10014206	334	75794	1	4	66061.0	1.330582e+16	[0.052, 0.05, 0.002, 0.0, 0.021, 0.0, 0.039, 0...	294
1717623	4283991208	3172717375315987	10026622	334	1483775526015157	10026622	334	21742	1	26	10175.0	1.483776e+15	[0.267, 0.28, 0.003, 1.297, 0.28, 0.013, 0.162...	294
1717624	4283991208	3172717375315987	10026622	334	17527905451290635	10014936	334	94170	5	28	7548.0	1.752791e+16	[0.175, 3.27, 0.225, 1.059, 0.561, 0.019, 0.03...	294
1717625	4283991208	3172717375315987	10026622	334	225537933784404046	10005711	334	103261	1	5	80000.0	2.255379e+17	[0.651, 0.679, 0.0, 0.598, 0.215, 0.001, 0.04,...	294
1717627	4283991208	3172717375315987	10026622	334	226945303689633846	10011540	334	93076	2	13	83804.0	2.269453e+17	[0.078, 0.471, 0.0, 0.562, 0.04, 0.0, 0.169, 0...	294
1717628	4283991208	3172717375315987	10026622	334	23157348732739802	10026622	334	23232	2	23	15309.0	2.315735e+16	[0.123, 1.018, 0.0, 0.241, 0.037, 0.0, 0.794, ...	294
1717629	4283991208	3172717375315987	10026622	334	23157423020531740	10010458	334	95105	1	6	1458.0	2.315742e+16	[0.313, 0.473, 0.205, 0.15, 0.36, 0.041, 0.167...	294
1717631	4283991208	3172717375315987	10026622	334	243270852316745762	10011540	334	85515	4	81	2731.0	2.432709e+17	[0.064, 2.257, 0.057, 0.246, 0.024, 0.0, 0.27,...	294
1717632	4283991208	3172717375315987	10026622	334	247774475044470785	10026622	334	83427	1	3	61120.0	2.477745e+17	[0.016, 2.397, 0.092, 0.378, 0.223, 0.006, 0.5...	294
1717633	4283991208	3172717375315987	10026622	334	250307726756618325	10011540	334	93012	2	22	45725.0	2.503077e+17	[0.53, 0.019, 0.024, 0.297, 0.033, 0.0, 0.561,...	294
1717634	4283991208	3172717375315987	10026622	334	250870682408312845	10010782	334	100583	1	3	55919.0	2.508707e+17	[0.179, 0.112, 0.129, 3.025, 0.0, 0.277, 0.057...	294
1717635	4283991208	3172717375315987	10026622	334	27379572116287490	10026622	334	29001	2	28	9125.0	2.737957e+16	[0.128, 0.42, 0.208, 0.254, 0.288, 0.0, 0.622,...	294
1717636	4283991208	3172717375315987	10026622	334	290277175168847890	10014206	334	75201	7	132	23007.0	2.902772e+17	[0.109, 0.75, 0.0, 0.181, 0.045, 0.045, 0.0, 0...	294
1717637	4283991208	3172717375315987	10026622	334	295906672964997127	10011540	334	93126	1	3	61225.0	2.959067e+17	[0.047, 0.259, 0.0, 0.234, 0.0, 0.0, 0.264, 0....	294
1717638	4283991208	3172717375315987	10026622	334	298158464936345758	10008016	334	100428	2	42	10511.0	2.981585e+17	[0.0, 0.163, 0.901, 0.43, 0.351, 0.016, 1.724,...	294
1717639	4283991208	3172717375315987	10026622	334	299284391731855366	10014206	334	75661	1	4	32011.0	2.992844e+17	[0.102, 0.857, 0.46, 0.488, 0.001, 0.027, 0.0,...	294
1717640	4283991208	3172717375315987	10026622	334	301817662738743306	10026622	334	25487	1	3	87865.0	3.018177e+17	[0.208, 1.086, 0.011, 1.154, 0.195, 0.0, 0.03,...	294
1717641	4283991208	3172717375315987	10026622	334	309698873616236948	10021212	334	100076	4	17	19302.0	3.096989e+17	[0.097, 1.086, 0.233, 2.249, 0.264, 0.155, 0.4...	294
1717642	4283991208	3172717375315987	10026622	334	31038746813526032	10026622	334	14686	1	3	42189.0	3.103875e+16	[0.0, 0.87, 0.0, 0.427, 0.175, 0.068, 0.492, 0...	294
1717643	4283991208	3172717375315987	10026622	334	310543387016773641	10026622	334	25300	7	96	21854.0	3.105434e+17	[0.0, 0.995, 0.0, 0.565, 0.152, 0.006, 0.236, ...	294
1717644	4283991208	3172717375315987	10026622	334	310543401807400960	10012454	334	94139	5	73	60043.0	3.105434e+17	[0.11, 0.677, 0.149, 0.154, 0.0, 0.019, 0.204,...	294
1717645	4283991208	3172717375315987	10026622	334	310824846744035366	10011540	334	93271	1	3	52322.0

	user_id	buy_spu	buy_sn	buy_ct3	view_spu	view_sn	view_ct3	time_interval	view_cnt	view_seconds	index	spu_id	features	view_spu_count
1909703	6539296	4714500743357079552	10015678	334	230604513463791633	10028297	334	393376	2	38	17816.0	2.306045e+17	[0.09, 0.061, 0.0, 0.109, 0.486, 0.034, 0.108,...	67
1909704	6539296	4714500743357079552	10015678	334	238204332051542018	10028297	334	393455	1	4	1497.0	2.382043e+17	[0.048, 0.326, 0.063, 0.302, 0.178, 0.319, 0.8...	67
1909707	6539296	4714500743357079552	10015678	334	292529005121761410	10016791	334	280246	1	5	11737.0	2.925290e+17	[0.0, 1.925, 0.281, 0.025, 0.476, 0.0, 0.204, ...	67
1909708	6539296	4714500743357079552	10015678	334	296188153247072271	10001351	334	576605	1	20	38219.0	2.961882e+17	[0.234, 0.398, 0.178, 2.219, 0.321, 0.048, 0.0...	67
1909709	6539296	4714500743357079552	10015678	334	302662074473369605	10015636	334	264693	2	33	58662.0	3.026621e+17	[0.474, 0.197, 0.0, 0.978, 0.06, 0.0, 0.011, 0...	67
1909710	6539296	4714500743357079552	10015678	334	307165668905115683	10004555	334	430709	1	21	86666.0	3.071657e+17	[0.173, 0.031, 0.013, 2.349, 0.376, 0.106, 0.1...	67
1909711	6539296	4714500743357079552	10015678	334	307728624054161408	10015636	334	392614	1	11	53553.0	3.077286e+17	[0.044, 1.257, 0.0, 0.449, 0.333, 0.0, 0.649, ...	67
1909712	6539296	4714500743357079552	10015678	334	321239449850023952	10020640	334	479555	1	5	44680.0	3.212394e+17	[0.0, 0.0, 0.0, 0.005, 0.263, 0.0, 0.22, 0.174...	67
1909714	6539296	4714500743357079552	10015678	334	438051570214809601	10020640	334	479535	1	5	42373.0	4.380516e+17	[0.026, 0.954, 0.11, 0.632, 0.117, 0.113, 0.00...	67
1909715	6539296	4714500743357079552	10015678	334	446777294492839942	10020640	334	479581	1	5	770.0	4.467773e+17	[0.188, 0.451, 0.02, 0.174, 0.106, 0.024, 0.20...	67
1909719	6539296	4714500743357079552	10015678	334	6867784324568485888	10005149	334	264581	2	25	22329.0	6.867784e+18	[0.063, 0.309, 0.0, 0.36, 0.04, 0.044, 0.189, ...	67
1909723	6539296	4714500743357079552	10015678	334	89585515455168564	10015678	334	129900	1	4	25143.0	8.958552e+16	[0.114, 0.313, 0.025, 0.679, 0.127, 0.042, 1.1...	67
1909725	6539296	4714500743357079552	10015678	334	96622389872934948	10015678	334	39642	1	6	30247.0	9.662239e+16	[0.254, 0.25, 0.021, 1.455, 0.137, 0.005, 1.77...	67
1909727	6539296	88741090525036576	10015678	334	18935283870048281	10000351	334	576751	1	10	55708.0	1.893528e+16	[0.036, 0.25, 0.0, 1.639, 0.0, 0.061, 0.499, 0...	67
1909729	6539296	88741090525036576	10015678	334	23438889895616669	10004555	334	430998	2	27	72492.0	2.343889e+16	[0.13, 0.409, 0.595, 1.369, 0.103, 0.024, 0.24...	67
1909730	6539296	88741090525036576	10015678	334	23438905780764691	10015678	334	129763	2	7	19058.0	2.343891e+16	[0.074, 1.258, 0.029, 0.001, 0.567, 0.014, 0.8...	67
1909731	6539296	88741090525036576	10015678	334	238204332051542018	10028297	334	393467	1	4	1497.0	2.382043e+17	[0.048, 0.326, 0.063, 0.302, 0.178, 0.319, 0.8...	67
1909732	6539296	88741090525036576	10015678	334	29068390742786093	10012635	334	576642	1	12	33210.0	2.906839e+16	[0.666, 1.926, 0.006, 0.425, 0.864, 0.063, 0.3...	67
1909733	6539296	88741090525036576	10015678	334	314765487213334554	10004555	334	430617	1	21	87956.0	3.147655e+17	[0.177, 0.294, 0.635, 1.745, 0.026, 0.25, 0.0,...	67
1909735	6539296	88741090525036576	10015678	334	437770078308651009	10000683	334	576671	1	12	72995.0	4.377701e+17	[0.068, 0.641, 0.0, 2.952, 0.499, 0.134, 0.0, ...	67
1909736	6539296	88741090525036576	10015678	334	450436452539129891	10003719	334	576825	2	33	52617.0	4.504365e+17	[0.43, 0.137, 0.0, 0.01, 0.006, 0.0, 0.039, 0....	67
1909737	6539296	88741090525036576	10015678	334	453532675442028551	10020640	334	479560	1	3	4512.0	4.535327e+17	[0.489, 0.432, 0.059, 0.128, 0.097, 0.298, 0.0...	67
1909738	6539296	88741090525036576	10015678	334	468450827688092121	10003301	334	576541	2	20	21012.0	4.684508e