In [6]:
    
import sys 
import os
sys.path.append(os.getcwd()+'/../')
# other
import numpy as np
import glob
import pandas as pd
import ntpath
#keras
from keras.preprocessing import image
# plotting
import seaborn as sns
sns.set_style('white')
import matplotlib.pyplot as plt
%matplotlib inline
# debuggin
from IPython.core.debugger import Tracer
#stats
import scipy.stats as stats
import bqplot.pyplot as bqplt
    
    
In [10]:
    
# Raw view/buy log: tab-separated, read without a header row.
user_profile = pd.read_csv(
    '../data_user_view_buy/user_profile.csv',
    sep='\t',
    header=None,
)
    
In [11]:
    
# Name the ten positional columns: the user, the bought item, the viewed item,
# and the per-view measurements.
user_profile.columns = [
    'user_id',
    'buy_spu', 'buy_sn', 'buy_ct3',
    'view_spu', 'view_sn', 'view_ct3',
    'time_interval', 'view_cnt', 'view_seconds',
]
    
In [11]:
    
# Inspect one raw buy_spu id and show it split after its first seven characters.
# `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
string = str(user_profile.buy_spu.values[3002])
print(string)
print(string[0:7]+'-'+string[7:])
    
    
In [12]:
    
# Preview the first ten rows of the view/buy log.
user_profile.head(10)
    
    Out[12]:
In [14]:
    
# Row count of the view/buy log. The .format() call must be applied to the string
# inside print(...): with the Python 3 print function the original form called
# .format on print's return value (None) and raised AttributeError.
print('n rows: {0}'.format(len(user_profile)))
    
    
In [20]:
    
def plot_trajectory_scatter(user_profile,scatter_color_col=None,samplesize=50,size=10,savedir=None):
    """Scatter-plot the view events of randomly sampled users, one row per user.

    Each sampled user gets one horizontal row (y = position in the sample); each
    of that user's records is a dot at x = -time_interval/86400, so the vertical
    line at x = 0 marks the purchase.

    Parameters
    ----------
    user_profile : pandas.DataFrame
        Must contain `user_id` and `time_interval` columns (time_interval is
        presumably in seconds, given the conversion to days -- confirm).
    scatter_color_col : str or None
        Optional column whose values colour the dots; all dots share one colour
        when None.
    samplesize : int
        Number of user ids drawn (with replacement) from the unique ids; also
        sets the figure height (samplesize / 10 inches).
    size : number
        Marker size passed to `plt.scatter`.
    savedir : str or None
        If given, the finished figure is written to this path at 100 dpi.
    """
    # Float division keeps the height identical on Python 2 and Python 3.
    plt.figure(figsize=(12, samplesize/10.0))

    # Use one colour scale for the whole figure. Without explicit limits every
    # scatter call rescales to its own user's values, so equal values could be
    # drawn in different colours on different rows.
    color_limits = {}
    if scatter_color_col is not None:
        color_column = user_profile[scatter_color_col]
        if color_column.dtype.kind in 'biuf':
            color_limits = {'vmin': float(color_column.min()),
                            'vmax': float(color_column.max())}

    sampled_user_ids = np.random.choice(user_profile.user_id.unique(), samplesize)
    for row_index, sampled_user_id in enumerate(sampled_user_ids):
        trajectory = user_profile.loc[user_profile.user_id == sampled_user_id]
        # `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
        days_before_purchase = 0 - trajectory.time_interval.values/60.0/60.0/24.0

        if scatter_color_col is not None:
            colors = trajectory[scatter_color_col].values
        else:
            colors = np.ones(len(trajectory))

        plt.scatter(days_before_purchase, np.ones(len(days_before_purchase))*row_index,
                    s=size, c=colors, edgecolors="none", cmap="jet", **color_limits)

    # Figure-level decoration and saving happen once, not once per sampled user.
    plt.axvline(x=0, linewidth=1)
    sns.despine()
    plt.title('example user trajectories')
    plt.xlabel('days to purchase')
    if savedir is not None:
        plt.savefig(savedir, dpi=100)
    
In [13]:
    
# Summary statistics for the columns of the view/buy log.
user_profile.describe()
    
    Out[13]:
In [14]:
    
# Number of distinct values in each id column. The string is formatted before it
# is handed to print(); the original `print(...).format(...)` only worked with the
# Python 2 print statement and raises AttributeError under Python 3.
unique_count_columns = [
    ('unique users', 'user_id'),
    ('unique items viewed', 'view_spu'),
    ('unique items bought', 'buy_spu'),
    ('unique categories viewed', 'view_ct3'),
    ('unique categories bought', 'buy_ct3'),
    ('unique brands viewed', 'view_sn'),
    ('unique brands bought', 'buy_sn'),
]
for label, column in unique_count_columns:
    print('{0}:{1}'.format(label, len(user_profile[column].unique())))
    
    
In [15]:
    
# Histograms of a random sample (drawn with replacement) from three per-row measures.
# `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
samplesize = 2000
histogram_panels = [
    (user_profile.time_interval.values/60.0/60.0,
     'sample histogram from "time interval"', 'hours from view to buy'),
    (user_profile.view_cnt.values,
     'sample histogram from "view count"', 'view counts'),
    (user_profile.view_seconds.values,
     'sample histogram from "view lengths"', 'view lengths (seconds)'),
]
plt.figure(figsize=(12,4))
for panel_number, (panel_values, panel_title, panel_xlabel) in enumerate(histogram_panels, 1):
    plt.subplot(1,3,panel_number)
    plt.hist(np.random.choice(panel_values,samplesize))
    sns.despine()
    plt.title(panel_title)
    plt.xlabel(panel_xlabel)
    plt.ylabel('counts of items')
    
    Out[15]:
    
In [16]:
    
# Range of time_interval. The first value is the minimum, so it is labelled
# "shortest" (the original printed "longest" for both). The two values are in
# different units: the minimum is printed raw, the maximum is divided by
# 60*60*24, so each label states its unit (raw values presumably seconds -- confirm).
print('shortest time interval (seconds)')
print(user_profile.time_interval.min())
print('longest time interval (days)')
print(user_profile.time_interval.max()/60.0/60.0/24)
    
    
In [17]:
    
# Mean time_interval for a random sample (with replacement) of user ids.
# One groupby pass replaces a full-frame boolean mask per sampled user and an
# np.append per iteration (which reallocates the array every time).
samplesize = 1000
sampled_user_ids = np.random.choice(user_profile.user_id.unique(), samplesize)
mean_interval_by_user = user_profile.groupby('user_id')['time_interval'].mean()
# reindex repeats users that were drawn more than once and yields NaN for an id
# with no group, matching the mean of an empty selection in the loop version.
mean_time_interval = mean_interval_by_user.reindex(sampled_user_ids).values
    
In [18]:
    
# Histogram of the per-user mean time_interval, divided by 60 and labelled in minutes.
mean_interval_minutes = mean_time_interval/60.0
plt.figure(figsize=(12,3))
plt.hist(mean_interval_minutes, bins=200)
sns.despine()
plt.title('sample histogram of average length for user trajectories"')
plt.xlabel('minutes')
plt.ylabel('counts of items out of {0}'.format(samplesize))
    
    Out[18]:
    
In [19]:
    
# Same histogram with finer bins, zoomed to the first 100 minutes on the x axis.
mean_interval_minutes = mean_time_interval/60.0
plt.figure(figsize=(12,3))
plt.hist(mean_interval_minutes, bins=1000)
plt.xlim(0,100)
sns.despine()
plt.title('sample histogram of average length for user trajectories"')
plt.xlabel('minutes')
plt.ylabel('counts of items out of {0}'.format(samplesize))
    
    Out[19]:
    
In [20]:
    
# Empirical CDF of the per-user mean time_interval (in minutes).
# `density=True` replaces `normed=True`, which was removed from hist in
# Matplotlib 3.1 (`density` needs Matplotlib >= 2.1).
plt.figure(figsize=(8,3))
plt.hist(mean_time_interval/60.0,bins=200,cumulative=True,density=True)
plt.xlim(0,2000)
sns.despine()
plt.title('sample cdf of average length for user trajectories"')
plt.xlabel('minutes')
# The y axis of a normalised cumulative histogram is a fraction, not a count.
plt.ylabel('cumulative fraction of users (sample of '+str(samplesize)+')')
    
    Out[20]:
    
In [21]:
    
# All rows for one hand-picked user, ordered from the largest time_interval to the smallest.
user_id = 1606682799
rows_for_user = user_profile.user_id == user_id
trajectory = user_profile.loc[rows_for_user].sort_values(by='time_interval', ascending=False)
trajectory
    
    Out[21]:
In [115]:
    
# Draw trajectories with the function defaults (50 sampled users, no colour column, not saved).
plot_trajectory_scatter(user_profile)
    
    
In [20]:
    
# For a random sample (with replacement) of viewed items, count distinct buyers and view rows.
samplesize = 1000
sampled_items = np.random.choice(user_profile.view_spu.unique(), samplesize)
number_of_times_item_bought = np.empty(samplesize)
number_of_times_item_viewed = np.empty(samplesize)
for slot, sampled_item in enumerate(sampled_items):
    buyers_of_item = user_profile.loc[user_profile.buy_spu == sampled_item, 'user_id']
    view_rows_of_item = user_profile.loc[user_profile.view_spu == sampled_item]
    # assume the same user would not buy the same product
    number_of_times_item_bought[slot] = len(buyers_of_item.unique())
    # same user can view the same image more than once for this count
    number_of_times_item_viewed[slot] = len(view_rows_of_item)
    
In [39]:
    
# Purchase popularity of the sampled items: per-item bars (left) and distribution (right).
item_positions = np.arange(len(number_of_times_item_bought))
plt.figure(figsize=(12,4))

plt.subplot(1,2,1)
plt.bar(item_positions, number_of_times_item_bought)
sns.despine()
plt.title('item popularity (purchases)')
plt.xlabel('item')
plt.ylabel('# of times items were bought')

plt.subplot(1,2,2)
plt.hist(number_of_times_item_bought, bins=100)
sns.despine()
plt.title('item popularity (purchases)')
plt.xlabel('# of times items were bought sample size={0}'.format(samplesize))
plt.ylabel('# of items')
    
    Out[39]:
    
In [38]:
    
# View popularity of the sampled items: per-item bars (left) and distribution (right).
plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
plt.bar(np.arange(len(number_of_times_item_viewed)),number_of_times_item_viewed)
sns.despine()
plt.title('item popularity (views)')
plt.xlabel('item')
plt.ylabel('# of times items were viewed')
plt.subplot(1,2,2)
# Histogram the *view* counts; the original plotted number_of_times_item_bought
# here (copied from the purchases cell), contradicting the title and axis labels.
plt.hist(number_of_times_item_viewed,bins=100)
sns.despine()
plt.title('item popularity (views) sample size='+str(samplesize))
plt.xlabel('# of times items were viewed')
plt.ylabel('# of items')
    
    Out[38]:
    
In [37]:
    
# Views vs. purchases per sampled item, restricted to items bought fewer than `thresh` times,
# with the Pearson correlation of the plotted points shown in the title.
thresh = 30
include = number_of_times_item_bought < thresh
views_kept = number_of_times_item_viewed[include]
buys_kept = number_of_times_item_bought[include]

plt.figure(figsize=(6,4))
plt.subplot(1,1,1)
plt.scatter(views_kept, buys_kept)
r, p = stats.pearsonr(views_kept, buys_kept)
sns.despine()
plt.xlabel('number of times viewed')
plt.ylabel('number of times bought')
plt.title('r={0} data truncated buys<{1}'.format(np.round(r,2), thresh))
    
    Out[37]:
    
In [9]:
    
# For a random sample (with replacement) of users, count distinct items bought and viewed.
samplesize = 1000
sampled_users = np.random.choice(user_profile.user_id.unique(), samplesize)
items_bought_per_user = np.empty(samplesize)
items_viewed_per_user = np.empty(samplesize)
for ui, user_id in enumerate(sampled_users):
    rows_for_user = user_profile.user_id == user_id
    items_bought_per_user[ui] = len(user_profile.loc[rows_for_user, 'buy_spu'].unique())
    items_viewed_per_user[ui] = len(user_profile.loc[rows_for_user, 'view_spu'].unique())
    
In [11]:
    
# Distribution of distinct items bought (left) and viewed (right) per sampled user.
plt.figure(figsize=(12,4))

plt.subplot(1,2,1)
plt.hist(items_bought_per_user)
plt.title('number of items bought per user (sample of 1000)')
plt.xlabel('# items bought')
plt.ylabel('# users')
sns.despine()

plt.subplot(1,2,2)
plt.hist(items_viewed_per_user)
plt.title('number of items viewed per user (sample of 1000)')
plt.xlabel('# items viewed')
sns.despine()
plt.ylabel('# users')
    
    Out[11]:
    
In [ ]:
    
    
In [65]:
    
# Image-URL table: read without a header row, then the two columns are named spu / url.
urls = pd.read_csv('../../deep-learning-models-master/img/eval_img_url.csv',header=None)
urls.columns = ['spu','url']
# Row count, followed by a preview of the first ten rows.
print(len(urls))
urls.head(10)
    
    
    Out[65]:
In [77]:
    
# Number of url entries per spu (first five groups only).
urls[['spu','url']].groupby(['spu']).agg(['count']).head()
    
    Out[77]:
In [73]:
    
# URLs recorded for one spu. `.values` replaces `.as_matrix()` (removed in pandas 1.0).
urls.loc[urls.spu==357870273655002,'url'].values
    
    Out[73]:
In [76]:
    
# URLs recorded for a second spu. `.values` replaces `.as_matrix()` (removed in pandas 1.0).
urls.loc[urls.spu==357889732772303,'url'].values
    
    Out[76]:
In [82]:
    
#urls.loc[urls.spu==1016200950427238422,'url']
    
In [84]:
    
# URLs for the spu whose images are downloaded and displayed in the following cells.
# `.values` replaces `.as_matrix()` (removed in pandas 1.0).
tmp_urls = urls.loc[urls.spu==1016200950427238422,'url'].values
tmp_urls
    
    Out[84]:
In [83]:
    
# urlretrieve lives in urllib.request on Python 3 and directly in urllib on Python 2;
# try the Python 3 location first and fall back so the notebook runs on both.
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve
import time
    
In [88]:
    
# scrape images: download every url in tmp_urls to ../data_img_tmp/<position>.jpg
for image_index, image_url in enumerate(tmp_urls):
    destination = '../data_img_tmp/{}.jpg'.format(image_index)
    urlretrieve(image_url, destination)
    #time.sleep(3)
    
In [96]:
    
# Show the downloaded images side by side, resized to 224x224 on load.
print('two images from url with same spu (ugh)')
plt.figure(figsize=(8,3))
for i,tmp_url in enumerate(tmp_urls):
    img_path= '../data_img_tmp/{}.jpg'.format(i)
    img = image.load_img(img_path, target_size=(224, 224))
    plt.subplot(1,len(tmp_urls),i+1)
    plt.imshow(img)
    # Pass the flag positionally: the `b=` keyword was renamed to `visible=` in
    # newer Matplotlib releases, while the positional form works in all versions.
    plt.grid(False)
    
    
    
In [78]:
    
# spu value stored under index label 0 of the url table.
urls.spu[0]
    
    Out[78]:
In [54]:
    
# url value stored under index label 0 of the url table.
urls.url[0]
    
    Out[54]:
In [51]:
    
# Fraction of distinct viewed items that have at least one entry in the url table.
view_spus = user_profile.view_spu.unique()
contained = 0
# A set gives constant-time membership tests; the original list was scanned in
# full for every viewed item. `.values` replaces the removed `.as_matrix()`.
spus_with_url = set(urls.spu.values)
for view_spu in view_spus:
    if view_spu in spus_with_url:
        contained+=1
# Built-in float replaces the np.float alias, which was removed in NumPy 1.24.
print(contained/float(len(view_spus)))
    
    
In [53]:
    
# Fraction of distinct bought items that have at least one entry in the url table.
buy_spus = user_profile.buy_spu.unique()
contained = 0
# A set gives constant-time membership tests; the original list was scanned in
# full for every bought item. `.values` replaces the removed `.as_matrix()`.
spus_with_url = set(urls.spu.values)
for buy_spu in buy_spus:
    if buy_spu in spus_with_url:
        contained+=1
# Built-in float replaces the np.float alias, which was removed in NumPy 1.24.
print(contained/float(len(buy_spus)))
    
    
In [64]:
    
# NOTE(review): `buy_spu` is not assigned in this cell; it looks like the value left
# over from the last iteration of an earlier `for buy_spu in buy_spus` loop -- confirm.
buy_spu in spus_with_url
    
    Out[64]:
In [58]:
    
# Distinct spus in the url table vs. distinct viewed spus. Returned as a pair:
# a cell only displays its last expression, so the first count was silently discarded.
len(urls.spu.unique()), len(user_profile.view_spu.unique())
    
    Out[58]:
In [3]:
    
# Per-item feature table loaded from a pickle.
# NOTE(review): unpickling executes arbitrary code; only load this file from a trusted source.
spu_fea = pd.read_pickle("../data_nn_features/spu_fea.pkl") #takes forever to load
    
In [4]:
    
# Copy spu_id into a column named view_spu so the feature table has a column of that name.
spu_fea['view_spu']=spu_fea['spu_id']
    
In [12]:
    
# Left-join item features onto the view/buy log by view_spu and compare row counts
# before and after the merge.
spu_fea['view_spu']=spu_fea['spu_id']
user_profile_w_features = user_profile.merge(spu_fea,on='view_spu',how='left')
# .format() is applied to the string inside print(...); the original
# `print(...).format(...)` raises AttributeError with the Python 3 print function.
print('before merge nrow: {0}'.format(len(user_profile)))
print('after merge nrows:{0}'.format(len(user_profile_w_features)))
    
    
In [13]:
    
# Row count of the feature table. .format() is applied to the string inside
# print(...); the original form raises AttributeError with the Python 3 print function.
print('number of items with features: {0}'.format(len(spu_fea)))
    
    
In [14]:
    
# Preview the first rows of the feature table.
spu_fea.head()
    
    Out[14]:
In [15]:
    
# merge with userdata: left-join item features onto the view/buy log by view_spu
# and compare row counts before and after the merge.
spu_fea['view_spu']=spu_fea['spu_id']
user_profile_w_features = user_profile.merge(spu_fea,on='view_spu',how='left')
# .format() is applied to the string inside print(...); the original
# `print(...).format(...)` raises AttributeError with the Python 3 print function.
print('before merge nrow: {0}'.format(len(user_profile)))
print('after merge nrows:{0}'.format(len(user_profile_w_features)))
    
    
In [16]:
    
# Flag rows whose viewed item matched a row of the feature table. After the left
# merge, spu_id is missing exactly for unmatched rows, so "has features" is
# notnull(). The original stored np.isnan(spu_id), which is True for rows
# *without* features (inverted flag), and wrapped it in a groupby/apply that is
# not needed for an element-wise test.
user_profile_w_features['has_features'] = user_profile_w_features['spu_id'].notnull()
    
In [130]:
    
# Store the flag as integers so it can be passed as numeric colour values when plotting.
user_profile_w_features['has_features'] = user_profile_w_features['has_features'].astype('int')
    
In [131]:
    
# Preview the merged table, including the has_features column.
user_profile_w_features.head()
    
    Out[131]:
In [154]:
    
# Trajectories of 100 sampled users, dots coloured by has_features; the figure is saved to ../../test.png.
plot_trajectory_scatter(user_profile_w_features,scatter_color_col='has_features',samplesize=100,size=10,savedir='../../test.png')
    
    
In [108]:
    
# Fraction of merged rows whose 'features' value is not null
# (the 'features' column presumably comes from spu_fea -- confirm).
1-(user_profile_w_features['features'].isnull()).mean()
    
    Out[108]:
In [116]:
    
# np.isnan marks rows whose spu_id is missing; one minus the mean of that mask is
# the fraction of rows with an spu_id present.
# NOTE(review): the groupby looks unnecessary for an element-wise test -- confirm before simplifying.
1-user_profile_w_features.groupby(['view_spu'])['spu_id'].apply(lambda x: np.isnan(x)).mean()
    
    Out[116]:
In [27]:
    
# Fraction of distinct bought items that appear in the feature table.
buy_spus = user_profile.buy_spu.unique()
contained = 0
# A set gives constant-time membership tests; the original list was scanned in
# full for every bought item. `.values` replaces the removed `.as_matrix()`.
spus_with_features = set(spu_fea.spu_id.values)
for buy_spu in buy_spus:
    if buy_spu in spus_with_features:
        contained+=1
# Built-in float replaces the np.float alias, which was removed in NumPy 1.24.
print(contained/float(len(buy_spus)))
    
    
In [28]:
    
# Display the current value of the `contained` counter.
contained
    
    Out[28]:
In [29]:
    
# Number of entries in buy_spus.
len(buy_spus)
    
    Out[29]:
In [30]:
    
# Fraction of distinct viewed items that appear in the feature table.
view_spus = user_profile.view_spu.unique()
contained = 0
# A set gives constant-time membership tests; the original list was scanned in
# full for every viewed item. `.values` replaces the removed `.as_matrix()`.
spus_with_features = set(spu_fea.spu_id.values)
for view_spu in view_spus:
    if view_spu in spus_with_features:
        contained+=1
# Built-in float replaces the np.float alias, which was removed in NumPy 1.24.
print(contained/float(len(view_spus)))
    
    
In [31]:
    
# Number of entries in view_spus.
len(view_spus)
    
    Out[31]:
In [7]:
    
# Rebinds `user_profile` to a pickled table; cells run after this one see this table
# instead of the one read from the CSV.
# NOTE(review): unpickling executes arbitrary code; only load this file from a trusted source.
user_profile = pd.read_pickle('../data_user_view_buy/user_profile_items_nonnull_features_20_mins_5_views.pkl')
    
In [8]:
    
# Row count of the table currently bound to user_profile.
len(user_profile)
    
    Out[8]:
In [9]:
    
# Number of distinct values in each id column of the current user_profile table.
# The string is formatted before it is handed to print(); the original
# `print(...).format(...)` raises AttributeError with the Python 3 print function.
unique_count_columns = [
    ('unique users', 'user_id'),
    ('unique items viewed', 'view_spu'),
    ('unique items bought', 'buy_spu'),
    ('unique categories viewed', 'view_ct3'),
    ('unique categories bought', 'buy_ct3'),
    ('unique brands viewed', 'view_sn'),
    ('unique brands bought', 'buy_sn'),
]
for label, column in unique_count_columns:
    print('{0}:{1}'.format(label, len(user_profile[column].unique())))
    
    
In [10]:
    
#user_profile.groupby(['user_id'])['buy_spu'].nunique()
    
In [11]:
    
# how many items bought per user in this dataset?
# `normed=False` is dropped: it was the default, and the keyword was removed from
# hist in Matplotlib 3.1, where passing it is an error.
plt.figure(figsize=(8,3))
plt.hist(user_profile.groupby(['user_id'])['buy_spu'].nunique(),bins=20)
sns.despine()
plt.xlabel('number of items bought per user')
plt.ylabel('number of user')
    
    Out[11]:
    
In [12]:
    
# First rows for one hand-picked user id.
user_profile.loc[user_profile.user_id==4283991208,].head()
    
    Out[12]:
In [13]:
    
# First rows for a second hand-picked user id.
user_profile.loc[user_profile.user_id==6539296,].head()
    
In [14]:
    
# Trajectories of 100 sampled users from the current user_profile table, saved under ../figures/.
plot_trajectory_scatter(user_profile,samplesize=100,size=10,savedir='../figures/trajectories_evaluation_dataset.png')
    
    
In [15]:
    
# Rebinds `user_profile` again, this time to the "v2_sample1000" pickle; cells run after
# this one see this table.
# NOTE(review): unpickling executes arbitrary code; only load this file from a trusted source.
user_profile = pd.read_pickle('../data_user_view_buy/user_profile_items_nonnull_features_20_mins_5_views_v2_sample1000.pkl')
    
In [16]:
    
# Number of distinct values in each id column of the current user_profile table.
# The string is formatted before it is handed to print(); the original
# `print(...).format(...)` raises AttributeError with the Python 3 print function.
unique_count_columns = [
    ('unique users', 'user_id'),
    ('unique items viewed', 'view_spu'),
    ('unique items bought', 'buy_spu'),
    ('unique categories viewed', 'view_ct3'),
    ('unique categories bought', 'buy_ct3'),
    ('unique brands viewed', 'view_sn'),
    ('unique brands bought', 'buy_sn'),
]
for label, column in unique_count_columns:
    print('{0}:{1}'.format(label, len(user_profile[column].unique())))
    
    
In [17]:
    
# how many items bought per user in this dataset?
# `normed=False` is dropped: it was the default, and the keyword was removed from
# hist in Matplotlib 3.1, where passing it is an error.
plt.figure(figsize=(8,3))
plt.hist(user_profile.groupby(['user_id'])['buy_spu'].nunique(),bins=20)
sns.despine()
plt.xlabel('number of items bought per user')
plt.ylabel('number of user')
    
    Out[17]:
    
In [1]:
    
%%bash 
# Export this notebook as slides and as HTML, moving each result into its output folder under a _v2 name.
jupyter nbconvert --to slides Exploring_Data.ipynb && mv Exploring_Data.slides.html ../notebook_slides/Exploring_Data_v2.slides.html
jupyter nbconvert --to html Exploring_Data.ipynb && mv Exploring_Data.html ../notebook_htmls/Exploring_Data_v2.html
# Keep a versioned copy of the notebook itself.
cp Exploring_Data.ipynb ../notebook_versions/Exploring_Data_v2.ipynb
    
    
In [38]:
    
# push to s3: upload the exported HTML and slides via the project's s3_data_management helper.
import sys
import os
sys.path.append(os.getcwd()+'/../')
from src import s3_data_management
# NOTE(review): these calls push the _v1 exports, while the export cell in this
# notebook writes _v2 files -- confirm which version is meant to be uploaded.
s3_data_management.push_results_to_s3('Exploring_Data_v1.html','../notebook_htmls/Exploring_Data_v1.html')
# First argument corrected from 'Exporing_...' to 'Exploring_...' so it matches the
# local file name and the naming used in the call above.
s3_data_management.push_results_to_s3('Exploring_Data_v1.slides.html','../notebook_slides/Exploring_Data_v1.slides.html')