In [6]:
import sys
import os
sys.path.append(os.getcwd()+'/../')
# other
import numpy as np
import glob
import pandas as pd
import ntpath
#keras
from keras.preprocessing import image
# plotting
import seaborn as sns
sns.set_style('white')
import matplotlib.pyplot as plt
%matplotlib inline
# debugging
from IPython.core.debugger import Tracer
#stats
import scipy.stats as stats
import bqplot.pyplot as bqplt
In [10]:
user_profile = pd.read_csv('../data_user_view_buy/user_profile.csv',sep='\t',header=None)
In [11]:
user_profile.columns = ['user_id','buy_spu','buy_sn','buy_ct3','view_spu','view_sn','view_ct3','time_interval','view_cnt','view_seconds']
In [11]:
string =str(user_profile.buy_spu.as_matrix()[3002])
print(string)
print(string[0:7]+'-'+string[7::])
#print(str(user_profile.buy_spu.as_matrix()[0])[7::])
In [12]:
user_profile.head(10)
Out[12]:
In [14]:
print('n rows: {0}'.format(len(user_profile)))
In [20]:
def plot_trajectory_scatter(user_profile,scatter_color_col=None,samplesize=50,size=10,savedir=None):
    plt.figure(figsize=(12,1*samplesize/10))
    for ui,user_id in enumerate(np.random.choice(user_profile.user_id.unique(),samplesize)):
        trajectory = user_profile.loc[user_profile.user_id==user_id,]
        time = 0-trajectory.time_interval.as_matrix()/60.0/60.0/24.0
        # color points by the given column, otherwise use a constant color
        if scatter_color_col is not None:
            c = trajectory[scatter_color_col].as_matrix()
        else:
            c = np.ones(len(trajectory))
        plt.scatter(time,np.ones(len(time))*ui,s=size,c=c,edgecolors="none",cmap="jet")
    plt.axvline(x=0,linewidth=1)
    sns.despine()
    plt.title('example user trajectories')
    plt.xlabel('days to purchase')
    if savedir is not None:
        plt.savefig(savedir,dpi=100)
In [13]:
user_profile.describe()
Out[13]:
In [14]:
print('unique users: {0}'.format(len(user_profile.user_id.unique())))
print('unique items viewed: {0}'.format(len(user_profile.view_spu.unique())))
print('unique items bought: {0}'.format(len(user_profile.buy_spu.unique())))
print('unique categories viewed: {0}'.format(len(user_profile.view_ct3.unique())))
print('unique categories bought: {0}'.format(len(user_profile.buy_ct3.unique())))
print('unique brands viewed: {0}'.format(len(user_profile.view_sn.unique())))
print('unique brands bought: {0}'.format(len(user_profile.buy_sn.unique())))
In [15]:
samplesize = 2000
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
plt.hist(np.random.choice(user_profile.time_interval.as_matrix()/60.0/60.0,samplesize))
sns.despine()
plt.title('sample histogram from "time interval"')
plt.xlabel('hours from view to buy')
plt.ylabel('counts of items')
plt.subplot(1,3,2)
plt.hist(np.random.choice(user_profile.view_cnt.as_matrix(),samplesize))
sns.despine()
plt.title('sample histogram from "view count"')
plt.xlabel('view counts')
plt.ylabel('counts of items')
plt.subplot(1,3,3)
plt.hist(np.random.choice(user_profile.view_seconds.as_matrix(),samplesize))
sns.despine()
plt.title('sample histogram from "view lengths"')
plt.xlabel('view lengths (seconds)')
plt.ylabel('counts of items')
Out[15]:
In [16]:
print('shortest time interval (seconds)')
print(user_profile.time_interval.min())
print('longest time interval (days)')
print(user_profile.time_interval.max()/60.0/60.0/24)
In [17]:
mean_time_interval = np.array([])
samplesize =1000
for user_id in np.random.choice(user_profile.user_id.unique(),samplesize):
    mean_time_interval = np.append(mean_time_interval, user_profile.loc[user_profile.user_id==user_id,'time_interval'].mean())
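As a side note, the same statistic can be computed for every user in one pass; a minimal sketch using groupby (assuming time_interval is in seconds, as above):

# Sketch: mean view-to-buy interval per user, vectorized instead of sampled in a loop.
mean_time_interval_all = user_profile.groupby('user_id')['time_interval'].mean()
print('users covered: {0}'.format(len(mean_time_interval_all)))
print('overall mean (minutes): {0:.1f}'.format(mean_time_interval_all.mean()/60.0))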
In [18]:
plt.figure(figsize=(12,3))
plt.hist(mean_time_interval/60.0,bins=200)
sns.despine()
plt.title('sample histogram of average length of user trajectories')
plt.xlabel('minutes')
plt.ylabel('counts of items out of '+str(samplesize))
Out[18]:
In [19]:
plt.figure(figsize=(12,3))
plt.hist(mean_time_interval/60.0,bins=1000)
plt.xlim(0,100)
sns.despine()
plt.title('sample histogram of average length of user trajectories')
plt.xlabel('minutes')
plt.ylabel('counts of items out of '+str(samplesize))
Out[19]:
In [20]:
plt.figure(figsize=(8,3))
plt.hist(mean_time_interval/60.0,bins=200,cumulative=True,normed=True)
plt.xlim(0,2000)
sns.despine()
plt.title('sample cdf of average length of user trajectories')
plt.xlabel('minutes')
plt.ylabel('counts of items out of '+str(samplesize))
Out[20]:
In [21]:
user_id = 1606682799
trajectory = user_profile.loc[user_profile.user_id==user_id,]
trajectory= trajectory.sort_values(by='time_interval',ascending=False)
trajectory
Out[21]:
In [115]:
plot_trajectory_scatter(user_profile)
In [20]:
samplesize =1000
number_of_times_item_bought = np.empty(samplesize)
number_of_times_item_viewed = np.empty(samplesize)
for ii,item_id in enumerate(np.random.choice(user_profile.view_spu.unique(),samplesize)):
    number_of_times_item_bought[ii] = len(user_profile.loc[user_profile.buy_spu==item_id,'user_id'].unique()) # assume the same user would not buy the same product
    number_of_times_item_viewed[ii] = len(user_profile.loc[user_profile.view_spu==item_id]) # same user can view the same item more than once for this count
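For reference, the same popularity counts can be sketched for the full catalog with groupby rather than a sampling loop (column names assumed as above; the purchase count deduplicates users per item, the view count keeps repeat views):

# Sketch: per-item popularity over all items.
buys_per_item = user_profile.groupby('buy_spu')['user_id'].nunique()  # unique buyers per item
views_per_item = user_profile.groupby('view_spu').size()              # view rows per item, repeats included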
In [39]:
plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
plt.bar(np.arange(len(number_of_times_item_bought)),number_of_times_item_bought)
sns.despine()
plt.title('item popularity (purchases)')
plt.xlabel('item')
plt.ylabel('# of times items were bought')
plt.subplot(1,2,2)
plt.hist(number_of_times_item_bought,bins=100)
sns.despine()
plt.title('item popularity (purchases)')
plt.xlabel('# of times items were bought (sample size='+str(samplesize)+')')
plt.ylabel('# of items')
Out[39]:
In [38]:
plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
plt.bar(np.arange(len(number_of_times_item_viewed)),number_of_times_item_viewed)
sns.despine()
plt.title('item popularity (views)')
plt.xlabel('item')
plt.ylabel('# of times items were viewed')
plt.subplot(1,2,2)
plt.hist(number_of_times_item_viewed,bins=100)
sns.despine()
plt.title('item popularity (views), sample size='+str(samplesize))
plt.xlabel('# of times items were viewed')
plt.ylabel('# of items')
Out[38]:
In [37]:
plt.figure(figsize=(6,4))
plt.subplot(1,1,1)
thresh =30
include = number_of_times_item_bought<thresh
plt.scatter(number_of_times_item_viewed[include],number_of_times_item_bought[include],)
(r,p) = stats.pearsonr(number_of_times_item_viewed[include],number_of_times_item_bought[include])
sns.despine()
plt.xlabel('number of times viewed')
plt.ylabel('number of times bought')
plt.title('r='+str(np.round(r,2))+' data truncated buys<'+str(thresh))
Out[37]:
In [9]:
samplesize =1000
items_bought_per_user = np.empty(samplesize)
items_viewed_per_user = np.empty(samplesize)
for ui,user_id in enumerate(np.random.choice(user_profile.user_id.unique(),samplesize)):
    items_bought_per_user[ui] = len(user_profile.loc[user_profile.user_id==user_id,'buy_spu'].unique())
    items_viewed_per_user[ui] = len(user_profile.loc[user_profile.user_id==user_id,'view_spu'].unique())
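The per-user counts can also be sketched without sampling, using groupby with nunique (column names assumed as above):

# Sketch: unique items bought / viewed per user over the full table.
per_user = user_profile.groupby('user_id').agg({'buy_spu': 'nunique', 'view_spu': 'nunique'})
print(per_user.describe())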
In [11]:
plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
plt.hist(items_bought_per_user)
sns.despine()
plt.title('number of items bought per user (sample of 1000)')
plt.xlabel('# items bought')
plt.ylabel('# users')
plt.subplot(1,2,2)
plt.hist(items_viewed_per_user)
sns.despine()
plt.title('number of items viewed per user (sample of 1000)')
plt.xlabel('# items viewed')
plt.ylabel('# users')
Out[11]:
In [ ]:
In [65]:
urls = pd.read_csv('../../deep-learning-models-master/img/eval_img_url.csv',header=None)
urls.columns = ['spu','url']
print(len(urls))
urls.head(10)
Out[65]:
In [77]:
urls[['spu','url']].groupby(['spu']).agg(['count']).head()
Out[77]:
In [73]:
urls.loc[urls.spu==357870273655002,'url'].as_matrix()
Out[73]:
In [76]:
urls.loc[urls.spu==357889732772303,'url'].as_matrix()
Out[76]:
In [82]:
#urls.loc[urls.spu==1016200950427238422,'url']
In [84]:
tmp_urls = urls.loc[urls.spu==1016200950427238422,'url'].as_matrix()
tmp_urls
Out[84]:
In [83]:
try:
    from urllib.request import urlretrieve  # Python 3
except ImportError:
    from urllib import urlretrieve  # Python 2
import time
In [88]:
# scrape images
for i,tmp_url in enumerate(tmp_urls):
    urlretrieve(tmp_url, '../data_img_tmp/{}.jpg'.format(i))
    #time.sleep(3)
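A slightly more defensive version of the download loop, sketched against the same target directory as above (skips files that already exist and keeps going if a URL fails):

# Sketch: robust image download (hypothetical refinement of the loop above).
for i, tmp_url in enumerate(tmp_urls):
    out_path = '../data_img_tmp/{}.jpg'.format(i)
    if os.path.exists(out_path):
        continue  # already downloaded
    try:
        urlretrieve(tmp_url, out_path)
    except IOError as e:  # urllib download errors derive from IOError/OSError
        print('failed to fetch {0}: {1}'.format(tmp_url, e))
    time.sleep(1)  # be polite to the image host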
In [96]:
# plot them.
print('two images from url with same spu (ugh)')
plt.figure(figsize=(8,3))
for i,tmp_url in enumerate(tmp_urls):
    img_path = '../data_img_tmp/{}.jpg'.format(i)
    img = image.load_img(img_path, target_size=(224, 224))
    plt.subplot(1,len(tmp_urls),i+1)
    plt.imshow(img)
    plt.grid(b=False)
In [78]:
urls.spu[0]
Out[78]:
In [54]:
urls.url[0]
Out[54]:
In [51]:
view_spus = user_profile.view_spu.unique()
contained = 0
spus_with_url = list(urls.spu.as_matrix())
for view_spu in view_spus:
    if view_spu in spus_with_url:
        contained+=1
print(contained/np.float(len(view_spus)))
In [53]:
buy_spus = user_profile.buy_spu.unique()
contained = 0
spus_with_url = list(urls.spu.as_matrix())
for buy_spu in buy_spus:
    if buy_spu in spus_with_url:
        contained+=1
print(contained/np.float(len(buy_spus)))
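Both coverage checks scan a Python list for every spu, which is linear per lookup; a minimal sketch of the same fractions using a set for O(1) membership (frames assumed as loaded above):

# Sketch: url coverage of viewed and bought spus via set membership.
spu_set = set(urls.spu)
view_coverage = np.mean([spu in spu_set for spu in user_profile.view_spu.unique()])
buy_coverage = np.mean([spu in spu_set for spu in user_profile.buy_spu.unique()])
print('viewed spus with a url: {0:.3f}'.format(view_coverage))
print('bought spus with a url: {0:.3f}'.format(buy_coverage))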
In [64]:
buy_spu in spus_with_url
Out[64]:
In [58]:
len(urls.spu.unique())
len(user_profile.view_spu.unique())
Out[58]:
In [3]:
spu_fea = pd.read_pickle("../data_nn_features/spu_fea.pkl") #takes forever to load
In [4]:
spu_fea['view_spu']=spu_fea['spu_id']
In [12]:
spu_fea['view_spu']=spu_fea['spu_id']
user_profile_w_features = user_profile.merge(spu_fea,on='view_spu',how='left')
print('before merge nrows: {0}'.format(len(user_profile)))
print('after merge nrows: {0}'.format(len(user_profile_w_features)))
In [13]:
print('number of items with features: {0}'.format(len(spu_fea)))
In [14]:
spu_fea.head()
Out[14]:
In [15]:
# merge with userdata
spu_fea['view_spu']=spu_fea['spu_id']
user_profile_w_features = user_profile.merge(spu_fea,on='view_spu',how='left')
print('before merge nrows: {0}'.format(len(user_profile)))
print('after merge nrows: {0}'.format(len(user_profile_w_features)))
In [16]:
user_profile_w_features['has_features'] = user_profile_w_features['spu_id'].notnull()
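Another way to see how much of the merge matched is pandas' merge indicator; a minimal sketch, assuming spu_fea already carries the view_spu column added above:

# Sketch: count matched vs. unmatched view rows directly at merge time.
check = user_profile.merge(spu_fea[['view_spu']].drop_duplicates(), on='view_spu', how='left', indicator=True)
print(check['_merge'].value_counts())
print('fraction of view rows with features: {0:.3f}'.format((check['_merge'] == 'both').mean()))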
In [130]:
user_profile_w_features.has_features= user_profile_w_features.has_features.astype('int')
In [131]:
user_profile_w_features.head()
Out[131]:
In [154]:
plot_trajectory_scatter(user_profile_w_features,scatter_color_col='has_features',samplesize=100,size=10,savedir='../../test.png')
In [108]:
1-(user_profile_w_features['features'].isnull()).mean()
Out[108]:
In [116]:
1-user_profile_w_features.groupby(['view_spu'])['spu_id'].apply(lambda x: np.isnan(x)).mean()
Out[116]:
In [27]:
buy_spus = user_profile.buy_spu.unique()
contained = 0
spus_with_features = list(spu_fea.spu_id.as_matrix())
for buy_spu in buy_spus:
    if buy_spu in spus_with_features:
        contained+=1
print(contained/np.float(len(buy_spus)))
In [28]:
contained
Out[28]:
In [29]:
len(buy_spus)
Out[29]:
In [30]:
view_spus = user_profile.view_spu.unique()
contained = 0
spus_with_features = list(spu_fea.spu_id.as_matrix())
for view_spu in view_spus:
    if view_spu in spus_with_features:
        contained+=1
print(contained/np.float(len(view_spus)))
In [31]:
len(view_spus)
Out[31]:
In [7]:
user_profile = pd.read_pickle('../data_user_view_buy/user_profile_items_nonnull_features_20_mins_5_views.pkl')
In [8]:
len(user_profile)
Out[8]:
In [9]:
print('unique users: {0}'.format(len(user_profile.user_id.unique())))
print('unique items viewed: {0}'.format(len(user_profile.view_spu.unique())))
print('unique items bought: {0}'.format(len(user_profile.buy_spu.unique())))
print('unique categories viewed: {0}'.format(len(user_profile.view_ct3.unique())))
print('unique categories bought: {0}'.format(len(user_profile.buy_ct3.unique())))
print('unique brands viewed: {0}'.format(len(user_profile.view_sn.unique())))
print('unique brands bought: {0}'.format(len(user_profile.buy_sn.unique())))
In [10]:
#user_profile.groupby(['user_id'])['buy_spu'].nunique()
In [11]:
# how many items bought per user in this dataset?
plt.figure(figsize=(8,3))
plt.hist(user_profile.groupby(['user_id'])['buy_spu'].nunique(),bins=20,normed=False)
sns.despine()
plt.xlabel('number of items bought per user')
plt.ylabel('number of users')
Out[11]:
In [12]:
user_profile.loc[user_profile.user_id==4283991208,].head()
Out[12]:
In [13]:
user_profile.loc[user_profile.user_id==6539296,].head()
In [14]:
plot_trajectory_scatter(user_profile,samplesize=100,size=10,savedir='../figures/trajectories_evaluation_dataset.png')
In [15]:
user_profile = pd.read_pickle('../data_user_view_buy/user_profile_items_nonnull_features_20_mins_5_views_v2_sample1000.pkl')
In [16]:
print('unique users: {0}'.format(len(user_profile.user_id.unique())))
print('unique items viewed: {0}'.format(len(user_profile.view_spu.unique())))
print('unique items bought: {0}'.format(len(user_profile.buy_spu.unique())))
print('unique categories viewed: {0}'.format(len(user_profile.view_ct3.unique())))
print('unique categories bought: {0}'.format(len(user_profile.buy_ct3.unique())))
print('unique brands viewed: {0}'.format(len(user_profile.view_sn.unique())))
print('unique brands bought: {0}'.format(len(user_profile.buy_sn.unique())))
In [17]:
# how many items bought per user in this dataset?
plt.figure(figsize=(8,3))
plt.hist(user_profile.groupby(['user_id'])['buy_spu'].nunique(),bins=20,normed=False)
sns.despine()
plt.xlabel('number of items bought per user')
plt.ylabel('number of users')
Out[17]:
In [1]:
%%bash
jupyter nbconvert --to slides Exploring_Data.ipynb && mv Exploring_Data.slides.html ../notebook_slides/Exploring_Data_v2.slides.html
jupyter nbconvert --to html Exploring_Data.ipynb && mv Exploring_Data.html ../notebook_htmls/Exploring_Data_v2.html
cp Exploring_Data.ipynb ../notebook_versions/Exploring_Data_v2.ipynb
In [38]:
# push to s3
import sys
import os
sys.path.append(os.getcwd()+'/../')
from src import s3_data_management
s3_data_management.push_results_to_s3('Exploring_Data_v1.html','../notebook_htmls/Exploring_Data_v1.html')
s3_data_management.push_results_to_s3('Exploring_Data_v1.slides.html','../notebook_slides/Exploring_Data_v1.slides.html')