In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)  # -1 is deprecated; None means no truncation
In [2]:
import pickle
In [3]:
from sklearn.preprocessing import MultiLabelBinarizer
In [4]:
orders_df = pd.read_csv("../data/raw/orders.csv")
op_train_df = pd.read_csv("../data/raw/order_products__train.csv")
op_prior_df = pd.read_csv("../data/raw/order_products__prior.csv")
In [5]:
orders_df.head()
Out[5]:
In [6]:
orders_prior_info = orders_df[orders_df.eval_set == "prior"].copy()  # copy to avoid SettingWithCopyWarning when adding columns later
In [7]:
orders_prior_info.shape
Out[7]:
In [8]:
prior_uid_pid_df = pd.merge(orders_df[['user_id', 'order_id']], op_prior_df, how = "inner", on = "order_id")
In [9]:
prior_uid_pid_df.shape
Out[9]:
In [10]:
num_orders_per_uid = orders_prior_info.groupby('user_id')['order_id'].\
agg('count').\
reset_index().\
rename(columns = {'order_id': 'num_orders'})
In [11]:
num_orders_per_uid.head()
Out[11]:
In [12]:
num_product_per_user = prior_uid_pid_df.groupby("order_id").agg({'product_id':'count', 'reordered': 'sum', 'user_id': 'first'}).\
reset_index().\
rename(columns = {'product_id': 'num_products', 'reordered': 'num_reordered'}).\
groupby("user_id").\
agg({'num_products': 'mean', 'num_reordered': 'mean'}).\
reset_index().\
rename(columns = {'num_products': 'avg_num_pids', 'num_reordered': 'avg_num_reordered'})
In [14]:
num_product_per_user['perc_reordered'] = num_product_per_user['avg_num_reordered']/num_product_per_user['avg_num_pids']
num_product_per_user = np.round(num_product_per_user, 2)
In [15]:
num_product_per_user.head()
Out[15]:
In [16]:
user_dow = orders_prior_info.groupby(['user_id', 'order_dow'])['order_id'].count().reset_index().\
rename(columns = {'order_id': 'cnt_dow'}).\
pivot(index = "user_id", columns = "order_dow", values = "cnt_dow")
In [17]:
user_dow.columns.name = None
user_dow = user_dow.add_prefix('dow_').reset_index().fillna(0)
In [18]:
user_dow.head()
Out[18]:
In [19]:
user_dow['most_dow'] = user_dow.iloc[:, 1:].idxmax(axis=1)
In [20]:
user_dow.head()
Out[20]:
In [21]:
bins = [0, 6, 10, 13, 17, 20, 24]
group_names = ['sleeping', 'morning', 'noon', 'afternoon', 'evening', 'night']
In [22]:
orders_prior_info['daytime'] = pd.cut(orders_prior_info['order_hour_of_day'], bins = bins, labels= group_names, include_lowest=True)
In [23]:
orders_prior_info.head()
Out[23]:
In [24]:
user_daytime = orders_prior_info.groupby(['user_id', 'daytime'])['order_id'].count().reset_index().\
rename(columns = {'order_id': 'cnt_daytime'}).\
pivot(index = "user_id", columns = "daytime", values = "cnt_daytime").fillna(0)
In [25]:
user_daytime.columns.name = None
user_daytime = user_daytime.add_prefix('daytime_').reset_index().fillna(0)
user_daytime.head()
Out[25]:
In [26]:
user_daytime['most_daytime'] = user_daytime.iloc[:, 1:].idxmax(axis=1)
In [27]:
user_daytime.head()
Out[27]:
In [28]:
user_days_since_po = orders_prior_info.groupby('user_id')['days_since_prior_order'].mean().reset_index().\
rename(columns = {'days_since_prior_order': 'avg_days_since_prior_order'})
In [29]:
user_days_since_po.head()
Out[29]:
In [30]:
products_df = pd.read_csv("../data/raw/products.csv")
aisles_df = pd.read_csv("../data/raw/aisles.csv")
departments_df = pd.read_csv("../data/raw/departments.csv")
In [31]:
products_df = pd.merge(pd.merge(products_df, aisles_df, on = "aisle_id"), departments_df, on = "department_id")
In [32]:
products_df.head()
Out[32]:
In [33]:
products_df.aisle.nunique()
Out[33]:
In [34]:
products_df.department.nunique()
Out[34]:
In [35]:
prior_uid_pid_df = pd.merge(prior_uid_pid_df, products_df[["product_id", "aisle_id", "department_id"]],\
how = "inner", on = "product_id")
In [36]:
prior_uid_pid_df.head()
Out[36]:
In [37]:
def get_most_reorder_pids(grouped_df, top = 3):
    # most frequently reordered product ids for one user; pad with NaN if fewer than `top`
    top3_pids = grouped_df['product_id'].value_counts().nlargest(top).index
    top1 = int(top3_pids[0])
    try:
        top2 = int(top3_pids[1])
    except IndexError:
        top2 = np.nan
    try:
        top3 = int(top3_pids[2])
    except IndexError:
        top3 = np.nan
    return pd.Series({'top1_reordered_pid': top1,
                      'top2_reordered_pid': top2,
                      'top3_reordered_pid': top3})
In [38]:
%time top3_reordered_pids_user = prior_uid_pid_df[prior_uid_pid_df.reordered == 1].groupby('user_id').apply(get_most_reorder_pids)
In [39]:
top3_reordered_pids_user = top3_reordered_pids_user.reset_index()
top3_reordered_pids_user.head()
Out[39]:
In [40]:
# Earlier version that returned a pd.Series (for use with groupby.apply):
# def get_most_reordered_aiesle(grouped_df):
#     top1 = grouped_df['aisle_id'].value_counts().nlargest(1).index[0]
#     return pd.Series({'most_reordered_aiesle': top1})
def get_most_reordered_aiesle(grouped_df):
    # most frequently reordered aisle id for one user
    top1 = grouped_df['aisle_id'].value_counts().nlargest(1).index[0]
    return top1
In [42]:
%time top1_reordered_aiesle_user = pd.DataFrame.from_records([ (g, get_most_reordered_aiesle(grp))\
for g, grp in prior_uid_pid_df[prior_uid_pid_df.reordered == 1].groupby('user_id')],\
columns = ['user_id', 'most_reordered_aiesle'])
In [43]:
top1_reordered_aiesle_user.head()
Out[43]:
In [46]:
def get_most_reordered_dpmt(grouped_df):
    # most frequently reordered department id for one user
    top1 = grouped_df['department_id'].value_counts().nlargest(1).index[0]
    return top1
In [47]:
%time top1_reordered_dpmt_user = pd.DataFrame.from_records([(g, get_most_reordered_dpmt(grp))\
for g, grp in prior_uid_pid_df[prior_uid_pid_df.reordered == 1].groupby('user_id')],\
columns = ['user_id', 'most_reordered_dpmt'])
In [48]:
top1_reordered_dpmt_user.head()
Out[48]:
In [49]:
import gensim
In [50]:
prior_uid_pid_cnt_df = prior_uid_pid_df[['user_id', 'product_id']].groupby(['user_id','product_id']).size()
In [51]:
prior_uid_pid_cnt_df = prior_uid_pid_cnt_df.reset_index().rename(columns = {0:'count'})
In [52]:
prior_uid_pid_cnt_df.head()
Out[52]:
In [53]:
uid_pid_cnt_dict = [{'user_id': k, 'pid_freq_dict': dict(zip(g['product_id'], g['count'])) }\
for k,g in prior_uid_pid_cnt_df.groupby('user_id')]
uid_pid_freq_df = pd.DataFrame(uid_pid_cnt_dict)
In [54]:
uid_pid_freq_df.head()
Out[54]:
In [203]:
uid_pid_freq_df.shape
Out[203]:
In [204]:
# save the product count feature for all users
with open("../data/processed/user_prods_cnt_dict_features.pickle", "wb") as handle:
pickle.dump(uid_pid_freq_df, handle)
In [55]:
del uid_pid_cnt_dict, prior_uid_pid_cnt_df
In [56]:
product_emd = gensim.models.Word2Vec.load("../data/interim/product2vec.model")
In [57]:
def get_purchase_embedding(pid_freq_dict_series, product_emd):
    # build one purchase embedding per user: the frequency-weighted sum of the
    # word2vec vectors of the products the user has bought
    all_uids_purchase_emd = []
    for pfreq_d in pid_freq_dict_series:
        uid_purchase_emd_ls = []
        for pid, freq in pfreq_d.items():
            try:
                # look up vectors via .wv; indexing the model directly is deprecated in gensim
                temp_vec = product_emd.wv[str(pid)] * freq
                uid_purchase_emd_ls.append(temp_vec)
            except KeyError:
                # skip products without a learned vector
                continue
        one_uid_purchase_emd = np.stack(uid_purchase_emd_ls).sum(axis = 0)
        all_uids_purchase_emd.append(one_uid_purchase_emd)
    return np.stack(all_uids_purchase_emd)
In [58]:
uids_purchase_emd = get_purchase_embedding(uid_pid_freq_df['pid_freq_dict'], product_emd)
uid_pid_emb_df = pd.DataFrame(data = uids_purchase_emd, index = uid_pid_freq_df['user_id'])
uid_pid_emb_df = uid_pid_emb_df.add_prefix('emb_')
In [59]:
uid_pid_emb_df = uid_pid_emb_df.reset_index()
uid_pid_emb_df.head()
Out[59]:
Joining all the feature tables and the train set by user id
All the feature tables: num_orders_per_uid, num_product_per_user, user_dow, user_daytime, user_days_since_po, top3_reordered_pids_user, top1_reordered_aiesle_user, top1_reordered_dpmt_user, uid_pid_emb_df
Train set orders info: the orders_df rows with eval_set == "train"
Train set labels info: ../data/interim/train_labels.csv
In [60]:
X_features = pd.concat([num_orders_per_uid, num_product_per_user, user_dow, user_daytime, user_days_since_po,\
top3_reordered_pids_user, top1_reordered_aiesle_user, top1_reordered_dpmt_user, uid_pid_emb_df], axis = 1)
In [61]:
X_features.shape
Out[61]:
In [62]:
X_features = X_features.loc[:, ~ X_features.columns.duplicated()]
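Note on the join above: pd.concat with axis = 1 stacks the feature tables positionally, so it assumes every table has the same length and the same row order by user_id; the duplicated user_id columns are then dropped. A merge on user_id is order-independent. Below is a minimal sketch of that alternative, using the same feature tables built above (not executed here; how = "inner" keeps only users present in every table, "outer" would keep all).
In [ ]:
# Hypothetical alternative join: merge each feature table on user_id instead of
# relying on positional alignment. Sketch only, not run in this notebook.
from functools import reduce
feature_tables = [num_orders_per_uid, num_product_per_user, user_dow, user_daytime,
                  user_days_since_po, top3_reordered_pids_user,
                  top1_reordered_aiesle_user, top1_reordered_dpmt_user, uid_pid_emb_df]
X_features_alt = reduce(lambda left, right: pd.merge(left, right, how = "inner", on = "user_id"),
                        feature_tables)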
In [63]:
X_features.head()
Out[63]:
In [64]:
# save the feature matrix for all users
with open("../data/processed/all_users_features.pickle", "wb") as handle:
pickle.dump(X_features, handle)
In [65]:
X_train_orders = orders_df[orders_df.eval_set == "train"]
In [66]:
X_train_orders.shape
Out[66]:
In [67]:
X_train_orders.head()
Out[67]:
In [68]:
# combine with the train set order info with features info
X_train = pd.merge(X_train_orders, X_features, how = "inner", on = "user_id")
In [69]:
y_train_df = pd.read_csv("../data/interim/train_labels.csv")
In [70]:
y_train_df.head()
Out[70]:
In [71]:
y_train_df[y_train_df.products == "None"].shape
Out[71]:
In [183]:
# number of unique reordered pid outcomes
pid_ls_str = ' '.join(y_train_df['products'])
num_reordered_pids = len(set(pid_ls_str.split(' ')))
In [184]:
num_reordered_pids
Out[184]:
In [73]:
test_user_ids = orders_df[orders_df.eval_set=="test"]['user_id'].tolist()
In [115]:
def products_concat(series):
    # join product ids into one space-separated string; 'None' if there are none
    out = ''
    for product in series:
        if product > 0:
            out = out + str(int(product)) + ' '
    if out != '':
        return out.rstrip()
    else:
        return 'None'
In [122]:
test_uid_prev_prods = orders_df[(orders_df.user_id.isin(test_user_ids)) & (orders_df.eval_set=='prior')].merge(op_prior_df, on = "order_id")
In [124]:
test_users_prev_orders_prods = test_uid_prev_prods.groupby('user_id')['product_id'].apply(products_concat).reset_index()
In [131]:
test_users_prev_orders_prods.head()
Out[131]:
In [132]:
test_users_prev_orders_prods.to_csv("../data/interim/test_users_pids_bought.csv")
In [128]:
# number of products for users in the test set
test_pid_ls_str = ' '.join(test_users_prev_orders_prods['product_id'])
len(set(test_pid_ls_str.split(' ')))
Out[128]:
In [135]:
products_df.product_id.nunique()
Out[135]:
So it seems there are unseen products in the test set. It is better to represent the label as a 49,688-length indicator vector over the full product catalog (plus 'None').
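Fitting the binarizer on the full product list (plus 'None'), rather than only on products seen in the train labels, keeps the encoding at a fixed width so any product can be scored. A toy sketch of the idea with a hypothetical six-outcome catalog (the real fit follows below):
In [ ]:
# Toy illustration only: fit on the full set of possible outcomes once,
# then every transformed label row has one column per outcome.
toy_outcomes = ['None', '1', '2', '3', '4', '5']   # hypothetical mini catalog
toy_labels = [('2', '5'), ('None',)]               # reordered products for two orders
toy_mlb = MultiLabelBinarizer(sparse_output=True)
toy_mlb.fit([toy_outcomes])
toy_mlb.transform(toy_labels).shape                # (2, 6)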
In [136]:
X_train = pd.merge(X_train, y_train_df, how = "inner", on = "order_id")
In [137]:
X_train.head()
Out[137]:
In [223]:
X_train[X_train.products == "None"].head()
Out[223]:
In [140]:
list(set(products_df.product_id))
Out[140]:
In [173]:
all_outcomes = ['None'] + list(set(products_df.product_id.astype('str')))
In [174]:
len(all_outcomes)
Out[174]:
In [175]:
labels_tuple_ls = [tuple(label.split(' ')) for label in X_train['products']]
In [176]:
labels_tuple_ls[6] #double check
Out[176]:
In [177]:
mlb = MultiLabelBinarizer(sparse_output=True)
In [178]:
mlb.fit([all_outcomes])
Out[178]:
In [179]:
mlb.classes_
Out[179]:
In [180]:
y_train = mlb.transform(labels_tuple_ls)
In [181]:
type(y_train)
Out[181]:
In [182]:
y_train.shape
Out[182]:
In [183]:
y_train.data.nbytes # 3.35 MB
Out[183]:
In [184]:
# save the transformed train labels
with open("../data/processed/y_train.pickle", "wb") as handle:
    pickle.dump(y_train, handle)
In [185]:
X_train.info(memory_usage= "deep")
In [186]:
# preprocess the data
features = X_train.columns
drop_cols = ['order_id', 'eval_set', 'products']
uint8_cols = features[features.str.startswith("dow_")].tolist() +\
features[features.str.startswith("daytime_")].tolist() +\
features[features.str.startswith("order_")].tolist() +\
features[features.str.startswith("avg_")].tolist() +\
["days_since_prior_order", "avg_days_since_prior_order"]
uint8_cols.remove('order_id')
str_cat_cols = features[features.str.startswith("top")].tolist() + \
features[features.str.startswith("most_reordered_")].tolist() +\
['most_daytime', 'most_dow']
round_cols = features[features.str.startswith("emb")].tolist()
In [187]:
def clean_data_type(df, convert_ls, float_round_ls = None, drop_cols_ls = None):
    # drop unused columns, cast columns to lighter dtypes, and round float columns
    df = df.copy()
    df = df.drop(drop_cols_ls, axis = 1)
    for ct in convert_ls:
        convert_type = ct[0]
        convert_cols = ct[1]
        for col in convert_cols:
            df[col] = df[col].astype(convert_type)
    df[float_round_ls] = np.round(df[float_round_ls], 2)
    return df
In [188]:
convert_cols = [('uint8', uint8_cols), ('str', str_cat_cols)]
X_train = clean_data_type(X_train, convert_ls = convert_cols, float_round_ls = round_cols, drop_cols_ls = drop_cols)
In [189]:
X_train.info(memory_usage= "deep")
In [191]:
# save the cleaned and transformed training set - ready for modeling
with open("../data/processed/X_train.pickle", "wb") as handle:
pickle.dump(X_train, handle)
In [192]:
X_train.isnull().sum().sort_values(ascending=False).iloc[0] #no missing values
Out[192]:
In [193]:
test_orders = orders_df[orders_df.eval_set == "test"]
In [194]:
test_orders.shape
Out[194]:
In [195]:
X_test = pd.merge(test_orders, X_features, how = "inner", on = "user_id")
In [196]:
X_test.shape #double check
Out[196]:
In [197]:
drop_cols = ['order_id', 'eval_set']
X_test = clean_data_type(X_test, convert_ls = convert_cols, float_round_ls = round_cols, drop_cols_ls = drop_cols)
In [198]:
X_test.shape
Out[198]:
In [199]:
X_test.info(memory_usage="deep")
In [200]:
# save the cleaned and transformed test set - ready for prediction
with open("../data/processed/X_test.pickle", "wb") as handle:
pickle.dump(X_test, handle)
In [201]:
X_test.isnull().sum().sort_values(ascending=False).iloc[0] #no missing values
Out[201]:
In [ ]: