In [3]:
# Author : Paul-Antoine Nguyen
# This script considers all the products a user has ordered
#
# We train a model computing the probability of reorder on the "train" data
#
# For the submission, we keep the orders that have a probability of
# reorder higher than a threshold
# Some overhead below (del / gc.collect calls) is there to work around kernel memory limits
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from datetime import datetime
import numpy as np
import pandas as pd
import lightgbm as lgb
import gc
from tqdm import tqdm, tqdm_notebook
tqdm.pandas(desc="")
%load_ext ipycache
%load_ext cython
IDIR = 'input/'
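In outline, the notebook builds candidate (order, product) rows with aggregate features from the prior orders, trains a LightGBM binary classifier on the "train" eval set, and turns predicted reorder probabilities into a submission. A minimal self-contained sketch of that shape (toy arrays and illustrative names only, not the real frames built below):
import numpy as np
import lightgbm as lgb
rng = np.random.RandomState(0)
X, y = rng.rand(200, 5), rng.randint(0, 2, 200)   # toy features / labels
booster = lgb.train({'objective': 'binary', 'verbose': -1},
                    lgb.Dataset(X, label=y), num_boost_round=20)
keep = booster.predict(rng.rand(10, 5)) > 0.5     # threshold probabilities into a basket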
In [ ]:
print('loading prior')
priors = pd.read_csv(IDIR + 'order_products__prior.csv')
print('loading train')
op_train = pd.read_csv(
IDIR + 'order_products__train.csv',
index_col=['order_id', 'product_id']
)
train_index = set(op_train.index)
del op_train  # free memory; later cells that reference op_train assume it has been re-loaded
print('loading orders')
orders = pd.read_csv(IDIR + 'orders.csv')
print('loading products')
products = pd.read_csv(IDIR + 'products.csv')
departments = pd.read_csv(IDIR + 'departments.csv', engine='c')
aisles = pd.read_csv(IDIR + 'aisles.csv', engine='c')
print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
# print('train {}: {}'.format(op_train.shape, ', '.join(op_train.columns)))
print('Total departments: {}'.format(departments.shape[0]))
print('Total aisles: {}'.format(aisles.shape[0]))
In [2]:
orders.order_dow = orders.order_dow.astype(np.int8)
orders.order_hour_of_day = orders.order_hour_of_day.astype(np.int8)
orders.order_number = orders.order_number.astype(np.int16)
orders.order_id = orders.order_id.astype(np.int32)
orders.user_id = orders.user_id.astype(np.int32)
orders.days_since_prior_order = orders.days_since_prior_order.astype(np.float32)
orders.set_index('order_id', inplace=True, drop=False)
products.drop(['product_name'], axis=1, inplace=True)
products.aisle_id = products.aisle_id.astype(np.int16)  # 134 aisles do not fit into int8
products.department_id = products.department_id.astype(np.int8)
products.product_id = products.product_id.astype(np.int32)
products.set_index('product_id', drop=False, inplace=True)
# op_train.reordered = op_train.reordered.astype(np.int8)
# op_train.add_to_cart_order = op_train.add_to_cart_order.astype(np.int16)
# op_train.set_index(['order_id', 'product_id'], inplace=True, drop=False)
priors.order_id = priors.order_id.astype(np.int32)
priors.add_to_cart_order = priors.add_to_cart_order.astype(np.int16)
priors.reordered = priors.reordered.astype(np.int8)
priors.product_id = priors.product_id.astype(np.int32)
https://www.kaggle.com/c/instacart-market-basket-analysis/discussion/35468
Here are some feature ideas (from the discussion above) that can help new participants get started; maybe you will find something you have missed. Most of the feature cells below follow the same basic pattern: group the prior order rows by a key and aggregate; a short sketch follows.
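A small illustration of that pattern on the frames loaded above (user-level keys only become available after the join in the next cell):
product_counts = priors.groupby('product_id').size()  # product popularity
basket_sizes = priors.groupby('order_id').size()      # items per prior order
print(product_counts.head())
print(basket_sizes.describe())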
In [3]:
priors = priors.join(orders, on='order_id', rsuffix='_')
priors = priors.join(products, on='product_id', rsuffix='_')
priors.drop(['product_id_', 'order_id_'], inplace=True, axis=1)
In [4]:
# Week No
o1_gr = orders.sort_values(["user_id", "order_number"]).groupby("user_id").agg({"days_since_prior_order": "cumsum"})
orders["user_weekno"] = (o1_gr["days_since_prior_order"] / 7).round().fillna(0)
orders["user_days"] = o1_gr["days_since_prior_order"].fillna(0)
# orders = orders.merge(
# orders.groupby("user_id").agg({
# "user_weekno": "max",
# "user_days": "max",
# }).rename(
# columns={
# "user_weekno": "user_weekno_max",
# "user_days": "user_days_max"
# }
# ).reset_index(),
# on="user_id",
# how="left"
# )
# orders["user_weekno_rev"] = abs(orders.user_weekno_max - orders.user_weekno).astype(np.int8)
# orders["user_days_rev"] = abs(orders.user_days_max - orders.user_days).astype(np.int16)
# orders = orders.drop(["user_weekno_max", "user_days_max"], axis=1)
del o1_gr
gc.collect()
Out[4]:
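The week index above is the running total of days_since_prior_order divided by 7; a standalone toy check of the arithmetic (made-up gaps):
import numpy as np
import pandas as pd
days = pd.Series([np.nan, 7.0, 10.0, 4.0])    # first order has no prior gap
weeks = (days.cumsum() / 7).round().fillna(0)
print(weeks.tolist())                         # [0.0, 1.0, 2.0, 3.0]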
In [5]:
prods = pd.DataFrame()
p_grouped = priors.groupby("product_id")
prods['orders'] = p_grouped.size().astype(np.float32)
prods['order_freq'] = prods['orders'] / len(priors.order_id.unique())
prods['users'] = p_grouped.user_id.unique().apply(len)
prods['add_to_cart_order_mean'] = p_grouped.add_to_cart_order.mean()
prods['add_to_cart_order_std'] = p_grouped.add_to_cart_order.std()
prods['reorders'] = p_grouped['reordered'].sum().astype(np.int32)
prods['reorders_max'] = p_grouped['reordered'].max().astype(np.int32)
prods['reorders_min'] = p_grouped['reordered'].min().astype(np.int32)
prods['reorders_mean'] = p_grouped['reordered'].mean().astype(np.float32)
prods['reorders_std'] = p_grouped['reordered'].std().astype(np.float32)
prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)
products = products.join(prods)
del prods
In [6]:
usr = pd.DataFrame()
o_grouped = orders.groupby('user_id')
p_grouped = priors.groupby('user_id')
usr['average_days_between_orders'] = o_grouped.days_since_prior_order.mean().astype(np.float32)
usr['max_days_between_orders'] = o_grouped.days_since_prior_order.max().astype(np.float32)
usr['min_days_between_orders'] = o_grouped.days_since_prior_order.min().astype(np.float32)
usr['std_days_between_orders'] = o_grouped.days_since_prior_order.std().astype(np.float32)
usr["period"] = o_grouped.days_since_prior_order.fillna(0).sum()
usr["weeks"] = o_grouped.user_weekno.fillna(0).max()
usr['nb_orders'] = o_grouped.size().astype(np.int16)
users = pd.DataFrame()
users['total_items'] = p_grouped.size().astype(np.int16)
users['all_products'] = p_grouped['product_id'].apply(set)
users['total_distinct_items'] = (users.all_products.map(len)).astype(np.int16)
users['reorders'] = p_grouped["reordered"].sum().astype(np.int32)
users['reorders_max'] = p_grouped["reordered"].max().astype(np.int32)
users['reorders_min'] = p_grouped["reordered"].min().astype(np.int32)
users['reorders_mean'] = p_grouped["reordered"].mean().astype(np.float32)
users['reorders_std'] = p_grouped["reordered"].std().astype(np.float32)
users = users.join(usr)
users['reorder_rate'] = (users.reorders / users.nb_orders).astype(np.float32)
users['average_basket'] = (users.total_items / users.nb_orders).astype(np.float32)
del usr
gc.collect()
print('user f', users.shape)
Out[6]:
In [7]:
prods = pd.DataFrame()
p_grouped = priors.groupby("aisle_id")
prods['orders'] = p_grouped.size().astype(np.float32)
prods['order_freq'] = (prods['orders'] / len(priors.order_id.unique())).astype(np.float32)
prods['users'] = p_grouped.user_id.unique().apply(len).astype(np.float32)
prods['add_to_cart_order_mean'] = p_grouped.add_to_cart_order.mean().astype(np.float32)
prods['add_to_cart_order_std'] = p_grouped.add_to_cart_order.std().astype(np.float32)
prods['reorders'] = p_grouped['reordered'].sum().astype(np.float32)
prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)
aisles.set_index('aisle_id', drop=False, inplace=True)
aisles = aisles.join(prods)
del prods, p_grouped
In [8]:
prods = pd.DataFrame()
p_grouped = priors.groupby("department_id")
prods['orders'] = p_grouped.size().astype(np.float32)
prods['order_freq'] = (prods['orders'] / len(priors.order_id.unique())).astype(np.float32)
prods['users'] = p_grouped.user_id.unique().apply(len).astype(np.float32)
prods['add_to_cart_order_mean'] = p_grouped.add_to_cart_order.mean().astype(np.float32)
prods['add_to_cart_order_std'] = p_grouped.add_to_cart_order.std().astype(np.float32)
prods['reorders'] = p_grouped['reordered'].sum().astype(np.float32)
prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)
departments.set_index('department_id', drop=False, inplace=True)
departments = departments.join(prods)
del prods, p_grouped
In [9]:
orders_last = orders[["order_id", "order_number", "user_id"]].rename(
columns={"order_id": "last_order_id"}
)
orders_first = orders[["order_id", "order_number", "user_id"]].rename(
columns={"order_id": "first_order_id"}
)
In [10]:
def flat_columns(df):
    # keep only the inner level of the two-level columns produced by
    # the dict-of-dict .agg() calls below
    ind = pd.Index([e[1] for e in df.columns.tolist()])
    df.columns = ind
    return df
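The dict-of-dict .agg() calls below yield two-level columns such as ('order_id', 'nb_orders'); flat_columns keeps only the inner name. A toy illustration:
toy = pd.DataFrame({('order_id', 'nb_orders'): [3],
                    ('reordered', 'sum_reordered'): [2]})
print(flat_columns(toy).columns.tolist())  # ['nb_orders', 'sum_reordered']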
In [11]:
# pack (user_id, product_id) into one integer key; collision-free since product ids stay below 100000
priors['z'] = priors.product_id + priors.user_id * 100000
userXproduct = priors.groupby(["z", "user_id"]).agg({
"order_id": {"nb_orders": "count"},
"order_number": {
"last_order_number": "max",
"first_order_number": "min"
},
"add_to_cart_order": {
"sum_add_to_cart_order": "sum",
"min_add_to_cart_order": "min",
"max_add_to_cart_order": "max",
"mean_add_to_cart_order": "mean",
"std_add_to_cart_order": "std"
},
"reordered": {
"sum_reordered": "sum",
"mean_reordered": "mean",
"std_reordered": "std"
}
})
userXproduct = flat_columns(userXproduct).reset_index()
userXproduct = userXproduct.merge(
orders_last,
left_on=["user_id", "last_order_number"],
right_on=["user_id", "order_number"]
).drop("order_number", axis=1)
userXproduct = userXproduct.merge(
orders_first,
left_on=["user_id", "first_order_number"],
right_on=["user_id", "order_number"]
).drop(["user_id", "order_number"], axis=1)
userXproduct.drop_duplicates(subset=["z"], inplace=True)
userXproduct.set_index("z", inplace=True)
# d = dict()
# for row in tqdm(priors.itertuples(), total=len(priors)):
# z = row.z
# if z not in d:
# d[z] = (
# 1,
# (row.order_number, row.order_id),
# row.add_to_cart_order,
# row.reordered
# )
# else:
# d[z] = (
# d[z][0] + 1,
# max(d[z][1], (row.order_number, row.order_id)),
# d[z][2] + row.add_to_cart_order,
# d[z][3] + row.reordered
# )
# # priors.drop(['z'], axis=1, inplace=True)
# print('to dataframe (less memory)')
# d = pd.DataFrame.from_dict(d, orient='index')
# d.columns = ['nb_orders', 'last_order_id', 'sum_pos_in_cart', 'reorders']
# d.nb_orders = d.nb_orders.astype(np.int16)
# d.last_order_id = d.last_order_id.map(lambda x: x[1]).astype(np.int32)
# d.sum_pos_in_cart = d.sum_pos_in_cart.astype(np.int16)
# d.reorders = d.reorders.astype(np.int16)
# userXproduct = d
gc.collect()
print('user X product f', len(userXproduct))
Out[11]:
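With the z key (product_id + user_id * 100000, unique because product ids stay below 100000), any per-pair statistic can be fetched directly. A quick lookup sketch, with a hypothetical user/product pair that is assumed to appear in priors:
uid, pid = 1, 196  # hypothetical ids; any pair present in priors works
z = pid + uid * 100000
print(userXproduct.loc[z, ['nb_orders', 'last_order_number', 'sum_reordered']])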
In [12]:
priors['z'] = priors.aisle_id + priors.user_id * 100000
userXaisle = priors.groupby(["z", "user_id"]).agg({
"order_id": {"nb_orders": "count"},
"order_number": {
"last_order_number": "max",
"first_order_number": "min"
},
"add_to_cart_order": {
"sum_add_to_cart_order": "sum",
"min_add_to_cart_order": "min",
"max_add_to_cart_order": "max",
"mean_add_to_cart_order": "mean",
"std_add_to_cart_order": "std"
},
"reordered": {
"sum_reordered": "sum",
"mean_reordered": "mean",
"std_reordered": "std"
}
})
userXaisle = flat_columns(userXaisle).reset_index()
userXaisle = userXaisle.merge(
orders_last,
left_on=["user_id", "last_order_number"],
right_on=["user_id", "order_number"]
).drop("order_number", axis=1)
userXaisle = userXaisle.merge(
orders_first,
left_on=["user_id", "first_order_number"],
right_on=["user_id", "order_number"]
).drop(["user_id", "order_number"], axis=1)
userXaisle.drop_duplicates(subset=["z"], inplace=True)
userXaisle.set_index("z", inplace=True)
gc.collect()
print('user X aisle f', len(userXaisle))
Out[12]:
In [13]:
priors['z'] = priors.department_id + priors.user_id * 100000
userXdepartment = priors.groupby(["z", "user_id"]).agg({
"order_id": {"nb_orders": "count"},
"order_number": {
"last_order_number": "max",
"first_order_number": "min"
},
"add_to_cart_order": {
"sum_add_to_cart_order": "sum",
"min_add_to_cart_order": "min",
"max_add_to_cart_order": "max",
"mean_add_to_cart_order": "mean",
"std_add_to_cart_order": "std"
},
"reordered": {
"sum_reordered": "sum",
"mean_reordered": "mean",
"std_reordered": "std"
}
})
userXdepartment = flat_columns(userXdepartment).reset_index()
userXdepartment = userXdepartment.merge(
orders_last,
left_on=["user_id", "last_order_number"],
right_on=["user_id", "order_number"]
).drop("order_number", axis=1)
userXdepartment = userXdepartment.merge(
orders_first,
left_on=["user_id", "first_order_number"],
right_on=["user_id", "order_number"]
).drop(["user_id", "order_number"], axis=1)
userXdepartment.drop_duplicates(subset=["z"], inplace=True)
userXdepartment.set_index("z", inplace=True)
gc.collect()
print('user X department f', len(userXdepartment))
Out[13]:
In [5]:
### build list of candidate products to reorder, with features ###
train_index = set(op_train.index)
def features(selected_orders, labels_given=False):
order_list = []
product_list = []
labels = []
for row in tqdm(selected_orders.itertuples(), total=len(selected_orders)):
order_id = row.order_id
user_id = row.user_id
user_products = list(users.all_products[user_id])
product_list += user_products
order_list += [order_id] * len(user_products)
if labels_given:
labels += [
(order_id, product) in train_index
for product in user_products
]
df = pd.DataFrame({'order_id': order_list, 'product_id': product_list})
df.order_id = df.order_id.astype(np.int32)
df.product_id = df.product_id.astype(np.int32)
df['user_id'] = df.order_id.map(orders.user_id).astype(np.int32)
    df['aisle_id'] = df.product_id.map(products.aisle_id).astype(np.int16)  # 134 aisles overflow int8
    df['department_id'] = df.product_id.map(products.department_id).astype(np.int8)
labels = np.array(labels, dtype=np.int8)
del order_list
del product_list
print('user related features')
df['user_total_orders'] = df.user_id.map(users.nb_orders)
df['user_total_items'] = df.user_id.map(users.total_items)
df['user_total_distinct_items'] = df.user_id.map(users.total_distinct_items)
df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
df['user_max_days_between_orders'] = df.user_id.map(users.max_days_between_orders)
df['user_min_days_between_orders'] = df.user_id.map(users.min_days_between_orders)
df['user_std_days_between_orders'] = df.user_id.map(users.std_days_between_orders)
df['user_average_basket'] = df.user_id.map(users.average_basket)
df['user_reorders'] = df.user_id.map(users.reorders)
df['user_reorders_max'] = df.user_id.map(users.reorders_max)
df['user_reorders_min'] = df.user_id.map(users.reorders_min)
df['user_reorders_mean'] = df.user_id.map(users.reorders_mean)
df['user_reorders_std'] = df.user_id.map(users.reorders_std)
df['user_reorder_rate'] = df.user_id.map(users.reorder_rate)
df['user_period'] = df.user_id.map(users.period)
print('order related features')
df['dow'] = df.order_id.map(orders.order_dow)
df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders
print('product related features')
df['product_orders'] = df.product_id.map(products.orders).astype(np.float32)
df['product_users'] = df.product_id.map(products.users).astype(np.float32)
df['product_order_freq'] = df.product_id.map(products.order_freq).astype(np.float32)
df['product_reorders'] = df.product_id.map(products.reorders)
df['product_reorders_max'] = df.product_id.map(products.reorders_max)
df['product_reorders_min'] = df.product_id.map(products.reorders_min)
df['product_reorders_mean'] = df.product_id.map(products.reorders_mean)
df['product_reorders_std'] = df.product_id.map(products.reorders_std)
df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)
df['product_add_to_cart_order_mean'] = df.product_id.map(products.add_to_cart_order_mean).astype(np.float32)
df['product_add_to_cart_order_std'] = df.product_id.map(products.add_to_cart_order_std).astype(np.float32)
print('aisle related features')
df['aisle_orders'] = df.aisle_id.map(aisles.orders)
df['aisle_users'] = df.aisle_id.map(aisles.users)
df['aisle_order_freq'] = df.aisle_id.map(aisles.order_freq)
df['aisle_reorders'] = df.aisle_id.map(aisles.reorders)
df['aisle_reorder_rate'] = df.aisle_id.map(aisles.reorder_rate)
df['aisle_add_to_cart_order_mean'] = df.aisle_id.map(aisles.add_to_cart_order_mean)
df['aisle_add_to_cart_order_std'] = df.aisle_id.map(aisles.add_to_cart_order_std)
print('department related features')
df['department_orders'] = df.department_id.map(departments.orders)
df['department_users'] = df.department_id.map(departments.users)
df['department_order_freq'] = df.department_id.map(departments.order_freq)
df['department_reorders'] = df.department_id.map(departments.reorders)
df['department_reorder_rate'] = df.department_id.map(departments.reorder_rate)
df['department_add_to_cart_order_mean'] = df.department_id.map(departments.add_to_cart_order_mean)
df['department_add_to_cart_order_std'] = df.department_id.map(departments.add_to_cart_order_std)
print('user_X_product related features')
df['z'] = df.product_id + df.user_id * 100000
df['UP_orders'] = df.z.map(userXproduct.nb_orders)
df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
df['UP_first_order_id'] = df.z.map(userXproduct.first_order_id)
df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_add_to_cart_order) / df.UP_orders).astype(np.float32)
df['UP_sum_add_to_cart_order'] = df.z.map(userXproduct.sum_add_to_cart_order)
df['UP_min_add_to_cart_order'] = df.z.map(userXproduct.min_add_to_cart_order)
df['UP_mean_add_to_cart_order'] = df.z.map(userXproduct.mean_add_to_cart_order)
df['UP_max_add_to_cart_order'] = df.z.map(userXproduct.max_add_to_cart_order)
df['UP_std_add_to_cart_order'] = df.z.map(userXproduct.std_add_to_cart_order)
df['UP_sum_reordered'] = df.z.map(userXproduct.sum_reordered)
df['UP_mean_reordered'] = df.z.map(userXproduct.mean_reordered)
df['UP_std_reordered'] = df.z.map(userXproduct.std_reordered)
df['UP_reorders_rate'] = (df.UP_sum_reordered / df.UP_orders).astype(np.float32)
df['UP_last_order_number'] = df.UP_last_order_id.map(orders.order_number)
df['UP_first_order_number'] = df.UP_first_order_id.map(orders.order_number)
df['UP_last_order_number_prc'] = (df.UP_last_order_number / df.user_total_orders).astype(np.float32)
df['UP_first_order_number_prc'] = (df.UP_first_order_number / df.user_total_orders).astype(np.float32)
df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_number
df['UP_orders_rate_since_first_order'] = df.UP_orders / (df.user_total_orders - df.UP_first_order_number + 1)
df['UP_weeks_sinse_last'] = df.UP_last_order_id.map(orders.user_weekno) - df.order_id.map(orders.user_weekno)
df['UP_days_sinse_last'] = df.UP_last_order_id.map(orders.user_days) - df.order_id.map(orders.user_days)
df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - \
df.UP_last_order_id.map(orders.order_hour_of_day)).map(
lambda x: min(x, 24-x)
).astype(np.int8)
print('user_X_aisle related features')
df['z'] = df.aisle_id + df.user_id * 100000
df['UA_orders'] = df.z.map(userXaisle.nb_orders)
df['UA_orders_ratio'] = (df.UA_orders / df.user_total_orders).astype(np.float32)
df['UA_last_order_id'] = df.z.map(userXaisle.last_order_id)
df['UA_first_order_id'] = df.z.map(userXaisle.first_order_id)
df['UA_average_pos_in_cart'] = (df.z.map(userXaisle.sum_add_to_cart_order) / df.UA_orders).astype(np.float32)
df['UA_sum_add_to_cart_order'] = df.z.map(userXaisle.sum_add_to_cart_order)
df['UA_min_add_to_cart_order'] = df.z.map(userXaisle.min_add_to_cart_order)
df['UA_mean_add_to_cart_order'] = df.z.map(userXaisle.mean_add_to_cart_order)
df['UA_max_add_to_cart_order'] = df.z.map(userXaisle.max_add_to_cart_order)
df['UA_std_add_to_cart_order'] = df.z.map(userXaisle.std_add_to_cart_order)
df['UA_sum_reordered'] = df.z.map(userXaisle.sum_reordered)
df['UA_mean_reordered'] = df.z.map(userXaisle.mean_reordered)
df['UA_std_reordered'] = df.z.map(userXaisle.std_reordered)
df['UA_reorders_rate'] = (df.UA_sum_reordered / df.UA_orders).astype(np.float32)
df['UA_last_order_number'] = df.UA_last_order_id.map(orders.order_number)
df['UA_first_order_number'] = df.UA_first_order_id.map(orders.order_number)
df['UA_last_order_number_prc'] = (df.UA_last_order_number / df.user_total_orders).astype(np.float32)
df['UA_first_order_number_prc'] = (df.UA_first_order_number / df.user_total_orders).astype(np.float32)
df['UA_orders_since_last'] = df.user_total_orders - df.UA_last_order_number
df['UA_orders_rate_since_first_order'] = df.UA_orders / (df.user_total_orders - df.UA_first_order_number + 1)
df['UA_weeks_sinse_last'] = df.UA_last_order_id.map(orders.user_weekno) - df.order_id.map(orders.user_weekno)
df['UA_days_sinse_last'] = df.UA_last_order_id.map(orders.user_days) - df.order_id.map(orders.user_days)
df['UA_delta_hour_vs_last'] = abs(df.order_hour_of_day - \
df.UA_last_order_id.map(orders.order_hour_of_day)).map(
lambda x: min(x, 24-x)
).astype(np.int8)
print('user_X_department related features')
df['z'] = df.department_id + df.user_id * 100000
df['UD_orders'] = df.z.map(userXdepartment.nb_orders)
df['UD_orders_ratio'] = (df.UD_orders / df.user_total_orders).astype(np.float32)
df['UD_last_order_id'] = df.z.map(userXdepartment.last_order_id)
df['UD_first_order_id'] = df.z.map(userXdepartment.first_order_id)
df['UD_average_pos_in_cart'] = (df.z.map(userXdepartment.sum_add_to_cart_order) / df.UD_orders).astype(np.float32)
df['UD_sum_add_to_cart_order'] = df.z.map(userXdepartment.sum_add_to_cart_order)
df['UD_min_add_to_cart_order'] = df.z.map(userXdepartment.min_add_to_cart_order)
df['UD_mean_add_to_cart_order'] = df.z.map(userXdepartment.mean_add_to_cart_order)
df['UD_max_add_to_cart_order'] = df.z.map(userXdepartment.max_add_to_cart_order)
df['UD_std_add_to_cart_order'] = df.z.map(userXdepartment.std_add_to_cart_order)
df['UD_sum_reordered'] = df.z.map(userXdepartment.sum_reordered)
df['UD_mean_reordered'] = df.z.map(userXdepartment.mean_reordered)
df['UD_std_reordered'] = df.z.map(userXdepartment.std_reordered)
df['UD_reorders_rate'] = (df.UD_sum_reordered / df.UD_orders).astype(np.float32)
df['UD_last_order_number'] = df.UD_last_order_id.map(orders.order_number)
df['UD_first_order_number'] = df.UD_first_order_id.map(orders.order_number)
df['UD_last_order_number_prc'] = (df.UD_last_order_number / df.user_total_orders).astype(np.float32)
df['UD_first_order_number_prc'] = (df.UD_first_order_number / df.user_total_orders).astype(np.float32)
df['UD_orders_since_last'] = df.user_total_orders - df.UD_last_order_number
df['UD_orders_rate_since_first_order'] = df.UD_orders / (df.user_total_orders - df.UD_first_order_number + 1)
df['UD_weeks_sinse_last'] = df.UD_last_order_id.map(orders.user_weekno) - df.order_id.map(orders.user_weekno)
df['UD_days_sinse_last'] = df.UD_last_order_id.map(orders.user_days) - df.order_id.map(orders.user_days)
df['UD_delta_hour_vs_last'] = abs(df.order_hour_of_day - \
df.UD_last_order_id.map(orders.order_hour_of_day)).map(
lambda x: min(x, 24-x)
).astype(np.int8)
df.drop([
'UP_last_order_id', 'UP_first_order_id',
'UA_last_order_id', 'UA_first_order_id',
'UD_last_order_id', 'UD_first_order_id',
'z'], axis=1, inplace=True
)
gc.collect()
return (df, labels)
In [15]:
### train / test orders ###
print('split orders : train, test')
test_orders = orders[orders.eval_set == 'test']
train_orders = orders[orders.eval_set == 'train']
df_train, labels = features(train_orders, labels_given=True)
df_test, _ = features(test_orders)
del test_orders, train_orders
In [13]:
### order-level features and labels for the "None" model ###
df = op_train.groupby("order_id").agg({"reordered": "sum"})
df["reordered"] = df["reordered"].apply(lambda x: 1 if x == 0 else 0)
none_labels = df["reordered"].to_dict()
def none_features(selected_orders, labels_given=False):
order_list = []
labels = []
for order_id in tqdm(selected_orders, total=len(selected_orders)):
order_list += [order_id]
if labels_given:
labels += [none_labels[order_id]]
df = pd.DataFrame({'order_id': order_list})
df.order_id = df.order_id.astype(np.int32)
labels = np.array(labels, dtype=np.int8)
del order_list
print('user related features')
df['user_id'] = df.order_id.map(orders.user_id).astype(np.int32)
df['user_total_orders'] = df.user_id.map(users.nb_orders)
df['user_total_items'] = df.user_id.map(users.total_items)
df['user_total_distinct_items'] = df.user_id.map(users.total_distinct_items)
df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
df['user_average_basket'] = df.user_id.map(users.average_basket)
df['user_period'] = df.user_id.map(users.period)
print('order related features')
df['dow'] = df.order_id.map(orders.order_dow)
df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders
gc.collect()
return (df, labels)
In [14]:
def none_train(traindf, y):
none_params = {
'task': 'train',
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': ['auc'],
'num_leaves': 96,
'feature_fraction': 0.9,
'bagging_fraction': 0.95,
'bagging_freq': 5
}
d_train = lgb.Dataset(
feature_select(traindf),
label=y,
categorical_feature=['aisle_id', 'department_id']
)
    model = lgb.train(none_params, d_train, ROUNDS)
return model
def none_cv(traindf, y):
d_train = lgb.Dataset(
feature_select(traindf),
label=y,
categorical_feature=['aisle_id', 'department_id']
)
return lgb.cv(params, d_train, ROUNDS)
In [15]:
def none_predict(model, df):
return model.predict(feature_select(df))
In [2]:
import pickle
df_train = pd.read_pickle("df_train.pkl")
df_test = pd.read_pickle("df_test.pkl")
labels = pickle.load(open("labels.pkl", "rb"))  # pickle requires binary mode
In [31]:
def feature_select(df):
return df.drop(
["user_id", "product_id", "order_id", "pred_ext"],
axis=1, errors="ignore"
)
In [7]:
params = {
'task': 'train',
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': {'binary_logloss'},
'num_leaves': 96,
'feature_fraction': 0.9,
'bagging_fraction': 0.95,
'bagging_freq': 5
}
ROUNDS = 98
def train(traindf, y):
# none_df, none_labels = none_features(traindf["order_id"].unique(), True)
# none_model = none_train(none_df, none_labels)
d_train = lgb.Dataset(
feature_select(traindf),
label=y,
categorical_feature=['aisle_id', 'department_id']
)
model = lgb.train(params, d_train, ROUNDS)
return model, None
In [8]:
model, none_model = train(df_train, labels)
This kernel implements the O(n²) expected-F1 maximization algorithm presented in "Ye, N., Chai, K., Lee, W., and Chieu, H. Optimizing F-measures: A Tale of Two Approaches. In ICML, 2012."
Given posteriors P = [p_1, p_2, ..., p_n] with p_1 > p_2 > ... > p_n and assuming label independence, it solves argmax over 0 <= k <= n and the binary "None" indicator of E[F1(P, k, None)]: it chooses how many of the top-k labels to predict, and whether to additionally predict the label "None", by dynamic programming in O(n²).
In [9]:
def get_expectations(P, pNone=None):
expectations = []
P = np.sort(P)[::-1]
n = np.array(P).shape[0]
DP_C = np.zeros((n + 2, n + 1))
if pNone is None:
pNone = (1.0 - P).prod()
DP_C[0][0] = 1.0
for j in range(1, n):
DP_C[0][j] = (1.0 - P[j - 1]) * DP_C[0, j - 1]
for i in range(1, n + 1):
DP_C[i, i] = DP_C[i - 1, i - 1] * P[i - 1]
for j in range(i + 1, n + 1):
DP_C[i, j] = P[j - 1] * DP_C[i - 1, j - 1] + (1.0 - P[j - 1]) * DP_C[i, j - 1]
DP_S = np.zeros((2 * n + 1,))
DP_SNone = np.zeros((2 * n + 1,))
for i in range(1, 2 * n + 1):
DP_S[i] = 1. / (1. * i)
DP_SNone[i] = 1. / (1. * i + 1)
for k in range(n + 1)[::-1]:
f1 = 0
f1None = 0
for k1 in range(n + 1):
f1 += 2 * k1 * DP_C[k1][k] * DP_S[k + k1]
f1None += 2 * k1 * DP_C[k1][k] * DP_SNone[k + k1]
for i in range(1, 2 * k - 1):
DP_S[i] = (1 - P[k - 1]) * DP_S[i] + P[k - 1] * DP_S[i + 1]
DP_SNone[i] = (1 - P[k - 1]) * DP_SNone[i] + P[k - 1] * DP_SNone[i + 1]
expectations.append([f1None + 2 * pNone / (2 + k), f1])
return np.array(expectations[::-1]).T
def maximize_expectation(P, pNone=None):
expectations = get_expectations(P, pNone)
ix_max = np.unravel_index(expectations.argmax(), expectations.shape)
max_f1 = expectations[ix_max]
predNone = True if ix_max[0] == 0 else False
best_k = ix_max[1]
return best_k, predNone, max_f1
def _F1(tp, fp, fn):
return 2 * tp / (2 * tp + fp + fn)
def _Fbeta(tp, fp, fn, beta=1.0):
beta_squared = beta ** 2
return (1.0 + beta_squared) * tp / ((1.0 + beta_squared) * tp + fp + beta_squared * fn)
def print_best_prediction(P, pNone=None):
print("Maximize F1-Expectation")
print("=" * 23)
P = np.sort(P)[::-1]
n = P.shape[0]
L = ['L{}'.format(i + 1) for i in range(n)]
if pNone is None:
print("Estimate p(None|x) as (1-p_1)*(1-p_2)*...*(1-p_n)")
pNone = (1.0 - P).prod()
PL = ['p({}|x)={}'.format(l, p) for l, p in zip(L, P)]
print("Posteriors: {} (n={})".format(PL, n))
print("p(None|x)={}".format(pNone))
    opt = maximize_expectation(P, pNone)
best_prediction = ['None'] if opt[1] else []
best_prediction += (L[:opt[0]])
f1_max = opt[2]
print("Prediction {} yields best E[F1] of {}\n".format(best_prediction, f1_max))
In [10]:
%%cython
# compiled copy of the expected-F1 functions above, to speed up the prediction loop
import numpy as np
def get_expectations_cyt(P, pNone=None):
expectations = []
P = np.sort(P)[::-1]
n = np.array(P).shape[0]
DP_C = np.zeros((n + 2, n + 1))
if pNone is None:
pNone = (1.0 - P).prod()
DP_C[0][0] = 1.0
for j in range(1, n):
DP_C[0][j] = (1.0 - P[j - 1]) * DP_C[0, j - 1]
for i in range(1, n + 1):
DP_C[i, i] = DP_C[i - 1, i - 1] * P[i - 1]
for j in range(i + 1, n + 1):
DP_C[i, j] = P[j - 1] * DP_C[i - 1, j - 1] + (1.0 - P[j - 1]) * DP_C[i, j - 1]
DP_S = np.zeros((2 * n + 1,))
DP_SNone = np.zeros((2 * n + 1,))
for i in range(1, 2 * n + 1):
DP_S[i] = 1. / (1. * i)
DP_SNone[i] = 1. / (1. * i + 1)
for k in range(n + 1)[::-1]:
f1 = 0
f1None = 0
for k1 in range(n + 1):
f1 += 2 * k1 * DP_C[k1][k] * DP_S[k + k1]
f1None += 2 * k1 * DP_C[k1][k] * DP_SNone[k + k1]
for i in range(1, 2 * k - 1):
DP_S[i] = (1 - P[k - 1]) * DP_S[i] + P[k - 1] * DP_S[i + 1]
DP_SNone[i] = (1 - P[k - 1]) * DP_SNone[i] + P[k - 1] * DP_SNone[i + 1]
expectations.append([f1None + 2 * pNone / (2 + k), f1])
return np.array(expectations[::-1]).T
def maximize_expectation_cyt(P, pNone=None):
expectations = get_expectations_cyt(P, pNone)
ix_max = np.unravel_index(expectations.argmax(), expectations.shape)
max_f1 = expectations[ix_max]
predNone = True if ix_max[0] == 0 else False
best_k = ix_max[1]
return best_k, predNone, max_f1
def print_best_prediction_cyt(P, pNone=None):
print("Maximize F1-Expectation")
print("=" * 23)
P = np.sort(P)[::-1]
n = P.shape[0]
L = ['L{}'.format(i + 1) for i in range(n)]
if pNone is None:
print("Estimate p(None|x) as (1-p_1)*(1-p_2)*...*(1-p_n)")
pNone = (1.0 - P).prod()
PL = ['p({}|x)={}'.format(l, p) for l, p in zip(L, P)]
print("Posteriors: {} (n={})".format(PL, n))
print("p(None|x)={}".format(pNone))
opt = maximize_expectation_cyt(P, pNone)
best_prediction = ['None'] if opt[1] else []
best_prediction += (L[:opt[0]])
f1_max = opt[2]
print("Prediction {} yields best E[F1] of {}\n".format(best_prediction, f1_max))
In [41]:
def final_predict(df_test, none_model=None, THRESHOLD=0.5):
d = dict()
if none_model:
none_df, _ = none_features(df_test["order_id"].unique(), False)
none_df["pred"] = none_predict(none_model, none_df)
none_model_res = none_df.set_index("order_id")["pred"].to_dict()
    # Instead of cutting by a fixed threshold here, a model could predict the number of items per order
current_order_id = None
current_order_count = 0
current_order_basket_size = 0
for row in tqdm_notebook(df_test.sort_values(
by=["order_id", "pred"],
ascending=[False, False]
).itertuples(), total=len(df_test)):
order_id = row.order_id
if order_id != current_order_id:
current_order_id = order_id
current_order_count = 0
P = df_test[df_test.order_id == order_id].pred.values
        # if none_model and none_model_res[order_id] > THRESHOLD:
# current_order_basket_size = 0
# else:
best_k, predNone, max_f1 = maximize_expectation_cyt(P)
current_order_basket_size = best_k
if predNone:
d[order_id] = 'None'
if current_order_count >= current_order_basket_size:
continue
current_order_count += 1
try:
d[order_id] += ' ' + str(row.product_id)
except KeyError:
d[order_id] = str(row.product_id)
for order_id in df_test.order_id:
if order_id not in d:
d[order_id] = 'None'
sub = pd.DataFrame.from_dict(d, orient='index')
sub.reset_index(inplace=True)
sub.columns = ['order_id', 'products']
return sub
def predict(model, df_test, none_model=None, THRESHOLD=0.5):
### build candidates list for test ###
df_test['pred'] = model.predict(feature_select(df_test))
if "pred_ext" in list(df_test.columns):
df_test['pred'] = (
df_test['pred'] * 0.2 +
df_test['pred_ext'] * 0.8
)
print("average pred and pred_ext")
    return final_predict(df_test, none_model=none_model, THRESHOLD=THRESHOLD)  # forward the arguments given to predict()
In [37]:
# Load the external prediction
pred_ext = pd.read_csv("prediction_lgbm.csv").rename(
columns={"prediction": "pred_ext"}
)
df_test = df_test.merge(pred_ext, on=["order_id", "product_id"])
df_test.head()
Out[37]:
In [42]:
sub = final_predict(df_test)
sub.to_csv('sub2.csv', index=False)
In [43]:
sub = final_predict(pd.read_csv("prediction_lgbm.csv").rename(
columns={"prediction": "pred"}
))
sub.to_csv('sub.csv', index=False)
In [35]:
sub = predict(model, df_test, none_model, THRESHOLD=0.8)
sub.to_csv('sub2.csv', index=False)
In [ ]:
# d_train here is the Dataset built inside train(); rebuild it first to run this cell standalone
lgb.cv(params, d_train, ROUNDS, nfold=5, verbose_eval=10)
In [21]:
%%cache df_train_gt.pkl df_train_gt
from functools import partial
products_raw = pd.read_csv(IDIR + 'products.csv')
# combine aisles, departments and products (left joined to products)
# note: this assumes the raw departments/aisles frames; after the stats joins above,
# the overlapping column names would be picked up as extra merge keys
goods = pd.merge(left=pd.merge(left=products_raw, right=departments, how='left'), right=aisles, how='left')
# to retain '-' and make product names more "standard"
goods.product_name = goods.product_name.str.replace(' ', '_').str.lower()
# retype goods to reduce memory usage
goods.product_id = goods.product_id.astype(np.int32)
goods.aisle_id = goods.aisle_id.astype(np.int16)
goods.department_id = goods.department_id.astype(np.int8)
# initialize it with train dataset
train_details = pd.merge(
left=op_train,
right=orders,
how='left',
on='order_id'
).apply(partial(pd.to_numeric, errors='ignore', downcast='integer'))
# add order hierarchy
train_details = pd.merge(
left=train_details,
right=goods[['product_id',
'aisle_id',
'department_id']].apply(partial(pd.to_numeric,
errors='ignore',
downcast='integer')),
how='left',
on='product_id'
)
train_gtl = []
for uid, subset in train_details.groupby('user_id'):
subset1 = subset[subset.reordered == 1]
oid = subset.order_id.values[0]
if len(subset1) == 0:
train_gtl.append((oid, 'None'))
continue
ostr = ' '.join([str(int(e)) for e in subset1.product_id.values])
# .strip is needed because join can have a padding space at the end
train_gtl.append((oid, ostr.strip()))
del train_details
del goods
del products_raw
gc.collect()
df_train_gt = pd.DataFrame(train_gtl)
df_train_gt.columns = ['order_id', 'products']
df_train_gt.set_index('order_id', inplace=True)
df_train_gt.sort_index(inplace=True)
In [22]:
from sklearn.model_selection import GroupKFold
def f1_score(cvpred):
joined = df_train_gt.join(cvpred, rsuffix="_cv", how="inner")
lgts = joined.products.replace("None", "-1").apply(lambda x: x.split(" ")).values
lpreds = joined.products_cv.replace("None", "-1").apply(lambda x: x.split(" ")).values
f1 = []
for lgt, lpred in zip(lgts, lpreds):
rr = (np.intersect1d(lgt, lpred))
        precision = float(len(rr)) / len(lpred)
        recall = float(len(rr)) / len(lgt)
denom = precision + recall
f1.append(((2 * precision * recall) / denom) if denom > 0 else 0)
return np.mean(f1)
def cv(threshold=0.5, n=5):
nsplits = n
if n == 1:
nsplits = 2
gkf = GroupKFold(n_splits=nsplits)
scores = []
for train_idx, test_idx in gkf.split(df_train.index, groups=df_train.user_id):
dftrain = df_train.iloc[train_idx]
dftest = df_train.iloc[test_idx]
y = labels[train_idx]
model, none_model = train(dftrain, y)
pred = predict(model, dftest, none_model, threshold)
f1 = f1_score(pred.set_index("order_id"))
print(f1)
scores.append(f1)
del dftrain
del dftest
gc.collect()
if n == 1:
break
return np.mean(scores), np.std(scores)
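The per-order F1 above compares a predicted product list with the ground-truth list; a toy check of the arithmetic on made-up id strings:
lgt, lpred = ['196', '12427'], ['196', '30']
rr = np.intersect1d(lgt, lpred)                       # ['196']
precision, recall = len(rr) / len(lpred), len(rr) / len(lgt)
print(2 * precision * recall / (precision + recall))  # 0.5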
In [23]:
cv(n=1)
Out[23]:
In [37]:
for th in [0.8, 0.7, 0.6, 0.5, 0.4]:
print(th)
print("\t", cv(threshold=th, n=1))
print()
0.372658477911
0.9: 0.386930142523
0.8: 0.386930142523
0.7: 0.38692823718
0.6: 0.386927269387
score with None mixed in: 0.378085325812
score without None: 0.386930142523
In [17]:
0.18
0.375669602808
0.37518960199
0.376068733519
0.374880658158
0.371575669134
(0.37467685312194482, 0.0016027896306283745)
0.19
0.375981281546
0.375613273106
0.37623495823
0.374958453045
0.371884026622
(0.3749343985097483, 0.0015845275427144021)
0.2
0.376141810192
0.375593739202
0.375961736002
0.375124046483
0.371748172351
(0.37491390084571824, 0.001620734287706205)
0.21
0.375454836995
0.374657579102
0.375585106194
0.374639123067
0.371277685501
(0.37432286617177202, 0.0015722458019732746)
0.22
0.374504880043
0.372459365153
0.374241429517
0.373332070018
0.370178093483
(0.37294316764289259, 0.0015591904647740879)
0.24
0.370290530162
0.369518178297
0.370515696117
0.369568282123
0.3673846793
(0.36945547319979183, 0.0011069090226251931)
0.26
0.363691285892
0.363725106289
0.363492700824
0.364412180878
0.363024994542
(0.36366925368510306, 0.00044761289123321511)
In [3]:
prior_orders_count = priors[["order_id", "product_id"]].groupby("order_id").count()
prior_orders_count = prior_orders_count.rename(columns={"product_id": "product_counts"})
train_orders_count = op_train.drop(["product_id", "order_id"], axis=1, errors="ignore")
train_orders_count = train_orders_count.reset_index()[["order_id", "product_id"]].groupby("order_id").count()
train_orders_count = train_orders_count.rename(columns={"product_id": "product_counts"})
prior_orders_count = orders.join(prior_orders_count, how='inner')
train_orders_count = orders.join(train_orders_count, how='inner')
prior_orders_count.head(15)
Out[3]:
In [4]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
def get_order_count(order, alpha=0.5):
user_id = order["user_id"]
df = prior_orders_count[prior_orders_count["user_id"] == user_id]
feats = ["order_number", "order_dow", "order_hour_of_day", "days_since_prior_order"]
X = df[feats].fillna(0).values
y = df["product_counts"].values
# create dataset for lightgbm
lgb_train = lgb.Dataset(X, y)
# specify your configurations as a dict
params = {
'task': 'train',
'boosting_type': 'gbdt',
'objective': 'regression',
'metric': {'rmse'},
'num_leaves': 100,
'learning_rate': 0.01,
'feature_fraction': 0.9,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'verbose': 0,
}
# train
clf = lgb.train(params,
lgb_train,
num_boost_round=40)
# clf = Lasso(alpha=0.01)
# clf.fit(X, y)
    Xpred = np.array([order[f] or 0 for f in feats]).reshape(1, -1)
    Xpred = np.nan_to_num(Xpred)  # replace any remaining NaN with 0
return clf.predict(Xpred)[0]
df = train_orders_count.head(1000).copy()  # copy to avoid SettingWithCopyWarning below
df["pred_products_count"] = df.apply(get_order_count, axis=1)
print(mean_squared_error(
df["product_counts"],
df["pred_products_count"]
))
In [ ]:
### order-level "None" features (draft variant of the cell above) ###
train_index = set(op_train.index)
def none_features(selected_orders, labels_given=False):
    order_list = []
    labels = []
    for row in tqdm(selected_orders.itertuples(), total=len(selected_orders)):
        order_id = row.order_id
        user_id = row.user_id
        order_list += [order_id]
        if labels_given:
            # "None" label: none of the user's previously seen products appears
            # in this train order (mirrors none_labels built from op_train above)
            user_products = users.all_products[user_id]
            labels += [not any(
                (order_id, product) in train_index
                for product in user_products
            )]
df = pd.DataFrame({'order_id': order_list})
df.order_id = df.order_id.astype(np.int32)
labels = np.array(labels, dtype=np.int8)
del order_list
print('user related features')
df['user_id'] = df.order_id.map(orders.user_id).astype(np.int32)
df['user_total_orders'] = df.user_id.map(users.nb_orders)
df['user_total_items'] = df.user_id.map(users.total_items)
df['user_total_distinct_items'] = df.user_id.map(users.total_distinct_items)
df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
df['user_average_basket'] = df.user_id.map(users.average_basket)
df['user_period'] = df.user_id.map(users.period)
print('order related features')
df['dow'] = df.order_id.map(orders.order_dow)
df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders
gc.collect()
return (df, labels)
In [10]:
train_index = set(op_train.index)
train_orders = orders[orders.eval_set == 'train']
selected_orders = train_orders
labels_given=True
order_list = []
product_list = []
labels = []
for row in tqdm(selected_orders.itertuples(), total=len(selected_orders)):
order_id = row.order_id
user_id = row.user_id
user_products = list(users.all_products[user_id])
product_list += user_products
order_list += [order_id] * len(user_products)
if labels_given:
labels += [
(order_id, product) in train_index
for product in user_products
]
In [18]:
df = pd.DataFrame({'order_id': order_list, 'product_id': product_list})
df.order_id = df.order_id.astype(np.int32)
df.product_id = df.product_id.astype(np.int32)
df['user_id'] = df.order_id.map(orders.user_id)
df['aisle_id'] = df.product_id.map(products.aisle_id).astype(np.int16)  # 134 aisles overflow int8
df['department_id'] = df.product_id.map(products.department_id).astype(np.int8)
labels = np.array(labels, dtype=np.int8)
# del order_list
# del product_list
print('user related features')
df['user_total_orders'] = df.user_id.map(users.nb_orders)
df['user_total_items'] = df.user_id.map(users.total_items)
df['user_total_distinct_items'] = df.user_id.map(users.total_distinct_items)
df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
df['user_max_days_between_orders'] = df.user_id.map(users.max_days_between_orders)
df['user_min_days_between_orders'] = df.user_id.map(users.min_days_between_orders)
df['user_std_days_between_orders'] = df.user_id.map(users.std_days_between_orders)
df['user_average_basket'] = df.user_id.map(users.average_basket)
df['user_reorders'] = df.user_id.map(users.reorders)
df['user_reorder_rate'] = df.user_id.map(users.reorder_rate)
df['user_period'] = df.user_id.map(users.period)
print('order related features')
df['dow'] = df.order_id.map(orders.order_dow)
df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders
print('product related features')
df['product_orders'] = df.product_id.map(products.orders).astype(np.int32)
df['product_users'] = df.product_id.map(products.users).astype(np.int32)
df['product_order_freq'] = df.product_id.map(products.order_freq).astype(np.float32)
df['product_reorders'] = df.product_id.map(products.reorders).astype(np.int32)
df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)
df['product_add_to_cart_order_mean'] = df.product_id.map(products.add_to_cart_order_mean).astype(np.float32)
df['product_add_to_cart_order_std'] = df.product_id.map(products.add_to_cart_order_std).astype(np.float32)
print('aisle related features')
df['aisle_orders'] = df.aisle_id.map(aisles.orders).astype(np.int32)
df['aisle_users'] = df.aisle_id.map(aisles.users).astype(np.int32)
df['aisle_order_freq'] = df.aisle_id.map(aisles.order_freq).astype(np.float32)
df['aisle_reorders'] = df.aisle_id.map(aisles.reorders).astype(np.int32)
df['aisle_reorder_rate'] = df.aisle_id.map(aisles.reorder_rate).astype(np.float32)
df['aisle_add_to_cart_order_mean'] = df.aisle_id.map(aisles.add_to_cart_order_mean).astype(np.float32)
df['aisle_add_to_cart_order_std'] = df.aisle_id.map(aisles.add_to_cart_order_std).astype(np.float32)
print('department related features')
df['department_orders'] = df.department_id.map(departments.orders).astype(np.int32)
df['department_users'] = df.department_id.map(departments.users).astype(np.int32)
df['department_order_freq'] = df.department_id.map(departments.order_freq).astype(np.float32)
df['department_reorders'] = df.department_id.map(departments.reorders).astype(np.int32)
df['department_reorder_rate'] = df.department_id.map(departments.reorder_rate).astype(np.float32)
df['department_add_to_cart_order_mean'] = df.department_id.map(departments.add_to_cart_order_mean).astype(np.float32)
df['department_add_to_cart_order_std'] = df.department_id.map(departments.add_to_cart_order_std).astype(np.float32)
print('user_X_product related features')
df['z'] = df.product_id + df.user_id * 100000
df['UP_orders'] = df.z.map(userXproduct.nb_orders)
df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
# df['UP_first_order_id'] = df.z.map(userXproduct.first_order_id)
# sum_pos_in_cart / reorders exist only in the legacy dict-based userXproduct
# (commented out above); the current build names them sum_add_to_cart_order / sum_reordered
df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_pos_in_cart) / df.UP_orders).astype(np.float32)
df['UP_reorders'] = df.z.map(userXproduct.reorders)
df['UP_last_order_number'] = df.UP_last_order_id.map(orders.order_number)
# df['UP_first_order_number'] = df.UP_first_order_id.map(orders.order_number)
df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_number
# df['UP_orders_rate_since_first_order'] = df.UP_orders / (df.user_total_orders - df.UP_first_order_number + 1)
# user_weekno_rev / user_days_rev come from the commented-out block in the "Week No" cell above
df['UP_weeks_sinse_last'] = df.UP_last_order_id.map(orders.user_weekno_rev)
df['UP_days_sinse_last'] = df.UP_last_order_id.map(orders.user_days_rev)
df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - \
df.UP_last_order_id.map(orders.order_hour_of_day)).map(
lambda x: min(x, 24-x)
).astype(np.int8)
#df['UP_same_dow_as_last_order'] = df.UP_last_order_id.map(orders.order_dow) == \
# df.order_id.map(orders.order_dow)
df.drop(['UP_last_order_id', 'z'], axis=1, inplace=True)
gc.collect()
In [13]:
df = pd.DataFrame({'order_id':order_list, 'product_id':product_list})
df.order_id = df.order_id.astype(np.int32)
df.product_id = df.product_id.astype(np.int32)
df['user_id'] = df.order_id.map(orders.user_id)
df["user_id"] = df["user_id"].astype(np.int32)
print('user related features')
df['user_total_orders'] = df.user_id.map(users.nb_orders)
df['user_total_items'] = df.user_id.map(users.total_items)
df['user_total_distinct_items'] = df.user_id.map(users.total_distinct_items)
df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
df['user_average_basket'] = df.user_id.map(users.average_basket)
df['user_period'] = df.user_id.map(users.period)
print('order related features')
# df['dow'] = df.order_id.map(orders.order_dow)
df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders
print('product related features')
df['aisle_id'] = df.product_id.map(products.aisle_id).astype(np.int16)  # 134 aisles overflow int8
df['department_id'] = df.product_id.map(products.department_id).astype(np.int8)
df['product_orders'] = df.product_id.map(products.orders).astype(np.float32)
df['product_users'] = df.product_id.map(products.users).astype(np.float32)
df['product_order_freq'] = df.product_id.map(products.order_freq).astype(np.float32)
df['product_reorders'] = df.product_id.map(products.reorders).astype(np.float32)
df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)
print('user_X_product related features')
df['z'] = df.product_id + df.user_id * 100000
df['UP_orders'] = df.z.map(userXproduct.nb_orders)
df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
# legacy column names from the dict-based userXproduct (see the note in the similar cell above)
df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_pos_in_cart) / df.UP_orders).astype(np.float32)
df['UP_reorders'] = df.z.map(userXproduct.reorders)
df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_id.map(orders.order_number)
df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - \
df.UP_last_order_id.map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x)).astype(np.int8)
# df['UP_days_past_last_buy'] =
#df['UP_same_dow_as_last_order'] = df.UP_last_order_id.map(orders.order_dow) == \
# df.order_id.map(orders.order_dow)
df.drop(['UP_last_order_id', 'z'], axis=1, inplace=True)
In [4]:
ds = pd.read_pickle("../../imba/data/dataset.pkl")
In [ ]: