In [1]:
# Author : Paul-Antoine Nguyen
# This script considers all the products a user has ordered
#
# We train a model computing the probability of reorder on the "train" data
#
# For the submission, we keep the orders that have a probability of
# reorder higher than a threshold
# some overhead because of kernel memory limits
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
import gc
from tqdm import tqdm, tqdm_notebook
tqdm.pandas(desc="")
tqdm_notebook().pandas(desc="")
%load_ext ipycache
IDIR = 'input/'
print('loading prior')
priors = pd.read_csv(IDIR + 'order_products__prior.csv')
print('loading train')
op_train = pd.read_csv(IDIR + 'order_products__train.csv')
print('loading orders')
orders = pd.read_csv(IDIR + 'orders.csv')
print('loading products')
products = pd.read_csv(IDIR + 'products.csv')
departments = pd.read_csv(IDIR + 'departments.csv', engine='c')
aisles = pd.read_csv(IDIR + 'aisles.csv', engine='c')
print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
print('train {}: {}'.format(op_train.shape, ', '.join(op_train.columns)))
print('Total departments: {}'.format(departments.shape[0]))
print('Total aisles: {}'.format(aisles.shape[0]))
In [2]:
orders.order_dow = orders.order_dow.astype(np.int8)
orders.order_hour_of_day = orders.order_hour_of_day.astype(np.int8)
orders.order_number = orders.order_number.astype(np.int16)
orders.order_id = orders.order_id.astype(np.int32)
orders.user_id = orders.user_id.astype(np.int32)
orders.days_since_prior_order = orders.days_since_prior_order.astype(np.float32)
orders.set_index('order_id', inplace=True, drop=False)
products.drop(['product_name'], axis=1, inplace=True)
products.aisle_id = products.aisle_id.astype(np.int8)
products.department_id = products.department_id.astype(np.int8)
products.product_id = products.product_id.astype(np.int32)
op_train.reordered = op_train.reordered.astype(np.int8)
op_train.add_to_cart_order = op_train.add_to_cart_order.astype(np.int16)
op_train.set_index(['order_id', 'product_id'], inplace=True, drop=False)
priors.order_id = priors.order_id.astype(np.int32)
priors.add_to_cart_order = priors.add_to_cart_order.astype(np.int16)
priors.reordered = priors.reordered.astype(np.int8)
priors.product_id = priors.product_id.astype(np.int32)
https://www.kaggle.com/c/instacart-market-basket-analysis/discussion/35468
Here are some feature ideas that can help new participants get started and may be you will find something you have missed:
In [3]:
priors = priors.join(orders, on='order_id', rsuffix='_')
priors = priors.join(products, on='product_id', rsuffix='_')
priors.drop(['product_id_', 'order_id_'], inplace=True, axis=1)
In [4]:
prods = pd.DataFrame()
prods['orders'] = priors.groupby(priors.product_id).size().astype(np.float32)
prods['order_freq'] = prods['orders'] / len(priors.order_id.unique())
prods['users'] = priors.groupby(priors.product_id).user_id.unique().apply(len)
prods['add_to_cart_order_mean'] = priors.groupby(priors.product_id).add_to_cart_order.mean()
prods['add_to_cart_order_std'] = priors.groupby(priors.product_id).add_to_cart_order.std()
prods['reorders'] = priors['reordered'].groupby(priors.product_id).sum().astype(np.float32)
prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)
products = products.join(prods, on='product_id')
products.set_index('product_id', drop=False, inplace=True)
del prods
In [5]:
usr = pd.DataFrame()
usr['average_days_between_orders'] = orders.groupby('user_id')['days_since_prior_order'].mean().astype(np.float32)
usr["period"] = orders.groupby('user_id').days_since_prior_order.fillna(0).sum()
usr['nb_orders'] = orders.groupby('user_id').size().astype(np.int16)
users = pd.DataFrame()
users['total_items'] = priors.groupby('user_id').size().astype(np.int16)
users['all_products'] = priors.groupby('user_id')['product_id'].apply(set)
users['total_distinct_items'] = (users.all_products.map(len)).astype(np.int16)
users = users.join(usr)
del usr
users['average_basket'] = (users.total_items / users.nb_orders).astype(np.float32)
gc.collect()
print('user f', users.shape)
Out[5]:
In [9]:
# %%cache userXproduct.pkl userXproduct
priors['user_product'] = priors.product_id + priors.user_id * 100000
d = dict()
for row in tqdm(priors.itertuples(), total=len(priors)):
z = row.user_product
if z not in d:
d[z] = (
1,
(row.order_number, row.order_id),
row.add_to_cart_order,
row.reordered
)
else:
d[z] = (
d[z][0] + 1,
max(d[z][1], (row.order_number, row.order_id)),
d[z][2] + row.add_to_cart_order,
d[z][3] + row.reordered
)
print('to dataframe (less memory)')
d = pd.DataFrame.from_dict(d, orient='index')
d.columns = ['nb_orders', 'last_order_id', 'sum_pos_in_cart', 'reorders']
d.nb_orders = d.nb_orders.astype(np.int16)
d.last_order_id = d.last_order_id.map(lambda x: x[1]).astype(np.int32)
d.sum_pos_in_cart = d.sum_pos_in_cart.astype(np.int16)
userXproduct = d
print('user X product f', len(userXproduct))
In [7]:
### build list of candidate products to reorder, with features ###
train_index = set(op_train.index)
def features(selected_orders, labels_given=False):
order_list = []
product_list = []
labels = []
for row in tqdm(selected_orders.itertuples(), total=len(selected_orders)):
order_id = row.order_id
user_id = row.user_id
user_products = users.all_products[user_id]
product_list += user_products
order_list += [order_id] * len(user_products)
if labels_given:
labels += [(order_id, product) in train_index for product in user_products]
df = pd.DataFrame({'order_id':order_list, 'product_id':product_list})
df.order_id = df.order_id.astype(np.int32)
df.product_id = df.product_id.astype(np.int32)
labels = np.array(labels, dtype=np.int8)
del order_list
del product_list
print('user related features')
df['user_id'] = df.order_id.map(orders.user_id).astype(np.int32)
df['user_total_orders'] = df.user_id.map(users.nb_orders)
df['user_total_items'] = df.user_id.map(users.total_items)
df['user_total_distinct_items'] = df.user_id.map(users.total_distinct_items)
df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
df['user_average_basket'] = df.user_id.map(users.average_basket)
df['user_period'] = df.user_id.map(users.period)
print('order related features')
# df['dow'] = df.order_id.map(orders.order_dow)
df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders
print('product related features')
df['aisle_id'] = df.product_id.map(products.aisle_id).astype(np.int8)
df['department_id'] = df.product_id.map(products.department_id).astype(np.int8)
df['product_orders'] = df.product_id.map(products.orders).astype(np.float32)
df['product_users'] = df.product_id.map(products.users).astype(np.float32)
df['product_order_freq'] = df.product_id.map(products.order_freq).astype(np.float32)
df['product_reorders'] = df.product_id.map(products.reorders).astype(np.float32)
df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)
print('user_X_product related features')
df['z'] = df.product_id + df.user_id * 100000
df['UP_orders'] = df.z.map(userXproduct.nb_orders)
df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_pos_in_cart) / df.UP_orders).astype(np.float32)
df['UP_reorders'] = df.z.map(userXproduct.reorders)
df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_id.map(orders.order_number)
df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - \
df.UP_last_order_id.map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x)).astype(np.int8)
# df['UP_days_past_last_buy'] =
#df['UP_same_dow_as_last_order'] = df.UP_last_order_id.map(orders.order_dow) == \
# df.order_id.map(orders.order_dow)
df.drop(['UP_last_order_id', 'z'], axis=1, inplace=True)
gc.collect()
return (df, labels)
In [10]:
### train / test orders ###
print('split orders : train, test')
test_orders = orders[orders.eval_set == 'test']
train_orders = orders[orders.eval_set == 'train']
df_train, labels = features(train_orders, labels_given=True)
df_test, _ = features(test_orders)
In [11]:
f_to_use = [
'user_total_orders', 'user_total_items', 'user_total_distinct_items',
'user_average_days_between_orders', 'user_average_basket',
'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
'aisle_id', 'department_id', 'product_orders', 'product_reorders',
'product_reorder_rate', 'UP_orders', 'UP_orders_ratio',
'UP_average_pos_in_cart', 'UP_reorders', 'UP_orders_since_last',
'UP_delta_hour_vs_last'
]
def feature_select(df):
return df.drop(["user_id", "order_id", "product_id"], axis=1, errors="ignore")
In [12]:
params = {
'task': 'train',
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': {'binary_logloss'},
'num_leaves': 96,
'feature_fraction': 0.9,
'bagging_fraction': 0.95,
'bagging_freq': 5
}
ROUNDS = 98
def train(traindf, y):
d_train = lgb.Dataset(
feature_select(traindf),
label=y,
categorical_feature=['aisle_id', 'department_id']
)
model = lgb.train(params, d_train, ROUNDS)
return model
In [13]:
model = train(df_train, labels)
In [17]:
def predict(model, df_test, TRESHOLD=0.19, predicted_basket_size=None):
### build candidates list for test ###
df_test['pred'] = model.predict(feature_select(df_test))
d = dict()
if not predicted_basket_size:
for row in df_test.itertuples():
if row.pred > TRESHOLD:
try:
d[row.order_id] += ' ' + str(row.product_id)
except KeyError:
d[row.order_id] = str(row.product_id)
else:
# Вот тут можно отрезать не по threshold, а с помощью модели определять кол-во покупок
current_order_id = None
current_order_count = 0
for row in df_test.sort_values(
by=["order_id", "pred"],
ascending=[False, False]
).itertuples():
order_id = row.order_id
if order_id != current_order_id:
current_order_id = order_id
current_order_count = 0
if current_order_count >= predicted_basket_size[current_order_id]:
continue
current_order_count += 1
try:
d[order_id] += ' ' + str(row.product_id)
except KeyError:
d[order_id] = str(row.product_id)
for order_id in df_test.order_id:
if order_id not in d:
d[order_id] = 'None'
sub = pd.DataFrame.from_dict(d, orient='index')
sub.reset_index(inplace=True)
sub.columns = ['order_id', 'products']
return sub
In [18]:
# Загружаем предсказанное кол-во покупок
predicted_basket_size = pd.read_csv("test_orders_products_count.csv", index_col="order_id")
predicted_basket_size = predicted_basket_size["pred_products_count"].to_dict()
In [19]:
sub = predict(model, df_test, predicted_basket_size=predicted_basket_size)
sub.to_csv('sub.csv', index=False)
In [ ]:
lgb.cv(params, d_train, ROUNDS, nfold=5, verbose_eval=10)
In [12]:
%%cache df_train_gt.pkl df_train_gt
from functools import partial
products_raw = pd.read_csv(IDIR + 'products.csv')
# combine aisles, departments and products (left joined to products)
goods = pd.merge(left=pd.merge(left=products_raw, right=departments, how='left'), right=aisles, how='left')
# to retain '-' and make product names more "standard"
goods.product_name = goods.product_name.str.replace(' ', '_').str.lower()
# retype goods to reduce memory usage
goods.product_id = goods.product_id.astype(np.int32)
goods.aisle_id = goods.aisle_id.astype(np.int16)
goods.department_id = goods.department_id.astype(np.int8)
# initialize it with train dataset
train_details = pd.merge(
left=op_train,
right=orders,
how='left',
on='order_id'
).apply(partial(pd.to_numeric, errors='ignore', downcast='integer'))
# add order hierarchy
train_details = pd.merge(
left=train_details,
right=goods[['product_id',
'aisle_id',
'department_id']].apply(partial(pd.to_numeric,
errors='ignore',
downcast='integer')),
how='left',
on='product_id'
)
train_gtl = []
for uid, subset in train_details.groupby('user_id'):
subset1 = subset[subset.reordered == 1]
oid = subset.order_id.values[0]
if len(subset1) == 0:
train_gtl.append((oid, 'None'))
continue
ostr = ' '.join([str(int(e)) for e in subset1.product_id.values])
# .strip is needed because join can have a padding space at the end
train_gtl.append((oid, ostr.strip()))
del train_details
del goods
del products_raw
gc.collect()
df_train_gt = pd.DataFrame(train_gtl)
df_train_gt.columns = ['order_id', 'products']
df_train_gt.set_index('order_id', inplace=True)
df_train_gt.sort_index(inplace=True)
In [14]:
from sklearn.model_selection import GroupKFold
def f1_score(cvpred):
joined = df_train_gt.join(cvpred, rsuffix="_cv", how="inner")
lgts = joined.products.replace("None", "-1").apply(lambda x: x.split(" ")).values
lpreds = joined.products_cv.replace("None", "-1").apply(lambda x: x.split(" ")).values
f1 = []
for lgt, lpred in zip(lgts, lpreds):
rr = (np.intersect1d(lgt, lpred))
precision = np.float(len(rr)) / len(lpred)
recall = np.float(len(rr)) / len(lgt)
denom = precision + recall
f1.append(((2 * precision * recall) / denom) if denom > 0 else 0)
return np.mean(f1)
def cv(threshold=0.22):
gkf = GroupKFold(n_splits=5)
scores = []
for train_idx, test_idx in gkf.split(df_train.index, groups=df_train.user_id):
dftrain = df_train.iloc[train_idx]
dftest = df_train.iloc[test_idx]
y = labels[train_idx]
model = train(dftrain, y)
pred = predict(model, dftest, threshold).set_index("order_id")
f1 = f1_score(pred)
print f1
scores.append(f1)
del dftrain
del dftest
gc.collect()
return np.mean(scores), np.std(scores)
In [ ]:
cv()
In [18]:
for th in np.arange(0.18, 0.22, 0.01):
print th
print cv(threshold=th)
print
0.372658477911
In [17]:
0.18
0.375669602808
0.37518960199
0.376068733519
0.374880658158
0.371575669134
(0.37467685312194482, 0.0016027896306283745)
0.19
0.375981281546
0.375613273106
0.37623495823
0.374958453045
0.371884026622
(0.3749343985097483, 0.0015845275427144021)
0.2
0.376141810192
0.375593739202
0.375961736002
0.375124046483
0.371748172351
(0.37491390084571824, 0.001620734287706205)
0.21
0.375454836995
0.374657579102
0.375585106194
0.374639123067
0.371277685501
(0.37432286617177202, 0.0015722458019732746)
0.2
0.376141810192
0.375593739202
0.375961736002
0.375124046483
0.371748172351
(0.37491390084571824, 0.001620734287706205)
0.374504880043
0.372459365153
0.374241429517
0.373332070018
0.370178093483
(0.37294316764289259, 0.0015591904647740879) 0.22
0.370290530162
0.369518178297
0.370515696117
0.369568282123
0.3673846793
(0.36945547319979183, 0.0011069090226251931) 0.24
0.363691285892
0.363725106289
0.363492700824
0.364412180878
0.363024994542
(0.36366925368510306, 0.00044761289123321511) 0.26
In [6]:
prior_orders_count = priors[["order_id", "reordered"]].groupby("order_id").sum()
prior_orders_count = prior_orders_count.rename(columns={"reordered": "product_counts"})
train_orders_count = op_train.drop(["product_id", "order_id"], axis=1, errors="ignore")
train_orders_count = train_orders_count.reset_index()[["order_id", "reordered"]].groupby("order_id").sum()
train_orders_count = train_orders_count.rename(columns={"reordered": "product_counts"})
prior_orders_count = orders.join(prior_orders_count, how='inner')
train_orders_count = orders.join(train_orders_count, how='inner')
def extend_prev_prod_count(df, period=1):
global prior_orders_count
prior_orders_count["next_order_number"] = prior_orders_count["order_number"] + period
mdf = prior_orders_count[["user_id", "next_order_number", "product_counts"]]
mdf = mdf.add_suffix("_prev%s" % period)
try:
return df.merge(
mdf,
left_on=["user_id", "order_number"],
right_on=["user_id_prev%s" % period, "next_order_number_prev%s" % period],
how="left",
).drop([
"next_order_number",
"next_order_number_prev%s" % period,
"user_id_prev%s" % period,
], axis=1, errors="ignore")
finally:
prior_orders_count.drop("next_order_number", axis=1, inplace=True)
train_orders_count = extend_prev_prod_count(train_orders_count, 1)
train_orders_count = extend_prev_prod_count(train_orders_count, 2)
prior_orders_count = extend_prev_prod_count(prior_orders_count, 1)
prior_orders_count = extend_prev_prod_count(prior_orders_count, 2)
prior_orders_count.head(15)
Out[6]:
In [7]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
def get_order_count(order, alpha=0.5):
user_id = order["user_id"]
df = prior_orders_count[prior_orders_count["user_id"] == user_id]
feats = [
"order_number", "product_counts_prev1", "product_counts_prev2",
"order_dow", "order_hour_of_day", "days_since_prior_order"
]
X = df[feats].values
# X = np.nan_to_num(X, 0)
y = df["product_counts"].values
# create dataset for lightgbm
# lgb_train = lgb.Dataset(X, y)
# params = {
# 'task': 'train',
# 'boosting_type': 'gbdt',
# 'objective': 'regression',
# 'metric': {'rmse'},
# 'num_leaves': 100,
# 'learning_rate': 0.01,
# 'feature_fraction': 0.9,
# 'bagging_fraction': 0.8,
# 'bagging_freq': 5,
# 'verbose': 0,
# }
# clf = lgb.train(params,
# lgb_train,
# num_boost_round=40)
xgb_params = {
'max_depth': 3,
'n_estimators': 70,
'learning_rate': 0.05,
'objective': 'reg:linear',
'eval_metric': 'rmse',
'silent': 1
}
dtrain_all = xgb.DMatrix(X, y)
clf = xgb.train(xgb_params, dtrain_all, num_boost_round=400)
# clf = Lasso(alpha=0.01)
# clf.fit(X, y)
Xpred = np.array([order[f] or 0 for f in feats]).reshape(1, -1)
# Xpred = np.nan_to_num(Xpred, 0)
Xpred = xgb.DMatrix(Xpred)
return int(round(np.round(clf.predict(Xpred)[0])))
df = train_orders_count.head(100)
df["pred_products_count"] = df.apply(get_order_count, axis=1)
print(mean_squared_error(
df["product_counts"],
df["pred_products_count"]
))
In [8]:
df = orders[orders.eval_set == 'test']
df = extend_prev_prod_count(df, 1)
df = extend_prev_prod_count(df, 2)
df["pred_products_count"] = df.progress_apply(get_order_count, axis=1)
df.to_csv("test_orders_products_count.csv", index=False, header=True)
df.head()
Out[8]:
In [ ]: