In [1]:
# This script considers all the products a user has ordered
#
# We train a model computing the probability of reorder on the "train" data
#
# For the submission, we keep the orders that have a probability of
# reorder higher than a threshold
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
import gc
from tqdm import tqdm, tqdm_notebook
tqdm.pandas(desc="")
%load_ext ipycache
IDIR = 'input/'
print('loading prior')
priors = pd.read_csv(
IDIR + 'order_products__prior.csv',
dtype=dict(
order_id=np.int32,
add_to_cart_order=np.int16,
reordered=np.int8,
product_id=np.int32
)
)
print('loading train')
op_train = pd.read_csv(
IDIR + 'order_products__train.csv',
dtype=dict(reordered=np.int8, add_to_cart_order=np.int16)
)
op_train.set_index(['order_id', 'product_id'], inplace=True, drop=False)
print('loading orders')
eval_sets = ["prior", "train", "test"]
orders = pd.read_csv(
IDIR + 'orders.csv',
dtype=dict(
order_dow=np.int8,
order_hour_of_day=np.int8,
order_number=np.int16,
order_id=np.int32,
user_id=np.int32,
days_since_prior_order=np.float32,
)
)
orders["eval_set"] = orders["eval_set"].apply(eval_sets.index).astype(np.int8)
orders.set_index('order_id', inplace=True, drop=False)
print('loading products')
products = pd.read_csv(
IDIR + 'products.csv',
dtype=dict(
aisle_id=np.int16,
department_id=np.int16,
product_id=np.int32
)
)
products.drop(['product_name'], axis=1, inplace=True)
departments = pd.read_csv(IDIR + 'departments.csv')
aisles = pd.read_csv(IDIR + 'aisles.csv')
print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
print('train {}: {}'.format(op_train.shape, ', '.join(op_train.columns)))
print('Total departments: {}'.format(departments.shape[0]))
print('Total aisles: {}'.format(aisles.shape[0]))
In [2]:
# Week No
o1_gr = orders.groupby("user_id").agg({"days_since_prior_order": "cumsum"})
orders["user_weekno"] = (o1_gr["days_since_prior_order"] / 7).round().fillna(0)
orders = orders.merge(
orders.groupby("user_id").agg({"user_weekno": "max"}).rename(
columns={"user_weekno": "user_weekno_max"}
).reset_index(),
on="user_id",
how="left"
)
orders["user_weekno_rev"] = abs(orders.user_weekno_max - orders.user_weekno).astype(np.int8)
orders = orders.drop("user_weekno_max", axis=1)
gc.collect()
Out[2]:
https://www.kaggle.com/c/instacart-market-basket-analysis/discussion/35468
Here are some feature ideas that can help new participants get started and may be you will find something you have missed:
In [3]:
priors = priors.join(orders, on='order_id', rsuffix='_')
priors = priors.join(products, on='product_id', rsuffix='_')
priors.drop(['product_id_', 'order_id_'], inplace=True, axis=1)
In [4]:
usr = pd.DataFrame()
o_grouped = orders.groupby('user_id')
p_grouped = priors.groupby('user_id')
usr['average_days_between_orders'] = o_grouped.days_since_prior_order.mean()
usr['max_days_between_orders'] = o_grouped.days_since_prior_order.max()
usr['min_days_between_orders'] = o_grouped.days_since_prior_order.min()
usr['std_days_between_orders'] = o_grouped.days_since_prior_order.std()
usr["period"] = o_grouped.days_since_prior_order.fillna(0).sum()
usr['nb_orders'] = o_grouped.size().astype(np.int16)
users = pd.DataFrame()
users['total_items'] = p_grouped.size().astype(np.int16)
users['all_products'] = p_grouped['product_id'].apply(set)
users['total_distinct_items'] = (users.all_products.map(len)).astype(np.int16)
users['reorders'] = p_grouped["reordered"].sum()
users['reorder_rate'] = (users.reorders / usr.nb_orders)
users = users.join(usr)
del usr, o_grouped, p_grouped
users['average_basket'] = (users.total_items / users.nb_orders).astype(np.float32)
gc.collect()
print('user f', users.shape)
Out[4]:
In [5]:
def merge_user_features(df):
df['user_total_orders'] = df.user_id.map(users.nb_orders)
df['user_total_items'] = df.user_id.map(users.total_items)
df['user_total_distinct_items'] = df.user_id.map(users.total_distinct_items)
df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
df['user_max_days_between_orders'] = df.user_id.map(users.max_days_between_orders)
df['user_min_days_between_orders'] = df.user_id.map(users.min_days_between_orders)
df['user_std_days_between_orders'] = df.user_id.map(users.std_days_between_orders)
df['user_average_basket'] = df.user_id.map(users.average_basket)
df['user_period'] = df.user_id.map(users.period)
return df
In [6]:
prods = pd.DataFrame()
p_grouped = priors.groupby(priors.product_id)
prods['orders'] = p_grouped.size().astype(np.float32)
prods['order_freq'] = (prods['orders'] / len(priors.order_id.unique()))
prods['users'] = p_grouped.user_id.unique().apply(len)
prods['add_to_cart_order_mean'] = p_grouped.add_to_cart_order.mean().astype(np.float32)
prods['add_to_cart_order_std'] = p_grouped.add_to_cart_order.std().astype(np.float32)
prods['reorders'] = p_grouped['reordered'].sum().astype(np.float32)
prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)
products = products.join(prods, on='product_id')
products.set_index('product_id', drop=False, inplace=True)
del prods, p_grouped
gc.collect()
Out[6]:
In [7]:
def merge_product_features(df):
df['product_orders'] = df.product_id.map(products.orders).astype(np.float32)
df['product_users'] = df.product_id.map(products.users).astype(np.float32)
df['product_order_freq'] = df.product_id.map(products.order_freq).astype(np.float32)
df['product_reorders'] = df.product_id.map(products.reorders).astype(np.float32)
df['product_reorder_rate'] = df.product_id.map(products.reorder_rate).astype(np.float32)
df['product_add_to_cart_order_mean'] = df.product_id.map(products.add_to_cart_order_mean).astype(np.float32)
df['product_add_to_cart_order_std'] = df.product_id.map(products.add_to_cart_order_std).astype(np.float32)
return df
In [8]:
prods = pd.DataFrame()
p_grouped = priors.groupby(priors.aisle_id)
prods['orders'] = p_grouped.size().astype(np.float32)
prods['order_freq'] = (prods['orders'] / len(priors.order_id.unique())).astype(np.float32)
prods['users'] = p_grouped.user_id.unique().apply(len).astype(np.float32)
prods['add_to_cart_order_mean'] = p_grouped.add_to_cart_order.mean().astype(np.float32)
prods['add_to_cart_order_std'] = p_grouped.add_to_cart_order.std().astype(np.float32)
prods['reorders'] = p_grouped['reordered'].sum().astype(np.float32)
prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)
aisles.set_index('aisle_id', drop=False, inplace=True)
aisles = aisles.join(prods)
del prods, p_grouped
In [9]:
def merge_aisle_features(df):
df['aisle_orders'] = df.aisle_id.map(aisles.orders)
df['aisle_users'] = df.aisle_id.map(aisles.users)
df['aisle_order_freq'] = df.aisle_id.map(aisles.order_freq)
df['aisle_reorders'] = df.aisle_id.map(aisles.reorders)
df['aisle_reorder_rate'] = df.aisle_id.map(aisles.reorder_rate)
df['aisle_add_to_cart_order_mean'] = df.aisle_id.map(aisles.add_to_cart_order_mean)
df['aisle_add_to_cart_order_std'] = df.aisle_id.map(aisles.add_to_cart_order_std)
return df
In [10]:
prods = pd.DataFrame()
p_grouped = priors.groupby(priors.department_id)
prods['orders'] = p_grouped.size().astype(np.float32)
prods['order_freq'] = (prods['orders'] / len(priors.order_id.unique())).astype(np.float32)
prods['users'] = p_grouped.user_id.unique().apply(len).astype(np.float32)
prods['add_to_cart_order_mean'] = p_grouped.add_to_cart_order.mean().astype(np.float32)
prods['add_to_cart_order_std'] = p_grouped.add_to_cart_order.std().astype(np.float32)
prods['reorders'] = p_grouped['reordered'].sum().astype(np.float32)
prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)
departments.set_index('department_id', drop=False, inplace=True)
departments = departments.join(prods)
del prods, p_grouped
In [11]:
def merge_department_features(df):
df['department_orders'] = df.department_id.map(departments.orders)
df['department_users'] = df.department_id.map(departments.users)
df['department_order_freq'] = df.department_id.map(departments.order_freq)
df['department_reorders'] = df.department_id.map(departments.reorders)
df['department_reorder_rate'] = df.department_id.map(departments.reorder_rate)
df['department_add_to_cart_order_mean'] = df.department_id.map(departments.add_to_cart_order_mean)
df['department_add_to_cart_order_std'] = df.department_id.map(departments.add_to_cart_order_std)
return df
In [12]:
%%cache userXproduct.pkl userXproduct
priors['z'] = priors.product_id + priors.user_id * 100000
d = dict()
for row in tqdm_notebook(priors.itertuples(), total=len(priors)):
z = row.z
if z not in d:
d[z] = (
1,
(row.order_number, row.order_id),
row.add_to_cart_order,
row.reordered
)
else:
d[z] = (
d[z][0] + 1,
max(d[z][1], (row.order_number, row.order_id)),
d[z][2] + row.add_to_cart_order,
d[z][3] + row.reordered
)
priors.drop(['z'], axis=1, inplace=True)
print('to dataframe (less memory)')
userXproduct = pd.DataFrame.from_dict(d, orient='index')
del d
gc.collect()
userXproduct.columns = ['nb_orders', 'last_order_id', 'sum_pos_in_cart', 'reorders']
userXproduct.nb_orders = userXproduct.nb_orders.astype(np.int16)
userXproduct.last_order_id = userXproduct.last_order_id.map(lambda x: x[1]).astype(np.int32)
userXproduct.sum_pos_in_cart = userXproduct.sum_pos_in_cart.astype(np.int16)
userXproduct.reorders = userXproduct.reorders.astype(np.int16)
print('user X product f', len(userXproduct))
In [13]:
def merge_user_X_product_features(df):
df['z'] = df.product_id + df.user_id * 100000
df['UP_orders'] = df.z.map(userXproduct.nb_orders)
df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders)
df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_pos_in_cart) / df.UP_orders)
df['UP_reorders'] = df.z.map(userXproduct.reorders)
df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_id.map(orders.order_number)
#df['UP_days_since_last'] =
# df['UP_delta_hour_vs_last'] = abs(
# df.order_hour_of_day - df.UP_last_order_id.map(orders.order_hour_of_day)
# ).map(lambda x: min(x, 24-x)).astype(np.int8)
#df['UP_same_dow_as_last_order'] = df.UP_last_order_id.map(orders.order_dow) == \
# df.order_id.map(orders.order_dow)
df.drop(['UP_last_order_id', 'z'], axis=1, inplace=True)
return df
In [14]:
### build list of candidate products to reorder, with features ###
train_index = set(op_train.index)
def features(selected_orders, labels_given=False):
order_list = []
product_list = []
labels = []
for row in tqdm_notebook(
selected_orders.itertuples(),
total=len(selected_orders)
):
order_id = row.order_id
user_id = row.user_id
user_products = users.all_products[user_id]
product_list += user_products
order_list += [order_id] * len(user_products)
if labels_given:
labels += [
(order_id, product) in train_index
for product in user_products
]
df = pd.DataFrame({'order_id': order_list, 'product_id': product_list})
df.order_id = df.order_id.astype(np.int32)
df.product_id = df.product_id.astype(np.int32)
del order_list
del product_list
df['user_id'] = df.order_id.map(orders.user_id)
df['aisle_id'] = df.product_id.map(products.aisle_id).astype(np.int8)
df['department_id'] = df.product_id.map(products.department_id).astype(np.int8)
labels = np.array(labels, dtype=np.int8)
print('user related features')
df = merge_user_features(df)
print('order related features')
df['dow'] = df.order_id.map(orders.order_dow)
df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders
print('product related features')
df = merge_product_features(df)
print('aisle related features')
df = merge_aisle_features(df)
print('department related features')
df = merge_department_features(df)
print('user_X_product related features')
df = merge_user_X_product_features(df)
return (df, labels)
In [15]:
# %%cache dataset.pkl df_train df_test labels
### train / test orders ###
print('split orders : train, test')
test_orders = orders[orders.eval_set == 2]
train_orders = orders[orders.eval_set == 1]
df_train, labels = features(train_orders, labels_given=True)
del train_orders
gc.collect()
df_test, _ = features(test_orders, labels_given=False)
del test_orders
gc.collect()
In [28]:
f_to_use = [
'user_total_orders', 'user_total_items', 'user_total_distinct_items',
'user_average_days_between_orders', 'user_average_basket',
'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
'aisle_id', 'department_id', 'product_orders', 'product_reorders',
'product_reorder_rate', 'UP_orders', 'UP_orders_ratio',
'UP_average_pos_in_cart', 'UP_reorders', 'UP_orders_since_last',
'UP_delta_hour_vs_last'
]
def feature_select(df):
# return df[f_to_use]
return df.drop(
["user_id", "order_id", "product_id"],
axis=1, errors="ignore"
)
In [24]:
params = {
'task': 'train',
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': {'binary_logloss'},
'num_leaves': 96,
'feature_fraction': 0.9,
'bagging_fraction': 0.95,
'bagging_freq': 5
}
ROUNDS = 98
def train(traindf, y):
d_train = lgb.Dataset(
feature_select(traindf),
label=y,
categorical_feature=['aisle_id', 'department_id']
)
model = lgb.train(params, d_train, ROUNDS)
return model
In [17]:
model = train(df_train, labels)
In [25]:
def predict(model, df_test, TRESHOLD=0.19):
### build candidates list for test ###
df_test['pred'] = model.predict(feature_select(df_test))
# Нужно добавить https://www.kaggle.com/mmueller/f1-score-expectation-maximization-in-o-n/code
d = dict()
for row in df_test.itertuples():
# Вот тут можно отрезать не по threshold, а с помощью модели определять кол-во покупок
if row.pred > TRESHOLD:
try:
d[row.order_id] += ' ' + str(row.product_id)
except KeyError:
d[row.order_id] = str(row.product_id)
for order_id in df_test.order_id:
if order_id not in d:
d[order_id] = 'None'
sub = pd.DataFrame.from_dict(d, orient='index')
sub.reset_index(inplace=True)
sub.columns = ['order_id', 'products']
return sub
In [19]:
sub = predict(model, df_test)
sub.to_csv('sub.csv', index=False)
In [ ]:
lgb.cv(params, d_train, ROUNDS, nfold=5, verbose_eval=10)
In [20]:
%%cache df_train_gt.pkl df_train_gt
from functools import partial
products_raw = pd.read_csv(IDIR + 'products.csv')
# combine aisles, departments and products (left joined to products)
goods = pd.merge(
left=pd.merge(
left=products_raw,
right=departments, how='left'
),
right=aisles, how='left'
)
# to retain '-' and make product names more "standard"
goods.product_name = goods.product_name.str.replace(' ', '_').str.lower()
# retype goods to reduce memory usage
goods.product_id = goods.product_id.astype(np.int32)
goods.aisle_id = goods.aisle_id.astype(np.int16)
goods.department_id = goods.department_id.astype(np.int8)
# initialize it with train dataset
train_details = pd.merge(
left=op_train,
right=orders,
how='left',
on='order_id'
).apply(partial(pd.to_numeric, errors='ignore', downcast='integer'))
# add order hierarchy
train_details = pd.merge(
left=train_details,
right=goods[['product_id',
'aisle_id',
'department_id']].apply(partial(pd.to_numeric,
errors='ignore',
downcast='integer')),
how='left',
on='product_id'
)
train_gtl = []
for uid, subset in train_details.groupby('user_id'):
subset1 = subset[subset.reordered == 1]
oid = subset.order_id.values[0]
if len(subset1) == 0:
train_gtl.append((oid, 'None'))
continue
ostr = ' '.join([str(int(e)) for e in subset1.product_id.values])
# .strip is needed because join can have a padding space at the end
train_gtl.append((oid, ostr.strip()))
del train_details
del goods
del products_raw
gc.collect()
df_train_gt = pd.DataFrame(train_gtl)
df_train_gt.columns = ['order_id', 'products']
df_train_gt.set_index('order_id', inplace=True)
df_train_gt.sort_index(inplace=True)
In [26]:
from sklearn.model_selection import GroupKFold
def f1_score(cvpred):
joined = df_train_gt.join(cvpred, rsuffix="_cv", how="inner")
lgts = joined.products.replace("None", "-1").apply(lambda x: x.split(" ")).values
lpreds = joined.products_cv.replace("None", "-1").apply(lambda x: x.split(" ")).values
f1 = []
for lgt, lpred in zip(lgts, lpreds):
rr = (np.intersect1d(lgt, lpred))
precision = np.float(len(rr)) / len(lpred)
recall = np.float(len(rr)) / len(lgt)
denom = precision + recall
f1.append(((2 * precision * recall) / denom) if denom > 0 else 0)
return np.mean(f1)
def cv(threshold=0.22):
gkf = GroupKFold(n_splits=5)
scores = []
for train_idx, test_idx in gkf.split(df_train.index, groups=df_train.user_id):
dftrain = df_train.iloc[train_idx]
dftest = df_train.iloc[test_idx]
y = labels[train_idx]
model = train(dftrain, y)
pred = predict(model, dftest, threshold).set_index("order_id")
f1 = f1_score(pred)
print(f1)
scores.append(f1)
del dftrain
del dftest
gc.collect()
return np.mean(scores), np.std(scores)
In [29]:
cv()
In [18]:
for th in np.arange(0.18, 0.22, 0.01):
print th
print cv(threshold=th)
print
0.372658477911
In [17]:
0.18
0.375669602808
0.37518960199
0.376068733519
0.374880658158
0.371575669134
(0.37467685312194482, 0.0016027896306283745)
0.19
0.375981281546
0.375613273106
0.37623495823
0.374958453045
0.371884026622
(0.3749343985097483, 0.0015845275427144021)
0.2
0.376141810192
0.375593739202
0.375961736002
0.375124046483
0.371748172351
(0.37491390084571824, 0.001620734287706205)
0.21
0.375454836995
0.374657579102
0.375585106194
0.374639123067
0.371277685501
(0.37432286617177202, 0.0015722458019732746)
0.2
0.376141810192
0.375593739202
0.375961736002
0.375124046483
0.371748172351
(0.37491390084571824, 0.001620734287706205)
0.374504880043
0.372459365153
0.374241429517
0.373332070018
0.370178093483
(0.37294316764289259, 0.0015591904647740879) 0.22
0.370290530162
0.369518178297
0.370515696117
0.369568282123
0.3673846793
(0.36945547319979183, 0.0011069090226251931) 0.24
0.363691285892
0.363725106289
0.363492700824
0.364412180878
0.363024994542
(0.36366925368510306, 0.00044761289123321511) 0.26
In [3]:
prior_orders_count = priors[["order_id", "product_id"]].groupby("order_id").count()
prior_orders_count = prior_orders_count.rename(columns={"product_id": "product_counts"})
train_orders_count = op_train.drop(["product_id", "order_id"], axis=1, errors="ignore")
train_orders_count = train_orders_count.reset_index()[["order_id", "product_id"]].groupby("order_id").count()
train_orders_count = train_orders_count.rename(columns={"product_id": "product_counts"})
prior_orders_count = orders.join(prior_orders_count, how='inner')
train_orders_count = orders.join(train_orders_count, how='inner')
prior_orders_count.head(15)
Out[3]:
In [13]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
def get_order_count(order, alpha=0.5):
user_id = order["user_id"]
df = prior_orders_count[prior_orders_count["user_id"] == user_id]
feats = ["order_number", "order_dow", "order_hour_of_day", "days_since_prior_order"]
X = df[feats].fillna(0).values
y = df["product_counts"].values
# create dataset for lightgbm
# lgb_train = lgb.Dataset(X, y)
# params = {
# 'task': 'train',
# 'boosting_type': 'gbdt',
# 'objective': 'regression',
# 'metric': {'rmse'},
# 'num_leaves': 100,
# 'learning_rate': 0.01,
# 'feature_fraction': 0.9,
# 'bagging_fraction': 0.8,
# 'bagging_freq': 5,
# 'verbose': 0,
# }
# clf = lgb.train(params,
# lgb_train,
# num_boost_round=40)
xgb_params = {
'max_depth': 5,
'n_estimators': 200,
'learning_rate': 0.05,
'objective': 'reg:linear',
'eval_metric': 'rmse',
'silent': 1
}
dtrain_all = xgb.DMatrix(X, y)
clf = xgb.train(xgb_params, dtrain_all, num_boost_round=400)
# clf = Lasso(alpha=0.01)
# clf.fit(X, y)
Xpred = np.array([order[f] or 0 for f in feats]).reshape(1, -1)
Xpred = np.nan_to_num(Xpred, 0)
Xpred = xgb.DMatrix(Xpred)
return clf.predict(Xpred)[0]
df = train_orders_count.head(10000)
df["pred_products_count"] = df.apply(get_order_count, axis=1)
print(mean_squared_error(
df["product_counts"],
df["pred_products_count"]
))
In [ ]: