In [1]:
# Author : Paul-Antoine Nguyen
# This script considers all the products a user has ordered
#
# We train a model computing the probability of reorder on the "train" data
#
# For the submission, we keep the orders that have a probability of
# reorder higher than a threshold
# Note: the dtype downcasting and explicit deletes below add some overhead,
# needed to stay within Kaggle kernel memory limits
In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import gc
In [4]:
# Load the four Instacart competition CSV files. IDIR must point at the
# extracted "input" directory, relative to the notebook's working directory.
IDIR = 'input/'

priors = pd.read_csv(IDIR + 'order_products__prior.csv')   # contents of prior orders
train = pd.read_csv(IDIR + 'order_products__train.csv')    # contents of train orders (labels)
orders = pd.read_csv(IDIR + 'orders.csv')                  # one row per order, all eval sets
products = pd.read_csv(IDIR + 'products.csv')              # product catalog

# Echo shapes and columns so the load can be sanity-checked.
print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
print('train {}: {}'.format(train.shape, ', '.join(train.columns)))
# Fix: products was loaded but never reported; print it for consistency.
print('products {}: {}'.format(products.shape, ', '.join(products.columns)))
In [ ]:
###
# Memory optimisation for the Kaggle kernel: downcast every column to the
# smallest dtype that safely holds its values.
print('optimize memory')

def _downcast(frame, dtype_map):
    """Convert the given columns in place, one at a time (avoids copying
    the whole frame at once)."""
    for col, dtype in dtype_map.items():
        frame[col] = frame[col].astype(dtype)

_downcast(orders, {
    'order_dow': np.int8,
    'order_hour_of_day': np.int8,
    'order_number': np.int16,
    'order_id': np.int32,
    'user_id': np.int32,
    'days_since_prior_order': np.float32,  # float: NaN on each user's first order
})

products.drop(['product_name'], axis=1, inplace=True)  # unused downstream
_downcast(products, {
    'aisle_id': np.int8,
    'department_id': np.int8,
    'product_id': np.int32,
})

_downcast(train, {
    'reordered': np.int8,
    'add_to_cart_order': np.int16,
})

_downcast(priors, {
    'order_id': np.int32,
    'add_to_cart_order': np.int16,
    'reordered': np.int8,
    'product_id': np.int32,
})
print('computing product f')
# Aggregate the prior order lines into per-product statistics.
by_product = priors.groupby('product_id')
product_stats = pd.DataFrame()
product_stats['orders'] = by_product.size().astype(np.float32)                 # times ordered
product_stats['reorders'] = by_product['reordered'].sum().astype(np.float32)   # times reordered
product_stats['reorder_rate'] = (product_stats.reorders / product_stats.orders).astype(np.float32)

# Attach the stats to the catalog and index by product_id for fast .map() lookups.
products = products.join(product_stats, on='product_id')
products.set_index('product_id', drop=False, inplace=True)
del product_stats
print('add order info to priors')
# Index orders by order_id (kept as a column too) so later .map()/.join()
# lookups by order_id work, then attach each prior line's order metadata.
orders = orders.set_index('order_id', drop=False)
priors = priors.join(orders, on='order_id', rsuffix='_').drop('order_id_', axis=1)
### user features
print('computing user f')

# Per-user order statistics (from the full orders table: prior + train + test rows).
orders_by_user = orders.groupby('user_id')
user_order_stats = pd.DataFrame()
user_order_stats['average_days_between_orders'] = \
    orders_by_user['days_since_prior_order'].mean().astype(np.float32)
user_order_stats['nb_orders'] = orders_by_user.size().astype(np.int16)

# Per-user purchase statistics (from prior order contents only).
priors_by_user = priors.groupby('user_id')
users = pd.DataFrame()
users['total_items'] = priors_by_user.size().astype(np.int16)
users['all_products'] = priors_by_user['product_id'].apply(set)
users['total_distinct_items'] = users.all_products.map(len).astype(np.int16)

users = users.join(user_order_stats)
del user_order_stats
users['average_basket'] = (users.total_items / users.nb_orders).astype(np.float32)
gc.collect()
print('user f', users.shape)
### userXproduct features
print('compute userXproduct f - this is long...')
# Composite (user, product) key packed into one integer: user_id * 100000 + product_id.
# Works because product_id < 100000 in this dataset.
# NOTE(review): priors.user_id is int32 here, so user_id * 100000 may wrap around
# int32 for user_id >= ~21475. The identical wrapped value is recomputed in
# features() below, so lookups still line up, but key collisions are possible —
# TODO confirm, or widen to int64 in both places.
priors['user_product'] = priors.product_id + priors.user_id * 100000
# A groupby().apply() implementation of this aggregation was too slow;
# a plain Python loop over itertuples is used instead.
#def last_order(order_group):
#    ix = order_group.order_number.idxmax
#    return order_group.shape[0], order_group.order_id[ix], order_group.add_to_cart_order.mean()
#userXproduct = pd.DataFrame()
#userXproduct['tmp'] = df.groupby('user_product').apply(last_order)
# Accumulate per (user, product) key:
#   (occurrence count,
#    (order_number, order_id) of the most recent order containing it,
#    sum of add-to-cart positions)
d= dict()
for row in priors.itertuples():
    z = row.user_product
    if z not in d:
        d[z] = (1,
                (row.order_number, row.order_id),
                row.add_to_cart_order)
    else:
        d[z] = (d[z][0] + 1,
                # tuples compare lexicographically: max() keeps the entry
                # with the highest order_number
                max(d[z][1], (row.order_number, row.order_id)),
                d[z][2] + row.add_to_cart_order)
print('to dataframe (less memory)')
d = pd.DataFrame.from_dict(d, orient='index')
d.columns = ['nb_orders', 'last_order_id', 'sum_pos_in_cart']
d.nb_orders = d.nb_orders.astype(np.int16)
# keep only the order_id from the (order_number, order_id) tuple
d.last_order_id = d.last_order_id.map(lambda x: x[1]).astype(np.int32)
d.sum_pos_in_cart = d.sum_pos_in_cart.astype(np.int16)
userXproduct = d
print('user X product f', len(userXproduct))
# priors is no longer needed; free the memory
del priors
### train / test orders ###
print('split orders : train, test')

# Keep only the orders we must score (test) or learn from (train).
is_test = orders.eval_set == 'test'
is_train = orders.eval_set == 'train'
test_orders = orders[is_test]
train_orders = orders[is_train]

# A MultiIndex on (order_id, product_id) enables fast membership tests
# when labels are built in features().
train = train.set_index(['order_id', 'product_id'], drop=False)
### build list of candidate products to reorder, with features ###
def features(selected_orders: pd.DataFrame, labels_given: bool = False):
    """Build the candidate feature frame for a set of orders.

    One row is generated per (order, product-the-user-ever-bought-in-priors)
    pair, with user-, order-, product- and userXproduct-level features.

    Relies on module-level globals: users, orders, products, userXproduct,
    and train (indexed by (order_id, product_id)).

    Parameters:
        selected_orders: subset of `orders` (the train or the test rows).
        labels_given: when True, also build 0/1 labels from membership of
            (order_id, product_id) in the `train` index.

    Returns:
        (df, labels): the feature DataFrame and an int8 numpy array of
        labels (empty array when labels_given is False).
    """
    print('build candidate list')
    order_list = []
    product_list = []
    labels = []
    i=0
    for row in selected_orders.itertuples():
        i+=1
        if i%10000 == 0: print('order row',i)
        order_id = row.order_id
        user_id = row.user_id
        # every product this user has ever bought in priors is a candidate
        user_products = users.all_products[user_id]
        product_list += user_products
        order_list += [order_id] * len(user_products)
        if labels_given:
            # label: did the user actually buy this product in the train order?
            labels += [(order_id, product) in train.index for product in user_products]
    df = pd.DataFrame({'order_id':order_list, 'product_id':product_list})
    df.order_id = df.order_id.astype(np.int32)
    df.product_id = df.product_id.astype(np.int32)
    labels = np.array(labels, dtype=np.int8)
    del order_list
    del product_list
    print('user related features')
    # orders is indexed by order_id and users by user_id, so .map() is a lookup
    df['user_id'] = df.order_id.map(orders.user_id).astype(np.int32)
    df['user_total_orders'] = df.user_id.map(users.nb_orders)
    df['user_total_items'] = df.user_id.map(users.total_items)
    df['total_distinct_items'] = df.user_id.map(users.total_distinct_items)
    df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
    df['user_average_basket'] = df.user_id.map(users.average_basket)
    print('order related features')
    # df['dow'] = df.order_id.map(orders.order_dow)
    df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
    df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
    df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders
    print('product related features')
    df['aisle_id'] = df.product_id.map(products.aisle_id).astype(np.int8)
    df['department_id'] = df.product_id.map(products.department_id).astype(np.int8)
    df['product_orders'] = df.product_id.map(products.orders).astype(np.float32)
    df['product_reorders'] = df.product_id.map(products.reorders).astype(np.float32)
    df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)
    print('user_X_product related features')
    # same packed key as priors['user_product'] (user_id * 100000 + product_id).
    # NOTE(review): with int32 user_id this may wrap around int32; it wraps the
    # same way it did when userXproduct was built so lookups still match, but
    # key collisions are possible — TODO confirm / widen to int64 in both places
    df['z'] = df.user_id * 100000 + df.product_id
    df.drop(['user_id'], axis=1, inplace=True)
    df['UP_orders'] = df.z.map(userXproduct.nb_orders)
    df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
    df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_pos_in_cart) / df.UP_orders).astype(np.float32)
    # FIXME(review): identical formula to UP_orders_ratio above — duplicated
    # feature; left unchanged to preserve the model's exact inputs
    df['UP_reorder_rate'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_id.map(orders.order_number)
    # circular distance in hours (0..12) between this order and the last
    # order that contained the product
    df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - \
                  df.UP_last_order_id.map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x)).astype(np.int8)
    #df['UP_same_dow_as_last_order'] = df.UP_last_order_id.map(orders.order_dow) == \
    #                  df.order_id.map(orders.order_dow)
    df.drop(['UP_last_order_id', 'z'], axis=1, inplace=True)
    print(df.dtypes)
    print(df.memory_usage())
    print(train.memory_usage())
    print(products.memory_usage())
    gc.collect()
    return (df, labels)
# Build the training feature matrix together with its labels.
df_train, labels = features(train_orders, labels_given=True)

# Feature columns fed to the model ('dow' and 'UP_same_dow_as_last_order'
# stay disabled, matching the commented-out features in features()).
f_to_use = [
    'user_total_orders', 'user_total_items', 'total_distinct_items',
    'user_average_days_between_orders', 'user_average_basket',
    'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
    'aisle_id', 'department_id', 'product_orders', 'product_reorders',
    'product_reorder_rate', 'UP_orders', 'UP_orders_ratio',
    'UP_average_pos_in_cart', 'UP_reorder_rate', 'UP_orders_since_last',
    'UP_delta_hour_vs_last',
]

print('formating for lgb')
d_train = lgb.Dataset(
    df_train[f_to_use],
    label=labels,
    categorical_feature=['aisle_id', 'department_id'],  # 'order_hour_of_day', 'dow' disabled
)
del df_train
gc.collect()

# LightGBM hyper-parameters for the binary "will reorder" classifier.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 96,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5,
}
ROUNDS = 98  # number of boosting iterations

print('light GBM train :-)')
bst = lgb.train(params, d_train, ROUNDS)
lgb.plot_importance(bst, figsize=(9, 20))
del d_train
gc.collect()
In [5]:
### build candidates list for test ###
df_test, _ = features(test_orders)

print('light GBM predict')
preds = bst.predict(df_test[f_to_use])
df_test['pred'] = preds

# Probability cut-off above which a product is predicted as reordered.
TRESHOLD = 0.22  # guess, should be tuned with crossval on a subset of train data (sic: "THRESHOLD")

# Collect the predicted product_ids per order as a space-separated string.
d = dict()
for row in df_test.itertuples():
    if row.pred > TRESHOLD:
        # Fix: replaced a bare try/except (which silently swallowed *any*
        # exception, including KeyboardInterrupt/MemoryError) with an
        # explicit membership test.
        if row.order_id in d:
            d[row.order_id] += ' ' + str(row.product_id)
        else:
            d[row.order_id] = str(row.product_id)

# Orders with no product above the threshold must still appear, as 'None'.
for order in test_orders.order_id:
    if order not in d:
        d[order] = 'None'

# Write the submission file in the competition's two-column format.
sub = pd.DataFrame.from_dict(d, orient='index')
sub.reset_index(inplace=True)
sub.columns = ['order_id', 'products']
sub.to_csv('sub.csv', index=False)
In [ ]: