In [2]:
#importing packages

import numpy as np
import pandas as pd
import lightgbm as lgb
import gc

In [6]:
#reading Data
IDIR = '../input/'
print('loading prior')
priors = pd.read_csv(IDIR + 'order_products__prior.csv')
print('loading train')
train = pd.read_csv(IDIR + 'order_products__train.csv')
print('loading orders')
orders = pd.read_csv(IDIR + 'orders.csv')
print('loading products')
products = pd.read_csv(IDIR + 'products.csv')

print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
print('train {}: {}'.format(train.shape, ', '.join(train.columns)))


loading prior
loading train
loading orders
loading products
priors (32434489, 4): order_id, product_id, add_to_cart_order, reordered
orders (3421083, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
train (1384617, 4): order_id, product_id, add_to_cart_order, reordered

In [7]:
# Data type conversion
print('Data type conversion')
orders.order_dow = orders.order_dow.astype(np.int8)
orders.order_hour_of_day = orders.order_hour_of_day.astype(np.int8)
orders.order_number = orders.order_number.astype(np.int16)
orders.order_id = orders.order_id.astype(np.int32)
orders.user_id = orders.user_id.astype(np.int32)
orders.days_since_prior_order = orders.days_since_prior_order.astype(np.float32)

products.drop(['product_name'], axis=1, inplace=True)
products.aisle_id = products.aisle_id.astype(np.int8)
products.department_id = products.department_id.astype(np.int8)
products.product_id = products.product_id.astype(np.int32)

train.reordered = train.reordered.astype(np.int8)
train.add_to_cart_order = train.add_to_cart_order.astype(np.int16)

priors.order_id = priors.order_id.astype(np.int32)
priors.add_to_cart_order = priors.add_to_cart_order.astype(np.int16)
priors.reordered = priors.reordered.astype(np.int8)
priors.product_id = priors.product_id.astype(np.int32)


Data type conversion

In [ ]:


In [8]:
#calculating labels 

print('computing product reordering frequency or probability')
prods = pd.DataFrame()
prods['orders'] = priors.groupby(priors.product_id).size().astype(np.float32)
prods['reorders'] = priors['reordered'].groupby(priors.product_id).sum().astype(np.float32)
prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)
products = products.join(prods, on='product_id')
products.set_index('product_id', drop=False, inplace=True)
del prods


print('concatinate order info and priors info')
orders.set_index('order_id', inplace=True, drop=False)
priors = priors.join(orders, on='order_id', rsuffix='_')
priors.drop('order_id_', inplace=True, axis=1)

#features from user ordering frequency

print('features from user')
usr = pd.DataFrame()
usr['average_days_between_orders'] = orders.groupby('user_id')['days_since_prior_order'].mean().astype(np.float32)
usr['nb_orders'] = orders.groupby('user_id').size().astype(np.int16)

users = pd.DataFrame()
users['total_items'] = priors.groupby('user_id').size().astype(np.int16)
users['all_products'] = priors.groupby('user_id')['product_id'].apply(set)
users['total_distinct_items'] = (users.all_products.map(len)).astype(np.int16)

users = users.join(usr)
del usr
users['average_basket'] = (users.total_items / users.nb_orders).astype(np.float32)
gc.collect()
print('user features', users.shape)


computing product reordering frequency or probability
concatinate order info and priors info
features from user
user f (206209, 6)

In [10]:
### userXproduct features

print('userXproduct features')
priors['user_product'] = priors.product_id + priors.user_id * 100000

d= dict()
for row in priors.itertuples():
    z = row.user_product
    if z not in d:
        d[z] = (1,
                (row.order_number, row.order_id),
                row.add_to_cart_order)
    else:
        d[z] = (d[z][0] + 1,
                max(d[z][1], (row.order_number, row.order_id)),
                d[z][2] + row.add_to_cart_order)

print('to dataframe')
d = pd.DataFrame.from_dict(d, orient='index')
d.columns = ['nb_orders', 'last_order_id', 'sum_pos_in_cart']
d.nb_orders = d.nb_orders.astype(np.int16)
d.last_order_id = d.last_order_id.map(lambda x: x[1]).astype(np.int32)
d.sum_pos_in_cart = d.sum_pos_in_cart.astype(np.int16)
   
userXproduct = d
print('user X product f', len(userXproduct))

del priors


userXproduct features
to dataframe (less memory)
user X product f 13307953

In [11]:
### train / test orders ###
print('split orders : train, test')
test_orders = orders[orders.eval_set == 'test']
train_orders = orders[orders.eval_set == 'train']

train.set_index(['order_id', 'product_id'], inplace=True, drop=False)


split orders : train, test

In [12]:
### function for binding user info, reorder info, and other features ###

def features(selected_orders, labels_given=False):
    print('build candidate list')
    order_list = []
    product_list = []
    labels = []
    i=0
    for row in selected_orders.itertuples():
        i+=1
        if i%10000 == 0: print('order row',i)
        order_id = row.order_id
        user_id = row.user_id
        user_products = users.all_products[user_id]
        product_list += user_products
        order_list += [order_id] * len(user_products)
        if labels_given:
            labels += [(order_id, product) in train.index for product in user_products]
        
    df = pd.DataFrame({'order_id':order_list, 'product_id':product_list})
    df.order_id = df.order_id.astype(np.int32)
    df.product_id = df.product_id.astype(np.int32)
    labels = np.array(labels, dtype=np.int8)
    del order_list
    del product_list
    
    print('user related features')
    df['user_id'] = df.order_id.map(orders.user_id).astype(np.int32)
    df['user_total_orders'] = df.user_id.map(users.nb_orders)
    df['user_total_items'] = df.user_id.map(users.total_items)
    df['total_distinct_items'] = df.user_id.map(users.total_distinct_items)
    df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
    df['user_average_basket'] =  df.user_id.map(users.average_basket)
    
    print('order related features')
    # df['dow'] = df.order_id.map(orders.order_dow)
    df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
    df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
    df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders
    
    print('product related features')
    df['aisle_id'] = df.product_id.map(products.aisle_id).astype(np.int8)
    df['department_id'] = df.product_id.map(products.department_id).astype(np.int8)
    df['product_orders'] = df.product_id.map(products.orders).astype(np.float32)
    df['product_reorders'] = df.product_id.map(products.reorders).astype(np.float32)
    df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)

    print('user_X_product related features')
    df['z'] = df.user_id * 100000 + df.product_id
    df.drop(['user_id'], axis=1, inplace=True)
    df['UP_orders'] = df.z.map(userXproduct.nb_orders)
    df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
    df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_pos_in_cart) / df.UP_orders).astype(np.float32)
    df['UP_reorder_rate'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_id.map(orders.order_number)
    df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - \
                  df.UP_last_order_id.map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x)).astype(np.int8)
    #df['UP_same_dow_as_last_order'] = df.UP_last_order_id.map(orders.order_dow) == \
    #                                              df.order_id.map(orders.order_dow)

    df.drop(['UP_last_order_id', 'z'], axis=1, inplace=True)
    # print(df.dtypes)
    # print(df.memory_usage())
    # print(train.memory_usage())
    # print(products.memory_usage())
    gc.collect()
    return (df, labels)

In [13]:
df_train, labels = features(train_orders, labels_given=True)

#selected features

f_to_use = ['user_total_orders', 'user_total_items', 'total_distinct_items',
       'user_average_days_between_orders', 'user_average_basket',
       'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
       'aisle_id', 'department_id', 'product_orders', 'product_reorders',
       'product_reorder_rate', 'UP_orders', 'UP_orders_ratio',
       'UP_average_pos_in_cart', 'UP_reorder_rate', 'UP_orders_since_last',
       'UP_delta_hour_vs_last'] # 'dow', 'UP_same_dow_as_last_order'


print('formating for lgb')
d_train = lgb.Dataset(df_train[f_to_use],
                      label=labels,
                      categorical_feature=['aisle_id', 'department_id'])  # , 'order_hour_of_day', 'dow'
del df_train
gc.collect()

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 96,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5
}
ROUNDS = 98

print('light GBM train :-)')
bst = lgb.train(params, d_train, ROUNDS)
lgb.plot_importance(bst, figsize=(9,20))
del d_train
gc.collect()


build candidate list
order row 10000
order row 20000
order row 30000
order row 40000
order row 50000
order row 60000
order row 70000
order row 80000
order row 90000
order row 100000
order row 110000
order row 120000
order row 130000
user related features
order related features
product related features
user_X_product related features
order_id                              int32
product_id                            int32
user_total_orders                     int16
user_total_items                      int16
total_distinct_items                  int16
user_average_days_between_orders    float32
user_average_basket                 float32
order_hour_of_day                      int8
days_since_prior_order              float32
days_since_ratio                    float32
aisle_id                               int8
department_id                          int8
product_orders                      float32
product_reorders                    float32
product_reorder_rate                float32
UP_orders                             int16
UP_orders_ratio                     float32
UP_average_pos_in_cart              float32
UP_reorder_rate                     float32
UP_orders_since_last                  int16
UP_delta_hour_vs_last                  int8
dtype: object
Index                                     80
order_id                            33898644
product_id                          33898644
user_total_orders                   16949322
user_total_items                    16949322
total_distinct_items                16949322
user_average_days_between_orders    33898644
user_average_basket                 33898644
order_hour_of_day                    8474661
days_since_prior_order              33898644
days_since_ratio                    33898644
aisle_id                             8474661
department_id                        8474661
product_orders                      33898644
product_reorders                    33898644
product_reorder_rate                33898644
UP_orders                           16949322
UP_orders_ratio                     33898644
UP_average_pos_in_cart              33898644
UP_reorder_rate                     33898644
UP_orders_since_last                16949322
UP_delta_hour_vs_last                8474661
dtype: int64
Index                12439708
order_id             11076936
product_id           11076936
add_to_cart_order     2769234
reordered             1384617
dtype: int64
Index            397504
product_id       198752
aisle_id          49688
department_id     49688
orders           198752
reorders         198752
reorder_rate     198752
dtype: int64
formating for lgb
light GBM train :-)
Out[13]:
815

In [14]:
### build candidates list for test ###

df_test, _ = features(test_orders)

print('light GBM predict')
preds = bst.predict(df_test[f_to_use])

df_test['pred'] = preds

TRESHOLD = 0.25  # for reorder probability 

d = dict()
for row in df_test.itertuples():
    if row.pred > TRESHOLD:
        try:
            d[row.order_id] += ' ' + str(row.product_id)
        except:
            d[row.order_id] = str(row.product_id)

for order in test_orders.order_id:
    if order not in d:
        d[order] = 'None'

sub = pd.DataFrame.from_dict(d, orient='index')

sub.reset_index(inplace=True)
sub.columns = ['order_id', 'products']
sub.to_csv('sub.csv', index=False)


build candidate list
order row 10000
order row 20000
order row 30000
order row 40000
order row 50000
order row 60000
order row 70000
user related features
order related features
product related features
user_X_product related features
order_id                              int32
product_id                            int32
user_total_orders                     int16
user_total_items                      int16
total_distinct_items                  int16
user_average_days_between_orders    float32
user_average_basket                 float32
order_hour_of_day                      int8
days_since_prior_order              float32
days_since_ratio                    float32
aisle_id                               int8
department_id                          int8
product_orders                      float32
product_reorders                    float32
product_reorder_rate                float32
UP_orders                             int16
UP_orders_ratio                     float32
UP_average_pos_in_cart              float32
UP_reorder_rate                     float32
UP_orders_since_last                  int16
UP_delta_hour_vs_last                  int8
dtype: object
Index                                     80
order_id                            19333168
product_id                          19333168
user_total_orders                    9666584
user_total_items                     9666584
total_distinct_items                 9666584
user_average_days_between_orders    19333168
user_average_basket                 19333168
order_hour_of_day                    4833292
days_since_prior_order              19333168
days_since_ratio                    19333168
aisle_id                             4833292
department_id                        4833292
product_orders                      19333168
product_reorders                    19333168
product_reorder_rate                19333168
UP_orders                            9666584
UP_orders_ratio                     19333168
UP_average_pos_in_cart              19333168
UP_reorder_rate                     19333168
UP_orders_since_last                 9666584
UP_delta_hour_vs_last                4833292
dtype: int64
Index                12439708
order_id             11076936
product_id           11076936
add_to_cart_order     2769234
reordered             1384617
dtype: int64
Index            397504
product_id       198752
aisle_id          49688
department_id     49688
orders           198752
reorders         198752
reorder_rate     198752
dtype: int64
light GBM predict

In [ ]: