In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import xgboost as xgb

%matplotlib inline
pd.options.display.max_columns = 100

In [2]:
aisles = pd.read_csv('data/aisles.csv', index_col='aisle_id').to_dict()['aisle']
depart = pd.read_csv('data/departments.csv', index_col='department_id').to_dict()['department']
products_df = pd.read_csv('data/products.csv', index_col='product_id')
products_df['aisle'] = products_df.aisle_id.map(lambda x: aisles[x])
products_df['department'] = products_df.department_id.map(lambda x: depart[x])
products_df.drop(['aisle_id', 'department_id'], axis=1, inplace=True)

In [3]:
prior_df = pd.read_csv('data/order_products__prior.csv')
train_df = pd.read_csv('data/order_products__train.csv')
orders_df = pd.read_csv('data/orders.csv')

In [4]:
order_products = pd.merge(orders_df, prior_df, left_on='order_id', right_on='order_id').sort_values(['user_id', 'order_number', 'add_to_cart_order'])
order_products['product_time'] = order_products[['user_id', 'order_number', 'product_id']].groupby(['user_id', 'product_id']).cumcount() + 1

In [5]:
order_products.head(20)


Out[5]:
order_id user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order product_id add_to_cart_order reordered product_time
0 2539329 1 prior 1 2 8 NaN 196 1 0 1
1 2539329 1 prior 1 2 8 NaN 14084 2 0 1
2 2539329 1 prior 1 2 8 NaN 12427 3 0 1
3 2539329 1 prior 1 2 8 NaN 26088 4 0 1
4 2539329 1 prior 1 2 8 NaN 26405 5 0 1
5 2398795 1 prior 2 3 7 15.0 196 1 1 2
6 2398795 1 prior 2 3 7 15.0 10258 2 0 1
7 2398795 1 prior 2 3 7 15.0 12427 3 1 2
8 2398795 1 prior 2 3 7 15.0 13176 4 0 1
9 2398795 1 prior 2 3 7 15.0 26088 5 1 2
10 2398795 1 prior 2 3 7 15.0 13032 6 0 1
11 473747 1 prior 3 3 12 21.0 196 1 1 3
12 473747 1 prior 3 3 12 21.0 12427 2 1 3
13 473747 1 prior 3 3 12 21.0 10258 3 1 2
14 473747 1 prior 3 3 12 21.0 25133 4 0 1
15 473747 1 prior 3 3 12 21.0 30450 5 0 1
16 2254736 1 prior 4 4 7 29.0 196 1 1 4
17 2254736 1 prior 4 4 7 29.0 12427 2 1 4
18 2254736 1 prior 4 4 7 29.0 10258 3 1 3
19 2254736 1 prior 4 4 7 29.0 25133 4 1 2

In [6]:
tmp = order_products[['product_id', 'product_time']]
product_stats = pd.DataFrame(index=sorted(tmp.product_id.unique()))
product_stats['prod_orders'] = tmp.groupby('product_id').count()
product_stats['prod_reorders'] = tmp[tmp.product_time>1].groupby('product_id').count()
product_stats['prod_1st_orders'] = tmp[tmp.product_time==1].groupby('product_id').count()
product_stats['prod_2nd_orders'] = tmp[tmp.product_time==2].groupby('product_id').count()
product_stats = product_stats.fillna(0)

product_stats['prod_reorder_probability'] = product_stats.prod_2nd_orders / product_stats.prod_1st_orders
product_stats['prod_reorder_times'] = 1 + product_stats.prod_reorders / product_stats.prod_1st_orders
product_stats['prod_reorder_ratio'] = product_stats.prod_reorders / product_stats.prod_orders
product_stats = product_stats.drop(['prod_reorders', 'prod_1st_orders', 'prod_2nd_orders', ], axis=1)
product_stats.head()


Out[6]:
prod_orders prod_reorder_probability prod_reorder_times prod_reorder_ratio
1 1852 0.385475 2.586592 0.613391
2 90 0.102564 1.153846 0.133333
3 277 0.486486 3.743243 0.732852
4 329 0.351648 1.807692 0.446809
5 15 0.666667 2.500000 0.600000

In [7]:
tmp = orders_df[orders_df.eval_set == 'prior'].groupby('user_id')
user_stats_1 = pd.DataFrame(index=sorted(orders_df.user_id.unique()))
user_stats_1['user_orders'] = tmp['order_number'].max()
user_stats_1['user_period'] = tmp['days_since_prior_order'].sum()
user_stats_1['user_mean_days_since_prior'] = tmp['days_since_prior_order'].mean()

user_stats_2 = pd.DataFrame(index=sorted(orders_df.user_id.unique()))
user_stats_2['user_total_products'] = order_products.groupby('user_id').size()
user_stats_2['user_reorders'] = order_products[order_products.reordered==1].groupby('user_id').size()
user_stats_2['user_not_first_product_orders'] = order_products[order_products.order_number > 1].groupby('user_id').size()
user_stats_2['user_reorder_ratio'] = 1.0 * user_stats_2.user_reorders / user_stats_2.user_not_first_product_orders
user_stats_2['user_distinct_products'] = order_products.groupby('user_id')['product_id'].nunique()
user_stats_2 = user_stats_2.fillna(0).drop(['user_reorders', 'user_not_first_product_orders'], axis=1)

user_stats = pd.merge(user_stats_1, user_stats_2, left_index=True, right_index=True)
user_stats['user_average_basket'] = user_stats.user_total_products / user_stats.user_orders

user_stats_3 = orders_df[orders_df.eval_set != 'prior'][['user_id', 'order_id', 'eval_set', 'days_since_prior_order']].set_index('user_id')
user_stats = pd.merge(user_stats, user_stats_3, left_index=True, right_index=True)

user_stats.head()


Out[7]:
user_orders user_period user_mean_days_since_prior user_total_products user_reorder_ratio user_distinct_products user_average_basket order_id eval_set days_since_prior_order
1 10 176.0 19.555556 59 0.759259 18 5.900000 1187899 train 14.0
2 14 198.0 15.230769 195 0.510989 102 13.928571 1492625 train 30.0
3 12 133.0 12.090909 88 0.705128 33 7.333333 2774568 test 11.0
4 5 55.0 13.750000 18 0.071429 17 3.600000 329954 test 30.0
5 4 40.0 13.333333 37 0.538462 23 9.250000 2196797 train 6.0

In [8]:
tmp = order_products[['user_id', 'product_id', 'order_number', 'add_to_cart_order']].groupby(['user_id', 'product_id'])
up_stats = tmp.size().to_frame('up_orders')
up_stats['up_first_order'] = tmp.order_number.min()
up_stats['up_last_order'] = tmp.order_number.max()
up_stats['up_average_cart_position'] = tmp.add_to_cart_order.mean()
up_stats = up_stats.reset_index()

up_stats = pd.merge(up_stats, product_stats, left_on='product_id', right_index=True)
up_stats = pd.merge(up_stats, user_stats, left_on='user_id', right_index=True)
up_stats['up_order_rate'] = up_stats.up_orders / up_stats.user_orders
up_stats['up_orders_since_last_order'] = up_stats.user_orders - up_stats.up_last_order
up_stats['up_order_rate_since_first_order'] = up_stats.up_orders / (up_stats.user_orders - up_stats.up_first_order + 1)
up_stats = pd.merge(
    up_stats, train_df[['order_id', 'product_id', 'reordered']],
    how='left', left_on=('order_id', 'product_id'), right_on=('order_id', 'product_id'))
up_stats.head()


Out[8]:
user_id product_id up_orders up_first_order up_last_order up_average_cart_position prod_orders prod_reorder_probability prod_reorder_times prod_reorder_ratio user_orders user_period user_mean_days_since_prior user_total_products user_reorder_ratio user_distinct_products user_average_basket order_id eval_set days_since_prior_order up_order_rate up_orders_since_last_order up_order_rate_since_first_order reordered
0 1 196 10 1 10 1.400000 35791 0.582500 4.473875 0.776480 10 176.0 19.555556 59 0.759259 18 5.9 1187899 train 14.0 1.0 0 1.000000 1.0
1 1 10258 9 2 10 3.333333 1946 0.552962 3.493716 0.713772 10 176.0 19.555556 59 0.759259 18 5.9 1187899 train 14.0 0.9 0 1.000000 1.0
2 1 10326 1 5 5 5.000000 5526 0.521581 2.873635 0.652009 10 176.0 19.555556 59 0.759259 18 5.9 1187899 train 14.0 0.1 5 0.166667 NaN
3 1 12427 10 1 10 3.300000 6476 0.529482 3.857058 0.740735 10 176.0 19.555556 59 0.759259 18 5.9 1187899 train 14.0 1.0 0 1.000000 NaN
4 1 13032 3 2 10 6.333333 3751 0.479782 2.916796 0.657158 10 176.0 19.555556 59 0.759259 18 5.9 1187899 train 14.0 0.3 0 0.333333 1.0

In [10]:
train = up_stats[up_stats.eval_set=='train']
train.loc[:, 'reordered'] = train.reordered.fillna(0)
label = train.reordered
train = train.drop(['eval_set', 'user_id', 'product_id', 'order_id', 'reordered'], axis=1)

test = up_stats[up_stats.eval_set=='test']
test = test.drop(['eval_set', 'user_id', 'reordered'], axis=1)

xm_train = xgb.DMatrix(train, label=label)
xm_test = xgb.DMatrix(test.drop(['product_id', 'order_id'], axis=1))

In [13]:
params = {
    'eval_metric': 'logloss',
}
model = xgb.train(params=params, dtrain=xm_train, evals=((xm_train, 'train'),), verbose_eval=1)
test['reordered'] = model.predict(xm_test)

def gather(df):
    l = df[df.reordered > 0.21].product_id.tolist()
    if not l:
        l = [None]
    return ' '.join(str(e) for e in l)
answer = test.groupby('order_id').apply(gather).to_frame('products').to_csv('xgb_subm2.csv')


[0]	train-logloss:0.507798
[1]	train-logloss:0.404791
[2]	train-logloss:0.343775
[3]	train-logloss:0.306623
[4]	train-logloss:0.283518
[5]	train-logloss:0.269131
[6]	train-logloss:0.260211
[7]	train-logloss:0.254762
[8]	train-logloss:0.251345
[9]	train-logloss:0.249242

In [16]:
test[['order_id', 'product_id', 'reordered']].sort_values(['order_id', 'reordered'], ascending=False).to_csv('insta_data.csv')


Out[16]:
order_id product_id reordered
11718922 3421054 31231 0.740327
11718921 3421054 11123 0.498876
11718942 3421054 18426 0.394089
11718946 3421054 13375 0.390213
11718945 3421054 38448 0.262539
11718914 3421054 24852 0.241875
11718957 3421054 15802 0.235128
11718934 3421054 13493 0.232947
11718916 3421054 5818 0.167550
11718968 3421054 15732 0.148582
11718927 3421054 38340 0.122383
11718947 3421054 27429 0.119547
11718933 3421054 31102 0.115065
11718920 3421054 8580 0.108862
11718951 3421054 10388 0.096932
11718955 3421054 27677 0.096932
11718956 3421054 39647 0.094751
11718959 3421054 27370 0.092619
11718969 3421054 23286 0.090438
11718941 3421054 29211 0.089139
11718917 3421054 28842 0.085296
11718929 3421054 48790 0.084103
11718928 3421054 43183 0.078368
11718931 3421054 19653 0.076366
11718938 3421054 9837 0.076366
11718932 3421054 26497 0.074042
11718925 3421054 7644 0.073169
11718939 3421054 48808 0.072967
11718923 3421054 44051 0.065488
11718926 3421054 14536 0.062018
... ... ... ...
1499998 34 37119 0.032042
1499999 34 36205 0.032042
1500025 34 6317 0.032042
1500027 34 49621 0.032042
1500005 34 36994 0.031458
1500008 34 22819 0.031458
1500048 34 39550 0.031458
1500039 34 40417 0.027201
1500036 34 4031 0.020895
10959131 17 13107 0.411131
10959124 17 21463 0.240081
10959117 17 39275 0.185666
10959118 17 38777 0.181699
10959116 17 21903 0.131100
10959112 17 21709 0.129925
10959130 17 26429 0.123886
10959114 17 47766 0.119121
10959119 17 18288 0.085184
10959125 17 15613 0.085184
10959126 17 6291 0.085184
10959113 17 39928 0.082768
10959123 17 31964 0.082768
10959115 17 16965 0.076623
10959121 17 40002 0.070936
10959127 17 44056 0.066457
10959120 17 13535 0.053227
10959122 17 7035 0.041879
10959132 17 48896 0.041536
10959128 17 11494 0.033729
10959129 17 1283 0.027142

4833292 rows × 3 columns


In [ ]: