In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
%matplotlib inline
pd.options.display.max_columns = 100
In [2]:
# Lookup dicts: aisle_id -> 'aisle' value and department_id -> 'department' value.
aisles = pd.read_csv('data/aisles.csv', index_col='aisle_id')['aisle'].to_dict()
depart = pd.read_csv('data/departments.csv', index_col='department_id')['department'].to_dict()

# Product table indexed by product_id. The numeric aisle/department ids are swapped
# for the looked-up values; an id missing from a lookup raises KeyError.
products_df = pd.read_csv('data/products.csv', index_col='product_id')
products_df['aisle'] = products_df.aisle_id.map(lambda aisle_id: aisles[aisle_id])
products_df['department'] = products_df.department_id.map(lambda dept_id: depart[dept_id])
products_df = products_df.drop(['aisle_id', 'department_id'], axis=1)
In [3]:
# Raw inputs: the orders table (one row per order, tagged with eval_set) and the
# two order/product tables that are later joined to it on order_id.
orders_df = pd.read_csv('data/orders.csv')
prior_df = pd.read_csv('data/order_products__prior.csv')
train_df = pd.read_csv('data/order_products__train.csv')
In [4]:
# Attach order metadata to every row of prior_df, then sort each user's purchases
# by order number and by position within the cart.
order_products = (
    orders_df
    .merge(prior_df, on='order_id')
    .sort_values(['user_id', 'order_number', 'add_to_cart_order'])
)
# product_time is 1 for a user's first purchase of a product, 2 for the second, ...
# (relies on the sort above so that cumcount follows purchase order).
order_products['product_time'] = order_products.groupby(['user_id', 'product_id']).cumcount() + 1
In [5]:
# Preview the first 20 joined rows.
order_products.iloc[:20]
Out[5]:
In [6]:
# Per-product purchase counts, indexed by every product_id seen in order_products.
purchases = order_products[['product_id', 'product_time']]
nth_purchase = purchases.product_time
product_stats = pd.DataFrame(index=sorted(purchases.product_id.unique()))
product_stats['prod_orders'] = purchases.groupby('product_id').size()
product_stats['prod_reorders'] = purchases[nth_purchase > 1].groupby('product_id').size()
product_stats['prod_1st_orders'] = purchases[nth_purchase == 1].groupby('product_id').size()
product_stats['prod_2nd_orders'] = purchases[nth_purchase == 2].groupby('product_id').size()
# A product with no rows in a filtered subset gets NaN above; treat that as a count of 0.
product_stats = product_stats.fillna(0)

first_time_buyers = product_stats.prod_1st_orders
# Share of first-time buyers who bought the product a second time.
product_stats['prod_reorder_probability'] = product_stats.prod_2nd_orders / first_time_buyers
# Average number of purchases per buyer (the first purchase plus the reorders).
product_stats['prod_reorder_times'] = 1 + product_stats.prod_reorders / first_time_buyers
# Share of all purchases of the product that were repeat purchases.
product_stats['prod_reorder_ratio'] = product_stats.prod_reorders / product_stats.prod_orders

# Only the total and the three ratios are kept as features.
product_stats = product_stats.drop(['prod_reorders', 'prod_1st_orders', 'prod_2nd_orders'], axis=1)
product_stats.head()
Out[6]:
In [7]:
all_users = sorted(orders_df.user_id.unique())

# Order-level aggregates over each user's 'prior' orders.
prior_orders_by_user = orders_df[orders_df.eval_set == 'prior'].groupby('user_id')
user_stats_1 = pd.DataFrame(index=all_users)
user_stats_1['user_orders'] = prior_orders_by_user['order_number'].max()
user_stats_1['user_period'] = prior_orders_by_user['days_since_prior_order'].sum()
user_stats_1['user_mean_days_since_prior'] = prior_orders_by_user['days_since_prior_order'].mean()

# Row-level aggregates over the joined order/product rows.
rows_by_user = order_products.groupby('user_id')
reordered_rows = order_products[order_products.reordered == 1].groupby('user_id').size()
# Rows from a user's first order are excluded from the denominator of the ratio.
rows_after_first_order = order_products[order_products.order_number > 1].groupby('user_id').size()
user_stats_2 = pd.DataFrame(index=all_users)
user_stats_2['user_total_products'] = rows_by_user.size()
user_stats_2['user_reorder_ratio'] = reordered_rows / rows_after_first_order
user_stats_2['user_distinct_products'] = rows_by_user['product_id'].nunique()
# Users missing from either count end up with NaN in the ratio; report that as 0.
user_stats_2 = user_stats_2.fillna(0)

user_stats = user_stats_1.merge(user_stats_2, left_index=True, right_index=True)
user_stats['user_average_basket'] = user_stats.user_total_products / user_stats.user_orders

# Attach each user's non-'prior' order (order_id, eval_set, days since the previous order).
user_stats_3 = orders_df.loc[
    orders_df.eval_set != 'prior',
    ['user_id', 'order_id', 'eval_set', 'days_since_prior_order'],
].set_index('user_id')
user_stats = user_stats.merge(user_stats_3, left_index=True, right_index=True)
user_stats.head()
Out[7]:
In [8]:
# Purchase history of every (user, product) pair found in order_products.
pair_groups = order_products[
    ['user_id', 'product_id', 'order_number', 'add_to_cart_order']
].groupby(['user_id', 'product_id'])
up_stats = pair_groups.size().to_frame('up_orders')
up_stats['up_first_order'] = pair_groups.order_number.min()
up_stats['up_last_order'] = pair_groups.order_number.max()
up_stats['up_average_cart_position'] = pair_groups.add_to_cart_order.mean()
up_stats = up_stats.reset_index()

# Bring in the product-level and user-level features; both tables are indexed by their id.
up_stats = pd.merge(up_stats, product_stats, left_on='product_id', right_index=True)
up_stats = pd.merge(up_stats, user_stats, left_on='user_id', right_index=True)

# Fraction of the user's orders that contained the product.
up_stats['up_order_rate'] = up_stats.up_orders / up_stats.user_orders
# Number of the user's orders placed after the last one containing the product.
up_stats['up_orders_since_last_order'] = up_stats.user_orders - up_stats.up_last_order
# Same rate as above, but counting only orders from the first purchase onwards.
up_stats['up_order_rate_since_first_order'] = (
    up_stats.up_orders / (up_stats.user_orders - up_stats.up_first_order + 1)
)

# Attach the target. The keys are passed as a list: pandas documents merge keys as
# "label or list", and a tuple is itself a valid single column label.
# Pairs with no matching row in train_df keep NaN in `reordered` (left join).
up_stats = pd.merge(
    up_stats,
    train_df[['order_id', 'product_id', 'reordered']],
    how='left',
    on=['order_id', 'product_id'],
)
up_stats.head()
Out[8]:
In [10]:
# Training rows: (user, product) pairs whose pending order belongs to the 'train' set.
train = up_stats[up_stats.eval_set == 'train']
# NaN in `reordered` means the left join found no matching train_df row, i.e. label 0.
# The label is computed directly instead of being written back into `train`, which is
# a boolean-mask slice of up_stats (writing to it triggers SettingWithCopyWarning).
label = train.reordered.fillna(0)
# Identifiers and the target itself must not be fed to the model as features.
train = train.drop(['eval_set', 'user_id', 'product_id', 'order_id', 'reordered'], axis=1)

# Test rows keep order_id / product_id because they are needed to build the output files.
test = up_stats[up_stats.eval_set == 'test']
test = test.drop(['eval_set', 'user_id', 'reordered'], axis=1)

xm_train = xgb.DMatrix(train, label=label)
xm_test = xgb.DMatrix(test.drop(['product_id', 'order_id'], axis=1))
In [13]:
params = {
    # The label is 0/1 and the prediction is thresholded as a probability, so train a
    # logistic classifier. Without an explicit objective XGBoost falls back to its
    # default regression objective, whose outputs are not confined to [0, 1] and make
    # the logloss metric below meaningless.
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
}
# Training loss is printed every round (verbose_eval=1); no hold-out set is evaluated.
model = xgb.train(params=params, dtrain=xm_train, evals=[(xm_train, 'train')], verbose_eval=1)
# Store the predicted reorder probability for every test (order, product) row.
test['reordered'] = model.predict(xm_test)
def gather(df, threshold=0.21):
    """Build the space-separated product list for one order.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single order; must have `product_id` and `reordered`
        (predicted score) columns.
    threshold : float, default 0.21
        A product is selected when its score is strictly greater than this value.

    Returns
    -------
    str
        The selected product ids joined by single spaces, in row order, or the
        literal string 'None' when no product passes the threshold.
    """
    selected = df.loc[df.reordered > threshold, 'product_id'].tolist()
    if not selected:
        return 'None'
    return ' '.join(str(product_id) for product_id in selected)
# One row per test order with its space-separated predicted products.
# Only the two columns gather() reads are passed to apply, so the grouping key is not
# part of the sub-frames. `answer` holds the table itself: to_csv() returns None when
# given a path, so assigning its result would leave `answer` empty.
answer = test.groupby('order_id')[['product_id', 'reordered']].apply(gather).to_frame('products')
answer.to_csv('xgb_subm2.csv')
In [16]:
# Dump the raw scores, ordered by order_id and score (both descending).
ranked_predictions = test[['order_id', 'product_id', 'reordered']].sort_values(
    by=['order_id', 'reordered'], ascending=[False, False]
)
ranked_predictions.to_csv('insta_data.csv')
Out[16]:
In [ ]: