In [3]:
import pandas as pd
import lightgbm as lgb
import re
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set_style("whitegrid")
import calendar
In [7]:
# First, let's import requisite files
orders = pd.read_csv('../Instacart_Input/orders.csv')
prior_set = pd.read_csv('../Instacart_Input/order_products__prior.csv')
train_set = pd.read_csv('../Instacart_Input/order_products__train.csv')
aisles = pd.read_csv('../Instacart_Input/aisles.csv')
departments = pd.read_csv('../Instacart_Input/departments.csv')
products = pd.read_csv('../Instacart_Input/products.csv')
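The raw CSVs are large with pandas' default int64/float64 dtypes, so on a memory-constrained machine it can help to re-read the two biggest files with downcast dtypes. This is only a sketch; the column names are the standard Instacart ones used in the cells below.
In [ ]:
# Optional: downcast dtypes at read time to reduce memory
orders = pd.read_csv('../Instacart_Input/orders.csv',
                     dtype={'order_id': np.int32, 'user_id': np.int32, 'order_number': np.int16,
                            'order_dow': np.int8, 'order_hour_of_day': np.int8,
                            'days_since_prior_order': np.float32})
prior_set = pd.read_csv('../Instacart_Input/order_products__prior.csv',
                        dtype={'order_id': np.int32, 'product_id': np.int32,
                               'add_to_cart_order': np.int16, 'reordered': np.int8})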
In [5]:
# Index orders by order_id (keeping the column) and attach the order metadata to every prior order line
orders.set_index('order_id', inplace=True, drop=False)
prior_set = prior_set.join(orders, on='order_id', rsuffix='_')
prior_set.drop('order_id_', inplace=True, axis=1)
# Per-user order-level features
temp = pd.DataFrame()
temp['average_days_between_orders'] = orders.groupby('user_id')['days_since_prior_order'].mean().astype(np.float32)
temp['orders'] = orders[orders['eval_set'] == 'prior'].groupby('user_id').size().astype(np.int16)
# Per-user product-level features from the prior orders
user_data = pd.DataFrame()
user_data['total_items'] = prior_set.groupby('user_id').size().astype(np.int16)
user_data['all_products'] = prior_set.groupby('user_id')['product_id'].apply(set)
user_data['total_unique_items'] = (user_data.all_products.map(len)).astype(np.int16)
user_data = user_data.join(temp)
user_data['avg_basket_size'] = (user_data.total_items / user_data.orders).astype(np.float32)
user_data.reset_index(inplace=True)
user_data.head(20)
Out[5]:
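A quick sanity check on the user-level features (user 1 is just an example, picked because it is also used in the scratch cells at the end):
In [ ]:
# Recompute one user's counts directly from the prior orders and compare against the user_data row
u1_prior = prior_set[prior_set['user_id'] == 1]
print(u1_prior.shape[0],                      # should match total_items for user 1
      u1_prior['product_id'].nunique(),       # should match total_unique_items
      orders[(orders['user_id'] == 1) & (orders['eval_set'] == 'prior')].shape[0])  # should match orders
user_data[user_data['user_id'] == 1]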
In [ ]:
In [8]:
# Restrict to users whose final order is in the train split, then attach their prior order
# lines, the user-level features, and the product metadata
train = orders[orders['eval_set'] == 'train']
train_user_orders = orders[orders['user_id'].isin(train['user_id'].values)]
# Take only the original order_products columns from prior_set here; the order metadata
# (user_id, order_number, etc.) already comes in from the orders side of the merge
train_user_orders = train_user_orders.merge(
    prior_set[['order_id', 'product_id', 'add_to_cart_order', 'reordered']], on='order_id')
train_user_orders = train_user_orders.merge(user_data, on='user_id')
train_user_orders = train_user_orders.merge(products, on='product_id')
# How many times each user has ordered each product
temp = pd.DataFrame(train_user_orders.groupby(['user_id', 'product_id']).size()).reset_index()
temp.columns = ['user_id', 'product_id', 'usr_order_instances']
# Collapse to one row per (user, product); numeric columns are averaged and
# non-numeric columns (product_name, eval_set, all_products) do not survive the mean
train_df = train_user_orders.groupby(['user_id', 'product_id']).mean().reset_index()
train_df = train_df.merge(temp, on=['user_id', 'product_id'])
train_df = train_df.drop(['order_id', 'order_number', 'reordered'], axis=1)
train_df['order_dow'] = train_df['order_dow'].astype(np.float32)
train_df['order_hour_of_day'] = train_df['order_hour_of_day'].astype(np.float32)
train_df['days_since_prior_order'] = train_df['days_since_prior_order'].astype(np.float32)
train_df['add_to_cart_order'] = train_df['add_to_cart_order'].astype(np.float32)
train_df['avg_basket_size'] = train_df['avg_basket_size'].astype(np.float32)
train_df['aisle_id'] = train_df['aisle_id'].astype(np.int16)
train_df['department_id'] = train_df['department_id'].astype(np.int16)
train_df.head()
Out[8]:
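train_set (order_products__train.csv) was loaded above but not used yet. For a supervised model, each (user, product) row in train_df needs a 0/1 target saying whether the product shows up in that user's train order; the sketch below is one way to attach it ('label' is just a placeholder column name, not something from the original files).
In [ ]:
# Sketch: flag each (user_id, product_id) pair with whether it was ordered in the user's train order
train_labels = train_set.merge(train[['order_id', 'user_id']], on='order_id')[['user_id', 'product_id']]
train_labels['label'] = 1
train_df = train_df.merge(train_labels, on=['user_id', 'product_id'], how='left')
train_df['label'] = train_df['label'].fillna(0).astype(np.int8)
train_df['label'].mean()  # overall reorder rate among previously purchased products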
In [ ]:
# I've previously created 20 test submissions without machine learning algorithms,
# and I benefited from starting with the most recent orders to get an F1 score of 0.365+ (top 50%).
# So I'm including this feature:
# reorder rate (% of an order made up of reordered products) over each user's recent orders
# Mean 'reordered' per (user, order) and each user's most recent prior order number
order_reup = train_user_orders.groupby(['user_id', 'order_number']).mean()
last_order = train_user_orders.groupby(['user_id'])['order_number'].max()
# Average the reorder rate over up to the five most recent prior orders of each user
# (order 1 is excluded, since a first order cannot contain reorders)
d = {}
for user, order in order_reup['reordered'].index.values:
    if user not in d:
        count = 0
        d[user] = 0
    if (order > 1) and (order >= last_order[user] - 4):
        d[user] += order_reup['reordered'][(user, order)]
        count += 1
    if order == last_order[user]:
        d[user] /= count
d
# Add to train_df [Warning: LONG PROCESSING TIME...]
#train_df['recent_reorder_rate'] = 0
#for i in d.keys():
# train_df.loc[train_df.user_id == i, 'recent_reorder_rate'] = d[i]
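A faster way to attach the same feature (a sketch): d maps user_id to the recent reorder rate, so a vectorized Series.map avoids the slow row-by-row .loc assignment above.
In [ ]:
# Vectorized alternative to the commented-out loop above
train_df['recent_reorder_rate'] = train_df['user_id'].map(d).astype(np.float32)
train_df['recent_reorder_rate'].describe()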
In [ ]:
2-fold cross-validation? i.e. splitting the training set into two groups roughly the size of the actual test set, and running 5-fold CV on each?
How do I know when I'm overfitting? I only have 5 submissions per day, and would like to be able to estimate it without submitting.
Ensemble methods - LightGBM and XGBoost? Using predictions from the first model as input? Ranking predictions, max/min/std of predictions?
Any resources for parameter tuning for LightGBM and XGBoost?
How can I stratify the training data into subsets that reflect the general population?
Should I train using a separate validation set, or is cross-validation as above sufficient? (See the CV sketch below.)
libFM and factorization machines?
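One way to estimate overfitting without burning submissions: LightGBM's built-in k-fold CV with early stopping, watching where the validation loss stops improving. A sketch, assuming train_df carries the 0/1 'label' column from the earlier sketch; early_stopping_rounds is the LightGBM 2.x/3.x API (newer versions use callbacks=[lgb.early_stopping(50)] instead), and the parameter values are placeholders, not tuned.
In [ ]:
# 5-fold CV on the feature table; aisle/department ids are treated as categoricals
feature_cols = [c for c in train_df.columns if c not in ['user_id', 'product_id', 'label']]
dtrain = lgb.Dataset(train_df[feature_cols], label=train_df['label'],
                     categorical_feature=['aisle_id', 'department_id'])
params = {'objective': 'binary', 'metric': 'binary_logloss',
          'learning_rate': 0.05, 'num_leaves': 64,
          'feature_fraction': 0.8, 'bagging_fraction': 0.8, 'bagging_freq': 1}
cv_results = lgb.cv(params, dtrain, num_boost_round=1000, nfold=5,
                    early_stopping_rounds=50, seed=42)
pd.DataFrame(cv_results).tail()  # last rows show the best validation logloss reached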
In [ ]:
#user1 = orders[orders['user_id'] == 1]['order_id'].values
#prior_set[prior_set['order_id'].isin(user1)]
In [ ]:
#users['total_items'] = train_user_orders.groupby(['user_id', 'product_id']).size() #[train_user_orders['user_id'] == 1]
#users = pd.DataFrame()
#users['total_items'] = train_user_orders.groupby('product_id').size().astype(np.int16)
#users['product_set'] = train_user_orders.groupby('user_id')['product_id'].apply(set)
#user_array = train_user_orders.groupby('user_id').size().index.values
#for user in user_array:
# users['total_uniqueItems'] = len(np.unique(train_user_orders[train_user_orders['user_id'] == user]['product_id']))
#orders[orders['order_id'] == 1187899]
#user_1 = orders[orders['user_id'] == 1].groupby('order_id').size()
#user_1.index.values
# 20.6M rows if you have unique rows for each (order_id | product) tuple
# vs.
# 8.5M rows if you have unique rows for each (user_id | product) tuple
# User 1 has 18 unique products spread across 10 prior orders (not including train order 11)
#np.unique(train_user_orders[train_user_orders['user_id'] == 1]['product_id'])
In [ ]:
#train_df.to_csv('train_df_LightGBM_vXXXXX.csv', index=False)
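A side note on the save format (a sketch): a CSV round-trip loses the downcast dtypes on re-read, whereas pickling the frame keeps them intact.
In [ ]:
#train_df.to_pickle('train_df_LightGBM_vXXXXX.pkl')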