In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import lightgbm as lgb
import gc
In [2]:
import gensim
import pickle
In [3]:
from sklearn.model_selection import train_test_split
In [4]:
IDIR = '../data/raw/'
print('loading prior')
priors = pd.read_csv(IDIR + 'order_products__prior.csv', dtype={
'order_id': np.int32,
'product_id': np.uint16,
'add_to_cart_order': np.int16,
'reordered': np.int8})
print('loading train')
train = pd.read_csv(IDIR + 'order_products__train.csv', dtype={
'order_id': np.int32,
'product_id': np.uint16,
'add_to_cart_order': np.int16,
'reordered': np.int8})
print('loading orders')
orders = pd.read_csv(IDIR + 'orders.csv', dtype={
'order_id': np.int32,
'user_id': np.int32,
'order_number': np.int16,
'order_dow': np.int8,
'order_hour_of_day': np.int8,
'days_since_prior_order': np.float32})
print('loading products')
products = pd.read_csv(IDIR + 'products.csv', dtype={
'product_id': np.uint16,
'aisle_id': np.uint8,
'department_id': np.uint8},
usecols=['product_id', 'aisle_id', 'department_id'])
print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
print('train {}: {}'.format(train.shape, ', '.join(train.columns)))
print('products {}: {}'.format(products.shape, ', '.join(products.columns)))
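With the narrow dtypes above, the four frames stay modest in size; an optional check of the in-memory footprint:
for name, frame in [('priors', priors), ('train', train), ('orders', orders), ('products', products)]:
    print('{}: {:.1f} MB'.format(name, frame.memory_usage(deep=True).sum() / 1e6))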
In [5]:
print('computing product f')
prods = pd.DataFrame()
prods['orders'] = priors.groupby(priors.product_id).size().astype(np.int32)
prods['reorders'] = priors['reordered'].groupby(priors.product_id).sum().astype(np.float32)
prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)
products = products.join(prods, on='product_id')
products.set_index('product_id', drop=False, inplace=True)
del prods
gc.collect()
Out[5]:
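For reference, the same three product statistics can be computed in a single pass with named aggregation (a sketch, not used below; requires pandas >= 0.25, dtype downcasts omitted):
prods = priors.groupby('product_id').agg(
    orders=('reordered', 'size'),
    reorders=('reordered', 'sum'))
prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)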
In [6]:
print('add order info to priors')
orders.set_index('order_id', inplace=True, drop=False)
priors = priors.join(orders, on='order_id', rsuffix='_')
priors.drop('order_id_', inplace=True, axis=1)
In [7]:
### user features
print('computing user f')
usr = pd.DataFrame()
usr['average_days_between_orders'] = orders[orders.eval_set == "prior"].groupby('user_id')['days_since_prior_order'].mean().astype(np.float32)
usr['nb_orders'] = orders[orders.eval_set == "prior"].groupby('user_id').size().astype(np.int16)
users = pd.DataFrame()
users['total_items'] = priors.groupby('user_id').size().astype(np.int16)
users['all_products'] = priors.groupby('user_id')['product_id'].apply(set)
users['total_distinct_items'] = (users.all_products.map(len)).astype(np.int16)
users = users.join(usr)
users['average_basket'] = (users.total_items / users.nb_orders).astype(np.float32)
print('user f', users.shape)
del usr, priors
gc.collect()
Out[7]:
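The all_products column above keeps one Python set per user, which is comparatively memory-hungry; a sorted integer array per user is an equivalent, lighter alternative to that line (a sketch, to be run in its place while priors is still in memory; not used below):
users['all_products'] = priors.groupby('user_id')['product_id'].apply(lambda s: np.sort(s.unique()))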
In [35]:
### build list of candidate products to reorder, with features ###
def features(selected_orders, labels_given=False):
print('build candidate list')
order_list = []
product_list = []
labels = []
i=0
for row in selected_orders.itertuples():
i+=1
if i%10000 == 0: print('order row',i)
order_id = row.order_id
user_id = row.user_id
user_products = users.all_products[user_id]
product_list += user_products
order_list += [order_id] * len(user_products)
if labels_given:
labels += [(order_id, product) in train.index for product in user_products]
df = pd.DataFrame({'order_id':order_list, 'product_id':product_list}, dtype=np.int32)
labels = np.array(labels, dtype=np.int8)
del order_list
del product_list
print('user related features')
df['user_id'] = df.order_id.map(orders.user_id)
df['user_total_orders'] = df.user_id.map(users.nb_orders)
df['user_total_items'] = df.user_id.map(users.total_items)
df['total_distinct_items'] = df.user_id.map(users.total_distinct_items)
df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
df['user_average_basket'] = df.user_id.map(users.average_basket)
print('order related features')
df['dow'] = df.order_id.map(orders.order_dow)
df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders
print('product related features')
df['aisle_id'] = df.product_id.map(products.aisle_id)
df['department_id'] = df.product_id.map(products.department_id)
df['product_orders'] = df.product_id.map(products.orders).astype(np.int32)
df['product_reorders'] = df.product_id.map(products.reorders)
df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)
print(df.dtypes)
print(df.memory_usage())
return (df, labels)
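For reference, the per-product membership test in the loop above can be replaced by one vectorized step once df is built (a sketch, assuming train is indexed by (order_id, product_id) as done in the split cell below):
key = pd.MultiIndex.from_arrays([df['order_id'], df['product_id']])
labels = key.isin(train.index).astype(np.int8)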
In [32]:
### train / test orders ###
print('split orders : train, test')
train_orders = orders[orders.eval_set == 'train']
test_orders = orders[orders.eval_set == 'test']
train.set_index(['order_id', 'product_id'], inplace=True, drop=False)
In [10]:
df_train, labels = features(train_orders, labels_given=True)
In [11]:
del train_orders
gc.collect()
Out[11]:
In [12]:
df_train.shape
Out[12]:
In [13]:
df_train.head()
Out[13]:
In [12]:
# all other user features
all_users_features_df = pd.read_pickle("../data/processed/cleaned_all_users_features.pickle")
In [14]:
cols = all_users_features_df.columns
dow_cols = cols[cols.str.startswith('dow_')].tolist() + cols[cols.str.startswith('daytime_')].tolist()
most_cols = cols[cols.str.startswith('most_')].tolist()
top_cols = cols[cols.str.startswith('top')].tolist()
emb_cols = cols[cols.str.startswith('emb_')].tolist()
In [15]:
print("join with the user features")
to_join = ["user_id", 'user_avg_reordered', 'user_perc_reordered'] + most_cols + dow_cols + emb_cols + top_cols
df_train = pd.merge(df_train, all_users_features_df[to_join], on="user_id")
In [16]:
gc.collect()
Out[16]:
In [17]:
df_train.info()
In [15]:
#product_emd = gensim.models.Word2Vec.load("../data/interim/product2vec.model")
In [16]:
#product_emd_dict = {k: product_emd[k] for k in product_emd.wv.vocab.keys()}
In [17]:
# product_emd_df = np.round(pd.DataFrame.from_dict(product_emd_dict, orient='index', dtype = np.float32),2).\
# add_prefix('prod2vec_').reset_index().\
# rename(columns = {'index': 'product_id'})
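The commented cells above consume a product2vec model produced elsewhere. A minimal sketch of how such a model could be trained upstream, assuming each order is treated as a "sentence" of product-id tokens and the pre-4.0 gensim API (size=, wv.vocab) used above; vector size, window, and min_count here are illustrative guesses:
# run upstream, while priors is still in memory
sentences = (priors.sort_values('add_to_cart_order')
             .groupby('order_id')['product_id']
             .apply(lambda s: s.astype(str).tolist()))
model = gensim.models.Word2Vec(sentences.tolist(), size=32, window=5, min_count=2, workers=4)
model.save('../data/interim/product2vec.model')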
In [23]:
product_emd_df.head()
Out[23]:
In [22]:
#prod2vec_cols = product_emd_df.columns[product_emd_df.columns.str.startswith('prod2vec')]
In [23]:
# for col in prod2vec_cols:
# product_emd_df[col] = product_emd_df[col].astype('float32')
In [32]:
#product_emd_df.to_pickle("../data/interim/prod2vec_df.pickle")
product_emd_df = pd.read_pickle("../data/interim/prod2vec_df.pickle")
In [33]:
product_emd_df.head()
Out[33]:
In [39]:
product_emd_df['product_id'] = product_emd_df['product_id'].astype('int32')
In [42]:
print("joint product embedding")
df_train = pd.merge(df_train, product_emd_df, on = "product_id", how = "left")
In [40]:
#df_train.drop(prod2vec_cols, axis = 1, inplace=True)
In [44]:
df_train.info()
In [18]:
del all_users_features_df#, product_emd_df
gc.collect()
Out[18]:
In [47]:
prod2vec_cols = df_train.columns[df_train.columns.str.startswith('prod2vec')].tolist()
df_train[prod2vec_cols] = df_train[prod2vec_cols].fillna(df_train[prod2vec_cols].mean())
In [48]:
df_train.shape
Out[48]:
In [19]:
# use a new name so the features() helper defined above is not shadowed
feature_cols = df_train.columns
dow_cols = feature_cols[feature_cols.str.startswith("dow_")].tolist()
daytime_cols = feature_cols[feature_cols.str.startswith("daytime_")].tolist()
emb_cols = feature_cols[feature_cols.str.startswith("emb_")].tolist()
#most_cols = feature_cols[feature_cols.str.startswith("most_")].tolist()
#top_cols = feature_cols[feature_cols.str.startswith("top_")].tolist()
#prod2vec_cols = feature_cols[feature_cols.str.startswith("prod2vec")].tolist()
f_to_use = ['user_total_orders', 'user_total_items', 'total_distinct_items', 'dow',
'user_average_days_between_orders', 'user_average_basket',
'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
'aisle_id', 'department_id', 'product_orders', 'product_reorders',
'product_reorder_rate'] + dow_cols + daytime_cols + emb_cols
In [26]:
gc.collect()
Out[26]:
In [29]:
print("split the train and validation set")
X_train, X_valid, y_train, y_valid = train_test_split(df_train[f_to_use], labels, test_size = 0.3, random_state=2017)
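Note that a plain random split can put rows from the same order (and user) in both sets; if that leakage is a concern, a group-aware split is a drop-in alternative (a sketch, not used below):
from sklearn.model_selection import GroupShuffleSplit

gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=2017)
train_idx, valid_idx = next(gss.split(df_train, labels, groups=df_train['order_id']))
X_train, X_valid = df_train[f_to_use].iloc[train_idx], df_train[f_to_use].iloc[valid_idx]
y_train, y_valid = labels[train_idx], labels[valid_idx]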
In [30]:
X_train = pd.DataFrame(X_train, columns = f_to_use)
X_valid = pd.DataFrame(X_valid, columns = f_to_use)
In [31]:
print('formatting training and validation datasets for lgb')
d_train = lgb.Dataset(X_train,
label=y_train,
categorical_feature=['aisle_id', 'department_id'])
d_valid = lgb.Dataset(X_valid,
label=y_valid,
categorical_feature=['aisle_id', 'department_id'])
In [32]:
#del df_train
gc.collect()
Out[32]:
In [33]:
params = {
'task': 'train',
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': {'binary_logloss'},
'num_leaves': 80,
'max_depth': 10,
'feature_fraction': 0.85,
'bagging_fraction': 0.9,
'bagging_freq': 8
}
ROUNDS = 300
In [34]:
print('Training LightGBM ...')
bst = lgb.train(params, d_train, valid_sets=[d_valid], num_boost_round=ROUNDS, early_stopping_rounds=10)
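Once training finishes, a quick look at gain-based feature importance helps sanity-check what the model relies on (optional):
imp = pd.DataFrame({'feature': bst.feature_name(),
                    'gain': bst.feature_importance(importance_type='gain')})
print(imp.sort_values('gain', ascending=False).head(20))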
In [35]:
bst.save_model('../models/lightGBM_80_10.txt', num_iteration=bst.best_iteration)
In [20]:
most_cols = feature_cols[feature_cols.str.startswith("most_")].tolist()
top_cols = feature_cols[feature_cols.str.startswith("top")].tolist()
f_to_use = ['user_total_orders', 'user_total_items', 'total_distinct_items', 'dow',
'user_average_days_between_orders', 'user_average_basket',
'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
'aisle_id', 'department_id', 'product_orders', 'product_reorders',
'product_reorder_rate'] + dow_cols + daytime_cols + emb_cols + most_cols + top_cols
print(f_to_use)
In [37]:
del d_train, d_valid
gc.collect()
Out[37]:
In [21]:
print("split the train and validation set")
X_train, X_valid, y_train, y_valid = train_test_split(df_train[f_to_use], labels, test_size = 0.3, random_state=2017)
In [22]:
X_train = pd.DataFrame(X_train, columns = f_to_use)
X_valid = pd.DataFrame(X_valid, columns = f_to_use)
In [23]:
print('formatting training and validation datasets for lgb')
d_train = lgb.Dataset(X_train,
label=y_train,
categorical_feature=['aisle_id', 'department_id', 'most_reordered_aiesle',
'most_reordered_dpmt','top1_reordered_pid', 'top2_reordered_pid',
'top3_reordered_pid' ])
d_valid = lgb.Dataset(X_valid,
label=y_valid,
categorical_feature=['aisle_id', 'department_id', 'most_reordered_aiesle', 'most_reordered_dpmt',
'top1_reordered_pid', 'top2_reordered_pid',
'top3_reordered_pid' ])
In [24]:
params = {
'task': 'train',
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': {'binary_logloss'},
'num_leaves': 80,
'max_depth': 10,
'feature_fraction': 0.85,
'bagging_fraction': 0.9,
'bagging_freq': 8
}
ROUNDS = 500
In [25]:
print('Training LightGBM ...')
bst = lgb.train(params, d_train, valid_sets=[d_valid], num_boost_round=ROUNDS, early_stopping_rounds=10)
In [26]:
bst.save_model('../models/lightGBM_morefeatures_80_10.txt', num_iteration=bst.best_iteration)
It seems these extra features don't add much value.
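One way to quantify that observation is to print the best validation loss after each run and compare the two boosters (a sketch; bst here is the model trained just above):
print('best iteration:', bst.best_iteration)
print('best valid logloss:', bst.best_score['valid_0']['binary_logloss'])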
In [54]:
#df_train.drop(emb_cols, axis = 1, inplace = True)
df_train.info()
In [55]:
gc.collect()
Out[55]:
In [53]:
prod2vec_cols = feature_cols[feature_cols.str.startswith("prod2vec")].tolist()
f_to_use = ['user_total_orders', 'user_total_items', 'total_distinct_items', 'dow',
'user_average_days_between_orders', 'user_average_basket',
'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
'aisle_id', 'department_id', 'product_orders', 'product_reorders',
'product_reorder_rate'] + dow_cols + daytime_cols + prod2vec_cols
print(f_to_use)
In [56]:
print("split the train and validation set")
X_train, X_valid, y_train, y_valid = train_test_split(df_train[f_to_use], labels, test_size = 0.3, random_state=2017)
In [58]:
X_train = pd.DataFrame(X_train, columns = f_to_use)
X_valid = pd.DataFrame(X_valid, columns = f_to_use)
In [59]:
print('formatting training and validation datasets for lgb')
d_train = lgb.Dataset(X_train,
label=y_train,
categorical_feature=['aisle_id', 'department_id'])
d_valid = lgb.Dataset(X_valid,
label=y_valid,
categorical_feature=['aisle_id', 'department_id'])
In [60]:
params = {
'task': 'train',
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': {'binary_logloss'},
'num_leaves': 100,
'max_depth': 12,
'feature_fraction': 0.85,
'bagging_fraction': 0.9,
'bagging_freq': 8
}
ROUNDS = 500
In [61]:
gc.collect()
Out[61]:
In [62]:
print('Training LightGBM ...')
bst_all = lgb.train(params, d_train, valid_sets=[d_valid], num_boost_round=ROUNDS, early_stopping_rounds=10)
In [63]:
bst_all.save_model('../models/lightGBM_prodfeats_100_12.txt', num_iteration=bst_all.best_iteration)
In [36]:
### build candidates list for test ###
df_test, _ = features(test_orders)
In [38]:
# all other user features
all_users_features_df = pd.read_pickle("../data/processed/cleaned_all_users_features.pickle")
cols = all_users_features_df.columns
dow_cols = cols[cols.str.startswith('dow_')].tolist() + cols[cols.str.startswith('daytime_')].tolist()
most_cols = cols[cols.str.startswith('most_')].tolist()
top_cols = cols[cols.str.startswith('top')].tolist()
emb_cols = cols[cols.str.startswith('emb_')].tolist()
print("join with the user features")
to_join = ["user_id", 'user_avg_reordered', 'user_perc_reordered'] + most_cols + dow_cols + emb_cols + top_cols
df_test = pd.merge(df_test, all_users_features_df[to_join], on="user_id")
In [41]:
df_test.shape
Out[41]:
In [42]:
del all_users_features_df, test_orders
gc.collect()
Out[42]:
In [43]:
len(f_to_use)
Out[43]:
In [45]:
# load the model
bst_best = lgb.Booster(model_file='../models/lightGBM_morefeatures_80_10.txt')
In [47]:
print('LightGBM predict')
preds = bst_best.predict(df_test[f_to_use])
In [48]:
df_test['pred'] = preds
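Before committing to a cutoff, it is worth glancing at the score distribution that the 0.2 threshold below will slice (optional):
print(df_test['pred'].describe())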
In [50]:
def generate_submission(df_test, test_order_ids, file_name, threshold=0.2, single_thres=True):
    """Write label predictions to file_name in the 'order_id,products' submission format."""
    if single_thres:
        d = dict()
        for row in df_test.itertuples():
            if row.pred > threshold:
                try:
                    d[row.order_id] += ' ' + str(row.product_id)
                except KeyError:
                    d[row.order_id] = str(row.product_id)
        for order in test_order_ids:
            if order not in d:
                d[order] = 'None'
        sub = pd.DataFrame.from_dict(d, orient='index')
        sub.reset_index(inplace=True)
        sub.columns = ['order_id', 'products']
        sub.to_csv(file_name, index=False)
    else:
        pass  # per-order thresholds not implemented yet
In [52]:
test_order_ids = orders[orders.eval_set == 'test'].order_id
In [53]:
len(test_order_ids)
Out[53]:
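For reference, the single-threshold path of generate_submission can also be written without the Python loop (a sketch with the same output format; the output path is hypothetical):
sel = df_test[df_test.pred > 0.2]
sub = (sel.groupby('order_id')['product_id']
       .apply(lambda s: ' '.join(s.astype(str)))
       .reindex(test_order_ids.values, fill_value='None')
       .rename('products')
       .rename_axis('order_id')
       .reset_index())
sub.to_csv('../models/lightGBM_vectorized_preds.csv', index=False)  # hypothetical path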
In [59]:
df_test[['order_id', 'pred', 'product_id']].to_csv("../data/processed/lightGBM_morefeatures_prob_preds.csv",index=False)
In [60]:
generate_submission(df_test, test_order_ids, '../models/lightGBM_morefeatures_preds_20%thr.csv', threshold=0.2)
In [ ]: