Instacart
This workbook is about feature generation. The generated features are stored as csv files, so they can be loaded from subsequent notebooks.
TODO: Combine the feature generation for train and test.
In [94]:
import numpy as np
import pandas as pd
import time
from tqdm import tqdm
import gc

## Base directory for all input csv files.  Defined here because later cells
## referenced an otherwise-undefined IDIR constant.
IDIR = './data/'

print('loading prior')
priors = pd.read_csv(IDIR + 'order_products__prior.csv')
print('loading train')
train_all = pd.read_csv(IDIR + 'order_products__train.csv')
## The original train data has been split into two sets: train (train_new) and eval.
train = pd.read_csv(IDIR + 'train_new.csv')
train_eval = pd.read_csv(IDIR + 'train_eval.csv')
print('loading orders')
orders = pd.read_csv(IDIR + 'orders.csv')
print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
print('train {}: {}'.format(train.shape, ', '.join(train.columns)))
###
# Downcast dtypes to save memory (kaggle kernels have a tight RAM budget).
print('optimize memory')
orders.order_dow = orders.order_dow.astype(np.int8)
orders.order_hour_of_day = orders.order_hour_of_day.astype(np.int8)
orders.order_number = orders.order_number.astype(np.int16)
orders.order_id = orders.order_id.astype(np.int32)
orders.user_id = orders.user_id.astype(np.int32)
orders.days_since_prior_order = orders.days_since_prior_order.astype(np.float32)
train.reordered = train.reordered.astype(np.int8)
train.add_to_cart_order = train.add_to_cart_order.astype(np.int16)
## BUG FIX: the next four lines previously read from `train` instead of the
## frame being assigned, silently overwriting train_eval / train_all columns
## with misaligned data from train.
train_eval.reordered = train_eval.reordered.astype(np.int8)
train_eval.add_to_cart_order = train_eval.add_to_cart_order.astype(np.int16)
train_all.reordered = train_all.reordered.astype(np.int8)
train_all.add_to_cart_order = train_all.add_to_cart_order.astype(np.int16)
priors.order_id = priors.order_id.astype(np.int32)
priors.add_to_cart_order = priors.add_to_cart_order.astype(np.int16)
priors.reordered = priors.reordered.astype(np.int8)
priors.product_id = priors.product_id.astype(np.int32)
In [95]:
print('loading products')
## Literal path: IDIR was never defined in the original notebook.
products = pd.read_csv('./data/products.csv')
## product_name is not used as a feature; drop it to save memory.
products.drop(['product_name'], axis=1, inplace=True)
## BUG FIX: aisle_id was cast to int8, but the Instacart dataset has 134
## aisles (> int8 max of 127), which silently overflows; int16 is safe.
products.aisle_id = products.aisle_id.astype(np.int16)
products.department_id = products.department_id.astype(np.int8)
products.product_id = products.product_id.astype(np.int32)
In [96]:
## Stack the prior and train order/product rows into a single frame.
prior_train = pd.concat([priors, train_all], axis=0)
print(prior_train.shape)
## Sanity check: row counts must add up.  BUG FIX: the check previously
## compared against train.shape[0] (the post-split subset) instead of
## train_all.shape[0], so it could never report success.
if (priors.shape[0] + train_all.shape[0]) == prior_train.shape[0]:
    print("concat successful")
In [97]:
## Index orders by order_id (kept as a column too) so the joins below align on it.
orders.set_index('order_id', inplace=True, drop=False)
## Join with prior_train.  Py2-only `print` statements converted to print().
print("Joining orders with prior_train")
prior_train = prior_train.join(orders, on='order_id', how='left', lsuffix='_')
## The left frame's own order_id comes back suffixed with '_'; drop the duplicate.
prior_train.drop('order_id_', inplace=True, axis=1)
## Repeat the same only for prior
print("Joining orders with priors")
priors = priors.join(orders, on='order_id', how='left', lsuffix='_')
priors.drop('order_id_', inplace=True, axis=1)
## Joining orders with train
print("Joining orders with train")
train = train.join(orders, on='order_id', how='left', lsuffix='_')
train.drop('order_id_', inplace=True, axis=1)
## Joining orders with train_eval
print("Joining orders with train_eval")
train_eval = train_eval.join(orders, on='order_id', how='left', lsuffix='_')
train_eval.drop('order_id_', inplace=True, axis=1)
## Restore the default integer index on orders.
orders.reset_index(inplace=True, drop=True)
orders.head()
Out[97]:
In [99]:
## Set of previously purchased product ids per user, from prior + train data ...
users_prior_all = (prior_train.groupby('user_id')['product_id']
                   .apply(set)
                   .to_frame('prod_list')
                   .reset_index())
## ... and from prior data only.
users_prior = (priors.groupby('user_id')['product_id']
               .apply(set)
               .to_frame('prod_list')
               .reset_index())
users_prior.head()
Out[99]:
In [102]:
## NOTE(review): dead code kept for provenance — this one-time random 75/25
## split produced ./data/train_new.csv and ./data/train_eval.csv, which the
## loading cell at the top now reads directly.
"""
import random
train.reset_index(drop = True, inplace = True)
order_ids = list(train['order_id'].unique())
sample_size = int(0.25 * len(order_ids))
sample_orders = random.sample(order_ids, sample_size)
train_eval = train[train['order_id'].isin(sample_orders)]
train_new = train[~train['order_id'].isin(sample_orders)]
train_eval.to_csv('./data/train_eval.csv', index = False)
train_new.to_csv('./data/train_new.csv', index = False)
del train
train = train_new
del train_new
train.head()
"""
Out[102]:
Features built
Order features
User Order Features
Product Features
User Features
User Product Features
In [104]:
## Average gap in days between a user's orders.
## Using prior stats only ...
user_order_prior = pd.DataFrame()
user_order_prior['avg_days_since_prior_order'] = priors.groupby('user_id')['days_since_prior_order'].agg('mean')
## ... and using prior + train stats.
user_order_all = pd.DataFrame()
user_order_all['avg_days_since_prior_order'] = prior_train.groupby('user_id')['days_since_prior_order'].agg('mean')
## Move user_id out of the index into a regular column.
user_order_prior.reset_index(drop=False, inplace=True)
user_order_all.reset_index(drop=False, inplace=True)
print(user_order_prior.head())
print()
print(user_order_all.head())
In [105]:
def last_order_features(priors):
    """Build per-user features describing the user's most recent order.

    Parameters
    ----------
    priors : DataFrame with columns user_id, order_id, order_number,
        order_dow, order_hour_of_day, days_since_prior_order
        (one row per ordered product).

    Returns
    -------
    DataFrame with one row per user_id, carrying the last order's attributes
    renamed with a ``prev_`` prefix.

    BUG FIX: the original implementation set and then reset the input frame's
    index with drop=True, permanently replacing the caller's index; this
    version leaves the input untouched.
    """
    ## Keep only rows belonging to each user's highest order_number.
    is_last = priors.groupby('user_id')['order_number'].transform('max') == priors['order_number']
    cols = ['user_id', 'order_id', 'order_dow', 'order_hour_of_day',
            'days_since_prior_order']
    ## The last order has one row per product; collapse to one row per user.
    max_order = priors.loc[is_last, cols].drop_duplicates()
    max_order = max_order.rename(columns={
        'order_id': 'prev_order_id',
        'order_dow': 'prev_order_dow',
        'order_hour_of_day': 'prev_order_hour_of_day',
        'days_since_prior_order': 'prev_days_since_prior_order'})
    ## Match the original output ordering (sorted by user_id, fresh index).
    return max_order.sort_values('user_id').reset_index(drop=True)
In [106]:
## Last-order stats from prior only ...
max_order = last_order_features(priors)
print(max_order.head())
print()
## ... and from prior + train.
max_order_all = last_order_features(prior_train)
print(max_order_all.head())
In [107]:
def product_features(priors, products_df=None):
    """Per-product popularity features joined onto the products table.

    Parameters
    ----------
    priors : DataFrame with product_id and reordered columns
        (one row per ordered product).
    products_df : optional products table (product_id plus metadata).  When
        omitted, falls back to the module-level ``products`` frame for
        backward compatibility with existing call sites.

    Returns
    -------
    Copy of the products table with p_orders, p_reorders and p_reorder_rate
    appended, default integer index.

    NOTE: unlike the original version, the global ``products`` frame's index
    is no longer mutated in place.
    """
    if products_df is None:
        products_df = products  # original behavior: read the notebook global
    prods = pd.DataFrame()
    ## Number of times each product was ordered / reordered; float32 so the
    ## rate division below stays float32.
    prods['p_orders'] = priors.groupby('product_id').size().astype(np.float32)
    prods['p_reorders'] = priors.groupby('product_id')['reordered'].sum().astype(np.float32)
    prods['p_reorder_rate'] = (prods.p_reorders / prods.p_orders).astype(np.float32)
    ## Align on product_id without mutating the caller's frame.
    result = products_df.set_index('product_id', drop=False).join(prods, rsuffix='_')
    result.reset_index(inplace=True, drop=True)
    del prods
    return result
In [108]:
### Product stats from prior + train ...
products_all = product_features(prior_train)
print(products_all.head())
print()
## ... and from prior only.
products_prior = product_features(priors)
print(products_prior.head())
In [109]:
def user_features(priors):
prod_count_prior = pd.DataFrame()
prod_count_prior['basket_size'] = priors.groupby(['user_id','order_id'])['product_id'].size().astype(np.int32)
prod_count_prior['reorder_size'] = priors.groupby(['user_id','order_id'])['reordered'].agg('sum').astype(np.int32)
# reset / set index
prod_count_prior = prod_count_prior.reset_index()
prod_count_prior.set_index('user_id', inplace = True, drop =False)
prod_count_prior['tot_orders'] = prod_count_prior.groupby(['user_id']).size().astype(np.int32)
prod_count_prior['tot_prods'] = prod_count_prior.groupby(['user_id'])['basket_size'].agg(['sum'])
prod_count_prior['avg_basket'] = prod_count_prior.groupby(['user_id'])['basket_size'].agg(['mean'])
prod_count_prior['avg_reorder'] = prod_count_prior.groupby(['user_id'])['reorder_size'].agg(['mean'])
prod_count_prior['std_basket'] = prod_count_prior.groupby(['user_id'])['basket_size'].agg(['std'])
prod_count_prior.drop(['order_id','basket_size','reorder_size'], inplace=True, axis=1)
prod_count_prior.drop_duplicates(inplace = True)
prod_count_prior = prod_count_prior.reset_index(level = 'user_id', drop = True)
return prod_count_prior
In [110]:
## User stats from prior only ...
prod_count_prior = user_features(priors)
## ... and from prior + train.
prod_count_all = user_features(prior_train)
print(prod_count_prior.head())
print()
print(prod_count_all.head())
In [111]:
def user_prod_features(priors):
    """User x product interaction features.

    Returns one row per (user_id, product_id) pair with: up_orders (number
    of the user's orders containing the product), up_reorder (times the user
    reordered it) and up_reorder_rate (their ratio).
    """
    by_pair = priors.groupby(['user_id', 'product_id'])
    ## Number of the user's orders in which the product appears.
    pair_stats = by_pair['order_id'].size().to_frame('up_orders')
    ## Number of times the product was re-ordered by the user.
    pair_stats['up_reorder'] = by_pair['reordered'].agg('sum')
    pair_stats = pair_stats.reset_index()
    pair_stats['up_reorder_rate'] = pair_stats['up_reorder'] / pair_stats['up_orders']
    return pair_stats
In [112]:
## User x product stats from prior only, and from prior + train.
user_prod_prior = user_prod_features(priors)
user_prod_all = user_prod_features(prior_train)
print(user_prod_prior.head())
print()
print(user_prod_all.head())
In [ ]:
In [113]:
## Feature bundles: display name (for logging), the feature frame, and the
## key column(s) to join on.
feature_dict_prior = {}
feature_dict_prior[1] = {"name": "user_order_feature", "obj": user_order_prior, "index": ['user_id']}
feature_dict_prior[2] = {"name": "last_order_feature", "obj": max_order, "index": ['user_id']}
feature_dict_prior[3] = {"name": "product_feature", "obj": products_prior, "index": ['product_id']}
feature_dict_prior[4] = {"name": "user_feature", "obj": prod_count_prior, "index": ['user_id']}
## CONSISTENCY FIX: label was "user_pro_feature" here but "user_prod_feature"
## in feature_dict_all; use the same spelling in both.
feature_dict_prior[5] = {"name": "user_prod_feature", "obj": user_prod_prior, "index": ['user_id', 'product_id']}
feature_dict_all = {}
feature_dict_all[1] = {"name": "user_order_feature", "obj": user_order_all, "index": ['user_id']}
feature_dict_all[2] = {"name": "last_order_feature", "obj": max_order_all, "index": ['user_id']}
feature_dict_all[3] = {"name": "product_feature", "obj": products_all, "index": ['product_id']}
feature_dict_all[4] = {"name": "user_feature", "obj": prod_count_all, "index": ['user_id']}
feature_dict_all[5] = {"name": "user_prod_feature", "obj": user_prod_all, "index": ['user_id', 'product_id']}
In [114]:
def join_features(feature_dict, features):
for k,v in feature_dict.items():
print "Joining {} feature".format(v['name'])
obj = v['obj']
index = v['index']
features.set_index(index, drop = False, inplace = True)
obj.set_index(index, drop = False, inplace = True)
features = features.join(obj ,on =index, rsuffix='_')
index_ = [idx + '_' for idx in index]
features.drop(index_, inplace = True, axis = 1)
features.reset_index(drop = True, inplace = True)
obj.reset_index(drop = True, inplace = True)
features.drop( ['prev_order_id'], inplace = True, axis = 1 )
return features
In [115]:
## Join train
## Peek at the (orders-joined) train frame before building labels from it.
train.head()
Out[115]:
Here we use the train table as our y variable. We create a dataset with user_id, product_id, and an in_next_order label column.
We can then merge this with the user, product and user/product features.
This will serve as our testing data before we actually run our model on real data.
In [83]:
## This block needs to be run only once.
## Its output is stored in ./features/features.csv; the next cell reads that
## file, so it is enough to run the next cell on subsequent sessions.
## We could have derived the user list from order_id alone; however, since the
## train data was split into two sets, we iterate over the train_new (aka
## train) rows we created.
train.reset_index(inplace=True, drop=True)
train_list = pd.DataFrame()
train_list['ignore'] = train.groupby(['user_id', 'order_id'], group_keys=True).size()
train_list.reset_index(inplace=True, drop=False)
## Index train by (order_id, product_id) for fast membership tests below.
train.set_index(['order_id', 'product_id'], inplace=True, drop=False)
print("features")
## PERF FIX: index users_prior once for O(1) per-user lookups; the original
## re-filtered the whole frame with a boolean mask on every iteration.
prod_lists = users_prior.set_index('user_id')['prod_list']
count = 0
order_list = []
product_list = []
user_list = []
labels = []
for user_record in train_list.itertuples():
    count += 1
    if count % 10000 == 0:
        print("Finished {} users".format(count))
    user_id = user_record.user_id
    order_id = user_record.order_id
    prev_products = list(prod_lists[user_id])
    product_list += prev_products
    order_list += [order_id] * len(prev_products)
    user_list += [user_id] * len(prev_products)
    ## Label = 1 when the candidate product actually appears in this train order.
    labels += [(order_id, product) in train.index for product in prev_products]
feature_df = pd.DataFrame({'user_id': user_list, 'product_id': product_list,
                           'order_id': order_list, 'in_next_order': labels}, dtype=np.int32)
print(feature_df.head())
feature_df.to_csv('./features/features.csv', index = False)
In [84]:
## Reload the candidate/label table produced by the previous cell.
features = pd.read_csv('./features/features.csv')
features.head()
Out[84]:
In [85]:
print("Order features")
## NOTE(review): pd.merge below joins on the order_id *columns*, so these two
## set_index calls mainly keep order_id available as a column; kept as-is.
features.set_index('order_id', inplace=True, drop=False)
orders.set_index('order_id', inplace=True, drop=False)
features = pd.merge(features, orders, left_on='order_id', right_on='order_id')
#features.drop('order_id', inplace = True, axis =1)
features.drop('eval_set', inplace=True, axis=1)
## user_id appears in both frames, so the merge suffixes them _x/_y; keep the left one.
features.drop('user_id_y', inplace=True, axis=1)
features.drop('order_number', inplace=True, axis=1)
features = features.rename(columns={"user_id_x": "user_id"})
features.reset_index(drop=True, inplace=True)
train.reset_index(drop=True, inplace=True)
features.head()
Out[85]:
In [86]:
## Attach the prior-only feature bundles to the training candidates.
features = join_features(feature_dict_prior, features)
features.head()
Out[86]:
In [87]:
## Persist the fully-joined training features.
features.to_csv('./features/features_train.csv', index = False)
In [90]:
def get_y(test_list, users_prior, orders=None):
    """Expand each (user, order) row into candidate (order, user, product) rows.

    For every row of test_list, emits one row per product the user has
    previously bought (per users_prior.prod_list).

    Parameters
    ----------
    test_list : DataFrame with user_id and order_id columns.
    users_prior : DataFrame with user_id and prod_list (set of product ids).
    orders : unused.  BUG FIX: accepted (and ignored) because one call site
        passes the orders frame as a third argument, which previously raised
        a TypeError.

    Returns
    -------
    DataFrame with columns order_id, user_id, product_id.
    """
    ## PERF FIX: index once for O(1) per-user lookups instead of re-filtering
    ## the whole users_prior frame on every iteration.
    prod_lists = users_prior.set_index('user_id')['prod_list']
    feature = []
    count = 0
    for user_record in test_list.itertuples():
        count += 1
        if count % 10000 == 0:
            print("Finished {} users".format(count))
        user_id = user_record.user_id
        order_id = user_record.order_id
        prev_products = list(prod_lists[user_id])
        for p_p in prev_products:
            feature.append((order_id, user_id, p_p))
    test_df = pd.DataFrame(data=feature, columns=['order_id', 'user_id', 'product_id'])
    return test_df
In [118]:
## Peek at the held-out eval orders.
train_eval.head()
Out[118]:
In [119]:
## Build (user, order) pairs for the eval split, then expand into candidates.
train_eval.reset_index(inplace=True, drop=True)
train_eval_list = pd.DataFrame()
train_eval_list['ignore'] = train_eval.groupby(['user_id', 'order_id'], group_keys=True).size()
train_eval_list.reset_index(inplace=True, drop=False)
## BUG FIX: get_y takes (test_list, users_prior); a third `orders` argument
## was being passed here and raised a TypeError.
test_df = get_y(train_eval_list, users_prior)
test_df.head()
Out[119]:
In [120]:
## Eval candidates get the prior-only features (no leakage from train).
test_df = join_features(feature_dict_prior, test_df)
test_df.head()
Out[120]:
In [121]:
## Persist the eval-split features.
test_df.to_csv('./features/features_eval.csv',index = False)
In [114]:
## Kaggle test orders: expand each into candidate rows, one per product the
## user previously bought.  CONSISTENCY FIX: this block duplicated get_y's
## loop verbatim; call the helper instead.
test_list = orders[orders.eval_set == 'test']
test_df = get_y(test_list, users_prior)
print(test_df.head())
In [115]:
## Order features
print("Order features")
test_df.set_index('order_id', inplace=True, drop=False)
orders.set_index('order_id', inplace=True, drop=False)
test_df = pd.merge(test_df, orders, left_on='order_id', right_on='order_id')
test_df.drop('eval_set', inplace=True, axis=1)
## user_id comes from both frames, so the merge suffixes them _x/_y; keep the left one.
test_df.drop('user_id_y', inplace=True, axis=1)
test_df.drop('order_number', inplace=True, axis=1)
test_df = test_df.rename(columns={"user_id_x": "user_id"})
test_df.reset_index(drop=True, inplace=True)
## NOTE(review): resetting train's index here looks like a leftover from the
## train cell above; kept for parity but probably unnecessary.
train.reset_index(drop=True, inplace=True)
test_df.head()
Out[115]:
In [116]:
## Test candidates use the prior + train feature bundles (all history available).
test_df = join_features(feature_dict_all, test_df)
test_df.head()
Out[116]:
In [117]:
## Persist the Kaggle test features.
test_df.to_csv('./features/features_test.csv',index = False)
In [ ]:
## NOTE(review): dead code — an abandoned Vowpal Wabbit baseline (export to
## VW format, shell commands to train, and scoring of its predictions).
## Kept as reference only; y_train / X / Y and the sklearn metric functions
## it uses are not defined in this notebook.
"""
## Numpy savetxt is extremely slow
#VW_train = np.column_stack((y_train, X_train))
#print "Save"
#np.savetxt('./data/vw_train.csv', VW_train)
#print "done"
VW_train = pd.concat([Y, X],axis =1 )
print VW_train.shape
VW_train.head()
print "VW_train"
VW_train.to_csv('./data/vw_train.csv', index = False)
#python csv2vw.py ./data/vw_train.csv ./data/vw_train.txt 0 1
#python csv2vw.py ./data/vw_test.csv ./data/vw_test.txt 0 1
### Vowpal wabbit baseline model
#time vw ./data/vw_train.txt --predictions vwpred_train.out
vw_pred_train = pd.read_csv('vwpred_train.out', names=['y_p'])
vw_pred_train['y_pp']= vw_pred_train['y_p'].apply(lambda x: 1.0 if x > 0.35 else 0.0)
y_p3 = vw_pred_train['y_pp'].values
print "Vowpal wabbit accuracy {0:.2f}, precision {0:.2f}, recall {0:.2f}, f1-score {0:.2f}".format(
accuracy_score(y_train, y_p3),
precision_score(y_train, y_p3),
recall_score(y_train, y_p3),
f1_score(y_train, y_p3))
print confusion_matrix(y_train, y_p3)
"""
In [ ]: