Instacart

This workbook about feature generation The generated fetaures are stored as csv files, so can be loaded from subseqent pages

TODO Combine the feature generation for train and test.

1. Load data


In [94]:
import numpy as np 
import pandas as pd 
import time
from tqdm import tqdm

import gc


print('loading prior')
priors = pd.read_csv('./data/order_products__prior.csv')
print('loading train')

train_all = pd.read_csv('./data/order_products__train.csv')
## Have split the trian data into two sets, train and eval
train = pd.read_csv('./data/train_new.csv')
train_eval = pd.read_csv('./data/train_eval.csv')

print('loading orders')
orders = pd.read_csv(IDIR + 'orders.csv')

print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
print('train {}: {}'.format(train.shape, ', '.join(train.columns)))

###
# some memory measures for kaggle kernel
print('optimize memory')
orders.order_dow = orders.order_dow.astype(np.int8)
orders.order_hour_of_day = orders.order_hour_of_day.astype(np.int8)
orders.order_number = orders.order_number.astype(np.int16)
orders.order_id = orders.order_id.astype(np.int32)
orders.user_id = orders.user_id.astype(np.int32)
orders.days_since_prior_order = orders.days_since_prior_order.astype(np.float32)


train.reordered = train.reordered.astype(np.int8)
train.add_to_cart_order = train.add_to_cart_order.astype(np.int16)

train_eval.reordered = train.reordered.astype(np.int8)
train_eval.add_to_cart_order = train.add_to_cart_order.astype(np.int16)

train_all.reordered = train.reordered.astype(np.int8)
train_all.add_to_cart_order = train.add_to_cart_order.astype(np.int16)




priors.order_id = priors.order_id.astype(np.int32)
priors.add_to_cart_order = priors.add_to_cart_order.astype(np.int16)
priors.reordered = priors.reordered.astype(np.int8)
priors.product_id = priors.product_id.astype(np.int32)


loading prior
loading train
loading orders
priors (32434489, 4): order_id, product_id, add_to_cart_order, reordered
orders (3421083, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
train (1038783, 4): order_id, product_id, add_to_cart_order, reordered
optimize memory

Load Products


In [95]:
print('loading products')


products = pd.read_csv(IDIR + 'products.csv')

products.drop(['product_name'], axis=1, inplace=True)
products.aisle_id = products.aisle_id.astype(np.int8)
products.department_id = products.department_id.astype(np.int8)
products.product_id = products.product_id.astype(np.int32)


loading products

2. Prepare data

Concat prior and train

To be used for final predictions


In [96]:
prior_train = pd.concat([priors, train_all], axis = 0)
print prior_train.shape

if ( priors.shape[0] + train.shape[0] ) ==  prior_train.shape[0]:
    print "concat successful"


(33473272, 4)
concat successful

Join with orders table to get the user id

This gets us the user_id and other order features


In [97]:
## set the index, for the join
orders.set_index('order_id', inplace=True, drop=False)

## Join with prior_train
print "Joining orders with prior_train"
prior_train = prior_train.join(orders, on = 'order_id', how = 'left', lsuffix = '_')
prior_train.drop('order_id_', inplace = True, axis = 1)

## Repeat the same only for prior
print "Joining orders with priors"
priors = priors.join(orders, on = 'order_id', how = 'left', lsuffix = '_')
priors.drop('order_id_', inplace = True, axis = 1)


## Joining orders with train
print "Joining orders with train"
train = train.join(orders, on = 'order_id', how = 'left', lsuffix = '_')
train.drop('order_id_', inplace = True, axis = 1)

## Joining orders with train_eval
print "Joining orders with train_eval"
train_eval = train_eval.join(orders, on = 'order_id', how = 'left', lsuffix = '_')
train_eval.drop('order_id_', inplace = True, axis = 1)


## reset the order table index
orders.reset_index(inplace=True, drop=True)
orders.head()


Joining orders with prior_train
Out[97]:
product_id add_to_cart_order reordered order_id user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order
0 33120 1 1 2 202279 prior 3 5 9 8.0
1 28985 2 1 2 202279 prior 3 5 9 8.0
2 9327 3 0 2 202279 prior 3 5 9 8.0
3 45918 4 1 2 202279 prior 3 5 9 8.0
4 30035 5 0 2 202279 prior 3 5 9 8.0

Make a data frame of user and previous product list


In [99]:
## Using prior and train data
users_prior_all = pd.DataFrame()
users_prior_all['prod_list'] = prior_train.groupby('user_id')['product_id'].apply(set)
users_prior_all.reset_index(inplace = True, drop = False)


## Using only prior data
users_prior = pd.DataFrame()
users_prior['prod_list'] = priors.groupby('user_id')['product_id'].apply(set)
users_prior.reset_index(inplace = True, drop = False)
users_prior.head()


Out[99]:
user_id prod_list
0 1 {17122, 196, 26405, 27845, 13032, 39657, 12427...
1 2 {45066, 2573, 18961, 23, 32792, 22559, 12324, ...
2 3 {17668, 39190, 44683, 21903, 14992, 21137, 324...
3 4 {26576, 21573, 17769, 25623, 35469, 37646, 366...
4 5 {11777, 40706, 48775, 20754, 28289, 6808, 1398...

Create train eval

Keep 25% of the train data outside for cross validation


In [102]:
"""
import random
train.reset_index(drop = True, inplace = True)
order_ids = list(train['order_id'].unique())
sample_size = int(0.25 * len(order_ids))
sample_orders = random.sample(order_ids, sample_size)

train_eval = train[train['order_id'].isin(sample_orders)]
train_new = train[~train['order_id'].isin(sample_orders)]


train_eval.to_csv('./data/train_eval.csv', index = False)
train_new.to_csv('./data/train_new.csv', index = False)

del train
train = train_new
del train_new
train.head()

"""


Out[102]:
"\nimport random\ntrain.reset_index(drop = True, inplace = True)\norder_ids = list(train['order_id'].unique())\nsample_size = int(0.25 * len(order_ids))\nsample_orders = random.sample(order_ids, sample_size)\n\ntrain_eval = train[train['order_id'].isin(sample_orders)]\ntrain_new = train[~train['order_id'].isin(sample_orders)]\n\n\ntrain_eval.to_csv('./data/train_eval.csv', index = False)\ntrain_new.to_csv('./data/train_new.csv', index = False)\n\ndel train\ntrain = train_new\ndel train_new\ntrain.head()\n\n"

3. Feature Generation

Features built

  • Order features

    • Current Order - u'order_dow', u'order_hour_of_day', u'days_since_prior_order'
    • Prev Order - u'prev_order_dow',u'prev_order_hour_of_day', u'prev_days_since_prior_order'
  • User Order Features

    • u'avg_days_since_prior_order
  • Product Features

    • u'aisle_id', u'department_id'
    • u'p_orders', u'p_reorders', u'p_reorder_rate'
  • User Features

    • u'tot_orders', u'tot_prods', u'avg_basket', u'avg_reorder', u'std_basket'
  • User Product Features

    • u'up_orders', u'up_reorder', u'up_reorder_rate'

User Order features

How much time on an average does a user leave between orders


In [104]:
## using prior stats

user_order_prior = pd.DataFrame()
user_order_prior['avg_days_since_prior_order'] = priors.groupby('user_id')['days_since_prior_order'].agg('mean')

## using prior train stats
user_order_all = pd.DataFrame()
user_order_all['avg_days_since_prior_order'] = prior_train.groupby('user_id')['days_since_prior_order'].agg('mean')

user_order_prior.reset_index(drop = False, inplace = True)
user_order_all.reset_index(drop = False, inplace = True)

print user_order_prior.head()
print
print user_order_all.head()


   user_id  avg_days_since_prior_order
0        1                   20.259260
1        2                   15.967033
2        3                   11.487180
3        4                   15.357142
4        5                   14.500000

   user_id  avg_days_since_prior_order
0        1                   19.200001
1        2                   18.009390
2        3                   11.487180
3        4                   15.357142
4        5                   12.314285

User id last order feature

The last order of the user, fetaures of it.


In [105]:
def last_order_features(priors):
    
    max_order = pd.DataFrame()
    max_order['max_order'] = priors.groupby(['user_id'])['order_number'].agg('max')
    max_order = max_order.rename(columns = {"max_order":"order_number"})
    max_order.reset_index(inplace = True, drop = False)

    max_order.set_index(['user_id','order_number'], drop = False, inplace = True)
    priors.set_index(['user_id','order_number'], drop = False, inplace = True)

    max_order = max_order.join(priors[['user_id', 'order_id', 'order_number','order_dow','order_hour_of_day','days_since_prior_order']], rsuffix="_")

    max_order.reset_index(drop =True, inplace = True)
    priors.reset_index(drop = True, inplace = True)

    max_order.drop('user_id_', inplace = True, axis =1)
    max_order.drop('order_number_', inplace = True, axis = 1)
    max_order.drop('order_number', inplace = True, axis = 1)


    max_order = max_order.rename(columns = {"order_id":"prev_order_id","order_dow":"prev_order_dow"
                                           ,"order_hour_of_day":"prev_order_hour_of_day"
                                           ,"days_since_prior_order":"prev_days_since_prior_order"})

    max_order.drop_duplicates(inplace = True)
    return max_order

In [106]:
## Stats from prior
max_order = last_order_features(priors)
print max_order.head()
print
## Stats from prior and train
max_order_all = last_order_features(prior_train)
print max_order_all.head()


    user_id  prev_order_id  prev_order_dow  prev_order_hour_of_day  \
0         1        2550362               4                       8   
9         2         839880               3                      10   
25        3        1402502               1                      15   
31        4        2557754               5                      13   
34        5         157374               1                      18   

    prev_days_since_prior_order  
0                          30.0  
9                          13.0  
25                         15.0  
31                          0.0  
34                         19.0  

    user_id  prev_order_id  prev_order_dow  prev_order_hour_of_day  \
0         1        1187899               4                       8   
11        2        1492625               1                      11   
42        3        1402502               1                      15   
48        4        2557754               5                      13   
51        5        2196797               0                      11   

    prev_days_since_prior_order  
0                          14.0  
11                         30.0  
42                         15.0  
48                          0.0  
51                          6.0  

Product Features

Features of the product, including the number of orders in which the product has appeared, number of re-orders where it has appeared and reorder rate


In [107]:
def product_features(priors):
    
    
    prods = pd.DataFrame()
    prods['p_orders'] = priors.groupby(priors.product_id).size().astype(np.float32)
    prods['p_reorders'] = priors['reordered'].groupby(priors.product_id).sum().astype(np.float32)

    prods['p_reorder_rate'] = (prods.p_reorders / prods.p_orders).astype(np.float32)

    ## set the index for products
    products.set_index('product_id', inplace = True, drop = False)
    products_prior = products.join(prods, rsuffix="_")
    ## Reset the index
    products_prior.reset_index(inplace = True, drop = True)
    del prods
    return products_prior

In [108]:
### Stats from prior and train
products_all = product_features(prior_train)
print products_all.head()
print
## Stats from prior
products_prior = product_features(priors)
print products_prior.head()


   product_id  aisle_id  department_id  p_orders  p_reorders  p_reorder_rate
0           1        61             19    1907.0      1171.0        0.614053
1           2       104             13      93.0        13.0        0.139785
2           3        94              7     282.0       208.0        0.737589
3           4        38              1     344.0       157.0        0.456395
4           5         5             13      16.0        10.0        0.625000

   product_id  aisle_id  department_id  p_orders  p_reorders  p_reorder_rate
0           1        61             19    1852.0      1136.0        0.613391
1           2       104             13      90.0        12.0        0.133333
2           3        94              7     277.0       203.0        0.732852
3           4        38              1     329.0       147.0        0.446809
4           5         5             13      15.0         9.0        0.600000

User features


In [109]:
def user_features(priors):
    prod_count_prior = pd.DataFrame()

    prod_count_prior['basket_size'] = priors.groupby(['user_id','order_id'])['product_id'].size().astype(np.int32)
    prod_count_prior['reorder_size'] = priors.groupby(['user_id','order_id'])['reordered'].agg('sum').astype(np.int32)

    # reset / set index
    prod_count_prior = prod_count_prior.reset_index()
    prod_count_prior.set_index('user_id', inplace = True, drop =False)

    prod_count_prior['tot_orders']  =  prod_count_prior.groupby(['user_id']).size().astype(np.int32)
    prod_count_prior['tot_prods']   =  prod_count_prior.groupby(['user_id'])['basket_size'].agg(['sum'])
    prod_count_prior['avg_basket']  =  prod_count_prior.groupby(['user_id'])['basket_size'].agg(['mean'])
    prod_count_prior['avg_reorder'] =  prod_count_prior.groupby(['user_id'])['reorder_size'].agg(['mean'])
    prod_count_prior['std_basket']  =  prod_count_prior.groupby(['user_id'])['basket_size'].agg(['std'])


    prod_count_prior.drop(['order_id','basket_size','reorder_size'], inplace=True, axis=1)
    prod_count_prior.drop_duplicates(inplace = True)
    prod_count_prior = prod_count_prior.reset_index(level = 'user_id', drop = True)

    return prod_count_prior

In [110]:
## Stats from only prior
prod_count_prior = user_features(priors)
## Stats from all
prod_count_all = user_features(prior_train)

print prod_count_prior.head()
print
print prod_count_all.head()


/Library/Python/2.7/site-packages/ipykernel-4.3.1-py2.7.egg/ipykernel/__main__.py:11: FutureWarning: 'user_id' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
/Library/Python/2.7/site-packages/ipykernel-4.3.1-py2.7.egg/ipykernel/__main__.py:12: FutureWarning: 'user_id' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
/Library/Python/2.7/site-packages/ipykernel-4.3.1-py2.7.egg/ipykernel/__main__.py:13: FutureWarning: 'user_id' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
/Library/Python/2.7/site-packages/ipykernel-4.3.1-py2.7.egg/ipykernel/__main__.py:14: FutureWarning: 'user_id' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
/Library/Python/2.7/site-packages/ipykernel-4.3.1-py2.7.egg/ipykernel/__main__.py:15: FutureWarning: 'user_id' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
   user_id  tot_orders  tot_prods  avg_basket  avg_reorder  std_basket
0        1          10         59    5.900000     4.100000    1.523884
1        2          14        195   13.928571     6.642857    5.717238
2        3          12         88    7.333333     4.583333    2.103388
3        4           5         18    3.600000     0.200000    2.073644
4        5           4         37    9.250000     3.500000    3.095696

   user_id  tot_orders  tot_prods  avg_basket  avg_reorder  std_basket
0        1          11         70    6.363636     4.636364    2.110579
1        2          15        226   15.066667     7.000000    7.055562
2        3          12         88    7.333333     4.583333    2.103388
3        4           5         18    3.600000     0.200000    2.073644
4        5           5         46    9.200000     3.600000    2.683282

User Product Features


In [111]:
def user_prod_features(priors):
    user_prod_prior = pd.DataFrame()

    ## Number of user's order where product id is present
    user_prod_prior['up_orders'] = priors.groupby(['user_id','product_id'])['order_id'].size()

    user_prod_prior.reset_index(inplace = True, drop = False)
    user_prod_prior.set_index(['user_id', 'product_id'], inplace = True, drop = False)

    ## Number of times the product was re-ordered by the user
    user_prod_prior['up_reorder']      = priors.groupby(['user_id', 'product_id'])['reordered'].agg(['sum'])
    user_prod_prior['up_reorder_rate'] = user_prod_prior.up_reorder / user_prod_prior.up_orders

    user_prod_prior.reset_index(drop = True, inplace= True)
    return user_prod_prior

In [112]:
## Stats from prior

user_prod_prior = user_prod_features(priors)
user_prod_all   = user_prod_features(prior_train)


print user_prod_prior.head()
print
print user_prod_all.head()


   user_id  product_id  up_orders  up_reorder  up_reorder_rate
0        1         196         10           9         0.900000
1        1       10258          9           8         0.888889
2        1       10326          1           0         0.000000
3        1       12427         10           9         0.900000
4        1       13032          3           2         0.666667

   user_id  product_id  up_orders  up_reorder  up_reorder_rate
0        1         196         11          10         0.909091
1        1       10258         10           9         0.900000
2        1       10326          1           0         0.000000
3        1       12427         10           9         0.900000
4        1       13032          4           3         0.750000

Product days since last ordered


In [ ]:

Create a feature dict


In [113]:
feature_dict_prior = {}

feature_dict_prior[1] = {"name":"user_order_feature","obj":user_order_prior,"index":['user_id']}
feature_dict_prior[2] = {"name":"last_order_feature","obj":max_order,"index":['user_id']}
feature_dict_prior[3] = {"name":"product_feature","obj":products_prior,"index":['product_id']}
feature_dict_prior[4] = {"name":"user_feature","obj":prod_count_prior,"index":['user_id']}
feature_dict_prior[5] = {"name":"user_pro_feature","obj":user_prod_prior,"index":['user_id','product_id']}

feature_dict_all = {}

feature_dict_all[1] = {"name":"user_order_feature","obj":user_order_all,"index":['user_id']}
feature_dict_all[2] = {"name":"last_order_feature","obj":max_order_all,"index":['user_id']}
feature_dict_all[3] = {"name":"product_feature","obj":products_all,"index":['product_id']}
feature_dict_all[4] = {"name":"user_feature","obj":prod_count_all,"index":['user_id']}
feature_dict_all[5] = {"name":"user_prod_feature","obj":user_prod_all,"index":['user_id','product_id']}

4. Join the features


In [114]:
def join_features(feature_dict, features):
    
    for k,v in feature_dict.items():
        print "Joining {} feature".format(v['name'])
        obj = v['obj']
        index = v['index']

        features.set_index(index, drop = False, inplace = True)
        obj.set_index(index, drop = False, inplace = True)

        features = features.join(obj ,on =index, rsuffix='_')
        index_ = [idx + '_' for idx in index]
        features.drop(index_, inplace = True, axis = 1)

        features.reset_index(drop = True, inplace = True)
        obj.reset_index(drop = True, inplace = True)
        
    features.drop( ['prev_order_id'], inplace = True, axis = 1 )


    return features

In [115]:
## Join train

train.head()


Out[115]:
order_id product_id add_to_cart_order reordered
0 1 49302 1 1
1 1 11109 2 1
2 1 10246 3 0
3 1 49683 4 0
4 1 43633 5 1

Prepare Y Variable from train

  • Here we use the train table as our y variable. We create a dataset with user_id, product_id, is_product_in_train columns.

  • We can then use this to merge with user, product and user/product features.

  • This will serve as our testing data before use actually run our model on real data.


In [83]:
## This block needs to be run only once
## Later the output of this block is stored in ./data/features.csv file
## The next block reads the file, it enough to run the next block subsequently

## We could have got this from order_id
## However since we have separted our train into two sets
## we need to interate the train_new aka train data we have created.


train.reset_index(inplace = True, drop = True)

train_list = pd.DataFrame()
train_list['ignore'] = train.groupby(['user_id','order_id'], group_keys = True).size()
train_list.reset_index(inplace = True, drop = False)



train.set_index(['order_id', 'product_id'], inplace = True, drop = False)


print "features"
count = 0

order_list = []
product_list  = []
user_list = []
labels = []

for user_record  in train_list.itertuples():
    count+=1
    if count%10000 == 0:
        print "Finished {} users".format(count)

    user_id  = user_record.user_id
    order_id = user_record.order_id 
    prev_products = list(users_prior[users_prior.user_id == user_id]['prod_list'].values.tolist()[0])
    product_list+= prev_products
    order_list+=[order_id] * len(prev_products)
    user_list+=[user_id] * len(prev_products)
    labels+=[(order_id, product) in train.index for product in prev_products]
    


feature_df = pd.DataFrame({'user_id':user_list,'product_id':product_list,'order_id':order_list,'in_next_order':labels}, dtype=np.int32)

print feature_df.head()

feature_df.to_csv('./features/features.csv', index = False)


features
Finished 10000 users
Finished 20000 users
Finished 30000 users
Finished 40000 users
Finished 50000 users
Finished 60000 users
Finished 70000 users
Finished 80000 users
Finished 90000 users
   in_next_order  order_id  product_id  user_id
0              0   1187899       17122        1
1              1   1187899         196        1
2              1   1187899       26405        1
3              1   1187899       13032        1
4              1   1187899       39657        1

In [84]:
features = pd.read_csv('./features/features.csv')
features.head()


Out[84]:
in_next_order order_id product_id user_id
0 0 1187899 17122 1
1 1 1187899 196 1
2 1 1187899 26405 1
3 1 1187899 13032 1
4 1 1187899 39657 1

Order Features

Attributes of the order we are about to predict


In [85]:
print "Order features"

features.set_index('order_id',inplace = True, drop = False)
orders.set_index('order_id', inplace = True, drop = False)

features = pd.merge(features, orders, left_on = 'order_id', right_on = 'order_id')

#features.drop('order_id', inplace = True, axis =1)
features.drop('eval_set', inplace = True, axis =1)
features.drop('user_id_y', inplace = True, axis =1)
features.drop('order_number', inplace = True, axis =1)



features = features.rename(columns={"user_id_x":"user_id"})


features.reset_index(drop = True, inplace= True)
train.reset_index(drop = True, inplace = True)

features.head()


Order features
Out[85]:
in_next_order order_id product_id user_id order_dow order_hour_of_day days_since_prior_order
0 0 1187899 17122 1 4 8 14.0
1 1 1187899 196 1 4 8 14.0
2 1 1187899 26405 1 4 8 14.0
3 1 1187899 13032 1 4 8 14.0
4 1 1187899 39657 1 4 8 14.0

In [86]:
features = join_features(feature_dict_prior, features)
features.head()


Joining user_order_feature feature
Joining last_order_feature feature
Joining product_feature feature
Joining user_feature feature
Joining user_pro_feature feature
Out[86]:
in_next_order order_id product_id user_id order_dow order_hour_of_day days_since_prior_order avg_days_since_prior_order prev_order_dow prev_order_hour_of_day ... p_reorders p_reorder_rate tot_orders tot_prods avg_basket avg_reorder std_basket up_orders up_reorder up_reorder_rate
0 0 1187899 17122 1 4 8 14.0 20.25926 4 8 ... 9377.0 0.675576 10 59 5.9 4.1 1.523884 1 0 0.000000
1 1 1187899 196 1 4 8 14.0 20.25926 4 8 ... 27791.0 0.776480 10 59 5.9 4.1 1.523884 10 9 0.900000
2 1 1187899 26405 1 4 8 14.0 20.25926 4 8 ... 536.0 0.441516 10 59 5.9 4.1 1.523884 2 1 0.500000
3 1 1187899 13032 1 4 8 14.0 20.25926 4 8 ... 2465.0 0.657158 10 59 5.9 4.1 1.523884 3 2 0.666667
4 1 1187899 39657 1 4 8 14.0 20.25926 4 8 ... 3846.0 0.766288 10 59 5.9 4.1 1.523884 1 0 0.000000

5 rows × 24 columns


In [87]:
features.to_csv('./features/features_train.csv', index = False)

Prepare y variable


In [90]:
def get_y(test_list, users_prior):
    
    feature = []
    count = 0
    for user_record in (test_list.itertuples()):
        count+=1
        if count%10000 == 0:
            print "Finished {} users".format(count)
        user_id = user_record.user_id
        order_id = user_record.order_id 
        prev_products = list(users_prior[users_prior.user_id == user_id]['prod_list'].values.tolist()[0])

        for p_p in prev_products:
            feature.append((order_id, user_id ,p_p))

    test_df = pd.DataFrame(data = feature, columns =['order_id','user_id','product_id'])
    
    
    return test_df

In [118]:
train_eval.head()


Out[118]:
product_id add_to_cart_order reordered order_id user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order
0 27104 1 1 2298068 13 prior 10 6 22 7.0
1 21174 2 1 2298068 13 prior 10 6 22 7.0
2 41860 3 0 2298068 13 prior 10 6 22 7.0
3 38273 4 0 2298068 13 prior 10 6 22 7.0
4 47209 5 1 2298068 13 prior 10 6 22 7.0

Prepare Y Varible from eval


In [119]:
train_eval.reset_index(inplace = True, drop = True)

train_eval_list = pd.DataFrame()
train_eval_list['ignore'] = train_eval.groupby(['user_id','order_id'], group_keys = True).size()
train_eval_list.reset_index(inplace = True, drop = False)


test_df = get_y(train_eval_list, users_prior, orders)
test_df.head()


Finished 10000 users
Finished 20000 users
Finished 30000 users
Out[119]:
order_id user_id product_id order_dow order_hour_of_day days_since_prior_order
0 2298068 13 41351 6 22 7.0
1 2298068 13 41480 6 22 7.0
2 2298068 13 37385 6 22 7.0
3 2298068 13 31372 6 22 7.0
4 2298068 13 42125 6 22 7.0

In [120]:
test_df = join_features(feature_dict_prior, test_df)
test_df.head()


Joining user_order_feature feature
Joining last_order_feature feature
Joining product_feature feature
Joining user_feature feature
Joining user_pro_feature feature
Out[120]:
order_id user_id product_id order_dow order_hour_of_day days_since_prior_order avg_days_since_prior_order prev_order_dow prev_order_hour_of_day prev_days_since_prior_order ... p_reorders p_reorder_rate tot_orders tot_prods avg_basket avg_reorder std_basket up_orders up_reorder up_reorder_rate
0 2298068 13 41351 6 22 7.0 7.381579 6 13 6.0 ... 2074.0 0.520844 12 81 6.75 4.333333 2.005674 2 1 0.5
1 2298068 13 41480 6 22 7.0 7.381579 6 13 6.0 ... 519.0 0.428571 12 81 6.75 4.333333 2.005674 1 0 0.0
2 2298068 13 37385 6 22 7.0 7.381579 6 13 6.0 ... 451.0 0.513083 12 81 6.75 4.333333 2.005674 1 0 0.0
3 2298068 13 31372 6 22 7.0 7.381579 6 13 6.0 ... 541.0 0.242601 12 81 6.75 4.333333 2.005674 1 0 0.0
4 2298068 13 42125 6 22 7.0 7.381579 6 13 6.0 ... 22.0 0.468085 12 81 6.75 4.333333 2.005674 1 0 0.0

5 rows × 23 columns


In [121]:
test_df.to_csv('./features/features_eval.csv',index = False)

Prepare Y Variable from test


In [114]:
test_list = orders[orders.eval_set == 'test']

feature = []
count = 0
for user_record in (test_list.itertuples()):
	count+=1
	if count%10000 == 0:
		print "Finished {} users".format(count)
	user_id = user_record.user_id
	order_id = user_record.order_id 
	prev_products = list(users_prior[users_prior.user_id == user_id]['prod_list'].values.tolist()[0])

	for p_p in prev_products:
		feature.append((order_id, user_id ,p_p))

test_df = pd.DataFrame(data = feature, columns =['order_id','user_id','product_id'])

print test_df.head()


Finished 10000 users
Finished 20000 users
Finished 30000 users
Finished 40000 users
Finished 50000 users
Finished 60000 users
Finished 70000 users
   order_id  user_id  product_id
0   2774568        3       17668
1   2774568        3       39190
2   2774568        3       44683
3   2774568        3       21903
4   2774568        3       14992

Order Feature


In [115]:
## Order features

print "Order features"

test_df.set_index('order_id',inplace = True, drop = False)
orders.set_index('order_id', inplace = True, drop = False)

test_df = pd.merge(test_df, orders, left_on = 'order_id', right_on = 'order_id')

test_df.drop('eval_set', inplace = True, axis =1)
test_df.drop('user_id_y', inplace = True, axis =1)
test_df.drop('order_number', inplace = True, axis =1)



test_df = test_df.rename(columns={"user_id_x":"user_id"})


test_df.reset_index(drop = True, inplace= True)
train.reset_index(drop = True, inplace = True)

test_df.head()


Order features
Out[115]:
order_id user_id product_id order_dow order_hour_of_day days_since_prior_order
0 2774568 3 17668 5 15 11.0
1 2774568 3 39190 5 15 11.0
2 2774568 3 44683 5 15 11.0
3 2774568 3 21903 5 15 11.0
4 2774568 3 14992 5 15 11.0

In [116]:
test_df = join_features(feature_dict_all, test_df)
test_df.head()


Joining user_order_feature feature
Joining last_order_feature feature
Joining product_feature feature
Joining user_feature feature
Joining user_prod_feature feature
Out[116]:
order_id user_id product_id order_dow order_hour_of_day days_since_prior_order avg_days_since_prior_order prev_order_dow prev_order_hour_of_day prev_days_since_prior_order ... p_reorders p_reorder_rate tot_orders tot_prods avg_basket avg_reorder std_basket up_orders up_reorder up_reorder_rate
0 2774568 3 17668 5 15 11.0 11.48718 1 15 15.0 ... 1269.0 0.577869 12 88 7.333333 4.583333 2.103388 5 4 0.800
1 2774568 3 39190 5 15 11.0 11.48718 1 15 15.0 ... 6626.0 0.576776 12 88 7.333333 4.583333 2.103388 10 9 0.900
2 2774568 3 44683 5 15 11.0 11.48718 1 15 15.0 ... 12631.0 0.539233 12 88 7.333333 4.583333 2.103388 2 1 0.500
3 2774568 3 21903 5 15 11.0 11.48718 1 15 15.0 ... 194939.0 0.774474 12 88 7.333333 4.583333 2.103388 8 7 0.875
4 2774568 3 14992 5 15 11.0 11.48718 1 15 15.0 ... 17639.0 0.583513 12 88 7.333333 4.583333 2.103388 2 1 0.500

5 rows × 23 columns


In [117]:
test_df.to_csv('./features/features_test.csv',index = False)

Vowpalwabbit - Experimental


In [ ]:
"""
## Numpy savetxt is extremely slow
#VW_train = np.column_stack((y_train, X_train))
#print "Save"
#np.savetxt('./data/vw_train.csv', VW_train)
#print "done"

VW_train = pd.concat([Y, X],axis =1 )
print VW_train.shape
VW_train.head()

print "VW_train"
VW_train.to_csv('./data/vw_train.csv', index = False)

#python csv2vw.py ./data/vw_train.csv ./data/vw_train.txt 0 1
#python csv2vw.py ./data/vw_test.csv ./data/vw_test.txt 0 1

### Vowpal wabbit baseline model

#time vw ./data/vw_train.txt --predictions vwpred_train.out


vw_pred_train = pd.read_csv('vwpred_train.out', names=['y_p'])
vw_pred_train['y_pp']= vw_pred_train['y_p'].apply(lambda x: 1.0 if x > 0.35 else 0.0)
y_p3 = vw_pred_train['y_pp'].values
print "Vowpal wabbit accuracy {0:.2f}, precision {0:.2f}, recall {0:.2f}, f1-score {0:.2f}".format(
             accuracy_score(y_train, y_p3), 
             precision_score(y_train, y_p3),
             recall_score(y_train, y_p3),
             f1_score(y_train, y_p3))

print confusion_matrix(y_train, y_p3)
"""

In [ ]: