In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)  # -1 is deprecated; None means no truncation
In [2]:
import pickle
In [3]:
from sklearn.preprocessing import MultiLabelBinarizer
In [4]:
orders_df = pd.read_csv("../data/raw/orders.csv")
op_train_df = pd.read_csv("../data/raw/order_products__train.csv")
op_prior_df = pd.read_csv("../data/raw/order_products__prior.csv")
In [5]:
orders_df.head()
Out[5]:
In [6]:
orders_prior_info = orders_df[orders_df.eval_set == "prior"].copy()  # copy to avoid SettingWithCopyWarning when adding columns later
In [7]:
orders_prior_info.shape
Out[7]:
In [8]:
prior_uid_pid_df = pd.merge(orders_df[['user_id', 'order_id']], op_prior_df, how = "inner", on = "order_id")
In [9]:
prior_uid_pid_df.shape
Out[9]:
In [10]:
num_orders_per_uid = orders_prior_info.groupby('user_id')['order_id'].\
agg('count').\
reset_index().\
rename(columns = {'order_id': 'num_orders'})
In [11]:
num_orders_per_uid.head()
Out[11]:
In [12]:
num_product_per_user = prior_uid_pid_df.groupby("order_id").agg({'product_id':'count', 'reordered': 'sum', 'user_id': 'first'}).\
reset_index().\
rename(columns = {'product_id': 'num_products', 'reordered': 'num_reordered'}).\
groupby("user_id").\
agg({'num_products': 'mean', 'num_reordered': 'mean'}).\
reset_index().\
rename(columns = {'num_products': 'avg_num_pids', 'num_reordered': 'avg_num_reordered'})
In [14]:
num_product_per_user['perc_reordered'] = num_product_per_user['avg_num_reordered']/num_product_per_user['avg_num_pids']
num_product_per_user = np.round(num_product_per_user, 2)
In [15]:
num_product_per_user.head()
Out[15]:
In [16]:
user_dow = orders_prior_info.groupby(['user_id', 'order_dow'])['order_id'].count().reset_index().\
rename(columns = {'order_id': 'cnt_dow'}).\
pivot(index = "user_id", columns = "order_dow", values = "cnt_dow")
In [17]:
user_dow.columns.name = None
user_dow = user_dow.add_prefix('dow_').reset_index().fillna(0)
In [18]:
user_dow.head()
Out[18]:
In [19]:
user_dow['most_dow'] = user_dow.iloc[:, 1:].idxmax(axis=1)
In [20]:
user_dow.head()
Out[20]:
In [21]:
bins = [0, 6, 10, 13, 17, 20, 24]
group_names = ['sleeping', 'morning', 'noon', 'afternoon', 'evening', 'night']
In [22]:
orders_prior_info['daytime'] = pd.cut(orders_prior_info['order_hour_of_day'], bins = bins, labels= group_names, include_lowest=True)
In [23]:
orders_prior_info.head()
Out[23]:
In [24]:
user_daytime = orders_prior_info.groupby(['user_id', 'daytime'])['order_id'].count().reset_index().\
rename(columns = {'order_id': 'cnt_daytime'}).\
pivot(index = "user_id", columns = "daytime", values = "cnt_daytime").fillna(0)
In [25]:
user_daytime.columns.name = None
user_daytime = user_daytime.add_prefix('daytime_').reset_index().fillna(0)
user_daytime.head()
Out[25]:
In [26]:
user_daytime['most_daytime'] = user_daytime.iloc[:, 1:].idxmax(axis=1)
In [27]:
user_daytime.head()
Out[27]:
In [28]:
user_days_since_po = orders_prior_info.groupby('user_id')['days_since_prior_order'].mean().reset_index().\
rename(columns = {'days_since_prior_order': 'avg_days_since_prior_order'})
In [29]:
user_days_since_po.head()
Out[29]:
In [30]:
products_df = pd.read_csv("../data/raw/products.csv")
aisles_df = pd.read_csv("../data/raw/aisles.csv")
departments_df = pd.read_csv("../data/raw/departments.csv")
In [31]:
products_df = pd.merge(pd.merge(products_df, aisles_df, on = "aisle_id"), departments_df, on = "department_id")
In [32]:
products_df.head()
Out[32]:
In [33]:
products_df.aisle.nunique()
Out[33]:
In [34]:
products_df.department.nunique()
Out[34]:
In [35]:
prior_uid_pid_df = pd.merge(prior_uid_pid_df, products_df[["product_id", "aisle_id", "department_id"]],\
how = "inner", on = "product_id")
In [36]:
prior_uid_pid_df.head()
Out[36]:
In [37]:
def get_most_reorder_pids(grouped_df, top = 3):
    # most frequently reordered product ids for one user; pad with NaN if fewer than `top`
    top3_pids = grouped_df['product_id'].value_counts().nlargest(top).index
    top1 = int(top3_pids[0])
    try:
        top2 = int(top3_pids[1])
    except IndexError:
        top2 = np.nan
    try:
        top3 = int(top3_pids[2])
    except IndexError:
        top3 = np.nan
    return pd.Series({'top1_reordered_pid': top1,
                      'top2_reordered_pid': top2,
                      'top3_reordered_pid': top3})
In [38]:
%time top3_reordered_pids_user = prior_uid_pid_df[prior_uid_pid_df.reordered == 1].groupby('user_id').apply(get_most_reorder_pids)
In [39]:
top3_reordered_pids_user = top3_reordered_pids_user.reset_index()
top3_reordered_pids_user.head()
Out[39]:
In [40]:
# Earlier version that returned a pd.Series (for use with groupby.apply):
# def get_most_reordered_aiesle(grouped_df):
#     top1 = grouped_df['aisle_id'].value_counts().nlargest(1).index[0]
#     return pd.Series({'most_reordered_aiesle': top1})
def get_most_reordered_aiesle(grouped_df):
    # most frequently reordered aisle id for one user
    top1 = grouped_df['aisle_id'].value_counts().nlargest(1).index[0]
    return top1
In [42]:
%time top1_reordered_aiesle_user = pd.DataFrame.from_records([ (g, get_most_reordered_aiesle(grp))\
for g, grp in prior_uid_pid_df[prior_uid_pid_df.reordered == 1].groupby('user_id')],\
columns = ['user_id', 'most_reordered_aiesle'])
In [43]:
top1_reordered_aiesle_user.head()
Out[43]:
In [46]:
def get_most_reordered_dpmt(grouped_df):
    # most frequently reordered department id for one user
    top1 = grouped_df['department_id'].value_counts().nlargest(1).index[0]
    return top1
In [47]:
%time top1_reordered_dpmt_user = pd.DataFrame.from_records([(g, get_most_reordered_dpmt(grp))\
for g, grp in prior_uid_pid_df[prior_uid_pid_df.reordered == 1].groupby('user_id')],\
columns = ['user_id', 'most_reordered_dpmt'])
In [48]:
top1_reordered_dpmt_user.head()
Out[48]:
In [49]:
import gensim
In [50]:
prior_uid_pid_cnt_df = prior_uid_pid_df[['user_id', 'product_id']].groupby(['user_id','product_id']).size()
In [51]:
prior_uid_pid_cnt_df = prior_uid_pid_cnt_df.reset_index().rename(columns = {0:'count'})
In [52]:
prior_uid_pid_cnt_df.head()
Out[52]:
In [53]:
uid_pid_cnt_dict = [{'user_id': k, 'pid_freq_dict': dict(zip(g['product_id'], g['count'])) }\
for k,g in prior_uid_pid_cnt_df.groupby('user_id')]
uid_pid_freq_df = pd.DataFrame(uid_pid_cnt_dict)
In [54]:
uid_pid_freq_df.head()
Out[54]:
In [203]:
uid_pid_freq_df.shape
Out[203]:
In [204]:
# save the product count feature for all users
with open("../data/processed/user_prods_cnt_dict_features.pickle", "wb") as handle:
pickle.dump(uid_pid_freq_df, handle)
In [55]:
del uid_pid_cnt_dict, prior_uid_pid_cnt_df
In [56]:
product_emd = gensim.models.Word2Vec.load("../data/interim/product2vec.model")
In [57]:
def get_purchase_embedding(pid_freq_dict_series, product_emd):
    # build one purchase embedding per user: the frequency-weighted sum of the
    # word2vec vectors of the products the user has bought
    all_uids_purchase_emd = []
    for pfreq_d in pid_freq_dict_series:
        uid_purchase_emd_ls = []
        for pid, freq in pfreq_d.items():
            try:
                # look up vectors via .wv; indexing the model directly is deprecated in gensim
                temp_vec = product_emd.wv[str(pid)] * freq
                uid_purchase_emd_ls.append(temp_vec)
            except KeyError:
                # skip products without a learned vector
                continue
        one_uid_purchase_emd = np.stack(uid_purchase_emd_ls).sum(axis = 0)
        all_uids_purchase_emd.append(one_uid_purchase_emd)
    return np.stack(all_uids_purchase_emd)
In [58]:
uids_purchase_emd = get_purchase_embedding(uid_pid_freq_df['pid_freq_dict'], product_emd)
uid_pid_emb_df = pd.DataFrame(data = uids_purchase_emd, index = uid_pid_freq_df['user_id'])
uid_pid_emb_df = uid_pid_emb_df.add_prefix('emb_')
In [59]:
uid_pid_emb_df = uid_pid_emb_df.reset_index()
uid_pid_emb_df.head()
Out[59]:
Joining all the feature tables and the train set by user id
All the feature tables: num_orders_per_uid, num_product_per_user, user_dow, user_daytime, user_days_since_po, top3_reordered_pids_user, top1_reordered_aiesle_user, top1_reordered_dpmt_user, uid_pid_emb_df
Train set orders info: the orders_df rows with eval_set == "train"
Train set labels info: ../data/interim/train_labels.csv
In [60]:
X_features = pd.concat([num_orders_per_uid, num_product_per_user, user_dow, user_daytime, user_days_since_po,\
top3_reordered_pids_user, top1_reordered_aiesle_user, top1_reordered_dpmt_user, uid_pid_emb_df], axis = 1)
In [61]:
X_features.shape
Out[61]:
In [62]:
X_features = X_features.loc[:, ~ X_features.columns.duplicated()]
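Note on the join above: pd.concat with axis = 1 stacks the feature tables positionally, so it assumes every table has the same length and the same row order by user_id; the duplicated user_id columns are then dropped. A merge on user_id is order-independent. Below is a minimal sketch of that alternative, using the same feature tables built above (not executed here; how = "inner" keeps only users present in every table, "outer" would keep all).
In [ ]:
# Hypothetical alternative join: merge each feature table on user_id instead of
# relying on positional alignment. Sketch only, not run in this notebook.
from functools import reduce
feature_tables = [num_orders_per_uid, num_product_per_user, user_dow, user_daytime,
                  user_days_since_po, top3_reordered_pids_user,
                  top1_reordered_aiesle_user, top1_reordered_dpmt_user, uid_pid_emb_df]
X_features_alt = reduce(lambda left, right: pd.merge(left, right, how = "inner", on = "user_id"),
                        feature_tables)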
In [63]:
X_features.head()
Out[63]:
In [64]:
# save the feature matrix for all users
with open("../data/processed/all_users_features.pickle", "wb") as handle:
pickle.dump(X_features, handle)
In [65]:
X_train_orders = orders_df[orders_df.eval_set == "train"]
In [66]:
X_train_orders.shape
Out[66]:
In [67]:
X_train_orders.head()
Out[67]:
In [68]:
# combine with the train set order info with features info
X_train = pd.merge(X_train_orders, X_features, how = "inner", on = "user_id")
In [69]:
y_train_df = pd.read_csv("../data/interim/train_labels.csv")
In [70]:
y_train_df.head()
Out[70]:
In [71]:
y_train_df[y_train_df.products == "None"].shape
Out[71]:
In [183]:
# number of unique reordered pid outcomes
pid_ls_str = ' '.join(y_train_df['products'])
num_reordered_pids = len(set(pid_ls_str.split(' ')))
In [184]:
num_reordered_pids
Out[184]:
In [73]:
test_user_ids = orders_df[orders_df.eval_set=="test"]['user_id'].tolist()
In [115]:
def products_concat(series):
    # join product ids into one space-separated string; 'None' if there are none
    out = ''
    for product in series:
        if product > 0:
            out = out + str(int(product)) + ' '
    if out != '':
        return out.rstrip()
    else:
        return 'None'
In [122]:
test_uid_prev_prods = orders_df[(orders_df.user_id.isin(test_user_ids)) & (orders_df.eval_set=='prior')].merge(op_prior_df, on = "order_id")
In [124]:
test_users_prev_orders_prods = test_uid_prev_prods.groupby('user_id')['product_id'].apply(products_concat).reset_index()
In [131]:
test_users_prev_orders_prods.head()
Out[131]:
In [132]:
test_users_prev_orders_prods.to_csv("../data/interim/test_users_pids_bought.csv")
In [128]:
# number of products for users in the test set
test_pid_ls_str = ' '.join(test_users_prev_orders_prods['product_id'])
len(set(test_pid_ls_str.split(' ')))
Out[128]:
In [135]:
products_df.product_id.nunique()
Out[135]:
So it seems there are unseen products in the test set. It is better to represent the label as a 49,688-length indicator vector over the full product catalog (plus 'None').
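Fitting the binarizer on the full product list (plus 'None'), rather than only on products seen in the train labels, keeps the encoding at a fixed width so any product can be scored. A toy sketch of the idea with a hypothetical six-outcome catalog (the real fit follows below):
In [ ]:
# Toy illustration only: fit on the full set of possible outcomes once,
# then every transformed label row has one column per outcome.
toy_outcomes = ['None', '1', '2', '3', '4', '5']   # hypothetical mini catalog
toy_labels = [('2', '5'), ('None',)]               # reordered products for two orders
toy_mlb = MultiLabelBinarizer(sparse_output=True)
toy_mlb.fit([toy_outcomes])
toy_mlb.transform(toy_labels).shape                # (2, 6)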
In [136]:
X_train = pd.merge(X_train, y_train_df, how = "inner", on = "order_id")
In [137]:
X_train.head()
Out[137]:
In [223]:
X_train[X_train.products == "None"].head()
Out[223]:
In [140]:
list(set(products_df.product_id))
Out[140]:
In [173]:
all_outcomes = ['None'] + list(set(products_df.product_id.astype('str')))
In [174]:
len(all_outcomes)
Out[174]:
In [175]:
labels_tuple_ls = [tuple(label.split(' ')) for label in X_train['products']]
In [176]:
labels_tuple_ls[6] #double check
Out[176]:
In [177]:
mlb = MultiLabelBinarizer(sparse_output=True)
In [178]:
mlb.fit([all_outcomes])
Out[178]:
In [179]:
mlb.classes_
Out[179]:
In [180]:
y_train = mlb.transform(labels_tuple_ls)
In [181]:
type(y_train)
Out[181]:
In [182]:
y_train.shape
Out[182]:
In [183]:
y_train.data.nbytes # 3.35 MB
Out[183]:
In [184]:
# save the transformed train labels
with open("../data/processed/y_train.pickle", "wb") as handle:
    pickle.dump(y_train, handle)
In [185]:
X_train.info(memory_usage= "deep")
In [186]:
# preprocess the data
features = X_train.columns
drop_cols = ['order_id', 'eval_set', 'products']
uint8_cols = features[features.str.startswith("dow_")].tolist() +\
features[features.str.startswith("daytime_")].tolist() +\
features[features.str.startswith("order_")].tolist() +\
features[features.str.startswith("avg_")].tolist() +\
["days_since_prior_order", "avg_days_since_prior_order"]
uint8_cols.remove('order_id')
str_cat_cols = features[features.str.startswith("top")].tolist() + \
features[features.str.startswith("most_reordered_")].tolist() +\
['most_daytime', 'most_dow']
round_cols = features[features.str.startswith("emb")].tolist()
In [187]:
def clean_data_type(df, convert_ls, float_round_ls = None, drop_cols_ls = None):
    # drop unused columns, cast columns to lighter dtypes, and round float columns
    df = df.copy()
    df = df.drop(drop_cols_ls, axis = 1)
    for ct in convert_ls:
        convert_type = ct[0]
        convert_cols = ct[1]
        for col in convert_cols:
            df[col] = df[col].astype(convert_type)
    df[float_round_ls] = np.round(df[float_round_ls], 2)
    return df
In [188]:
convert_cols = [('uint8', uint8_cols), ('str', str_cat_cols)]
X_train = clean_data_type(X_train, convert_ls = convert_cols, float_round_ls = round_cols, drop_cols_ls = drop_cols)
In [189]:
X_train.info(memory_usage= "deep")
In [191]:
# save the cleaned and transformed training set - ready for modeling
with open("../data/processed/X_train.pickle", "wb") as handle:
pickle.dump(X_train, handle)
In [192]:
X_train.isnull().sum().sort_values(ascending=False).iloc[0] #no missing values
Out[192]:
In [193]:
test_orders = orders_df[orders_df.eval_set == "test"]
In [194]:
test_orders.shape
Out[194]:
In [195]:
X_test = pd.merge(test_orders, X_features, how = "inner", on = "user_id")
In [196]:
X_test.shape #double check
Out[196]:
In [197]:
drop_cols = ['order_id', 'eval_set']
X_test = clean_data_type(X_test, convert_ls = convert_cols, float_round_ls = round_cols, drop_cols_ls = drop_cols)
In [198]:
X_test.shape
Out[198]:
In [199]:
X_test.info(memory_usage="deep")
In [200]:
# save the cleaned and transformed test set - ready for prediction
with open("../data/processed/X_test.pickle", "wb") as handle:
pickle.dump(X_test, handle)
In [201]:
X_test.isnull().sum().sort_values(ascending=False).iloc[0] #no missing values
Out[201]:
In [ ]: