In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

%run -i readData.py


Shape aisles: (134, 2)
Shape departments: (21, 2)
Shape order_products__prior: (32434489, 4)
Shape order_products__train: (1384617, 4)
Shape orders: (3421083, 7)
Shape products: (49688, 4)

In [ ]:


In [2]:
orderProducts = pd.merge(orders,pd.concat([order_products__prior,order_products__train]),on='order_id',how='left');

#Sample products subjekt to eval_set
orderProductsEstimation = orderProducts[orderProducts['eval_set']!='prior']
orderProductsPrior = orderProducts[orderProducts['eval_set']=='prior']

In [3]:
#Calculate average order size
s = order_products__prior.groupby('order_id').size()
mean_order_size= s.mean()

In [ ]:


In [4]:
productFeatures = (orderProductsPrior.groupby('product_id',as_index=False)
                   .agg({'reordered':['sum','count','mean']})
                   
                   
)
#Collapse diminsions to just one
productFeatures.columns = [' '.join(col).strip() for col in productFeatures.columns.values]

#supply with more descriptive column names
productFeatures.rename(columns={'reordered sum' : 'product_reorders',
                                'reordered count' : 'product_orders',
                                'reordered mean': 'product_reorder_rate'},inplace=True)

In [5]:
userFeatures = (orderProductsPrior.groupby('user_id',as_index=False)
                   .agg({'reordered':['sum','count','mean']}) )
#Collapse diminsions to just one
userFeatures.columns = [' '.join(col).strip() for col in userFeatures.columns.values]

#supply with more descriptive column names
userFeatures.rename(columns={'reordered sum' : 'user_reorders',
                                'reordered count' : 'user_orders',
                                'reordered mean': 'user_reorder_rate'},inplace=True)

In [6]:
estimationData = pd.merge(orderProductsEstimation,productFeatures,
                          on='product_id', how='left')
estimationData = pd.merge(estimationData,userFeatures,
                          on='user_id', how='left')

In [7]:
#The train set is splittet into test and train. The split is performen by user
#Notice that test is not the dataset labelled test in the raw data
users = np.unique(orderProducts.loc[orderProducts['eval_set']=='train',['user_id']])
trainUsers, testUsers = train_test_split(users, test_size=0.2, random_state=30)

In [8]:
testUsers


Out[8]:
array([ 53589, 201898, 200568, ...,   4439, 164135,   8375], dtype=uint32)

In [9]:
#prepare data that is going to be used for training
trainData  = estimationData.loc[estimationData['user_id'].isin(trainUsers)].drop('eval_set',axis=1)

In [ ]:


In [10]:
#testdata is a large dataset combining data for all combination of testuser and products
testData = orderProducts[orderProducts['user_id'].isin(testUsers)]
testData = pd.merge(testData,userFeatures,on='user_id',how='left')
testData = pd.merge(testData,productFeatures,on='product_id',how='left')
#Since we aim to score the next order the "days since prior order" is not up to data and should 
#be replaced with the number from the order marked 'train". Keep in mind the dataset is for scoring

daysSince = orders[orders['eval_set']=='train'][['days_since_prior_order','user_id']]
testData = pd.merge(testData.drop(['days_since_prior_order'],axis=1),daysSince, on='user_id',how='left')

In [ ]:


In [ ]:


In [11]:
#define x variables
xCols = ['order_number',
         'order_dow',
         'order_hour_of_day',
         'days_since_prior_order',
         'product_reorders',
         'product_orders',
         'product_reorder_rate',
         'user_reorders',
         'user_orders',
         'user_reorder_rate',]
#define y variable         
yCol  = ['reordered']

In [ ]:


In [ ]:


In [ ]:


In [12]:
del order_products__prior
del order_products__train
del products,trainUsers
del estimationData
del productFeatures
del userFeatures
del orders

In [13]:
dTrain= xgb.DMatrix(trainData[xCols]label=trainData[yCol])


  File "<ipython-input-13-a1b673e0dc3a>", line 1
    dTrain= xgb.DMatrix(trainData[xCols]label=trainData[yCol])
                                            ^
SyntaxError: invalid syntax

In [ ]:
#Set up parameters for xgboost
xgParam = {
    'booster' : "gbtree", 
    'objective': 'reg:logistic',
    'eval_metric': 'logloss',
    'max_depth':8,
    'silent ':0
    
}
numRound = 40

In [ ]:
bst = xgb.train(xgParam, dTrain, numRound)

In [ ]:
del dTrain, trainData

In [ ]:
dTest = xgb.DMatrix(testData.drop('reordered',axis=1),label=testData['reordered'])

In [ ]:
preds = bst.predict(dTest)

In [ ]:
numTestOrders*mean_order_size*100/preds.shape[0]

In [ ]:
#from the sample mean an average order size of 10 is picked
numTestOrders = np.unique(testData['order_id']).shape[0]
percent = numTestOrders*mean_order_size*100/
threshold = np.percentile(preds,100-percent)

In [ ]:
testData['preds']  = (preds>threshold)

In [ ]:


In [ ]:
testData.head()