notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

%run -i readData.py









    



Shape aisles: (134, 2)
Shape departments: (21, 2)
Shape order_products__prior: (32434489, 4)
Shape order_products__train: (1384617, 4)
Shape orders: (3421083, 7)
Shape products: (49688, 4)



In [ ]:



In [2]:

    
orderProducts = pd.merge(orders,pd.concat([order_products__prior,order_products__train]),on='order_id',how='left');

#Sample products subjekt to eval_set
orderProductsEstimation = orderProducts[orderProducts['eval_set']!='prior']
orderProductsPrior = orderProducts[orderProducts['eval_set']=='prior']



In [3]:

    
#Calculate average order size
s = order_products__prior.groupby('order_id').size()
mean_order_size= s.mean()



In [ ]:



In [4]:

    
productFeatures = (orderProductsPrior.groupby('product_id',as_index=False)
                   .agg({'reordered':['sum','count','mean']})
                   
                   
)
#Collapse diminsions to just one
productFeatures.columns = [' '.join(col).strip() for col in productFeatures.columns.values]

#supply with more descriptive column names
productFeatures.rename(columns={'reordered sum' : 'product_reorders',
                                'reordered count' : 'product_orders',
                                'reordered mean': 'product_reorder_rate'},inplace=True)



In [5]:

    
userFeatures = (orderProductsPrior.groupby('user_id',as_index=False)
                   .agg({'reordered':['sum','count','mean']}) )
#Collapse diminsions to just one
userFeatures.columns = [' '.join(col).strip() for col in userFeatures.columns.values]

#supply with more descriptive column names
userFeatures.rename(columns={'reordered sum' : 'user_reorders',
                                'reordered count' : 'user_orders',
                                'reordered mean': 'user_reorder_rate'},inplace=True)



In [6]:

    
estimationData = pd.merge(orderProductsEstimation,productFeatures,
                          on='product_id', how='left')
estimationData = pd.merge(estimationData,userFeatures,
                          on='user_id', how='left')



In [7]:

    
#The train set is splittet into test and train. The split is performen by user
#Notice that test is not the dataset labelled test in the raw data
users = np.unique(orderProducts.loc[orderProducts['eval_set']=='train',['user_id']])
trainUsers, testUsers = train_test_split(users, test_size=0.2, random_state=30)



In [8]:

    
testUsers









    Out[8]:





array([ 53589, 201898, 200568, ...,   4439, 164135,   8375], dtype=uint32)



In [9]:

    
#prepare data that is going to be used for training
trainData  = estimationData.loc[estimationData['user_id'].isin(trainUsers)].drop('eval_set',axis=1)



In [ ]:



In [10]:

    
#testdata is a large dataset combining data for all combination of testuser and products
testData = orderProducts[orderProducts['user_id'].isin(testUsers)]
testData = pd.merge(testData,userFeatures,on='user_id',how='left')
testData = pd.merge(testData,productFeatures,on='product_id',how='left')
#Since we aim to score the next order the "days since prior order" is not up to data and should 
#be replaced with the number from the order marked 'train". Keep in mind the dataset is for scoring

daysSince = orders[orders['eval_set']=='train'][['days_since_prior_order','user_id']]
testData = pd.merge(testData.drop(['days_since_prior_order'],axis=1),daysSince, on='user_id',how='left')



In [ ]:



In [ ]:



In [11]:

    
#define x variables
xCols = ['order_number',
         'order_dow',
         'order_hour_of_day',
         'days_since_prior_order',
         'product_reorders',
         'product_orders',
         'product_reorder_rate',
         'user_reorders',
         'user_orders',
         'user_reorder_rate',]
#define y variable         
yCol  = ['reordered']



In [ ]:



In [ ]:



In [ ]:



In [12]:

    
del order_products__prior
del order_products__train
del products,trainUsers
del estimationData
del productFeatures
del userFeatures
del orders



In [13]:

    
dTrain= xgb.DMatrix(trainData[xCols]label=trainData[yCol])









    



  File "<ipython-input-13-a1b673e0dc3a>", line 1
    dTrain= xgb.DMatrix(trainData[xCols]label=trainData[yCol])
                                            ^
SyntaxError: invalid syntax



In [ ]:

    
#Set up parameters for xgboost
xgParam = {
    'booster' : "gbtree", 
    'objective': 'reg:logistic',
    'eval_metric': 'logloss',
    'max_depth':8,
    'silent ':0
    
}
numRound = 40



In [ ]:

    
bst = xgb.train(xgParam, dTrain, numRound)



In [ ]:

    
del dTrain, trainData



In [ ]:

    
dTest = xgb.DMatrix(testData.drop('reordered',axis=1),label=testData['reordered'])



In [ ]:

    
preds = bst.predict(dTest)



In [ ]:

    
numTestOrders*mean_order_size*100/preds.shape[0]



In [ ]:

    
#from the sample mean an average order size of 10 is picked
numTestOrders = np.unique(testData['order_id']).shape[0]
percent = numTestOrders*mean_order_size*100/
threshold = np.percentile(preds,100-percent)



In [ ]:

    
testData['preds']  = (preds>threshold)



In [ ]:



In [ ]:

    
testData.head()