In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
%run -i readData.py
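In [ ]:
#Sketch (assumption): readData.py is expected to load the standard Instacart CSVs
#into DataFrames with the names used below; the exact filenames/paths are assumptions.
#orders = pd.read_csv('orders.csv')
#order_products__prior = pd.read_csv('order_products__prior.csv')
#order_products__train = pd.read_csv('order_products__train.csv')
#products = pd.read_csv('products.csv')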
In [2]:
orderProducts = pd.merge(orders,pd.concat([order_products__prior,order_products__train]),on='order_id',how='left');
#Split the products according to eval_set
orderProductsEstimation = orderProducts[orderProducts['eval_set']!='prior']
orderProductsPrior = orderProducts[orderProducts['eval_set']=='prior']
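In [ ]:
#Sketch: row counts per eval_set after the merge, to sanity-check the prior/estimation split.
orderProducts['eval_set'].value_counts()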
In [3]:
#Calculate average order size
s = order_products__prior.groupby('order_id').size()
mean_order_size = s.mean()
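In [ ]:
#Sketch: the distribution behind the mean order size (which comes out at roughly 10 products).
print(mean_order_size)
s.describe()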
In [4]:
productFeatures = (orderProductsPrior.groupby('product_id', as_index=False)
                   .agg({'reordered': ['sum', 'count', 'mean']}))
#Collapse the MultiIndex columns to a single level
productFeatures.columns = [' '.join(col).strip() for col in productFeatures.columns.values]
#Rename to more descriptive column names
productFeatures.rename(columns={'reordered sum': 'product_reorders',
                                'reordered count': 'product_orders',
                                'reordered mean': 'product_reorder_rate'}, inplace=True)
In [5]:
userFeatures = (orderProductsPrior.groupby('user_id', as_index=False)
                .agg({'reordered': ['sum', 'count', 'mean']}))
#Collapse the MultiIndex columns to a single level
userFeatures.columns = [' '.join(col).strip() for col in userFeatures.columns.values]
#Rename to more descriptive column names
userFeatures.rename(columns={'reordered sum': 'user_reorders',
                             'reordered count': 'user_orders',
                             'reordered mean': 'user_reorder_rate'}, inplace=True)
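In [ ]:
#Sketch: peek at the engineered per-product and per-user reorder features.
print(productFeatures.head())
print(userFeatures.head())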
In [6]:
estimationData = pd.merge(orderProductsEstimation, productFeatures,
                          on='product_id', how='left')
estimationData = pd.merge(estimationData, userFeatures,
                          on='user_id', how='left')
In [7]:
#The train set is split into train and test. The split is performed by user.
#Note that this test set is not the dataset labelled 'test' in the raw data.
users = np.unique(orderProducts.loc[orderProducts['eval_set']=='train',['user_id']])
trainUsers, testUsers = train_test_split(users, test_size=0.2, random_state=30)
In [8]:
testUsers
In [9]:
#prepare data that is going to be used for training
trainData = estimationData.loc[estimationData['user_id'].isin(trainUsers)].drop('eval_set',axis=1)
In [10]:
#testData is a large dataset combining the test users' orders with the user and product features
testData = orderProducts[orderProducts['user_id'].isin(testUsers)]
testData = pd.merge(testData,userFeatures,on='user_id',how='left')
testData = pd.merge(testData,productFeatures,on='product_id',how='left')
#Since we aim to score the next order, 'days_since_prior_order' is not up to date and should
#be replaced with the value from the order marked 'train'. Keep in mind this dataset is for scoring.
daysSince = orders[orders['eval_set']=='train'][['days_since_prior_order','user_id']]
testData = pd.merge(testData.drop(['days_since_prior_order'],axis=1),daysSince, on='user_id',how='left')
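In [ ]:
#Sketch: each test user has exactly one order marked 'train', so daysSince should
#hold one row per user_id and the merge above should not duplicate rows.
print(daysSince['user_id'].is_unique)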
In [11]:
#define x variables
xCols = ['order_number',
         'order_dow',
         'order_hour_of_day',
         'days_since_prior_order',
         'product_reorders',
         'product_orders',
         'product_reorder_rate',
         'user_reorders',
         'user_orders',
         'user_reorder_rate']
#define y variable
yCol = ['reordered']
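In [ ]:
#Sketch: verify that every model column is present in the prepared frames
#(both prints should show an empty set).
print(set(xCols) - set(trainData.columns))
print(set(xCols) - set(testData.columns))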
In [12]:
del order_products__prior
del order_products__train
del products,trainUsers
del estimationData
del productFeatures
del userFeatures
del orders
In [13]:
dTrain = xgb.DMatrix(trainData[xCols], label=trainData[yCol])
In [ ]:
#Set up parameters for xgboost
xgParam = {
    'booster': 'gbtree',
    'objective': 'reg:logistic',
    'eval_metric': 'logloss',
    'max_depth': 8,
    'silent': 0
}
numRound = 40
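In [ ]:
#Sketch (assumption): cross-validation could be used to check whether 40 boosting
#rounds is a reasonable choice before the full training run below.
cvResult = xgb.cv(xgParam, dTrain, num_boost_round=numRound, nfold=3,
                  metrics='logloss', seed=30)
cvResult.tail()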
In [ ]:
bst = xgb.train(xgParam, dTrain, numRound)
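In [ ]:
#Sketch: inspect which features the booster relies on most (gain-based importance).
bst.get_score(importance_type='gain')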
In [ ]:
del dTrain, trainData
In [ ]:
dTest = xgb.DMatrix(testData[xCols], label=testData['reordered'])
In [ ]:
preds = bst.predict(dTest)
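In [ ]:
#Sketch (assumption): score the raw probabilities against the held-out labels,
#assuming 'reordered' has no missing values for the test users.
from sklearn.metrics import log_loss
log_loss(testData['reordered'], preds)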
In [ ]:
#From the prior orders the mean order size (about 10 products) is used to decide
#how many products to flag: the expected number of reordered items is
#numTestOrders*mean_order_size, expressed as a percentage of all scored rows.
numTestOrders = np.unique(testData['order_id']).shape[0]
percent = numTestOrders*mean_order_size*100/preds.shape[0]
threshold = np.percentile(preds, 100-percent)
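In [ ]:
#Sketch: print the pieces behind the cutoff; e.g. ~1,000 test orders at ~10 products
#each over ~500,000 scored rows would give percent ≈ 2, i.e. keep the top 2% of
#predictions (the numbers here are purely illustrative).
print(numTestOrders, mean_order_size, percent, threshold)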
In [ ]:
testData['preds'] = (preds>threshold)
In [ ]:
testData.head()
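In [ ]:
#Possible next step (assumption, not part of the original notebook): collect the
#flagged products per test order, e.g. for a submission-style frame.
predicted = (testData.loc[testData['preds'], ['order_id', 'product_id']]
             .groupby('order_id')['product_id']
             .apply(list))
predicted.head()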