In [0]:
#@title Copyright 2019 The Lifetime Value Authors.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import pandas as pd
import tqdm
import multiprocessing
In [0]:
pd.options.mode.chained_assignment = None # default='warn'
In [0]:
COMPANYS = [
'10000', '101200010', '101410010', '101600010', '102100020', '102700020',
'102840020', '103000030', '103338333', '103400030', '103600030',
'103700030', '103800030', '104300040', '104400040', '104470040',
'104900040', '105100050', '105150050', '107800070'
]
Set up the Kaggle API by following https://www.kaggle.com/docs/api so that the download cell below can authenticate.
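The Kaggle CLI looks for an API token at ~/.kaggle/kaggle.json. The sketch below is a minimal, optional setup step (not part of the original pipeline) and assumes you have already created an API token on your Kaggle account page and placed kaggle.json in the current working directory.
In [0]:
# Minimal credential setup sketch: copy a downloaded kaggle.json token into
# ~/.kaggle and restrict its permissions, as the Kaggle CLI expects.
# Assumes kaggle.json is in the current working directory.
import os
import shutil

kaggle_dir = os.path.expanduser('~/.kaggle')
os.makedirs(kaggle_dir, exist_ok=True)
shutil.copy('kaggle.json', os.path.join(kaggle_dir, 'kaggle.json'))
os.chmod(os.path.join(kaggle_dir, 'kaggle.json'), 0o600)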
In [0]:
%%shell
if [ -e /tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions.csv ]
then
  echo "File already exists, no need to download."
else
  rm -rf /tmp/lifetime-value/acquire-valued-shoppers-challenge
  mkdir -p /tmp/lifetime-value/acquire-valued-shoppers-challenge
  cd /tmp/lifetime-value/acquire-valued-shoppers-challenge
  kaggle competitions download -c acquire-valued-shoppers-challenge
  unzip acquire-valued-shoppers-challenge.zip
  gunzip transactions.csv.gz
fi
In [0]:
def load_data(company):
  all_data_filename = '/tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions.csv'
  one_company_data_filename = (
      '/tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions_company_{}.csv'
      .format(company))
  if os.path.isfile(one_company_data_filename):
    df = pd.read_csv(one_company_data_filename)
  else:
    data_list = []
    chunksize = 10**6
    # 350 iterations
    for chunk in tqdm.tqdm(
        pd.read_csv(all_data_filename, chunksize=chunksize)):
      data_list.append(chunk.query("company=='{}'".format(company)))
    df = pd.concat(data_list, axis=0)
    df.to_csv(one_company_data_filename, index=None)
  return df
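As a quick sanity check (optional, not part of the original pipeline), the loader can be run for a single company before kicking off the full multiprocessing job; the first call scans the full transactions file in chunks, while later calls reuse the cached per-company CSV.
In [0]:
# Hypothetical spot check: load the transactions for the first company id
# and inspect the raw columns before any preprocessing.
sample_transactions = load_data(COMPANYS[0])
print(sample_transactions.shape)
sample_transactions.head()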
In [0]:
def preprocess(df):
  df = df.query('purchaseamount>0')
  df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
  df['start_date'] = df.groupby('id')['date'].transform('min')
  # Compute calibration values
  calibration_value = (
      df.query('date==start_date').groupby('id')
      ['purchaseamount'].sum().reset_index())
  calibration_value.columns = ['id', 'calibration_value']
  # Compute holdout values
  one_year_holdout_window_mask = (
      (df['date'] > df['start_date']) &
      (df['date'] <= df['start_date'] + np.timedelta64(365, 'D')))
  holdout_value = (
      df[one_year_holdout_window_mask].groupby('id')
      ['purchaseamount'].sum().reset_index())
  holdout_value.columns = ['id', 'holdout_value']
  # Compute calibration attributes
  calibration_attributes = (
      df.query('date==start_date').sort_values(
          'purchaseamount', ascending=False).groupby('id')[[
              'chain', 'dept', 'category', 'brand', 'productmeasure'
          ]].first().reset_index())
  # Merge dataframes
  customer_level_data = (
      calibration_value.merge(calibration_attributes, how='left', on='id')
      .merge(holdout_value, how='left', on='id'))
  customer_level_data['holdout_value'] = (
      customer_level_data['holdout_value'].fillna(0.))
  categorical_features = [
      'chain', 'dept', 'category', 'brand', 'productmeasure'
  ]
  customer_level_data[categorical_features] = (
      customer_level_data[categorical_features].fillna('UNKNOWN'))
  # Specify data types
  customer_level_data['log_calibration_value'] = (
      np.log(customer_level_data['calibration_value']).astype('float32'))
  customer_level_data['chain'] = (
      customer_level_data['chain'].astype('category'))
  customer_level_data['dept'] = customer_level_data['dept'].astype('category')
  customer_level_data['brand'] = (
      customer_level_data['brand'].astype('category'))
  customer_level_data['category'] = (
      customer_level_data['category'].astype('category'))
  customer_level_data['label'] = (
      customer_level_data['holdout_value'].astype('float32'))
  return customer_level_data
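To make the calibration/holdout split concrete, here is a toy example (illustrative only, not part of the pipeline): purchases on a customer's first observed day form the calibration value, and purchases in the following 365 days form the holdout value that becomes the label.
In [0]:
# Toy customer with four transactions. The two purchases on the first day
# (2012-03-02) sum to a calibration value of 15; the 2012-06-01 purchase
# falls inside the one-year holdout window (holdout value 20); the
# 2013-04-01 purchase falls outside the window and is ignored.
toy = pd.DataFrame({
    'id': [1, 1, 1, 1],
    'date': ['2012-03-02', '2012-03-02', '2012-06-01', '2013-04-01'],
    'purchaseamount': [10., 5., 20., 7.],
    'chain': [1, 1, 1, 1],
    'dept': [2, 2, 2, 2],
    'category': [3, 3, 3, 3],
    'brand': [4, 4, 4, 4],
    'productmeasure': ['OZ', 'OZ', 'OZ', 'OZ'],
})
preprocess(toy)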
In [0]:
def process(company):
  print("Process company {}".format(company))
  transaction_level_data = load_data(company)
  customer_level_data = preprocess(transaction_level_data)
  customer_level_data_file = (
      "/tmp/lifetime-value/acquire-valued-shoppers-challenge/customer_level_data_company_{}.csv"
      .format(company))
  customer_level_data.to_csv(customer_level_data_file, index=None)
  print("Done company {}".format(company))
This step may take a while to finish -- 10 minutes to 1 hour, depending on the number of cores in the machine.
In [0]:
p = multiprocessing.Pool(multiprocessing.cpu_count())
_ = p.map(process, COMPANYS)
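After the pool finishes, a quick readback (an optional addition, not in the original notebook) confirms that the per-company customer-level files were written.
In [0]:
# Optional sanity check: read back one per-company output and peek at it.
check_file = (
    '/tmp/lifetime-value/acquire-valued-shoppers-challenge/'
    'customer_level_data_company_{}.csv'.format(COMPANYS[0]))
pd.read_csv(check_file).head()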