In [0]:
#@title Copyright 2019 The Lifetime Value Authors.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function


import os
import numpy as np
import pandas as pd
import tqdm
import multiprocessing

In [0]:
pd.options.mode.chained_assignment = None  # default='warn'

Global variables


In [0]:
COMPANYS = [
    '10000', '101200010', '101410010', '101600010', '102100020', '102700020',
    '102840020', '103000030', '103338333', '103400030', '103600030',
    '103700030', '103800030', '104300040', '104400040', '104470040',
    '104900040', '105100050', '105150050', '107800070'
]

Data

Download data

Setup kaggle API correctly following https://www.kaggle.com/docs/api


In [0]:
%%shell
if [ -e /tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions.csv ]
then
  echo "File already exists, no need to download."
else
  rm -rf /tmp/lifetime-value/acquire-valued-shoppers-challenge
  mkdir /tmp/lifetime-value/acquire-valued-shoppers-challenge
  cd /tmp/lifetime-value/acquire-valued-shoppers-challenge
  kaggle competitions download -c acquire-valued-shoppers-challenge
  unzip acquire-valued-shoppers-challenge.zip
  gunzip transactions.csv.gz
fi

Load csv


In [0]:
def load_data(company):
  all_data_filename = '/tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions.csv'
  one_company_data_filename = (
      '/tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions_company_{}.csv'
      .format(company))
  if os.path.isfile(one_company_data_filename):
    df = pd.read_csv(one_company_data_filename)
  else:
    data_list = []
    chunksize = 10**6
    # 350 iterations
    for chunk in tqdm.tqdm(pd.read_csv(all_data_filename, chunksize=chunksize)):
      data_list.append(chunk.query("company=='{}'".format(company)))
    df = pd.concat(data_list, axis=0)
    df.to_csv(one_company_data_filename, index=None)
  return df

Preprocess data


In [0]:
def preprocess(df):
  df = df.query('purchaseamount>0')
  df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
  df['start_date'] = df.groupby('id')['date'].transform('min')

  # Compute calibration values
  calibration_value = (
      df.query('date==start_date').groupby('id')
      ['purchaseamount'].sum().reset_index())
  calibration_value.columns = ['id', 'calibration_value']

  # Compute holdout values
  one_year_holdout_window_mask = (
      (df['date'] > df['start_date']) &
      (df['date'] <= df['start_date'] + np.timedelta64(365, 'D')))
  holdout_value = (
      df[one_year_holdout_window_mask].groupby('id')
      ['purchaseamount'].sum().reset_index())
  holdout_value.columns = ['id', 'holdout_value']

  # Compute calibration attributes
  calibration_attributes = (
      df.query('date==start_date').sort_values(
          'purchaseamount', ascending=False).groupby('id')[[
              'chain', 'dept', 'category', 'brand', 'productmeasure'
          ]].first().reset_index())

  # Merge dataframes
  customer_level_data = (
      calibration_value.merge(calibration_attributes, how='left',
                              on='id').merge(
                                  holdout_value, how='left', on='id'))
  customer_level_data['holdout_value'] = (
      customer_level_data['holdout_value'].fillna(0.))
  categorical_features = ([
      'chain', 'dept', 'category', 'brand', 'productmeasure'
  ])
  customer_level_data[categorical_features] = (
      customer_level_data[categorical_features].fillna('UNKNOWN'))

  # Specify data types
  customer_level_data['log_calibration_value'] = (
      np.log(customer_level_data['calibration_value']).astype('float32'))
  customer_level_data['chain'] = (
      customer_level_data['chain'].astype('category'))
  customer_level_data['dept'] = (customer_level_data['dept'].astype('category'))
  customer_level_data['brand'] = (
      customer_level_data['brand'].astype('category'))
  customer_level_data['category'] = (
      customer_level_data['category'].astype('category'))
  customer_level_data['label'] = (
      customer_level_data['holdout_value'].astype('float32'))
  return customer_level_data

In [0]:
def process(company):
  print("Process company {}".format(company))
  transaction_level_data = load_data(company)
  customer_level_data = preprocess(transaction_level_data)
  customer_level_data_file = (
      "/tmp/lifetime-value/acquire-valued-shoppers-challenge/customer_level_data_company_{}.csv"
      .format(company))
  customer_level_data.to_csv(customer_level_data_file, index=None)
  print("Done company {}".format(company))

This step may take a while to finish -- 10min-1hr depending on number of core in the computer.


In [0]:
p = multiprocessing.Pool(multiprocessing.cpu_count())
_ = p.map(process, COMPANYS)