In [0]:
#@title Copyright 2019 The Lifetime Value Authors.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
In [0]:
import os
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
from sklearn import model_selection
from sklearn import preprocessing
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
import tensorflow_probability as tfp
import tqdm
from typing import Sequence
# install and import ltv
!pip install -q git+https://github.com/google/lifetime_value
import lifetime_value as ltv
In [0]:
tfd = tfp.distributions
# Eager execution is the default in TF 2.x; enable it explicitly only on TF 1.x.
if tf.__version__.startswith('1.'):
  tf.compat.v1.enable_eager_execution()
%config InlineBackend.figure_format='retina'
sns.set_style('whitegrid')
pd.options.mode.chained_assignment = None # default='warn'
In [0]:
COMPANY = '103600030' # @param { isTemplate: true, type: 'string'}
LOSS = 'ziln' # @param { isTemplate: true, type: 'string'} ['mse', 'ziln']
MODEL = 'dnn' # @param { isTemplate: true, type: 'string'} ['linear', 'dnn']
LEARNING_RATE = 0.0002 # @param { isTemplate: true}
EPOCHS = 400 # @param { isTemplate: true, type: 'integer'}
OUTPUT_CSV_FOLDER = '/tmp/lifetime-value/kaggle_acquire_valued_shoppers_challenge/result' # @param { isTemplate: true, type: 'string'}
In [0]:
CATEGORICAL_FEATURES = ['chain', 'dept', 'category', 'brand', 'productmeasure']
NUMERIC_FEATURES = ['log_calibration_value']
ALL_FEATURES = CATEGORICAL_FEATURES + NUMERIC_FEATURES
Set up the Kaggle API by following https://www.kaggle.com/docs/api; a minimal credential-placement sketch follows.
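In [0]:
# A minimal sketch (not part of the original notebook): the Kaggle CLI reads
# credentials from ~/.kaggle/kaggle.json. Assuming you have uploaded your
# kaggle.json API token to the working directory, move it into place:
import os
import shutil
kaggle_dir = os.path.expanduser('~/.kaggle')
os.makedirs(kaggle_dir, exist_ok=True)
shutil.copy('kaggle.json', os.path.join(kaggle_dir, 'kaggle.json'))
os.chmod(os.path.join(kaggle_dir, 'kaggle.json'), 0o600)  # required by the CLI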
In [0]:
%%shell
if [ -e /tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions.csv ]
then
  echo "File already exists, no need to download."
else
  rm -rf /tmp/lifetime-value/acquire-valued-shoppers-challenge
  mkdir -p /tmp/lifetime-value/acquire-valued-shoppers-challenge
  cd /tmp/lifetime-value/acquire-valued-shoppers-challenge
  kaggle competitions download -c acquire-valued-shoppers-challenge
  unzip acquire-valued-shoppers-challenge.zip
  gunzip transactions.csv.gz
fi
In [0]:
def load_transaction_data(company):
  all_data_filename = '/tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions.csv'
  one_company_data_filename = (
      '/tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions_company_{}.csv'
      .format(company))
  if os.path.isfile(one_company_data_filename):
    df = pd.read_csv(one_company_data_filename)
  else:
    data_list = []
    chunksize = 10**6
    # ~350 chunks of 1e6 rows each for the full transaction log.
    for chunk in tqdm.tqdm(pd.read_csv(all_data_filename, chunksize=chunksize)):
      # The company column is numeric, so compare without quotes.
      data_list.append(chunk.query('company=={}'.format(company)))
    df = pd.concat(data_list, axis=0)
    df.to_csv(one_company_data_filename, index=False)
  return df
In [0]:
def preprocess(df):
df = df.query('purchaseamount>0')
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df['start_date'] = df.groupby('id')['date'].transform('min')
# Compute calibration values
calibration_value = (
df.query('date==start_date').groupby('id')
['purchaseamount'].sum().reset_index())
calibration_value.columns = ['id', 'calibration_value']
# Compute holdout values
one_year_holdout_window_mask = (
(df['date'] > df['start_date']) &
(df['date'] <= df['start_date'] + np.timedelta64(365, 'D')))
holdout_value = (
df[one_year_holdout_window_mask].groupby('id')
['purchaseamount'].sum().reset_index())
holdout_value.columns = ['id', 'holdout_value']
# Compute calibration attributes
calibration_attributes = (
df.query('date==start_date').sort_values(
'purchaseamount', ascending=False).groupby('id')[[
'chain', 'dept', 'category', 'brand', 'productmeasure'
]].first().reset_index())
# Merge dataframes
customer_level_data = (
calibration_value.merge(calibration_attributes, how='left',
on='id').merge(
holdout_value, how='left', on='id'))
customer_level_data['holdout_value'] = (
customer_level_data['holdout_value'].fillna(0.))
customer_level_data[CATEGORICAL_FEATURES] = (
customer_level_data[CATEGORICAL_FEATURES].fillna('UNKNOWN'))
# Specify data types
customer_level_data['log_calibration_value'] = (
np.log(customer_level_data['calibration_value']).astype('float32'))
  # Cast all categorical features (including productmeasure) to category dtype.
  for cat_col in CATEGORICAL_FEATURES:
    customer_level_data[cat_col] = (
        customer_level_data[cat_col].astype('category'))
customer_level_data['label'] = (
customer_level_data['holdout_value'].astype('float32'))
return customer_level_data
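A tiny toy check of the labeling logic (illustrative only, not part of the original notebook): first-day spend becomes the calibration value, and spend in the following year becomes the label.
In [0]:
# Hypothetical two-row transaction log for a single customer.
toy = pd.DataFrame({
    'id': [1, 1],
    'date': ['2013-03-01', '2013-06-01'],
    'purchaseamount': [10.0, 5.0],
    'chain': ['a', 'a'], 'dept': ['d', 'd'], 'category': ['c', 'c'],
    'brand': ['b', 'b'], 'productmeasure': ['oz', 'oz'],
})
# Expect calibration_value == 10.0 and label (one-year holdout value) == 5.0.
preprocess(toy)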
In [0]:
def load_customer_level_csv(company):
customer_level_data_file = (
'/tmp/lifetime-value/acquire-valued-shoppers-challenge/customer_level_data_company_{}.csv'
.format(company))
if os.path.isfile(customer_level_data_file):
customer_level_data = pd.read_csv(customer_level_data_file)
else:
customer_level_data = preprocess(load_transaction_data(company))
for cat_col in CATEGORICAL_FEATURES:
customer_level_data[cat_col] = (
customer_level_data[cat_col].astype('category'))
for num_col in [
'log_calibration_value', 'calibration_value', 'holdout_value'
]:
customer_level_data[num_col] = (
customer_level_data[num_col].astype('float32'))
return customer_level_data
In [0]:
customer_level_data = load_customer_level_csv(COMPANY)
The holdout value is a mixture of a point mass at zero and a roughly lognormal distribution over positive spend, as the log1p histogram below shows; the check after it quantifies both parts.
In [0]:
customer_level_data.label.apply(np.log1p).hist(bins=50)
Out[0]:
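In [0]:
# Quantify the mixture directly (a quick check, not from the original
# notebook): the fraction of zeros, plus the skewness of log(label) on the
# positive part, which should be near 0 if the lognormal assumption holds.
p_zero = (customer_level_data['label'] == 0).mean()
positive_labels = customer_level_data.query('label > 0')['label']
print('zero-LTV fraction: {:.2%}'.format(p_zero))
print('skewness of log(positive labels): {:.2f}'.format(
    stats.skew(np.log(positive_labels))))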
In [0]:
def linear_split(df):
# get_dummies preserves numeric features.
x = pd.get_dummies(df[ALL_FEATURES], drop_first=True).astype('float32').values
y = df['label'].values
y0 = df['calibration_value'].values
x_train, x_eval, y_train, y_eval, y0_train, y0_eval = (
model_selection.train_test_split(
x, y, y0, test_size=0.2, random_state=123))
return x_train, x_eval, y_train, y_eval, y0_eval
In [0]:
def dnn_split(df):
  # Note: LabelEncoder mutates df in place, mapping each category to an int id.
  for key in CATEGORICAL_FEATURES:
    encoder = preprocessing.LabelEncoder()
    df[key] = encoder.fit_transform(df[key])
y0 = df['calibration_value'].values
df_train, df_eval, y0_train, y0_eval = model_selection.train_test_split(
df, y0, test_size=0.2, random_state=123)
def feature_dict(df):
features = {k: v.values for k, v in dict(df[CATEGORICAL_FEATURES]).items()}
features['numeric'] = df[NUMERIC_FEATURES].values
return features
x_train, y_train = feature_dict(df_train), df_train['label'].values
x_eval, y_eval = feature_dict(df_eval), df_eval['label'].values
return x_train, x_eval, y_train, y_eval, y0_eval
In [0]:
def linear_model(output_units):
return tf.keras.experimental.LinearModel(output_units)
In [0]:
def embedding_dim(x):
  # Rule of thumb: embedding size grows with the fourth root of vocabulary size.
  return int(x**.25) + 1
def embedding_layer(vocab_size):
return tf.keras.Sequential([
tf.keras.layers.Embedding(
input_dim=vocab_size,
output_dim=embedding_dim(vocab_size),
input_length=1),
tf.keras.layers.Flatten(),
])
def dnn_model(output_units, df):
numeric_input = tf.keras.layers.Input(
shape=(len(NUMERIC_FEATURES),), name='numeric')
embedding_inputs = [
tf.keras.layers.Input(shape=(1,), name=key, dtype=np.int64)
for key in CATEGORICAL_FEATURES
]
  embedding_outputs = [
      embedding_layer(vocab_size=df[key].nunique())(embedding_input)
      for key, embedding_input in zip(CATEGORICAL_FEATURES, embedding_inputs)
  ]
deep_input = tf.keras.layers.concatenate([numeric_input] + embedding_outputs)
deep_model = tf.keras.Sequential([
tf.keras.layers.Dense(64, activation='relu'),
tf.keras.layers.Dense(32, activation='relu'),
tf.keras.layers.Dense(output_units),
])
return tf.keras.Model(
inputs=[numeric_input] + embedding_inputs, outputs=deep_model(deep_input))
In [0]:
if LOSS == 'mse':
  loss = keras.losses.MeanSquaredError()
  output_units = 1
elif LOSS == 'ziln':
  loss = ltv.zero_inflated_lognormal_loss
  output_units = 3  # P(positive) logit, lognormal loc, lognormal scale
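With the ZILN loss the model emits three numbers per customer; a hedged sketch of how such logits are typically decoded into a point prediction follows (see `ltv.zero_inflated_lognormal_pred` for the authoritative version).
In [0]:
# Sketch only; the lifetime_value library is the source of truth. The three
# outputs are read as a logit for P(LTV > 0), plus the loc and scale of a
# lognormal over the positive spend.
def ziln_pred_sketch(logits):
  positive_prob = tf.sigmoid(logits[..., :1])
  loc = logits[..., 1:2]
  scale = tf.math.softplus(logits[..., 2:3])
  # E[LTV] = P(LTV > 0) * lognormal mean, exp(loc + scale^2 / 2).
  return positive_prob * tf.exp(loc + 0.5 * tf.square(scale))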
In [0]:
if MODEL == 'linear':
  x_train, x_eval, y_train, y_eval, y0_eval = linear_split(customer_level_data)
  model = linear_model(output_units)
elif MODEL == 'dnn':
  x_train, x_eval, y_train, y_eval, y0_eval = dnn_split(customer_level_data)
  model = dnn_model(output_units, customer_level_data)
In [0]:
model.compile(
    loss=loss, optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE))
In [0]:
callbacks = [
    # Shrink the learning rate when validation loss plateaus (floor at 1e-6).
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', min_lr=1e-6),
    # Stop once validation loss fails to improve for 10 consecutive epochs.
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10),
]
In [0]:
history = model.fit(
x=x_train,
y=y_train,
batch_size=1024,
epochs=EPOCHS,
verbose=2,
callbacks=callbacks,
validation_data=(x_eval, y_eval)).history
In [0]:
# Skip the first two epochs so the y-axis is not dominated by the initial loss.
pd.DataFrame(history)[['loss', 'val_loss']][2:].plot()
Out[0]:
In [0]:
if LOSS == 'mse':
  y_pred = model.predict(x=x_eval, batch_size=1024).flatten()
elif LOSS == 'ziln':
  logits = model.predict(x=x_eval, batch_size=1024)
  y_pred = ltv.zero_inflated_lognormal_pred(logits).numpy().flatten()
In [0]:
df_pred = pd.DataFrame({
'y_true': y_eval,
'y_pred': y_pred,
})
df_pred.head(10)
Out[0]:
In [0]:
gain = pd.DataFrame({
'lorenz': ltv.cumulative_true(y_eval, y_eval),
'baseline': ltv.cumulative_true(y_eval, y0_eval),
'model': ltv.cumulative_true(y_eval, y_pred),
})
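`ltv.cumulative_true` ranks customers by its second argument and accumulates the share of total true LTV along that ranking; a rough sketch of the idea (the library implementation is authoritative) is below.
In [0]:
# Rough sketch of the gain-curve computation, for intuition only.
def cumulative_true_sketch(y_true, y_rank):
  order = np.argsort(-np.asarray(y_rank))  # best-ranked customers first
  return np.cumsum(np.asarray(y_true)[order]) / np.sum(y_true)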
In [0]:
num_customers = np.float32(gain.shape[0])
gain['cumulative_customer'] = (np.arange(num_customers) + 1.) / num_customers
In [0]:
ax = gain[[
'cumulative_customer',
'lorenz',
'baseline',
'model',
]].plot(
x='cumulative_customer', figsize=(8, 5), legend=True)
ax.legend(['Groundtruth', 'Baseline', 'Model'], loc='upper left')
ax.set_xlabel('Cumulative Fraction of Customers')
ax.set_xticks(np.arange(0, 1.1, 0.1))
ax.set_xlim((0, 1.))
ax.set_ylabel('Cumulative Fraction of Total Lifetime Value')
ax.set_yticks(np.arange(0, 1.1, 0.1))
ax.set_ylim((0, 1.05))
ax.set_title('Gain Chart')
Out[0]:
In [0]:
gini = ltv.gini_from_gain(gain[['lorenz', 'baseline', 'model']])
gini
Out[0]:
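Here `gini['normalized']` rescales each curve's raw Gini coefficient by that of the ground-truth Lorenz curve, so the label itself scores 1 and the model's value reads as a fraction of the achievable Gini.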
In [0]:
df_decile = ltv.decile_stats(y_eval, y_pred)
df_decile
Out[0]:
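`ltv.decile_stats` buckets customers into prediction deciles and compares average predicted vs. actual value per bucket; a rough equivalent for intuition (column names here are illustrative):
In [0]:
# Rough sketch of decile-level calibration (the library is authoritative).
decile = pd.qcut(df_pred['y_pred'], 10, labels=False, duplicates='drop')
decile_sketch = df_pred.groupby(decile).agg(
    label_mean=('y_true', 'mean'), pred_mean=('y_pred', 'mean'))
decile_sketch['mape'] = (
    (decile_sketch['pred_mean'] - decile_sketch['label_mean']).abs()
    / decile_sketch['label_mean'])
decile_sketch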
In [0]:
ax = df_decile[['label_mean', 'pred_mean']].plot.bar(rot=0)
ax.set_title('Decile Chart')
ax.set_xlabel('Prediction bucket')
ax.set_ylabel('Average bucket value')
ax.legend(['Label', 'Prediction'], loc='upper left')
Out[0]:
In [0]:
def spearmanr(x1: Sequence[float], x2: Sequence[float]) -> float:
"""Calculates spearmanr rank correlation coefficient.
See https://docs.scipy.org/doc/scipy/reference/stats.html.
Args:
x1: 1D array_like.
x2: 1D array_like.
Returns:
correlation: float.
"""
return stats.spearmanr(x1, x2, nan_policy='raise')[0]
spearman_corr = spearmanr(y_eval, y_pred)
spearman_corr
Out[0]:
In [0]:
df_metrics = pd.DataFrame(
{
'company': COMPANY,
'model': MODEL,
'loss': LOSS,
'label_mean': y_eval.mean(),
'pred_mean': y_pred.mean(),
'label_positive': np.mean(y_eval > 0),
'decile_mape': df_decile['decile_mape'].mean(),
'baseline_gini': gini['normalized'][1],
'gini': gini['normalized'][2],
'spearman_corr': spearman_corr,
},
index=[0])
In [0]:
df_metrics[[
'company',
'model',
'loss',
'label_mean',
'pred_mean',
'label_positive',
'decile_mape',
'baseline_gini',
'gini',
'spearman_corr',
]]
Out[0]:
In [0]:
output_path = os.path.join(OUTPUT_CSV_FOLDER, COMPANY)
In [0]:
os.makedirs(output_path, exist_ok=True)
In [0]:
output_file = os.path.join(output_path,
'{}_regression_{}.csv'.format(MODEL, LOSS))
In [0]:
df_metrics.to_csv(output_file, index=False)