Welcome!


In [ ]:
# we start by importing modules
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from vf_portalytics.model import PredictionModel
from vf_portalytics.dataset import DataSet
from vf_portalytics.tool import create_train_test_sets, score_model, describe_columns
from sklearn import linear_model, ensemble, svm, tree
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, RFE

from IPython.display import HTML, display
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bkcharts import Histogram, Bar, Line, Scatter
from bkcharts.attributes import cat
output_notebook()

In [ ]:
# load the data and derive a few basic columns
# dataset = DataSet('ulnl_prod_20170621.msgpack', path='/home/carst/')
# df = dataset.data_df
df = pd.read_msgpack('/home/carst/ulnl_prod_20170627_total.msgpack')
df['product_prodh_long2'] = df['product_prodh1'] + df['product_prodh2']
df['second_placement_yn'] = df['second_placement_yn'].astype(np.bool)
df['product_volume_per_cu'] = df['baseline_vol'] / df['baseline_units']
mask = (df['second_placement_perc'] < 40.0)
df.loc[mask, 'second_placement_yn'] = False

In [ ]:
# describe available columns
generic, field, tag, promotion_dimension, product_dimension, media, media_attr = describe_columns(df)

In [ ]:
# remove version fields
remove_list = [x for x in df.columns if x[-1:] in ['2', '3', '5'] 
               and not x.startswith('promotion_dimension')
               and not x.startswith('product_')
               and not x.startswith('media')
               and not x.startswith('field')
               and not x.startswith('week_agg')
               and not x.startswith('total_units')
              ] + [x for x in df.columns if x.startswith('fwb') or x.startswith('total_vol') or x == 'internal']
df = df[[x for x in df.columns if x not in remove_list]]

In [ ]:
# remove unused dimensions
keep_dimension_list = [7, 9, 10, 13, 14, 16, 17, 18, 19, 20, 21, 22, 24, 74, 115, 122, 134, 136, 137, 138, 144]
keep_list = [x for x in df.columns if 
            (not x.startswith('promotion_dimension') and not x.startswith('product_dimension'))
              or int(x.split('_')[-1]) in keep_dimension_list]
remove_list = [x for x in df.columns if x not in keep_list]
# print(remove_list)
df = df[keep_list]

In [ ]:
# remove unused tags
keep_tags_list = []
remove_list = [x for x in df.columns if x.startswith('tag_') and x not in keep_tags_list]
df = df[[x for x in df.columns if x not in remove_list]]

In [ ]:
# remove unused product attributes
keep_list = ['product_1_mco', 'product_2_div', 'product_2b_div', 
             'product_3_cat', 'product_5_bet_nl', 
             'product_6_bc', 'product_7_cc', 'product_7a_ebf_code', 
             'product_8_csc', 'product_8a_spf_code', 
             'product_9a_spfv_code', 
             'product_bonus', 
             'product_brandcategorykey', 'product_brandformkey', 'product_brandkey', 'product_brandmarketkey', 
             'product_brandsubsectorkey', 'product_brandtotalkey',
             'product_foodsolutions', 'product_ho_brand_caratbrandkey', 
             'product_level_01', 'product_level_02', 
             'product_ntgew', 
             'product_prodh', 'product_prodh1', 'product_prodh2', 'product_prodh3', 'product_prodh_long2',
             'product_repack_type',
             'product_volume_per_cu'
            ]

remove_list = [x for x in df.columns if x.startswith('product_') and not x.startswith('product_dimension') and x not in keep_list]
df = df[[x for x in df.columns if x not in remove_list]]

Filter Dataframe


In [ ]:
# add manual exclusions
mask = df['promotion_ext_id'].isin([15148])
df.loc[mask, 'exclude_yn'] = True

# remove exclusions
exclusion_mask = df['exclude_yn'].isin([1, True])
df = df[~exclusion_mask]
print("\nNr of Excluded Records Removed: %d." % (exclusion_mask.sum()))

# save the future promotions so we can predict them later
status_mask = df['promotion_status'] < 110
future_df = df[status_mask]
print("\nNr of Future Records: %d rows with %d features." % (future_df.shape[0], future_df.shape[1]))

date_mask = future_df['yearweek'] > 201730
future_df = future_df[date_mask]
print("\nNr of Future Records after week filter: %d rows with %d features." % (future_df.shape[0], future_df.shape[1]))

baseline_mask = future_df['baseline_units'] > 0.0
future_df = future_df[baseline_mask]
print("\nNr of Future Records after baseline filter: %d rows with %d features." % (future_df.shape[0], future_df.shape[1]))

# filter the historical data used for training
status_mask = df['promotion_status'] >= 112
df = df[status_mask]
print("\nAfter status filter: %d rows with %d features." % (df.shape[0], df.shape[1]))

date_mask = df['yearweek'] >= 201612
df = df[date_mask]
print("\nAfter week filter: %d rows with %d features." % (df.shape[0], df.shape[1]))

small_df_mask = df['baseline_units'] > 0.0
df = df[small_df_mask]
print("\nAfter >0 baseline filter: %d rows with %d features." % (df.shape[0], df.shape[1]))

huge_df_mask = df['baseline_units'] < 10.0**5
df = df[huge_df_mask]
print("\nAfter <100k baseline filter: %d rows with %d features." % (df.shape[0], df.shape[1]))

small_lift_mask = (df['lift'] > 1.2) & (df['lift'] < 40)
df = df[small_lift_mask]
print("\nAfter lift filter: %d rows with %d features." % (df.shape[0], df.shape[1]))

min_discount_mask = (df['discount_perc'] >= 5.0) & (df['discount_perc'] < 75.0)
df = df[min_discount_mask]
print("\nAfter discount filter: %d rows with %d features." % (df.shape[0], df.shape[1]))

mechanism_msk = df['mechanism'].notnull()
df = df[mechanism_msk]
print("\nAfter mechanism filter: %d rows with %d features." % (df.shape[0], df.shape[1]))

Check Most Relevant Numeric Features


In [ ]:
# clean and prepare data
feature_df = df.select_dtypes(exclude=[np.object]).dropna(axis=1).copy()
# Turn textual columns and booleans into classes (PredictionModel does this automatically in the preparation later) 
for column in feature_df.select_dtypes(include=[np.bool]):
    feature_df[column] = LabelEncoder().fit_transform(feature_df[column])
feature_col_list = [x for x in feature_df.columns if x != 'lift' and not x.startswith('total_units')]

max_features = len(feature_col_list)

print('# of numeric features available: ' + str(max_features))

In [ ]:
# Set your parameters; please be aware that mutual_info_regression can be very resource intensive
features_len = 30  # max number of columns: 'all' or a number
selected_regression = f_regression  # f_regression (univariate) or mutual_info_regression (mutual information)

if features_len > max_features:
    features_len = max_features
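
mutual_info_regression estimates per-feature dependencies and can take a long time on the full frame; a minimal sketch that fits it on a row sample first (the 20,000-row cap is an arbitrary assumption, not a tuned value):

In [ ]:
# hedged sketch: fit the costlier scorer on a row sample to keep runtime manageable
sample_df = feature_df.sample(n=min(20000, len(feature_df)), random_state=0)
sampled_fit = SelectKBest(mutual_info_regression, k=features_len).fit(
    sample_df[feature_col_list], sample_df['lift'])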

In [ ]:
# check support
features = SelectKBest(selected_regression, k=features_len).fit(feature_df[feature_col_list], feature_df['lift'])
feature_support = features.get_support()
max_score = max(features.scores_)
if max_score == 0.0 or max_score != max_score:  # the second comparison catches NaN
    max_score = 1.0
feature_columns = {col: (100.0 * score / max_score) for col, selected, score in zip(list(feature_df[feature_col_list]), feature_support, features.scores_) if selected}

In [ ]:
print('The ' + str(features_len) + ' most important features are (in descending order): \n')

display(HTML(
    '<table><tr>{}</tr></table>'.format(
        '</tr><tr>'.join(
            '<td>{}</td>'.format('</td><td>'.join(str(_) for _ in row)) for row in sorted(feature_columns.items(), key=lambda kv: kv[1], reverse=True))
        )
 ))
print('\nPlease do not forget to manually check for co-correlation (such as baseline_units and baseline_vol)')
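
One way to run that co-correlation check is an absolute Pearson correlation matrix over the selected columns; a minimal sketch (the 0.9 cut-off is an arbitrary assumption):

In [ ]:
# hedged sketch: flag selected feature pairs with high absolute Pearson correlation
selected_cols = list(feature_columns.keys())
corr = feature_df[selected_cols].corr().abs()
pairs = [(a, b, corr.loc[a, b])
         for i, a in enumerate(selected_cols)
         for b in selected_cols[i + 1:]
         if corr.loc[a, b] > 0.9]  # 0.9 is an arbitrary cut-off
for a, b, c in sorted(pairs, key=lambda kv: -kv[2]):
    print(a + ' <-> ' + b + ': ' + str(round(c, 2)))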

In [ ]:
# clean up and filter dataframe to relevant columns (selected + standard)
del feature_df
# filter_cols = list(set(feature_columns.keys() + generic + field))
# df = df[filter_cols]

In [ ]:
# if you want to filter the data set to these columns:
# dataset.data_df = df
# dataset.save()

Try Automated Feature Selection


In [ ]:
# exclude standard columns
exclude_list = ['promotion_name', 'promotion_id', 'promotion_ext_id', 'account_banner', 'epid', 'ep_desc', 'baseline_units_ext', 'baseline_units_int', 'yearweek', 'total_units', 'total_units_2', 'total_units_3', 'total_units_5']
customer_exclude_list = ['baseline_vol', 'total_baseline_vol', 'weighted_distribution_normal', 'weighted_distribution_promoted']

# clean and prepare data
feature_df = df.copy()

# fill numeric values with 0.0
for column in feature_df.select_dtypes(include=[np.int64, np.float64]):
    feature_df[column] = feature_df[column].fillna(0.0)

# Turn textual columns and booleans into classes (PredictionModel does this automatically in the preparation later) 
for column in feature_df.select_dtypes(include=[np.object, np.bool]):
    feature_df[column] = LabelEncoder().fit_transform(feature_df[column].fillna(-1))
feature_col_list = [x for x in feature_df.columns if x != 'lift' and x not in exclude_list and x not in customer_exclude_list]

max_features = len(feature_col_list)

print('# of total features available: ' + str(max_features))

In [ ]:
# Set your parameters; please be aware that mutual_info_regression can be very resource intensive
features_len = 5  # number of features to select
step = 1  # x features to be dropped each step

if features_len > max_features:
    features_len = max_features

In [ ]:
model = ensemble.ExtraTreesRegressor()

# create the RFE model and select features_len attributes
rfe = RFE(model, n_features_to_select=features_len, step=step, verbose=1)
rfe = rfe.fit(feature_df[feature_col_list], feature_df['lift'])

In [ ]:
print('The order of importance of the features is: \n')
feature_columns = {col: ranking for col, selected, ranking in 
                   zip(list(feature_df[feature_col_list]), rfe.support_, rfe.ranking_) 
                   # if selected
                  }

col_list = sorted(feature_columns.items())

display(HTML(
    '<table><tr>{}</tr></table>'.format(
        '</tr><tr>'.join(
            '<td>{}</td>'.format('</td><td>'.join(str(_) for _ in row)) for row in sorted(feature_columns.items(), key=lambda kv: kv[1]))
        )
 ))
print('\nPlease do not forget to manually check for co-correlation (such as baseline_units and baseline_vol)')

In [ ]:
# clean up and filter dataframe to relevant columns (selected + standard)
del feature_df
# filter_cols = list(set(feature_columns.keys() + generic + field))
# df = df[filter_cols]

In [ ]:
# if you want to filter the data set to these columns:
# dataset.data_df = df
# dataset.save()

Investigate data


In [ ]:
# distribution histograms 
check_list = ['discount_perc', 'account_banner', 'week', 'baseline_units', 'field_35060']

# nb: optionally trim the bottom and top percentiles (0.0 and 100.0 keep everything)
low_limit_def = 0.0
high_limit_def = 100.0
# nb: normal histograms are for numeric columns only, others are top 20 bar charts 
top_def = 20
df['count'] = 1  # nb: the count is the number of promotion-product combinations

# now plot
for col in check_list:
    if df[col].dtype in [np.int64, np.float64]:
        low_limit = np.percentile(df[col].fillna(0.0), low_limit_def)
        high_limit = np.percentile(df[col].fillna(0.0), high_limit_def)
        print('Showing ' + col + ' between ' + str(low_limit) + ' and ' + str(high_limit))
        mask = (df[col] >= low_limit) & (df[col] <= high_limit)
        p = Histogram(df[mask], values=col, bins=12)
        p.axis[1].axis_label = 'Count'
        show(p)
    else:
        print('Showing ' + col + ' top ' + str(top_def))
        group = df.groupby([col], as_index=False)['count'].sum()
        group[col] = group[col].str.encode('utf-8')
        group = group.nlargest(top_def, 'count')
        label = cat(columns=col, sort=False)
        p = Bar(group, label=label, values='count', legend=None)
        p.axis[1].axis_label = 'Count'
        show(p)

In [ ]:
# charts that check against lift
check_list = ['discount_perc', 'account_banner', 'field_35060']

# now plot
for col in check_list:
    if df[col].dtype in [np.int64, np.float64]:
        print('Drawing Average Lift + Nr Observations for ' + col)
        df['tmp_col'] = df[col].round() 
        group = df.groupby(['tmp_col'], as_index=False)[['baseline_units', 'total_units', 'count']].sum()
        del df['tmp_col']
        group['lift'] = group['total_units'] / group['baseline_units']
        group = group.rename(columns={'tmp_col': col})
        p = Line(group, x=col, y='lift')
        show(p)
        p = Line(group, x=col, y='count', color='green')
        show(p) 
    else:
        print('Showing ' + col + ' Average Lift for ' + str(top_def) + ' most used')
        group = df.groupby([col], as_index=False)[['baseline_units', 'total_units', 'count']].sum()
        group[col] = group[col].str.encode('utf-8')
        group = group.nlargest(top_def, 'count')
        group['lift'] = group['total_units'] / group['baseline_units']
        label = cat(columns=col, sort=False)
        p = Bar(group, label=label, values='lift', legend=None)
        p.axis[1].axis_label = 'Avg Lift'
        show(p)

In [ ]:
# plot scatter diagrams for correlation visualization
check_list = [('discount_perc', 'lift'), ('baseline_units', 'total_units'), ('total_units_3', 'total_units_5')]

# now plot
for col_x, col_y in check_list:
    if df[col_x].dtype in [np.int64, np.float64] and df[col_y].dtype in [np.int64, np.float64]:
        print('Drawing Scatter Correlation for ' + col_x + ' and ' + col_y)
        p = Scatter(df, x=col_x, y=col_y)
        show(p)
    else:
        print('Both columns need to be numerical')

Create a model and select the features


In [ ]:
# creating a prediction model
prediction_model = PredictionModel('carst_example', path='/home/carst/')

# set the features ('C' marks a categorical feature)
prediction_model.features = {
    'promoted_price': [],
    'discount_perc': [],
    'discount_amt': [],
    'baseline_units': [],
    'total_nr_products': [],  # total to check the complete size of the promotion
    'total_baseline_units': [],  # total to check the complete size of the promotion
    'account_id': ['C'],  # account
    'field_35060': ['C'],  # Promotion Mechanism 
    'product_prodh1': ['C'],
    'product_prodh_long2': ['C'],
    'product_prodh': ['C'],
    'multi_buy_y': ['C'],
    'second_placement_yn': ['C'],
    'week_agg_2': ['C'],
    'product_volume_per_cu': [],  # check
    }

# do we predict the lift directly or on a log scale?
prediction_model.target = {'lift': []}  # [] or ['log']

# are we doing logarithmic prediction?
if 'log' in prediction_model.target['lift']:
    log = True
else:
    log = False

In [ ]:
mask = df['promotion_ext_id'] == 20382  # 22171
df[mask][['epid', 'ep_desc', 'second_placement_yn','second_placement_perc', 'weighted_distribution_normal', 'weighted_distribution_promoted', 'baseline_units', 'total_units_5', 'lift']].sort_values('baseline_units', ascending=False)

Create train and test sets


In [ ]:
# use only the needed columns
used_column_list = list(set(list(prediction_model.features.keys()) + list(prediction_model.target.keys())))

# create a mask based on random selections or on a period
mask = np.random.rand(len(df)) < 0.8
df['train_test'] = 'train'
df.loc[~mask, 'train_test'] = 'test'

# create train sets
train_df, train_lift, test_df, test_lift = create_train_test_sets(df[used_column_list], mask, prediction_model, prediction_target='lift')

# are we doing logarithmic predictions
if log:
    # we need to train everything based on the log value
    train_lift = np.log(train_lift)
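
The comment above also mentions splitting on a period instead of at random; a minimal sketch of that alternative (the 201712 cut-off week is an arbitrary assumption):

In [ ]:
# hedged sketch: period-based split - train on earlier weeks, test on later ones
# (swap this mask into the cell above to use it)
period_mask = df['yearweek'] < 201712
print('train rows: %d, test rows: %d' % (period_mask.sum(), (~period_mask).sum()))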

Select a regressor


In [ ]:
# create and train a regressor
# regressor = svm.SVR()
# regressor = ensemble.RandomForestRegressor(n_estimators=79, min_samples_split=4, n_jobs=-1)
regressor = ensemble.ExtraTreesRegressor(
    n_estimators=100,
    criterion='mse',  # mse or mae
    max_depth=None,  # None or max nr for tree
    min_samples_split=2,  # default 2; min nr of observations required to split a node
    min_samples_leaf=1,  # default 1; min nr of observations required in a leaf
    max_leaf_nodes=None,  # default None; max nr of leaf nodes per tree
    # random_state=10, 
    n_jobs=-1)
# regressor = ensemble.AdaBoostRegressor(ensemble.ExtraTreesRegressor(n_estimators=20, n_jobs=-1), n_estimators=79)
# regressor = ensemble.GradientBoostingRegressor(n_estimators=5000, learning_rate=0.9, max_depth=10, random_state=0)

regressor.fit(train_df, train_lift)
prediction_model.model = regressor

# possibly check: https://github.com/automl/auto-sklearn/blob/master/example/example_regression.py
# https://mlwave.com/kaggle-ensembling-guide/
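
Before committing to one regressor, cross-validation gives a rough comparison of the candidates above; a minimal sketch (the candidate list and 3 folds are assumptions, not tuned choices):

In [ ]:
# hedged sketch: rough comparison of candidate regressors via cross-validation
from sklearn.model_selection import cross_val_score

candidates = {
    'extra_trees': ensemble.ExtraTreesRegressor(n_estimators=100, n_jobs=-1),
    'random_forest': ensemble.RandomForestRegressor(n_estimators=79, min_samples_split=4, n_jobs=-1),
}
for name, candidate in candidates.items():
    scores = cross_val_score(candidate, train_df, train_lift, cv=3)  # default scoring is R^2
    print(name + ': mean %.3f (+/- %.3f)' % (scores.mean(), scores.std()))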

Predict and score the model


In [ ]:
# predict the lift
predict_lift = prediction_model.model.predict(test_df)

if log:
    # if it was logarithmic, expand the lift again
    predict_lift = np.exp(predict_lift)

# score the model
score_model(predict_lift, test_lift, baseline=test_df['baseline_units'])
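
Alongside score_model (whose scoring lives in vf_portalytics.tool), the standard sklearn regression metrics make a quick independent sanity check; a minimal sketch:

In [ ]:
# hedged sketch: cross-check with standard regression metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

print('MAE:  %.3f' % mean_absolute_error(test_lift, predict_lift))
print('RMSE: %.3f' % np.sqrt(mean_squared_error(test_lift, predict_lift)))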

Check Feature Importance


In [ ]:
# Retrieve the feature importance
feature_importance = regressor.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())
mask = feature_importance > 0.5
feature_importance = feature_importance[mask]
feature_names = train_df.columns[mask]  # keep the names aligned with the masked scores
sorted_idx = np.argsort(feature_importance)
output_list = []
for val, feature in zip(feature_importance[sorted_idx], feature_names[sorted_idx]):
    output_list.append({'feature': feature, 'importance': val})
importance_df = pd.DataFrame(output_list)
importance_df = importance_df.sort_values('importance', ascending=False)

# now plot a chart
label = cat(columns='feature', sort=False)
p = Bar(importance_df, label=label, values='importance', legend=None)
p.axis[1].axis_label = 'Importance'
show(p)

Investigate Results


In [ ]:
# nb: we will now predict from the df
test_df = df[df['train_test'] == 'test'].copy()
test_df['lift_act'] = test_df['lift']

# Unox example filter
# mask = test_df['promotion_name'].str.lower().str.contains('plus')
mask = test_df['ep_desc'].str.lower().str.contains('robijn')
test_df = test_df[mask]

predict_df = test_df[list(prediction_model.features.keys())].copy()
predict_df = prediction_model.pre_processing(predict_df)
predict_lift = prediction_model.model.predict(predict_df)
if log:
    # if it was logarithmic, expand the lift again
    predict_lift = np.exp(predict_lift)
test_df['lift_pred'] = predict_lift

# check where we are off
test_df['count'] = 1
test_df['total_units_act'] = test_df['baseline_units'] * test_df['lift_act']
test_df['total_units_pred'] = test_df['baseline_units'] * test_df['lift_pred']
test_df['total_units_diff'] = test_df['total_units_act'] - test_df['total_units_pred']
test_df['total_units_abs_diff'] = np.abs(test_df['total_units_diff'])
test_df['total_units_perc_diff'] = test_df['total_units_abs_diff'] / test_df['total_units_pred']

# add total average account/product lift
total_df = test_df.groupby(['account_id', 'epid'], as_index=False)[['total_units_act', 'baseline_units']].sum()
total_df['lift_act_product_avg'] = total_df['total_units_act'] / total_df['baseline_units']
del total_df['total_units_act']
del total_df['baseline_units']
test_df = pd.merge(test_df, total_df, how='left')
del total_df

In [ ]:
# result: prediction vs actual scatter plots, with an identity line for reference
# plot scatter diagrams for correlation visualization
check_list = [('total_units_act', 'total_units_pred'), ('lift_act', 'lift_pred'), ('total_units_act', 'total_units_3')]

# now plot
for col_x, col_y in check_list:
    if test_df[col_x].dtype in [np.int64, np.float64] and test_df[col_y].dtype in [np.int64, np.float64]:
        print('Drawing Scatter Correlation for ' + col_x + ' and ' + col_y)
        p = Scatter(test_df, x=col_x, y=col_y)
        total_max = int(max(test_df[col_x].max(), test_df[col_y].max()))
        p.line([0, total_max], [0, total_max])  # identity line: perfect predictions fall on it
        show(p)
    else:
        print('Both columns need to be numerical')

In [ ]:
# charts that check against variance
top_def = 20
check_list = ['discount_perc', 'account_banner', 'promotion_dimension_144', 'promotion_dimension_7', 'field_35060', 'product_prodh_long2']

# now plot
for col in check_list:
    if test_df[col].dtype in [np.int64, np.float64]:
        print('Drawing Average Difference + Nr Observations for ' + col)
        test_df['tmp_col'] = test_df[col].round() 
        group = test_df.groupby(['tmp_col'], as_index=False)[['total_units_act', 'total_units_pred', 'count']].sum()
        del test_df['tmp_col']
        group['variance'] = group['total_units_pred'] / group['total_units_act'] * 100.0  # predicted as % of actual
        group = group.rename(columns={'tmp_col': col})
        p = Line(group, x=col, y='variance')
        show(p)
        p = Line(group, x=col, y='count', color='green')
        show(p) 
    else:
        print('Showing ' + col + ' Variance for ' + str(top_def) + ' most used')
        group = test_df.groupby([col], as_index=False)[['total_units_act', 'total_units_pred', 'count']].sum()
        group[col] = group[col].str.encode('utf-8')
        group = group.nlargest(top_def, 'count')
        group['variance'] = group['total_units_pred'] / group['total_units_act'] * 100.0
        label = cat(columns=col, sort=False)
        p = Bar(group, label=label, values='variance', legend=None)
        p.axis[1].axis_label = 'Total Variance'
        show(p)

In [ ]:
# promotion product level differences
mask = (test_df['total_units_abs_diff'] / test_df['total_units_act'] > 0.3)
test_df[mask].sort_values('total_units_abs_diff', ascending=False)[['promotion_name', 'promotion_ext_id', 'ep_desc', 'second_placement_yn', 'baseline_units', 'total_units_pred', 'total_units_act', 'lift_pred', 'lift_act', 'lift_act_product_avg']][0:50]

In [ ]:
# promotion level differences
group_df = test_df.groupby(['promotion_name', 'promotion_ext_id'])[['total_units_act', 'total_units_pred', 'total_units_abs_diff']].sum()
group_df.sort_values('total_units_abs_diff', ascending=False)[0:100]

In [ ]:
mask = (test_df['total_units_act'] > 200000) & (test_df['total_units_act'] < 300000) & (test_df['total_units_pred'] < 150000)
test_df[mask][['promotion_name', 'promotion_ext_id', 'total_units_act', 'total_units_pred', 'total_units_abs_diff']]

Check Future Promotions


In [ ]:
mask = (future_df['promotion_ext_id'] == 29052) & (future_df['epid'].isin(['57c41eb57938e3ff051a9705', '57c41eb37938e3ff051a9682']))
future_df.loc[mask, 'second_placement_yn'] = True

predict_df = future_df.copy()

# select only relevant columns
predict_df = predict_df[list(prediction_model.features.keys())]

predict_df = prediction_model.pre_processing(predict_df)
predict_lift = prediction_model.model.predict(predict_df)
if log:
    # if it was logarithmic, expand the lift again
    predict_lift = np.exp(predict_lift)

future_df['lift_pred'] = predict_lift

# check where we are off
future_df['count'] = 1
future_df['total_units_act'] = future_df['total_units']
future_df['total_units_pred'] = future_df['baseline_units'] * future_df['lift_pred']
future_df['total_units_diff'] = future_df['total_units_act'] - future_df['total_units_pred']
future_df['total_units_abs_diff'] = np.abs(future_df['total_units_diff'])
future_df['total_units_perc_diff'] = future_df['total_units_abs_diff'] / future_df['total_units_pred']

In [ ]:
mask = future_df['promotion_ext_id'] == 29052
future_df[mask][['epid', 'ep_desc', 'second_placement_yn','baseline_units', 'total_units_pred', 'total_units_act', 'lift_pred', 'lift']].sort_values('baseline_units', ascending=False)

In [ ]:
# result: prediction vs actual scatter plots
# plot scatter diagrams for correlation visualization
check_list = [('baseline_units', 'total_units_pred'), ('total_units_act', 'total_units_pred')]

# now plot
for col_x, col_y in check_list:
    if future_df[col_x].dtype in [np.int64, np.float64] and future_df[col_y].dtype in [np.int64, np.float64]:
        print('Drawing Scatter Correlation for ' + col_x + ' and ' + col_y)
        p = Scatter(future_df, x=col_x, y=col_y)
        show(p)
    else:
        print('Both columns need to be numerical')

In [ ]:
mask = (future_df['promotion_ext_id'] == 34310)
future_df[mask][['promotion_name', 'promotion_ext_id', 'baseline_units', 'total_units_act', 'total_units_pred', 'total_units_abs_diff']]

In [ ]:
future_df[mask].groupby(['promotion_name', 'promotion_ext_id'])[['baseline_units', 'total_units_act', 'total_units_pred', 'total_units_abs_diff']].sum()

In [ ]:
future_df.groupby(['promotion_name', 'promotion_ext_id', 'account_banner', 'yearweek'], as_index=False)[['baseline_units', 'total_units_act', 'total_units_pred', 'total_units_abs_diff']].sum().to_excel('/home/carst/ulnl_prediction_vs_actual.xlsx', index=False)

Save the Model


In [ ]:
# save the model
prediction_model.save()