We face the challenge of designing the upcoming promotion campaign for a soda company. The objective is to boost sales while obeying various business constraints. The notebook below first trains regression models on historical data to forecast sales for each product at each candidate price, and then feeds those forecasts into a mixed-integer optimization model that selects the promotion prices.
In [1]:
import pandas
df_hist = pandas.read_excel("soda_sales_historical_data.xlsx")
df_hist[:5]
Out[1]:
In [2]:
df_hist.shape
Out[2]:
In [3]:
from pandas import DataFrame, get_dummies
categorical_columns = ['Product','Easter Included','Super Bowl Included',
'Christmas Included', 'Other Holiday']
df_hist = get_dummies(df_hist, prefix={k:"dmy_%s"%k for k in categorical_columns},
columns = list(categorical_columns))
df_hist[:5]
Out[3]:
In [4]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import BaggingRegressor
from sklearn import model_selection
experiments = {"Algorithm":["Ordinary Least Squares", "Regression Tree",
"Big Random Forest", "Random Forest",
"Bagging"],
"Objects" : [lambda : LinearRegression(),
lambda : DecisionTreeRegressor(),
lambda : RandomForestRegressor(n_estimators=100),
lambda : RandomForestRegressor(),
lambda : BaggingRegressor()],
"Predictions":[[] for _ in range(5)]}
actuals = []
In [5]:
from sklearn.model_selection import train_test_split
[_.shape for _ in train_test_split(df_hist.drop("Sales", axis=1),
df_hist["Sales"], test_size=0.25)]
Out[5]:
In [6]:
for _ in range(4):
train_X, test_X, train_y, test_y = (
train_test_split(df_hist.drop("Sales", axis=1),
df_hist["Sales"], test_size=0.25))
for i, obj_factory in enumerate(experiments["Objects"]):
obj = obj_factory()
obj.fit(y=train_y,X=train_X)
experiments["Predictions"][i] += list(obj.predict(test_X))
actuals += list(test_y)
actuals = pandas.Series(actuals)
experiments["Predictions"] = list(map(pandas.Series, experiments["Predictions"]))
In [7]:
len(actuals), list(map(len, experiments["Predictions"]))  # materialize map() so the lengths display in Python 3
Out[7]:
In [8]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
color=iter(cm.rainbow(np.linspace(0,1,len(experiments)+3)))
plt.figure(figsize=(12,7),dpi=300)
plt.plot(actuals,actuals,c=next(color),markersize=2,label='Data')
for _, row in DataFrame(experiments).iterrows():
plt.plot(actuals, row["Predictions"],'o',c=next(color),
markersize=2,label=row['Algorithm'])
plt.title('Scatter Plot Prediction v/s Data')
plt.grid(True)
plt.legend()
plt.show()
In [9]:
color=iter(cm.rainbow(np.linspace(0,1,len(experiments)+3)))
next(color)
plt.figure(figsize=(13,8),dpi=300)
for index, row in DataFrame(experiments).iterrows():
relative_error = (row["Predictions"] - actuals) / (1 + abs(actuals))
plt.plot(np.sort(relative_error),'o',c=next(color),
markersize=2,label=row['Algorithm'])
plt.title('Relative Error Prediction v/s Data')
plt.ylabel('Relative Error')
plt.grid(True)
plt.legend()
plt.axis([0,len(actuals),-1,1])
plt.show()
In [10]:
def boxplot(algorithm):
prediction = (experiments["Predictions"]
[experiments["Algorithm"].index(algorithm)])
plt.title(algorithm)
plt.boxplot( (prediction - actuals) / (1 + abs(actuals)) )
plt.show()
In [11]:
boxplot("Bagging")
In [12]:
boxplot("Big Random Forest")
In [13]:
experiments["Results"] = []
for o in experiments["Objects"]:
experiments["Results"].append(
model_selection.cross_val_score(o(), y=df_hist['Sales'],
X=df_hist.drop("Sales", axis=1),
cv=5).mean())
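For reference, cross_val_score uses each regressor's default score, which is R² (higher is better). As a small sketch that is not part of the original notebook, the same comparison could be run on root-mean-squared error by passing scikit-learn's "neg_mean_squared_error" scoring string:

# Sketch only: cross-validated RMSE per algorithm (lower is better).
import numpy as np
rmse = {}
for name, factory in zip(experiments["Algorithm"], experiments["Objects"]):
    scores = model_selection.cross_val_score(factory(),
                                             X=df_hist.drop("Sales", axis=1),
                                             y=df_hist["Sales"],
                                             scoring="neg_mean_squared_error",
                                             cv=5)
    rmse[name] = np.sqrt(-scores.mean())
rmse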
In [14]:
DataFrame(experiments).drop(["Objects", "Predictions"],
axis=1).set_index("Algorithm")
Out[14]:
In [15]:
fitted = (experiments["Objects"]
[experiments["Algorithm"].index("Big Random Forest")]().
fit(y=df_hist["Sales"], X=df_hist.drop("Sales", axis=1)))
In [16]:
df_superbowl_original = pandas.read_excel("super_bowl_promotion_data.xlsx")
df_superbowl = get_dummies(df_superbowl_original,
prefix={k:"dmy_%s"%k for k in categorical_columns},
columns = list(categorical_columns))
assert "Sales" not in df_superbowl.columns
assert {"Sales"}.union(df_superbowl.columns).issubset(set(df_hist.columns))
len(df_superbowl)
Out[16]:
Note that the promotion data table may cover a narrower categorical range than the historical data, so some of the dummy columns created for df_hist can be missing here.
In [17]:
for fld in set(df_hist.columns).difference(df_superbowl.columns, {"Sales"}):
assert fld.startswith("dmy_")
df_superbowl[fld] = 0
Take care! sklearn has no concept of named columns, so we make sure that the df_superbowl columns are ordered consistently with the independent-variable (non-Sales) columns of df_hist.
In [18]:
df_superbowl = df_superbowl[list(df_hist.drop("Sales", axis=1).columns)]
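As an aside, here is a minimal sketch (not part of the original notebook) showing how the two steps above, creating the missing dummy columns and fixing the column order, could be done in one call with pandas reindex; df_superbowl_aligned is just an illustrative name:

# Sketch only: align the scoring frame to the training feature columns in one step.
# Columns absent from df_superbowl are created and filled with 0.
feature_columns = list(df_hist.drop("Sales", axis=1).columns)
df_superbowl_aligned = df_superbowl.reindex(columns=feature_columns, fill_value=0)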
In [19]:
predicted = fitted.predict(df_superbowl)
In [20]:
forecast_sales = df_superbowl_original[["Product", "Cost Per Unit"]].copy()
forecast_sales["Sales"] = predicted
forecast_sales.set_index(['Product','Cost Per Unit'], inplace=True)
In [21]:
soda_family = {'11 Down': 'Clear', 'AB Root Beer': 'Dark',
'Alpine Stream': 'Clear', 'Bright': 'Clear',
'Crisp Clear': 'Clear', 'DC Kola': 'Dark',
'Koala Kola': 'Dark', 'Mr. Popper': 'Dark',
'Popsi Kola': 'Dark'}
family = set(soda_family[j] for j in soda_family)
soda = set(j for j in soda_family)
max_prom = {f:2 for f in family}
max_investment = 750
In [22]:
product_prices = set(forecast_sales.index.values)
normal_price = {b:0 for b in soda}
for b,p in product_prices:
normal_price[b] = max(normal_price[b],p)
Note that, according to the forecast, not every discount yields a boost in sales.
In [23]:
meaningful_discounts = 0
for b,p in product_prices:
if forecast_sales.Sales[b,p] > forecast_sales.Sales[b,normal_price[b]]:
meaningful_discounts += 1
meaningful_discounts, len(forecast_sales) - len(soda)
Out[23]:
In [24]:
import gurobipy as gu
model = gu.Model()
select_price = model.addVars(product_prices,vtype=gu.GRB.BINARY,name='X')
sales = model.addVar(name='sales')
revenue = model.addVar(name='revenue')
investment = model.addVar(ub=max_investment, name='investment')
gusum = gu.quicksum
In [25]:
model.addConstr(sales == select_price.prod(forecast_sales.Sales), name='sales')
model.addConstr(revenue == gusum(forecast_sales.Sales[b,p] * p *
select_price[b,p] for b,p in product_prices),
name='revenue')
model.addConstr(investment ==
gusum(max(0,forecast_sales.Sales[b,p] -
forecast_sales.Sales[b,normal_price[b]]) *
normal_price[b] * select_price[b,p]
for b,p in product_prices),
name='investment')
model.update()
In [26]:
model.addConstrs((select_price.sum(b,'*') == 1 for b in soda), name='OnePrice')
model.addConstrs((gusum(select_price[b,p] for b,p in product_prices if
soda_family[b] == f and p != normal_price[b] )
<= max_prom[f] for f in family),
name='MaxProm')
model.update()
In [27]:
model.setObjective(sales, sense=gu.GRB.MAXIMIZE)
model.optimize()
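In symbols, writing $x_{b,p}$ for select_price[b,p], $S_{b,p}$ for the forecast sales of soda $b$ at price $p$, and $\bar p_b$ for its normal (undiscounted) price, the model just solved can be summarized as:

\begin{align*}
\max \quad & \sum_{(b,p)} S_{b,p}\, x_{b,p} \\
\text{s.t.} \quad & \sum_{p} x_{b,p} = 1 \quad \text{for every soda } b, \\
& \sum_{(b,p):\, b \in f,\ p \neq \bar p_b} x_{b,p} \le \mathtt{max\_prom}_f \quad \text{for every family } f, \\
& \sum_{(b,p)} \max\!\left(0,\ S_{b,p} - S_{b,\bar p_b}\right) \bar p_b\, x_{b,p} \le \mathtt{max\_investment}, \\
& x_{b,p} \in \{0,1\}.
\end{align*}

The auxiliary variables sales, revenue and investment simply record these expressions so their optimal values can be read back after the solve.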
In [28]:
model.status == gu.GRB.OPTIMAL
Out[28]:
In [29]:
sales.X, revenue.X, investment.X
Out[29]:
In [30]:
price_selections = {"Product":[], "Price":[], "Is Discount":[], "Family":[]}
for b, p in product_prices:
if abs(select_price[b,p].X -1) < 0.0001: # i.e. almost one
price_selections["Product"].append(b)
price_selections["Price"].append(p)
price_selections["Is Discount"].append(p < normal_price[b])
price_selections["Family"].append(soda_family[b])
(DataFrame(price_selections).set_index("Product")
[["Price", "Is Discount", "Family"]].sort_values("Family"))
Out[30]:
In [31]:
simulated_KPI = {'Sales':[],'Revenue':[],'Investment':[]}
Z = select_price
num_infeas = 0
for i in range(100):
np.random.seed(i)
fitted = RandomForestRegressor(n_estimators=100,
n_jobs=4).fit(y=df_hist["Sales"],
X=df_hist.drop("Sales", axis=1))
forecast = df_superbowl_original[['Product', 'Cost Per Unit']].copy()
forecast["Sales"] = fitted.predict(df_superbowl)
forecast = forecast.set_index(['Product','Cost Per Unit'])
sales, revenue, investment = 0, 0, 0
for b,p in product_prices:
sales += forecast.Sales[b,p] * Z[b,p].X
revenue += forecast.Sales[b,p] * p * Z[b,p].X
investment += (max(0,forecast.Sales[b,p] -
forecast.Sales[b,normal_price[b]]) *
normal_price[b] * Z[b,p].X)
if investment > max_investment:
num_infeas += 1
simulated_KPI['Sales'].append(sales)
simulated_KPI['Revenue'].append(revenue)
simulated_KPI['Investment'].append(investment)
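As a small optional addition (not in the original notebook), the simulated distributions can be summarized numerically before plotting them:

# Sketch only: summary statistics over the 100 simulated KPI values.
DataFrame(simulated_KPI).describe()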
In [32]:
data = {'Sales','Revenue','Investment'}
color=iter(cm.rainbow(np.linspace(0,1,3)))
for t in data:
plt.figure(figsize=(7,4),dpi=300)
    plt.hist(simulated_KPI[t], bins=50, density=True, color=next(color), alpha=0.75)  # density=True replaces the removed 'normed' argument
plt.ylabel('Probability')
plt.xlabel(t)
plt.grid(True)
plt.show()
In [33]:
num_infeas
Out[33]:
After the webinar we received many requests for further material on this topic. The following is an (incomplete) list of sources, but a good starting point.
The stochastic optimization society has several tutorials and further links on its web site.
I would especially like to thank (in alphabetical order) Shabbir Ahmed (Georgia Tech), Tito Homem-de-Mello (Universidad Adolfo Ibáñez), Jim Luedtke (University of Wisconsin-Madison), David Morton (Northwestern University), and Bernardo Pagnoncelli (Universidad Adolfo Ibáñez) for suggesting books, links, and papers. Any omission or error is my fault.
Daniel Espinoza,
Senior Developer,
Gurobi Optimization Inc.