In [ ]:
# import packages here to reduce the size of code cells later

import pandas as pd
from prettypandas import PrettyPandas
import patsy

import numpy as np
import scipy.stats
import statsmodels.formula.api
import statsmodels.api as sm

from graphviz import Digraph
import seaborn as sns

import dexpy.factorial
import dexpy.alias
import dexpy.power

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import patches

from IPython.display import display, Markdown, HTML

In [ ]:
from IPython.core.display import HTML
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')

In [ ]:
# helper functions for this notebook

# use SVG for matplotlib-based figures
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

def coded_to_actual(coded_data, actual_lows, actual_highs):
    """Converts a pandas DataFrame from coded units to actuals."""
    actual_data = coded_data.copy()
    for col in actual_data.columns:
        if not (col in actual_highs and col in actual_lows):
            continue
        try:
            # convert continuous variables to their actual value
            actual_data[col] *= 0.5 * (float(actual_highs[col]) - float(actual_lows[col]))
            # don't need to cast to float here, if either are not a float exception will have been thrown
            actual_data[col] += 0.5 * (actual_highs[col] + actual_lows[col])
        except ValueError:
            # assume 2 level categorical
            actual_data[col] = actual_data[col].map({-1: actual_lows[col], 1: actual_highs[col]})
    return actual_data
        
def get_tick_labels(key, lows, highs, units):
    """Returns a list of low/high labels with units (e.g. [8mm, 10mm])"""
    return [str(lows[key]) + units[key], str(highs[key]) + units[key]]

Design of Experiments in Python

Hank Anderson
hank@statease.com
https://github.com/hpanderson/dexpy-pymntos

Agenda

  • What is Design of Experiments?
  • Design an experiment with dexpy to improve the office coffee
  • Measure the power of our experiment
  • Model the taste test results using statsmodels
  • Visualize the data with seaborn and matplotlib to find the best pot of coffee

What is Design of Experiments (DoE)?

A systematic series of tests, in which purposeful changes are made to input factors, so that you may identify causes for significant changes in the output responses.

NIST has a nice primer on DOE.


In [ ]:
dot = Digraph(comment='Design of Experiments')
dot.body.extend(['rankdir=LR', 'size="10,10"'])
dot.node_attr.update(shape='rectangle', style='filled', fontsize='20', fontname="helvetica")

dot.node('X', 'Controllable Factors', color='mediumseagreen', width='3')
dot.node('Z', 'Noise Factors', color='indianred2', width='3')
dot.node('P', 'Process', color='lightblue', height='1.25', width='3')
dot.node('Y', 'Responses', color='lightblue')

dot.edges(['XP', 'ZP', 'PY'] * 3)

dot

What Is It For?

  • Screening Experiments - Determining which inputs to your system are important
  • Modeling a Process - Gain a better understanding of your system
  • Optimization - Find the combination of inputs that results in a better output
  • Robustness - Find a combination of inputs that produces a consistent result

DoE in Python: dexpy

  • Based on Design-Expert® software, a package for design and analysis of industrial experiments
  • Apache2 licensed, pure python (for now), available on pypi
  • Requires numpy, scipy, pandas and patsy
  • Recommend statsmodels for analysis, matplotlib and seaborn for visualization
  • Other alternatives are:

Motivating Example: Better Office Coffee

  • Current coffee is subpar ("disgusting and unacceptable")
  • Need to answer the following questions via experimentation:
    • What coffee beans to use?
    • How much coffee to use?
    • How to grind the coffee?

Motivating Example: Better Office Coffee

  • 5 input factors
    • Amount of Coffee (2.5 to 4.0 oz.)
    • Grind size (8-10mm)
    • Brew time (3.5 to 4.5 minutes)
    • Grind Type (burr vs blade)
    • Coffee beans (light vs dark)
  • 1 response: Average overall liking by a panel of 5 office coffee addicts
    • Each taster rates the coffee from 1-9
  • Maximum of 3 taste tests a day, for liability reasons

In [ ]:
# set some variables related to the coffee data set
actual_lows = { 'amount' : 2.5, 'grind_size' : 8, 'brew_time': 3.5, 'grind_type': 'burr', 'beans': 'light' }
actual_highs = { 'amount' : 4, 'grind_size' : 10, 'brew_time': 4.5, 'grind_type': 'blade', 'beans': 'dark' }
units = { 'amount' : 'oz', 'grind_size' : 'mm', 'brew_time': 'm', 'grind_type': '', 'beans': '' }
         ((((
        ((((
         ))))
      _ .---.
     ( |`---'|
      \|     |
      : `---' :
       `-----'

One Factor at a Time Experimentation

  • Hold all inputs constant, then change one at a time
  • Cannot detect interactions, e.g. grind size and amount of coffee have a synergistic relationship
  • Inefficient

In [ ]:
points = [
    [-1, -1],
    [-1, 1],
    [1, -1],
    [-1, -1],
    [-1, 1],
    [1, -1],
]
df = pd.DataFrame(points, columns=['grind_size', 'amount'])
fg = sns.lmplot('amount', 'grind_size', data=df, fit_reg=False)

p = patches.Polygon(points, color="navy", alpha=0.3, lw=2)
ax = fg.axes[0, 0]
ax.add_patch(p)
ax.set_xticks([-1, 1])
ax.set_xticklabels(get_tick_labels('amount', actual_lows, actual_highs, units))
ax.set_yticks([-1, 1])
ax.set_yticklabels(get_tick_labels('grind_size', actual_lows, actual_highs, units))

p = patches.Polygon([[-1, 1], [1, 1], [1, -1]], color="firebrick", alpha=0.3, lw=2)
p = ax.add_patch(p)

In [ ]:
cube_design = dexpy.factorial.build_factorial(3, 8)

points = np.array(cube_design)
fig = plt.figure()

ax = fig.add_subplot(111, projection='3d', axisbg='w')
ax.view_init(30, -60) # rotate plot

X, Y = np.meshgrid([-1,1], [-1,1])

cube_alpha = 0.1
ax.plot_surface(X, Y, 1, alpha=cube_alpha, color="r")
ax.plot_surface(X, Y, -1, alpha=cube_alpha)
ax.plot_surface(X, -1, Y, alpha=cube_alpha)
ax.plot_surface(X, 1, Y, alpha=cube_alpha, color="r")
ax.plot_surface(1, X, Y, alpha=cube_alpha, color="r")
ax.plot_surface(-1, X, Y, alpha=cube_alpha)
ax.scatter3D(points[:, 0], points[:, 1], points[:, 2],
             c=["b", "b", "b", "r", "b", "r", "r", "r"])

ax.set_xticks([-1, 1])
ax.set_xticklabels(get_tick_labels('grind_size', actual_lows, actual_highs, units))
ax.set_yticks([-1, 1])
ax.set_yticklabels(get_tick_labels('amount', actual_lows, actual_highs, units))
ax.set_zticks([-1, 1])
ax.set_zticklabels(get_tick_labels('beans', actual_lows, actual_highs, units))

plt.show()

Factorial Design

  • Change multiple inputs at once
  • Reveals interactions
  • Maximizes information with minimum runs

In [ ]:
df = dexpy.factorial.build_factorial(2, 4)
df.columns = ['amount', 'grind_size']

fg = sns.lmplot('amount', 'grind_size', data=df, fit_reg=False)
ax = fg.axes[0, 0]
ax.set_xticks([-1, 1])
ax.set_xticklabels(get_tick_labels('amount', actual_lows, actual_highs, units))
ax.set_yticks([-1, 1])
ax.set_yticklabels(get_tick_labels('grind_size', actual_lows, actual_highs, units))
p = ax.add_patch(patches.Rectangle((-1, -1), 2, 2, color="navy", alpha=0.3, lw=2))

In [ ]:
cube_design = dexpy.factorial.build_factorial(3, 8)

points = np.array(cube_design)
fig = plt.figure()

ax = fig.add_subplot(111, projection='3d', axisbg='w')
ax.view_init(30, -60) # rotate plot

X, Y = np.meshgrid([-1,1], [-1,1])

cube_alpha = 0.1
ax.plot_surface(X, Y, 1, alpha=cube_alpha)
ax.plot_surface(X, Y, -1, alpha=cube_alpha)
ax.plot_surface(X, -1, Y, alpha=cube_alpha)
ax.plot_surface(X, 1, Y, alpha=cube_alpha)
ax.plot_surface(1, X, Y, alpha=cube_alpha)
ax.plot_surface(-1, X, Y, alpha=cube_alpha)
ax.scatter3D(points[:, 0], points[:, 1], points[:, 2], c="b")

ax.set_xticks([-1, 1])
ax.set_xticklabels(["8mm", "10mm"])
ax.set_yticks([-1, 1])
ax.set_yticklabels(["2.5oz", "4oz"])
ax.set_zticks([-1, 1])
ax.set_zticklabels(["light", "dark"])

plt.show()

Statistical Power

The probability that a design will detect an active effect.

Effect? Retain H0 Reject H0
No OK Type 1 Error
Yes Type II Error OK

Power is expressed as a probability to detect an effect of size Δ, given noise σ. This is typically given as a delta to sigma ratio Δ/σ. Power is a function of the signal to noise ratio, as well as the number and layout of experiments in the design.


In [ ]:
runs = 50
delta = 0.5
sigma = 1.5
alpha = 0.05

one_factor = pd.DataFrame([ -1, 1 ] * runs, columns=['beans'])
one_factor_power = dexpy.power.f_power('beans', one_factor, delta/sigma, alpha)

display(Markdown('''
# Power Example
{} pots of coffee are tested with light beans, then {} pots with dark beans.
There is a variance of {} taste rating from pot to pot. If we expect a {} change
in the taste rating when going from light to dark, what is the likelihood we would detect it?
(Answer: **{:.2f}%**)

Note: this assumes that we reject H<sub>0</sub> at p <= {}
'''.format(int(runs / 2), int(runs / 2), sigma, delta, one_factor_power[1]*100, alpha)
))

def plot_shift(runs, delta, sigma, annotate=False):
    """Plots two sets of random normal data, one shifted up delta units."""
    mean = 5
    low = sigma*np.random.randn(int(runs/2),1)+mean
    high = sigma*np.random.randn(int(runs/2),1)+mean+delta
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_ylabel("taste")
    ax.set_xlabel("runs")
    ax.set_ylim([0, 11])
    
    plt.plot(np.concatenate([low, high]))
    plt.plot([0, (runs/2)], [low.mean()] * 2, color='firebrick', lw=2)
    plt.plot([(runs/2), runs-1], [high.mean()] * 2, color='g', lw=2)
    
    p_value = scipy.stats.f_oneway(low, high).pvalue[0]
    if annotate:
        plt.annotate("p: {:.5f}".format(p_value),
                     xy=(runs / 2, (low.mean() + high.mean()) / 2), xycoords='data',
                     xytext=(.8, .9), textcoords='axes fraction',
                     bbox=dict(boxstyle="round4", fc="w"),
                     arrowprops=dict(arrowstyle='-[', linewidth=2, color='black', connectionstyle="angle3")
                     )

    plt.show()
    return [low, high]

low, high = plot_shift(runs, delta, sigma)

In [ ]:
increased_delta = delta*4
increased_delta_power = dexpy.power.f_power('beans', one_factor, increased_delta/sigma, alpha)

display(Markdown('''
# Power Example - Increase Delta
What if we don't care about a taste increase of 0.5? That's not that much better
than the current coffee, after all. Instead, if we say we only care about a change
in rating above {}, what is the likelihood we would detect such a change?
(Answer: **{:.5f}%**)
'''.format(increased_delta, increased_delta_power[1]*100)
))

_ = plot_shift(runs, increased_delta, sigma)

In [ ]:
decreased_sigma = sigma*0.5
decreased_sigma_power = dexpy.power.f_power('beans', one_factor, delta/decreased_sigma, alpha)

display(Markdown('''
# Power Example - Decrease Noise
Instead of lowering our standards for our noisy taste ratings, instead
we could bring in expert testers who have a much more accurate palate.
If we assume a decrease in noise from {} to {}, then we can detect a
change in rating of {} with {:.2f}% probability.
'''.format(sigma, decreased_sigma, delta, decreased_sigma_power[1]*100)
))

_ = plot_shift(runs, delta, sigma*0.1)

In [ ]:
increased_runs = runs * 4
one_factor = pd.DataFrame([ -1, 1 ] * increased_runs, columns=['beans'])
increased_runs_power = dexpy.power.f_power('beans', one_factor, delta/sigma, alpha)

display(Markdown('''
# Power Example - Increase Runs
If expert testers are too expensive, and we are unwilling to compromise
our standards, then the only remaining option is to create a more powerful
design. In this toy example, there isn't much we can do, since it's
just one factor. Increasing the runs from {} to {} gives a power of
{:.2f}%. This may be a more acceptable success rate than the original power
of {:.2f}%, however... that is a lot of coffee to drink.

For more complicated designs changing the structure of the design
can also increase power.
'''.format(runs, increased_runs, increased_runs_power[1]*100, one_factor_power[1]*100)
))
_ = plot_shift(increased_runs, delta, sigma)

In [ ]:
help(dexpy.power.f_power)

In [ ]:
base_point = [-1, -1, -1, -1, -1]
ofat_points = [base_point]

for i in range(0, 5):
    new_point = base_point[:]
    new_point[i] = 1
    ofat_points.append(new_point)

sn = 2.0
alpha = 0.05

ofat_df = pd.DataFrame(ofat_points*5, columns=['amount', 'grind_size', 'brew_time', 'grind_type', 'beans'])
model = ' + '.join(ofat_df.columns)
ofat_power = dexpy.power.f_power('+'.join(ofat_df.columns), ofat_df, sn, alpha)
ofat_power.pop(0) # remove intercept
ofat_power = ['{0:.2f}%'.format(i*100) for i in ofat_power] # convert to %
ofat_power = pd.DataFrame(ofat_power, columns=['Power'], index=ofat_df.columns)

display(Markdown('''
# Calculating Power with dexpy: OFAT
* {} total runs, with a signal to noise ratio of 2.
* Model: `{}`
'''.format(len(ofat_df), model)))

display(PrettyPandas(ofat_power))

In [ ]:
full_design = dexpy.factorial.build_factorial(5, 2**5)
full_design.columns = ['amount', 'grind_size', 'brew_time', 'grind_type', 'beans']
factorial_power = dexpy.power.f_power(model, full_design, sn, alpha)
factorial_power.pop(0)
factorial_power = ['{0:.2f}%'.format(i*100) for i in factorial_power] # convert to %
factorial_power = pd.DataFrame(factorial_power, columns=['Power'], index=full_design.columns)

display(Markdown('''
# Calculating Power with dexpy: Factorial
* {} total runs, with a signal to noise ratio of 2.
* Model (`patsy` for: `{}`
'''.format(len(full_design), model)))
display(PrettyPandas(factorial_power))

Fractional Factorials

  • Coffee experiment is 25 runs (32)
  • We want to add 4 center point runs to check for curvature
  • Total runs = 36, 3 per day if all testers are in the office
  • Estimate experiment will take a month

Fractional Factorials

  • Power for the experiment is > 99%
  • Full factorial is overkill
  • Instead run 25-1 experiments, a "half fraction"

In [ ]:
help(dexpy.factorial.build_factorial)

In [ ]:
coffee_design = dexpy.factorial.build_factorial(5, 2**(5-1))
coffee_design.columns = ['amount', 'grind_size', 'brew_time', 'grind_type', 'beans']
center_points = [
    [0, 0, 0, -1, -1],
    [0, 0, 0, -1, 1],
    [0, 0, 0, 1, -1],
    [0, 0, 0, 1, 1]
]

coffee_design = coffee_design.append(pd.DataFrame(center_points * 2, columns=coffee_design.columns))
coffee_design.index = np.arange(0, len(coffee_design))

actual_design = coded_to_actual(coffee_design, actual_lows, actual_highs)

display(Markdown("## 2<sup>(5-1)</sup> Factorial Design"))
display(PrettyPandas(actual_design))

In [ ]:
model = ' + '.join(coffee_design.columns)
factorial_power = dexpy.power.f_power(model, coffee_design, sn, alpha)
factorial_power.pop(0)
factorial_power = ['{0:.2f}%'.format(i*100) for i in factorial_power] # convert to %
factorial_power = pd.DataFrame(factorial_power, columns=['Power'], index=coffee_design.columns)

display(Markdown('''
## 2<sup>(5-1)</sup> Factorial Power
* Power for {} total runs
* Signal to noise ratio of 2
* Model: `{}`
'''.format(len(coffee_design), model)))
display(PrettyPandas(factorial_power))

In [ ]:
twofi_model = "(" + '+'.join(coffee_design.columns) + ")**2"
desc = patsy.ModelDesc.from_formula(twofi_model)
factorial_power = dexpy.power.f_power(twofi_model, coffee_design, sn, alpha)
factorial_power.pop(0)
factorial_power = ['{0:.2f}%'.format(i*100) for i in factorial_power] # convert to %
factorial_power = pd.DataFrame(factorial_power, columns=['Power'], index=desc.describe().strip("~ ").split(" + "))

display(Markdown('''
## 2<sup>(5-1)</sup> Factorial Power
* Power for {} total runs
* Signal to noise ratio of 2
* Model: `{}`
'''.format(len(coffee_design), twofi_model)))
display(Markdown("## Power for {} total runs, with a signal to noise ratio of 2.".format(len(coffee_design))))
display(PrettyPandas(factorial_power))

In [ ]:
display(Markdown('''
# Aliasing

When you don't run all combinations of inputs, you lose the ability to estimate terms
For example, if you have three inputs the full model matrix looks like this:
'''))

three_factor_design = dexpy.factorial.build_factorial(3, 8)
X = patsy.dmatrix("(" + " + ".join(three_factor_design.columns) + ")**3", three_factor_design, return_type='dataframe')
display(PrettyPandas(X))

In [ ]:
display(Markdown('''
## Aliasing

If we remove half the runs, so that the design is 2<sup>3-1</sup>:
'''))

X.loc[[0,3,5,6]] = ''
display(PrettyPandas(X))
      
display(Markdown('''
You can see that A*B*C never changes. In addition, A = BC and B = AC.
When a term is a linear combination of another term that is called an **alias**. Aliased terms are unestimable.
'''))

Calculating Aliases in dexpy

dexpy can calculate the terms that are linear combinations of other terms.

https://statease.github.io/dexpy/evaluation.html#alias-list


In [ ]:
help(dexpy.alias.alias_list)

In [ ]:
display(Markdown('''
## Calculating Aliases in dexpy

Here is what that alias list looks like for our Coffee Experiment:
'''))
    
full_model = "(" + '+'.join(coffee_design.columns) + ")**3"
aliases, _ = dexpy.alias.alias_list(full_model, coffee_design)
for a in aliases:
    print(a)
    
display(Markdown('''
As you can see, we lose the ability to estimate some three factor interactions separately from the two factor interactions.
This is not a cause for great concern as the three factor interactions are rare in practice.
'''))

Analysis

  • statsmodels has lots of routines for modeling data
  • We will use Ordinary Least Squares (OLS) to fit
  • statsmodels typically takes numpy arrays for X and y data
  • It also has a "formulas" api that accepts a patsy formula

In [ ]:
# enter response data here

coffee_design['taste_rating'] = [
    4.53, 1.6336, 1.363, 8.7, 1.679, 2.895, 7.341, 3.642, # A low
    6.974, 3.398, 3.913, 9.04, 5.092, 3.718, 8.227, 6.992, # A high
    4.419, 6.806, 3.512, 5.36, 4.865, 6.342, 4.38, 5.942 # center points
]

In [ ]:
lm = statsmodels.formula.api.ols("taste_rating ~" + twofi_model, data=coffee_design).fit()
print(lm.summary2())

In [ ]:
reduced_model = "amount + grind_size + brew_time + grind_type + beans + amount:beans + grind_size:brew_time + grind_size:grind_type"
lm = statsmodels.formula.api.ols("taste_rating ~" + reduced_model, data=coffee_design).fit()
print(lm.summary2())

Visualization

  • seaborn is built on top of matplotlib and adds support for pandas dataframes
  • Can build a plot using seaborn, then manipulate it with matplotlib
  • Default themes look a lot nicer than what you get from matplotlib out of the box

In [ ]:
display(Markdown('''
If we take the experiment data from the design and use our new model to fit that data, then plot it against
the observed values we can get an idea for how well our model predicts. Points above the 45 degree line are
overpredicting for that combination of inputs. Points below the line predict a lower taste rating than
we actually measured during the experiment.'''))

actual_predicted = pd.DataFrame({ 'actual': coffee_design['taste_rating'],
                                  'predicted': lm.fittedvalues
                                }, index=np.arange(len(coffee_design['taste_rating'])))
fg = sns.FacetGrid(actual_predicted, size=5)
fg.map(plt.scatter, 'actual', 'predicted')
ax = fg.axes[0, 0]
ax.plot([1, 10], [1, 10], color='g', lw=2)
ax.set_xticks(np.arange(1, 11))
ax.set_xlim([0, 11])
ax.set_yticks(np.arange(1, 11))
ax.set_title('Actual vs Predicted')
_ = ax.set_ylim([0, 11])

In [ ]:
display(Markdown('''
Plotting the prediction for two factors at once shows how they interact with each other.
In this graph you can see that at the low brew time the larger grind size results in
a poor taste rating, likely because the coffee is too weak.'''))

pred_points = pd.DataFrame(1, columns = coffee_design.columns, index=np.arange(4))
pred_points.loc[1,'grind_size'] = -1
pred_points.loc[3,'grind_size'] = -1
pred_points.loc[2,'brew_time'] = -1
pred_points.loc[3,'brew_time'] = -1
pred_points['taste_rating'] = lm.predict(pred_points)
pred_points = coded_to_actual(pred_points, actual_lows, actual_highs)

fg = sns.factorplot('grind_size', 'taste_rating', hue='brew_time', kind='point', data=pred_points)
ax = fg.axes[0, 0]
ax.set_xticklabels(get_tick_labels('grind_size', actual_lows, actual_highs, units))
_ = ax.set_title('Grind Size/Brew Time Interaction')

In [ ]:
display(Markdown('''
This graph contains the prediction with the highest taste rating, 7.72.
However, if you look at the dark bean line there is a point where we can get
a rating of 6.93 with 2.5oz of grounds.
'''))

pred_points = pd.DataFrame(1, columns = coffee_design.columns, index=np.arange(4))
pred_points.loc[1,'amount'] = -1
pred_points.loc[3,'amount'] = -1
pred_points.loc[2,'beans'] = -1
pred_points.loc[3,'beans'] = -1
pred_points['taste_rating'] = lm.predict(pred_points)
pred_points = coded_to_actual(pred_points, actual_lows, actual_highs)

fg = sns.factorplot('amount', 'taste_rating', hue='beans', kind='point', palette={'dark': 'maroon', 'light': 'peru'}, data=pred_points)
ax = fg.axes[0, 0]
ax.set_xticklabels(get_tick_labels('amount', actual_lows, actual_highs, units))
ax.set_title('Amount/Beans Interaction')
plt.show()

display(PrettyPandas(pred_points))
display(Markdown('''That savings of 1.5oz per pot would create a nice surplus in the coffee budget at the end of the year.'''))

The End