In [1]:
%matplotlib inline

In [2]:
from collections import namedtuple
import numpy as np
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sbn

In [3]:
# set some seaborn aesthetics
sbn.set_palette("Set1")

In [4]:
# initialize random seed for reproducibility
np.random.seed(20160404)

Sums of squares functions

Let's start by writing a set of functions for calculating sums of squared deviations from the mean (also called "sums of squared differences"), or "sums-of-squares" for short.


In [5]:
def sum_squares_total(groups):
    """Calculate total sum of squares ignoring groups.
    
    groups should be a sequence of np.arrays representing the samples asssigned 
    to their respective groups.
    """
    allobs = np.ravel(groups)  # np.ravel collapses arrays or lists into a single list
    grandmean = np.mean(allobs)
    return np.sum((allobs - grandmean)**2)
    

def sum_squares_between(groups):
    """Between group sum of squares"""
    ns = np.array([len(g) for g in groups])
    grandmean = np.mean(np.ravel(groups))
    groupmeans = np.array([np.mean(g) for g in groups])
    return np.sum(ns * (groupmeans - grandmean)**2)


def sum_squares_within(groups):
    """Within group sum of squares"""
    groupmeans = np.array([np.mean(g) for g in groups])
    group_sumsquares = []
    for i in range(len(groups)):
        groupi = np.asarray(groups[i])
        groupmeani = groupmeans[i]
        group_sumsquares.append(np.sum((groupi - groupmeani)**2))
    return np.sum(group_sumsquares)
        
    
def degrees_freedom(groups):
    """Calculate the """
    N = len(np.ravel(groups))
    k = len(groups)
    return (k-1, N - k, N - 1)

In [6]:
def ANOVA_oneway(groups):
    index = ['BtwGroup', 'WithinGroup', 'Total']
    cols = ['df', 'SumSquares','MS','F','pval']
    
    df   = degrees_freedom(groups)
    ss   = sum_squares_between(groups), sum_squares_within(groups), sum_squares_total(groups)
    ms   = ss[0]/df[0], ss[1]/df[1], ""
    F    = ms[0]/ms[1], "", ""
    pval = stats.f.sf(F[0], df[0], df[1]), "", ""
    
    tbl = pd.DataFrame(index=index, columns=cols)
    tbl.index.name = 'Source'
    tbl.df = df
    tbl.SumSquares = ss
    tbl.MS = ms
    tbl.F = F
    tbl.pval = pval
    
    return tbl
    
def ANOVA_R2(anovatbl):
    SSwin = anovatbl.SumSquares[1]
    SStot = anovatbl.SumSquares[2]
    return (1.0 - (SSwin/SStot))

Simulate ANOVA under the null hypothesis of no difference in group means


In [7]:
## simulate one way ANOVA under the null hypothesis of no 
## difference in group means

groupmeans = [0, 0, 0, 0]
k = len(groupmeans) # number of groups
groupstds = [1] * k  # standard deviations equal across groups
n = 25 # sample size

# generate samples
samples = [stats.norm.rvs(loc=i, scale=j, size = n) for (i,j) in zip(groupmeans,groupstds)]
allobs = np.concatenate(samples)

Draw a figure to illustrate the within group distributions and the total distribution.


In [8]:
sbn.set_palette("deep")

bins = np.linspace(-3, 3, 10)
fig, axes = plt.subplots(1, 5, figsize=(20,5), sharex=True, sharey=True)

for i, sample in enumerate(samples):
    axes[i].hist(sample, bins=bins, histtype='stepfilled', 
                 linewidth=1.5, label='sample {}'.format(i+1))
    axes[i].legend()

axes[-1].hist(allobs, bins=bins, histtype='stepfilled', linewidth=2, label='all data')
axes[-1].legend()
axes[-1].set_title("All data combined", fontsize=20)

axes[0].set_ylabel("Frequency", fontsize=20)
axes[2].set_xlabel("X", fontsize=20)

fig.tight_layout()
pass


Calculate the sums of squares


In [9]:
SSbtw = sum_squares_between(samples)
SSwin = sum_squares_within(samples)
SStot = sum_squares_total(samples)

print("SS between:", SSbtw)
print("SS within:", SSwin)
print("SS total:", SStot)


SS between: 3.38988403571
SS within: 73.5951590182
SS total: 76.9850430539

Generate ANOVA table:


In [10]:
ANOVA_oneway(samples)


Out[10]:
df SumSquares MS F pval
Source
BtwGroup 3 3.389884 1.12996 1.47396 0.226502
WithinGroup 96 73.595159 0.766616
Total 99 76.985043

Simulating ANOVA under $H_A$


In [11]:
groupmeans = [0, 0, -1, 1]
k = len(groupmeans) # number of groups
groupstds = [1] * k  # standard deviations equal across groups
n = 25 # sample size

# generate samples
samples = [stats.norm.rvs(loc=i, scale=j, size = n) for (i,j) in zip(groupmeans,groupstds)]
allobs = np.concatenate(samples)

In [12]:
sbn.set_palette("deep")

bins = np.linspace(-3, 3, 10)
fig, axes = plt.subplots(1, 5, figsize=(20,5), sharex=True, sharey=True)

for i, sample in enumerate(samples):
    axes[i].hist(sample, bins=bins, histtype='stepfilled', 
                 linewidth=1.5, label='sample {}'.format(i+1))
    axes[i].legend()

axes[-1].hist(allobs, bins=bins, histtype='stepfilled', linewidth=2, label='all data')
axes[-1].legend()
axes[-1].set_title("All data combined", fontsize=20)

axes[0].set_ylabel("Frequency", fontsize=20)
axes[2].set_xlabel("X", fontsize=20)

fig.tight_layout()
pass



In [13]:
SSbtw = sum_squares_between(samples)
SSwin = sum_squares_within(samples)
SStot = sum_squares_total(samples)

print("SS between:", SSbtw)
print("SS within:", SSwin)
print("SS total:", SStot)


SS between: 58.3506974287
SS within: 103.063491836
SS total: 161.414189265

In [14]:
tbl = ANOVA_oneway(samples)
tbl


Out[14]:
df SumSquares MS F pval
Source
BtwGroup 3 58.350697 19.4502 18.1172 2.14268e-09
WithinGroup 96 103.063492 1.07358
Total 99 161.414189

Calculate the value of $R^2$ for our ANOVA model:


In [15]:
ANOVA_R2(tbl)


Out[15]:
0.36149670418983804

scipy.stats has an f_oneway function for calculating the f statistic and assocated p-value for a one-way ANOVA. Let's compare oure result above to the implementation in scipy.stats


In [16]:
# f_oneway expects the samples to be passed in as individual arguments
# this *samples notation "unpacks" the list of samples, treating each as 
# an argument to the function.
# see: https://docs.python.org/3/tutorial/controlflow.html#unpacking-argument-lists

stats.f_oneway(*samples)


Out[16]:
F_onewayResult(statistic=18.117204108393736, pvalue=2.1426767523356996e-09)

Anderson's Iris Data revisited


In [17]:
irisurl = "https://raw.githubusercontent.com/Bio204-class/bio204-datasets/master/iris.csv"
iris = pd.read_csv(irisurl)

In [18]:
iris.head()


Out[18]:
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa

In [19]:
sbn.distplot(iris["Sepal.Length"])
pass



In [20]:
sbn.violinplot(x="Species", y="Sepal.Length", data=iris)
pass



In [21]:
setosa = iris[iris.Species =='setosa']
versicolor = iris[iris.Species=='versicolor']
virginica = iris[iris.Species == 'virginica']

In [22]:
ANOVA_oneway([setosa['Sepal.Length'], versicolor['Sepal.Length'],
            virginica['Sepal.Length']])


Out[22]:
df SumSquares MS F pval
Source
BtwGroup 2 63.212133 31.6061 119.265 1.66967e-31
WithinGroup 147 38.956200 0.265008
Total 149 102.168333

In [ ]: