notebook.community

Edit and run



In [ ]:

    
# Do imports and define a plotting function

import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline

import numpy as np
from scipy.stats import norm

def plot_confidence_dist(ax, data, x, x_full, m, n, mean, sd, color, y_max):
    ax.plot(x_full+mean, norm.pdf(x_full, scale=sd), 'k')
    ax.fill_between(x+mean, norm.pdf(x, scale=sd), 0*x, np.ones(m), facecolor=color, alpha=0.5)
    sH = ax.scatter(data, np.zeros(n), c='k')
    sample_mean = np.mean(data)
    mH, = ax.plot((sample_mean, sample_mean), (-0.1, y_max+0.1),  'g' , linestyle='--')
    tH, = ax.plot((0, 0), (-0.1, y_max+0.1),  'k')
    proxyArtist = matplotlib.patches.Rectangle((0,0), 0,0, facecolor=color, alpha=0.5)
    return sH, tH, mH, proxyArtist



In [229]:

    
# Demonstrate the symmetry of a standard hypothesis test and confidence interval
# when using a normal approximation

# generate some data
n = 20
alpha = 0.05
true_mean = 0
data = norm.rvs(loc=true_mean, size=n)
mean = np.mean(data)
se = np.sqrt(data.var()/n)
z = norm.ppf(1-alpha/2.0)
p = 2*norm.cdf(-np.abs(mean/se))

# choose a reasonable plot size
y_max = norm.pdf(0, scale=se)
x_range_abs = z*se*3
x_full = np.arange(-x_range_abs, x_range_abs, 0.01)
x_alpha = np.arange(true_mean - z * se, true_mean + z * se + 0.01, 0.01)
m = len(x_alpha)

# generate the plots
f, (ax1, ax2, ax3) = plt.subplots(3, sharex=True, sharey=True, figsize=(4,7))
sH, tH, mH, proxyArtistBlue = plot_confidence_dist(ax1, data, x_alpha, x_full, m, n, true_mean, se, 'blue', y_max)
sH, tH, mH, proxyArtistRed = plot_confidence_dist(ax2, data, x_alpha, x_full, m, n, mean, se, 'red', y_max)
plot_confidence_dist(ax3, data, x_alpha, x_full, m, n, true_mean, se, 'blue', y_max)
plot_confidence_dist(ax3, data, x_alpha, x_full, m, n, mean, se, 'red', y_max)

f.subplots_adjust(hspace=0)
f.subplots_adjust(wspace=0)

ax1.text((-x_range_abs*1.2+min(mean,0))*0.9, y_max*0.9, 'p={:0.2f}'.format(p))
ax3.set_xlim(-x_range_abs*1.2+min(mean,0), x_range_abs*1.2+max(mean,0))
ax3.set_ylim(-0.1, y_max+0.1)
lgd = ax3.legend([sH, tH, mH, proxyArtistBlue, proxyArtistRed],
           ['samples', 'true mean', 'sample mean', 'p > 0.05', '95% CI'],
           loc='upper center', bbox_to_anchor=(0.5, -0.13), ncol=2)



In [243]:

    
# Further demonstrate this relationship, and the probability have having a false discovery,
# using repeated simulations

n = 100
n_trials = 20
alpha = 0.05

pvals = []
means = []
intervals = np.zeros([n_trials, 2])
points = np.zeros([n_trials, n])

# generate data, with a p-value and confidence interval for each
for i in range(n_trials):
    X = np.random.randn(n)
    points[i,:] = X
    
    mean = X.mean()
    se = np.sqrt(X.var()/float(n))
    z = norm.ppf(1-alpha/2.0)
    C_lower = mean - se * z
    C_upper = mean + se * z
    p = 2*norm.cdf(-np.abs(mean/se))
    
    pvals.append(p)
    means.append(mean)
    intervals[i,:] = [C_lower, C_upper]

order = np.argsort(means)

# generate plots
f, (ax1, ax2) = plt.subplots(2, sharex=True, sharey=False, figsize=(4, 7))

for i in range(n_trials):
    ax1.plot(intervals[order[i],:], (i,i), c='k')
    ax1.scatter(means[order[i]], i, c='k')
    #ax1.text(np.min(intervals)-0.13, i-0.02, 'p={:0.2f}'.format(pvals[order[i]]))
ax1.plot((0, 0), (-1, n_trials), c='b')
ax1.set_ylabel('Trial')
ax1.set_ylim(-1,n_trials)

for i in range(n_trials):
    x = pvals[order[i]]
    iH, = ax2.plot(intervals[order[i],:], (x, x), markersize=1, c='k')  
    mH = ax2.scatter(means[order[i]], x, c='k')
tH, = ax2.plot((0,0), (0,1), c='b')
pH, = ax2.plot((np.min(intervals)-0.15, np.max(intervals)+0.05), (0.05, 0.05), c='r', linestyle='--')
ax2.set_ylabel('p-value')
ax2.set_ylim(0, 1)

f.subplots_adjust(hspace=0)
#ax2.set_xlim(np.min(intervals)-0.15, np.max(intervals)+0.05)
ax2.set_xlim(np.min(intervals)-0.05, np.max(intervals)+0.05)
ax2.set_xlabel('Mean estimate (95% CI)')
lgd = ax2.legend([mH, iH, tH, pH],
           ['sample mean', '95% CI', 'true mean', 'p=0.05'],
           loc='upper center', bbox_to_anchor=(0.5, -0.13), ncol=2)



In [242]:

    
plt.figure(figsize=(4,4))
n_draws = 5
for j in range(n_draws):
    X = np.random.random(size=n_trials)
    for i in range(n_trials):
        plt.plot((j-0.3,j+0.3),(X[i], X[i]), c='k')
    plt.xlabel('Repitition')
plt.ylim(0, 1.0)
plt.xlim(-0.5, n_draws-0.5)



In [ ]: