In [2]:
__author__ = 'Javier Garcia-Bernardo 2014'

# Math
import numpy as np

# Stats
import scipy

# Plot
import pylab as plt
import matplotlib

# Multiple comparisons
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Kernel Estimation
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KernelDensity

# Bootstrap
import scikits.bootstrap as bootstrap

%matplotlib inline

def customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8):
    '''
    From stackoverflow. User gcalmettes 
    '''
    for c_spine, spine in zip([c_left, c_bottom, c_right, c_top],
                              ['left', 'bottom', 'right', 'top']):
        if c_spine != 'none':
            ax.spines[spine].set_color(c_spine)
            ax.spines[spine].set_linewidth(lw)
        else:
            ax.spines[spine].set_color('none')
    if (c_bottom == 'none') & (c_top == 'none'): # no bottom and no top
        ax.xaxis.set_ticks_position('none')
    elif (c_bottom != 'none') & (c_top != 'none'): # bottom and top
        ax.tick_params(axis='x', direction='out', width=lw, length=7,
                      color=c_bottom, labelsize=size, pad=pad)
    elif (c_bottom != 'none') & (c_top == 'none'): # bottom but not top
        ax.xaxis.set_ticks_position('bottom')
        ax.tick_params(axis='x', direction='out', width=lw, length=7,
                       color=c_bottom, labelsize=size, pad=pad)
    elif (c_bottom == 'none') & (c_top != 'none'): # no bottom but top
        ax.xaxis.set_ticks_position('top')
        ax.tick_params(axis='x', direction='out', width=lw, length=7,
                       color=c_top, labelsize=size, pad=pad)
    if (c_left == 'none') & (c_right == 'none'): # no left and no right
        ax.yaxis.set_ticks_position('none')
    elif (c_left != 'none') & (c_right != 'none'): # left and right
        ax.tick_params(axis='y', direction='out', width=lw, length=7,
                       color=c_left, labelsize=size, pad=pad)
    elif (c_left != 'none') & (c_right == 'none'): # left but not right
        ax.yaxis.set_ticks_position('left')
        ax.tick_params(axis='y', direction='out', width=lw, length=7,
                       color=c_left, labelsize=size, pad=pad)
    elif (c_left == 'none') & (c_right != 'none'): # no left but right
        ax.yaxis.set_ticks_position('right')
        ax.tick_params(axis='y', direction='out', width=lw, length=7,
                       color=c_right, labelsize=size, pad=pad)

Interaction between parameters (Observed vs Expected).

Chi Square

About 25 percent of the U.S. scientific workforce consists of foreign-born scientists, in both industry and academia. We have 3 Americans and 4 Internationals. So the obvious question is: Does Mary hate Americans?

In [4]:
%%latex
$$  \frac{\sum{(Obs - Exp)^2}}{Exp}$$


$$ \frac{\sum{(Obs - Exp)^2}}{Exp}$$

In [40]:
print("p-val = ", scipy.stats.chisquare(f_obs = [3.,4.], f_exp = [7.*0.75,7.*0.25])[1])


('p-val = ', 0.049534613435626491)
YES! But wait... This test is invalid when the observed or expected frequencies in each category are too small. A typical rule is that all of the observed and expected frequencies should be at least 5.

Fisher's exact test


In [41]:
print("p-val Exact test =", scipy.stats.fisher_exact([[3, 75000], [4, 25000]])[1])


('p-val Exact test =', 0.070570483471531401)
Math says Mary doesn't hate Americans... until Jess leaves:

In [42]:
print("p-val Exact test =", scipy.stats.fisher_exact([[2, 75000], [4, 25000]])[1])


('p-val Exact test =', 0.03760622581012793)

Compare 2 independent samples

Imagine we want to compare if there are difference in GFP expression between two populations of cells.

In [43]:
def median_test(samples1, samples2):
  """Adapted from https://gist.github.com/wanderview/8285920"""
  median = np.median(samples1 + samples2)
  samples1 = np.asarray(samples1)
  samples2 = np.asarray(samples2)
  below1, above1 = np.sum(samples1<=median),np.sum(samples1>median)
  below2, above2 = np.sum(samples2<=median),np.sum(samples2>median)
    
  below_exp = (below1 + below2) / 2
  above_exp = (above1 + above2) / 2

  result = scipy.stats.chisquare(
    [below1, above1, below2, above2],
    f_exp=[below_exp, above_exp, below_exp, above_exp]
  )

  return result[1]

In [44]:
x = [np.random.randn() for _ in range(200)]
    y = [np.random.randn()+0.6 for _ in range(200)]
    ax = plt.subplot(1,1,1)
    ax.hist(x,bins=np.linspace(np.min(x+y),np.max(x+y),25),alpha=0.3,histtype='stepfilled')
    ax.hist(y,bins=np.linspace(np.min(x+y),np.max(x+y),25),alpha=0.3,histtype='stepfilled')
    ax.set_xlabel('Value',size=20)
    ax.set_ylabel('Freq',size=20)
    customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
    plt.show()


Check Variance

In [45]:
print("pval Equal-Variance = ",scipy.stats.levene(x,y)[1])


('pval Equal-Variance = ', 0.34631004235050145)

Equal variance. USE: Mann-Whitneyu test

Use only when the number of observation in each sample is > 20 and you have 2 independent samples of ranks. Mann-Whitney U is significant if the u-obtained is LESS THAN or equal to the critical value of U.

In [46]:
print("Mann-Whitney U test.", scipy.stats.mannwhitneyu(x,y,use_continuity=True)[1]*2)
print("T-test. Only if normality", scipy.stats.ttest_ind(x,y)[1])
print("Median test", median_test(x,y))


('Mann-Whitney U test.', 3.337747132801931e-09)
('T-test. Only if normality', 1.7455489694871835e-10)
('Median test', 1.5440498291101365e-05)

In [8]:
x = [np.random.randn() for _ in range(200)]
    y = [np.random.randn()*2+0.2 for _ in range(200)]
    ax = plt.subplot(1,1,1)
    ax.hist(x,bins=np.linspace(np.min(x+y),np.max(x+y),25),alpha=0.3,histtype='stepfilled')
    ax.hist(y,bins=np.linspace(np.min(x+y),np.max(x+y),25),alpha=0.3,histtype='stepfilled')
    ax.set_xlabel('Value',size=20)
    ax.set_ylabel('Freq',size=20)
    customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
    plt.show()


Non-Equal Variance and Normality. USE: T-test

Check Normality The Shapiro-Wilk test tests the null hypothesis that the data was drawn from a normal distribution. Bad if sample is big (small differences make it non-normal). QQ plot

In [13]:
print(scipy.stats.levene(x,y)[1])
print(scipy.stats.shapiro(x))
print(scipy.stats.shapiro(y))
plt.figure(1)
ax = plt.subplot(1,1,1)
(osm, osr),(slope, intercept, r)  = scipy.stats.probplot(x, dist='norm', plot=None)
ax.plot(osm, osr, 'o', osm, slope*osm + intercept)
ax.set_xlabel('Quantiles',fontsize=20)
ax.set_ylabel('Quantiles Obs',fontsize=20)
customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
plt.show()

ax2 = plt.subplot(1,1,1)
(osm, osr),(slope, intercept, r)  = scipy.stats.probplot(y, dist='norm', plot=None)
ax2.plot(osm, osr, 'o', osm, slope*osm + intercept)
ax2.set_xlabel('Quantiles',fontsize=20)
ax2.set_ylabel('Quantiles Obs',fontsize=20)
customaxis(ax2, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
plt.show()


3.45647037494e-13
(0.9944772124290466, 0.6720660328865051)
(0.9897152781486511, 0.16231410205364227)

In [49]:
print("Mann-Whitney U test. NOP", scipy.stats.mannwhitneyu(x,y)[1]*2)
print("Unpaired T-test", scipy.stats.ttest_ind(x,y,equal_var=False)[1])
print("Median test", median_test(x,y))


('Mann-Whitney U test. NOP', 0.087346603341420845)
('Unpaired T-test', 0.25717460096222738)
('Median test', 0.18389510355041561)

Non-Equal variance and Non-Normality. USE: Median test


In [50]:
x = [np.random.lognormal(1,1.3) for _ in range(200)]
    y = [np.random.lognormal(1.5,1) for _ in range(200)]
    
    ax = plt.subplot(1,1,1)
    ax.hist(x,bins=np.linspace(np.min(x+y),np.max(x+y),25),alpha=0.3,histtype='stepfilled')
    ax.hist(y,bins=np.linspace(np.min(x+y),np.max(x+y),25),alpha=0.3,histtype='stepfilled')
    ax.set_xlabel('Value',size=20)
    ax.set_ylabel('Freq',size=20)
    customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
    plt.show()



In [51]:
print('Mann-Whitney U test test. NO', scipy.stats.mannwhitneyu(x,y)[1]*2)
print('T-test. NO', scipy.stats.ttest_ind(x,y)[1])
print('Median test', median_test(x,y))


('Mann-Whitney U test test. NO', 0.0087839417344925171)
('T-test. NO', 0.63251493042679674)
('Median test', 0.58075022224762773)

Compare 2 paired samples

Imagine we want to check if there are difference in GFP expression after we add salicylate in non-growing cells.

USE: Wilcoxon signed-rank test


In [52]:
x = [np.random.randn() for _ in range(200)]
    y = [np.random.randn()+0.2 for _ in range(200)]
    ax = plt.subplot(1,1,1)
    ax.hist(x,bins=np.linspace(np.min(x+y),np.max(x+y),25),alpha=0.3,histtype='stepfilled')
    ax.hist(y,bins=np.linspace(np.min(x+y),np.max(x+y),25),alpha=0.3,histtype='stepfilled')
    ax.set_xlabel('Value',size=20)
    ax.set_ylabel('Freq',size=20)
    customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
    plt.show()


The Wilcoxon signed-rank test tests the null hypothesis that two related paired samples come from the same distribution

In [53]:
print("Wilcoxon Signed-Rank test. Only if equal variance", scipy.stats.wilcoxon(x,y)[1])
print("Mann-Whitney U test NO",scipy.stats.mannwhitneyu(x,y)[1]*2)
print("Paired t-test. Only if normality and equal variance",scipy.stats.ttest_rel(x,y)[1])
print("Unpaired t-test NO",scipy.stats.ttest_ind(x,y)[1])


('Wilcoxon Signed-Rank test. Only if equal variance', 0.0064134158222216931)
('Mann-Whitney U test NO', 0.0023466572817885658)
('Paired t-test. Only if normality and equal variance', 0.005920142900373267)
('Unpaired t-test NO', 0.004349070916356835)

Compare >2 independent samples. Same mean?

Imagine we want to check if there are difference in GFP expression in three different versions of a promoter.

ANOVA (Parametric) or Kruskal-Wallis (Non parametric)

Analysis of variance (ANOVA) is a collection of statistical models used to analyze the differences between group means and their associated procedures (such as "variation" among and between groups). T-test

The Kruskal–Wallis one-way analysis of variance by ranks (named after William Kruskal and W. Allen Wallis) is a non-parametric method for testing whether samples originate from the same distribution. Mann-Whitney-U


In [54]:
x = [np.random.randn() for _ in range(200)]
    y = [np.random.randn()+0.2 for _ in range(200)]
    z = [np.random.randn()+0.4 for _ in range(200)]
    ax = plt.subplot(1,1,1)
    ax.hist(x,bins=np.linspace(np.min(x+y+z),np.max(x+y+z),25),alpha=0.3,histtype='stepfilled')
    ax.hist(y,bins=np.linspace(np.min(x+y+z),np.max(x+y+z),25),alpha=0.3,histtype='stepfilled')
    ax.hist(z,bins=np.linspace(np.min(x+y+z),np.max(x+y+z),25),alpha=0.3,histtype='stepfilled')
    ax.set_xlabel('Value',size=20)
    ax.set_ylabel('Freq',size=20)
    customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
    plt.show()



In [55]:
f_val, p_val = scipy.stats.f_oneway(x, y, z)    
print "One-way ANOVA P =", p_val  
f_val, p_val = scipy.stats.mstats.kruskalwallis(x,y,z)
print "One-way Kruskal Wallis P =", p_val


One-way ANOVA P = 3.03067030652e-08
One-way Kruskal Wallis P = 2.7064432518e-08

In [56]:
np.random.seed(50)
    x = [np.random.randn() for _ in range(200)]
    y = [np.random.randn()+0.1 for _ in range(200)]
    z = [np.random.randn()+0.1 for _ in range(200)]
    i = [np.random.randn()+0.1 for _ in range(200)]
    j = [np.random.randn()+0.2 for _ in range(200)]
    ax = plt.subplot(1,1,1)
    ax.hist(x,bins=np.linspace(np.min(x+y+z+i+j),np.max(x+y+z+i+j),25),alpha=0.3,histtype='stepfilled')
    ax.hist(y,bins=np.linspace(np.min(x+y+z+i+j),np.max(x+y+z+i+j),25),alpha=0.3,histtype='stepfilled')
    ax.hist(z,bins=np.linspace(np.min(x+y+z+i+j),np.max(x+y+z+i+j),25),alpha=0.3,histtype='stepfilled')
    ax.hist(i,bins=np.linspace(np.min(x+y+z+i+j),np.max(x+y+z+i+j),25),alpha=0.3,histtype='stepfilled')
    ax.hist(j,bins=np.linspace(np.min(x+y+z+i+j),np.max(x+y+z+i+j),25),alpha=0.3,histtype='stepfilled')
    ax.set_xlabel('Value',size=20)
    ax.set_ylabel('Freq',size=20)
    customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
    plt.show()



In [57]:
f_val, p_val = scipy.stats.f_oneway(x, y, z,i,j)    
print "One-way ANOVA P =", p_val  
f_val, p_val = scipy.stats.mstats.kruskalwallis(x,y,z,i,j)
print "One-way Kruskal Wallis P =", p_val


One-way ANOVA P = 0.0142504270115
One-way Kruskal Wallis P = 0.0258559380048
Resistant to deviation (CLT)

Compare >2 independent samples. Corrections


In [58]:
ax = plt.subplot(1,1,1)
ax.plot(1-(1-0.05)** np.linspace(0,21,11),linewidth=3)
ax.set_xlabel('Prob of screwing up with the labels',size=20)
ax.set_ylabel('Number of plots I make',size=20)
plt.title('Are line plots different than the histograms?',fontsize=14)
customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
plt.show()



In [59]:
ax = plt.subplot(1,1,1)
ax.plot(1-(1-0.025)** np.linspace(0,100,100),linewidth=3)
ax.set_ylabel('Prob of screwing up',size=20)
ax.set_xlabel('Number of trials',size=20)
plt.title('Be careful discarding experiments')
customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
plt.show()


Bonferroni

Corrects for the number of comparisons that you WILL make (Set p-value before the experiment. Don't cheat)

In [60]:
%%latex
$$1 - (1-\alpha_{new})^{trials} = 0.05$$
$$$$
Our Friend Taylor
$$1 - (1-trials \cdot \alpha_{new}) = 0.05$$
$$$$
$$trials \cdot \alpha_{new} = 0.05$$
$$\alpha_{new} = 0.05/trials$$


$$1 - (1-\alpha_{new})^{trials} = 0.05$$ $$$$ Our Friend Taylor $$1 - (1-trials \cdot \alpha_{new}) = 0.05$$ $$$$ $$trials \cdot \alpha_{new} = 0.05$$ $$\alpha_{new} = 0.05/trials$$

In [61]:
print("p_min = 0.05/3 = %f" %(0.05/10)) 
print(scipy.stats.mannwhitneyu(x,y)[1]*2)
print(scipy.stats.mannwhitneyu(x,z)[1]*2)
print(scipy.stats.mannwhitneyu(x,i)[1]*2)
print(scipy.stats.mannwhitneyu(x,j)[1]*2)
print(scipy.stats.mannwhitneyu(y,z)[1]*2)
print(scipy.stats.mannwhitneyu(y,i)[1]*2)
print(scipy.stats.mannwhitneyu(y,j)[1]*2)
print(scipy.stats.mannwhitneyu(z,i)[1]*2)
print(scipy.stats.mannwhitneyu(z,j)[1]*2)
print(scipy.stats.mannwhitneyu(i,j)[1]*2)


p_min = 0.05/3 = 0.005000
0.0951349836324
0.176819928628
0.0515876890194
0.00138320106226
0.692955551073
0.817025225356
0.105130438032
0.500168565389
0.0464220325154
0.187879318583

Dunnett's_test. Compare with control.

Still not implemented in Python. There are some non-official implementations in MATLAB though. Assumes normality You can use bonferroni with p-min = 0.05/(n-1)

Tukey HSD. Best

Assumes normality

In [62]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [63]:
res2 = pairwise_tukeyhsd(np.asarray(x+y+z+i+j),['x']*200+['y']*200+['z']*200+['i']*200+['j']*200)
print res2


Multiple Comparison of Means - Tukey HSD,FWER=0.05
=============================================
group1 group2 meandiff  lower   upper  reject
---------------------------------------------
  0      1     0.1404  -0.1341  0.4149 False 
  0      2    -0.1942  -0.4687  0.0803 False 
  0      3    -0.0337  -0.3081  0.2408 False 
  0      4    -0.1121  -0.3866  0.1623 False 
  1      2    -0.3346  -0.6091 -0.0601  True 
  1      3     -0.174  -0.4485  0.1004 False 
  1      4    -0.2525   -0.527  0.0219 False 
  2      3     0.1606  -0.1139  0.435  False 
  2      4     0.0821  -0.1924  0.3566 False 
  3      4    -0.0785   -0.353  0.196  False 
---------------------------------------------

In [64]:
res2.plot_simultaneous(comparison_name=None,xlabel='diffs',ylabel='grups')


Out[64]:

Compare >2 independent samples and >2 different parameters.

Imagine we want to check if there are difference in GFP expression for 3 different versions of a promoter in three different media. Create models, explaining the contribution of each factor. The two-way ANOVA can not only determine the main effect of contributions of each independent variable but also identifies if there is a significant interaction effect between them.

Fitting distributions

Smooth histograms.

In [65]:
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KernelDensity

# The grid we'll use for plotting
x_grid = np.linspace(-4.5, 3.5, 1000)

# Draw points from a bimodal distribution in 1D
np.random.seed(0)
x = np.concatenate([scipy.stats.distributions.norm(-1, 1.).rvs(400),
                    scipy.stats.distributions.norm(1, 0.3).rvs(100)])

grid = GridSearchCV(KernelDensity(),
                    {'bandwidth': np.linspace(0.1, 1.0, 30)},
                    cv=20) # 20-fold cross-validation
grid.fit(x[:, None])
print grid.best_params_


{'bandwidth': 0.19310344827586207}

In [66]:
kde = grid.best_estimator_
pdf = np.exp(kde.score_samples(x_grid[:, None]))

fig, ax = plt.subplots()
ax.plot(x_grid, pdf, linewidth=3, alpha=0.5, label='bw=%.2f' % kde.bandwidth)
ax.hist(x, 30, fc='gray', histtype='stepfilled', alpha=0.3, normed=True)
ax.set_xlim(-4.5, 3.5)
ax.set_xlabel('Value',fontsize=20)
ax.set_ylabel('Norm Freq',fontsize=20)
customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)


Fitting Adapted from Saullo Castro, StackOverflow

What is the best fit to our distribution? For discrete distributions you can use ChiSquare (Observed and Expected) to see which one fits better. For continuous distribution you can take intervals and use ChiSquare (below).

In [6]:
size = 30000
x = np.linspace(0,15,size)
y = [np.random.lognormal(1,0.8) for _ in range(size)]
h = plt.hist(y, bins=np.linspace(0,15,50),normed=1,fc='gray', histtype='stepfilled', alpha=0.3)
plt.figure(1)
ax = plt.subplot(1,1,1)
dist_names = ['beta','rayleigh', 'norm','lognorm','expon']


for dist_name in dist_names:
    t = []
    dist = getattr(scipy.stats, dist_name)
    param = dist.fit(y)
    pdf_fitted = dist.pdf(x, *param[:-2], loc=param[-2], scale=param[-1])
    ax.plot(x, pdf_fitted, label=dist_name,linewidth=3)
    
    init = h[1][0]
    for value in h[1][1:]:
        t.append(np.mean(pdf_fitted[(x>=init) & (x < value)]))
        init = value

    
    print(dist_name+" Chi stat = ", scipy.stats.chisquare(f_obs = size*np.asarray(h[0]), f_exp = size*np.asarray(t)))
    print(dist_name+" p-val KS = ", scipy.stats.kstest(y,dist_name,args=tuple(param))[1])
    scipy.stats.probplot(y, dist=dist_name,sparams=tuple(param), plot=None)
 

ax.set_xlabel('Value',fontsize=20)
ax.set_ylabel('Norm Freq',fontsize=20)
plt.xlim((0,15))

customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
plt.legend(loc='upper right',fontsize=16,frameon=False)
plt.show()

fig = plt.figure(2)

i = 1
for dist_name in dist_names:
    i += 1
    t = []
    dist = getattr(scipy.stats, dist_name)
    param = dist.fit(y)
    (osm, osr),(slope, intercept, r)  = scipy.stats.probplot(y, dist=dist_name,sparams=tuple(param), plot=None)
    plt.figure(i)
    ax = plt.subplot(1,1,1)
    ax.plot(osm, osr, 'o', osm, slope*osm + intercept)
    ax.set_xlabel('Quantiles',fontsize=20)
    ax.set_ylabel('Quantiles Obs',fontsize=20)
    customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
    plt.legend([dist_name])

    plt.show()


('beta Chi stat = ', (3859.8620361298576, 0.0))
('beta p-val KS = ', 1.6315016931288011e-47)
('rayleigh Chi stat = ', (30796.145832321308, 0.0))
('rayleigh p-val KS = ', 0.0)
('norm Chi stat = ', (52091.666972563071, 0.0))
('norm p-val KS = ', 0.0)
('lognorm Chi stat = ', (178.17135553030289, 7.4331840876546572e-17))
('lognorm p-val KS = ', 0.98944659461564122)
('expon Chi stat = ', (13627.328442541573, 0.0))
('expon p-val KS = ', 5.139587050053985e-318)

Central Limit Theorem


In [13]:
ax = plt.subplot(1,1,1)
ax.hist(np.random.rand(10000),fc='gray', histtype='stepfilled', alpha=0.3)
ax.set_xlabel('Value',size=20)
ax.set_ylabel('Freq',size=20)
customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
plt.show()

x = [np.mean(np.random.rand(100)) for _ in range(10000)]
ax = plt.subplot(1,1,1)
ax.hist(x,fc='gray', histtype='stepfilled', alpha=0.3)
ax.set_xlabel('Value',size=20)
ax.set_ylabel('Freq',size=20)
customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
plt.show()

print('Real STD: ', np.std(np.random.rand(10000)))
print('STD estimated *sqrt(n-1), n = 100 : ', np.std(x)*np.sqrt(100-1))
print('STD estimated *sqrt(n-1), n = 50 : ', np.std([np.mean(np.random.rand(50)) for _ in range(10000)])*np.sqrt(50-1))


('Real STD: ', 0.28883809877618805)
('STD estimated *sqrt(n-1), n = 100 : ', 0.28733455295932581)
('STD estimated *sqrt(n-1), n = 50 : ', 0.28646705134779565)

Bootstrap methods

Calculate CIs

Peasants way. Parametric

Calculate confidence intervals with SEM --> >20 replicates pufff Lower Limit = Mean - (t-statistic for x df)(SEM) Upper Limit = Mean + (t-statistic for x df)(SEM)

In [68]:
%%latex
$$\mu = \left[\mu + t_{1 - \alpha/2,n-1},\mu + t_{\alpha/2,n-1} \right]$$
$$$$
$$\sigma = \left[\sqrt{\frac{(n-1)\cdot \sigma^2}{\chi^2_{1 - \alpha/2,n-1}  }} \sqrt{\frac{(n-1)\cdot \sigma^2}{\chi^2_{\alpha/2,n-1}  }}\right]$$


$$\mu = \left[\mu + t_{1 - \alpha/2,n-1},\mu + t_{\alpha/2,n-1} \right]$$ $$$$ $$\sigma = \left[\sqrt{\frac{(n-1)\cdot \sigma^2}{\chi^2_{1 - \alpha/2,n-1} }} \sqrt{\frac{(n-1)\cdot \sigma^2}{\chi^2_{\alpha/2,n-1} }}\right]$$

In [6]:
x = np.random.randn(20)
print('CI for mean with SEM = ', np.mean(x) + scipy.stats.t.isf([0.025, 0.975], [19])[::-1] *scipy.stats.sem(x))
print('CI for std with SEM = ', np.sqrt(np.var(x)*19/scipy.stats.chi2.isf([0.025, 0.975], [19])))


('CI for mean with SEM = ', array([-0.53678197,  0.49794991]))
('CI for std with SEM = ', array([ 0.81939658,  1.57370483]))

CLT make it easy. Non parametric, taking samples with replacement


In [70]:
import scikits.bootstrap as bootstrap

In [71]:
CIs = bootstrap.ci(x, statfunction=np.mean,n_samples=100000)  
print('CI for mean with bootstrapping = ', CIs)

CIs = bootstrap.ci(x, statfunction=np.std,n_samples=100000)  
print('CI for std with bootstrapping = ', CIs)


('CI for mean with bootstrapping = ', array([-1.00376369,  0.02051702]))
('CI for std with bootstrapping = ', array([ 0.86606787,  1.59350777]))

Anything you want


In [72]:
np.random.seed(105)
size = 100
x = [np.random.lognormal(1,1) for _ in range(size)]
y = (np.random.randn(size)*np.std(x)+np.mean(x))
print('Mean = %f STD = %f  Median = %f' %(np.mean(x), np.std(x), np.median(x)))
print('Mean = %f STD = %f  Median = %f' %(np.mean(y), np.std(y), np.median(y)))
ax = plt.subplot(1,1,1)
ax.hist(x,bins=np.linspace(-5,20,10),alpha=0.3,histtype='stepfilled')
ax.hist(y,bins=np.linspace(-5,20,10),alpha=0.3,histtype='stepfilled')
ax.set_xlabel('Value',fontsize=20)
ax.set_ylabel('Freq',fontsize=20)
customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
plt.show()


Mean = 5.135465 STD = 6.925996  Median = 2.494636
Mean = 6.227468 STD = 6.899373  Median = 6.389744

In [73]:
y2 = [np.median(np.random.choice(y, size=size, replace=True)) for _ in range(100000)]
y2 = np.asarray(y2)
ax = plt.subplot(1,1,1)
ax.hist(y2,bins=np.linspace(2,11,10),alpha=0.3,histtype='stepfilled')
ax.vlines(np.median(x),0,25000,linewidth=3)
ax.set_xlabel('Value',fontsize=20)
ax.set_ylabel('Freq',fontsize=20)
customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
print('Bootstrap: ',float(len(y2[y2<=np.median(x)]))/100000)
print('Median test: ',median_test(x,y))
print('Mann: ',scipy.stats.mannwhitneyu(x,y)[1]*2)
print('T_test: NOT NORMAL!',scipy.stats.ttest_ind(x,y)[1])


('Bootstrap: ', 0.0)
('Median test: ', 0.16201071261586195)
('Mann: ', 0.0089697635496793649)
('T_test: NOT NORMAL!', 0.2677343915603817)
Back to the important question today. Does Mary hate American? BOOTSTRAP

In [9]:
x = np.random.rand(100000)>0.75
size = 10000
simulated = np.asarray([np.mean(np.random.choice(x,size = 7)) for _ in range(size)])
p = float(np.sum(simulated>=(4./7.)))/size
print('p-value: ', p)

print('CI for p with SEM = ', p + scipy.stats.t.isf([0.025, 0.975], [size-1])[::-1] *np.sqrt(p*(1-p)/(size-1)))


('p-value: ', 0.0688)
('CI for p with SEM = ', array([ 0.06383821,  0.07376179]))
Compare with the result of the Fisher test: 0.07057048347153140 We can get confidence intervals with the CLT (It's normally distributed because we calculate p many times)