In [2]:
__author__ = 'Javier Garcia-Bernardo 2014'
# Math
import numpy as np
# Stats
import scipy
# Plot
import pylab as plt
import matplotlib
# Multiple comparisons
from statsmodels.stats.multicomp import pairwise_tukeyhsd
# Kernel Estimation
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KernelDensity
# Bootstrap
import scikits.bootstrap as bootstrap
%matplotlib inline
def customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8):
    '''
    Color (or hide) the four spines of a matplotlib Axes and style its ticks.

    From stackoverflow. User gcalmettes

    Parameters
    ----------
    ax : matplotlib Axes, modified in place.
    c_left, c_bottom, c_right, c_top : color of each spine, or 'none' to hide it.
    lw : spine and tick line width.
    size : tick-label font size.
    pad : padding between ticks and tick labels.
    '''
    # Paint or hide each spine.
    for side, color in (('left', c_left), ('bottom', c_bottom),
                        ('right', c_right), ('top', c_top)):
        ax.spines[side].set_color(color)
        if color != 'none':
            ax.spines[side].set_linewidth(lw)

    # For each axis, decide where its ticks live and which color they take.
    for axis_name, primary, secondary, c_primary, c_secondary in (
            ('x', 'bottom', 'top', c_bottom, c_top),
            ('y', 'left', 'right', c_left, c_right)):
        axis_obj = ax.xaxis if axis_name == 'x' else ax.yaxis
        if c_primary == 'none' and c_secondary == 'none':
            # Neither spine is visible: draw no ticks at all.
            axis_obj.set_ticks_position('none')
            continue
        if c_secondary == 'none':
            # Only the primary (bottom/left) spine is visible.
            axis_obj.set_ticks_position(primary)
            tick_color = c_primary
        elif c_primary == 'none':
            # Only the secondary (top/right) spine is visible.
            axis_obj.set_ticks_position(secondary)
            tick_color = c_secondary
        else:
            # Both spines visible: keep the default tick positions,
            # color the ticks like the primary spine.
            tick_color = c_primary
        ax.tick_params(axis=axis_name, direction='out', width=lw, length=7,
                       color=tick_color, labelsize=size, pad=pad)
In [4]:
%%latex
$$ \chi^2 = \sum{\frac{(Obs - Exp)^2}{Exp}}$$
In [40]:
print("p-val = ", scipy.stats.chisquare(f_obs = [3.,4.], f_exp = [7.*0.75,7.*0.25])[1])
In [41]:
print("p-val Exact test =", scipy.stats.fisher_exact([[3, 75000], [4, 25000]])[1])
In [42]:
print("p-val Exact test =", scipy.stats.fisher_exact([[2, 75000], [4, 25000]])[1])
In [43]:
def median_test(samples1, samples2):
    """Mood's median test: do the two samples share a common median?

    Counts how many observations of each sample fall at-or-below / above the
    pooled median and compares those counts to their expected values with a
    chi-square test.

    Adapted from https://gist.github.com/wanderview/8285920

    Parameters
    ----------
    samples1, samples2 : sequence or ndarray of numbers.

    Returns
    -------
    float : the chi-square p-value.
    """
    # FIX: convert before pooling. The original did `samples1 + samples2`,
    # which concatenates lists but ADDS numpy arrays element-wise, silently
    # producing a wrong pooled median for array inputs.
    samples1 = np.asarray(samples1, dtype=float)
    samples2 = np.asarray(samples2, dtype=float)
    pooled = np.concatenate([samples1, samples2])
    median = np.median(pooled)

    below1, above1 = np.sum(samples1 <= median), np.sum(samples1 > median)
    below2, above2 = np.sum(samples2 <= median), np.sum(samples2 > median)

    # FIX/generalization: expected counts proportional to each sample's size.
    # Reduces to the original halves when the samples are equally sized, but
    # is also correct for unequal sizes and always uses true division (the
    # original "/ 2" floor-divided under Python 2).
    n1, n2 = len(samples1), len(samples2)
    n = n1 + n2
    below_total = below1 + below2
    above_total = above1 + above2
    f_exp = [below_total * n1 / n, above_total * n1 / n,
             below_total * n2 / n, above_total * n2 / n]

    result = scipy.stats.chisquare(
        [below1, above1, below2, above2],
        f_exp=f_exp
    )
    return result[1]
In [44]:
# Two normal samples with the same spread but shifted means (effect = 0.6).
x = [np.random.randn() for _ in range(200)]
y = [np.random.randn()+0.6 for _ in range(200)]
ax = plt.subplot(1,1,1)
# Shared bin edges over the pooled range so the two histograms are comparable.
ax.hist(x,bins=np.linspace(np.min(x+y),np.max(x+y),25),alpha=0.3,histtype='stepfilled')
ax.hist(y,bins=np.linspace(np.min(x+y),np.max(x+y),25),alpha=0.3,histtype='stepfilled')
ax.set_xlabel('Value',size=20)
ax.set_ylabel('Freq',size=20)
customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
plt.show()
In [45]:
print("pval Equal-Variance = ",scipy.stats.levene(x,y)[1])
In [46]:
# Compare the two shifted samples with three tests.
# NOTE(review): mannwhitneyu in this scipy version apparently returns a
# one-sided p-value, hence the *2 to make it two-sided — confirm for the
# installed scipy.
print("Mann-Whitney U test.", scipy.stats.mannwhitneyu(x,y,use_continuity=True)[1]*2)
print("T-test. Only if normality", scipy.stats.ttest_ind(x,y)[1])
print("Median test", median_test(x,y))
In [8]:
# Two normal samples with different spreads (sd 1 vs 2) and a small shift —
# used below to show how unequal variances affect the tests.
x = [np.random.randn() for _ in range(200)]
y = [np.random.randn()*2+0.2 for _ in range(200)]
ax = plt.subplot(1,1,1)
# Shared bin edges over the pooled range so the two histograms are comparable.
ax.hist(x,bins=np.linspace(np.min(x+y),np.max(x+y),25),alpha=0.3,histtype='stepfilled')
ax.hist(y,bins=np.linspace(np.min(x+y),np.max(x+y),25),alpha=0.3,histtype='stepfilled')
ax.set_xlabel('Value',size=20)
ax.set_ylabel('Freq',size=20)
customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
plt.show()
In [13]:
# Check the t-test preconditions on the unequal-variance samples above:
# Levene for equal variances, Shapiro-Wilk (returns (W, p-value)) for the
# normality of each sample.
print(scipy.stats.levene(x,y)[1])
print(scipy.stats.shapiro(x))
print(scipy.stats.shapiro(y))
plt.figure(1)
ax = plt.subplot(1,1,1)
# Normal Q-Q plot of x: probplot (plot=None, default fit) returns the ordered
# theoretical/observed quantiles plus a least-squares fit line.
(osm, osr),(slope, intercept, r) = scipy.stats.probplot(x, dist='norm', plot=None)
ax.plot(osm, osr, 'o', osm, slope*osm + intercept)
ax.set_xlabel('Quantiles',fontsize=20)
ax.set_ylabel('Quantiles Obs',fontsize=20)
customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
plt.show()
# Same Q-Q plot for y; points close to the line suggest normality.
ax2 = plt.subplot(1,1,1)
(osm, osr),(slope, intercept, r) = scipy.stats.probplot(y, dist='norm', plot=None)
ax2.plot(osm, osr, 'o', osm, slope*osm + intercept)
ax2.set_xlabel('Quantiles',fontsize=20)
ax2.set_ylabel('Quantiles Obs',fontsize=20)
customaxis(ax2, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
plt.show()
In [49]:
# Welch's t-test (equal_var=False) is the appropriate t-test here because the
# variances differ. NOTE(review): [1]*2 doubles mannwhitneyu's apparently
# one-sided p-value — confirm for the installed scipy.
print("Mann-Whitney U test. NOP", scipy.stats.mannwhitneyu(x,y)[1]*2)
print("Unpaired T-test", scipy.stats.ttest_ind(x,y,equal_var=False)[1])
print("Median test", median_test(x,y))
In [50]:
# Two lognormal samples: heavily skewed, so normality-based tests don't apply.
x = [np.random.lognormal(1,1.3) for _ in range(200)]
y = [np.random.lognormal(1.5,1) for _ in range(200)]
ax = plt.subplot(1,1,1)
# Shared bin edges over the pooled range so the two histograms are comparable.
ax.hist(x,bins=np.linspace(np.min(x+y),np.max(x+y),25),alpha=0.3,histtype='stepfilled')
ax.hist(y,bins=np.linspace(np.min(x+y),np.max(x+y),25),alpha=0.3,histtype='stepfilled')
ax.set_xlabel('Value',size=20)
ax.set_ylabel('Freq',size=20)
customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
plt.show()
In [51]:
# For strongly skewed data with different shapes the t-test is invalid and
# Mann-Whitney is questionable ("NO" in the labels); the median test remains
# the robust option.
print('Mann-Whitney U test test. NO', scipy.stats.mannwhitneyu(x,y)[1]*2)
print('T-test. NO', scipy.stats.ttest_ind(x,y)[1])
print('Median test', median_test(x,y))
In [52]:
# Two normal samples with a small mean shift (0.2) — used by the next cell
# to contrast paired and unpaired tests.
x = [np.random.randn() for _ in range(200)]
y = [np.random.randn()+0.2 for _ in range(200)]
ax = plt.subplot(1,1,1)
# Shared bin edges over the pooled range so the two histograms are comparable.
ax.hist(x,bins=np.linspace(np.min(x+y),np.max(x+y),25),alpha=0.3,histtype='stepfilled')
ax.hist(y,bins=np.linspace(np.min(x+y),np.max(x+y),25),alpha=0.3,histtype='stepfilled')
ax.set_xlabel('Value',size=20)
ax.set_ylabel('Freq',size=20)
customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
plt.show()
In [53]:
# Paired vs unpaired tests on the small-shift samples. NOTE(review): x and y
# were generated independently above, so the "paired" tests (Wilcoxon signed
# rank, ttest_rel) are shown for illustration only — there is no real pairing.
print("Wilcoxon Signed-Rank test. Only if equal variance", scipy.stats.wilcoxon(x,y)[1])
print("Mann-Whitney U test NO",scipy.stats.mannwhitneyu(x,y)[1]*2)
print("Paired t-test. Only if normality and equal variance",scipy.stats.ttest_rel(x,y)[1])
print("Unpaired t-test NO",scipy.stats.ttest_ind(x,y)[1])
Imagine we want to check whether there are differences in GFP expression among three different versions of a promoter.
Analysis of variance (ANOVA) is a collection of statistical models used to analyze the differences between group means and their associated procedures (such as "variation" among and between groups). It generalizes the t-test to more than two groups.
The Kruskal–Wallis one-way analysis of variance by ranks (named after William Kruskal and W. Allen Wallis) is a non-parametric method for testing whether samples originate from the same distribution. It extends the Mann–Whitney U test to more than two groups.
In [54]:
# Three groups with increasing means, for the ANOVA / Kruskal-Wallis cell below.
x = [np.random.randn() for _ in range(200)]
y = [np.random.randn()+0.2 for _ in range(200)]
z = [np.random.randn()+0.4 for _ in range(200)]
ax = plt.subplot(1,1,1)
# Shared bin edges over the pooled range of all three samples.
ax.hist(x,bins=np.linspace(np.min(x+y+z),np.max(x+y+z),25),alpha=0.3,histtype='stepfilled')
ax.hist(y,bins=np.linspace(np.min(x+y+z),np.max(x+y+z),25),alpha=0.3,histtype='stepfilled')
ax.hist(z,bins=np.linspace(np.min(x+y+z),np.max(x+y+z),25),alpha=0.3,histtype='stepfilled')
ax.set_xlabel('Value',size=20)
ax.set_ylabel('Freq',size=20)
customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
plt.show()
In [55]:
# One-way ANOVA (parametric) vs Kruskal-Wallis (rank-based, non-parametric)
# across the three groups; both return (statistic, p-value).
# FIX: converted the Python-2 print statements to the print() function used
# everywhere else in this notebook.
f_val, p_val = scipy.stats.f_oneway(x, y, z)
print("One-way ANOVA P =", p_val)
f_val, p_val = scipy.stats.mstats.kruskalwallis(x, y, z)
print("One-way Kruskal Wallis P =", p_val)
In [56]:
# Five groups, four of them sharing the same small shift — seeded so the
# multiple-comparison results below are reproducible.
np.random.seed(50)
x = [np.random.randn() for _ in range(200)]
y = [np.random.randn()+0.1 for _ in range(200)]
z = [np.random.randn()+0.1 for _ in range(200)]
i = [np.random.randn()+0.1 for _ in range(200)]
j = [np.random.randn()+0.2 for _ in range(200)]
ax = plt.subplot(1,1,1)
# Shared bin edges over the pooled range of all five samples.
ax.hist(x,bins=np.linspace(np.min(x+y+z+i+j),np.max(x+y+z+i+j),25),alpha=0.3,histtype='stepfilled')
ax.hist(y,bins=np.linspace(np.min(x+y+z+i+j),np.max(x+y+z+i+j),25),alpha=0.3,histtype='stepfilled')
ax.hist(z,bins=np.linspace(np.min(x+y+z+i+j),np.max(x+y+z+i+j),25),alpha=0.3,histtype='stepfilled')
ax.hist(i,bins=np.linspace(np.min(x+y+z+i+j),np.max(x+y+z+i+j),25),alpha=0.3,histtype='stepfilled')
ax.hist(j,bins=np.linspace(np.min(x+y+z+i+j),np.max(x+y+z+i+j),25),alpha=0.3,histtype='stepfilled')
ax.set_xlabel('Value',size=20)
ax.set_ylabel('Freq',size=20)
customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
plt.show()
In [57]:
# Same two omnibus tests on the five groups.
# FIX: converted the Python-2 print statements to the print() function used
# everywhere else in this notebook.
f_val, p_val = scipy.stats.f_oneway(x, y, z, i, j)
print("One-way ANOVA P =", p_val)
f_val, p_val = scipy.stats.mstats.kruskalwallis(x, y, z, i, j)
print("One-way Kruskal Wallis P =", p_val)
In [58]:
ax = plt.subplot(1,1,1)
# Family-wise error: the probability of at least one false positive after k
# independent comparisons at alpha = 0.05 is 1-(1-0.05)**k.
ax.plot(1-(1-0.05)** np.linspace(0,21,11),linewidth=3)
# FIX: the two axis labels were swapped — the plotted values (a probability)
# are on the y axis and the count is on the x axis, consistent with the
# otherwise-identical next cell.
ax.set_xlabel('Number of plots I make',size=20)
ax.set_ylabel('Prob of screwing up with the labels',size=20)
plt.title('Are line plots different than the histograms?',fontsize=14)
customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
plt.show()
In [59]:
# Same family-wise-error curve at alpha = 0.025: the chance of at least one
# spurious "significant" result grows quickly with the number of trials.
ax = plt.subplot(1,1,1)
ax.plot(1-(1-0.025)** np.linspace(0,100,100),linewidth=3)
ax.set_ylabel('Prob of screwing up',size=20)
ax.set_xlabel('Number of trials',size=20)
plt.title('Be careful discarding experiments')
customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
plt.show()
In [60]:
%%latex
$$1 - (1-\alpha_{new})^{trials} = 0.05$$
$$$$
Our Friend Taylor
$$1 - (1-trials \cdot \alpha_{new}) = 0.05$$
$$$$
$$trials \cdot \alpha_{new} = 0.05$$
$$\alpha_{new} = 0.05/trials$$
In [61]:
# Bonferroni correction: with 5 groups there are C(5,2) = 10 pairwise
# comparisons, so the per-comparison threshold is alpha/10.
# FIX: the label said "0.05/3" although the code (correctly) computed
# 0.05/10 — the text now matches the printed value.
print("p_min = 0.05/10 = %f" %(0.05/10))
# All 10 pairwise Mann-Whitney U tests. NOTE(review): [1]*2 doubles the
# apparently one-sided p-value returned by this scipy version.
print(scipy.stats.mannwhitneyu(x,y)[1]*2)
print(scipy.stats.mannwhitneyu(x,z)[1]*2)
print(scipy.stats.mannwhitneyu(x,i)[1]*2)
print(scipy.stats.mannwhitneyu(x,j)[1]*2)
print(scipy.stats.mannwhitneyu(y,z)[1]*2)
print(scipy.stats.mannwhitneyu(y,i)[1]*2)
print(scipy.stats.mannwhitneyu(y,j)[1]*2)
print(scipy.stats.mannwhitneyu(z,i)[1]*2)
print(scipy.stats.mannwhitneyu(z,j)[1]*2)
print(scipy.stats.mannwhitneyu(i,j)[1]*2)
In [62]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd
In [63]:
# Tukey's HSD performs all pairwise comparisons with a built-in
# multiple-comparison correction (values, then one group label per value).
res2 = pairwise_tukeyhsd(np.asarray(x+y+z+i+j),['x']*200+['y']*200+['z']*200+['i']*200+['j']*200)
# FIX: converted the Python-2 print statement to the print() function used
# everywhere else in this notebook.
print(res2)
In [64]:
res2.plot_simultaneous(comparison_name=None,xlabel='diffs',ylabel='grups')
Out[64]:
In [65]:
# FIX: sklearn.grid_search was removed in scikit-learn 0.20; GridSearchCV now
# lives in sklearn.model_selection.
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
# The grid we'll use for plotting
x_grid = np.linspace(-4.5, 3.5, 1000)
# Draw points from a bimodal distribution in 1D
np.random.seed(0)
x = np.concatenate([scipy.stats.distributions.norm(-1, 1.).rvs(400),
                    scipy.stats.distributions.norm(1, 0.3).rvs(100)])
# Choose the KDE bandwidth by cross-validated likelihood over a grid of
# 30 candidate bandwidths.
grid = GridSearchCV(KernelDensity(),
                    {'bandwidth': np.linspace(0.1, 1.0, 30)},
                    cv=20) # 20-fold cross-validation
grid.fit(x[:, None])
# FIX: converted the Python-2 print statement to the print() function used
# everywhere else in this notebook.
print(grid.best_params_)
In [66]:
# Evaluate the cross-validated best KDE on the plotting grid; score_samples
# returns the log-density, hence the exp().
kde = grid.best_estimator_
pdf = np.exp(kde.score_samples(x_grid[:, None]))
fig, ax = plt.subplots()
ax.plot(x_grid, pdf, linewidth=3, alpha=0.5, label='bw=%.2f' % kde.bandwidth)
# NOTE(review): `normed` was removed in matplotlib 3.x — use density=True
# on modern matplotlib.
ax.hist(x, 30, fc='gray', histtype='stepfilled', alpha=0.3, normed=True)
ax.set_xlim(-4.5, 3.5)
ax.set_xlabel('Value',fontsize=20)
ax.set_ylabel('Norm Freq',fontsize=20)
customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
In [6]:
# Fit several candidate distributions to a lognormal sample and compare the
# fits with a chi-square test, a Kolmogorov-Smirnov test, and Q-Q plots.
size = 30000
x = np.linspace(0,15,size)
y = [np.random.lognormal(1,0.8) for _ in range(size)]
# Normalized histogram; h = (counts, bin_edges, patches) is reused below to
# bin the fitted pdfs. NOTE(review): `normed` was removed in matplotlib 3.x
# (use density=True there).
h = plt.hist(y, bins=np.linspace(0,15,50),normed=1,fc='gray', histtype='stepfilled', alpha=0.3)
plt.figure(1)
ax = plt.subplot(1,1,1)
dist_names = ['beta','rayleigh', 'norm','lognorm','expon']
for dist_name in dist_names:
    t = []
    dist = getattr(scipy.stats, dist_name)
    # Maximum-likelihood fit; param = (shape(s)..., loc, scale).
    param = dist.fit(y)
    pdf_fitted = dist.pdf(x, *param[:-2], loc=param[-2], scale=param[-1])
    ax.plot(x, pdf_fitted, label=dist_name,linewidth=3)
    # Average the fitted pdf inside each histogram bin to get expected
    # normalized frequencies comparable with the histogram counts h[0].
    init = h[1][0]
    for value in h[1][1:]:
        t.append(np.mean(pdf_fitted[(x>=init) & (x < value)]))
        init = value
    # NOTE(review): f_obs and f_exp are densities scaled by `size`, so their
    # sums are not guaranteed to match — verify chisquare's preconditions.
    print(dist_name+" Chi stat = ", scipy.stats.chisquare(f_obs = size*np.asarray(h[0]), f_exp = size*np.asarray(t)))
    # KS test of the raw data against the fitted distribution.
    print(dist_name+" p-val KS = ", scipy.stats.kstest(y,dist_name,args=tuple(param))[1])
    scipy.stats.probplot(y, dist=dist_name,sparams=tuple(param), plot=None)
ax.set_xlabel('Value',fontsize=20)
ax.set_ylabel('Norm Freq',fontsize=20)
plt.xlim((0,15))
customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
plt.legend(loc='upper right',fontsize=16,frameon=False)
plt.show()
# One Q-Q plot per candidate distribution, each in its own figure.
fig = plt.figure(2)
i = 1
for dist_name in dist_names:
    i += 1
    t = []
    dist = getattr(scipy.stats, dist_name)
    param = dist.fit(y)
    (osm, osr),(slope, intercept, r) = scipy.stats.probplot(y, dist=dist_name,sparams=tuple(param), plot=None)
    plt.figure(i)
    ax = plt.subplot(1,1,1)
    ax.plot(osm, osr, 'o', osm, slope*osm + intercept)
    ax.set_xlabel('Quantiles',fontsize=20)
    ax.set_ylabel('Quantiles Obs',fontsize=20)
    customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
    plt.legend([dist_name])
    plt.show()
In [13]:
# Central-limit-theorem demo: raw Uniform(0,1) draws are flat...
ax = plt.subplot(1,1,1)
ax.hist(np.random.rand(10000),fc='gray', histtype='stepfilled', alpha=0.3)
ax.set_xlabel('Value',size=20)
ax.set_ylabel('Freq',size=20)
customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
plt.show()
# ...but means of 100 draws are approximately normal.
x = [np.mean(np.random.rand(100)) for _ in range(10000)]
ax = plt.subplot(1,1,1)
ax.hist(x,fc='gray', histtype='stepfilled', alpha=0.3)
ax.set_xlabel('Value',size=20)
ax.set_ylabel('Freq',size=20)
customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
plt.show()
# The std of sample means shrinks by ~sqrt(n); multiplying back by sqrt(n-1)
# approximately recovers the population std of Uniform(0,1) (~0.2887).
print('Real STD: ', np.std(np.random.rand(10000)))
print('STD estimated *sqrt(n-1), n = 100 : ', np.std(x)*np.sqrt(100-1))
print('STD estimated *sqrt(n-1), n = 50 : ', np.std([np.mean(np.random.rand(50)) for _ in range(10000)])*np.sqrt(50-1))
In [68]:
%%latex
$$\mu = \left[\mu + t_{1 - \alpha/2,n-1}\cdot SEM,\; \mu + t_{\alpha/2,n-1}\cdot SEM \right]$$
$$$$
$$\sigma = \left[\sqrt{\frac{(n-1)\cdot \sigma^2}{\chi^2_{1 - \alpha/2,n-1} }},\; \sqrt{\frac{(n-1)\cdot \sigma^2}{\chi^2_{\alpha/2,n-1} }}\right]$$
In [6]:
x = np.random.randn(20)
# Parametric CI for the mean: t.isf([0.025, 0.975], df=19) returns (+t, -t);
# [::-1] reorders so the printed interval reads [lower, upper]. SEM = s/sqrt(n).
print('CI for mean with SEM = ', np.mean(x) + scipy.stats.t.isf([0.025, 0.975], [19])[::-1] *scipy.stats.sem(x))
# Chi-square interval for the standard deviation: sqrt((n-1)*s^2 / chi2).
print('CI for std with SEM = ', np.sqrt(np.var(x)*19/scipy.stats.chi2.isf([0.025, 0.975], [19])))
In [70]:
import scikits.bootstrap as bootstrap
In [71]:
# Non-parametric bootstrap CIs for comparison with the parametric intervals
# computed in the previous cell (same sample x, 100000 resamples).
CIs = bootstrap.ci(x, statfunction=np.mean,n_samples=100000)
print('CI for mean with bootstrapping = ', CIs)
CIs = bootstrap.ci(x, statfunction=np.std,n_samples=100000)
print('CI for std with bootstrapping = ', CIs)
In [72]:
# Fixed seed: a lognormal sample x and a normal sample y built to share
# (approximately) x's mean and std — they differ mainly in shape and median.
np.random.seed(105)
size = 100
x = [np.random.lognormal(1,1) for _ in range(size)]
y = (np.random.randn(size)*np.std(x)+np.mean(x))
print('Mean = %f STD = %f Median = %f' %(np.mean(x), np.std(x), np.median(x)))
print('Mean = %f STD = %f Median = %f' %(np.mean(y), np.std(y), np.median(y)))
ax = plt.subplot(1,1,1)
ax.hist(x,bins=np.linspace(-5,20,10),alpha=0.3,histtype='stepfilled')
ax.hist(y,bins=np.linspace(-5,20,10),alpha=0.3,histtype='stepfilled')
ax.set_xlabel('Value',fontsize=20)
ax.set_ylabel('Freq',fontsize=20)
customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
plt.show()
In [73]:
# Bootstrap distribution of the median of y: resample y with replacement
# 100000 times and recompute the median each time.
y2 = [np.median(np.random.choice(y, size=size, replace=True)) for _ in range(100000)]
y2 = np.asarray(y2)
ax = plt.subplot(1,1,1)
ax.hist(y2,bins=np.linspace(2,11,10),alpha=0.3,histtype='stepfilled')
# Vertical line at the observed median of x for visual comparison.
ax.vlines(np.median(x),0,25000,linewidth=3)
ax.set_xlabel('Value',fontsize=20)
ax.set_ylabel('Freq',fontsize=20)
customaxis(ax, c_left='k', c_bottom='k', c_right='none', c_top='none', lw=2, size=20, pad=8)
# One-sided bootstrap p-value: fraction of bootstrap medians of y at or below
# the observed median of x, reported next to the classical tests.
print('Bootstrap: ',float(len(y2[y2<=np.median(x)]))/100000)
print('Median test: ',median_test(x,y))
print('Mann: ',scipy.stats.mannwhitneyu(x,y)[1]*2)
print('T_test: NOT NORMAL!',scipy.stats.ttest_ind(x,y)[1])
In [9]:
# Simulation-based test for the 3-vs-4 example from the first cells:
# x is a pool of Bernoulli(0.25) outcomes (True with probability 0.25).
x = np.random.rand(100000)>0.75
size = 10000
# Mean of 7 draws = fraction of successes in each simulated experiment.
simulated = np.asarray([np.mean(np.random.choice(x,size = 7)) for _ in range(size)])
# p-value: fraction of simulations with at least 4 successes out of 7.
p = float(np.sum(simulated>=(4./7.)))/size
print('p-value: ', p)
# Normal-approximation (t-based) CI for the simulated p-value itself.
print('CI for p with SEM = ', p + scipy.stats.t.isf([0.025, 0.975], [size-1])[::-1] *np.sqrt(p*(1-p)/(size-1)))