In [7]:
import scipy.stats as stats
from scipy.stats import binom
from __future__ import division
In [8]:
%pylab
%matplotlib inline
In [9]:
import seaborn as sns
plt.plot([1,2,3], [2,3,5])
pylab.rcParams['figure.figsize'] = 12, 6
Till, J. E., McCulloch, E. a. & Siminovitch, L. A stochastic model of stem cell proliferation, based on the growth of spleen colony-forming cells . Proc. Natl. Acad. Sci. U. S. A. 51, 29–36 (1964).
First evidence for stem cells by Till/McCulloch around 1960
The Radiation Sensitivity of Normal Mouse Bone Marrow Cells, Determined by Quantitative Marrow Transplantation into Irradiated Mice
This one is a follow up, analysing data from
Siminovitch, L., McCulloch, E. & Till, J. The distribution of colony forming cells among spleen colonies'
In [10]:
nCells = 1e6
nDays = 10
nDivisions = np.log2(nCells)
time_per_division = nDays * 24 / nDivisions
print "#Generations/Divisions: %d\n\nAverage generation time %.2f (hours)" %(round(nDivisions), time_per_division)
In [11]:
colonies_per_spleen = dict()
colonies_per_spleen["exp1"] = np.concatenate([np.repeat(0,24),
np.repeat(1,6),
np.array([2,6,19,21,23,36])
])
colonies_per_spleen["exp2"] = np.concatenate([np.repeat(0,3),
np.repeat(1,3),
np.repeat(2,2),
np.array([3,3, 20,20, 32])
])
colonies_per_spleen["exp3"]= np.concatenate([np.repeat(0,12),
np.repeat(1,8),
np.repeat(2,5),
np.repeat(3,2),
np.array([4,5,5,7,8,8,11,13,20,23,29,46])
])
In [12]:
fraction = 0.17 # the fraction of CFUs that actually go to the spleen
In [13]:
#just divide each dict entry by 'fraction'
tmpFunc = lambda x: np.ceil(x/fraction)
# apply this function to each dictionary entry
trueCFUs = {k: tmpFunc(v) for k,v in colonies_per_spleen.items()}
print "#Colonies per spleen"
print colonies_per_spleen["exp1"]
print "Estimated #CFU per spleen"
print trueCFUs["exp1"]
In [14]:
#observations
N = 100
k = 10
#inference
tmpP = np.linspace(0,1,100)
like_theta = binom.pmf(k,n=N,p=tmpP)
prior_theta = 1 # simply constant
posterior_theta = like_theta * prior_theta/ np.sum(like_theta * prior_theta)
plot(tmpP,posterior_theta)
xlabel('theta'), ylabel('Posterior(theta|k)'), xlim([0,0.4])
Out[14]:
In [15]:
n = np.arange(300)
data = unique(colonies_per_spleen["exp1"]) #just considering the first experiment
posteriorArray = np.zeros((len(data), len(n)))
for i,d in enumerate(data):
likelihood_n = binom.pmf(d, n, fraction)
prior_n = 1 #constant
posterior_n = likelihood_n * prior_n / np.sum(likelihood_n*prior_n)
posteriorArray[i,:] = posterior_n
plot(n,posteriorArray.T)
xlabel('N'), ylabel('Posterior(N|k,f)'), legend(data), ylim([0, 0.1])
Out[15]:
In [16]:
#normal hists
binning = np.arange(0,300,step=5)
plt.figure()
for exp,data in trueCFUs.iteritems():
plt.hist(data, bins=binning, histtype="stepfilled", alpha=.3)
plt.legend(trueCFUs.keys())
xlabel('#CFU'), ylabel('Frequency')
Out[16]:
In [17]:
def ecdf(x):
""" calculate the empirical distribution function of data x """
sortedX=np.sort( x )
yvals=np.arange(len(sortedX))/float(len(sortedX))
return sortedX, yvals
# plot the CDF for each experiment individually
for exp,data in trueCFUs.iteritems():
x,y = ecdf(data)
plt.plot(x,y)
xlabel('#CFUs'), ylabel('cumulative density')
#put all three together
trueCFUs_all = np.concatenate(trueCFUs.values())
x,y = ecdf(trueCFUs_all)
plt.plot(x,y, linewidth=5)
leg= trueCFUs.keys()
leg.append('all')
plt.legend(leg)
Out[17]:
In [18]:
plt.figure() # the figure as presented in the paper (scaled back to colonies)
plt.plot(x*0.17,y*len(trueCFUs_all), linewidth=2)
xlim([0,25])
ylim([0,120])
title('as in the paper')
Out[18]:
In [19]:
mu = np.mean(trueCFUs_all)
sigma= np.std(trueCFUs_all)
print "Mean: %f Fano factor %f" % (mu,sigma**2/mu)
In [20]:
# put the poisson distribution as a comparision which has the same mean as the data
x_ECDF,y_ECDF = ecdf(trueCFUs_all)
x = np.arange(0,100)
y =stats.poisson.cdf(x,mu)
plt.subplot(1,2,1)
plt.plot(x_ECDF, y_ECDF, linewidth=2)
plt.plot(x,y, linewidth=2)
plt.xlim([0,300])
plt.xlabel('#CFUs'), plt.ylabel('CDF'), plt.legend(['Data','Poisson fit'])
# DO THE PDF as well, much easier to see
#plt.figure()
plt.subplot(1,2,2)
sns.distplot(trueCFUs_all,norm_hist=True, kde=False)
plt.plot(x,stats.poisson.pmf(x,mu), linewidth=2)
plt.xlabel('#CFUs'), plt.ylabel('CDF'), plt.legend(['Poisson fit', 'Data'])
Out[20]:
In [21]:
plt.subplot(1,2,1)
sns.distplot(trueCFUs_all, fit =stats.gamma, kde=False)
xlim([0, 300])
plt.xlabel('#CFUs'), plt.ylabel('PDF'), plt.legend(['Gamma fit', 'Data'])
# lets look at the CDF as well
shape,loc,scale = stats.gamma.fit(trueCFUs_all)
plt.title('Fitted Gamma(Shape=%.2f, scale=%.2f)' % (shape,scale))
x_Gamma = np.arange(0,200)
y_Gamma = stats.gamma(shape,scale=scale).cdf(x_Gamma)
plt.subplot(1,2,2)
plt.plot(x_ECDF, y_ECDF, linewidth=2)
plt.plot(x_Gamma, y_Gamma)
plt.xlabel('#CFUs'), plt.ylabel('CDF'), plt.legend(['Data', 'Gamma fit'])
Out[21]:
probabilistic model:
solvable analytically only for exponential lifetimes
instead: Monte Carlo simulation
In [22]:
def simulateTree(p0, nGens):
""" simulates a single tree from the Till/McCulloch model
inputs:
- p0: probability that a single cell undergoes terminal
differentiation (i.e. no more division)
- nGens: number of generations to simulate
returns:
- a list (one element per generation) of single cells
present at that generation.
- a single element is just an array of cells present at that time
(zeros for stem cells, 1s for differentiated cells).
"""
# cell state is either 0 (stem cell) or 1 (differentiated),
# which is the only thing we keep track of here
theGenerations = list()
theGenerations.append(np.array(0))
for g in range(nGens):
lastGen = theGenerations[-1]
# for each of the last generation, roll a dice whether it terminally diffs
newState = roll_the_dice(lastGen, p0)
#all the zeros divide, the 1's just stay
n0 = sum(newState==0)
n1 = sum(newState==1)
nextGen = np.concatenate([np.repeat(0, 2*n0), np.repeat(1,n1)])
theGenerations.append(nextGen)
return theGenerations
def roll_the_dice(cellstate_array, p0):
"""
decide if a cell goes from 0->1 (wit probability p0)
does that for an entire vector of zeros and ones in paralell
"""
# helper function so that we can index into it via generation
# makes sure that as soon as cell_state==1 it wont change anymore
tmpP = np.array([p0, 1])
p = tmpP[cellstate_array]
r = np.random.rand(cellstate_array.size)
newGeneration = r<p
return newGeneration.astype(int)
In [23]:
def pretty_print_tree(tree):
for gen in tree:
print gen
In [24]:
p0 = 0.4
nGens = 5
aTree = simulateTree(p0,nGens)
bTree = simulateTree(p0,nGens)
print 'First tree'
pretty_print_tree(aTree)
print 'Second tree'
pretty_print_tree(bTree)
In [25]:
#CFUs per generation
print '\n#CFUs, first tree'
print map(lambda x: sum(x==0), aTree)
print '\n#CFUs, second tree'
print map(lambda x: sum(x==0), bTree)
In [26]:
nTrees = 1000
nGens = 20
p0 = 0.4
# assemble a matrix of tree vs #CFUs(t) (one row, one tree; one col one timepoint)
cfus_over_time = np.zeros((nTrees, nGens+1))
for i in range(nTrees):
tree = simulateTree(p0, nGens)
nCFU = map(lambda x: np.sum(x==0), tree)
cfus_over_time[i,:] = nCFU
print cfus_over_time[0:5,:].astype(int)
In [27]:
import seaborn as sns
plt.plot(log10(cfus_over_time.T))
plt.xlabel('Generation')
plt.ylabel("log10(#CFU)")
Out[27]:
In [28]:
# violin plot
plt.figure()
sns.violinplot((log10(cfus_over_time+1)), inner='points')
plt.xlabel('Generation')
plt.ylabel("log10(#CFU+1)")
Out[28]:
In [29]:
lastGen_nCFUs = cfus_over_time[:,-1]
plt.subplot(1,2,1)
plt.hist(lastGen_nCFUs, bins=50)
xlabel('#CFUs')
ylabel('Frequency in last generation')
# CDF
plt.subplot(1,2,2)
x_ECDF_lastGen,y_ECDF_lastGen = ecdf(lastGen_nCFUs)
plt.plot(x_ECDF_lastGen, y_ECDF_lastGen, linewidth=2)
xlabel('#CFUs')
ylabel('CDF')
Out[29]:
In [30]:
shape, loc, scale = stats.gamma.fit(lastGen_nCFUs)
fittedGamma = stats.gamma(a=shape, loc=loc, scale=scale)
print "Shape: %.3f, Location: %.3f, Scale: %.3f" %(shape, loc, scale)
# histogram of the simulated data
n,bins = np.histogram(lastGen_nCFUs, bins=arange(0, 300, step=5))
# pdf, cdf
fittedPDF, fittedCDF = fittedGamma.pdf(bins[:-1]), fittedGamma.cdf(bins[:-1])
plt.subplot(1,2,1)
plt.plot(x_ECDF, y_ECDF, linewidth=2)
plt.plot(bins[:-1], fittedCDF)
xlabel('#CFU'), ylabel('CDF'), legend(['synthetic Data','Gamma Fit'])
## the PDF derived from the cdf
plt.subplot(1,2,2)
plt.plot(bins[:-1],n/n.sum())
plt.plot(bins[:-1], diff(np.insert(fittedCDF, 0, 0)))
Out[30]:
In [31]:
#estimates, based on matching means an vars
gamma_mean, gamma_var = fittedGamma.stats(moments='mv')
sample_mean, sample_var = np.mean(lastGen_nCFUs), np.var(lastGen_nCFUs)
myShape = sample_mean**2/sample_var
myScale = sample_var/sample_mean
print myShape,myScale
#fittedGamma = stats.gamma(a=myShape, loc=0, scale=myScale)
print "Sample Mean: %.2f Gamma Mean: %.2f" % (sample_mean,gamma_mean)
print "Sample Var: %.2f Gamma Var: %.2f" % (sample_var,gamma_var)
In [32]:
nGens = 20
p0 = 0.4
nTrees_fig8 = 71
fig8_cfu_20 = np.zeros((nTrees_fig8))
for i in range(nTrees_fig8):
tree = simulateTree(p0, nGens)
nCFU = map(lambda x: np.sum(x==0), tree)
fig8_cfu_20[i] = nCFU[-1]
xTmp, yTmp = ecdf(fig8_cfu_20)
plot(xTmp, yTmp*nTrees_fig8)
#fit by gamma
shape, loc, scale = stats.gamma.fit(fig8_cfu_20)
fittedGamma_synthData = stats.gamma(a=shape, loc=loc, scale=scale)
fittedCDF = fittedGamma_synthData.cdf(xTmp[:-1])
plot(xTmp[:-1], fittedCDF*nTrees_fig8)
Out[32]:
In [33]:
"Simulate a bunch of trees and get the #CFUs over time"
def sim_cfu_over_time(p0, nTrees, nGens):
cfus_over_time = np.zeros((nTrees, nGens+1))
for i in range(nTrees):
tree = simulateTree(p0, nGens)
nCFU = map(lambda x: np.sum(x==0), tree)
cfus_over_time[i,:] = nCFU
return cfus_over_time
"calculate a KS test between observed and simulated dists"
def distance_function(obs, sim):
K,pval = stats.ks_2samp(obs,sim)
return K, pval
#return np.abs(np.mean(obs)- np.mean(sim))
p0_grid = np.linspace(0, 1, 50)
distances, pvals = np.zeros(p0_grid.shape), np.zeros(p0_grid.shape)
observedData = trueCFUs_all # experimental data after 10 days/20divisions
for i, p0 in enumerate(p0_grid):
simData = sim_cfu_over_time(p0, nTrees = 100, nGens= 20)
simData_lastGen = simData[:,-1]
distances[i], pvals[i] = distance_function(obs= observedData, sim=simData_lastGen)
In [34]:
def plot_fitted_stoch_model(p0_array, K_array, pval_array, observedData):
bestP0 = p0_array[argmin(K_array)]
plt.subplot(2,2,1)
plt.plot(p0_array, K_array)
plt.vlines(bestP0,0,1)
xlabel('p0'), ylabel('KS distance')
plt.subplot(2,2,2)
plt.plot(p0_array, log10(pvals))
xlabel('p0'), ylabel('KS pval')
bestpval = pval_array[argmin(K_array)]
title(bestpval)
plt.subplot(2,2,3)
simData = sim_cfu_over_time(p0=0.4, nTrees = 100, nGens= 20)
x,y =ecdf(observedData)
plt.plot(x, y)
x,y =ecdf(simData[:,-1])
plt.plot(x, y)
xlabel('#CFU'), ylabel('CDF')
legend(['observed','simulated'])
binning = np.arange(0,300,step=5)
plt.subplot(2,2,4)
plt.hist(observedData, bins=binning, histtype="stepfilled", alpha=.5)
plt.hist(simData[:,-1], bins=binning, histtype="stepfilled", alpha=.5)
xlabel('#CFU'), ylabel('PDF')
In [35]:
plot_fitted_stoch_model(p0_grid, distances, pvals, observedData)