In [142]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy
from scipy import misc
import scipy.stats
from scipy.optimize import minimize
In [150]:
#number of perturbations
n_perturbs=1000
signatures=True # True=only signature level information desired, False=gene level information desired
#rough factor describing the fold change in representation between the 90th and 10th percentile of perturbations
fc_90_10=6 #default 6-fold, Wang 2013
#stringency (the minimum percentile you would like to reasonably represent)
stringency=0.1 #default 10th percentile
In [151]:
#rough approximation of scale required for individual gene differential expression
scale_factor=100
if signatures:
#or signature level differential expression
scale_factor=10
In [152]:
#representation factor, search for location and scale of normal distribution to determine parameters satisfying minimum representation and distribution width
x0 = np.array([fc_90_10*scale_factor,scale_factor])
def optimize_normal_cdf(x):
fc10_90=scipy.stats.norm.ppf([.1,.9],loc=x[0],scale=x[1])
ratio90_10=np.divide(fc10_90[1],fc10_90[0])
mincells=scipy.stats.norm.ppf([stringency],loc=x[0],scale=x[1])[0]
return np.power(ratio90_10-fc_90_10,2)+np.power(mincells-scale_factor,2)
res = minimize(optimize_normal_cdf, x0, method='nelder-mead',options={'xtol': 1e-8, 'disp': True})
In [153]:
lf,sf=res.final_simplex[0][0]
In [154]:
pltmin,pltmax=scipy.stats.norm.ppf([0.05,0.95],loc=lf,scale=sf)
In [155]:
ecdf=scipy.stats.norm.cdf(np.arange(pltmin,pltmax),loc=lf,scale=sf)
In [156]:
#plot CDF of cells per perturbation
plt.plot(ecdf)
plt.axhline(0.1,c='black')
plt.axhline(0.9,c='black')
plt.xlabel('Cells Per Perturbation')
plt.ylabel('Cumulative Probability')
total_cells=lf*n_perturbs
print('Roughly '+str(int(np.round(total_cells)))+' Total Cells Required')