Statistical exploration for Bayesian analysis of PhIP-seq


In [2]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
%matplotlib inline

In [49]:
cpm = pd.read_csv('/Users/laserson/tmp/phip_analysis/phip-9/cpm.tsv', sep='\t', header=0, index_col=0)

In [50]:
upper_bound = sp.stats.scoreatpercentile(cpm.values.ravel(), 99.9)
upper_bound


Out[50]:
216.40100000053644

In [51]:
fig, ax = plt.subplots()
_ = ax.hist(cpm.values.ravel(), bins=100, log=True)
_ = ax.set(title='cpm')



In [52]:
fig, ax = plt.subplots()
_ = ax.hist(np.log10(cpm.values.ravel() + 0.5), bins=100, log=False)
_ = ax.set(title='log10(cpm + 0.5)')



In [53]:
fig, ax = plt.subplots()
_ = ax.hist(np.log10(cpm.values.ravel() + 0.5), bins=100, log=True)
_ = ax.set(title='log10(cpm + 0.5)')


Plot only the lowest 99.9% of the data


In [54]:
fig, ax = plt.subplots()
_ = ax.hist(cpm.values.ravel()[cpm.values.ravel() <= upper_bound], bins=range(100), log=False)
_ = ax.set(xlim=(0, 60))
_ = ax.set(title='trimmed cpm')



In [55]:
trimmed_cpm = cpm.values.ravel()[cpm.values.ravel() <= upper_bound]
trimmed_cpm.mean(), trimmed_cpm.std()


Out[55]:
(9.9421241108113403, 10.678173577708357)

In [56]:
means = cpm.apply(lambda x: x[x <= upper_bound].mean(), axis=1, raw=True)
_, edges = np.histogram(means, bins=[sp.stats.scoreatpercentile(means, p) for p in np.linspace(0, 100, 10)])

In [57]:
def plot_hist(ax, a):
    h, e = np.histogram(a, bins=100, range=(0, upper_bound), density=True)
    ax.hlines(h, e[:-1], e[1:])

In [58]:
for i in range(len(edges[:-1])):
    left = edges[i]
    right = edges[i + 1]
    rows = (means >= left) & (means <= right)
    values = cpm[rows].values.ravel()
    fig, ax = plt.subplots()
    plot_hist(ax, values)
    ax.set(xlim=(0, 50), title='mean in ({}, {})'.format(left, right))


Do the slices look Poisson?


In [59]:
a = np.random.poisson(8, 10000)
fig, ax = plt.subplots()
plot_hist(ax, a)
ax.set(xlim=(0, 50))


Out[59]:
[(0, 50)]

For the most part. Maybe try NegBin just in case

What does the distribution of the trimmed means look like?


In [60]:
fig, ax = plt.subplots()
plot_hist(ax, means)
ax.set(xlim=(0, 50))


Out[60]:
[(0, 50)]

In [61]:
a = np.random.gamma(1, 10, 10000)
fig, ax = plt.subplots()
plot_hist(ax, a)
ax.set(xlim=(0, 50))


Out[61]:
[(0, 50)]

In [62]:
means.mean()


Out[62]:
9.9956446041530818

Following Anders and Huber, Genome Biology 2010, compute some of their stats

Compute size factors


In [63]:
s = np.exp(np.median(np.log(cpm.values + 0.5) - np.log(cpm.values + 0.5).mean(axis=1).reshape((cpm.shape[0], 1)), axis=0))

In [64]:
_ = sns.distplot(s)



In [65]:
q = (cpm.values / s).mean(axis=1)

In [66]:
fig, ax = plt.subplots()
_ = ax.hist(q, bins=100, log=False)



In [67]:
fig, ax = plt.subplots()
_ = ax.hist(q, bins=100, log=True)



In [68]:
w = (cpm.values / s).std(axis=1, ddof=1)

In [69]:
fig, ax = plt.subplots()
_ = ax.hist(w, bins=100, log=True)



In [70]:
fig, ax = plt.subplots()
_ = ax.scatter(q, w)



In [71]:
_ = sns.lmplot('q', 'w', pd.DataFrame({'q': q, 'w': w}))



In [72]:
list(zip(cpm.values.sum(axis=0), s))


Out[72]:
[(1000591, 1.0681541323525534),
 (1001638, 1.046902937707179),
 (999815, 1.066070286402355),
 (997969, 1.0917607741142121),
 (997840, 1.0247851225291942),
 (998945, 1.0199611261078056),
 (999577, 0.99621509171980727),
 (1001162, 1.025592027289312),
 (999519, 1.1350877645317199),
 (999587, 1.1327429552036481),
 (999621, 1.0546279311372166),
 (999933, 1.0029357955852236),
 (999804, 1.0421606036812807),
 (999881, 0.98966897038013824),
 (999389, 0.91421867150309533),
 (1000097, 0.99979615102339625),
 (999818, 0.78370907263862766),
 (999700, 0.85051419640847536),
 (999705, 0.98316967031036961),
 (1003117, 0.96404200697356024),
 (999050, 0.92937699120961048),
 (999953, 0.97246453244842612),
 (999781, 1.0353438144903941),
 (1000229, 1.0598066140667766),
 (999719, 1.0560840170428636),
 (1000104, 1.0283736153248682),
 (1000017, 1.035337328176245),
 (999762, 1.022102156668877),
 (999626, 1.0220481626843567),
 (999791, 0.96205352272587308)]

In [73]:
s


Out[73]:
array([ 1.06815413,  1.04690294,  1.06607029,  1.09176077,  1.02478512,
        1.01996113,  0.99621509,  1.02559203,  1.13508776,  1.13274296,
        1.05462793,  1.0029358 ,  1.0421606 ,  0.98966897,  0.91421867,
        0.99979615,  0.78370907,  0.8505142 ,  0.98316967,  0.96404201,
        0.92937699,  0.97246453,  1.03534381,  1.05980661,  1.05608402,
        1.02837362,  1.03533733,  1.02210216,  1.02204816,  0.96205352])

In [74]:
a = np.random.gamma(30, 1/30, 1000)

In [75]:
sns.distplot(a)


Out[75]:
<matplotlib.axes._subplots.AxesSubplot at 0x10e3aafd0>

Proceeding with the following strategy/model

Trim data to remove top 0.1% of count values. Compute mean of each row and use the means to fit a gamma distribution. Using these values, define a posterior on a rate for each clone, assuming Poisson stats for each cell. This means the posterior is also gamma distributed. Then compute the probability of seeing a more extreme value, weighted with the posterior on r_i.


In [76]:
import pystan

In [4]:
cpm = pd.read_csv('/Users/laserson/tmp/phip_analysis/phip-9/cpm.tsv', sep='\t', header=0, index_col=0)

In [5]:
upper_bound = sp.stats.scoreatpercentile(cpm.values, 99.9)

In [6]:
trimmed_means = cpm.apply(lambda x: x[x <= upper_bound].mean(), axis=1, raw=True).values

In [80]:
brm = pystan.StanModel(model_name='background_rates', file='/Users/laserson/repos/bamophip/background_rates.stan')


INFO:pystan:COMPILING THE C++ CODE FOR MODEL background_rates_0a7b2d07f26077b0d278330f6aedefc9 NOW.

In [81]:
data = {
    'num_clones': trimmed_means.shape[0],
    'trimmed_means': trimmed_means
}
br_fit = brm.sampling(data=data, iter=2000, chains=4)

In [82]:
br_fit


Out[82]:
Inference for Stan model: background_rates_0a7b2d07f26077b0d278330f6aedefc9.
4 chains, each with iter=2000; warmup=1000; thin=1; 
post-warmup draws per chain=1000, total post-warmup draws=4000.

          mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
g_alpha   1.26  1.4e-4 5.1e-3   1.25   1.25   1.26   1.26   1.27   1352    1.0
g_beta    0.13  1.7e-5 6.2e-4   0.12   0.13   0.13   0.13   0.13   1338    nan
lp__    -3.2e5    0.03    1.0 -3.2e5 -3.2e5 -3.2e5 -3.2e5 -3.2e5   1158   1.01

Samples were drawn using NUTS at Fri Jul 21 14:55:59 2017.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).

In [83]:
br_fit.plot()


Out[83]:

In [84]:
alpha, beta, _ = br_fit.get_posterior_mean().mean(axis=1)

In [85]:
alpha, beta


Out[85]:
(1.2557251897737052, 0.12563033692488237)

In [119]:
h, e = np.histogram(np.random.gamma(alpha, 1 / beta, 50000), bins='auto', density=True)
fig, ax = plt.subplots()
_ = ax.hist(trimmed_means, bins=100, normed=True)
_ = ax.hlines(h, e[:-1], e[1:])
_ = ax.set(xlim=(0, 50))



In [86]:
# assumes the counts for each clone are Poisson distributed with the learned Gamma prior
# Therefore, the posterior is Gamma distributed, and we use the expression for its expected value
trimmed_sums = cpm.apply(lambda x: x[x <= upper_bound].sum(), axis=1, raw=True).values
trimmed_sizes = cpm.apply(lambda x: (x <= upper_bound).sum(), axis=1, raw=True).values
background_rates = (alpha + trimmed_sums) / (beta + trimmed_sizes)

In [88]:
# mlxp is "minus log 10 pval"
mlxp = []
for i in range(cpm.shape[0]):
    mlxp.append(-sp.stats.poisson.logsf(cpm.values[i], background_rates[i]) / np.log(10))
mlxp = np.asarray(mlxp)


/Users/laserson/miniconda3/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:899: RuntimeWarning: divide by zero encountered in log
  return log(self._sf(x, *args))

In [89]:
fig, ax = plt.subplots()
h, e = np.histogram(10**(-mlxp.ravel()), bins='auto')
ax.hlines(h, e[:-1], e[1:])
ax.set(xlim=(0, 1))


Out[89]:
[(0, 1)]

In [95]:
fig, ax = plt.subplots()
finite = np.isfinite(mlxp.ravel())
_ = ax.hist(mlxp.ravel()[finite], bins=100, log=True)



In [98]:
fig, ax = plt.subplots()
finite = np.isfinite(mlxp.ravel())
_ = ax.hist(np.log10(mlxp.ravel()[finite] + 0.5), bins=100, log=True)



In [99]:
old_pvals = pd.read_csv('/Users/laserson/tmp/phip_analysis/phip-9/pvals.tsv', sep='\t', header=0, index_col=0)

In [100]:
fig, ax = plt.subplots()
h, e = np.histogram(10**(-old_pvals.values.ravel()), bins='auto')
ax.hlines(h, e[:-1], e[1:])
ax.set(xlim=(0, 1))


Out[100]:
[(0, 1)]

In [101]:
(old_pvals.values.ravel() > 10).sum()


Out[101]:
4294

In [102]:
(mlxp > 10).sum()


Out[102]:
10220

In [103]:
len(mlxp.ravel())


Out[103]:
2883600

Can we use scipy's MLE for the gamma parameters instead?


In [105]:
sp.stats.gamma.fit(trimmed_means)


Out[105]:
(0.33870375867684477, -3.5950824315037392e-25, 3.1021876347022337)

In [113]:
fig, ax = plt.subplots()
_ = ax.hist(sp.stats.gamma.rvs(a=0.3387, loc=0, scale=3.102, size=10000), bins=100)
_ = ax.set(xlim=(0, 50))


Hmmm...doesn't appear to get the correct solution.

Alternatively, let's try optimizing the log likelihood ourselves


In [27]:
pos = trimmed_means > 0
n = len(trimmed_means)
s = trimmed_means[pos].sum()
sl = np.log(trimmed_means[pos]).sum()
def ll(x):
    return -1 * (n * x[0] * np.log(x[1]) - n * sp.special.gammaln(x[0]) + (x[0] - 1) * sl - x[1] * s)

In [34]:
param = sp.optimize.minimize(ll, np.asarray([2, 1]), bounds=[(np.nextafter(0, 1), None), (np.nextafter(0, 1), None)])
param


Out[34]:
      fun: 315946.1251649895
 hess_inv: <2x2 LbfgsInvHessProduct with dtype=float64>
      jac: array([ 0.       , -0.0349246])
  message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 54
      nit: 12
   status: 0
  success: True
        x: array([ 1.25600681,  0.1256554 ])

In [38]:
param.x


Out[38]:
array([ 1.25600681,  0.1256554 ])

SUCCESS!

Do the p-values have a correlation with the peptide abundance?


In [42]:
mlxp = pd.read_csv('/Users/laserson/tmp/phip_analysis/sjogrens/mlxp.tsv', sep='\t', index_col=0, header=0)
inputs = pd.read_csv('/Users/laserson/repos/phage_libraries_private/human90/inputs/human90-larman1-input.tsv', sep='\t', index_col=0, header=0)

In [53]:
m = pd.merge(mlxp, inputs, left_index=True, right_index=True)

In [59]:
sample = 'Sjogrens.serum.Sjogrens.FS12-03967.20A20G.1'

In [60]:
sp.stats.pearsonr(10**(-m[sample]), m['input'])


Out[60]:
(0.20887705868459297, 0.0)

In [61]:
sp.stats.spearmanr(10**(-m[sample]), m['input'])


Out[61]:
SpearmanrResult(correlation=0.36650172649526896, pvalue=0.0)

In [62]:
fig, ax = plt.subplots()
_ = ax.scatter(10**(-m[sample]), m['input'])


Out[62]:
<matplotlib.collections.PathCollection at 0x10ae58908>

In [63]:
fig, ax = plt.subplots()
_ = ax.scatter(m[sample], m['input'])



In [68]:
h, xe, ye = np.histogram2d(m[sample], m['input'], bins=100)
fig, ax = plt.subplots()
_ = ax.imshow(h)



In [67]:
np.histogram2d

In [ ]: