In [2]:
import pandas as pd
import numpy as np
import scipy as sp
import scipy.stats
import scipy.optimize
import scipy.special
import matplotlib.pyplot as plt
import seaborn as sns
In [3]:
%matplotlib inline
In [49]:
cpm = pd.read_csv('/Users/laserson/tmp/phip_analysis/phip-9/cpm.tsv', sep='\t', header=0, index_col=0)
In [50]:
upper_bound = sp.stats.scoreatpercentile(cpm.values.ravel(), 99.9)
upper_bound
Out[50]:
In [51]:
fig, ax = plt.subplots()
_ = ax.hist(cpm.values.ravel(), bins=100, log=True)
_ = ax.set(title='cpm')
In [52]:
fig, ax = plt.subplots()
_ = ax.hist(np.log10(cpm.values.ravel() + 0.5), bins=100, log=False)
_ = ax.set(title='log10(cpm + 0.5)')
In [53]:
fig, ax = plt.subplots()
_ = ax.hist(np.log10(cpm.values.ravel() + 0.5), bins=100, log=True)
_ = ax.set(title='log10(cpm + 0.5)')
Plot only the lowest 99.9% of the data
In [54]:
fig, ax = plt.subplots()
_ = ax.hist(cpm.values.ravel()[cpm.values.ravel() <= upper_bound], bins=range(100), log=False)
_ = ax.set(xlim=(0, 60))
_ = ax.set(title='trimmed cpm')
In [55]:
trimmed_cpm = cpm.values.ravel()[cpm.values.ravel() <= upper_bound]
trimmed_cpm.mean(), trimmed_cpm.std()
Out[55]:
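If the trimmed counts were Poisson, the variance should be close to the mean. A quick dispersion-index check (a sketch, reusing trimmed_cpm from above):
# variance-to-mean ratio: ~1 for Poisson, >1 indicates overdispersion
trimmed_cpm.var(ddof=1) / trimmed_cpm.mean()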
In [56]:
# per-clone means of the trimmed counts, plus bin edges at evenly spaced
# percentiles so each slice holds roughly the same number of clones
means = cpm.apply(lambda x: x[x <= upper_bound].mean(), axis=1, raw=True)
_, edges = np.histogram(means, bins=[sp.stats.scoreatpercentile(means, p) for p in np.linspace(0, 100, 10)])
In [57]:
def plot_hist(ax, a):
    # step-style density histogram drawn as horizontal segments over each bin
    h, e = np.histogram(a, bins=100, range=(0, upper_bound), density=True)
    ax.hlines(h, e[:-1], e[1:])
In [58]:
for i in range(len(edges) - 1):
    left = edges[i]
    right = edges[i + 1]
    rows = (means >= left) & (means <= right)
    values = cpm[rows].values.ravel()
    fig, ax = plt.subplots()
    plot_hist(ax, values)
    ax.set(xlim=(0, 50), title='mean in ({}, {})'.format(left, right))
Do the slices look Poisson?
In [59]:
a = np.random.poisson(8, 10000)
fig, ax = plt.subplots()
plot_hist(ax, a)
ax.set(xlim=(0, 50))
Out[59]:
For the most part, yes. Maybe try a negative binomial just in case.
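A quick way to eyeball that: overlay draws from a negative binomial with the same mean (a sketch; the dispersion value here is made up, not fitted):
# negative binomial with mean 8: mean = n * (1 - p) / p, so p = n / (n + mean)
n_disp = 4  # hypothetical dispersion; smaller values mean more overdispersion
b = np.random.negative_binomial(n_disp, n_disp / (n_disp + 8), 10000)
fig, ax = plt.subplots()
plot_hist(ax, b)
ax.set(xlim=(0, 50))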
What does the distribution of the trimmed means look like?
In [60]:
fig, ax = plt.subplots()
plot_hist(ax, means)
ax.set(xlim=(0, 50))
Out[60]:
In [61]:
a = np.random.gamma(1, 10, 10000)
fig, ax = plt.subplots()
plot_hist(ax, a)
ax.set(xlim=(0, 50))
Out[61]:
In [62]:
means.mean()
Out[62]:
Following Anders and Huber (Genome Biology, 2010), compute some of their statistics.
Compute size factors
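For a count matrix k with clones as rows and samples as columns, the Anders–Huber size factor for sample j is s_j = median_i [ k_ij / (prod_v k_iv)^(1/m) ], the median ratio of each sample's counts to the per-clone geometric mean across the m samples; the 0.5 pseudocount below guards against zeros.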
In [63]:
log_cpm = np.log(cpm.values + 0.5)
s = np.exp(np.median(log_cpm - log_cpm.mean(axis=1).reshape((cpm.shape[0], 1)), axis=0))
In [64]:
_ = sns.distplot(s)
In [65]:
q = (cpm.values / s).mean(axis=1)
In [66]:
fig, ax = plt.subplots()
_ = ax.hist(q, bins=100, log=False)
In [67]:
fig, ax = plt.subplots()
_ = ax.hist(q, bins=100, log=True)
In [68]:
w = (cpm.values / s).std(axis=1, ddof=1)
In [69]:
fig, ax = plt.subplots()
_ = ax.hist(w, bins=100, log=True)
In [70]:
fig, ax = plt.subplots()
_ = ax.scatter(q, w)
In [71]:
_ = sns.lmplot(x='q', y='w', data=pd.DataFrame({'q': q, 'w': w}))
In [72]:
# compare per-sample total counts with the estimated size factors
list(zip(cpm.values.sum(axis=0), s))
Out[72]:
In [73]:
s
Out[73]:
In [74]:
a = np.random.gamma(30, 1/30, 1000)
In [75]:
sns.distplot(a)
Out[75]:
Proceeding with the following strategy/model:
Trim the data to remove the top 0.1% of count values. Compute the mean of each row and use these means to fit a gamma distribution. Treating the fitted gamma as a prior, and assuming the counts in each cell are Poisson distributed, the posterior on each clone's rate r_i is also gamma distributed (conjugacy). Then compute the probability of seeing a more extreme value, weighted by the posterior on r_i.
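As a minimal sketch of the conjugate update this describes (with made-up numbers; the actual fit follows below):
# Gamma(alpha, beta) prior on a clone's rate plus Poisson counts k_1..k_n
# gives a Gamma(alpha + sum(k), beta + n) posterior on that rate
alpha0, beta0 = 1.0, 0.1   # hypothetical prior hyperparameters
k = np.array([3, 7, 5])    # hypothetical counts for one clone
post = sp.stats.gamma(a=alpha0 + k.sum(), scale=1 / (beta0 + len(k)))
# tail probability of a more extreme count, evaluated at the posterior mean rate
sp.stats.poisson.sf(20, post.mean())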
In [76]:
import pystan
In [4]:
cpm = pd.read_csv('/Users/laserson/tmp/phip_analysis/phip-9/cpm.tsv', sep='\t', header=0, index_col=0)
In [5]:
upper_bound = sp.stats.scoreatpercentile(cpm.values, 99.9)
In [6]:
trimmed_means = cpm.apply(lambda x: x[x <= upper_bound].mean(), axis=1, raw=True).values
In [80]:
brm = pystan.StanModel(model_name='background_rates', file='/Users/laserson/repos/bamophip/background_rates.stan')
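The contents of background_rates.stan aren't reproduced here; a minimal model consistent with the data block below (a hypothetical sketch, assuming zeros are handled upstream, e.g., by a pseudocount) might look like:
model_code = """
data {
    int<lower=1> num_clones;
    vector<lower=0>[num_clones] trimmed_means;
}
parameters {
    real<lower=0> alpha;
    real<lower=0> beta;
}
model {
    trimmed_means ~ gamma(alpha, beta);
}
"""
# equivalently: brm = pystan.StanModel(model_code=model_code, model_name='background_rates')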
In [81]:
data = {
    'num_clones': trimmed_means.shape[0],
    'trimmed_means': trimmed_means,
}
br_fit = brm.sampling(data=data, iter=2000, chains=4)
In [82]:
br_fit
Out[82]:
In [83]:
br_fit.plot()
Out[83]:
In [84]:
alpha, beta, _ = br_fit.get_posterior_mean().mean(axis=1)
In [85]:
alpha, beta
Out[85]:
In [119]:
h, e = np.histogram(np.random.gamma(alpha, 1 / beta, 50000), bins='auto', density=True)
fig, ax = plt.subplots()
_ = ax.hist(trimmed_means, bins=100, density=True)
_ = ax.hlines(h, e[:-1], e[1:])
_ = ax.set(xlim=(0, 50))
In [86]:
# Assume the counts for each clone are Poisson distributed with the learned gamma prior.
# The posterior is then gamma distributed; use the expression for its expected value.
trimmed_sums = cpm.apply(lambda x: x[x <= upper_bound].sum(), axis=1, raw=True).values
trimmed_sizes = cpm.apply(lambda x: (x <= upper_bound).sum(), axis=1, raw=True).values
background_rates = (alpha + trimmed_sums) / (beta + trimmed_sizes)
In [88]:
# mlxp is "minus log10 p-value"
mlxp = []
for i in range(cpm.shape[0]):
    mlxp.append(-sp.stats.poisson.logsf(cpm.values[i], background_rates[i]) / np.log(10))
mlxp = np.asarray(mlxp)
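As an aside, scipy broadcasts the rate across columns, so the loop above can likely be replaced by a single vectorized call:
# same computation in one call; background_rates broadcasts across the sample axis
mlxp = -sp.stats.poisson.logsf(cpm.values, background_rates[:, np.newaxis]) / np.log(10)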
In [89]:
fig, ax = plt.subplots()
h, e = np.histogram(10**(-mlxp.ravel()), bins='auto')
ax.hlines(h, e[:-1], e[1:])
ax.set(xlim=(0, 1))
Out[89]:
In [95]:
fig, ax = plt.subplots()
finite = np.isfinite(mlxp.ravel())
_ = ax.hist(mlxp.ravel()[finite], bins=100, log=True)
In [98]:
fig, ax = plt.subplots()
finite = np.isfinite(mlxp.ravel())
_ = ax.hist(np.log10(mlxp.ravel()[finite] + 0.5), bins=100, log=True)
In [99]:
old_pvals = pd.read_csv('/Users/laserson/tmp/phip_analysis/phip-9/pvals.tsv', sep='\t', header=0, index_col=0)
In [100]:
fig, ax = plt.subplots()
h, e = np.histogram(10**(-old_pvals.values.ravel()), bins='auto')
ax.hlines(h, e[:-1], e[1:])
ax.set(xlim=(0, 1))
Out[100]:
In [101]:
(old_pvals.values.ravel() > 10).sum()
Out[101]:
In [102]:
(mlxp > 10).sum()
Out[102]:
In [103]:
len(mlxp.ravel())
Out[103]:
Can we use scipy's MLE for the gamma parameters instead?
In [105]:
sp.stats.gamma.fit(trimmed_means)
Out[105]:
In [113]:
fig, ax = plt.subplots()
_ = ax.hist(sp.stats.gamma.rvs(a=0.3387, loc=0, scale=3.102, size=10000), bins=100)
_ = ax.set(xlim=(0, 50))
Hmm... it doesn't appear to find the correct solution.
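One likely culprit: scipy fits a three-parameter gamma (shape, loc, scale) by default. Pinning the location with floc=0, and excluding the zero counts that break the log-likelihood, may behave better (a guess, untested here):
sp.stats.gamma.fit(trimmed_means[trimmed_means > 0], floc=0)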
Alternatively, let's try optimizing the log-likelihood ourselves.
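For x_1, ..., x_n drawn from Gamma(alpha, beta) in the shape/rate parametrization, the log-likelihood is n * alpha * log(beta) - n * log Gamma(alpha) + (alpha - 1) * sum(log x_i) - beta * sum(x_i); the ll function below returns its negative for the minimizer, with zero counts excluded since log(0) diverges.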
In [27]:
pos = trimmed_means > 0
n = len(trimmed_means)
s = trimmed_means[pos].sum()
sl = np.log(trimmed_means[pos]).sum()
def ll(x):
    # negative log-likelihood of a gamma with shape x[0] and rate x[1]
    return -1 * (n * x[0] * np.log(x[1]) - n * sp.special.gammaln(x[0]) + (x[0] - 1) * sl - x[1] * s)
In [34]:
param = sp.optimize.minimize(ll, np.asarray([2, 1]), bounds=[(np.nextafter(0, 1), None), (np.nextafter(0, 1), None)])
param
Out[34]:
In [38]:
param.x
Out[38]:
SUCCESS!
Do the p-values have a correlation with the peptide abundance?
In [42]:
mlxp = pd.read_csv('/Users/laserson/tmp/phip_analysis/sjogrens/mlxp.tsv', sep='\t', index_col=0, header=0)
inputs = pd.read_csv('/Users/laserson/repos/phage_libraries_private/human90/inputs/human90-larman1-input.tsv', sep='\t', index_col=0, header=0)
In [53]:
m = pd.merge(mlxp, inputs, left_index=True, right_index=True)
In [59]:
sample = 'Sjogrens.serum.Sjogrens.FS12-03967.20A20G.1'
In [60]:
sp.stats.pearsonr(10**(-m[sample]), m['input'])
Out[60]:
In [61]:
sp.stats.spearmanr(10**(-m[sample]), m['input'])
Out[61]:
In [62]:
fig, ax = plt.subplots()
_ = ax.scatter(10**(-m[sample]), m['input'])
Out[62]:
In [63]:
fig, ax = plt.subplots()
_ = ax.scatter(m[sample], m['input'])
In [68]:
h, xe, ye = np.histogram2d(m[sample], m['input'], bins=100)
fig, ax = plt.subplots()
_ = ax.imshow(h)
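Note that imshow puts the first axis of h on the vertical axis with the origin at the top left, so a more faithful rendering of the 2D histogram would be something like:
fig, ax = plt.subplots()
_ = ax.imshow(h.T, origin='lower', extent=[xe[0], xe[-1], ye[0], ye[-1]], aspect='auto')
_ = ax.set(xlabel=sample, ylabel='input')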
In [67]:
np.histogram2d