In [2]:
# dirs
workDir = '/home/nick/notebook/SIPSim/dev/bac_genome3/PCR_sim/'
genomeDir = '/home/nick/notebook/SIPSim/dev/bac_genome3/genomes/'
R_dir = '/home/nick/notebook/SIPSim/lib/R/'
# input files
OTU_table_file = os.path.join(workDir, 'OTU_abs1e9.txt')
# params
nprocs = 24
In [15]:
import numpy as np
import matplotlib.pyplot as plt
In [16]:
%load_ext rpy2.ipython
%matplotlib inline
In [6]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
In [247]:
%%R
NegBin = function(mu, alpha=0.5, n=1) {
mu = mu * rgamma(n, shape = alpha, scale=alpha) #rate = 1/alpha)
rpois(n, lambda = mu)
}
In [248]:
%%R
# taxon abundances
abunds = runif(1000, min=1, max=1000)
hist(abunds, 50)
In [249]:
%%R
hist(sapply(abunds, NegBin), 50)
In [250]:
%%R
# abunds with error: repeating count draw many times to increase variance estimation
alpha = 0.1
NegBin.a = function(x){
NegBin(x, alpha=alpha)
}
abund.l = list()
for (i in 1:1000){
abund.l[[i]] = sapply(abunds, NegBin.a)
}
abund.e = do.call(rbind, abund.l) %>%
as.data.frame %>% t
var.e = apply(abund.e, 1, var) %>% as.vector
var.e %>% head
In [251]:
%%R
df = data.frame(abund = abunds, var.e = var.e)
ggplot(df, aes(abund, var.e)) +
geom_point()
In [252]:
%%R
# abunds with error: repeating count draw many times to increase variance estimation
alpha = 0.5
NegBin.a = function(x){
NegBin(x, alpha=alpha)
}
abund.l = list()
for (i in 1:1000){
abund.l[[i]] = sapply(abunds, NegBin.a)
}
abund.e = do.call(rbind, abund.l) %>%
as.data.frame %>% t
var.e = apply(abund.e, 1, var) %>% as.vector
var.e %>% head
In [253]:
%%R
df = data.frame(abund = abunds, var.e = var.e)
ggplot(df, aes(abund, var.e)) +
geom_point()
In [254]:
%%R
# abunds with error: repeating count draw many times to increase variance estimation
alpha = 10
NegBin.a = function(x){
NegBin(x, alpha=alpha)
}
abund.l = list()
for (i in 1:1000){
abund.l[[i]] = sapply(abunds, NegBin.a)
}
abund.e = do.call(rbind, abund.l) %>%
as.data.frame %>% t
var.e = apply(abund.e, 1, var) %>% as.vector
var.e %>% head
In [255]:
%%R
df = data.frame(abund = abunds, var.e = var.e)
ggplot(df, aes(abund, var.e)) +
geom_point()
In [223]:
def neg_binom(mu, alpha=0.5, size=None):
mu = mu * np.random.gamma(shape=alpha, scale=alpha, size=size)
f = lambda x: np.random.poisson(size=1, lam=x)[0]
return [f(x) for x in mu]
In [224]:
counts = np.random.uniform(1000, 1, 1000)
plt.hist(counts, 50)
Out[224]:
In [260]:
n_counts = len(counts)
n_rep = 1000
alpha = 0.1
ar = np.zeros((n_rep, n_counts))
neg_binom_a = lambda x : neg_binom(x, alpha=alpha)
for i in range(n_rep):
ar[i,] = np.apply_along_axis(neg_binom_a, 0, counts)
counts_var = np.var(ar, 0)
In [261]:
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(counts, counts_var, s=1)
xl = ax.set_xlabel("Counts")
yl = ax.set_ylabel("Variance")
ax.set_xlim([0,np.max(counts)])
ax.set_ylim([0,np.max(counts_var)])
Out[261]:
In [256]:
n_counts = len(counts)
n_rep = 1000
alpha = 0.5
ar = np.zeros((n_rep, n_counts))
neg_binom_a = lambda x : neg_binom(x, alpha=alpha)
for i in range(n_rep):
ar[i,] = np.apply_along_axis(neg_binom_a, 0, counts)
counts_var = np.var(ar, 0)
In [257]:
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(counts, counts_var, s=1)
xl = ax.set_xlabel("Counts")
yl = ax.set_ylabel("Variance")
ax.set_xlim([0,np.max(counts)])
ax.set_ylim([0,np.max(counts_var)])
Out[257]:
In [258]:
n_counts = len(counts)
n_rep = 1000
alpha = 10
ar = np.zeros((n_rep, n_counts))
neg_binom_a = lambda x : neg_binom(x, alpha=alpha)
for i in range(n_rep):
ar[i,] = np.apply_along_axis(neg_binom_a, 0, counts)
counts_var = np.var(ar, 0)
In [259]:
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(counts, counts_var, s=1)
xl = ax.set_xlabel("Counts")
yl = ax.set_ylabel("Variance")
ax.set_xlim([0,np.max(counts)])
ax.set_ylim([0,np.max(counts_var)])
Out[259]:
In [ ]:
!cd $workDir; \
SIPSim OTU_add_error \
OTU_abs1e9.txt >
OTU_abs1e9_err.txt
!cd $workDir; \
head OTU_abs1e9_err.txt
In [ ]: