Adding error to OTU count values

  • Drawing from a distribution that simulates error in count data
    • For instance: over-dispersion (negative binomial)

Setting variables


In [2]:
# dirs
workDir = '/home/nick/notebook/SIPSim/dev/bac_genome3/PCR_sim/'
genomeDir = '/home/nick/notebook/SIPSim/dev/bac_genome3/genomes/'
R_dir = '/home/nick/notebook/SIPSim/lib/R/'

# input files
OTU_table_file = os.path.join(workDir, 'OTU_abs1e9.txt')

# params
nprocs = 24

Init


In [15]:
import numpy as np
import matplotlib.pyplot as plt

In [16]:
%load_ext rpy2.ipython
%matplotlib inline


The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython

In [6]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)

Testing distribution functions

Modeling in R


In [247]:
%%R
NegBin = function(mu, alpha=0.5, n=1) {
    mu = mu * rgamma(n, shape = alpha, scale=alpha) #rate = 1/alpha)
    rpois(n, lambda = mu)
}

In [248]:
%%R
# taxon abundances
abunds = runif(1000, min=1, max=1000)
hist(abunds, 50)



In [249]:
%%R
hist(sapply(abunds, NegBin), 50)



In [250]:
%%R
# abunds with error: repeating count draw many times to increase variance estimation
alpha = 0.1
NegBin.a = function(x){
    NegBin(x, alpha=alpha)
}

abund.l = list()
for (i in 1:1000){
    abund.l[[i]] = sapply(abunds, NegBin.a)
    }
abund.e = do.call(rbind, abund.l) %>%
    as.data.frame %>% t
var.e = apply(abund.e, 1, var) %>% as.vector
var.e %>% head


[1]  17.89898 174.11652 510.73830   7.64625  31.38903 180.54050

In [251]:
%%R
df = data.frame(abund = abunds, var.e = var.e)
ggplot(df, aes(abund, var.e)) +
    geom_point()



In [252]:
%%R
# abunds with error: repeating count draw many times to increase variance estimation
alpha = 0.5
NegBin.a = function(x){
    NegBin(x, alpha=alpha)
}

abund.l = list()
for (i in 1:1000){
    abund.l[[i]] = sapply(abunds, NegBin.a)
    }
abund.e = do.call(rbind, abund.l) %>%
    as.data.frame %>% t
var.e = apply(abund.e, 1, var) %>% as.vector
var.e %>% head


[1]   1678.727  27276.143 117199.021   1463.502   4276.306  19977.746

In [253]:
%%R
df = data.frame(abund = abunds, var.e = var.e)
ggplot(df, aes(abund, var.e)) +
    geom_point()



In [254]:
%%R
# abunds with error: repeating count draw many times to increase variance estimation
alpha = 10
NegBin.a = function(x){
    NegBin(x, alpha=alpha)
}

abund.l = list()
for (i in 1:1000){
    abund.l[[i]] = sapply(abunds, NegBin.a)
    }
abund.e = do.call(rbind, abund.l) %>%
    as.data.frame %>% t
var.e = apply(abund.e, 1, var) %>% as.vector
var.e %>% head


[1]  12262909 215725167 871203383  11086432  31895701 140438396

In [255]:
%%R
df = data.frame(abund = abunds, var.e = var.e)
ggplot(df, aes(abund, var.e)) +
    geom_point()


Modeling in python


In [223]:
def neg_binom(mu, alpha=0.5, size=None):
    mu = mu * np.random.gamma(shape=alpha, scale=alpha, size=size)
    f = lambda x: np.random.poisson(size=1, lam=x)[0]
    return [f(x) for x in mu]

In [224]:
counts = np.random.uniform(1000, 1, 1000)
plt.hist(counts, 50)


Out[224]:
(array([ 11.,  27.,  17.,  17.,  17.,  17.,  22.,  16.,  13.,  26.,  19.,
         15.,  20.,  14.,  19.,  13.,  22.,  28.,  19.,  18.,  21.,  21.,
         23.,  11.,  18.,  24.,  21.,  26.,  20.,  17.,  27.,  24.,  22.,
         14.,  20.,  20.,  29.,  27.,  23.,  18.,  23.,  19.,  19.,  11.,
         19.,  25.,  25.,  13.,  24.,  26.]),
 array([   1.61623873,   21.5696432 ,   41.52304767,   61.47645213,
          81.4298566 ,  101.38326107,  121.33666553,  141.29007   ,
         161.24347447,  181.19687893,  201.1502834 ,  221.10368786,
         241.05709233,  261.0104968 ,  280.96390126,  300.91730573,
         320.8707102 ,  340.82411466,  360.77751913,  380.7309236 ,
         400.68432806,  420.63773253,  440.591137  ,  460.54454146,
         480.49794593,  500.45135039,  520.40475486,  540.35815933,
         560.31156379,  580.26496826,  600.21837273,  620.17177719,
         640.12518166,  660.07858613,  680.03199059,  699.98539506,
         719.93879953,  739.89220399,  759.84560846,  779.79901292,
         799.75241739,  819.70582186,  839.65922632,  859.61263079,
         879.56603526,  899.51943972,  919.47284419,  939.42624866,
         959.37965312,  979.33305759,  999.28646206]),
 <a list of 50 Patch objects>)

In [260]:
n_counts = len(counts)
n_rep = 1000
alpha = 0.1
ar = np.zeros((n_rep, n_counts))
neg_binom_a = lambda x : neg_binom(x, alpha=alpha)
for i in range(n_rep):
    ar[i,] = np.apply_along_axis(neg_binom_a, 0, counts)

counts_var = np.var(ar, 0)

In [261]:
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(counts, counts_var, s=1)
xl = ax.set_xlabel("Counts")
yl = ax.set_ylabel("Variance")
ax.set_xlim([0,np.max(counts)])
ax.set_ylim([0,np.max(counts_var)])


Out[261]:
(0, 919.84718399997769)

In [256]:
n_counts = len(counts)
n_rep = 1000
alpha = 0.5
ar = np.zeros((n_rep, n_counts))
neg_binom_a = lambda x : neg_binom(x, alpha=alpha)
for i in range(n_rep):
    ar[i,] = np.apply_along_axis(neg_binom_a, 0, counts)

counts_var = np.var(ar, 0)

In [257]:
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(counts, counts_var, s=1)
xl = ax.set_xlabel("Counts")
yl = ax.set_ylabel("Variance")
ax.set_xlim([0,np.max(counts)])
ax.set_ylim([0,np.max(counts_var)])


Out[257]:
(0, 115118.56703599979)

In [258]:
n_counts = len(counts)
n_rep = 1000
alpha = 10
ar = np.zeros((n_rep, n_counts))
neg_binom_a = lambda x : neg_binom(x, alpha=alpha)
for i in range(n_rep):
    ar[i,] = np.apply_along_axis(neg_binom_a, 0, counts)

counts_var = np.var(ar, 0)

In [259]:
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(counts, counts_var, s=1)
xl = ax.set_xlabel("Counts")
yl = ax.set_ylabel("Variance")
ax.set_xlim([0,np.max(counts)])
ax.set_ylim([0,np.max(counts_var)])


Out[259]:
(0, 1043632103.6068968)

Adding error


In [ ]:
!cd $workDir; \
    SIPSim OTU_add_error \
    OTU_abs1e9.txt > 
    OTU_abs1e9_err.txt
    
!cd $workDir; \
    head OTU_abs1e9_err.txt

In [ ]: