Estimating the number of 16S rRNA genes per ng of DNA

  • Used to estimate the number of 16S genes in a CsCl gradient
    • Used ~5 ug of DNA per gradient

Calculations

  • Number of genomes in X DNA
    • MW of DNA
    • Size distribution of bacterial genomes
  • Number of 16S gene copies in X DNA
    • Distribution of 16S gene copy number per genome

Setting variables


In [58]:
import os

workDir = '/home/nick/notebook/SIPSim/dev/bac_genome1210/SSU_genes_per_ng_DNA/'
rnammerDir = os.path.join(workDir + 'rnammer')
genomeDir = '/home/nick/notebook/SIPSim/dev/bac_genome1210/genomes/'

Init


In [186]:
import glob
import pyfasta
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import scipy.stats as ss
from fitter import Fitter
from functools import partial

%matplotlib inline  
%load_ext rpy2.ipython


The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython

In [4]:
%%R
library(dplyr) 
library(tidyr) 
library(ggplot2)


/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: 
Attaching package: ‘dplyr’


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:stats’:

    filter, lag


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/opt/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


  res = super(Function, self).__call__(*new_args, **new_kwargs)

In [59]:
if not os.path.isdir(workDir):
    os.makedirs(workDir)
    
if not os.path.isdir(rnammerDir):
    os.makedirs(rnammerDir)

Size distribution of bacterial genomes


In [12]:
p = os.path.join(genomeDir, '*.fasta')
genomeFiles = glob.glob(p)

print 'Number of genome files: {}'.format(len(genomeFiles))


Number of genome files: 1210

Distribution of 16S gene copies per genome


In [44]:
total_seq_len = lambda x: sum([len(y) for y in x.values()])

def total_genome_lens(genome_files):
    genome_lens = {}
    for fasta in genome_files:
        name = os.path.split(fasta)[-1]
        name = os.path.splitext(name)[0]

        pyf = pyfasta.Fasta(fasta)
        genome_lens[name] = [total_seq_len(pyf)]
    return genome_lens

genome_lens = total_genome_lens(genomeFiles)

In [135]:
df_genome_len = pd.DataFrame(genome_lens).transpose()
df_genome_len


Out[135]:
0
Acaryochloris_marina_MBIC11017 6503724
Acetobacter_pasteurianus_IFO_3283-12 2904624
Acetobacterium_woodii_DSM_1030 4044777
Acetohalobium_arabaticum_DSM_5501 2469596
Acholeplasma_laidlawii_PG-8A 1496992
Achromobacter_xylosoxidans_A8 7013095
Acidaminococcus_fermentans_DSM_20731 2329769
Acidaminococcus_intestini_RyC-MR95 2487765
Acidimicrobium_ferrooxidans_DSM_10331 2158157
Acidiphilium_cryptum_JF-5 3389227
Acidiphilium_multivorum_AIU301 3749411
Acidithiobacillus_caldus_ATCC_51756 2777717
Acidithiobacillus_ferrivorans_SS3 3207552
Acidithiobacillus_ferrooxidans_ATCC_53993 2885038
Acidobacterium_capsulatum_ATCC_51196 4127356
Acidothermus_cellulolyticus_11B 2443540
Acidovorax_avenae_subsp_avenae_ATCC_19860 5482170
Acidovorax_citrulli_AAC00-1 5352772
Acidovorax_ebreus_TPSY 3796573
Acinetobacter_baumannii_BJAB07104 3951920
Acinetobacter_calcoaceticus_PHEA-2 3862530
Acinetobacter_oleivorans_DR1 4152543
Actinobacillus_equuli_subsp_equuli 2431533
Actinobacillus_pleuropneumoniae_serovar_5b_str_L20 2274482
Actinobacillus_succinogenes_130Z 2319663
Actinobacillus_suis_ATCC_33415 2501598
Actinoplanes_friuliensis_DSM_7358 9376071
Actinoplanes_missouriensis_431 8773466
Actinosynnema_mirum_DSM_43827 8248144
Adlercreutzia_equolifaciens_DSM_19450 2862526
... ...
Wolbachia_endosymbiont_of_Onchocerca_volvulus_str_Cameroon 960618
Wolbachia_endosymbiont_strain_TRS_of_Brugia_malayi 1080084
Wolbachia_endosymbiont_wPip_Mol_of_Culex_molestus 1340443
Xanthobacter_autotrophicus_Py2 5308934
Xanthomonas_albilineans_GPE_PC73 3768695
Xanthomonas_axonopodis_Xac29-1 5153455
Xanthomonas_axonopodis_pv_citri_str_306 5175554
Xanthomonas_axonopodis_pv_citrumelo_F1 4967469
Xanthomonas_campestris_pv_raphani_756C 4941214
Xanthomonas_campestris_pv_vesicatoria_str_85-10 5178466
Xanthomonas_oryzae_pv_oryzae_PXO99A 5240075
Xenorhabdus_bovienii_SS-2004 4225498
Xylanimonas_cellulosilytica_DSM_15894 3742776
Xylella_fastidiosa_subsp_fastidiosa_GB514 2491203
Yersinia_aldovae_670-83 4471090
Yersinia_enterocolitica_subsp_palearctica_105_5R_r 4552107
Yersinia_frederiksenii_Y225 4495532
Yersinia_pestis_biovar_Microtus_str_91001 4595065
Yersinia_pseudotuberculosis_YPIII 4689436
Zunongwangia_profunda_SM-A87 5128187
Zymomonas_mobilis_subsp_mobilis_NRRL_B-12526 2011869
_Bacillus_selenitireducens_MLS10 3592487
_Cellvibrio_gilvus_ATCC_13127 3526441
_Clostridium_acidurici_9a 3105335
_Clostridium_clariflavum_DSM_19732 4897678
_Clostridium_saccharolyticum_WM1 4662871
_Clostridium_stercorarium_subsp_stercorarium_DSM_8532 2970010
_Eubacterium_eligens_ATCC_27750 2144190
cyanobacterium_endosymbiont_of_Epithemia_turgida_isolate_EtSB_Lake_Yunoko 2794318
uncultured_Termite_group_1_bacterium_phylotype_Rs-D17 1125857

1210 rows × 1 columns


In [57]:
fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(df.ix[:,0], bins=20)


Out[57]:
(array([  33.,  109.,  127.,  186.,  153.,  139.,  138.,  132.,   51.,
          55.,   26.,   12.,   14.,   11.,    7.,   10.,    4.,    0.,
           2.,    1.]),
 array([   112091. ,    758175.4,   1404259.8,   2050344.2,   2696428.6,
          3342513. ,   3988597.4,   4634681.8,   5280766.2,   5926850.6,
          6572935. ,   7219019.4,   7865103.8,   8511188.2,   9157272.6,
          9803357. ,  10449441.4,  11095525.8,  11741610.2,  12387694.6,
         13033779. ]),
 <a list of 20 Patch objects>)

Fitting distribution


In [136]:
fo = Fitter(df_genome_len.ix[:,0])
fo.fit()


Searching best parameters for distribution alpha (error 7.25602686672e-13)
Searching best parameters for distribution anglit (error 5.2242181279e-13)
Searching best parameters for distribution arcsine (error 1.07314649197e-12)
Searching best parameters for distribution beta (error 8.32883368436e-14)
Searching best parameters for distribution betaprime (error 2.88429127954e-13)
Searching best parameters for distribution bradford (error 4.74183691784e-13)
Searching best parameters for distribution burr (error 3.35297185381e-13)
Searching best parameters for distribution cauchy (error 2.15268001783e-13)
Searching best parameters for distribution chi (error 1.28069608916e-12)
Searching best parameters for distribution chi2 (error 1.28069608916e-12)
Searching best parameters for distribution cosine (error 3.8892791496e-13)
Searching best parameters for distribution dgamma (error 1.03236993114e-13)
Searching best parameters for distribution dweibull (error 1.0283080456e-13)
Searching best parameters for distribution erlang (error 1.28069608916e-12)
Searching best parameters for distribution expon (error 1.28069608916e-12)
Searching best parameters for distribution exponnorm (error 1.00036687693e-13)
Searching best parameters for distribution exponpow (error 1.28069608916e-12)
Searching best parameters for distribution exponweib (error 1.46368686676e-12)
Searching best parameters for distribution f (error 2.79446814292e-13)
Searching best parameters for distribution fatiguelife (error 9.74931729406e-13)
Searching best parameters for distribution fisk (error 1.15293917893e-13)
Searching best parameters for distribution foldcauchy (error 1.64126123516e-13)
Searching best parameters for distribution foldnorm (error 1.28069608916e-12)
Searching best parameters for distribution frechet_l (error 1.28069608916e-12)
Searching best parameters for distribution frechet_r (error 1.25169103418e-12)
Searching best parameters for distribution gamma (error 8.40678606745e-14)
Searching best parameters for distribution gausshyper (error 8.32734624059e-14)
Searching best parameters for distribution genexpon (error 8.57982192686e-13)
Searching best parameters for distribution genextreme (error 1.28304601714e-12)
Searching best parameters for distribution gengamma (error 1.70801807676e-12)
Searching best parameters for distribution genhalflogistic (error 2.17720106944e-13)
Searching best parameters for distribution genlogistic (error 8.86296556742e-14)
Searching best parameters for distribution gennorm (error 1.28049833922e-13)
Searching best parameters for distribution genpareto (error 3.52458677122e-13)
Searching best parameters for distribution gilbrat (error 2.93947683525e-13)
Searching best parameters for distribution gompertz (error 1.28069608916e-12)
Searching best parameters for distribution gumbel_l (error 2.93340127703e-13)
Searching best parameters for distribution gumbel_r (error 8.86223678302e-14)
Searching best parameters for distribution halfcauchy (error 4.63805862992e-13)
Searching best parameters for distribution halfgennorm (error 1.99402775968e-12)
Searching best parameters for distribution halflogistic (error 1.28069608916e-12)
Searching best parameters for distribution halfnorm (error 1.28069608916e-12)
Searching best parameters for distribution hypsecant (error 1.29867517344e-13)
Searching best parameters for distribution invgamma (error 9.22931664052e-14)
Searching best parameters for distribution invgauss (error 5.1664699667e-13)
Searching best parameters for distribution invweibull (error 1.04947351678e-13)
Searching best parameters for distribution johnsonsb (error 8.52895275965e-14)
Searching best parameters for distribution johnsonsu (error 8.72304514835e-14)
Searching best parameters for distribution ksone (error nan)
Searching best parameters for distribution kstwobign (error 8.37088802271e-14)
Searching best parameters for distribution laplace (error 2.02370486484e-13)
Searching best parameters for distribution levy (error 5.16646234433e-13)
Searching best parameters for distribution levy_l (error 1.00156126379e-12)

SKIPPED levy_stable distribution (taking more than 10 seconds)
Searching best parameters for distribution loggamma (error 1.2917250732e-13)
Searching best parameters for distribution logistic (error 1.17231202738e-13)
Searching best parameters for distribution loglaplace (error 1.70078697789e-13)
Searching best parameters for distribution lognorm (error 1.30238684302e-12)
Searching best parameters for distribution lomax (error 4.99356800565e-13)
Searching best parameters for distribution maxwell (error 9.18377452962e-14)
Searching best parameters for distribution mielke (error 4.60049633971e-13)
Searching best parameters for distribution nakagami (error 8.33576964709e-14)
Searching best parameters for distribution ncf (error 1.27808122357e-12)
Searching best parameters for distribution nct (error inf)
Searching best parameters for distribution ncx2 (error inf)
Searching best parameters for distribution norm (error 1.24897056606e-13)
Searching best parameters for distribution pareto (error 3.69937503368e-12)
Searching best parameters for distribution pearson3 (error 8.40678599194e-14)
Searching best parameters for distribution powerlaw (error 6.45341220129e-13)
Searching best parameters for distribution powerlognorm (error 1.69235156732e-12)
Searching best parameters for distribution powernorm (error nan)
Searching best parameters for distribution rayleigh (error 8.29793098903e-14)
Searching best parameters for distribution rdist (error 1.24754566513e-13)
Searching best parameters for distribution recipinvgauss (error 1.28069608916e-12)
Searching best parameters for distribution reciprocal (error 1.28069608916e-12)
Searching best parameters for distribution rice (error 8.29793128794e-14)

SKIPPED rv_continuous distribution (taking more than 10 seconds)
Searching best parameters for distribution semicircular (error 6.29721133996e-13)
Searching best parameters for distribution t (error 1.93858232903e-12)
Searching best parameters for distribution triang (error 2.54141042783e-13)
Searching best parameters for distribution truncexpon (error 4.42424263834e-13)
Searching best parameters for distribution truncnorm (error 1.28069608916e-12)
Searching best parameters for distribution tukeylambda (error 2.89394361718e-13)
Searching best parameters for distribution uniform (error 6.81786179825e-13)
Searching best parameters for distribution vonmises (error 4.59243249168e+46)
Searching best parameters for distribution vonmises_line (error 6.86179910576e-13)
Searching best parameters for distribution wald (error 1.28069608916e-12)
Searching best parameters for distribution weibull_max (error 1.28069608916e-12)
Searching best parameters for distribution weibull_min (error 1.25169103418e-12)
Searching best parameters for distribution wrapcauchy (error nan)

In [137]:
fo.summary()


            sumsquare_error
rayleigh       8.297931e-14
rice           8.297931e-14
gausshyper     8.327346e-14
beta           8.328834e-14
nakagami       8.335770e-14

In [138]:
genome_len_best_fit = fo.fitted_param['rayleigh']
genome_len_best_fit


Out[138]:
(-18067.751071571984, 2937372.194656291)

In [139]:
# test of distribution
x = ss.rayleigh.rvs(*genome_len_best_fit, size=10000)
fig = plt.figure()
ax = plt.subplot(111)
ax.hist(x, bins=50)
fig.show()


Distribution of 16S gene copies per genome

rnammer run


In [60]:
%%bash -s "$genomeDir" "$rnammerDir"

find $1 -name "*fasta" | \
    perl -pe 's/.+\/|\.fasta//g' | \
    xargs -n 1 -I % -P 30 bash -c \
    "rnammer -S bac -m ssu -gff $2/%_rrn.gff -f $2/%_rrn.fna -xml $2/%_rrn.xml < $1/%.fasta"

In [61]:
## Summarizing the results

!cd $rnammerDir; \
    egrep -v "^#" *.gff | \
    grep "16s_rRNA" | \
    perl -pe 's/:/\t/' > ssu_summary.txt

In [140]:
inFile = os.path.join(rnammerDir, 'ssu_summary.txt')

inFH = open(inFile, 'rb')
df_ssu = pd.read_csv(inFH, sep='\t', header=None)
df_ssu.head()


Out[140]:
0 1 2 3 4 5 6 7 8 9 10
0 Acaryochloris_marina_MBIC11017_rrn.gff CP000828_Acaryochloris_marina_MBIC11017 RNAmmer-1.2 rRNA 5636180 5637670 1853.9 + . 16s_rRNA NaN
1 Acaryochloris_marina_MBIC11017_rrn.gff CP000828_Acaryochloris_marina_MBIC11017 RNAmmer-1.2 rRNA 1409155 1410645 1853.9 - . 16s_rRNA NaN
2 Acetobacter_pasteurianus_IFO_3283-12_rrn.gff AP011170_Acetobacter_pasteurianus_IFO_3283_12_DNA RNAmmer-1.2 rRNA 2764963 2766438 1933.6 - . 16s_rRNA NaN
3 Acetobacter_pasteurianus_IFO_3283-12_rrn.gff AP011170_Acetobacter_pasteurianus_IFO_3283_12_DNA RNAmmer-1.2 rRNA 1733300 1734775 1933.6 - . 16s_rRNA NaN
4 Acetobacter_pasteurianus_IFO_3283-12_rrn.gff AP011170_Acetobacter_pasteurianus_IFO_3283_12_DNA RNAmmer-1.2 rRNA 2285489 2286964 1933.6 - . 16s_rRNA NaN

In [141]:
fig = plt.figure()
ax = plt.subplot(111)
ax.hist(df_ssu.ix[:,6], bins=50)
fig.show()



In [143]:
# filtering by gene length of >= 1000 bp
df_ssu_f = df_ssu.loc[df[6] >= 1000]
df_ssu_f.head()


Out[143]:
0 1 2 3 4 5 6 7 8 9 10
0 Acaryochloris_marina_MBIC11017_rrn.gff CP000828_Acaryochloris_marina_MBIC11017 RNAmmer-1.2 rRNA 5636180 5637670 1853.9 + . 16s_rRNA NaN
1 Acaryochloris_marina_MBIC11017_rrn.gff CP000828_Acaryochloris_marina_MBIC11017 RNAmmer-1.2 rRNA 1409155 1410645 1853.9 - . 16s_rRNA NaN
2 Acetobacter_pasteurianus_IFO_3283-12_rrn.gff AP011170_Acetobacter_pasteurianus_IFO_3283_12_DNA RNAmmer-1.2 rRNA 2764963 2766438 1933.6 - . 16s_rRNA NaN
3 Acetobacter_pasteurianus_IFO_3283-12_rrn.gff AP011170_Acetobacter_pasteurianus_IFO_3283_12_DNA RNAmmer-1.2 rRNA 1733300 1734775 1933.6 - . 16s_rRNA NaN
4 Acetobacter_pasteurianus_IFO_3283-12_rrn.gff AP011170_Acetobacter_pasteurianus_IFO_3283_12_DNA RNAmmer-1.2 rRNA 2285489 2286964 1933.6 - . 16s_rRNA NaN

In [144]:
# counting number of 16S genes per genome
ssu_count = Counter(df_ssu_f[1])

In [145]:
ssu_max = max(ssu_count.values())

# plotting distribution
fig = plt.figure()
ax = plt.subplot(111)
ax.hist(ssu_count.values(), bins=ssu_max)
fig.show()


Fitting distribution


In [146]:
fo = Fitter(ssu_count.values())
fo.fit()


Searching best parameters for distribution alpha (error 6.62649287152)
Searching best parameters for distribution anglit (error 6.95100720304)
Searching best parameters for distribution arcsine (error 6.66839093441)
Searching best parameters for distribution beta (error 6.06120941574)
Searching best parameters for distribution betaprime (error 5.93365920893)
Searching best parameters for distribution bradford (error 6.5319897191)
Searching best parameters for distribution burr (error 5.94627173236)
Searching best parameters for distribution cauchy (error 6.76621198964)
Searching best parameters for distribution chi (error 5.62886491766)
Searching best parameters for distribution chi2 (error 6.15477270179)
Searching best parameters for distribution cosine (error 6.91162144256)
Searching best parameters for distribution dgamma (error 6.76520787584)
Searching best parameters for distribution dweibull (error 6.77318090778)
Searching best parameters for distribution erlang (error 6.21095395966)
Searching best parameters for distribution expon (error 6.3799593687)
Searching best parameters for distribution exponnorm (error 6.37951609583)
Searching best parameters for distribution exponpow (error 5.8323919002)
Searching best parameters for distribution exponweib (error 6.09672404085)
Searching best parameters for distribution f (error 5.79345858675)
Searching best parameters for distribution fatiguelife (error 6.27394346353)
Searching best parameters for distribution fisk (error 6.18931485601)
Searching best parameters for distribution foldcauchy (error 6.45432598182)
Searching best parameters for distribution foldnorm (error 6.5449462702)
Searching best parameters for distribution frechet_l (error 9.86246230516)
Searching best parameters for distribution frechet_r (error 5.71163622526)
Searching best parameters for distribution gamma (error 6.13289767029)
Searching best parameters for distribution gausshyper (error 6.29098579705)
Searching best parameters for distribution genexpon (error 6.38267850517)
Searching best parameters for distribution genextreme (error 6.61099429285)
Searching best parameters for distribution gengamma (error 6.26426383232)
Searching best parameters for distribution genhalflogistic (error 6.49211910248)
Searching best parameters for distribution genlogistic (error 6.68625635777)
Searching best parameters for distribution gennorm (error 6.76783059949)
Searching best parameters for distribution genpareto (error 6.38500207922)
Searching best parameters for distribution gilbrat (error 6.54597057997)
Searching best parameters for distribution gompertz (error 6.38654457108)
Searching best parameters for distribution gumbel_l (error 7.02886512737)
Searching best parameters for distribution gumbel_r (error 6.68574707364)
Searching best parameters for distribution halfcauchy (error 6.45367411633)
Searching best parameters for distribution halfgennorm (error 6.03360768011)
Searching best parameters for distribution halflogistic (error 6.49213669043)
Searching best parameters for distribution halfnorm (error 6.55212261459)
Searching best parameters for distribution hypsecant (error 6.77415408225)
Searching best parameters for distribution invgamma (error 6.60188332111)
Searching best parameters for distribution invgauss (error 6.55850307511)
Searching best parameters for distribution invweibull (error 6.61098847068)
Searching best parameters for distribution johnsonsb (error 5.95173395387)
Searching best parameters for distribution johnsonsu (error 6.54341025542)
Searching best parameters for distribution ksone (error nan)
Searching best parameters for distribution kstwobign (error 6.69285243595)
Searching best parameters for distribution laplace (error 6.76884823559)
Searching best parameters for distribution levy (error 6.31096999024)
Searching best parameters for distribution levy_l (error 7.54476996074)

SKIPPED levy_stable distribution (taking more than 10 seconds)
Searching best parameters for distribution loggamma (error 6.84622916462)
Searching best parameters for distribution logistic (error 6.79245601058)
Searching best parameters for distribution loglaplace (error 6.67084848615)
Searching best parameters for distribution lognorm (error 6.19602793422)
Searching best parameters for distribution lomax (error 6.38008443435)
Searching best parameters for distribution maxwell (error 6.77442577479)
Searching best parameters for distribution mielke (error 6.16802023451)
Searching best parameters for distribution nakagami (error 5.84997440138)
Searching best parameters for distribution ncf (error 6.59490475857)
Searching best parameters for distribution nct (error 6.61134006666)
Searching best parameters for distribution ncx2 (error 5.85400517339)
Searching best parameters for distribution norm (error 6.84305413682)
Searching best parameters for distribution pareto (error 6.31852573307)
Searching best parameters for distribution pearson3 (error 5.86859596589)
Searching best parameters for distribution powerlaw (error 5.71726168995)
Searching best parameters for distribution powerlognorm (error 6.12120501206)
Searching best parameters for distribution powernorm (error nan)
Searching best parameters for distribution rayleigh (error 6.74999794149)
Searching best parameters for distribution rdist (error 7.4232407051)
Searching best parameters for distribution recipinvgauss (error 5.7804522244)
Searching best parameters for distribution reciprocal (error 7.75811692214)
Searching best parameters for distribution rice (error 6.75000357119)

SKIPPED rv_continuous distribution (taking more than 10 seconds)
Searching best parameters for distribution semicircular (error 7.01897536044)
Searching best parameters for distribution t (error 6.77284965607)
Searching best parameters for distribution triang (error 6.81988808403)
Searching best parameters for distribution truncexpon (error 6.95673350175)
Searching best parameters for distribution truncnorm (error 7.75811692214)
Searching best parameters for distribution tukeylambda (error 6.91567304891)
Searching best parameters for distribution uniform (error 7.24791284051)
Searching best parameters for distribution vonmises (error 4263968.93151)
Searching best parameters for distribution vonmises_line (error 6.78496442277)
Searching best parameters for distribution wald (error 6.55314032693)
Searching best parameters for distribution weibull_max (error 9.86246230516)
Searching best parameters for distribution weibull_min (error 5.71163622526)
Searching best parameters for distribution wrapcauchy (error nan)

In [147]:
fo.summary()


               sumsquare_error
chi                   5.628865
weibull_min           5.711636
frechet_r             5.711636
powerlaw              5.717262
recipinvgauss         5.780452

In [148]:
ssu_ray_fit = fo.fitted_param['rayleigh']
ssu_ray_fit


Out[148]:
(-0.42093818866908678, 3.3743881962637636)

In [149]:
# test of distribution
x = ss.rayleigh.rvs(*ssu_ray_fit, size=10000)
fig = plt.figure()
ax = plt.subplot(111)
ax.hist(x, bins=50)
fig.show()



In [150]:
ssu_beta_fit = fo.fitted_param['beta']
ssu_beta_fit


Out[150]:
(0.74318499540524674,
 5.6551351105552428,
 0.99999999999999989,
 18.430935404032915)

In [151]:
# test of distribution
x = ss.beta.rvs(*ssu_beta_fit, size=10000)
fig = plt.figure()
ax = plt.subplot(111)
ax.hist(x, bins=50)
fig.show()


Notes

  • Using rayleigh distribution

Monte Carlo estimation of 16S gene copies per ng of DNA

  • M.W. of dsDNA = (# nucleotides x 607.4) + 157.9

In [123]:
# example of calculations
gradient_DNA_conc = 1e-9    # g of DNA
avogadro = 6.022e23         # molecules/mole

genome_len = 4000000
mw_genome = genome_len * 607.4 + 157.9

n_genomes = gradient_DNA_conc / mw_genome * avogadro

ssu_copy_per_genome = 4

n_genomes * ssu_copy_per_genome


Out[123]:
991438.8555530971

In [195]:
def SSU_copies_in_ng_DNA(DNA_conc, genome_len, ssu_copy_per_genome):
    
    DNA_conc__g = DNA_conc * 1e-9     # ng --> g of DNA
    avogadros = 6.022e23              # molecules/mole

    mw_genome = genome_len * 607.4 + 157.9
    n_genomes = DNA_conc__g / mw_genome * avogadros
    ssu_copies = n_genomes * ssu_copy_per_genome
    
    return ssu_copies

# run
SSU_copies_in_ng_DNA(1, 4000000, 4)


Out[195]:
991438.8555530971

In [197]:
def SSU_copies_MC(DNA_conc, genome_len_dist, ssu_copy_dist, n=100000):
    n_copy_dist = []
    for i in range(n):
        genome_len = genome_len_dist(size=1)[0]
        ssu_copy_per_genome = ssu_copy_dist(size=1)[0]
        n_copies = SSU_copies_in_ng_DNA(DNA_conc, genome_len, ssu_copy_per_genome)
        n_copy_dist.append(n_copies)                     
    return n_copy_dist

In [198]:
# distribution functions
genome_len_dist = partial(ss.rayleigh.rvs, *genome_len_best_fit)
ssu_copy_dist = partial(ss.rayleigh.rvs, *ssu_ray_fit)

# monte carlo estimation of ssu copies in a gradient
gradient_dna_conc__ng = 5000
n_copy_dist = SSU_copies_MC(gradient_dna_conc__ng, genome_len_dist, ssu_copy_dist, n=10000)

In [199]:
fig = plt.figure()
ax = plt.subplot(111)
ax.hist(n_copy_dist, bins=50)
fig.show()



In [200]:
median_copy = int(np.median(n_copy_dist))
std_copy = int(np.std(n_copy_dist))

print 'Number of SSU copies in {} ng of DNA: {} +/- {}'.format(gradient_dna_conc__ng, median_copy, std_copy)


Number of SSU copies in 5000 ng of DNA: 5097287217 +/- 14626056789

In [201]:
def median_confidence_interval(data, confidence=0.95):
    a = 1.0*np.array(data)
    n = len(a)
    m, se = np.median(a), ss.sem(a)
    h = se * ss.t._ppf((1+confidence)/2., n-1)
    return m, m-h, m+h

In [204]:
mci = median_confidence_interval(n_copy_dist)
mci = map(int, mci)

# lci,hci = ss.norm.interval(0.05, loc=np.mean(n_copy_dist), scale=np.std(n_copy_dist))
# copy_median = np.median(n_copy_dist)
# mci = [copy_median, copy_median - lci, copy_median + hci]

print 'Number of SSU copies in {} ng of DNA: {:,d} (low:{:,d}, high:{:,d})'.format(gradient_dna_conc__ng, *mci)


Number of SSU copies in 5000 ng of DNA: 5,097,287,217 (low:4,810,572,731, high:5,384,001,703)

In [ ]: