In [58]:
import os
workDir = '/home/nick/notebook/SIPSim/dev/bac_genome1210/SSU_genes_per_ng_DNA/'
rnammerDir = os.path.join(workDir + 'rnammer')
genomeDir = '/home/nick/notebook/SIPSim/dev/bac_genome1210/genomes/'
In [186]:
import glob
import pyfasta
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import scipy.stats as ss
from fitter import Fitter
from functools import partial
%matplotlib inline
%load_ext rpy2.ipython
In [4]:
%%R
library(dplyr)
library(tidyr)
library(ggplot2)
In [59]:
if not os.path.isdir(workDir):
os.makedirs(workDir)
if not os.path.isdir(rnammerDir):
os.makedirs(rnammerDir)
In [12]:
p = os.path.join(genomeDir, '*.fasta')
genomeFiles = glob.glob(p)
print 'Number of genome files: {}'.format(len(genomeFiles))
In [44]:
total_seq_len = lambda x: sum([len(y) for y in x.values()])
def total_genome_lens(genome_files):
genome_lens = {}
for fasta in genome_files:
name = os.path.split(fasta)[-1]
name = os.path.splitext(name)[0]
pyf = pyfasta.Fasta(fasta)
genome_lens[name] = [total_seq_len(pyf)]
return genome_lens
genome_lens = total_genome_lens(genomeFiles)
In [135]:
df_genome_len = pd.DataFrame(genome_lens).transpose()
df_genome_len
Out[135]:
In [57]:
fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(df.ix[:,0], bins=20)
Out[57]:
In [136]:
fo = Fitter(df_genome_len.ix[:,0])
fo.fit()
In [137]:
fo.summary()
In [138]:
genome_len_best_fit = fo.fitted_param['rayleigh']
genome_len_best_fit
Out[138]:
In [139]:
# test of distribution
x = ss.rayleigh.rvs(*genome_len_best_fit, size=10000)
fig = plt.figure()
ax = plt.subplot(111)
ax.hist(x, bins=50)
fig.show()
In [60]:
%%bash -s "$genomeDir" "$rnammerDir"
find $1 -name "*fasta" | \
perl -pe 's/.+\/|\.fasta//g' | \
xargs -n 1 -I % -P 30 bash -c \
"rnammer -S bac -m ssu -gff $2/%_rrn.gff -f $2/%_rrn.fna -xml $2/%_rrn.xml < $1/%.fasta"
In [61]:
## Summarizing the results
!cd $rnammerDir; \
egrep -v "^#" *.gff | \
grep "16s_rRNA" | \
perl -pe 's/:/\t/' > ssu_summary.txt
In [140]:
inFile = os.path.join(rnammerDir, 'ssu_summary.txt')
inFH = open(inFile, 'rb')
df_ssu = pd.read_csv(inFH, sep='\t', header=None)
df_ssu.head()
Out[140]:
In [141]:
fig = plt.figure()
ax = plt.subplot(111)
ax.hist(df_ssu.ix[:,6], bins=50)
fig.show()
In [143]:
# filtering by gene length of >= 1000 bp
df_ssu_f = df_ssu.loc[df[6] >= 1000]
df_ssu_f.head()
Out[143]:
In [144]:
# counting number of 16S genes per genome
ssu_count = Counter(df_ssu_f[1])
In [145]:
ssu_max = max(ssu_count.values())
# plotting distribution
fig = plt.figure()
ax = plt.subplot(111)
ax.hist(ssu_count.values(), bins=ssu_max)
fig.show()
In [146]:
fo = Fitter(ssu_count.values())
fo.fit()
In [147]:
fo.summary()
In [148]:
ssu_ray_fit = fo.fitted_param['rayleigh']
ssu_ray_fit
Out[148]:
In [149]:
# test of distribution
x = ss.rayleigh.rvs(*ssu_ray_fit, size=10000)
fig = plt.figure()
ax = plt.subplot(111)
ax.hist(x, bins=50)
fig.show()
In [150]:
ssu_beta_fit = fo.fitted_param['beta']
ssu_beta_fit
Out[150]:
In [151]:
# test of distribution
x = ss.beta.rvs(*ssu_beta_fit, size=10000)
fig = plt.figure()
ax = plt.subplot(111)
ax.hist(x, bins=50)
fig.show()
In [123]:
# example of calculations
gradient_DNA_conc = 1e-9 # g of DNA
avogadro = 6.022e23 # molecules/mole
genome_len = 4000000
mw_genome = genome_len * 607.4 + 157.9
n_genomes = gradient_DNA_conc / mw_genome * avogadro
ssu_copy_per_genome = 4
n_genomes * ssu_copy_per_genome
Out[123]:
In [195]:
def SSU_copies_in_ng_DNA(DNA_conc, genome_len, ssu_copy_per_genome):
DNA_conc__g = DNA_conc * 1e-9 # ng --> g of DNA
avogadros = 6.022e23 # molecules/mole
mw_genome = genome_len * 607.4 + 157.9
n_genomes = DNA_conc__g / mw_genome * avogadros
ssu_copies = n_genomes * ssu_copy_per_genome
return ssu_copies
# run
SSU_copies_in_ng_DNA(1, 4000000, 4)
Out[195]:
In [197]:
def SSU_copies_MC(DNA_conc, genome_len_dist, ssu_copy_dist, n=100000):
n_copy_dist = []
for i in range(n):
genome_len = genome_len_dist(size=1)[0]
ssu_copy_per_genome = ssu_copy_dist(size=1)[0]
n_copies = SSU_copies_in_ng_DNA(DNA_conc, genome_len, ssu_copy_per_genome)
n_copy_dist.append(n_copies)
return n_copy_dist
In [198]:
# distribution functions
genome_len_dist = partial(ss.rayleigh.rvs, *genome_len_best_fit)
ssu_copy_dist = partial(ss.rayleigh.rvs, *ssu_ray_fit)
# monte carlo estimation of ssu copies in a gradient
gradient_dna_conc__ng = 5000
n_copy_dist = SSU_copies_MC(gradient_dna_conc__ng, genome_len_dist, ssu_copy_dist, n=10000)
In [199]:
fig = plt.figure()
ax = plt.subplot(111)
ax.hist(n_copy_dist, bins=50)
fig.show()
In [200]:
median_copy = int(np.median(n_copy_dist))
std_copy = int(np.std(n_copy_dist))
print 'Number of SSU copies in {} ng of DNA: {} +/- {}'.format(gradient_dna_conc__ng, median_copy, std_copy)
In [201]:
def median_confidence_interval(data, confidence=0.95):
a = 1.0*np.array(data)
n = len(a)
m, se = np.median(a), ss.sem(a)
h = se * ss.t._ppf((1+confidence)/2., n-1)
return m, m-h, m+h
In [204]:
mci = median_confidence_interval(n_copy_dist)
mci = map(int, mci)
# lci,hci = ss.norm.interval(0.05, loc=np.mean(n_copy_dist), scale=np.std(n_copy_dist))
# copy_median = np.median(n_copy_dist)
# mci = [copy_median, copy_median - lci, copy_median + hci]
print 'Number of SSU copies in {} ng of DNA: {:,d} (low:{:,d}, high:{:,d})'.format(gradient_dna_conc__ng, *mci)
In [ ]: