In [1]:
import os
workDir = '/var/seq_data/ncbi_db/genome/Jan2016/ampFragsGC/'
ampFragFile = '/var/seq_data/ncbi_db/genome/Jan2016/ampFrags_KDE.pkl'
otuFile = '/var/seq_data/ncbi_db/genome/Jan2016/rnammer_aln/otusn_map_nonSingle.txt'
In [2]:
import dill
import numpy as np
import pandas as pd
%load_ext rpy2.ipython
%load_ext pushnote
In [3]:
%%R
library(dplyr)
library(tidyr)
library(ggplot2)
In [4]:
if not os.path.isdir(workDir):
os.makedirs(workDir)
%cd $workDir
In [5]:
# max 13C shift
max_13C_shift_in_BD = 0.036
# min BD (that we care about)
min_GC = 13.5
min_BD = min_GC/100.0 * 0.098 + 1.66
# max BD (that we care about)
max_GC = 80
max_BD = max_GC / 100.0 * 0.098 + 1.66 # 80.0% G+C
max_BD = max_BD + max_13C_shift_in_BD
## BD range of values
BD_vals = np.arange(min_BD, max_BD, 0.001)
In [6]:
infoFile = os.path.splitext(ampFragFile)[0] + '_info.txt'
infoFile = os.path.join(workDir, os.path.split(infoFile)[1])
!SIPSim KDE_info -s $ampFragFile > $infoFile
!wc -l $infoFile
!head -n 4 $infoFile
In [7]:
%%R -i infoFile
df.info = read.delim(infoFile, sep='\t') %>%
mutate(genus_ID = gsub('_.+', '', taxon_ID),
species_ID = gsub('^([^_]+_[^_]+).+', '\\1', taxon_ID))
df.info %>% head(n=3)
In [13]:
%%R -i otuFile
df.OTU = read.delim(otuFile, sep='\t', header=FALSE) %>%
mutate(genome_ID = gsub('\\.fna', '', V13)) %>%
select(genome_ID, V2) %>%
rename('OTU_ID' = V2)
df.info.j = inner_join(df.info, df.OTU, c('taxon_ID' = 'genome_ID'))
df.OTU = NULL
df.info.j %>% head(n=3)
In [31]:
%%R
df.info.j.f1 = df.info.j %>%
filter(KDE_ID == 1) %>%
distinct(taxon_ID, OTU_ID) %>%
group_by(OTU_ID) %>%
mutate(n_taxa = n()) %>%
ungroup() %>%
filter(n_taxa > 1)
df.info.j.f1 %>% nrow %>% print
df.info.j.f1 %>% head(n=3) %>% as.data.frame
In [44]:
%%R -h 4000 -w 800
df.info.j.f1$taxon_ID = reorder(df.info.j.f1$taxon_ID, df.info.j.f1$genus_ID)
df.info.j.f1$OTU_ID = reorder(df.info.j.f1$OTU_ID, -df.info.j.f1$n_taxa)
ggplot(df.info.j.f1, aes(x=taxon_ID, y=median,
ymin=percentile_25, ymax=percentile_75,
color=species_ID)) +
geom_linerange() +
geom_point(size=1) +
facet_wrap(~ OTU_ID, scales='free_x', ncol=8) +
theme_bw() +
theme(
axis.text.x = element_blank(),
legend.position='none'
)
The large figure that I’ve attached shows what I’ve been suspecting: we can’t really match up OTUs from our dataset to the NCBI genomes, so an ‘apples to apples’ comparison for the simulations at the OTU-level is not really feasible.
The plots are showing the amplicon-fragment G+C (BD) variability of each genome that falls into each non-singleton OTU. The line range is 25th percentile to 75th percentile. The point is the median. Colors indicate genus.
Some of the OTUs are very consistent (eg., OTU.465), but many others (OTU.876) are not. So, the amplicon-fragments of OTUs in our datasets will not necessarily match the reference genomes that I’m using for the simulations.
Also of note: some OTUs aggregate genomes as the genus level, but this would probably happen with most clustering methods unless we used at cutoff of ~100% seqID.
In [ ]: