In [31]:
workDir = '/home/nick/notebook/SIPSim/dev/fullCyc/'
physeqDir = '/home/nick/notebook/fullCyc/data/MiSeq_16S/515f-806r/V4_Lib1-7/phyloseq/'
physeqBulkCore = 'bulk-core'
repSeqs = '/home/nick/notebook/fullCyc/data/MiSeq_16S/515f-806r/V4_Lib1-7/OTU_binning/otusn.fasta'
In [32]:
%load_ext rpy2.ipython
In [34]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(phyloseq)
In [35]:
%%R -i workDir -i physeqDir -i physeqBulkCore
cat('variables loaded in R\n')
In [36]:
%%R
dir.create(workDir, showWarnings=FALSE)
In [37]:
%%R
# bulk core samples
F = file.path(physeqDir, physeqBulkCore)
physeq.bulk = readRDS(F)
physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk
In [38]:
%%R
# just 12C-Con samples
physeq.bulk.12C = prune_samples(physeq.bulk.m$Substrate == '12C-Con', physeq.bulk) %>%
filter_taxa(function(x) sum(x) > 0, TRUE)
physeq.bulk.12C.m = physeq.bulk.12C %>% sample_data
physeq.bulk.12C
In [43]:
%%R
# list of OTUs
x = physeq.bulk.12C %>% otu_table %>% rownames %>%
as.data.frame %>%
mutate(OTU = sapply(., function(x) paste(c('>OTU.', x, '$'), collapse=''))) %>%
select(OTU)
outFile = file.path(workDir, 'bulk-core_12C-Con_OTUs.txt')
write.table(x, outFile, quote=FALSE, row.names=FALSE, col.names=FALSE, sep='\t')
In [46]:
# parsing OTU rep sequences
%Rpull outFile
outFile = outFile[0]
## output file
OTU_reps = os.path.join(workDir, 'OTU_reps.fna')
## parsing with egrep
!cd $workDir; \
egrep -A 1 -f $outFile $repSeqs | \
perl -ne 'print unless /^-+$/' > $OTU_reps
## checking file
!printf "Number of OTUs: "
!cd $workDir; \
grep -c ">" $OTU_reps
In [ ]:
In [ ]: