Pulling out rep 16S sequences for all 12-Con_bulk_core samples


In [31]:
workDir = '/home/nick/notebook/SIPSim/dev/fullCyc/'
physeqDir = '/home/nick/notebook/fullCyc/data/MiSeq_16S/515f-806r/V4_Lib1-7/phyloseq/'
physeqBulkCore = 'bulk-core'

repSeqs = '/home/nick/notebook/fullCyc/data/MiSeq_16S/515f-806r/V4_Lib1-7/OTU_binning/otusn.fasta'

In [32]:
%load_ext rpy2.ipython


The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython

In [34]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(phyloseq)

In [35]:
%%R -i workDir -i physeqDir -i physeqBulkCore

cat('variables loaded in R\n')


variables loaded in R

In [36]:
%%R

dir.create(workDir, showWarnings=FALSE)

In [37]:
%%R
# bulk core samples
F = file.path(physeqDir, physeqBulkCore)
physeq.bulk = readRDS(F)
physeq.bulk.m = physeq.bulk %>% sample_data
physeq.bulk


phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 4950 taxa and 9 samples ]
sample_data() Sample Data:       [ 9 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 4950 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 4950 tips and 4949 internal nodes ]

In [38]:
%%R
# just 12C-Con samples
physeq.bulk.12C = prune_samples(physeq.bulk.m$Substrate == '12C-Con', physeq.bulk) %>%
    filter_taxa(function(x) sum(x) > 0, TRUE)
physeq.bulk.12C.m = physeq.bulk.12C %>% sample_data
physeq.bulk.12C


phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 4370 taxa and 6 samples ]
sample_data() Sample Data:       [ 6 samples by 17 sample variables ]
tax_table()   Taxonomy Table:    [ 4370 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 4370 tips and 4369 internal nodes ]

In [43]:
%%R
# list of OTUs
x = physeq.bulk.12C %>% otu_table %>% rownames %>% 
    as.data.frame %>%
    mutate(OTU = sapply(., function(x) paste(c('>OTU.', x, '$'), collapse=''))) %>%
    select(OTU)    
        
outFile = file.path(workDir, 'bulk-core_12C-Con_OTUs.txt')
write.table(x, outFile, quote=FALSE, row.names=FALSE, col.names=FALSE, sep='\t')

In [46]:
# parsing OTU rep sequences

%Rpull outFile
outFile = outFile[0]

## output file
OTU_reps = os.path.join(workDir, 'OTU_reps.fna')

## parsing with egrep
!cd $workDir; \
    egrep -A 1 -f $outFile $repSeqs | \
    perl -ne 'print unless /^-+$/' > $OTU_reps

## checking file    
!printf "Number of OTUs: "
!cd $workDir; \
    grep -c ">" $OTU_reps


Number of OTUs: 4370

In [ ]:


In [ ]: