In [5]:
    
workDir = '/var/seq_data/ncbi_db/genome/Jan2016/'
    
In [6]:
    
import os
%load_ext rpy2.ipython
    
In [60]:
    
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(genomes)
library(permute)
    
In [8]:
    
if not os.path.isdir(workDir):
    os.makedirs(workDir)
    
In [11]:
    
%%R
data(proks)
summary(proks)
    
    
In [12]:
    
%%R
update(proks)
summary(proks)
    
    
    
In [13]:
    
%%R -w 600 -h 300
# plotting GC distribution
ggplot(proks, aes(gc)) +
    geom_histogram(binwidth=1) +
    geom_vline(xintercept=50, linetype='dashed', color='red', alpha=0.7) +
    labs(x='G+C') +
    theme(
        text = element_text(size=16)
    )
    
    
In [14]:
    
%%R
proks.complete = proks %>% as.data.frame %>% 
    filter(status == 'Complete Genome') 
proks.complete %>% head(n=3)
    
    
In [15]:
    
%%R -w 600 -h 300
# plotting GC distribution
ggplot(proks.complete, aes(gc)) +
    geom_histogram(binwidth=1) +
    geom_vline(xintercept=50, linetype='dashed', color='red', alpha=0.7) +
    labs(x='G+C') +
    theme(
        text = element_text(size=16)
    )
    
    
In [16]:
    
%%R
proks.complete$gc %>% summary
    
    
In [17]:
    
prokFile = os.path.join(workDir, 'proks_complete.txt')
    
In [18]:
    
%%R -i prokFile
write.table(proks.complete, prokFile, sep='\t', row.names=F, quote=F)
cat('File written: ', prokFile, '\n')
    
    
In [20]:
    
taxFile = os.path.splitext(prokFile)[0] + '_tax.txt'
    
In [156]:
    
!cd $workDir; \
    tail -n +2 $prokFile | \
    cut -f 5 | sort -u | \
    seqDB_tools taxid2taxonomy -p 30 > $taxFile
    
    
In [21]:
    
%%R -i taxFile
df.tax = read.delim(taxFile, sep='\t') %>%
    distinct(taxid)
df.proks.complete = dplyr::inner_join(proks.complete, df.tax, c('taxid' = 'taxid'))
# checking join
proks.complete %>% nrow %>% print
df.proks.complete %>% nrow %>% print
df.proks.complete %>% head(n=3)
    
    
In [217]:
    
%%R
df.bac.complete = df.proks.complete %>%
    filter(superkingdom == 'Bacteria')
df.bac.complete %>% nrow
    
    
In [218]:
    
%%R -w 800
df.bac.complete.s = df.bac.complete %>%
    group_by(phylum) %>%
    summarize(n = n())
ggplot(df.bac.complete.s, aes(phylum, n)) +
    geom_bar(stat='identity') +
    scale_y_log10() +
    labs(y = 'Number of genomes') +
    theme_bw() +
    theme(
        text = element_text(size=16),
        axis.text.x = element_text(angle=60, hjust=1)
    )
    
    
In [219]:
    
%%R
df.bac.complete %>%
    filter(size < 0.8) %>%
    arrange(size) %>%
    select(size, name) %>% tail(n=20)
    
    
In [220]:
    
%%R
cat('Pre-filter:', df.bac.complete %>% nrow, '\n')
df.bac.complete = df.bac.complete %>%
    filter(size > 0.8) 
cat('Post-filter:', df.bac.complete %>% nrow, '\n')
    
    
In [221]:
    
%%R
cat('Pre-filter:', df.bac.complete %>% nrow, '\n')
to.rm = c("Thermoanaerobacterium saccharolyticum JW/SL-YS485",
          "Streptococcus salivarius 57.I")
df.bac.complete = df.bac.complete %>%
    filter(! name %in% to.rm)
cat('Post-filter:', df.bac.complete %>% nrow, '\n')
    
    
In [222]:
    
%%R
# parsing out representatives
df.bac.complete.rep = df.bac.complete %>%
    filter(! is.na(refseq)) %>%
    group_by(species) %>%
    sample_n(1) %>%
    ungroup()
cat('Number of representative genomes:', df.bac.complete.rep %>% nrow, '\n')
    
    
In [223]:
    
%%R -w 800
df.bac.complete.rep.s = df.bac.complete.rep %>%
    group_by(phylum) %>%
    summarize(n = n())
ggplot(df.bac.complete.rep.s, aes(phylum, n)) +
    geom_bar(stat='identity') +
    labs(y = 'Number of representative genomes') +
    theme_bw() +
    theme(
        text = element_text(size=16),
        axis.text.x = element_text(angle=60, hjust=1)
    )
    
    
In [224]:
    
%%R -w 600 -h 300
# plotting GC distribution
ggplot(df.bac.complete.rep, aes(gc)) +
    geom_histogram(binwidth=1) +
    geom_vline(xintercept=50, linetype='dashed', color='red', alpha=0.7) +
    labs(x='G+C') +
    theme(
        text = element_text(size=16)
    )
    
    
In [225]:
    
%%R -h 400
# culmulative function of Genomes w/ GC
df.bac.complete.rep.f = df.bac.complete.rep %>%
    group_by() %>%
    mutate(total_genomes = n()) %>%
    ungroup() %>%
    arrange(gc) %>%
    mutate(n = 1,
           culm_perc_total = cumsum(n) / total_genomes * 100) %>%
    select(gc, culm_perc_total, phylum, genus) 
    
# plot
ggplot(df.bac.complete.rep.f, aes(gc, culm_perc_total)) +
    geom_point() +
    geom_line() +
    labs(x='Genome G+C', y='Culmulative total (%)') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )
    
    
In [226]:
    
%%R -w 1100 -h 500
GC_cutoff = 30
df.bac.complete.rep.f = df.bac.complete.rep %>% 
    group_by(genus) %>%
    mutate(genus_medianGC = median(gc)) %>%
    ungroup() %>%
    filter(genus_medianGC <= GC_cutoff) %>%
    select(phylum, genus, gc)
df.bac.complete.rep.f$genus = reorder(df.bac.complete.rep.f$genus, df.bac.complete.rep.f$phylum)
ggplot(df.bac.complete.rep.f, aes(genus, gc, color=phylum)) +
    geom_boxplot() +
    theme_bw() +
    theme(
        text = element_text(size=16),
        axis.text.x = element_text(angle=60, hjust=1)
    )
    
    
In [228]:
    
%%R
df.bac.complete.rep = df.bac.complete.rep %>% 
    group_by(genus) %>%
    mutate(genus_medianGC = median(gc)) %>%
    ungroup()
df.lowGC = df.bac.complete.rep %>%
    filter(genus_medianGC <= 30)
df.highGC = df.bac.complete.rep %>%
    filter(genus_medianGC > 30)
n.lowGC = df.lowGC %>% nrow
n.total = df.bac.complete.rep %>% nrow
cat('Number of low GC genomes: ', n.lowGC, '\n')
cat('Low GC genomes make up ', n.lowGC / n.total * 100, '% of the dataset\n')
    
    
In [229]:
    
%%R -h 300 -w 600
n.sample = round(n.total * 0.01, 0)
cat('Sampling', n.sample, 'low GC genomes\n')
to.keep = sample(df.bac.complete.rep$name, n.sample, replace=FALSE)
to.keep = append(to.keep, df.highGC$name)
df.bac.complete.rep.p = df.bac.complete.rep %>%
    filter(name %in% to.keep)    
cat('Number of representative genomes:', df.bac.complete.rep.p %>% nrow, '\n')
# plotting GC distribution
ggplot(df.bac.complete.rep.p, aes(gc)) +
    geom_histogram(binwidth=1) +
    geom_vline(xintercept=50, linetype='dashed', color='red', alpha=0.7) +
    labs(x='G+C') +
    theme(
        text = element_text(size=16)
    )
    
    
    
In [230]:
    
%%R -i workDir
outFile = file.path(workDir, 'bac_complete_spec-rep1.txt')
write.table(df.bac.complete.rep.p, outFile, sep='\t', quote=F, row.names=F)
    
In [ ]:
    
!cd $workDir; \
    seqDB_tools accession-GI2fasta \
        -a 11 -n 2 -f 30 -header \
        -o bac_complete_spec-rep1 \
        < bac_complete_spec-rep1.txt \
        2> bac_complete_spec-rep1.log
    
In [239]:
    
!cd $workDir; \
    seqDB_tools accession-GI2fasta \
        -a 11 -n 2 -f 30 -b 1 -header \
        -o bac_complete_spec-rep1_tmp \
        < re_bac.txt
    
    
In [ ]:
    
Leptospirillum_ferrooxidans_C2-3.fna
bac_complete_spec-rep1/Marinithermus_hydrothermalis_DSM_14884.fna
bac_complete_spec-rep1/Octadecabacter_arcticus_238.fna
bac_complete_spec-rep1/Paracoccus_aminophilus_JCM_7686.fna
    
In [240]:
    
genomeDir = os.path.join(workDir, 'bac_complete_spec-rep1')
# number of genomes downloaded
!printf "Number of bacterial genomes: "
!cd $genomeDir; \
    find . -name "*.fna" | wc -l
    
    
In [243]:
    
# file size
!echo "Genome file size distribution (bytes):"
!cd $genomeDir; \
    ls -tlc *.fna | \
    perl -pe 's/ +/\t/g' | \
    cut -f 5 | NY_misc_perl stats_descriptive
    
    
In [242]:
    
# checking for non-bacterial genomes
!find $genomeDir -name "*fna" | xargs -P 20 egrep "phage|virus|phage"
    
In [251]:
    
genomeDirRn = genomeDir + '_rn'
genomeDirRn
    
    Out[251]:
In [252]:
    
# making sure each sequence is unique
!cd $genomeDir; \
    find . -name "*fna" | \
    SIPSim genome_rename -n 24 --prefix $genomeDirRn -
    
    
In [256]:
    
# determining the largest genome
!cd $genomeDirRn; \
    find . -name "*.fna" | \
    seq_tools fasta_info --fn --tn --tl --tgc --sn --sl --sgc --header -n 24 - \
    > genome_info.txt
    
In [260]:
    
%%R -i genomeDirRn
inFile = file.path(genomeDirRn, 'genome_info.txt')
df = read.delim(inFile, sep='\t')
df %>% head(n=3)
    
    
In [263]:
    
%%R -w 500 -h 350
# genome lengths
ggplot(df, aes(total_seq_length)) +
    geom_histogram(binwidth=50000) +
    labs(x='Genome length (bp)', y='Count') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )
    
    
In [264]:
    
%%R
summary(df$total_seq_length)
    
    
In [265]:
    
%%R -w 500 -h 350
# genome GC
ggplot(df, aes(total_GC)) +
    geom_histogram(binwidth=1) +
    labs(x='Genome G+C', y='Count') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )
    
    
In [266]:
    
%%R -w 500 -h 350
# genome GC
ggplot(df, aes(total_GC)) +
    geom_density(fill='red', alpha=0.3) +
    scale_x_continuous(limits=c(10,90)) +
    labs(x='Genome G+C', y='Density') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )
    
    
In [268]:
    
%%R -w 500 -h 350
# sequences per genome
ggplot(df, aes(total_sequences)) +
    geom_histogram(binwidth=1) +
    labs(x='Total chromosomes/contigs', y='Count') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )
    
    
In [271]:
    
# list of all genomes files and their associated names
!cd $genomeDirRn; \
    find . -name "*fna" | \
    perl -pe 's/.+\///' | \
    perl -pe 's/(.+)(\.[^.]+)/\$1\t\$1\$2/' > genome_index.txt
    
In [ ]:
    
!cd $genomeDirRn; \
    SIPSim genome_index \
    genome_index.txt \
    --fp . --np 30 > index_log.txt
    
In [ ]:
    
!printf 'Genome directory: '; echo $genomeDirRn
!printf 'Number of genomes: '
!find $genomeDirRn -name "*.fna"| wc -l
!printf 'Number of indexed genomes: '
!find $genomeDirRn -name "*.fna.sqlite3.db*" | wc -l
    
In [285]:
    
%%R
curve(-exp(-x/10 + 1), from = 10, to = 70)
    
    
In [293]:
    
%%R -h 300
#weights = sapply(df.bac.complete.rep$gc, function(gc) ifelse(gc < 35, 1 - ((35 - gc) / 100), 1))
#weights = sapply(df.bac.complete.rep$gc, function(gc) ifelse(gc < 35, 1 - exp(-gc/10), 1))
#weights = sapply(df.bac.complete.rep$gc, function(gc) 1 - exp(-gc/50 + 2))
weights = data.frame('weights' = weights, 'gc' = df.bac.complete.rep$gc)
    
ggplot(weights, aes(gc, weights)) +
    geom_point() +
    theme_bw()
    
    
In [244]:
    
%%R -w 600 -h 300
n.sample = 500
#weights = sapply(df.bac.complete.rep$gc, function(gc) ifelse(gc < 35, 1 - ((35 - gc) / 50), 1))
to.keep = sample(df.bac.complete.rep$name, n.sample, replace=FALSE, prob=weights$weights)
    
df.bac.complete.rep.p = df.bac.complete.rep %>%
    filter(name %in% to.keep)
    
# plotting GC distribution
ggplot(df.bac.complete.rep.p, aes(gc)) +
    geom_histogram(binwidth=1) +
    geom_vline(xintercept=50, linetype='dashed', color='red', alpha=0.7) +
    labs(x='G+C') +
    theme(
        text = element_text(size=16)
    )
    
    
In [ ]: