In [5]:
workDir = '/var/seq_data/ncbi_db/genome/Jan2016/'
In [6]:
import os
%load_ext rpy2.ipython
In [60]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(genomes)
library(permute)
In [8]:
if not os.path.isdir(workDir):
os.makedirs(workDir)
In [11]:
%%R
data(proks)
summary(proks)
In [12]:
%%R
update(proks)
summary(proks)
In [13]:
%%R -w 600 -h 300
# plotting GC distribution
ggplot(proks, aes(gc)) +
geom_histogram(binwidth=1) +
geom_vline(xintercept=50, linetype='dashed', color='red', alpha=0.7) +
labs(x='G+C') +
theme(
text = element_text(size=16)
)
In [14]:
%%R
proks.complete = proks %>% as.data.frame %>%
filter(status == 'Complete Genome')
proks.complete %>% head(n=3)
In [15]:
%%R -w 600 -h 300
# plotting GC distribution
ggplot(proks.complete, aes(gc)) +
geom_histogram(binwidth=1) +
geom_vline(xintercept=50, linetype='dashed', color='red', alpha=0.7) +
labs(x='G+C') +
theme(
text = element_text(size=16)
)
In [16]:
%%R
proks.complete$gc %>% summary
In [17]:
prokFile = os.path.join(workDir, 'proks_complete.txt')
In [18]:
%%R -i prokFile
write.table(proks.complete, prokFile, sep='\t', row.names=F, quote=F)
cat('File written: ', prokFile, '\n')
In [20]:
taxFile = os.path.splitext(prokFile)[0] + '_tax.txt'
In [156]:
!cd $workDir; \
tail -n +2 $prokFile | \
cut -f 5 | sort -u | \
seqDB_tools taxid2taxonomy -p 30 > $taxFile
In [21]:
%%R -i taxFile
df.tax = read.delim(taxFile, sep='\t') %>%
distinct(taxid)
df.proks.complete = dplyr::inner_join(proks.complete, df.tax, c('taxid' = 'taxid'))
# checking join
proks.complete %>% nrow %>% print
df.proks.complete %>% nrow %>% print
df.proks.complete %>% head(n=3)
In [217]:
%%R
df.bac.complete = df.proks.complete %>%
filter(superkingdom == 'Bacteria')
df.bac.complete %>% nrow
In [218]:
%%R -w 800
df.bac.complete.s = df.bac.complete %>%
group_by(phylum) %>%
summarize(n = n())
ggplot(df.bac.complete.s, aes(phylum, n)) +
geom_bar(stat='identity') +
scale_y_log10() +
labs(y = 'Number of genomes') +
theme_bw() +
theme(
text = element_text(size=16),
axis.text.x = element_text(angle=60, hjust=1)
)
In [219]:
%%R
df.bac.complete %>%
filter(size < 0.8) %>%
arrange(size) %>%
select(size, name) %>% tail(n=20)
In [220]:
%%R
cat('Pre-filter:', df.bac.complete %>% nrow, '\n')
df.bac.complete = df.bac.complete %>%
filter(size > 0.8)
cat('Post-filter:', df.bac.complete %>% nrow, '\n')
In [221]:
%%R
cat('Pre-filter:', df.bac.complete %>% nrow, '\n')
to.rm = c("Thermoanaerobacterium saccharolyticum JW/SL-YS485",
"Streptococcus salivarius 57.I")
df.bac.complete = df.bac.complete %>%
filter(! name %in% to.rm)
cat('Post-filter:', df.bac.complete %>% nrow, '\n')
In [222]:
%%R
# parsing out representatives
df.bac.complete.rep = df.bac.complete %>%
filter(! is.na(refseq)) %>%
group_by(species) %>%
sample_n(1) %>%
ungroup()
cat('Number of representative genomes:', df.bac.complete.rep %>% nrow, '\n')
In [223]:
%%R -w 800
df.bac.complete.rep.s = df.bac.complete.rep %>%
group_by(phylum) %>%
summarize(n = n())
ggplot(df.bac.complete.rep.s, aes(phylum, n)) +
geom_bar(stat='identity') +
labs(y = 'Number of representative genomes') +
theme_bw() +
theme(
text = element_text(size=16),
axis.text.x = element_text(angle=60, hjust=1)
)
In [224]:
%%R -w 600 -h 300
# plotting GC distribution
ggplot(df.bac.complete.rep, aes(gc)) +
geom_histogram(binwidth=1) +
geom_vline(xintercept=50, linetype='dashed', color='red', alpha=0.7) +
labs(x='G+C') +
theme(
text = element_text(size=16)
)
In [225]:
%%R -h 400
# culmulative function of Genomes w/ GC
df.bac.complete.rep.f = df.bac.complete.rep %>%
group_by() %>%
mutate(total_genomes = n()) %>%
ungroup() %>%
arrange(gc) %>%
mutate(n = 1,
culm_perc_total = cumsum(n) / total_genomes * 100) %>%
select(gc, culm_perc_total, phylum, genus)
# plot
ggplot(df.bac.complete.rep.f, aes(gc, culm_perc_total)) +
geom_point() +
geom_line() +
labs(x='Genome G+C', y='Culmulative total (%)') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [226]:
%%R -w 1100 -h 500
GC_cutoff = 30
df.bac.complete.rep.f = df.bac.complete.rep %>%
group_by(genus) %>%
mutate(genus_medianGC = median(gc)) %>%
ungroup() %>%
filter(genus_medianGC <= GC_cutoff) %>%
select(phylum, genus, gc)
df.bac.complete.rep.f$genus = reorder(df.bac.complete.rep.f$genus, df.bac.complete.rep.f$phylum)
ggplot(df.bac.complete.rep.f, aes(genus, gc, color=phylum)) +
geom_boxplot() +
theme_bw() +
theme(
text = element_text(size=16),
axis.text.x = element_text(angle=60, hjust=1)
)
In [228]:
%%R
df.bac.complete.rep = df.bac.complete.rep %>%
group_by(genus) %>%
mutate(genus_medianGC = median(gc)) %>%
ungroup()
df.lowGC = df.bac.complete.rep %>%
filter(genus_medianGC <= 30)
df.highGC = df.bac.complete.rep %>%
filter(genus_medianGC > 30)
n.lowGC = df.lowGC %>% nrow
n.total = df.bac.complete.rep %>% nrow
cat('Number of low GC genomes: ', n.lowGC, '\n')
cat('Low GC genomes make up ', n.lowGC / n.total * 100, '% of the dataset\n')
In [229]:
%%R -h 300 -w 600
n.sample = round(n.total * 0.01, 0)
cat('Sampling', n.sample, 'low GC genomes\n')
to.keep = sample(df.bac.complete.rep$name, n.sample, replace=FALSE)
to.keep = append(to.keep, df.highGC$name)
df.bac.complete.rep.p = df.bac.complete.rep %>%
filter(name %in% to.keep)
cat('Number of representative genomes:', df.bac.complete.rep.p %>% nrow, '\n')
# plotting GC distribution
ggplot(df.bac.complete.rep.p, aes(gc)) +
geom_histogram(binwidth=1) +
geom_vline(xintercept=50, linetype='dashed', color='red', alpha=0.7) +
labs(x='G+C') +
theme(
text = element_text(size=16)
)
In [230]:
%%R -i workDir
outFile = file.path(workDir, 'bac_complete_spec-rep1.txt')
write.table(df.bac.complete.rep.p, outFile, sep='\t', quote=F, row.names=F)
In [ ]:
!cd $workDir; \
seqDB_tools accession-GI2fasta \
-a 11 -n 2 -f 30 -header \
-o bac_complete_spec-rep1 \
< bac_complete_spec-rep1.txt \
2> bac_complete_spec-rep1.log
In [239]:
!cd $workDir; \
seqDB_tools accession-GI2fasta \
-a 11 -n 2 -f 30 -b 1 -header \
-o bac_complete_spec-rep1_tmp \
< re_bac.txt
In [ ]:
Leptospirillum_ferrooxidans_C2-3.fna
bac_complete_spec-rep1/Marinithermus_hydrothermalis_DSM_14884.fna
bac_complete_spec-rep1/Octadecabacter_arcticus_238.fna
bac_complete_spec-rep1/Paracoccus_aminophilus_JCM_7686.fna
In [240]:
genomeDir = os.path.join(workDir, 'bac_complete_spec-rep1')
# number of genomes downloaded
!printf "Number of bacterial genomes: "
!cd $genomeDir; \
find . -name "*.fna" | wc -l
In [243]:
# file size
!echo "Genome file size distribution (bytes):"
!cd $genomeDir; \
ls -tlc *.fna | \
perl -pe 's/ +/\t/g' | \
cut -f 5 | NY_misc_perl stats_descriptive
In [242]:
# checking for non-bacterial genomes
!find $genomeDir -name "*fna" | xargs -P 20 egrep "phage|virus|phage"
In [251]:
genomeDirRn = genomeDir + '_rn'
genomeDirRn
Out[251]:
In [252]:
# making sure each sequence is unique
!cd $genomeDir; \
find . -name "*fna" | \
SIPSim genome_rename -n 24 --prefix $genomeDirRn -
In [256]:
# determining the largest genome
!cd $genomeDirRn; \
find . -name "*.fna" | \
seq_tools fasta_info --fn --tn --tl --tgc --sn --sl --sgc --header -n 24 - \
> genome_info.txt
In [260]:
%%R -i genomeDirRn
inFile = file.path(genomeDirRn, 'genome_info.txt')
df = read.delim(inFile, sep='\t')
df %>% head(n=3)
In [263]:
%%R -w 500 -h 350
# genome lengths
ggplot(df, aes(total_seq_length)) +
geom_histogram(binwidth=50000) +
labs(x='Genome length (bp)', y='Count') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [264]:
%%R
summary(df$total_seq_length)
In [265]:
%%R -w 500 -h 350
# genome GC
ggplot(df, aes(total_GC)) +
geom_histogram(binwidth=1) +
labs(x='Genome G+C', y='Count') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [266]:
%%R -w 500 -h 350
# genome GC
ggplot(df, aes(total_GC)) +
geom_density(fill='red', alpha=0.3) +
scale_x_continuous(limits=c(10,90)) +
labs(x='Genome G+C', y='Density') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [268]:
%%R -w 500 -h 350
# sequences per genome
ggplot(df, aes(total_sequences)) +
geom_histogram(binwidth=1) +
labs(x='Total chromosomes/contigs', y='Count') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [271]:
# list of all genomes files and their associated names
!cd $genomeDirRn; \
find . -name "*fna" | \
perl -pe 's/.+\///' | \
perl -pe 's/(.+)(\.[^.]+)/\$1\t\$1\$2/' > genome_index.txt
In [ ]:
!cd $genomeDirRn; \
SIPSim genome_index \
genome_index.txt \
--fp . --np 30 > index_log.txt
In [ ]:
!printf 'Genome directory: '; echo $genomeDirRn
!printf 'Number of genomes: '
!find $genomeDirRn -name "*.fna"| wc -l
!printf 'Number of indexed genomes: '
!find $genomeDirRn -name "*.fna.sqlite3.db*" | wc -l
In [285]:
%%R
curve(-exp(-x/10 + 1), from = 10, to = 70)
In [293]:
%%R -h 300
#weights = sapply(df.bac.complete.rep$gc, function(gc) ifelse(gc < 35, 1 - ((35 - gc) / 100), 1))
#weights = sapply(df.bac.complete.rep$gc, function(gc) ifelse(gc < 35, 1 - exp(-gc/10), 1))
#weights = sapply(df.bac.complete.rep$gc, function(gc) 1 - exp(-gc/50 + 2))
weights = data.frame('weights' = weights, 'gc' = df.bac.complete.rep$gc)
ggplot(weights, aes(gc, weights)) +
geom_point() +
theme_bw()
In [244]:
%%R -w 600 -h 300
n.sample = 500
#weights = sapply(df.bac.complete.rep$gc, function(gc) ifelse(gc < 35, 1 - ((35 - gc) / 50), 1))
to.keep = sample(df.bac.complete.rep$name, n.sample, replace=FALSE, prob=weights$weights)
df.bac.complete.rep.p = df.bac.complete.rep %>%
filter(name %in% to.keep)
# plotting GC distribution
ggplot(df.bac.complete.rep.p, aes(gc)) +
geom_histogram(binwidth=1) +
geom_vline(xintercept=50, linetype='dashed', color='red', alpha=0.7) +
labs(x='G+C') +
theme(
text = element_text(size=16)
)
In [ ]: