initial dataset setup for deltaGC
initial investigation of GC content
Date: 3/27/2014
using the 'prokaryotes.txt' file from the NCBI genome ftp site
wget ftp://ftp.ncbi.nih.gov/genomes/GENOME_REPORTS/prokaryotes.txt
Table filtering pipeline:
wget ftp://ftp.ncbi.nih.gov/genomes/GENOME_REPORTS/prok_representative_genomes.txt
wget ftp://ftp.ncbi.nih.gov/genomes/GENOME_REPORTS/prok_reference_genomes.txt
@ system76-server:/var/seq_data/ncbi_db/genome
NCBIprokaryoteTableFilter.pl prokaryotes.txt > prokaryotes_filt.txt
* 23504 entries in prokaryotes
* 2864 entries remaining post-filtering
* 36 genomes do not have accession numbers
* 2828 genomes remaining with chromosome accessions
random selection of 1 genome from each species
@ system76-server:/var/seq_data/ncbi_db/genome
tblRandomByField.pl -c 30 -header <(perl -ne '@l=split /\t/; print unless $l[9] eq "-"' prokaryotes_filt.txt) > prokaryotes_filt_rand.txt
* filtered out all genomes without accession numbers
* 1283 genomes remaining
parsing by domain
perl -ne '@l=split /\t/; print if $l[23] =~/bacteria|superkingdom/i && $l[29] !~ /NA/' prokaryotes_filt_rand.txt > prok-bac_filt_rand.txt
* 1162 bacterial genomes
perl -ne '@l=split /\t/; print if $l[23] =~/archaea|superkingdom/i && $l[29] !~ /NA/' prokaryotes_filt_rand.txt > prok-arc_filt_rand.txt
* 119 archaeal genomes
downloading all genomes
screen -S bac -L NCBIprokaryoteTableFilter.pl prok-bac_filt_rand.txt -t 20 -w -d prok-bac-genomes
screen -S arc -L NCBIprokaryoteTableFilter.pl prok-arc_filt_rand.txt -t 20 -w -d prok-arc-genomes
Identifying ssu, lsu, and tsu genes in each genome
archaea_ssu
@ system76-server:/var/seq_data/ncbi_db/genome/rnammer/archaea_ssu
find ../../prok-arc-genomes/ -name "*fasta" | perl -pe 's/.+\/|\.fasta//g' | xargs -n 1 -I % -P 30 bash -c 'rnammer -S arc -m ssu,lsu,tsu -gff %_rrn.gff -f %_rrn.fna -xml %_rrn.xml < ../../prok-arc-genomes/%.fasta'
bacteria_ssu
@ system76-server:/var/seq_data/ncbi_db/genome/rnammer/bacteria_ssu
find ../../prok-bac-genomes/ -name "*fasta" | perl -pe 's/.+\/|\.fasta//g' | xargs -n 1 -I % -P 30 bash -c 'rnammer -S bac -m ssu,lsu,tsu -gff %_rrn.gff -f %_rrn.fna -xml %_rrn.xml < ../../prok-bac-genomes/%.fasta'
@ system76-server:/var/seq_data/ncbi_db/genome/rnammer/
egrep -v "^#" archaea_ssu/*gff | grep "16s_rRNA" | perl -pe 's/:/\t/' > summary/archaea_ssu_gff.txt
egrep -v "^#" bacteria_ssu/*gff | grep "16s_rRNA" | perl -pe 's/:/\t/' > summary/bacteria_ssu_gff.txt
egrep -c "^[^#]" archaea_ssu/*gff | perl -pe 's/:/\t/' > summary/archaea_ssu_gff_cnt.txt
egrep -c "^[^#].+16s_rRNA" bacteria_ssu/*gff | perl -pe 's/:/\t/' > summary/bacteria_ssu_gff_cnt.txt
$ cut -f 2 summary/archaea_ssu_gff_cnt.txt | stats_descriptive.pl
1 min 1.00
1 Q1 1.00
1 mean 1.64
1 median 1.00
1 Q3 2.00
1 max 4.00
1 stdev 0.86
cut -f 2 summary/bacteria_ssu_gff_cnt.txt | stats_descriptive.pl
1 min 0.00
1 Q1 2.00
1 mean 3.68
1 median 3.00
1 Q3 5.00
1 max 15.00
1 stdev 2.55
* 4 bacterial genomes lacking identified 16S genes!
bacteria_ssu/Bacteroides_xylanisolvens_XB1A_ssu.gff 0
bacteria_ssu/Candidatus_Phytoplasma_solani_284_09_ssu.gff 0
bacteria_ssu/Faecalibacterium_prausnitzii_L2-6_ssu.gff 0
bacteria_ssu/Streptomyces_rapamycinicus_NRRL_5491_ssu.gff 0
* genome lengths make sense
* just seem to be lacking the gene
Archaea
@ system76-server:/var/seq_data/ncbi_db/genome/rnammer/archaea_rrn
find ../../prok-arc-genomes/ -name "*fasta" | perl -pe 's/.+\/|\.fasta//g' | xargs -n 1 -I % -P 30 bash -c 'rnammer -S arc -m ssu,tsu,lsu -gff %_rrn.gff -f %_rrn.fna -xml %_rrn.xml < ../../prok-arc-genomes/%.fasta'
rnammer_tandem_rrn.pl <(find archaea_rrn/ -name "*gff") | tail -n +2 | perl -ne '@l=split /\t/; print if $l[2] > 0' | less
* 2 genomes (1.7%); both methanogens
Bacteria
@ system76-server:/var/seq_data/ncbi_db/genome/rnammer/bacteria_rrn
find ../../prok-bac-genomes/ -name "*fasta" | perl -pe 's/.+\/|\.fasta//g' | xargs -n 1 -I % -P 30 bash -c 'rnammer -S bac -m ssu,tsu,lsu -gff %_rrn.gff -f %_rrn.fna -xml %_rrn.xml < ../../prok-bac-genomes/%.fasta'
* 126 genomes (11.9%)
* mean of 25% tandem (max of 83% tandem)
Number of genomes
Number of phyla
@ system76-server:/var/seq_data/ncbi_db/genome
cut -f 25 prok-bac_filt_rand.txt | tail -n +2 | sort -u | less -N
* 29 bacterial phyla
cut -f 25 prok-arc_filt_rand.txt | tail -n +2 | sort -u | less -N
* 5 archaeal phyla
bacteria
@ system76-server:~/notebook/deltaGC_notes/data/genome_data/prok-bac-genomes_GC
find ../prok-bac-genomes/ -name "*fasta" | xargs -P 5 -I % calcGC.pl -w 6 % > prok-bac-genomes_GC.txt
archaea
@ system76-server:~/notebook/deltaGC_notes/data/genome_data/prok-arc-genomes_GC
find ../prok-arc-genomes/ -name "*fasta" | xargs -P 5 -I % calcGC.pl -w 6 % > prok-arc-genomes_GC.txt
min 0.13
Q1 0.37
mean 0.48
median 0.46
Q3 0.61
max 0.74
stdev 0.13
cut -f 2 prok-arc-genomes_Ngaps_GC.txt | stats_descriptive.pl
min 0.27
Q1 0.37
mean 0.47
median 0.47
Q3 0.56
max 0.67
stdev 0.11