intial dataset setup for deltaGC
initial investigation of GC content
Date: 3/27/2014
using 'prokaryote.txt' file from the NCBI genome ftp site
wget ftp://ftp.ncbi.nih.gov/genomes/GENOME_REPORTS/prokaryotes.txt
Table filtering pipeline:
wget ftp://ftp.ncbi.nih.gov/genomes/GENOME_REPORTS/prok_representative_genomes.txt
wget ftp://ftp.ncbi.nih.gov/genomes/GENOME_REPORTS/prok_reference_genomes.txt
@ system76-server:/var/seq_data/ncbi_db/genome
NCBIprokaryoteTableFilter.pl prokaryotes.txt > prokaryotes_filt.txt
    * 23504 entries in prokaryotes
    * 2864 entries remaining post-filtering
     * 36 genomes do not have accession numbers
     * 2828 genomes remaining with chromosome accessions
random selection of 1 from each species
@ system76-server:/var/seq_data/ncbi_db/genome
tblRandomByField.pl -c 30 -header <(perl -ne '@l=split /\t/; print unless $l[9] eq "-"' prokaryotes_filt.txt) > prokaryotes_filt_rand.txt
    * filtered out all genomes w/out accession numbers
    * 1283 genomes remaining
parsing by domain
perl -ne '@l=split /\t/; print if $l[23] =~/bacteria|superkingdom/i && $l[29] !~ /NA/' prokaryotes_filt_rand.txt > prok-bac_filt_rand.txt
    * 1162 bacterial genomes
perl -ne '@l=split /\t/; print if $l[23] =~/archaea|superkingdom/i && $l[29] !~ /NA/' prokaryotes_filt_rand.txt > prok-arc_filt_rand.txt
    * 119 archaeal genomes
downloading all genomes
screen -S bac -L NCBIprokaryoteTableFilter.pl prok-bac_filt_rand.txt -t 20 -w -d prok-bac-genomes
screen -S arc -L NCBIprokaryoteTableFilter.pl prok-arc_filt_rand.txt -t 20 -w -d prok-arc-genomes
Identifying ssu, lsu, and tsu genes in each genome
archaea_ssu
@ system76-server:/var/seq_data/ncbi_db/genome/rnammer/archaea_ssu
find ../../prok-arc-genomes/ -name "*fasta" | perl -pe 's/.+\/|\.fasta//g' | xargs -n 1 -I % -P 30 bash -c 'rnammer -S arc -m ssu,lsu,tsu -gff %_rrn.gff -f %_rrn.fna -xml %_rrn.xml < ../../prok-arc-genomes/%.fasta'
bacteria_ssu
@ system76-server:/var/seq_data/ncbi_db/genome/rnammer/bacteria_ssu
find ../../prok-bac-genomes/ -name "*fasta" | perl -pe 's/.+\/|\.fasta//g' | xargs -n 1 -I % -P 30 bash -c 'rnammer -S bac -m ssu,lsu,tsu -gff %_rrn.gff -f %_rrn.fna -xml %_rrn.xml < ../../prok-bac-genomes/%.fasta'
@ system76-server:/var/seq_data/ncbi_db/genome/rnammer/
egrep -v "^#" archaea_ssu/*gff | grep "16s_rRNA" | perl -pe 's/:/\t/' > summary/archaea_ssu_gff.txt
egrep -v "^#" bacteria_ssu/*gff | grep "16s_rRNA" | perl -pe 's/:/\t/' > summary/bacteria_ssu_gff.txt
egrep -c "^[^#]" archaea_ssu/*gff | perl -pe 's/:/\t/' > summary/archaea_ssu_gff_cnt.txt
egrep -c "^[^#].+16s_rRNA" bacteria_ssu/*gff | perl -pe 's/:/\t/' > summary/bacteria_ssu_gff_cnt.txt
$ cut -f 2 summary/archaea_ssu_gff_cnt.txt | stats_descriptive.pl
1   min 1.00
1   Q1  1.00
1   mean    1.64
1   median  1.00
1   Q3  2.00
1   max 4.00
1   stdev   0.86
cut -f 2 summary/bacteria_ssu_gff_cnt.txt | stats_descriptive.pl
1   min 0.00
1   Q1  2.00
1   mean    3.68
1   median  3.00
1   Q3  5.00
1   max 15.00
1   stdev   2.55
* 4 bacterial lacking identified 16S genes!
bacteria_ssu/Bacteroides_xylanisolvens_XB1A_ssu.gff     0
bacteria_ssu/Candidatus_Phytoplasma_solani_284_09_ssu.gff       0
bacteria_ssu/Faecalibacterium_prausnitzii_L2-6_ssu.gff  0
bacteria_ssu/Streptomyces_rapamycinicus_NRRL_5491_ssu.gff       0
    * genomes lengths make sense
    * just seem to be lacking the gene
Archaea
@ system76-server:/var/seq_data/ncbi_db/genome/rnammer/archaea_rrn
find ../../prok-arc-genomes/ -name "*fasta" | perl -pe 's/.+\/|\.fasta//g' | xargs -n 1 -I % -P 30 bash -c 'rnammer -S arc -m ssu,tsu,lsu -gff %_rrn.gff -f %_rrn.fna -xml %_rrn.xml < ../../prok-arc-genomes/%.fasta'
rnammer_tandem_rrn.pl <(find archaea_rrn/ -name "*gff") | tail -n +2 | perl -ne '@l=split /\t/; print if $l[2] > 0' | less
    * 2 genomes (1.7%); both methanogens
Bacteria
@ system76-server:/var/seq_data/ncbi_db/genome/rnammer/bacteria_rrn
find ../../prok-bac-genomes/ -name "*fasta" | perl -pe 's/.+\/|\.fasta//g' | xargs -n 1 -I % -P 30 bash -c 'rnammer -S bac -m ssu,tsu,lsu -gff %_rrn.gff -f %_rrn.fna -xml %_rrn.xml < ../../prok-bac-genomes/%.fasta'
    * 126 genomes (11.9%)
    * mean of 25% tandem (max of 83% tandem)
Number of genomes
Number of phyla
@ system76-server:/var/seq_data/ncbi_db/genome
cut -f 25 prok-bac_filt_rand.txt | tail -n +2 | sort -u | less -N
    * 29 bacterial phyla
cut -f 25 prok-arc_filt_rand.txt | tail -n +2 | sort -u | less -N
    * 5 archaeal phyla
bacteria
@ system76-server:~/notebook/deltaGC_notes/data/genome_data/prok-bac-genomes_GC
find ../prok-bac-genomes/ -name "*fasta" | xargs -P 5 -I % calcGC.pl -w 6 %  > prok-bac-genomes_GC.txt
archaea
@ system76-server:~/notebook/deltaGC_notes/data/genome_data/prok-arc-genomes_GC
find ../prok-arc-genomes/ -name "*fasta" | xargs -P 5 -I % calcGC.pl -w 6 %  > prok-arc-genomes_GC.txt
min 0.13
Q1  0.37
mean    0.48
median  0.46
Q3  0.61
max 0.74
stdev   0.13 
cut -f 2 prok-arc-genomes_Ngaps_GC.txt | stats_descriptive.pl
min 0.27
Q1  0.37
mean    0.47
median  0.47
Q3  0.56
max 0.67
stdev   0.11