Prepare the reference

NC_000913.3.fna file contains the reference MG1655 sequence from:

http://www.ncbi.nlm.nih.gov/nuccore/556503834?report=fasta

Build bowtie2 index


In [ ]:
REFERENCE="../ref/MG1655"
GENOME_FASTA="../ref/NC_000913.fna"

indexer=$(which bowtie2-build)

$indexer $GENOME_FASTA $REFERENCE

Align sequences


In [ ]:
OUTDIR="../results"
INPUTDIR="../data"

# These might be located in different directories on different platforms
aligner=$(which bowtie2)
bamtool=$(which bamtools)

mkdir -p $OUTDIR

for fa in $(ls $INPUTDIR | grep bam)
do
    echo "Aligning $fa to reference..."
    base_name=$(echo "$fa" | cut -d'.' -f1)
    bam_file=$OUTDIR/$base_name".bam"
    $bamtool convert -format fastq -in $INPUTDIR/$fa  \
        | $aligner -p 28 -x $REFERENCE -U - \
        | samtools view -bhS - > $bam_file
    echo "sorting $bam_file ..."
    samtools sort $bam_file $OUTDIR/$base_name"_sorted"
    echo "indexing $bam_file ..."
    samtools index $OUTDIR/$base_name"_sorted.bam"
done
echo "DONE."

Count reads in genes


In [ ]:
counter=$(which htseq-count)

for bamfile in ../results/*9.bam
do
    echo "Processing: $bamfile"
    $counter -s no -t gene -i Name -f bam \
        $bamfile ../ref/NC_000913.gff > ../results/$(basename $bamfile .bam).counts
done