import pysam
In [ ]:
#This code executes manual unit testing of the sequtil by Ali.
In [23]:
#Creating sequence to later create reads.
sequence = (
"AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC"
"TTCTGAACTGGTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGGTCACTAAATACTTTAACCAA"
"TATAGGCATAGCGCACAGACAGATAAAAATTACAGAGTACACAACATCCATGAAACGCATTAGCACCACC"
"ATTACCACCACCATCACCATTACCACAGGTAACGGTGCGGGCTGACGCGTACAGGAAACACAGAAAAAAG"
"CCCGCACCTGACAGTGCGGGCTTTTTTTTTCGACCAAAGGTAACGAGGTAACAACCATGCGAGTGTTGAA"
"GTTCGGCGGTACATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATGCC"
"AGGCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCACCTGGTGGCGATGATTG"
"AAAAAACCATTAGCGGCCAGGATGCTTTACCCAATATCAGCGATGCCGAACGTATTTTTGCCGAACTTTT"
"GACGGGACTCGCCGCCGCCCAGCCGGGGTTCCCGCTGGCGCAATTGAAAACTTTCGTCGATCAGGAATTT"
"GCCCAAATAAAACATGTCCTGCATGGCATTAGTTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGC"
)
In [2]:
len(sequence)
Out[2]:
In [24]:
# Ouput to see what "reversed( sequence )" looks like.
#for i in reversed( sequence ):
# print i
In [3]:
#Used in establishing values for the reverse strand of 'sequence'.
def reverse_complement(seq):
matches = {"A": "T", "T": "A", "G": "C", "C": "G"}
return "".join(matches[i] for i in reversed(seq))
In [4]:
#Creates output fastq file. Inserts into output file 10 sequences that are 35 bases long from the original sequence.
#Each sequence is a shift to the RIGHT by 1 of the previous sequence.
#'+' is an optional line. Not used here.
#The character '!' represents the lowest quality while '~' is the highest. Here are the quality value characters in left-to-right increasing order of quality
#This essentially creates 3'-> 5' reads of 1 strand.
with open("test_SE_plus.fastq", "w") as outfile:
for i in range(10):
outfile.write("@TEST_%d\n" % i)
outfile.write(sequence[i + 20:i + 55] + "\n")
outfile.write("+\n")
outfile.write("~" * 35 + "\n")
In [5]:
#Creates output fastq file. reverse_complement() is used. The same type of file as that above is created, though with the
#difference that it's created on the reverse compliment of the original sequence.
#This essentially creates 3'-> 5' reads of the reverse strand in relation to the reads created above.
with open("test_SE_minus.fastq", "w") as outfile:
rev = reverse_complement(sequence)
for i in range(10):
outfile.write("@TEST_%d\n" % i)
outfile.write(rev[i + 20:i + 55] + "\n")
outfile.write("+\n")
outfile.write("~" * 35 + "\n")
In [6]:
#Aligning reads created above.
#We can execute paired reads because we have reads from the reverse strand, which can therefore be translated to the 5' end
# of the initial strand established with 'sequence'.
#Aligning short sequence "plus" file created above with wild type.
!bowtie2 "/home/pphaneuf/sequencing/NC_000913_2/NC_000913_2" test_SE_plus.fastq -S test_SE_plus.sam
#Aligning short sequence "minus" file created above with wild type.
!bowtie2 "/home/pphaneuf/sequencing/NC_000913_2/NC_000913_2" test_SE_minus.fastq -S test_SE_minus.sam
#Aligning paired-end reads "plus" and "minus" files created above with wild type.
# -x indicates the basename of the index for the reference genome, which is in our case NC_000913_2
#
# 1000 is the argument "<bt2-idx>" within bowtie2 documentation. Probably shifting of start index within NC_000913_2 wild type reference genome.
#
# Paired-end sequencing enables both ends of the DNA fragment to be sequenced. Because the distance between each paired read
# known, alignment algorithms can use this information to map the reads over repetitive regions more precisely.
# This results in much better alignment of the reads, especially across difficult-to-sequence, repetitive regiosn of the
# genome.
!bowtie2 -X 1000 "/home/pphaneuf/sequencing/NC_000913_2/NC_000913_2" -1 test_SE_plus.fastq -2 test_SE_minus.fastq -S test_PE.sam
#Aligning paired-end reads "plus" and "minus" files in the reverse order from the previous above command with wild type.
!bowtie2 -X 1000 "/home/pphaneuf/sequencing/NC_000913_2/NC_000913_2" -2 test_SE_plus.fastq -1 test_SE_minus.fastq -S test_PE_rev.sam
In [7]:
!samtools view -bS test_SE_plus.sam > test_SE_plus.bam
!samtools view -bS test_SE_minus.sam > test_SE_minus.bam
!samtools view -bS test_PE.sam > test_PE.bam
!samtools view -bS test_PE_rev.sam > test_PE_rev.bam
In [8]:
!samtools sort test_SE_plus.bam test_SE_plus_sorted
!samtools sort test_SE_minus.bam test_SE_minus_sorted
!samtools sort test_PE.bam test_PE_sorted
!samtools sort test_PE_rev.bam test_PE_rev_sorted
In [9]:
from sequtil import makegff
In [10]:
makegff.write_samfile_to_gff("test_SE_plus.bam", "test_SE_plus.gff")
makegff.write_samfile_to_gff("test_SE_plus.bam", "test_SE_plus_5.gff", five_prime=True)
makegff.write_samfile_to_gff("test_SE_minus.bam", "test_SE_minus.gff")
makegff.write_samfile_to_gff("test_SE_minus.bam", "test_SE_minus_5.gff", five_prime=True)
makegff.write_samfile_to_gff("test_PE.bam", "test_PE.gff")
makegff.write_samfile_to_gff("test_PE.bam", "test_PE_5.gff", five_prime=True)
makegff.write_samfile_to_gff("test_PE_rev.bam", "test_PE_rev.gff")
makegff.write_samfile_to_gff("test_PE_rev.bam", "test_PE_rev_5.gff", five_prime=True)
In [11]:
#Values starting @ 21 are index of start of reads (reads created started at 20 and ended at 55).
#Values starting @ 1 are the number of reads (depth?) for this position.
#'-' indicates forward strand.
#Test: Test the start index of the read, the score and the direction of the read for single read.
!cat test_SE_plus.gff
In [2]:
!cat test_SE_plus_5.gff
In [12]:
#Values starting @ 637 are index of start of reads (created from forward reads started at 20 and ended at 55).
#Values starting @ -1 are the number of reads (depth?) for this position. ???Why negative value?
#'-' indicates backward strand.
#Test: Test the start index of the read, the score and the direction of the read for single read.
!cat test_SE_minus.gff
In [13]:
#Test: Test the start index of the read, the score and the direction of the read for paired-end read.
!cat test_PE.gff
In [14]:
#Test: Test the start index of the read, the score and the direction of the read for paired-end read.
!cat test_PE_rev.gff
In [15]:
!cat test_PE_5.gff
In [1]:
!cat test_PE_rev_5.gff
In [40]:
In [ ]: