This task has less to do with NGS procedures and more to do with processing raw sequences and the main file formats involved in sequencing. It is also somewhat more academic: although using specialized libraries is good practice, each subtask should also be solved without them. Use the template supplied below.
In [ ]:
class BaseSequence(object):
    """Common base class for the sequence containers in this file.

    Subclasses define ``ValidChars``, a string holding every character that
    is allowed in their sequences. The base class leaves it as ``None``,
    which means "no alphabet restriction".
    """

    # Alphabet of allowed characters; None disables filtering.
    ValidChars = None

    def __init__(self, seqstring):
        """Store ``seqstring`` unchanged in ``self.seq`` (no validation)."""
        self.seq = seqstring

    def cleanup_sequence(self, seqstring):
        """Return ``seqstring`` reduced to the characters in ``self.ValidChars``.

        The input is upper-cased first (the alphabets declared in this file
        are upper-case), then every character that is not part of the
        alphabet -- including whitespace and line breaks -- is dropped.
        When no alphabet is defined the string is returned unchanged.
        """
        alphabet = self.ValidChars
        if not alphabet:
            return seqstring
        allowed = set(alphabet)
        return "".join(ch for ch in seqstring.upper() if ch in allowed)

    def append_sequence(self, seqstring):
        """Append ``seqstring`` to ``self.seq`` after validating it.

        Validation is delegated to ``cleanup_sequence``, so only characters
        of the class alphabet are appended. Returns ``None``.
        """
        self.seq += self.cleanup_sequence(seqstring)

    def cg_content(self):
        """Return the percentage of G and C characters in the sequence.

        The result is a float between 0.0 and 100.0, computed
        case-insensitively relative to the full length of ``self.seq``.
        An empty sequence yields 0.0 instead of dividing by zero.
        """
        if not self.seq:
            return 0.0
        upper = self.seq.upper()
        gc_count = upper.count("G") + upper.count("C")
        # 100.0 forces float division on Python 2 as well.
        return 100.0 * gc_count / len(self.seq)

    def get_ORFs(self):
        """Return all possible open reading frames.

        Not implemented yet: an empty set is always returned.
        """
        orfs = set()
        return orfs

    def load_from_fasta(self, fname):
        """Load the sequence from ``fname`` and display a few stats.

        Not implemented yet: ``fname`` is ignored and ``None`` is returned.
        """
        return None
class DNASequence(BaseSequence):
    """DNA sequence over the alphabet ``TCAG``."""

    ValidChars = 'TCAG'

    # DNA base -> RNA base. Same table as the translate()-based example:
    # maketrans('TCGA', 'AGCU').
    _TRANSCRIPTION = {'T': 'A', 'C': 'G', 'G': 'C', 'A': 'U'}

    def __init__(self, seqstring):
        """Store ``seqstring`` through the base-class constructor."""
        super(DNASequence, self).__init__(seqstring)

    def transcribe(self):
        """Task 1.2

        Return an RNASequence built from ``self.seq`` by mapping every base
        through T->A, C->G, G->C, A->U. Characters outside that table are
        copied unchanged, exactly as ``str.translate`` would do.
        """
        # Explicit version of the one-liner
        #   self.seq.translate(str.maketrans('TCGA', 'AGCU'))
        # (``string.maketrans`` on Python 2).
        table = self._TRANSCRIPTION
        transeq = "".join(table.get(base, base) for base in self.seq)
        return RNASequence(transeq)

    def read_fasta(self, fpath):
        """Task 1.4

        Update self.seq from the FASTA file at ``fpath``.
        Not implemented yet: nothing is read and ``None`` is returned.
        """
        return None

    def binding_site_pos(self, pattern):
        """Return the index of the first match of ``pattern`` in ``self.seq``.

        ``pattern`` is a site description such as ``CCAXXXTGXXXCGG`` in which
        every ``X`` stands for any single character of ``ValidChars``; all
        other characters must match literally. Returns -1 when there is no
        match or when ``pattern`` is empty.
        """
        import re

        if not pattern:
            return -1
        wildcard = '[%s]' % self.ValidChars
        regex = "".join(
            wildcard if ch == 'X' else re.escape(ch) for ch in pattern
        )
        match = re.search(regex, self.seq)
        return match.start() if match else -1

    def restriction_site_pos2(self, setp):
        """Return the index of the first occurrence of any pattern in ``setp``.

        setp - set of patterns
        Example: CCGCGG, CCGTGG, CCACGG, or CCATGG

        A single regular expression is built automatically as an alternation
        of the given patterns and applied to ``self.seq``. Returns -1 when
        nothing matches or when ``setp`` holds no non-empty pattern.
        """
        import re

        # Sorting only makes the generated regex deterministic; the leftmost
        # match position does not depend on the order of the alternatives.
        alternatives = sorted(re.escape(p) for p in setp if p)
        if not alternatives:
            return -1
        match = re.search("|".join(alternatives), self.seq)
        return match.start() if match else -1

    def get_correct_ORF(self):
        """Task 2.6

        Use the parent class get_ORFs method.
        An open reading frame starts with the start codon ATG (Met)
        in most species and ends with one of the three stop codons
        (TAA, TAG or TGA).
        Not implemented yet: ``None`` is returned.
        """
        return None

    def direct_translation(self):
        """Task 2.7

        Use the RNASequence.pmap.
        Not implemented yet: an empty ProteinSequence is returned.
        """
        pseq = ""
        return ProteinSequence(pseq)
class RNASequence(BaseSequence):
    """RNA sequence over the alphabet ``AGCU``."""

    ValidChars = 'AGCU'

    # Codon table: RNA triplet -> one-letter amino-acid code, or the literal
    # string "STOP" for the three stop codons.
    pmap = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L",
            "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S",
            "UAU":"Y", "UAC":"Y", "UAA":"STOP", "UAG":"STOP",
            "UGU":"C", "UGC":"C", "UGA":"STOP", "UGG":"W",
            "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L",
            "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P",
            "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
            "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R",
            "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M",
            "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T",
            "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K",
            "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R",
            "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V",
            "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A",
            "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E",
            "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",}

    def __init__(self, seqstring):
        """Store ``seqstring`` through the base-class constructor."""
        super(RNASequence, self).__init__(seqstring)

    def transcribe(self):
        """Return the RNA string held by this instance (``self.seq``)."""
        # Must read the instance attribute: a bare ``seq`` would resolve to
        # an unrelated module-level name or raise NameError.
        return self.seq
class ProteinSequence(BaseSequence):
    """Protein sequence container built on BaseSequence."""

    # Per the original note, this string was generated with
    #   print "".join(list(set(pmap.values())))
    # NOTE(review): that would explain the repeated letters and the letters
    # of "STOP" -- confirm whether a plain amino-acid alphabet is intended.
    ValidChars = 'AMCEDGFIHKSTOPLNQPSRTWVY'

    def __init__(self, seqstring):
        """Hand ``seqstring`` to the base-class constructor."""
        super(ProteinSequence, self).__init__(seqstring)

    def load_PDB(self):
        """Placeholder for loading from a PDB file; does nothing, returns None."""
        pass
class NGSCollection(object):
    """Placeholder for a collection of reads loaded from a FASTQ file."""

    def __init__(self, fastqfn):
        """Read from a fastq file into an appropriate Python native collection.

        Not implemented yet: ``fastqfn`` is ignored and no attributes are set.
        """
        pass

    def QC_metrics(self):
        """Perform a few QC metrics and plots.

        Not implemented yet: nothing is computed and ``None`` is returned.
        """
        return None
#bs = BaseSequence('kajsdfhslkjfhiugdfj')
#dna = DNASequence('aCtgHIccctga')
#classobj vs type: old style and new style classes
#print type(BaseSequence)
#print type(DNASequence)
# Sample nucleotide string, presumably test input for the classes above.
# The literal keeps its line breaks, including one right after the opening
# quotes and one before the closing quotes; none of them belong to the
# 'TCAG' alphabet, and nothing here removes them.
seq = """
ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGCTGCTCTCCGGGGCCACGGCCACCGCTGCCCTGCC
CCTGGAGGGTGGCCCCACCGGCCGAGACAGCGAGCATATGCAGGAAGCGGCAGGAATAAGGAAAAGCAGC
CTCCTGACTTTCCTCGCTTGGTGGTTTGAGTGGACCTCCCAGGCCAGTGCCGGGCCCCTCATAGGAGAGG
AAGCTCGGGAGGTGGCCAGGCGGCAGGAAGGCGCACCCCCCCAGCAATCCGCGCGCCGGGACAGAATGCC
CTGCAGGAACTTCTTCTGGAAGACCTTCTCCTCCTGCAAATAAAACCTCACCCATGAATGCTCACGCAAG
TTTAATTACAGACCTGAA
"""