Sequencing

  • Make a toy sequencing library in standard Python for processing DNA, RNA and protein data.
  • Implement the DNA, RNA and proteins as Python classes
  • Make methods for transcription, translation, regulation.
  • Compute several sequence similarity scores, such as Hamming distance and mutual information.

This task has less to do with NGS procedures and more to do with the processing of raw sequences and of the main files involved in sequencing. It is also a bit more scholarly in that, while it is good to use certain specialized libraries, each subtask should also be performed without them. Use the template supplied below.


In [ ]:
class BaseSequence(object):
    """Common behaviour shared by the DNA, RNA and protein sequence classes.

    The sequence is kept as a plain string in ``self.seq``. Subclasses set
    ``ValidChars`` to a string holding every character of their alphabet.
    """

    # Alphabet of the sequence type. None (the base-class default) means
    # "no restriction"; subclasses override it with a string of characters.
    ValidChars = None

    def __init__(self, seqstring):
        """Store *seqstring* unchanged as ``self.seq`` (no validation)."""
        self.seq = seqstring

    def cleanup_sequence(self, seqstring):
        """Return *seqstring* upper-cased and stripped of invalid characters.

        Use self.ValidChars to get the valid characters for each class.
        The comparison is made against the upper-cased input, so
        ``ValidChars`` is expected to contain upper-case characters. When
        ``ValidChars`` is None nothing is filtered out and the upper-cased
        input is returned as is.

        ``self.seq`` is not modified.
        """
        upper = seqstring.upper()
        if self.ValidChars is None:
            return upper
        # Build the set once so each membership test is O(1).
        allowed = set(self.ValidChars)
        return "".join(ch for ch in upper if ch in allowed)

    def append_sequence(self, seqstring):
        """Append *seqstring* to ``self.seq`` after cleaning it.

        The new fragment goes through ``cleanup_sequence`` first, so invalid
        characters are dropped rather than appended. Returns None.
        """
        self.seq += self.cleanup_sequence(seqstring)

    def cg_content(self):
        """Return the percentage of G and C characters in the sequence.

        The count is case-insensitive and the result is a float between 0.0
        and 100.0, relative to the full length of ``self.seq`` (characters
        that are not part of the alphabet still count towards the length).
        An empty sequence yields 0.0 instead of dividing by zero.
        """
        if not self.seq:
            return 0.0
        upper = self.seq.upper()
        gc_count = upper.count("G") + upper.count("C")
        # 100.0 forces float division on Python 2 as well.
        return 100.0 * gc_count / len(upper)

    def get_ORFs(self):
        """
        Return all possible open reading frames

        Placeholder: no frames are computed yet, an empty set is returned.
        """
        return set()

    def load_from_fasta(self, fname):
        """Aside from loading the sequence, a few stats should also be displayed

        Placeholder: *fname* is not read yet and None is returned.
        """
        return None

class DNASequence(BaseSequence):
    """DNA sequence over the alphabet T, C, A, G."""

    ValidChars = 'TCAG'

    # Transcription table: every DNA base is replaced by its complementary
    # RNA base. Same mapping as the translation table 'TCGA' -> 'AGCU'.
    _TRANSCRIPTION = {'T': 'A', 'C': 'G', 'G': 'C', 'A': 'U'}

    def __init__(self, seqstring):
        """Store *seqstring* through the base-class constructor."""
        super(DNASequence, self).__init__(seqstring)

    def transcribe(self):
        """Task 1.2
        Return an RNASequence holding the transcript of ``self.seq``.

        Each character is looked up in the transcription table
        (T->A, C->G, G->C, A->U). Characters that are not in the table,
        including lower-case letters, are copied through unchanged, exactly
        as a ``str.translate`` call with the same table would do.
        """
        table = self._TRANSCRIPTION
        transeq = "".join(table.get(base, base) for base in self.seq)
        return RNASequence(transeq)

    def read_fasta(self, fpath):
        """Task 1.4
        Update self.seq

        Placeholder: *fpath* is not read yet, ``self.seq`` is left untouched
        and None is returned.
        """
        return None

    def binding_site_pos(self, pattern):
        """
        Ex: CCAXXXTGXXXCGG. Use regex.

        Placeholder: *pattern* is not searched for yet; -1 is always
        returned.
        """
        return -1

    def restriction_site_pos2(self, setp):
        """
        setp - set of patterns
        Example: CCGCGG, CCGTGG, CCACGG, or CCATGG
        The purpose of this method is to establish a regex pattern
        automatically based on the given sequences and then apply it.

        Placeholder: *setp* is not used yet; -1 is always returned.
        """
        return -1

    def get_correct_ORF(self):
        """Task 2.6
        Use the parent class get_ORFs method.
        An open reading frame starts with the start codon ATG (Met)
        in most species and ends with one of the three stop codons (TAA, TAG or TGA)

        Placeholder: nothing is computed yet and None is returned.
        """
        return None

    def direct_translation(self):
        """Task 2.7
        Use the RNASequence.pmap

        Placeholder: no codons are translated yet; the returned
        ProteinSequence is built from an empty string.
        """
        return ProteinSequence("")
        

class RNASequence(BaseSequence):
    """RNA sequence over the alphabet A, G, C, U."""

    ValidChars = 'AGCU'

    # Codon table: maps each RNA triplet to a one-letter amino-acid code,
    # or to the string "STOP" for the three stop codons.
    pmap = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L",
    "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S",
    "UAU":"Y", "UAC":"Y", "UAA":"STOP", "UAG":"STOP",
    "UGU":"C", "UGC":"C", "UGA":"STOP", "UGG":"W",
    "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L",
    "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P",
    "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
    "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R",
    "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M",
    "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T",
    "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K",
    "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R",
    "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V",
    "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A",
    "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E",
    "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",}

    def __init__(self, seqstring):
        """Store *seqstring* through the base-class constructor."""
        super(RNASequence, self).__init__(seqstring)

    def transcribe(self):
        """Return this instance's sequence string, ``self.seq``."""
        # Must be the instance attribute: a bare ``seq`` is not defined in
        # this scope and would pick up an unrelated module-level variable of
        # that name (or raise NameError when there is none).
        return self.seq
        
class ProteinSequence(BaseSequence):
    """Protein sequence written with one-letter amino-acid codes."""

    # The 20 distinct one-letter codes that appear as values of
    # RNASequence.pmap. Joining set(pmap.values()) directly is not usable
    # here: the multi-character "STOP" token would leak its letters into the
    # alphabet (adding 'O' and repeating 'S', 'T' and 'P').
    ValidChars = 'ACDEFGHIKLMNPQRSTVWY'

    def __init__(self, seqstring):
        """Store *seqstring* through the base-class constructor."""
        super(ProteinSequence, self).__init__(seqstring)

    def load_PDB(self):
        """Placeholder for PDB loading: does nothing yet and returns None."""
        return None

class NGSCollection(object):
    """Collection of reads from a FASTQ file (template, not implemented)."""

    def __init__(self, fastqfn):
        """Placeholder constructor.

        Meant to read the FASTQ file *fastqfn* into an appropriate native
        Python collection. For now the argument is ignored and nothing is
        stored on the instance.
        """
        pass

    def QC_metrics(self):
        """Placeholder for a few QC metrics and plots; returns None."""
        return None

# Usage examples, kept commented out (the input strings deliberately
# contain characters outside the sequence alphabets):
#bs = BaseSequence('kajsdfhslkjfhiugdfj')
#dna = DNASequence('aCtgHIccctga')

# classobj vs type: old-style and new-style classes (Python 2 print syntax)
#print type(BaseSequence)
#print type(DNASequence)

# Sample nucleotide sequence. The rows are joined with newlines and the
# empty first and last items give the string a leading and a trailing
# newline, so the value still contains line breaks that need cleaning.
seq = "\n".join([
    "",
    "ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGCTGCTCTCCGGGGCCACGGCCACCGCTGCCCTGCC",
    "CCTGGAGGGTGGCCCCACCGGCCGAGACAGCGAGCATATGCAGGAAGCGGCAGGAATAAGGAAAAGCAGC",
    "CTCCTGACTTTCCTCGCTTGGTGGTTTGAGTGGACCTCCCAGGCCAGTGCCGGGCCCCTCATAGGAGAGG",
    "AAGCTCGGGAGGTGGCCAGGCGGCAGGAAGGCGCACCCCCCCAGCAATCCGCGCGCCGGGACAGAATGCC",
    "CTGCAGGAACTTCTTCTGGAAGACCTTCTCCTCCTGCAAATAAAACCTCACCCATGAATGCTCACGCAAG",
    "TTTAATTACAGACCTGAA",
    "",
])