notebook.community

Edit and run



In [1]:

    
import math
from collections import defaultdict

class MarkovChainOrder(object):
    ''' Simple higher-order Markov chain, specifically for DNA
        sequences.  User provides training data (DNA strings).  '''
    
    def __init__(self, examples, order=1):
        ''' Initialize the model given collection of example DNA
            strings. '''
        self.order = order
        self.mers = defaultdict(int)
        self.longestMer = longestMer = order + 1
        for ex in examples:
            # count up all k-mers of length 'longestMer' or shorter
            for i in range(len(ex) - longestMer + 1):
                for j in range(longestMer, -1, -1):
                    self.mers[ex[i:i+j]] += 1
    
    def condProb(self, obs, given):
        ''' Return conditional probability of seeing nucleotide "obs"
            given we just saw nucleotide string "given".  Length of
            "given" can't exceed model order.  Return None if "given"
            was never observed. '''
        assert len(given) <= self.order
        ngiven = self.mers[given]
        if ngiven == 0: return None
        return float(self.mers[given + obs]) / self.mers[given]
    
    def jointProb(self, s):
        ''' Return joint probability of observing string s '''
        cum = 1.0
        for i in range(self.longestMer-1, len(s)):
            obs, given = s[i], s[i-self.longestMer+1:i]
            p = self.condProb(obs, given)
            if p is not None:
                cum *= p
        # include the smaller k-mers at the very beginning
        for j in range(self.longestMer-2, -1, -1):
            obs, given = s[j], s[:j]
            p = self.condProb(obs, given)
            if p is not None:
                cum *= p
        return cum
    
    def jointProbL(self, s):
        ''' Return log2 of joint probability of observing string s '''
        cum = 0.0
        for i in range(self.longestMer-1, len(s)):
            obs, given = s[i], s[i-self.longestMer+1:i]
            p = self.condProb(obs, given)
            if p is not None:
                cum += math.log(p, 2)
        # include the smaller k-mers at the very beginning
        for j in range(self.longestMer-2, -1, -1):
            obs, given = s[j], s[:j]
            p = self.condProb(obs, given)
            if p is not None:
                cum += math.log(p, 2)
        return cum



In [2]:

    
mc1 = MarkovChainOrder(['AC' * 10], 1)



In [3]:

    
mc1.condProb('A', 'C') # should be 1; C always followed by A









    Out[3]:





1.0



In [4]:

    
mc1.condProb('C', 'A') # should be 1; A always followed by C









    Out[4]:





1.0



In [5]:

    
mc1.condProb('G', 'A') # should be 0; A occurs but is never followed by G









    Out[5]:





0.0



In [6]:

    
mc2 = MarkovChainOrder(['AC' * 10], 2)



In [7]:

    
mc2.condProb('A', 'AC') # AC always followed by A









    Out[7]:





1.0



In [8]:

    
mc2.condProb('C', 'CA') # CA always followed by C









    Out[8]:





1.0



In [9]:

    
mc2.condProb('C', 'AA') is None # because AA doesn't occur









    Out[9]:





True



In [10]:

    
mc3 = MarkovChainOrder(['AAA1AAA2AAA2AAA3AAA3AAA3'], 3)



In [11]:

    
mc3.condProb('2', 'AAA') # 1/3









    Out[11]:





0.3333333333333333



In [12]:

    
mc3.condProb('3', 'AAA') # 1/2









    Out[12]:





0.5



In [13]:

    
p1 = mc3.condProb('A', '')
p1









    Out[13]:





0.7619047619047619



In [14]:

    
p2 = mc3.condProb('A', 'A')
p2









    Out[14]:





0.6875



In [15]:

    
p3 = mc3.condProb('A', 'AA')
p3









    Out[15]:





0.5454545454545454



In [16]:

    
p4 = mc3.condProb('1', 'AAA')
p4









    Out[16]:





0.16666666666666666



In [17]:

    
p1 * p2 * p3 * p4, mc3.jointProb('AAA1') # should be equal









    Out[17]:





(0.0476190476190476, 0.04761904761904761)



In [18]:

    
import math
math.log(mc3.jointProb('AAA1'), 2), mc3.jointProbL('AAA1') # should be equal









    Out[18]:





(-4.392317422778761, -4.392317422778761)