In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import brewer2mpl
import colorsys
import math

from datetime import datetime
from Bio import AlignIO, SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Levenshtein import distance
from itertools import combinations, product, permutations
from time import time
from __future__ import division
from collections import Counter
from copy import deepcopy
from random import shuffle, choice, sample
from scipy.stats.mstats import mquantiles
from scipy.stats import norm, expon, poisson
from scipy.misc import comb
from IPython.display import Math
%matplotlib inline

In [40]:
bases = ['t', 'c', 'a', 'g']
codons = [a+b+c for a in bases for b in bases for c in bases]
amino_acids = "F F L L S S S S Y Y * * C C * W L L L L P P P P H H Q Q R R R R I I I M T T T T N N K K S S R R V V V V A A A A D D E E G G G G".split(' ')
codon_table = dict(zip(codons, amino_acids)) 
codon_table


Out[40]:
{'aaa': 'K',
 'aac': 'N',
 'aag': 'K',
 'aat': 'N',
 'aca': 'T',
 'acc': 'T',
 'acg': 'T',
 'act': 'T',
 'aga': 'R',
 'agc': 'S',
 'agg': 'R',
 'agt': 'S',
 'ata': 'I',
 'atc': 'I',
 'atg': 'M',
 'att': 'I',
 'caa': 'Q',
 'cac': 'H',
 'cag': 'Q',
 'cat': 'H',
 'cca': 'P',
 'ccc': 'P',
 'ccg': 'P',
 'cct': 'P',
 'cga': 'R',
 'cgc': 'R',
 'cgg': 'R',
 'cgt': 'R',
 'cta': 'L',
 'ctc': 'L',
 'ctg': 'L',
 'ctt': 'L',
 'gaa': 'E',
 'gac': 'D',
 'gag': 'E',
 'gat': 'D',
 'gca': 'A',
 'gcc': 'A',
 'gcg': 'A',
 'gct': 'A',
 'gga': 'G',
 'ggc': 'G',
 'ggg': 'G',
 'ggt': 'G',
 'gta': 'V',
 'gtc': 'V',
 'gtg': 'V',
 'gtt': 'V',
 'taa': '*',
 'tac': 'Y',
 'tag': '*',
 'tat': 'Y',
 'tca': 'S',
 'tcc': 'S',
 'tcg': 'S',
 'tct': 'S',
 'tga': '*',
 'tgc': 'C',
 'tgg': 'W',
 'tgt': 'C',
 'tta': 'L',
 'ttc': 'F',
 'ttg': 'L',
 'ttt': 'F'}

In [23]:
def prob_q_mutations_in_one_codon(n_amino_acids, m_mutations, q_mutations):
    """
    Given n amino acids, what is the probability of q mutations occurring in one codon out of a total of m mutations available?
    
    This function can be integrated into a larger framework where q_mutations_in_codon is the edit distance between
    two codons.
    
    Parameters:
    - n_amino_acids: the length of the protein in amino acids.
    - m_mutations: the number of mutations that happen in total.
    - q_mutations: the number of mutations that happen in the codon.
    """
    
    return comb(n_amino_acids * 3 - 3, m_mutations - q_mutations) / comb(n_amino_acids * 3, m_mutations)

prob_q_mutations_in_one_codon(n_amino_acids=719, m_mutations=10, q_mutations=2)


Out[23]:
1.9280947751649017e-05

In [24]:
letters = {'a', 't', 'g', 'c'}
transition_matrix = dict()

for l1, in letters:
    transition_matrix[l1] = dict()
    for l2 in letters:
        if l1 != l2:
            transition_matrix[l1][l2] = 1/3
            
def prob_transition(l1, l2, transition_matrix):
    return transition_matrix[l1][l2]

prob_transition(l1='a',l2='c',transition_matrix=transition_matrix)


Out[24]:
0.3333333333333333

In [34]:
def prob_m_mutations(time, mutation_rate, n_amino_acids, m_mutations):
    """
    Probability of m substitutions happening within a specified period.
    
    Parameters:
    - time: in years
    - mutation_rate: in units substitutions/(site.year)
    - n_amino_acids: the length of the protein in number of amino acids.
    - m_mutations: the number of substitutions whose likelihood is to be queried.
    """
    return poisson(int(time * mutation_rate * n_amino_acids * 3)).pmf(m_mutations)

prob_m_mutations(time=2, mutation_rate=0.005, n_amino_acids=719, m_mutations=24)


Out[34]:
0.071115757056855564

In [93]:
def prob_mutate_codon(codon1, codon2, n_amino_acids, m_mutations, time, mutation_rate, transition_matrix):
    """
    Probability of mutating from one codon to the next.
    
    Parameters:
    - codon1, codon2: string of 3 nucleotides.
    - n_amino_acids: the length of the protein in amino acids.
    - m_mutations: the number of mutations that happen in total.
    - time: an integer, the number of years.
    - mutation_rate: in units substitutions/(site.year).
    - transition_matrix: the mutation transition matrix.
    """
    
    q_mutations = distance(codon1, codon2)
    
    p_q = prob_q_mutations_in_one_codon(n_amino_acids, m_mutations, q_mutations)
    
    p_ts = []
    for l1, l2 in zip(codon1, codon2):
        if l1 != l2:
            p_ts.append(prob_transition(l1, l2, transition_matrix))
    p_t = np.prod(p_ts)
    
    p_m = prob_m_mutations(time, mutation_rate, n_amino_acids, m_mutations)
    
#     total_prob = 0
#     for p in [p_q, p_t, p_m]:
#         if p != 0:
#             total_prob += math.log10(p)

    return p_q * p_t * p_m

In [94]:
def prob_mutate_codon_aa(codon1, aa2, codon_table, n_amino_acids, m_mutations, time, mutation_rate, transition_matrix):
    """
    Probability of mutation from one codon to another amino acid.
    
    Parameters:
    - codon1: a three letter nucleotide string, small caps please.
    - aa2: a single letter amino acid string, LARGE CAPS PLEASE.
    - codon_table: a dictionary containing the codons and their corresponding amino acids.
    - n_amino_acids: the number of amino acids present in the protein.
    - m_mutations: the number of mutations specified.
    - time: in years.
    - mutation_rate: in units substitutions/(site.year)
    - transition_matrix: the mutation transition matrix.
    """
    
    codons = [codon for codon, aa in codon_table.items() if aa == aa2]
    
    probability = 0
    for codon2 in codons:
        probability += prob_mutate_codon(codon1, codon2, n_amino_acids, m_mutations, time, mutation_rate, transition_matrix)
        
    return probability

prob_mutate_codon_aa('ctg', 'S', codon_table, n_amino_acids=719, m_mutations=24, time=2, mutation_rate=0.005, transition_matrix=transition_matrix)


Out[94]:
1.2505533988888068e-06

In [95]:
codon1 = 'atc'
codon2 = 'ggc'

for l1, l2 in zip(codon1, codon2):
    if l1 != l2:
        print(prob_transition(l1, l2, transition_matrix))


0.333333333333
0.333333333333

In [96]:
prob_mutate_codon(codon1='atc', codon2='ggc', n_amino_acids=719, m_mutations=24, time=2, mutation_rate=0.005, transition_matrix=transition_matrix)


Out[96]:
9.2834004634451908e-07

In [122]:
codon1 = 'caa'
aa2 = 'K'
n_amino_acids = 719
# m_mutations = range(1, 719)
time = 10
mutation_rate = 0.005
transition_matrix = transition_matrix

likelihoods = []
for time in range(1, 10):
    for m_mutations in range(0, n_amino_acids * 3):
        prob = prob_mutate_codon_aa(codon1, aa2, codon_table, n_amino_acids, m_mutations, time, mutation_rate, transition_matrix)
        if math.isnan(prob) == False:
            likelihoods.append(prob)
            
    print(time)


1
2
3
4
5
6
7
8
9

In [123]:
np.sum(likelihoods)


Out[123]:
0.071096159023333216

In [119]:
time = 14
likelihoods = []
for m_mutations in range(0, n_amino_acids * 3):
    prob = prob_mutate_codon_aa(codon1, aa2, codon_table, n_amino_acids, m_mutations, time, mutation_rate, transition_matrix)
    if math.isnan(prob) == False:
        likelihoods.append(prob)
        
np.sum(likelihoods)


Out[119]:
0.021230868413448168

In [121]:
sum_likelihoods = []
for time in range(1, 5):
    likelihoods = []
    for m_mutations in range(0, n_amino_acids * 3):
        prob = prob_mutate_codon_aa(codon1, aa2, codon_table, n_amino_acids, m_mutations, time, mutation_rate, transition_matrix)
        if math.isnan(prob) == False:
            likelihoods.append(prob)
        
    sum_likelihoods.append(np.sum(likelihoods))
    print(time)
    
plt.plot(sum_likelihoods)


1
2
3
4
Out[121]:
[<matplotlib.lines.Line2D at 0x112422b50>]

In [ ]: