COMPSCI 369 – Assignment 2 – Axton Pitt (2671457), Friday May 13th

Problem 1 – Jukes-Cantor Model of DNA sequence evolution

1a)

Below is a function for simulating the Jukes-Cantor Model of DNA sequence evolution.


In [16]:
def jukesCantorSimulation(sequenceLength, time, mutationRate):

    # generate random sequence of length perscribed
    from random import randint
    from random import random
    from math import log

    ancestoralSequence = []
    decendentSequence1 = []
    decendentSequence2 = []

    # create DNA seq from 0,1,2,3 as
    for index in range(0, sequenceLength):
        randomBase = randint(0,3)
        ancestoralSequence.append(randomBase)
        # print(ancestoralSequence)

    decendentSequence1 = ancestoralSequence.copy()
    decendentSequence2 = ancestoralSequence.copy()

    time1 = time
    time2 = time

    # create decendentSequence1
    while time1 > 0:

        # get a uniform random number representing U(0, 1)
        lengthOfTime = random()

        # sum probability of mutation
        sumProbability = sequenceLength * mutationRate * 0.75

        # turn this into time to mutation
        timeMinus = -log(lengthOfTime) / sumProbability

        if time1 <= 0:
            break
        else:
            bases = [0, 1, 2, 3]

            # index of base to mutate
            baseIndex = randint(0, sequenceLength-1)
            candidateIndex = randint(0, 3)
            candidateBase = bases[candidateIndex]

            # get current base
            currentBase = decendentSequence1[baseIndex]

            while currentBase == candidateBase:

                candidateIndex = randint(0, 3)
                candidateBase = bases[candidateIndex]

            decendentSequence1[baseIndex] = candidateBase

        time1 -= timeMinus

    # create decendentSequence2
    while time2 > 0:

        # get a uniform random number representing U(0, 1)
        lengthOfTime = random()

        # sum probability of mutation
        sumProbability = sequenceLength * mutationRate * 0.75

        # turn this into time to mutation
        timeMinus = -log(lengthOfTime) / sumProbability

        if time2 <= 0:
            break
        else:
            bases = [0, 1, 2, 3]

            # index of base to mutate
            baseIndex = randint(0, sequenceLength-1)
            candidateIndex = randint(0, 3)
            candidateBase = bases[candidateIndex]

            # get current base
            currentBase = decendentSequence2[baseIndex]

            while currentBase == candidateBase:
                candidateIndex = randint(0, 3)
                candidateBase = bases[candidateIndex]

            decendentSequence2[baseIndex] = candidateBase

        time2 -= timeMinus

    outputSequences = []

    # convert to bases
    # return ancestral sequence
    letterArrayAncestoralSequence = numberToBaseConversion(ancestoralSequence)

    # return descendant sequences
    letterArrayDecendentSequence1 = numberToBaseConversion(decendentSequence1)
    letterArrayDecendentSequence2 = numberToBaseConversion(decendentSequence2)

    outputSequences.append(letterArrayAncestoralSequence)
    outputSequences.append(letterArrayDecendentSequence1)
    outputSequences.append(letterArrayDecendentSequence2)

    return outputSequences

def numberToBaseConversion(array):
    newLetterArray = []

    for item in array:
        if item == 0:
            newLetterArray.append("A")
        elif item == 1:
            newLetterArray.append("C")
        elif item == 2:
            newLetterArray.append("G")
        elif item == 3:
            newLetterArray.append("T")

    return newLetterArray

def baseToNumberConversion(array):
    newArray = []

    for item in array:
        if item == "A":
            newArray.append(0)
        elif item == "C":
            newArray.append(1)
        elif item == "G":
            newArray.append(2)
        elif item == "T":
            newArray.append(3)

    return newArray

def printSequence(array):
    outputString = ""
    
    for item in array:
        outputString += item
        
    print(outputString)

def numberOfBaseDifferences(sequence1, sequence2):
    minimumSequence = min(sequence1, sequence2)
    minimumLength = len(minimumSequence)
    diff = 0
    # return number of sites at which sequence differs
    for index in range(0, minimumLength):
        if sequence1[index] != sequence2[index]:
            diff += 1
    return diff

# run with sequence length of 50, 0.01 mutation rate, and time = 10
sequences = jukesCantorSimulation(50, 0.01, 10)

# print sequences
for item in sequences:
    printSequence(item)

# print differences
print("Number of sites at which ancestral and decendent 1 differ:")
print(numberOfBaseDifferences(sequences[0], sequences[1]))

print("Number of sites at which ancestral and decendent 2 differ:")
print(numberOfBaseDifferences(sequences[0], sequences[2]))

print("Number of sites at which decendent 1 and decendent 2 differ:")
print(numberOfBaseDifferences(sequences[1], sequences[2]))


CTAAGCGTGACGCGCGCGGTAGTACGCGGTGGTCCTTTAGAGTACATCGC
CTAAGCGTGACGGGCCCGGTAGTACGCGGTGGTCCTTCATAGTACATCGA
CTAAGCGTGATGCGCCCAGTAGTACGAGGTGGTCCTTTAGAGTACATAGC
Number of sites at which ancestral and decendent 1 differ:
5
Number of sites at which ancestral and decendent 2 differ:
5
Number of sites at which decendent 1 and decendent 2 differ:
8

1b)

You would expect the average number of mutations that occur to be Poisson distributed equal to parameter [2tL\frac{3}{4}\mu].


In [18]:
def distributionSimulation():

    # repeat 1000 times
    # run with sequence length of 1000, 0.01 mutation rate, and time = 25
    count = 0
    totalCount = 0
    totalSquaredSum = 0

    for index in range(0, 1000):
        simulatedSequences = jukesCantorSimulation(1000, 0.01, 25)
        count = numberOfBaseDifferences(simulatedSequences[1], simulatedSequences[2])
        # print(count)
        totalCount += count
        totalSquaredSum += count**2


    # determine mean and variance
    mean = totalCount / 1000
    variance = (totalSquaredSum / 1000) - mean**2

    print(mean)
    print(variance)

distributionSimulation()


296.399
206.23379899999418

For the distribution to be Poisson distributed, the mean and variance should be equal to (2tL\frac{3}{4}\mu). As (2tL\frac{3}{4}\mu) = (2(25)(1000)\frac{3}{4}(0.01)) = 375. As the mean is 296.399 and variance is 206.234 it looks like the number of mutations that occur is not Poisson distributed.

1c)

Simulating a pair of sibling sequences, B and C, of length 10000 with time = 10 units and mutation rate 0.03


In [3]:
def numberOfSpecificBaseDifferences(sequence1, sequence2):
    minimumSequence = min(sequence1, sequence2)
    minimumLength = len(minimumSequence)

    results = []
    AtoA = 0
    AtoT = 0
    AtoC = 0
    AtoG = 0
    totalAto = 0

    # return number of sites at which sequence differs
    for index in range(0, minimumLength):
        if sequence1[index] == "A" and sequence2[index] == "A":
            AtoA += 1
        elif sequence1[index] == "A" and sequence2[index] == "T":
            AtoT += 1
        elif sequence1[index] == "A" and sequence2[index] == "C":
            AtoC += 1
        elif sequence1[index] == "A" and sequence2[index] == "G":
            AtoG += 1

    totalAto = AtoA + AtoT + AtoC + AtoG

    results.append(AtoA)
    results.append(AtoT)
    results.append(AtoC)
    results.append(AtoG)
    results.append(totalAto)

    return results


def calculateProbabilityAB():
    import numpy

    oneCSequences = jukesCantorSimulation(10000, 10, 0.03)
    b = oneCSequences[1]
    c = oneCSequences[2]

    values = numberOfSpecificBaseDifferences(b, c)
    estimatePABSame = values[0] / values[4] # theoretical value is 0.6616
    estimatePABTotalDiff = (values[1] + values[2] + values[3]) / values[4] # theoretical value is 0.112797
    estimatePABDiff = estimatePABTotalDiff / 3
    print("Probability Pab a = b:")
    print(estimatePABSame)
    print("Probability Pab a =/= b:")
    print(estimatePABDiff)

calculateProbabilityAB()


Probability Pab a = b:
0.6656262192742879
Probability Pab a =/= b:
0.11145792690857069

Estimates above a similar to the theoretical values of 0.6616 and 0.112797

Problem 2 – Jukes-Cantor Model of DNA sequence evolution with insertions and deletions

2a)

Simulation extended to include insertions and deletions.


In [22]:
def simulationWithInsertionsAndDeletions(sequence1, sequence2, sequenceLength, time, mutationRate):
    import random

    decendentSequence1 = sequence1
    decendentSequence2 = sequence2

    # draw a Poisson random variate hi with rate Lt\mu/10 - for number of insertions
    rate = (sequenceLength * time * mutationRate) / 10
    noOfInsertions = poissonDistribution(rate, 1) # T = 1

    # draw a Poisson random variate hd with rate Lt\mu/10 - for number of deletions
    noOfDeletions = poissonDistribution(rate, 1) # T = 1

    # print(noOfInsertions)
    # print(noOfDeletions)

    for count in range(0, noOfInsertions):
        inserts = insertionSequence()
        # add insertions to sequence
        indexToInsert = random.randint(0, sequenceLength)
        indexToInsertP1 = indexToInsert + 1
        indexToInsertP2 = indexToInsertP1 + 1
        decendentSequence1.insert(indexToInsert, inserts[0])
        decendentSequence1.insert(indexToInsertP1, inserts[1])
        decendentSequence1.insert(indexToInsertP2, inserts[2])
        decendentSequence2.insert(indexToInsert, inserts[0])
        decendentSequence2.insert(indexToInsertP1, inserts[1])
        decendentSequence2.insert(indexToInsertP2, inserts[2])

    for count in range(0, noOfDeletions):
        deletion = "-"
        # add deletions to sequence
        indexToInsert = random.randint(0, sequenceLength)
        indexToInsertP1 = indexToInsert + 1
        indexToInsertP2 = indexToInsertP1 + 1
        decendentSequence1[indexToInsert] = deletion
        decendentSequence1[indexToInsertP1] = deletion
        decendentSequence1[indexToInsertP2] = deletion
        decendentSequence2[indexToInsert] = deletion
        decendentSequence2[indexToInsertP1] = deletion
        decendentSequence2[indexToInsertP2] = deletion

    printSequence(decendentSequence1)
    printSequence(decendentSequence2)

    return [decendentSequence1, decendentSequence2]

def insertionSequence():
    import random
    insertionSeq = []
    bases = ["A", "T", "C", "G"]
    for count in range(0, 3):
        baseIndex = random.randint(0,3)
        base = bases[baseIndex]
        insertionSeq.append(base)

    return insertionSeq



def poissonDistribution(lbd, T):
    times = []
    t = randexp(lbd)
    while t < T: # draw exp(lambda) variables until we are past T
        times.append(t)
        t += randexp(lbd)
    return len(times)

def randexp(lbd): # lambda is a keyword in Python
    from math import log
    from random import random
    return - log(random()) / lbd

newSequences = jukesCantorSimulation(50, 20, 0.01)
sequences = simulationWithInsertionsAndDeletions(newSequences[1], newSequences[2], 50, 20, 0.01)


A---TAC---TAATTACAATGTCATAG---AAGGCAGATCAGGGTAGCTA
T---TGC---TATTTACGATATTAGAG---AAGGAAGATCAAGATAGTTG

2b)

If this model was used, it would alter the sequence length. This would not result in a Poisson distributed number of insertions and deletions with parameter 2Ltμ/10, as the total sequence length has the potential to change and alter the total probability of mutation.

Problem 3 – Overlap alignment algorithm

3a)

Implementation of overlap alignment algorithm.


In [6]:
def overlapAlignment(sequence1, sequence2, scoreMatrix, gapPenalty):
    import numpy

    fMatrix = numpy.zeros((len(sequence1), len(sequence2)))

    # construct the matrix
    for i in range(0, len(sequence1)):
        fMatrix[i][0] = 0*i # as this is overlap do not need penalty here
    for j in range(0, len(sequence2)):
        fMatrix[j][0] = 0*j
    for i in range(1, len(sequence1)):
        for j in range(1, len(sequence2)):
            matchValue = fMatrix[i-1][j-1] + scoreMatrix[sequence1[i]][sequence2[j]]
            deleteValue = fMatrix[i-1][j] + gapPenalty
            insertValue = fMatrix[i][j-1] + gapPenalty
            fMatrix[i][j] = max(matchValue, deleteValue, insertValue)

    # backtracking and form alignment - need to get max from last column and row
    maxX = 0
    maxY = 0

    for i in range(0, len(sequence1)):
        if fMatrix[i][0] > maxX:
            maxX = fMatrix[i][0]

    for j in range(0, len(sequence2)):
        if fMatrix[0][j] > maxY:
            maxY = fMatrix[0][j]

    alignment1 = ""
    alignment2 = ""
    i = maxX
    j = maxY

    while i > 0 or j > 0:
        if i > 0 and j > 0 and fMatrix[i][j] == fMatrix[i-1][j-1] + scoreMatrix[sequence1[i]][sequence2[j]]:
            alignment1 = sequence1[i] + alignment1
            alignment2 = sequence2[j] + alignment2
            i -= 1
            j -= 1
        elif i > 0 and fMatrix[i][j] == fMatrix[i-1][j] + gapPenalty:
            alignment1 = sequence1[i] + alignment1
            alignment2 = "-" + alignment2
            i -= 1
        elif j > 0 and fMatrix[i][j] == fMatrix[i][j-1] + gapPenalty:
            alignment1 = "-" + alignment1
            alignment2 = sequence2[j] + alignment2
            j -= 1

    print(alignment1)
    print(alignment2)

import numpy

newSequences = jukesCantorSimulation(50, 20, 0.01)
sequences = simulationWithInsertionsAndDeletions(newSequences[1], newSequences[2], 50, 20, 0.01)
scoreMatrix = numpy.zeros((len(sequences[0]), len(sequences[1])))
for i in range(1, len(sequences[0])):
    for j in range(1, len(sequences[1])):
        scoreMatrix[i][j] = 2

overlapAlignment(sequences[0], sequences[1], scoreMatrix, -3)