In [16]:
def jukesCantorSimulation(sequenceLength, time, mutationRate):
# generate random sequence of length perscribed
from random import randint
from random import random
from math import log
ancestoralSequence = []
decendentSequence1 = []
decendentSequence2 = []
# create DNA seq from 0,1,2,3 as
for index in range(0, sequenceLength):
randomBase = randint(0,3)
ancestoralSequence.append(randomBase)
# print(ancestoralSequence)
decendentSequence1 = ancestoralSequence.copy()
decendentSequence2 = ancestoralSequence.copy()
time1 = time
time2 = time
# create decendentSequence1
while time1 > 0:
# get a uniform random number representing U(0, 1)
lengthOfTime = random()
# sum probability of mutation
sumProbability = sequenceLength * mutationRate * 0.75
# turn this into time to mutation
timeMinus = -log(lengthOfTime) / sumProbability
if time1 <= 0:
break
else:
bases = [0, 1, 2, 3]
# index of base to mutate
baseIndex = randint(0, sequenceLength-1)
candidateIndex = randint(0, 3)
candidateBase = bases[candidateIndex]
# get current base
currentBase = decendentSequence1[baseIndex]
while currentBase == candidateBase:
candidateIndex = randint(0, 3)
candidateBase = bases[candidateIndex]
decendentSequence1[baseIndex] = candidateBase
time1 -= timeMinus
# create decendentSequence2
while time2 > 0:
# get a uniform random number representing U(0, 1)
lengthOfTime = random()
# sum probability of mutation
sumProbability = sequenceLength * mutationRate * 0.75
# turn this into time to mutation
timeMinus = -log(lengthOfTime) / sumProbability
if time2 <= 0:
break
else:
bases = [0, 1, 2, 3]
# index of base to mutate
baseIndex = randint(0, sequenceLength-1)
candidateIndex = randint(0, 3)
candidateBase = bases[candidateIndex]
# get current base
currentBase = decendentSequence2[baseIndex]
while currentBase == candidateBase:
candidateIndex = randint(0, 3)
candidateBase = bases[candidateIndex]
decendentSequence2[baseIndex] = candidateBase
time2 -= timeMinus
outputSequences = []
# convert to bases
# return ancestral sequence
letterArrayAncestoralSequence = numberToBaseConversion(ancestoralSequence)
# return descendant sequences
letterArrayDecendentSequence1 = numberToBaseConversion(decendentSequence1)
letterArrayDecendentSequence2 = numberToBaseConversion(decendentSequence2)
outputSequences.append(letterArrayAncestoralSequence)
outputSequences.append(letterArrayDecendentSequence1)
outputSequences.append(letterArrayDecendentSequence2)
return outputSequences
def numberToBaseConversion(array):
newLetterArray = []
for item in array:
if item == 0:
newLetterArray.append("A")
elif item == 1:
newLetterArray.append("C")
elif item == 2:
newLetterArray.append("G")
elif item == 3:
newLetterArray.append("T")
return newLetterArray
def baseToNumberConversion(array):
newArray = []
for item in array:
if item == "A":
newArray.append(0)
elif item == "C":
newArray.append(1)
elif item == "G":
newArray.append(2)
elif item == "T":
newArray.append(3)
return newArray
def printSequence(array):
outputString = ""
for item in array:
outputString += item
print(outputString)
def numberOfBaseDifferences(sequence1, sequence2):
minimumSequence = min(sequence1, sequence2)
minimumLength = len(minimumSequence)
diff = 0
# return number of sites at which sequence differs
for index in range(0, minimumLength):
if sequence1[index] != sequence2[index]:
diff += 1
return diff
# run with sequence length of 50, 0.01 mutation rate, and time = 10
sequences = jukesCantorSimulation(50, 0.01, 10)
# print sequences
for item in sequences:
printSequence(item)
# print differences
print("Number of sites at which ancestral and decendent 1 differ:")
print(numberOfBaseDifferences(sequences[0], sequences[1]))
print("Number of sites at which ancestral and decendent 2 differ:")
print(numberOfBaseDifferences(sequences[0], sequences[2]))
print("Number of sites at which decendent 1 and decendent 2 differ:")
print(numberOfBaseDifferences(sequences[1], sequences[2]))
In [18]:
def distributionSimulation():
# repeat 1000 times
# run with sequence length of 1000, 0.01 mutation rate, and time = 25
count = 0
totalCount = 0
totalSquaredSum = 0
for index in range(0, 1000):
simulatedSequences = jukesCantorSimulation(1000, 0.01, 25)
count = numberOfBaseDifferences(simulatedSequences[1], simulatedSequences[2])
# print(count)
totalCount += count
totalSquaredSum += count**2
# determine mean and variance
mean = totalCount / 1000
variance = (totalSquaredSum / 1000) - mean**2
print(mean)
print(variance)
distributionSimulation()
For the distribution to be Poisson distributed, the mean and variance should be equal to (2tL\frac{3}{4}\mu). As (2tL\frac{3}{4}\mu) = (2(25)(1000)\frac{3}{4}(0.01)) = 375. As the mean is 296.399 and variance is 206.234 it looks like the number of mutations that occur is not Poisson distributed.
In [3]:
def numberOfSpecificBaseDifferences(sequence1, sequence2):
minimumSequence = min(sequence1, sequence2)
minimumLength = len(minimumSequence)
results = []
AtoA = 0
AtoT = 0
AtoC = 0
AtoG = 0
totalAto = 0
# return number of sites at which sequence differs
for index in range(0, minimumLength):
if sequence1[index] == "A" and sequence2[index] == "A":
AtoA += 1
elif sequence1[index] == "A" and sequence2[index] == "T":
AtoT += 1
elif sequence1[index] == "A" and sequence2[index] == "C":
AtoC += 1
elif sequence1[index] == "A" and sequence2[index] == "G":
AtoG += 1
totalAto = AtoA + AtoT + AtoC + AtoG
results.append(AtoA)
results.append(AtoT)
results.append(AtoC)
results.append(AtoG)
results.append(totalAto)
return results
def calculateProbabilityAB():
import numpy
oneCSequences = jukesCantorSimulation(10000, 10, 0.03)
b = oneCSequences[1]
c = oneCSequences[2]
values = numberOfSpecificBaseDifferences(b, c)
estimatePABSame = values[0] / values[4] # theoretical value is 0.6616
estimatePABTotalDiff = (values[1] + values[2] + values[3]) / values[4] # theoretical value is 0.112797
estimatePABDiff = estimatePABTotalDiff / 3
print("Probability Pab a = b:")
print(estimatePABSame)
print("Probability Pab a =/= b:")
print(estimatePABDiff)
calculateProbabilityAB()
Estimates above a similar to the theoretical values of 0.6616 and 0.112797
In [22]:
def simulationWithInsertionsAndDeletions(sequence1, sequence2, sequenceLength, time, mutationRate):
import random
decendentSequence1 = sequence1
decendentSequence2 = sequence2
# draw a Poisson random variate hi with rate Lt\mu/10 - for number of insertions
rate = (sequenceLength * time * mutationRate) / 10
noOfInsertions = poissonDistribution(rate, 1) # T = 1
# draw a Poisson random variate hd with rate Lt\mu/10 - for number of deletions
noOfDeletions = poissonDistribution(rate, 1) # T = 1
# print(noOfInsertions)
# print(noOfDeletions)
for count in range(0, noOfInsertions):
inserts = insertionSequence()
# add insertions to sequence
indexToInsert = random.randint(0, sequenceLength)
indexToInsertP1 = indexToInsert + 1
indexToInsertP2 = indexToInsertP1 + 1
decendentSequence1.insert(indexToInsert, inserts[0])
decendentSequence1.insert(indexToInsertP1, inserts[1])
decendentSequence1.insert(indexToInsertP2, inserts[2])
decendentSequence2.insert(indexToInsert, inserts[0])
decendentSequence2.insert(indexToInsertP1, inserts[1])
decendentSequence2.insert(indexToInsertP2, inserts[2])
for count in range(0, noOfDeletions):
deletion = "-"
# add deletions to sequence
indexToInsert = random.randint(0, sequenceLength)
indexToInsertP1 = indexToInsert + 1
indexToInsertP2 = indexToInsertP1 + 1
decendentSequence1[indexToInsert] = deletion
decendentSequence1[indexToInsertP1] = deletion
decendentSequence1[indexToInsertP2] = deletion
decendentSequence2[indexToInsert] = deletion
decendentSequence2[indexToInsertP1] = deletion
decendentSequence2[indexToInsertP2] = deletion
printSequence(decendentSequence1)
printSequence(decendentSequence2)
return [decendentSequence1, decendentSequence2]
def insertionSequence():
import random
insertionSeq = []
bases = ["A", "T", "C", "G"]
for count in range(0, 3):
baseIndex = random.randint(0,3)
base = bases[baseIndex]
insertionSeq.append(base)
return insertionSeq
def poissonDistribution(lbd, T):
times = []
t = randexp(lbd)
while t < T: # draw exp(lambda) variables until we are past T
times.append(t)
t += randexp(lbd)
return len(times)
def randexp(lbd): # lambda is a keyword in Python
from math import log
from random import random
return - log(random()) / lbd
newSequences = jukesCantorSimulation(50, 20, 0.01)
sequences = simulationWithInsertionsAndDeletions(newSequences[1], newSequences[2], 50, 20, 0.01)
In [6]:
def overlapAlignment(sequence1, sequence2, scoreMatrix, gapPenalty):
import numpy
fMatrix = numpy.zeros((len(sequence1), len(sequence2)))
# construct the matrix
for i in range(0, len(sequence1)):
fMatrix[i][0] = 0*i # as this is overlap do not need penalty here
for j in range(0, len(sequence2)):
fMatrix[j][0] = 0*j
for i in range(1, len(sequence1)):
for j in range(1, len(sequence2)):
matchValue = fMatrix[i-1][j-1] + scoreMatrix[sequence1[i]][sequence2[j]]
deleteValue = fMatrix[i-1][j] + gapPenalty
insertValue = fMatrix[i][j-1] + gapPenalty
fMatrix[i][j] = max(matchValue, deleteValue, insertValue)
# backtracking and form alignment - need to get max from last column and row
maxX = 0
maxY = 0
for i in range(0, len(sequence1)):
if fMatrix[i][0] > maxX:
maxX = fMatrix[i][0]
for j in range(0, len(sequence2)):
if fMatrix[0][j] > maxY:
maxY = fMatrix[0][j]
alignment1 = ""
alignment2 = ""
i = maxX
j = maxY
while i > 0 or j > 0:
if i > 0 and j > 0 and fMatrix[i][j] == fMatrix[i-1][j-1] + scoreMatrix[sequence1[i]][sequence2[j]]:
alignment1 = sequence1[i] + alignment1
alignment2 = sequence2[j] + alignment2
i -= 1
j -= 1
elif i > 0 and fMatrix[i][j] == fMatrix[i-1][j] + gapPenalty:
alignment1 = sequence1[i] + alignment1
alignment2 = "-" + alignment2
i -= 1
elif j > 0 and fMatrix[i][j] == fMatrix[i][j-1] + gapPenalty:
alignment1 = "-" + alignment1
alignment2 = sequence2[j] + alignment2
j -= 1
print(alignment1)
print(alignment2)
import numpy
newSequences = jukesCantorSimulation(50, 20, 0.01)
sequences = simulationWithInsertionsAndDeletions(newSequences[1], newSequences[2], 50, 20, 0.01)
scoreMatrix = numpy.zeros((len(sequences[0]), len(sequences[1])))
for i in range(1, len(sequences[0])):
for j in range(1, len(sequences[1])):
scoreMatrix[i][j] = 2
overlapAlignment(sequences[0], sequences[1], scoreMatrix, -3)