Sample a fasta file while introducing mutations in DNA sequences


In [2]:
from random import randint, uniform, choice
from Bio import SeqIO

Simple function introducing mutations into a sequence at a given frequency (mutated bases are written in lowercase, unchanged bases in uppercase)


In [3]:
def mutate_seq(seq, freq):
    """Return a copy of ``seq`` with random point substitutions.

    Each base is independently replaced, with probability ``freq``, by a
    different base drawn at random from a/t/c/g. Substituted bases are emitted
    in lower case and untouched bases in upper case, so the mutations are
    visible in the returned string.

    Args:
        seq: Iterable of single-character bases (e.g. a DNA string), any case.
        freq: Per-base mutation probability, expected to lie in [0, 1].

    Returns:
        A string containing one character per input base.
    """
    bases = ("a", "t", "c", "g")
    mutated = []
    for base in seq:
        # uniform(0, 1) falls in [0, 1), so a strict "<" mutates with
        # probability exactly ``freq`` and never mutates when freq == 0.
        if uniform(0, 1) < freq:
            # A character outside a/t/c/g (e.g. "N") can become any of the four.
            mutated.append(choice([b for b in bases if b != base.lower()]))
        else:
            mutated.append(base.upper())
    # Join once instead of growing a string, and avoid shadowing builtin str.
    return "".join(mutated)

In [4]:
# Demo: mutate each base of a short sequence with a 10% probability
mutate_seq(
    seq="CTCGATCGCTAGCATGCATCGTCGCATGCTCGATCAGCTAGCAGCATCAGCTAGCTGCATCAGTCA",
    freq=0.1,
)


Out[4]:
'tTCaATatgTAGCATGCATCGTCtCATaCTCGcTCAGCTAGCgGCATCAGCTcGCTGCATCgGTCA'

Example using a test dataset: fragments are sampled from the forward and reverse strands with an increasing frequency of mutations


In [5]:
# Load the test FASTA file as one record; its sequence (a.seq) is the template
# from which the mutated fragments are sampled when ./data/sample.fa is written.
# NOTE(review): SeqIO.read is documented to require exactly one record in the file.
a = SeqIO.read("./data/test.fa", "fasta")

In [6]:
# Write simulated queries to ./data/sample.fa: for each strand (forward first,
# then reverse complement) and each mutation rate from 1% to 50%, take one
# randomly positioned fragment of the template and mutate it at that rate.
QUERY_LEN = 100             # number of bases in every sampled fragment
MAX_MUTATION_PERCENT = 50   # rates run from 1% up to this value, inclusive

# Fail before the output file is opened (and truncated) if no fragment fits.
if len(a.seq) < QUERY_LEN:
    raise ValueError(
        "template sequence has {} bases; at least {} are required".format(
            len(a.seq), QUERY_LEN))

# Convert each strand to a plain string once rather than on every iteration.
templates = (
    ("forward", str(a.seq)),
    ("reverse", str(a.seq.reverse_complement())),
)

with open("./data/sample.fa", "w") as fp:
    for strand, template in templates:
        for percent in range(1, MAX_MUTATION_PERCENT + 1):
            fp.write(">query_{:03d}%_mutation_{}\n".format(percent, strand))
            # randint is inclusive at both ends, so the last valid start
            # still yields a full QUERY_LEN-base slice.
            start = randint(0, len(template) - QUERY_LEN)
            fragment = template[start:start + QUERY_LEN]
            fp.write("{}\n".format(mutate_seq(fragment, percent / 100.0)))