In [37]:
from pylab import *
import random
from random import choice

In [27]:
# Repeat unit used to illustrate what a tandem repeat looks like.
monomer = 'ACCGCTAGAAAT'
# Nucleotide symbols that random monomers are drawn from.
alphabet = ['A', 'C', 'G', 'T']
# Show ten back-to-back copies of the repeat unit.
10 * monomer


Out[27]:
'ACCGCTAGAAATACCGCTAGAAATACCGCTAGAAATACCGCTAGAAATACCGCTAGAAATACCGCTAGAAATACCGCTAGAAATACCGCTAGAAATACCGCTAGAAATACCGCTAGAAAT'

In [31]:
def debugSequence(maxFrequency, sectionLength, startFrequency=1, symbols=None):
    """Build a test sequence of tandem-repeat sections with increasing period.

    For every period ``frequency`` from ``startFrequency`` through
    ``maxFrequency`` (inclusive), a random monomer of ``frequency`` symbols is
    drawn and repeated just often enough to span at least ``sectionLength``
    symbols. The sections are concatenated in order of increasing period.

    Parameters
    ----------
    maxFrequency : int
        Longest monomer length (repeat period) to include.
    sectionLength : int
        Minimum length of each section. A section overshoots this value when
        the monomer length does not divide it evenly.
    startFrequency : int, optional
        Shortest monomer length to include. Periods below 1 cannot form a
        repeat and are skipped.
    symbols : sequence, optional
        Symbols the monomers are drawn from. Defaults to the notebook-level
        ``alphabet`` so existing calls behave as before.

    Returns
    -------
    str
        The concatenated sections; empty when no period falls in the range.
    """
    if symbols is None:
        symbols = alphabet
    sections = []
    # Start at 1 at the earliest: a zero-length monomer would divide by zero.
    for frequency in range(max(startFrequency, 1), maxFrequency + 1):
        monomer = ''.join(choice(symbols) for _ in range(frequency))
        # Integer ceiling division keeps `copies` an int; a float-valued ceil
        # (as provided by pylab/numpy) cannot be used to repeat a str.
        copies = int(-(-sectionLength // frequency))
        sections.append(monomer * copies)
    return ''.join(sections)

In [32]:
# Quick check: repeat periods 3, 4 and 5, each section spanning at least 20 symbols.
debugSequence(maxFrequency=5, sectionLength=20, startFrequency=3)


Out[32]:
'AACAACAACAACAACAACAACCGGTCGGTCGGTCGGTCGGTAGCCCAGCCCAGCCCAGCCC'

In [2]:
# Table of (triplet, count) pairs covering the 64 three-letter combinations of
# T, C, A and G. The counts are normalised into sampling probabilities further
# down in the notebook.
# NOTE(review): presumably codon-usage counts for some reference genome -- the
# source organism/dataset is not recorded here; confirm and cite it.
biases = [('TTT', 80995),
('TCT', 38027),
('TAT', 63937),
('TGT', 19138),
('TTC', 58774),
('TCC', 33430),
('TAC', 44631),
('TGC', 22188),
('TTA', 52382),
('TCA', 32715),
('TAA',  7356),
('TGA',  3623),
('TTG', 47500),
('TCG', 31146),
('TAG',   989),
('TGG', 50991),
('CTT', 43449),
('CCT', 27340),
('CAT', 45879),
('CGT', 73197),
('CTC', 37347),
('CCC', 19666),
('CAC', 34078),
('CGC', 72212),
('CTA', 15409),
('CCA', 31534),
('CAA', 53394),
('CGA', 13844),
('CTG',177210),
('CCG', 76644),
('CAG',104171),
('CGG', 21552),
('ATT',109072),
('ACT', 37842),
('AAT', 75436),
('AGT', 36097),
('ATC', 86796),
('ACC', 80547),
('AAC', 78443),
('AGC', 55551),
('ATA', 24984),
('ACA', 33910),
('AAA',129137),
('AGA', 13152),
('ATG', 96695),
('ACG', 50269),
('AAG', 45459),
('AGG',  7607),
('GTT', 72584),
('GCT', 62479),
('GAT',119939),
('GGT', 93325),
('GTC', 52439),
('GCC', 88721),
('GAC', 70394),
('GGC', 99390),
('GTA', 42420),
('GCA', 77547),
('GAA',143353),
('GGA', 34799),
('GTG', 89265),
('GCG',110308),
('GAG', 68609),
('GGG', 41277),]

In [3]:
import random
def weighted_choice(items, probs, bincount=10000):
    '''Endlessly yield items drawn at random in proportion to probs.

    Sampling is exact: each draw picks an item with probability equal to its
    weight divided by the sum of all positive weights. The weights do not
    need to sum to 1, so raw counts work as well as probabilities.

    items    -- the values to choose from.
    probs    -- one non-negative weight per item, paired with items in order
                (extra entries on either side are ignored). Items whose
                weight is not positive are never drawn.
    bincount -- kept only so existing calls keep working; it no longer has
                any effect. The earlier binning approach rounded each weight
                down to a multiple of 1/bincount, which made rare items
                impossible to draw and used memory proportional to bincount.

    Raises IndexError on the first draw when no item has a positive weight.

    Note: for a given random seed the sequence of draws differs from the one
    the earlier binning implementation produced.
    '''
    population = []
    cum_weights = []
    running_total = 0.0
    for item, prob in zip(items, probs):
        if prob > 0:
            running_total += prob
            population.append(item)
            cum_weights.append(running_total)
    if not population:
        raise IndexError(
            'weighted_choice needs at least one item with a positive weight')
    # Cumulative weights are computed once so each draw is a binary search.
    while True:
        yield random.choices(population, cum_weights=cum_weights)[0]

In [6]:
# Split the bias table into parallel lists of codons and their raw counts.
codons = [codon for codon, _count in biases]
weights = [count for _codon, count in biases]
# Normalise the counts so that they sum to 1 and can be used as probabilities.
total = sum(weights)
probabilities = [count / total for count in weights]
probabilities


Out[6]:
[0.022114108197632605,
 0.010382532161631892,
 0.017456753328378739,
 0.005225258382446976,
 0.01604709667519796,
 0.0091274107913680846,
 0.012185625816020012,
 0.0060580015147734092,
 0.014301885494269908,
 0.0089321939587079548,
 0.0020084126168502434,
 0.00098918962898972697,
 0.012968950421477237,
 0.0085038090489964208,
 0.00027002719930191552,
 0.013922100019822017,
 0.01186290372342662,
 0.007464654832067109,
 0.012526367923935878,
 0.019985016084228827,
 0.010196871397703377,
 0.0053694185050267651,
 0.009304334578170554,
 0.019716081007067669,
 0.0042071275167272154,
 0.0086097448966497517,
 0.01457819239588117,
 0.0037798347291564394,
 0.048383741140841707,
 0.02092615233902529,
 0.028441863881172744,
 0.005884354094393209,
 0.029779986534134006,
 0.010332021512621928,
 0.020596331452516988,
 0.0098555832287171331,
 0.023697958332263964,
 0.021991790517867937,
 0.021417334271830294,
 0.015167119260283832,
 0.0068213948911618376,
 0.009258465448258802,
 0.035258344222701181,
 0.0035908975988056552,
 0.02640068760009982,
 0.013724971973415562,
 0.012411695099156499,
 0.0020769432811826808,
 0.019817648366157976,
 0.017058674808073184,
 0.03274700936003281,
 0.025480574696512909,
 0.014317448234775682,
 0.024223542112502779,
 0.019219711494094075,
 0.02713650489243416,
 0.011581955302717144,
 0.021172698912300954,
 0.039139746310947923,
 0.0095011895940418176,
 0.024372070723645591,
 0.0301174522756276,
 0.018732351988781721,
 0.011269881400996124]

In [8]:
# Sanity check: print ten codons drawn from a fresh weighted sampler.
itera = weighted_choice(items=codons, probs=probabilities)
for dummy in range(10):
    print(next(itera))


AAC
GTG
AGC
AGT
TAT
GGA
GCC
GCA
CCG
GAG

In [13]:
# Sampler that yields codons according to the normalised bias table.
generator = weighted_choice(items=codons, probs=probabilities)

In [15]:
# Concatenate 1000 weighted codon draws into a single sequence and display it.
weighted_sequence = ''.join(
    next(generator) for draw in range(1000)
)
weighted_sequence


Out[15]:
'TCTCGTGAACCGTTCTTTCCCGCGGACGTGATGGATGTGGGTGCCTTTATTTGCGACGATATGGTCCGTAAATTAGGTCTCGTTGTTTGTACCCTCTCACTTGGCCGCTTCAACTTTTTTCCGATAATGTCTAATGCACCGACGGAATTATTGTACAGAGTAGCAAGCTCAGGTTGCACGGCAGACCTTGCCGCGTCGGGTCTGCGCACCACCCAAATCTGGGCGCGTCTGGGCCTCGCTGCTACACTGGTTAACCATGCTTCAGACTCTGTGACGATGAAATATCCAAACGACGTTGAATAAAAACGACGGGGAGCGGCGGTGATTTTTATCAATCGCGGTGAAGCAGTTATGCTCGACATCTATTAACAACAGGAGAAAGGCGCCACCGCTCCGGTGTATTATACACTGGGCCGTTTGACCGTCTCATCGACGGGCAACATGACCAAACCGCACATGCATTTCTCGGGCCGAATCGCCCGCGCCTACTGGAAAGCCGGCTCTGGCGATTATGCCGATTTTGAAAGTTTTCTTTCATCCAAAGCGTATATTATTCAGTTTCAAATATCGACCTTGCTGAAGAAAATCAAAGTGATCTTTTTATTTAAAGGCAATGATGGCGATGTACTTAATCGTGCGATCGCTTTGCGGCAGGGCCCCCGTTGGAATAGATTTGATATGCAGGAGCTGTATCCGATCTGGCATATTCTGTCCAATTAACAGCGCAATATCAACGCTGCGCTGTCTCTGCTGGTCGGCGAACACGGACTGATTCAGTCTCCTTTGGCAGGTTTCGTACAAGGTACCACGCTGAGCGCCCTGGGCCAACGGGACTTTGCACTGCGTAAGGACGCAGTGGAAGTGGGCTCCCTGAACCCTGAAGCCGGTGAAGACAAACGTACGACCATCATCTTTACCTATGTACTGCAGCAGCAAGGTTACAAATCCGGTAAATGTTGCGGCGAGGATAAATATGACGTTATTCTGAAAGAAGGGATTATCTACTATACCGTAGTTCTGATCATCCGGGGCTTCAAAGATTCAGACAAGGACGAAGATGACGGACTTAAACATGCGCTTGAAGGATTCGAAGGCGAACGTGGCGCTGCTCTGTCGACTGTAGCATCCGCGTCCGCATGGAGGAGTGGTCAACATAACGGCACCACCCCTTCGTCAAAGGTGGCGCAAGAACTCCGCCAGAAACGCTGCAATTCCAATACAAACATCACCTGCCCACACGTAAACCTTGAACTTAACAAGATATATCGGCTCTTCCCGCTCCAAAACTAAAAGATACCGGACGTGATCGCGATCAGAGGCAAATACTTGACTCATAAGCTGTCAACGGTTGATTTACTGGGTTTTTCTCCGCCAACCTGTCTGCGCTTGCATGATTATGAAGCCGTGTCAGATCCGATGAAAGTGGCGAATTTCCATAACCAGATGGGTTTCTTGGTAGGCGATGCCATCTTCGTTCAGGAACTCATCAAACAGACGGTCGCGCTGATCATTAACAAAGTAAAAAACCCTGGTGGCCTGAAACAGCGAGCCTCAGAAAAACCGAACTCTCAGCTAGTTTGAGGTGGGTCTAATCATGAGCCAGCACTGCGCGACCGTGGGTCTCGTATTCTGGGTGAGCGCGTGCGTGACGATATTCTGTATCTTGTTAACATGGGTTTTAAACATTCGTTCTTGGCTGACCGTGTCATCATGATCAAGATTGAAGAAGAGCTGCATTTTCATACCCAGAGCTACGAGGTCACCTCGCTCGGACAGGGGGTCAGTAATTACCTGGTCACAGCCGATGCGAAAGCCCCAAAACGTCGCCAACTGGCATATCATCTTGGTACTGGGTTCTCATCATTCTACGCTGGGGCGGATGATCAGGCGTCGCGCGTGGAAGTCAAACAGATGCAACGGATCCTGATTGCAGCCGCCCTGCCGGGCCTCCGAAAGAAATTGCGCCTGGATGCACACAATGAATTTATTGTCCCAATCA
TGACCGAGTTCGACCAGACCGGCCCCTTAACCTTAGGCTACGCATCAGAAAAACGCGCGCTCGATAACATCATGGTGAGTCAGGATTCTGTGCTGGGGAATCTCTTTATGAAATTTTTAGGTGTGCTGGTGGTCGGTATCAGCCGGACAGCGATAGCGGACCCAGATAAGTATATGGCTATTCTGCTGGGTGCGGTTTTCGACATGCTGGCGATGAAAATCATTGAAGTCTTAGATGTTACGTCCAACCGCAACTATTTGACCAATCGCCGTACGACGGAAATCGCAGCTGTGGCAGAAACCTGTGAGGACGGAGCGTTTGTGATGCTGCTGACCACGTGGCTGGGCAAGAAGTCGGATTCCCTGAAGTTCCCTAACTTAGTGATTGTCTATTATATAGTTATGGTCGGCGGCCCGTGCACCGGAGAGCAGCAGAAACGTGCTACAGCAGCCATGAGTAGCGAAATTGCGCTCCAGCCGTATTTCCGCTTCCGCCGGATTGAGCACACTGTCCGCGGCCGCGTCTTTTGACTGGAAAAAAGTTTCGGCGAAGACGCCGGCGATAATCTGGTCTCCAACAAAACCAAACGTCGCGGTAAAGGGCCGCAGTTTAAATATGTGGAACTGGCAGAACTGACCTTAATCAAGCTGTCGATTTGAGGCGGTGTAGCTAACATGGGAGGTAATGCACGTCATGGAATGAAAGGCATTCTGGGTCCGCTGCGCGTTGCCTCTTTAGCTTATCAGGCGAAAGGTGTCATCGGTTTATCTATGTTAAAAAACTGGGCTCCGGCCTAACAAAAAAATCTGCTGTCAGTTGCTGTACTGGTCCCGCTGAGCGCGAGCACAGGGAGCGCCCTGGAAATGGTGCGCGGTCTGAAAGAAGGCAACGCAGTCTTGGTGGCGAAGATGGGGATCGCCAAAGGAGCGACAGGTCGCTGGGCGGCTGTGGCAGATGGTAACGTCGCACCTCCGCTTCGCGAGCAATTAAACTTTCAGGCT'

In [34]:
# Same repeat periods (3, 4 and 5) with longer sections of at least 200 symbols each.
debugSequence(maxFrequency=5, sectionLength=200, startFrequency=3)


Out[34]:
'CTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTACTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTAGTTACTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGCCTTGC'

In [ ]: