In [1]:
import matplotlib
import Bio.SeqIO
import Bio.Restriction
import scipy
import matplotlib.pyplot as plt
%matplotlib inline
import math
import random

In [7]:
handle = open("/s/fir/a/nobackup/data/ECOLI_Reference_Genome/ecoli.fa", "rU")
for record in Bio.SeqIO.parse(handle, "fasta") :
    ref_seq = record.seq    

frag_sizes = [len(seq) for seq in Bio.Restriction.XhoI.catalyse(ref_seq)]
print(scipy.mean(frag_sizes))
plt.hist(frag_sizes)


25919.972067
Out[7]:
(array([ 56.,  39.,  35.,  21.,   9.,   8.,   5.,   3.,   0.,   3.]),
 array([  1.20000000e+01,   1.16118000e+04,   2.32116000e+04,
          3.48114000e+04,   4.64112000e+04,   5.80110000e+04,
          6.96108000e+04,   8.12106000e+04,   9.28104000e+04,
          1.04410200e+05,   1.16010000e+05]),
 <a list of 10 Patch objects>)

In [15]:
size = 12784
sigma = .58
random.gauss(0, math.sqrt(size*sigma**2))
math.sqrt(size*sigma**2)
size*sigma**2


Out[15]:
4300.5376

In [14]:



Out[14]:
4639675

In [36]:
import random

all_rmaps = []
for i in range(700):
    breaks = []
    num_breaks = 40 #random.randint(20,50)
    for j in range(num_breaks):
        breaks.append(random.randint(0,len(ref_seq) - 1))
    breaks.sort()
    pairs = [(x,y) for  x,y in zip(breaks[:-1], breaks[1:])] 
    
    diffs = [y - x for x,y in pairs] + [len(ref_seq) - breaks[-1] + breaks[0]]
    bigdiffs = [diff/1000 for diff in diffs if 1000000 > diff > 250000]
    all_rmaps += bigdiffs
plt.hist(all_rmaps, bins=30)


Out[36]:
(array([ 631.,  519.,  414.,  311.,  272.,  212.,  173.,  148.,  119.,
         103.,   64.,   54.,   55.,   42.,   23.,   19.,   23.,   12.,
          15.,    8.,    5.,    9.,    5.,    6.,    1.,    4.,    3.,
           3.,    0.,    1.]),
 array([ 250.        ,  274.16666667,  298.33333333,  322.5       ,
         346.66666667,  370.83333333,  395.        ,  419.16666667,
         443.33333333,  467.5       ,  491.66666667,  515.83333333,
         540.        ,  564.16666667,  588.33333333,  612.5       ,
         636.66666667,  660.83333333,  685.        ,  709.16666667,
         733.33333333,  757.5       ,  781.66666667,  805.83333333,
         830.        ,  854.16666667,  878.33333333,  902.5       ,
         926.66666667,  950.83333333,  975.        ]),
 <a list of 30 Patch objects>)

In [108]:
def subseq(seq, limits):
    x,y = limits
    if x < y : return seq[x:y]
    return seq[x:len(seq)] + seq[0:y]

In [109]:
molecules = []
for i in range(1400):
    breaks = []
    num_breaks = 40 #random.randint(20,50)
    for j in range(num_breaks):
        breaks.append(random.randint(0,len(ref_seq) - 1))
    breaks.sort()
    pairs = [(x,y) for x,y in zip(breaks[:-1], breaks[1:])] + [(breaks[-1], breaks[0])]
    molecules += [[subseq(ref_seq, pair), pair] for pair in pairs]

In [20]:
def sim(mol):
    "Return a simulated set of fragment sizes in bp for a Bio.SeqIO seq"
    sites = [biosite - 1 for biosite in Bio.Restriction.XhoI.search(mol)]
    sites = [site for site in sites if random.randint(1,5) <= 4] # sim missing sites
    allsites = [0] + sites + [len(mol)]
    # TODO add random break sites
    sizes = [y - x for x,y in zip(allsites[:-1], allsites[1:])]
    sizes = [size/1000 for size in sizes] # convert to kb
    sigma = .58
    sizes = [size + random.gauss(0, math.sqrt(size*sigma**2)) for size in sizes]
    sizes = [size for size in sizes if size > .5]
    return sizes
    

m = open("sim_ecoli_XhoI2.valuev","w")
mo = open("sim_ecoli_XhoI2.origin","w")
accum = 0
for i, mol in enumerate((molecule for molecule in molecules if len(molecule[0]) > 250000)):
    simulated = sim(mol[0])
    accum += sum(simulated)
    lens = [str(frag) for frag in simulated]
    if len(lens) < 10: continue
    map_name = "map_" + str(i) + "\n"
    m.write(map_name)
    #frag_seqs = Bio.Restriction.XhoI.catalyse(mol)
    
    m.write("\tXhoI\tXhoI\t" + "\t".join(lens) + "\n")
    m.write("\n")
    mo.write("origin[" + map_name + "] = ")
    mo.write(repr(mol[1]))
    mo.write("\n")
    #if i == 100: break
m.close()
mo.close()

In [107]:
def sim(mol):
    "Return a simulated set of fragment sizes in bp for a Bio.SeqIO seq"
    sites = [biosite - 1 for biosite in Bio.Restriction.XhoI.search(mol)]
    #sites = [site for site in sites if random.randint(1,5) <= 4] # sim missing sites
    allsites = [0] + sites + [len(mol)]
    # TODO add random break sites
    sizes = [y - x for x,y in zip(allsites[:-1], allsites[1:])]
    sizes = [size/1000 for size in sizes] # convert to kb
    sigma = .58
    sizes = [size + random.gauss(0, math.sqrt(size*sigma**2)) for size in sizes]
    sizes = [size for size in sizes if size > .5]
    error_free_sizes = [size for size in sizes]
    sizes = [size for size in sizes if random.randint(1,5) <= 4] # sim missing sites
    return [sizes] + [error_free_sizes]

In [123]:
[str(frag) for frag in sim(molecules[0][0])[1]]


Out[123]:
['30.8979998325',
 '3.64100725313',
 '27.6845873792',
 '20.565869476',
 '6.90639800252',
 '0.509000427891',
 '4.50121753451',
 '9.15835847982']


In [141]:
handle = open("/s/fir/a/nobackup/data/ECOLI_Reference_Genome/ecoli.fa", "rU")
for record in Bio.SeqIO.parse(handle, "fasta") :
    ref_seq = record.seq    

frag_sizes = [len(seq) for seq in Bio.Restriction.XhoI.catalyse(ref_seq)]
print(scipy.mean(frag_sizes))
plt.hist(frag_sizes)

def subseq(seq, limits):
    x,y = limits
    if x < y : return seq[x:y]
    return seq[x:len(seq)] + seq[0:y]

molecules = []
for i in range(1400):
    breaks = []
    num_breaks = 40 #random.randint(20,50)
    for j in range(num_breaks):
        breaks.append(random.randint(0,len(ref_seq) - 1))
    breaks.sort()
    pairs = [(x,y) for x,y in zip(breaks[:-1], breaks[1:])] + [(breaks[-1], breaks[0])]
    molecules += [[subseq(ref_seq, pair), pair] for pair in pairs]
    
def sim(mol):
    "Return a simulated set of fragment sizes in bp for a Bio.SeqIO seq"
    sites = [biosite - 1 for biosite in Bio.Restriction.XhoI.search(mol)]
    allsites = [0] + sites + [len(mol)]
    # TODO add random break sites
    sizes = [y - x for x,y in zip(allsites[:-1], allsites[1:])]
    sizes = [size/1000 for size in sizes] # convert to kb
    sigma = .58
    sizes = [size + random.gauss(0, math.sqrt(size*sigma**2)) for size in sizes]
    sizes = [size for size in sizes if size > .5]
    error_free_sizes = [size for size in sizes]
    sizes = [size for size in sizes if random.randint(1,5) <= 4] # sim missing sites
    return [sizes] + [error_free_sizes]
    

m = open("sim_ecoli_XhoI2.valuev","w")
mwe = open("sim_ecoli_XhoI2_without_errors.valuev","w") # Without inserting errors
mo = open("sim_ecoli_XhoI2.origin","w")
accum = 0
length_of_original = []
length_of_errored = []
for i, mol in enumerate((molecule for molecule in molecules if len(molecule[0]) > 250000)):
    simulated = sim(mol[0])
    accum += sum(simulated[0])
    lens = [str(frag) for frag in simulated[0]]
    if len(lens) < 10: continue
    map_name = "map_" + str(i) + "\n"
    m.write(map_name)
    #frag_seqs = Bio.Restriction.XhoI.catalyse(mol)
    
    m.write("\tXhoI\tXhoI\t" + "\t".join(lens) + "\n")
    m.write("\n")
    
    lens_e_free = [str(frag) for frag in simulated[1]]
    mwe.write(map_name)
    mwe.write("\tXhoI\tXhoI\t" + "\t".join(lens_e_free) + "\n")
    mwe.write("\n")
    
    mo.write("origin[" + map_name + "] = ")
    mo.write(repr(mol[1]))
    mo.write("\n")
    length_of_errored.append(len(lens))
    length_of_original.append(len(lens_e_free))
    #if i == 100: break
m.close()
mo.close()
mwe.close()


25919.972067

In [169]:
x = [length_of_original, length_of_errored, length_of_corrected]
plt.boxplot(x)
plt.xticks([1, 2, 3], ['Original', 'With Errors', 'After Correcting Errors'])
plt.xlabel("For deletion errors")
plt.ylabel("OM Read length")
plt.show()



In [160]:
print len(length_of_original)
print len(length_of_errored)
length_of_corrected = []
count = 0
for lo in length_of_original:    
    if(random.randint(1,100) <= 5):
        length_of_corrected.append(lo)
    else:
        length_of_corrected.append(length_of_errored[count])
    count +=1

#[size for size in sizes if random.randint(1,5) <= 4


3767
3767

In [11]:
1241/1000.0


Out[11]:
1.241

In [ ]: