In [1]:
import matplotlib
import Bio.SeqIO
import Bio.Restriction
import scipy
import matplotlib.pyplot as plt
%matplotlib inline
import math
import random
In [7]:
handle = open("/s/fir/a/nobackup/data/ECOLI_Reference_Genome/ecoli.fa", "rU")
for record in Bio.SeqIO.parse(handle, "fasta") :
ref_seq = record.seq
frag_sizes = [len(seq) for seq in Bio.Restriction.XhoI.catalyse(ref_seq)]
print(scipy.mean(frag_sizes))
plt.hist(frag_sizes)
Out[7]:
In [15]:
size = 12784
sigma = .58
random.gauss(0, math.sqrt(size*sigma**2))
math.sqrt(size*sigma**2)
size*sigma**2
Out[15]:
In [14]:
Out[14]:
In [36]:
import random
all_rmaps = []
for i in range(700):
breaks = []
num_breaks = 40 #random.randint(20,50)
for j in range(num_breaks):
breaks.append(random.randint(0,len(ref_seq) - 1))
breaks.sort()
pairs = [(x,y) for x,y in zip(breaks[:-1], breaks[1:])]
diffs = [y - x for x,y in pairs] + [len(ref_seq) - breaks[-1] + breaks[0]]
bigdiffs = [diff/1000 for diff in diffs if 1000000 > diff > 250000]
all_rmaps += bigdiffs
plt.hist(all_rmaps, bins=30)
Out[36]:
In [108]:
def subseq(seq, limits):
x,y = limits
if x < y : return seq[x:y]
return seq[x:len(seq)] + seq[0:y]
In [109]:
molecules = []
for i in range(1400):
breaks = []
num_breaks = 40 #random.randint(20,50)
for j in range(num_breaks):
breaks.append(random.randint(0,len(ref_seq) - 1))
breaks.sort()
pairs = [(x,y) for x,y in zip(breaks[:-1], breaks[1:])] + [(breaks[-1], breaks[0])]
molecules += [[subseq(ref_seq, pair), pair] for pair in pairs]
In [20]:
def sim(mol):
"Return a simulated set of fragment sizes in bp for a Bio.SeqIO seq"
sites = [biosite - 1 for biosite in Bio.Restriction.XhoI.search(mol)]
sites = [site for site in sites if random.randint(1,5) <= 4] # sim missing sites
allsites = [0] + sites + [len(mol)]
# TODO add random break sites
sizes = [y - x for x,y in zip(allsites[:-1], allsites[1:])]
sizes = [size/1000 for size in sizes] # convert to kb
sigma = .58
sizes = [size + random.gauss(0, math.sqrt(size*sigma**2)) for size in sizes]
sizes = [size for size in sizes if size > .5]
return sizes
m = open("sim_ecoli_XhoI2.valuev","w")
mo = open("sim_ecoli_XhoI2.origin","w")
accum = 0
for i, mol in enumerate((molecule for molecule in molecules if len(molecule[0]) > 250000)):
simulated = sim(mol[0])
accum += sum(simulated)
lens = [str(frag) for frag in simulated]
if len(lens) < 10: continue
map_name = "map_" + str(i) + "\n"
m.write(map_name)
#frag_seqs = Bio.Restriction.XhoI.catalyse(mol)
m.write("\tXhoI\tXhoI\t" + "\t".join(lens) + "\n")
m.write("\n")
mo.write("origin[" + map_name + "] = ")
mo.write(repr(mol[1]))
mo.write("\n")
#if i == 100: break
m.close()
mo.close()
In [107]:
def sim(mol):
"Return a simulated set of fragment sizes in bp for a Bio.SeqIO seq"
sites = [biosite - 1 for biosite in Bio.Restriction.XhoI.search(mol)]
#sites = [site for site in sites if random.randint(1,5) <= 4] # sim missing sites
allsites = [0] + sites + [len(mol)]
# TODO add random break sites
sizes = [y - x for x,y in zip(allsites[:-1], allsites[1:])]
sizes = [size/1000 for size in sizes] # convert to kb
sigma = .58
sizes = [size + random.gauss(0, math.sqrt(size*sigma**2)) for size in sizes]
sizes = [size for size in sizes if size > .5]
error_free_sizes = [size for size in sizes]
sizes = [size for size in sizes if random.randint(1,5) <= 4] # sim missing sites
return [sizes] + [error_free_sizes]
In [123]:
[str(frag) for frag in sim(molecules[0][0])[1]]
Out[123]:
In [141]:
handle = open("/s/fir/a/nobackup/data/ECOLI_Reference_Genome/ecoli.fa", "rU")
for record in Bio.SeqIO.parse(handle, "fasta") :
ref_seq = record.seq
frag_sizes = [len(seq) for seq in Bio.Restriction.XhoI.catalyse(ref_seq)]
print(scipy.mean(frag_sizes))
plt.hist(frag_sizes)
def subseq(seq, limits):
x,y = limits
if x < y : return seq[x:y]
return seq[x:len(seq)] + seq[0:y]
molecules = []
for i in range(1400):
breaks = []
num_breaks = 40 #random.randint(20,50)
for j in range(num_breaks):
breaks.append(random.randint(0,len(ref_seq) - 1))
breaks.sort()
pairs = [(x,y) for x,y in zip(breaks[:-1], breaks[1:])] + [(breaks[-1], breaks[0])]
molecules += [[subseq(ref_seq, pair), pair] for pair in pairs]
def sim(mol):
"Return a simulated set of fragment sizes in bp for a Bio.SeqIO seq"
sites = [biosite - 1 for biosite in Bio.Restriction.XhoI.search(mol)]
allsites = [0] + sites + [len(mol)]
# TODO add random break sites
sizes = [y - x for x,y in zip(allsites[:-1], allsites[1:])]
sizes = [size/1000 for size in sizes] # convert to kb
sigma = .58
sizes = [size + random.gauss(0, math.sqrt(size*sigma**2)) for size in sizes]
sizes = [size for size in sizes if size > .5]
error_free_sizes = [size for size in sizes]
sizes = [size for size in sizes if random.randint(1,5) <= 4] # sim missing sites
return [sizes] + [error_free_sizes]
m = open("sim_ecoli_XhoI2.valuev","w")
mwe = open("sim_ecoli_XhoI2_without_errors.valuev","w") # Without inserting errors
mo = open("sim_ecoli_XhoI2.origin","w")
accum = 0
length_of_original = []
length_of_errored = []
for i, mol in enumerate((molecule for molecule in molecules if len(molecule[0]) > 250000)):
simulated = sim(mol[0])
accum += sum(simulated[0])
lens = [str(frag) for frag in simulated[0]]
if len(lens) < 10: continue
map_name = "map_" + str(i) + "\n"
m.write(map_name)
#frag_seqs = Bio.Restriction.XhoI.catalyse(mol)
m.write("\tXhoI\tXhoI\t" + "\t".join(lens) + "\n")
m.write("\n")
lens_e_free = [str(frag) for frag in simulated[1]]
mwe.write(map_name)
mwe.write("\tXhoI\tXhoI\t" + "\t".join(lens_e_free) + "\n")
mwe.write("\n")
mo.write("origin[" + map_name + "] = ")
mo.write(repr(mol[1]))
mo.write("\n")
length_of_errored.append(len(lens))
length_of_original.append(len(lens_e_free))
#if i == 100: break
m.close()
mo.close()
mwe.close()
In [169]:
x = [length_of_original, length_of_errored, length_of_corrected]
plt.boxplot(x)
plt.xticks([1, 2, 3], ['Original', 'With Errors', 'After Correcting Errors'])
plt.xlabel("For deletion errors")
plt.ylabel("OM Read length")
plt.show()
In [160]:
print len(length_of_original)
print len(length_of_errored)
length_of_corrected = []
count = 0
for lo in length_of_original:
if(random.randint(1,100) <= 5):
length_of_corrected.append(lo)
else:
length_of_corrected.append(length_of_errored[count])
count +=1
#[size for size in sizes if random.randint(1,5) <= 4
In [11]:
1241/1000.0
Out[11]:
In [ ]: