In [2]:
import matplotlib
import Bio.SeqIO
import Bio.Restriction
import scipy
import matplotlib.pyplot as plt
%matplotlib inline
import math
import random
import numpy as np
In [235]:
# standard deviation with Bionano data
def signmaS(len):
singma = (0.2*0.2) + len * 0.1 * (-0.1) + len*len*0.04*0.04
return(singma)
In [236]:
handle = open("/s/fir/a/nobackup/data/ECOLI_Reference_Genome/ecoli.fa", "rU")
for record in Bio.SeqIO.parse(handle, "fasta"):
ref_seq = record.seq
frag_sizes = [len(seq) for seq in Bio.Restriction.XhoI.catalyse(ref_seq)]
In [237]:
def subseq(seq, limits):
x,y = limits
if x < y : return seq[x:y]
return seq[x:len(seq)] + seq[0:y]
In [238]:
molecules = []
for i in range(200):
breaks = []
num_breaks = 10 #random.randint(20,50)
for j in range(num_breaks):
breaks.append(random.randint(0,len(ref_seq) - 1))
breaks.sort()
pairs = [(x,y) for x,y in zip(breaks[:-1], breaks[1:])] + [(breaks[-1], breaks[0])]
molecules += [[subseq(ref_seq, pair), pair] for pair in pairs]
In [251]:
def sim(mol):
"Return a simulated set of fragment sizes in bp for a Bio.SeqIO seq"
error_locations = []
sites = [biosite - 1 for biosite in Bio.Restriction.XhoI.search(mol)]
e_f_sites = list(sites);
for i, site in enumerate(sites):
if (random.randint(1,5) == 5):
sites.pop(i)
error_locations.append(i+1)
allsites = [0] + sites + [len(mol)]
e_f_allsites = [0] + e_f_sites + [len(mol)]
# TODO add random break sites
sizes = [y - x for x,y in zip(allsites[:-1], allsites[1:])]
print(sizes)
print(sum(sizes))
e_f_sizes = [y - x for x,y in zip(e_f_allsites[:-1], e_f_allsites[1:])]
sizes = [size/1000.0 for size in sizes] # convert to kb
e_f_sizes = [size/1000.0 for size in e_f_sizes] # convert to kb
sizes = [size + random.gauss(0, math.sqrt(signmaS(size))) for size in sizes]
sizes = [size for size in sizes if size > .5]
e_f_sizes = [size for size in e_f_sizes if size > .5]
return [sizes] + [e_f_sizes] + [error_locations]
In [230]:
m = open("sim_single_molecule_longer_200","w")
mwe = open("sim_single_molecule_without_error_longer_200","w")
In [231]:
start = 0
end = 0
length_of_original = []
length_of_errored = []
accum = 0
for i, mol in enumerate((molecule for molecule in molecules if len(molecule[0]) > 250000)):
simulated = sim(mol[0])
accum += sum(simulated[1])
lens = [str(round(frag,3)) for frag in simulated[0]]
if len(lens) < 10: continue
map_name = "map_" + str(i) + "\n"
lens_e_free = [str(round(frag,3)) for frag in simulated[1]]
length_of_errored.append(len(lens))
length_of_original.append(len(lens_e_free))
#if i == 100: break
if(start == 0 and end ==0):
start = mol[1][0]
end = mol[1][1]
elif(start <= mol[1][0] and mol[1][0] >= end):
m.write(map_name)
m.write("\tXhoI\tXhoI\t" + "\t".join(lens) + "\n")
m.write("\n")
mwe.write(map_name)
mwe.write("\tXhoI\tXhoI\t" + "\t".join(lens_e_free) + "\n")
mwe.write("\n")
elif(start <= mol[1][1] and mol[1][0] >= end):
m.write("map_" + str(i) + "\n")
m.write("\tXhoI\tXhoI\t" + "\t".join(lens) + "\n")
m.write("\n")
mwe.write(map_name)
mwe.write("\tXhoI\tXhoI\t" + "\t".join(lens_e_free) + "\n")
mwe.write("\n")
m.close()
mwe.close()
In [232]:
def nextRange(cur_range, bin_size, s_varience):
if(cur_range < 2000):
return(cur_range+ bin_size)
else:
return(round(((cur_range/1000.0) + 4 * math.sqrt((cur_range/1000.0) * s_varience))*1000))
quantizeList = []
def quantizeNew(val, bin_size, s_varience):
if(val == 0):
return(0)
if(len(quantizeList) == 0):
quantizeList.append(0.0)
quantizeList.append(nextRange(0.0, bin_size, s_varience))
for i in range(len(quantizeList)):
if(val < quantizeList[i]):
return(quantizeList[i-1])
while(1):
quantizeList.append(nextRange(quantizeList[len(quantizeList) - 1], bin_size, s_varience))
if(val < quantizeList[len(quantizeList) - 1]):
return(quantizeList[len(quantizeList) - 2])
In [217]:
def findNoOfFragments(fname):
lengths = []
lines = [line.strip() for i,line in enumerate(open(fname)) if (i%3==1)]
for line in lines:
lengths.append((len(line.split("\t"))) - 2)
count = 0.0
for length in lengths:
count += length
print(count/len(lines))
plt.hist(lengths, bins = 50)
In [218]:
findNoOfFragments("/s/oak/b/nobackup/muggli/goat/whole_genome_mapping/goat_whole_genome.maps")
In [166]:
findNoOfFragments("sim_single_molecule")
In [219]:
findNoOfFragments("sim_single_molecule_longer")
In [233]:
findNoOfFragments("sim_single_molecule_longer_200")
In [573]:
# standard deviation with Bionano data
def signmaS(len):
singma = (0.2*0.2) + len * 0.1 * (-0.1) + len*len*0.04*0.04
return(singma)
handle = open("/s/fir/a/nobackup/data/ECOLI_Reference_Genome/ecoli.fa", "rU")
for record in Bio.SeqIO.parse(handle, "fasta"):
ref_seq = record.seq
frag_sizes = [len(seq) for seq in Bio.Restriction.XhoI.catalyse(ref_seq)]
def subseq(seq, limits):
x,y = limits
if x < y : return seq[x:y]
return seq[x:len(seq)] + seq[0:y]
molecules = []
for i in range(400):
breaks = []
num_breaks = 10 #random.randint(20,50)
for j in range(num_breaks):
breaks.append(random.randint(0,len(ref_seq) - 1))
breaks.sort()
pairs = [(x,y) for x,y in zip(breaks[:-1], breaks[1:])] + [(breaks[-1], breaks[0])]
molecules += [[subseq(ref_seq, pair), pair] for pair in pairs]
In [574]:
def generateRandomList(listv, count):
randomList = []
randomListIndex = []
index = range(0, len(listv))
for c in range(0, count):
i = random.choice(index)
randomList.append(listv[i])
randomListIndex.append(i)
index.remove(i)
randomListIndex, randomList = zip(*sorted(zip(randomListIndex,randomList)))
return(randomListIndex, randomList)
def deleteRandomly(listv, count):
deleted = generateRandomList(listv, count)
deleted_index = list(deleted[0])
deleted_values = list(deleted[1])
for d in deleted_values:
listv.remove(d)
return(listv, deleted_index, deleted_values)
def sim(mol):
"Return a simulated set of fragment sizes in bp for a Bio.SeqIO seq"
sites = [biosite - 1 for biosite in Bio.Restriction.XhoI.search(mol)]
e_f_sites = list(sites);
sites, dindex, dsites = deleteRandomly( sites, int(len(mol)/100000))
allsites = [0] + sites + [len(mol)]
e_f_allsites = [0] + e_f_sites + [len(mol)]
# TODO add random break sites
sizes = [y - x for x,y in zip(allsites[:-1], allsites[1:])]
e_f_sizes = [y - x for x,y in zip(e_f_allsites[:-1], e_f_allsites[1:])]
sizes = [size/1000.0 for size in sizes] # convert to kb
e_f_sizes = [size/1000.0 for size in e_f_sizes] # convert to kb
sizes = [size + random.gauss(0, math.sqrt(signmaS(size))) for size in sizes if size > 0.5]
sizes = [size for size in sizes if size > 0.5]
e_f_sizes = [size for size in e_f_sizes if size > 0.5]
return (sizes, e_f_sizes, dindex)
In [575]:
filename = "sim_single_molecule_100"
m = open(filename+"_newDel","w")
mwe = open(filename+"_efree","w")
efile = open(filename+"_elocations","w")
start = 0
end = 0
length_of_original = []
length_of_errored = []
accum = 0
for i, mol in enumerate((molecule for molecule in molecules if len(molecule[0]) > 250000)):
simulated = sim(mol[0])
accum += sum(simulated[1])
lens = [str(round(frag,3)) for frag in simulated[0]]
if len(lens) < 10: continue
map_name = "map_" + str(i) + "\n"
lens_e_free = [str(round(frag,3)) for frag in simulated[1]]
length_of_errored.append(len(lens))
length_of_original.append(len(lens_e_free))
#if i == 100: break
if(start == 0 and end ==0):
start = mol[1][0]
end = mol[1][1]
# elif(start <= mol[1][0] and mol[1][0] >= end):
if((start <= mol[1][0] and mol[1][0] <= end) or (start <= mol[1][1] and mol[1][1] <= end) or (start >= mol[1][0] and mol[1][1] >= end)):
m.write(map_name)
m.write("\tXhoI\tXhoI\t" + "\t".join(lens) + "\n")
m.write("\n")
mwe.write(map_name)
mwe.write("\tXhoI\tXhoI\t" + "\t".join(lens_e_free) + "\n")
mwe.write("\n")
efile.write(' '.join([str(s) for s in simulated[2]]))
efile.write("\n")
"""
elif(start <= mol[1][1] and mol[1][0] >= end):
m.write("map_" + str(i) + "\n")
m.write("\tXhoI\tXhoI\t" + "\t".join(lens) + "\n")
m.write("\n")
mwe.write(map_name)
mwe.write("\tXhoI\tXhoI\t" + "\t".join(lens_e_free) + "\n")
mwe.write("\n")
efile.write(' '.join([str(s) for s in simulated[2]]))
efile.write("\n")
"""
m.close()
mwe.close()
efile.close()
In [208]:
# standard deviation with Bionano data
def signmaS(len):
singma = (0.2*0.2) + len * 0.1 * (-0.1) + len*len*0.04*0.04
return(singma)
handle = open("/s/fir/a/nobackup/data/ECOLI_Reference_Genome/ecoli.fa", "rU")
for record in Bio.SeqIO.parse(handle, "fasta"):
ref_seq = record.seq
frag_sizes = [len(seq) for seq in Bio.Restriction.XhoI.catalyse(ref_seq)]
def subseq(seq, limits):
x,y = limits
if x < y : return seq[x:y]
return seq[x:len(seq)] + seq[0:y]
molecules = []
for i in range(500):
breaks = []
num_breaks = 10 #random.randint(20,50)
for j in range(num_breaks):
breaks.append(random.randint(0,len(ref_seq) - 1))
breaks.sort()
pairs = [(x,y) for x,y in zip(breaks[:-1], breaks[1:])] + [(breaks[-1], breaks[0])]
molecules += [[subseq(ref_seq, pair), pair] for pair in pairs]
In [209]:
def generateRandomList(listv, count):
randomList = []
randomListIndex = []
if(count >= len(listv) + 1):
return(randomListIndex, randomList)
if(count <= 0):
return(randomListIndex, randomList)
if(len(listv) == 0):
return(randomListIndex, randomList)
index = range(0, len(listv))
for c in range(0, count):
i = random.choice(index)
randomList.append(listv[i])
randomListIndex.append(i)
index.remove(i)
randomListIndex, randomList = zip(*sorted(zip(randomListIndex,randomList)))
return(randomListIndex, randomList)
def deleteRandomly(listv, count):
deleted = generateRandomList(listv, count)
deleted_index = list(deleted[0])
deleted_values = list(deleted[1])
for d in deleted_values:
listv.remove(d)
return(listv, deleted_index, deleted_values)
def insertVal(A, val):
idx = A.index(val)
if(idx == 0):
A.insert(idx, random.randint(0, A[idx]))
else:
A.insert(idx, random.randint((A[idx - 1 ]), A[idx]))
def addfalsepositive(listv, count):
if(count == 0):
return(listv, count)
err_cites,sel_val = generateRandomList(listv, count)
for val in sel_val:
insertVal(listv, val)
return(listv, err_cites, count)
def sim(mol):
"Return a simulated set of fragment sizes in bp for a Bio.SeqIO seq"
sites = [biosite - 1 for biosite in Bio.Restriction.XhoI.search(mol)]
e_f_sites = list(sites);
sites, dindex, dsites = deleteRandomly( sites, int(round(len(sites)/6.66)))
sites, iindex, ins_err = addfalsepositive(sites, int(round(len(mol)/400000.0)))
allsites = [0] + sites + [len(mol)]
e_f_allsites = [0] + e_f_sites + [len(mol)]
# TODO add random break sites
sizes = [y - x for x,y in zip(allsites[:-1], allsites[1:])]
e_f_sizes = [y - x for x,y in zip(e_f_allsites[:-1], e_f_allsites[1:])]
sizes = [size/1000.0 for size in sizes] # convert to kb
e_f_sizes = [size/1000.0 for size in e_f_sizes] # convert to kb
sizes = [size + random.gauss(0, math.sqrt(signmaS(size))) for size in sizes if size > 0.5]
sizes = [size for size in sizes if size > 0.5]
e_f_sizes = [size for size in e_f_sizes if size > 0.5]
return (sizes, e_f_sizes, dindex, ins_err, iindex)
In [211]:
filename = "sim_single_molecule_100"
m = open(filename+"_newDel","w")
mwe = open(filename+"_efree","w")
efile = open(filename+"_elocations","w")
start = 0
end = 0
length_of_original = []
length_of_errored = []
accum = 0
tot_ins_err = 0
tot_del_err = 0
for i, mol in enumerate((molecule for molecule in molecules if len(molecule[0]) > 250000)):
simulated = sim(mol[0])
lens = [str(round(frag,3)) for frag in simulated[0]]
if len(lens) < 10: continue
map_name = "map_" + str(i) + "\n"
lens_e_free = [str(round(frag,3)) for frag in simulated[1]]
length_of_errored.append(len(lens))
length_of_original.append(len(lens_e_free))
#if i == 100: break
tot_ins_err += simulated[3]
tot_del_err += len(simulated[2])
if(start == 0 and end ==0):
start = mol[1][0]
end = mol[1][1]
# elif(start <= mol[1][0] and mol[1][0] >= end):
# if((start <= mol[1][0] and mol[1][0] <= end) or (start <= mol[1][1] and mol[1][1] <= end) or (start >= mol[1][0] and mol[1][1] >= end)):
m.write(map_name)
m.write("\tXhoI\tXhoI\t" + "\t".join(lens) + "\n")
m.write("\n")
mwe.write(map_name)
mwe.write("\tXhoI\tXhoI\t" + "\t".join(lens_e_free) + "\n")
mwe.write("\n")
efile.write(','.join([str(s) for s in simulated[2]]) + ' : ' + ','.join([str(s) for s in simulated[4]]))
efile.write("\n")
"""
elif(start <= mol[1][1] and mol[1][0] >= end):
m.write("map_" + str(i) + "\n")
m.write("\tXhoI\tXhoI\t" + "\t".join(lens) + "\n")
m.write("\n")
mwe.write(map_name)
mwe.write("\tXhoI\tXhoI\t" + "\t".join(lens_e_free) + "\n")
mwe.write("\n")
efile.write(' '.join([str(s) for s in simulated[2]]))
efile.write("\n")
"""
m.close()
mwe.close()
efile.close()
print("Total insertion errors: " + str(tot_ins_err))
print("Total deletion errors : "+ str(tot_del_err))
In [82]:
# standard deviation with Bionano data
def signmaS(len):
singma = (0.2*0.2) + len * 0.1 * (-0.1) + len*len*0.04*0.04
return(singma)
handle = open("/s/fir/a/nobackup/data/ECOLI_Reference_Genome/ecoli.fa", "rU")
for record in Bio.SeqIO.parse(handle, "fasta"):
ref_seq = record.seq
frag_sizes = [len(seq) for seq in Bio.Restriction.XhoI.catalyse(ref_seq)]
def subseq(seq, limits):
x,y = limits
if x < y : return seq[x:y]
return seq[x:len(seq)] + seq[0:y]
molecules = []
for i in range(500):
breaks = []
num_breaks = 10 #random.randint(20,50)
for j in range(num_breaks):
breaks.append(random.randint(0,len(ref_seq) - 1))
breaks.sort()
pairs = [(x,y) for x,y in zip(breaks[:-1], breaks[1:])] + [(breaks[-1], breaks[0])]
molecules += [[subseq(ref_seq, pair), pair] for pair in pairs]
In [83]:
def generateRandomList(listv, count):
randomList = []
randomListIndex = []
if(count >= len(listv)):
return(randomListIndex, randomList)
if(count <= 0):
return(randomListIndex, randomList)
if(len(listv) == 0):
return(randomListIndex, randomList)
index = range(0, len(listv) - 1)
for c in range(0, count):
i = random.choice(index)
randomList.append(listv[i])
randomListIndex.append(i)
index.remove(i)
randomListIndex, randomList = zip(*sorted(zip(randomListIndex,randomList)))
return(randomListIndex, randomList)
def deleteRandomly(listv, count):
deleted = generateRandomList(listv, count)
deleted_index = list(deleted[0])
deleted_values = list(deleted[1])
for i in reversed(deleted_index):
listv[i+1] = listv[i+1] + listv[i]
listv.remove(listv[i])
return(listv, deleted_index, deleted_values)
def insertVal(A, val):
idx = A.index(val)
r = round(random.uniform(0.0, val), 3)
A[idx] = r
A.insert(idx, round((val - r), 3))
def addfalsepositive(listv, count):
if(count == 0):
return(listv, count)
err_cites, sel_val = generateRandomList(listv, count)
for val in reversed(sel_val):
insertVal(listv, val)
return(listv, err_cites, count)
def sim(mol):
"Return a simulated set of fragment sizes in bp for a Bio.SeqIO seq"
sites = [biosite - 1 for biosite in Bio.Restriction.XhoI.search(mol)]
allsites = [0] + sites + [len(mol)]
sizes = [y - x for x,y in zip(allsites[:-1], allsites[1:])]
sizes = [size/1000.0 for size in sizes] # convert to kb
sizes = [size for size in sizes if size > 1.1]
if(bool(random.getrandbits(1)) == True):
sizes = sizes[::-1]
sizes = sizes[1:-1]
e_f_sizes = list(sizes);
sizes, dindex, dsites = deleteRandomly( sizes, int(round(len(sites)/6.66)))
sizes, iindex, ins_err = addfalsepositive(sizes, int(round(len(mol)/400000.0)))
sizes = [size + random.gauss(0, math.sqrt(signmaS(size))) for size in sizes if size > 0.5]
sizes = [size for size in sizes if size > 0.5]
return (sizes, e_f_sizes, dindex, ins_err, iindex)
In [84]:
filename = "sim_single_molecule_100"
m = open(filename+"_newDel","w")
mwe = open(filename+"_efree","w")
efile = open(filename+"_elocations","w")
start = 0
end = 0
length_of_original = []
length_of_errored = []
accum = 0
tot_ins_err = 0
tot_del_err = 0
for i, mol in enumerate((molecule for molecule in molecules if len(molecule[0]) > 250000)):
simulated = sim(mol[0])
lens = [str(round(frag,3)) for frag in simulated[0]]
if len(lens) < 10: continue
map_name = "map_" + str(i) + "\n"
lens_e_free = [str(round(frag,3)) for frag in simulated[1]]
length_of_errored.append(len(lens))
length_of_original.append(len(lens_e_free))
#if i == 100: break
tot_ins_err += simulated[3]
tot_del_err += len(simulated[2])
if(start == 0 and end ==0):
start = mol[1][0]
end = mol[1][1]
# elif(start <= mol[1][0] and mol[1][0] >= end):
# if((start <= mol[1][0] and mol[1][0] <= end) or (start <= mol[1][1] and mol[1][1] <= end) or (start >= mol[1][0] and mol[1][1] >= end)):
m.write(map_name)
m.write("\tXhoI\tXhoI\t" + "\t".join(lens) + "\n")
m.write("\n")
mwe.write(map_name)
mwe.write("\tXhoI\tXhoI\t" + "\t".join(lens_e_free) + "\n")
mwe.write("\n")
efile.write(','.join([str(s) for s in simulated[2]]) + ' : ' + ','.join([str(s) for s in simulated[4]]))
efile.write("\n")
m.close()
mwe.close()
efile.close()
print("Total insertion errors: " + str(tot_ins_err))
print("Total deletion errors : "+ str(tot_del_err))
In [85]:
!cp /s/oak/b/nobackup/darshanw/ipython-notebooks/sim_single_molecule_100_newDel /s/oak/b/nobackup/darshanw/COmap/test/
In [86]:
!cp /s/oak/b/nobackup/darshanw/ipython-notebooks/sim_single_molecule_100_efree /s/oak/b/nobackup/darshanw/COmap/test/
In [87]:
!cp /s/oak/b/nobackup/darshanw/ipython-notebooks/sim_single_molecule_100_elocations /s/oak/b/nobackup/darshanw/COmap/test/
In [117]:
A = [20, 40 ,55, 78, 90, 11]
index = []
for i in range(0, len(A)):
if(A[i] <= 50):
index.append(i)
index
Out[117]:
In [167]:
for i in range(0, 10000):
if((1.1 + random.gauss(0, math.sqrt(signmaS(1.1)))) < 0.5):
print("poop")
In [195]:
A = [40.0, 23, 31, 39.9, 50, 62]
def removeLessthan(A, no):
removedindex = []
removedvalues = []
for i in range(0, len(A)):
if(A[i] < no):
removedindex.append(i)
removedvalues.append(A[i])
for v in removedvalues:
A.remove(v)
return(A, removedindex)
a,_ = removeLessthan(A, 30)
In [273]:
def deleteRandomly(listv, count):
D = generateRandomList(listv, count)
deleted_index = list(D[0])
deleted_values = list(D[1])
for i in deleted_index:
if(i == len(listv)):
return(listv, [], [])
else:
listv
listv.remove(d)
return(listv, deleted_index, deleted_values)
In [328]:
def insertVal(A, val):
idx = A.index(val)
r = round(random.uniform(0.0, val), 3)
A[idx] = r
A.insert(idx, round((val - r), 3))
In [329]:
def addfalsepositive(listv, count):
if(count == 0):
return(listv, count)
err_cites, sel_val = generateRandomList(listv, count)
for val in reversed(sel_val):
insertVal(listv, val)
return(listv, err_cites, count)
In [89]:
# standard deviation with Bionano data
def signmaS(len):
singma = (0.2*0.2) + len * 0.1 * (-0.1) + len*len*0.04*0.04
return(singma)
handle = open("/s/fir/a/nobackup/data/ECOLI_Reference_Genome/ecoli.fa", "rU")
for record in Bio.SeqIO.parse(handle, "fasta"):
ref_seq = record.seq
frag_sizes = [len(seq) for seq in Bio.Restriction.XhoI.catalyse(ref_seq)]
def subseq(seq, limits):
x,y = limits
if x < y : return seq[x:y]
return seq[x:len(seq)] + seq[0:y]
molecules = []
for i in range(700):
breaks = []
num_breaks = 40 #random.randint(20,50)
for j in range(num_breaks):
breaks.append(random.randint(0,len(ref_seq) - 1))
breaks.sort()
pairs = [(x,y) for x,y in zip(breaks[:-1], breaks[1:])] + [(breaks[-1], breaks[0])]
molecules += [[subseq(ref_seq, pair), pair] for pair in pairs]
In [90]:
def generateRandomList(listv, count):
randomList = []
randomListIndex = []
if(count >= len(listv)):
return(randomListIndex, randomList)
if(count <= 0):
return(randomListIndex, randomList)
if(len(listv) == 0):
return(randomListIndex, randomList)
index = range(0, len(listv) - 1)
for c in range(0, count):
i = random.choice(index)
randomList.append(listv[i])
randomListIndex.append(i)
index.remove(i)
randomListIndex, randomList = zip(*sorted(zip(randomListIndex,randomList)))
return(randomListIndex, randomList)
def deleteRandomly(listv, count):
deleted = generateRandomList(listv, count)
deleted_index = list(deleted[0])
deleted_values = list(deleted[1])
for i in reversed(deleted_index):
listv[i+1] = listv[i+1] + listv[i]
listv.remove(listv[i])
return(listv, deleted_index, deleted_values)
def insertVal(A, val):
idx = A.index(val)
r = round(random.uniform(0.0, val), 3)
A[idx] = r
A.insert(idx, round((val - r), 3))
def addfalsepositive(listv, count):
if(count == 0):
return(listv, count)
err_cites, sel_val = generateRandomList(listv, count)
for val in reversed(sel_val):
insertVal(listv, val)
return(listv, err_cites, count)
def sim(mol):
"Return a simulated set of fragment sizes in bp for a Bio.SeqIO seq"
sites = [biosite - 1 for biosite in Bio.Restriction.XhoI.search(mol)]
sites.extend([biosite - 1 for biosite in Bio.Restriction.NheI.search(mol)])
sites.extend([biosite - 1 for biosite in Bio.Restriction.EagI.search(mol)])
sites.sort()
allsites = [0] + sites + [len(mol)]
sizes = [y - x for x,y in zip(allsites[:-1], allsites[1:])]
sizes = [size/1000.0 for size in sizes] # convert to kb
sizes = [size for size in sizes if size > 1.1]
if(bool(random.getrandbits(1)) == True):
sizes = sizes[::-1]
sizes = sizes[1:-1]
e_f_sizes = list(sizes);
sizes, dindex, dsites = deleteRandomly( sizes, int(round(len(sites)/6.66)))
sizes, iindex, ins_err = addfalsepositive(sizes, int(round(len(mol)/200000.0)))
sizes = [size + random.gauss(0, math.sqrt(signmaS(size))) for size in sizes if size > 0.5]
sizes = [size for size in sizes if size > 0.5]
return (sizes, e_f_sizes, dindex, ins_err, iindex)
In [91]:
filename = "sim_single_molecule_100"
m = open(filename+"_newDel","w")
mwe = open(filename+"_efree","w")
efile = open(filename+"_elocations","w")
start = 0
end = 0
length_of_original = []
length_of_errored = []
accum = 0
tot_ins_err = 0
tot_del_err = 0
for i, mol in enumerate((molecule for molecule in molecules if len(molecule[0]) > 250000)):
simulated = sim(mol[0])
lens = [str(round(frag,3)) for frag in simulated[0]]
if len(lens) < 10: continue
map_name = "map_" + str(i) + "\n"
lens_e_free = [str(round(frag,3)) for frag in simulated[1]]
length_of_errored.append(len(lens))
length_of_original.append(len(lens_e_free))
#if i == 100: break
tot_ins_err += simulated[3]
tot_del_err += len(simulated[2])
if(start == 0 and end ==0):
start = mol[1][0]
end = mol[1][1]
# elif(start <= mol[1][0] and mol[1][0] >= end):
# if((start <= mol[1][0] and mol[1][0] <= end) or (start <= mol[1][1] and mol[1][1] <= end) or (start >= mol[1][0] and mol[1][1] >= end)):
m.write(map_name)
m.write("\tXhoI\tXhoI\t" + "\t".join(lens) + "\n")
m.write("\n")
mwe.write(map_name)
mwe.write("\tXhoI\tXhoI\t" + "\t".join(lens_e_free) + "\n")
mwe.write("\n")
efile.write(','.join([str(s) for s in simulated[2]]) + ' : ' + ','.join([str(s) for s in simulated[4]]))
efile.write("\n")
m.close()
mwe.close()
efile.close()
print("Total insertion errors: " + str(tot_ins_err))
print("Total deletion errors : "+ str(tot_del_err))
In [92]:
!cp /s/oak/b/nobackup/darshanw/ipython-notebooks/sim_single_molecule_100_newDel /s/oak/b/nobackup/darshanw/COmap/test/
In [93]:
!cp /s/oak/b/nobackup/darshanw/ipython-notebooks/sim_single_molecule_100_efree /s/oak/b/nobackup/darshanw/COmap/test/
In [94]:
!cp /s/oak/b/nobackup/darshanw/ipython-notebooks/sim_single_molecule_100_elocations /s/oak/b/nobackup/darshanw/COmap/test/
In [ ]:
In [ ]:
In [20]:
handle = open("/s/fir/a/nobackup/data/ECOLI_Reference_Genome/ecoli.fa", "rU")
for record in Bio.SeqIO.parse(handle, "fasta"):
ref_seq = record.seq
frag_sizes = [len(seq) for seq in Bio.Restriction.XhoI.catalyse(ref_seq)]
print(frag_sizes)
In [27]:
from __future__ import division
from Bio import SeqIO
from Bio.Restriction import *
import numpy
multi = (XhoI,NheI,EagI)
handle = open("/s/fir/a/nobackup/data/ECOLI_Reference_Genome/ecoli.fa", "rU")
for record in Bio.SeqIO.parse(handle, "fasta"):
print(record.id + "\n")
print("\tXhoI\tNheI")
L=[0]
L.append (len(record)/1000)
for enz in range(len(multi)):
coords = multi[enz].search(record.seq)
for site in range(len(coords)):
L.append(coords[site]/1000)
L.sort()
for i in range(1, len(L)):
frags=[str(L[i]-L[i-1])]
print( "\t"+ "\t".join(frags))
print("\n")
print("\n")
In [61]:
sites = [biosite - 1 for biosite in Bio.Restriction.XhoI.search(molecules[3][0])]
sites
Out[61]:
In [64]:
sites = [biosite - 1 for biosite in Bio.Restriction.XhoI.search(molecules[3][0])]
sites = []
sites.extend([biosite - 1 for biosite in Bio.Restriction.NheI.search(molecules[3][0])])
sites.sort()
sites
Out[64]:
In [26]:
handle = open("/s/fir/a/nobackup/data/ECOLI_Reference_Genome/ecoli.fa", "rU")
for record in Bio.SeqIO.parse(handle, "fasta"):
ref_seq = record.seq
frag_sizes = []
for seq in Bio.Restriction.XhoI.catalyse(ref_seq):
for seq1 in Bio.Restriction.NheI.catalyse(seq):
for seq2 in Bio.Restriction.EagI.catalyse(seq1):
frag_sizes.append(len(seq2))
print(frag_sizes)
In [30]:
handle = open("/s/fir/a/nobackup/data/ECOLI_Reference_Genome/ecoli.fa", "rU")
for record in Bio.SeqIO.parse(handle, "fasta"):
ref_seq = record.seq
frag_sizes = []
for seq in Bio.Restriction.XhoI.catalyse(ref_seq):
for seq1 in Bio.Restriction.NheI.catalyse(seq):
for seq2 in Bio.Restriction.EagI.catalyse(seq1):
frag_sizes.append(len(seq2))
frag_sizes
Out[30]:
In [32]:
1+2
Out[32]:
In [ ]: