In [77]:
import matplotlib
import Bio.SeqIO
import Bio.Restriction
import scipy
import matplotlib.pyplot as plt
%matplotlib inline
import math
import random
import numpy as np
In [78]:
handle = open("/s/fir/a/nobackup/data/ECOLI_Reference_Genome/ecoli.fa", "rU")
for record in Bio.SeqIO.parse(handle, "fasta"):
ref_seq = record.seq
frag_sizes = [len(seq) for seq in Bio.Restriction.XhoI.catalyse(ref_seq)]
In [79]:
def subseq(seq, limits):
x,y = limits
if x < y : return seq[x:y]
return seq[x:len(seq)] + seq[0:y]
In [80]:
molecules = []
for i in range(1400):
breaks = []
num_breaks = 40 #random.randint(20,50)
for j in range(num_breaks):
breaks.append(random.randint(0,len(ref_seq) - 1))
breaks.sort()
pairs = [(x,y) for x,y in zip(breaks[:-1], breaks[1:])] + [(breaks[-1], breaks[0])]
molecules += [[subseq(ref_seq, pair), pair] for pair in pairs]
In [81]:
def sim(mol):
"Return a simulated set of fragment sizes in bp for a Bio.SeqIO seq"
error_locations = []
sites = [biosite - 1 for biosite in Bio.Restriction.XhoI.search(mol)]
e_f_sites = list(sites);
for i, site in enumerate(sites):
if (random.randint(1,5) == 5):
sites.pop(i)
error_locations.append(i+1)
allsites = [0] + sites + [len(mol)]
e_f_allsites = [0] + e_f_sites + [len(mol)]
# TODO add random break sites
sizes = [y - x for x,y in zip(allsites[:-1], allsites[1:])]
e_f_sizes = [y - x for x,y in zip(e_f_allsites[:-1], e_f_allsites[1:])]
sizes = [size/1000.0 for size in sizes] # convert to kb
e_f_sizes = [size/1000.0 for size in e_f_sizes] # convert to kb
sigma = .58
sizes = [size + random.gauss(0, math.sqrt(size*sigma**2)) for size in sizes]
sizes = [size for size in sizes if size > .5]
e_f_sizes = [size for size in e_f_sizes if size > .5]
return [sizes] + [e_f_sizes] + [error_locations]
In [82]:
print molecules[0][1]
print molecules[1][1][0]
print molecules[2][1]
In [76]:
m = open("sim_ecoli_XhoI2.valuev","w")
mwe = open("sim_ecoli_XhoI2_without_errors.valuev","w") # Without inserting errors
mo = open("sim_ecoli_XhoI2.origin","w")
el = open("sim_ecoli_XhoI2.error_locations","w")
In [50]:
start = 0
end = 0
length_of_original = []
length_of_errored = []
for i, mol in enumerate((molecule for molecule in molecules if len(molecule[0]) > 250000)):
simulated = sim(mol[0])
lens = [str(round(frag,3)) for frag in simulated[0]]
if len(lens) < 10: continue
map_name = "map_" + str(i) + "\n"
m.write(map_name)
#frag_seqs = Bio.Restriction.XhoI.catalyse(mol)
m.write("\tXhoI\tXhoI\t" + "\t".join(lens) + "\n")
m.write("\n")
lens_e_free = [str(round(frag,3)) for frag in simulated[1]]
mwe.write(map_name)
mwe.write("\tXhoI\tXhoI\t" + "\t".join(lens_e_free) + "\n")
mwe.write("\n")
mo.write("origin[" + map_name + "] = ")
mo.write(repr(mol[1]))
mo.write("\n")
length_of_errored.append(len(lens))
length_of_original.append(len(lens_e_free))
#if i == 100: break
el.write(" ".join([str(num) for num in simulated[2]]) + "\n")
if(start == 0 and end ==0):
start = mol[1][0]
end = mol[1][1]
elif(start <= mol[1][0] and mol[1][0] >= end):
print(i)
elif(start <= mol[1][1] and mol[1][0] >= end):
print(i)
m.close()
mo.close()
mwe.close()
el.close()
In [45]:
def nextRange(cur_range, bin_size, s_varience):
if(cur_range < 2000):
return(cur_range+ bin_size)
else:
return(round(((cur_range/1000.0) + 4 * math.sqrt((cur_range/1000.0) * s_varience))*1000))
quantizeList = []
def quantizeNew(val, bin_size, s_varience):
if(val == 0):
return(0)
if(len(quantizeList) == 0):
quantizeList.append(0.0)
quantizeList.append(nextRange(0.0, bin_size, s_varience))
for i in range(len(quantizeList)):
if(val < quantizeList[i]):
return(quantizeList[i-1])
while(1):
quantizeList.append(nextRange(quantizeList[len(quantizeList) - 1], bin_size, s_varience))
if(val < quantizeList[len(quantizeList) - 1]):
return(quantizeList[len(quantizeList) - 2])
In [46]:
variance = 0.7
counter_val = []
for num in range(1, 100):
value = []
counter = 0
for x in range(0, 100):
value.append(num + random.gauss(0, math.sqrt(num*variance)))
counter = 0
for v in value:
if(quantizeNew((v*1000), 200, variance) == quantizeNew((num * 1000), 200, variance)):
counter += 1
counter_val.append(counter)
In [47]:
plt.hist(counter_val, bins= 30)
plt.xlabel("Number of correct quantization")
plt.show()
In [379]:
In [18]:
avgCV = [(10,0.1),(20, 0.3),(30,0.4)]
aa = 45
bb = 0.5
avgCV.append((aa,bb))
In [48]:
avgCV = []
varArray = []
for variance in np.arange(0,1,0.001):
counter_val = []
for num in range(1, 100):
value = []
counter = 0
for x in range(0, 100):
value.append(num + random.gauss(0, math.sqrt(num*variance)))
counter = 0
for v in value:
if(quantizeNew((v*1000), 200, variance) == quantizeNew((num * 1000), 200, variance)):
counter += 1
counter_val.append(counter)
avgCV.append(sum(counter_val)/len(counter_val))
varArray.append(variance)
In [49]:
plt.plot(varArray, avgCV)
plt.plot(varArray[50], avgCV[50], 'rD')
plt.plot(varArray[100], avgCV[100], 'rD')
plt.plot(varArray[200], avgCV[200], 'rD')
plt.plot(varArray[250], avgCV[250], 'rD')
plt.plot(varArray[300], avgCV[300], 'rD')
plt.plot(varArray[333], avgCV[333], 'rD')
plt.xlabel("Varience")
plt.ylabel("Correct quantization")
plt.show()
In [7]:
Out[7]:
In [17]:
random.gauss(0, math.sqrt(20.0*0.05))
Out[17]:
In [283]:
plt.hist(value, bins=20)
Out[283]:
In [119]:
molecules[0][1]
Out[119]:
In [230]:
lines = [line.rstrip('\n') for line in open('sim_ecoli_XhoI2.valuev')]
counter = 1
lines_e = []
for line in lines:
counter = counter + 1;
if(counter%3 == 0):
lines_e.append(line)
In [231]:
lines = [line.rstrip('\n') for line in open('sim_ecoli_XhoI2_without_errors.valuev')]
counter = 1
lines_w_e = []
for line in lines:
counter = counter + 1;
if(counter%3 == 0):
lines_w_e.append(line)
In [267]:
file_o = open("deletion","w")
for no, line in enumerate(lines_e):
file_o.write(str(no) + " " + str(len(lines_w_e[no].split("\t")) - len(lines_e[no].split("\t"))) + "\n")
file_o.close()
In [255]:
strrr = []
stll = "\takb\taasd"
strrr.append(stll)
len(strrr[0].split("\t"))
Out[255]:
In [76]:
xs = []
for x in range(500, 10000):
x = x /1000.0
xs.append(x + random.gauss(0, math.sqrt(x*0.03)))
In [77]:
plt.plot(xs)
Out[77]:
In [132]:
quantizeNew((90000), 600, 0.33)
Out[132]:
In [95]:
def drawLine(fses):
print(fses)
plt.clf()
cur_len = 0
pre_len = 0
plt.figure(figsize=(30,1))
for fs in fses:
pre_len = cur_len
cur_len += f
plt.plot((pre_len,cur_len),(1,1),'-|')
plt.xlim(0,cur_len)
plt.ylim(0,3)
plt.savefig("sample.jpg", dpi = 700)
plt.show()
plt.clf()
plt.cla()
plt.close()
In [96]:
drawLine(frag_sizes)
In [139]:
unsigned int nextRange(unsigned int cur_range, unsigned int bin_size,double s_varience){
if(cur_range < 2000)
return(cur_range + bin_size);
else
return(round(((double)(cur_range/1000.0) + 3 * sqrt(((double)cur_range/1000.0) * s_varience))*1000));
}
unsigned int quantizeNew(unsigned int val, unsigned int bin_size, double s_varience){
static vector<double> quantizeList;
if(val == 0){
return(0);
}
if(quantizeList.size() == 0){
quantizeList.push_back(0.0);
quantizeList.push_back(nextRange(0.0, bin_size, s_varience));
}
for(int i = 0; i < quantizeList.size(); i++){
if(val < quantizeList[i])
return(quantizeList[i-1]);
}
while(1){
quantizeList.push_back(nextRange(quantizeList[quantizeList.size() - 1], bin_size, s_varience));
if(val < quantizeList[quantizeList.size() - 1]){
return(quantizeList[quantizeList.size() - 2]);
}
}
}
In [38]:
print (readline)
print("4334(7)4267(9)4152(9)4055(9)4010(9)4004(7)3918(9)3905(7)3890(7)3834(7)3830(9)3797(9)3776(7)3744(9)3726(7)3641(9)3604(7)3508(9)3375(7)3346(7)3325(9)3316(7)3275(9)3270(9)3118(7)3091(9)3021(9)3010(9)2977(9)2941(9)2913(7)2890(9)2887(7)3690(9)2867(9)2796(9)2488(11)1261(11)3306(10)768(11)2399(7)4035(11)4042(9)2328(7)1919(7)3883(9)2379(11)4225(11)2990(7)521(9)2135(10)2472(11)427(7)836(10)1654(7)2074(8)2035(11)1392(10)983(8)1574(7)1974(11)1959(11)1933(9)1924(8)2117(11)691(9)2261(11)3079(10)1863(7)3090(8)1439(10)3476(11)1431(7)1817(10)1814(10)1819(11)996(11)2405(10)769(10)1189(11)1808(8)3443(7)1984(10)2603(8)1730(7)912(11)3765(11)1666(10)1631(11)1623(11)1589(11)3225(10)3614(11)1552(7)2779(11)1530(10)1517(11)2318(7)2473(10)1478(7)1473(11)3927(9)1452(10)411(12)2618(9)3240(11)3649(11)1542(11)1569(7)746(7)1037(10)3491(8)741(10)2786(7)798(11)1873(9)878(8)1761(10)1853(11)659(8)619(8)2186(9)1363(11)540(8)2590(11)633(8)2279(9)1094(7)3982(9)701(10)502(9)3842(7)1378(11)1599(11)2008(7)206(11)2660(11)803(11)1960(11)314(11)2391(10)3246(9)4069(11)2024(11)378(7)487(8)374(7)875(11)676(8)267(7)1152(8)727(10)921(11)2310(8)519(10)3791(8)2276(11)684(8)249(11)2718(8)2426(7)790(9)823(11)2050(10)2055(11)1232(10)1114(11)3981(9)1686(8)2138(9)492(8)901(11)2537(7)1632(10)3467(11)1638(11)2461(11)869(11)966(11)4238(11)285(10)1103(11)2712(7)1498(9)922(10)2548(9)4194(10)1611(11)2838(10)1213(10)339(10)1157(7)2163(9)973(11)3018(10)1866(11)1893(8)3539(10)1070(11)2706(11)1749(10)2976(11)1111(8)932(8)1131(11)315(7)1138(10)3603(11)4255(9)2609(10)1786(10)1286(10)2204(10)1209(8)2028(11)2846(11)3255(10)3664(10)1046(11)2180(11)913(10)1754(9)2577(10)2607(8)2627(8)1005(11)2677(8)678(7)2731(11)2752(10)1944(11)298(9)2755(10)1118(11)2764(8)2837(8)2866(7)405(8)2874(10)2886(8)3725(7)448(7)2905(7)2929(11)2937(8)2948(11)3766(10)3783(7)2960(11)2142(7)2987(10)3039(8)3049(11)3458(11)3066(8)1849(9)3162(7)3168(11)2350(7)3995(8)3172(12)3190(7)3193(11)3207(7)3216(11)3231(7)2492(11)4133(10)2088(8)3359(8)3374(8)3414(11)3823(11)3419(11)4296(9)3059(8)3882(8)3551(11)3960(11)3617(11)3635(11)2011(11)3657(10)1612(7)3673(7)3680(11)3685(10)2925(11)3748(11)3780(8)3802(11)3820(10)2593(9)546(7)3838(11)576(7)1394(9)563(8)1386(9)3855(11)2210(8)3856(7)4083(8)3265(9)4088(10)4099(7)4110(11)1648(11)4117(11)4191(10)4206(11)3393(9)4216(11)4219(10)936(10)2582(11)4233(11)1779(9)4246(7)4292(10)4301(8)3485(10)2670(8)3521(8)4344(10)234(11)1918(11)272(9)3698(7)406(9)415(9)422(7)1249(7)888(7)940(7)1056(9)2157(11)1334(9)1743(9)3848(8)1379(9)3180(8)1930(9)2949(11)2126(9)3115(9)2292(9)2380(9)2418(7)4260(10)2645(7)2652(7)2724(9)")
In [29]:
readns = [307, 479, 4753, 4799, 4812, 5180]
readline = [598, 946, 9517, 9622, 9649, 10375]
readline = [ read/3 for read in readline]
for readn in readline:
no = readn + 1
no = no * 3
!head -$no sim_ecoli_XhoI2.valuev > temp
!tail -3 temp
In [338]:
#!/usr/bin/python
import sys
from math import ceil,log,sqrt
if len(sys.argv) < 4:
print("Usage: ", sys.argv[0], "<input.valuev> <STDDEVs per bin> <STDDEV per kb> <output.valuev>")
sys.exit(1)
#sigma = float(sys.argv[3])
sigma = 0.58
#m = float(sys.argv[2]) / 2 # divide by two just because I solved the algebra in terms of a delta defined as the bin boundary to the center
m = 2.0
# U(n) = 1/4 m^2 sigma^2 (4 (c_1+n)^2-1) (c_1 is an arbitrary parameter)
# let c_1 = 0
# position = 1/4 * m**2 * sigma**2 * (4 * n**2 - 1)
# formulate a function that gives a real number for bin from a position, inverse of U(n)
# log( (4* position / m**2 / sigma**2 + 1 ) / 4 ),2 = n
# p = 1/4 * m**2 * sigma**2 * (4 * n**2 - 1)
# p * 4 = m**2 * sigma**2 * (4 * n**2 - 1)
# p * 4 / m**2 = sigma**2 * (4 * n**2 - 1)
# p * 4 / m**2 / sigma**2 = 4 * n**2 - 1
# (p * 4 / m**2 / sigma**2) + 1 = 4 * n**2
# ((p * 4 / m**2 / sigma**2) + 1) / 4 = n**2
# log(((p * 4 / m**2 / sigma**2) + 1) / 4), 2) = n
c_1 = 0
def bin_pos(n):
"Return the bin position"
return 1/4 * m**2 * sigma**2 * (4 * (c_1 + n)**2 - 1)
def inv_bin_pos(p):
"Return the inverse bin position"
return sqrt(((p * 4.0 / m**2 / sigma**2) + 1) / 4) - c_1
# return log((((p * 4 / m**2 / sigma**2) + 1) / 4), 2)
# return log( ((4* p / m**2 / sigma**2) + 1 ) / 4 ,2)
def quantize(n):
estimated = inv_bin_pos(n)
bin = ceil(estimated)
upper = bin_pos(bin)
lower = bin_pos(bin-1)
# return the mid point
mid_point = lower + (upper-lower)/2
if lower > n or upper < n:
print(n,estimated, bin,lower,mid_point,upper)
return mid_point
# for i in range(1,100):
# print(quantize(i))
# print("xxxxxxxxxxxxxxxxxxxxxxxxxxx")
# for i in range(1,100):
# print(i, inv_bin_pos(i), bin_pos(inv_bin_pos(i)))
In [327]:
output = open(sys.argv[4], "w")
for lno,line in enumerate(open(sys.argv[1])):
if lno % 3 == 0 or lno % 3 == 2:
output.write(line)
continue
# otherwise, is a second line of three line sets
fields = line.strip().split("\t")
newfields = fields[:2] + [str(quantize(float(field))) for field in fields[2:]]
output.write("\t" + "\t".join(newfields) + "\n")
output.close()
In [339]:
quantize(11.521)
Out[339]:
In [340]:
quantize(12.521)
Out[340]:
In [341]:
inv_bin_pos(11.521)
Out[341]:
In [52]:
np.arange(0,1,0.01)
Out[52]:
In [4]:
print("example")
In [5]:
!ls
In [11]:
lines1 = [line for line in open('sim_single_molecule')]
lines2 = [line for line in open('sim_single_molecule_without_error')]
In [57]:
NewLines1 = []
NewLines2 = []
for i, line in enumerate(lines1):
if(i%3 == 1):
NewLines1.append(line)
for i, line in enumerate(lines2):
if(i%3 == 1):
NewLines2.append(line)
allerrors = 0
for i in range(0, len(NewLines1)):
allerrors = allerrors + ( len(NewLines2[i].rstrip('\n').split("\t")) - len(NewLines1[i].rstrip('\n').split("\t")))
In [54]:
lines = [line for line in open('corrected')]
In [55]:
lines = [int(line.rstrip('\n').split(" ")[1]) for line in lines]
In [64]:
print allerrors
print (sum(lines))
values = []
values.append(allerrors)
values.append(sum(lines))
In [63]:
plt.bar(values, menMeans, width, color='r', yerr=menStd)
In [ ]: