notebook.community

Edit and run



In [77]:

    
import matplotlib
import Bio.SeqIO
import Bio.Restriction
import scipy
import matplotlib.pyplot as plt
%matplotlib inline
import math
import random
import numpy as np



In [78]:

    
handle = open("/s/fir/a/nobackup/data/ECOLI_Reference_Genome/ecoli.fa", "rU")
for record in Bio.SeqIO.parse(handle, "fasta"):
    ref_seq = record.seq
frag_sizes = [len(seq) for seq in Bio.Restriction.XhoI.catalyse(ref_seq)]



In [79]:

    
def subseq(seq, limits):
    x,y = limits
    if x < y : return seq[x:y]
    return seq[x:len(seq)] + seq[0:y]



In [80]:

    
molecules = []
for i in range(1400):
    breaks = []
    num_breaks = 40 #random.randint(20,50)
    for j in range(num_breaks):
        breaks.append(random.randint(0,len(ref_seq) - 1))
    breaks.sort()        
    pairs = [(x,y) for x,y in zip(breaks[:-1], breaks[1:])] + [(breaks[-1], breaks[0])]
    molecules += [[subseq(ref_seq, pair), pair] for pair in pairs]



In [81]:

    
def sim(mol):
    "Return a simulated set of fragment sizes in bp for a Bio.SeqIO seq"
    error_locations = []
    sites = [biosite - 1 for biosite in Bio.Restriction.XhoI.search(mol)]
    e_f_sites = list(sites);
    
    for i, site in enumerate(sites):
        if (random.randint(1,5) == 5):
            sites.pop(i)
            error_locations.append(i+1)                    
            
    allsites = [0] + sites + [len(mol)]
    e_f_allsites = [0] + e_f_sites + [len(mol)]
    # TODO add random break sites
    sizes = [y - x for x,y in zip(allsites[:-1], allsites[1:])]
    e_f_sizes = [y - x for x,y in zip(e_f_allsites[:-1], e_f_allsites[1:])]
    sizes = [size/1000.0 for size in sizes] # convert to kb
    e_f_sizes = [size/1000.0 for size in e_f_sizes] # convert to kb
    sigma = .58
    sizes = [size + random.gauss(0, math.sqrt(size*sigma**2)) for size in sizes]
    sizes = [size for size in sizes if size > .5]
    e_f_sizes = [size for size in e_f_sizes if size > .5]
    return [sizes] + [e_f_sizes] + [error_locations]



In [82]:

    
print molecules[0][1]
print molecules[1][1][0]
print molecules[2][1]









    



(145062, 176109)
176109
(234748, 350381)



In [76]:

    
m = open("sim_ecoli_XhoI2.valuev","w")
mwe = open("sim_ecoli_XhoI2_without_errors.valuev","w") # Without inserting errors
mo = open("sim_ecoli_XhoI2.origin","w")
el = open("sim_ecoli_XhoI2.error_locations","w")



In [50]:

    
start = 0
end = 0
length_of_original = []
length_of_errored = []
for i, mol in enumerate((molecule for molecule in molecules if len(molecule[0]) > 250000)):
    simulated = sim(mol[0])    
    lens = [str(round(frag,3)) for frag in simulated[0]]
    if len(lens) < 10: continue
    map_name = "map_" + str(i) + "\n"
    m.write(map_name)
    #frag_seqs = Bio.Restriction.XhoI.catalyse(mol)
    
    m.write("\tXhoI\tXhoI\t" + "\t".join(lens) + "\n")
    m.write("\n")
    
    lens_e_free = [str(round(frag,3)) for frag in simulated[1]]
    mwe.write(map_name)
    mwe.write("\tXhoI\tXhoI\t" + "\t".join(lens_e_free) + "\n")
    mwe.write("\n")
    
    mo.write("origin[" + map_name + "] = ")
    mo.write(repr(mol[1]))
    mo.write("\n")
    length_of_errored.append(len(lens))
    length_of_original.append(len(lens_e_free))
    #if i == 100: break
    el.write(" ".join([str(num) for num in simulated[2]]) + "\n")    
    if(start == 0 and end ==0):
        start = mol[1][0]
        end = mol[1][1]
    elif(start <= mol[1][0] and mol[1][0] >= end):
        print(i)
    elif(start <= mol[1][1] and mol[1][0] >= end):
        print(i)
    
m.close()
mo.close()
mwe.close()
el.close()









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-50-3f25b17cc54d> in <module>()
      8     if len(lens) < 10: continue
      9     map_name = "map_" + str(i) + "\n"
---> 10     m.write(map_name)
     11     #frag_seqs = Bio.Restriction.XhoI.catalyse(mol)
     12 

ValueError: I/O operation on closed file



In [45]:

    
def nextRange(cur_range, bin_size, s_varience):
    if(cur_range < 2000):
        return(cur_range+ bin_size)
    else:
        return(round(((cur_range/1000.0) + 4 * math.sqrt((cur_range/1000.0) * s_varience))*1000))

quantizeList = []
def quantizeNew(val, bin_size, s_varience):    
    if(val == 0):
        return(0)
    
    if(len(quantizeList) == 0):
        quantizeList.append(0.0)
        quantizeList.append(nextRange(0.0, bin_size, s_varience))
        
    for i in range(len(quantizeList)):
        if(val < quantizeList[i]):
            return(quantizeList[i-1])
    while(1):
        quantizeList.append(nextRange(quantizeList[len(quantizeList) - 1], bin_size, s_varience))
        if(val < quantizeList[len(quantizeList) - 1]):
            return(quantizeList[len(quantizeList) - 2])



In [46]:

    
variance = 0.7
counter_val = []
for num in range(1, 100):
    value = []
    counter = 0
    for x in range(0, 100):
        value.append(num + random.gauss(0, math.sqrt(num*variance)))

    counter = 0
    for v in value:
        if(quantizeNew((v*1000), 200, variance) == quantizeNew((num * 1000), 200, variance)):
            counter += 1    
    counter_val.append(counter)



In [47]:

    
plt.hist(counter_val, bins= 30)
plt.xlabel("Number of correct quantization")
plt.show()



In [379]:



In [18]:

    
avgCV = [(10,0.1),(20, 0.3),(30,0.4)]
aa = 45
bb = 0.5
avgCV.append((aa,bb))



In [48]:

    
avgCV = []
varArray = []
for variance in np.arange(0,1,0.001):
    counter_val = []
    for num in range(1, 100):
        value = []
        counter = 0
        for x in range(0, 100):
            value.append(num + random.gauss(0, math.sqrt(num*variance)))

        counter = 0
        for v in value:
            if(quantizeNew((v*1000), 200, variance) == quantizeNew((num * 1000), 200, variance)):
                counter += 1    
        counter_val.append(counter)
    avgCV.append(sum(counter_val)/len(counter_val))
    varArray.append(variance)



In [49]:

    
plt.plot(varArray, avgCV)
plt.plot(varArray[50], avgCV[50], 'rD')
plt.plot(varArray[100], avgCV[100], 'rD')
plt.plot(varArray[200], avgCV[200], 'rD')
plt.plot(varArray[250], avgCV[250], 'rD')
plt.plot(varArray[300], avgCV[300], 'rD')
plt.plot(varArray[333], avgCV[333], 'rD')
plt.xlabel("Varience")
plt.ylabel("Correct quantization")
plt.show()



In [7]:









    Out[7]:





96



In [17]:

    
random.gauss(0, math.sqrt(20.0*0.05))









    Out[17]:





-1.7006102817766666



In [283]:

    
plt.hist(value, bins=20)









    Out[283]:





(array([  1.,   0.,   2.,   0.,   3.,   1.,   5.,   4.,   6.,  11.,   4.,
         14.,  11.,  15.,   9.,   7.,   5.,   1.,   0.,   1.]),
 array([ 125.91155669,  127.88818454,  129.86481238,  131.84144023,
         133.81806808,  135.79469593,  137.77132378,  139.74795162,
         141.72457947,  143.70120732,  145.67783517,  147.65446301,
         149.63109086,  151.60771871,  153.58434656,  155.56097441,
         157.53760225,  159.5142301 ,  161.49085795,  163.4674858 ,
         165.44411365]),
 <a list of 20 Patch objects>)



In [119]:

    
molecules[0][1]









    Out[119]:





(26585, 139739)



In [230]:

    
lines = [line.rstrip('\n') for line in open('sim_ecoli_XhoI2.valuev')]
counter = 1
lines_e = []
for line in lines:
    counter = counter + 1;
    if(counter%3 == 0):
        lines_e.append(line)



In [231]:

    
lines = [line.rstrip('\n') for line in open('sim_ecoli_XhoI2_without_errors.valuev')]
counter = 1
lines_w_e = []
for line in lines:
    counter = counter + 1;
    if(counter%3 == 0):
        lines_w_e.append(line)



In [267]:

    
file_o = open("deletion","w")
for no, line in enumerate(lines_e):    
    file_o.write(str(no) + " " + str(len(lines_w_e[no].split("\t")) - len(lines_e[no].split("\t"))) + "\n")
file_o.close()



In [255]:

    
strrr = []
stll = "\takb\taasd"
strrr.append(stll)
len(strrr[0].split("\t"))









    Out[255]:





3



In [76]:

    
xs = []
for x in range(500, 10000):
    x = x /1000.0
    xs.append(x + random.gauss(0, math.sqrt(x*0.03)))



In [77]:

    
plt.plot(xs)









    Out[77]:





[<matplotlib.lines.Line2D at 0x7f095656fb90>]



In [132]:

    
quantizeNew((90000), 600, 0.33)









    Out[132]:





85143.0



In [95]:

    
def drawLine(fses):
    print(fses)
    plt.clf()
    cur_len = 0
    pre_len = 0
    plt.figure(figsize=(30,1))
    for fs in fses:
        pre_len = cur_len
        cur_len += f
        plt.plot((pre_len,cur_len),(1,1),'-|')    
    plt.xlim(0,cur_len)
    plt.ylim(0,3)
    plt.savefig("sample.jpg", dpi = 700)
    plt.show()
    plt.clf()
    plt.cla()
    plt.close()



In [96]:

    
drawLine(frag_sizes)









    



[47077, 55381, 40525, 2335, 27290, 18043, 7616, 1806, 5554, 23391, 12, 23605, 28270, 24142, 27238, 8511, 6776, 24921, 1462, 5108, 24335, 11058, 27070, 43326, 49950, 51518, 16481, 37904, 4946, 35071, 17610, 58586, 28871, 14231, 71874, 55145, 2763, 37516, 2107, 3698, 2894, 16741, 60041, 36938, 19058, 4897, 32535, 3642, 4556, 17626, 3007, 12754, 28213, 730, 26300, 33522, 9614, 178, 27279, 28887, 13417, 2786, 21812, 24018, 7009, 14835, 72120, 9739, 10360, 64508, 11884, 18627, 28582, 1557, 2506, 1913, 11641, 15440, 8697, 6170, 45312, 4218, 15062, 20234, 12854, 12396, 48964, 34111, 55638, 56159, 48165, 101, 2099, 13095, 111629, 66318, 116010, 32342, 14882, 79243, 504, 4995, 10312, 37236, 32650, 3417, 12928, 87122, 14466, 15855, 110769, 28703, 4878, 24503, 84549, 3166, 25931, 24461, 68447, 70083, 30152, 24183, 11068, 3284, 6952, 28481, 19404, 44525, 38583, 36019, 76733, 3374, 13436, 27244, 13969, 7883, 1620, 58762, 3167, 44549, 24700, 44498, 39605, 8075, 4687, 32319, 18488, 17260, 10482, 42730, 37485, 6481, 29060, 4547, 14475, 60323, 13128, 498, 12437, 19199, 8085, 17715, 91796, 162, 24820, 64611, 42963, 249, 28217, 20233, 14977, 21401, 40922, 25728, 15415, 20810, 45383, 35631, 42729]






    





<matplotlib.figure.Figure at 0x7f752fe1b450>



In [139]:

    
unsigned int nextRange(unsigned int cur_range, unsigned int bin_size,double s_varience){
    if(cur_range < 2000)
        return(cur_range + bin_size);
    else
        return(round(((double)(cur_range/1000.0) + 3 * sqrt(((double)cur_range/1000.0) * s_varience))*1000));
}


unsigned int quantizeNew(unsigned int val, unsigned int bin_size, double s_varience){

    static vector<double> quantizeList;
    
    if(val == 0){
        return(0);
    }
    if(quantizeList.size() == 0){
        quantizeList.push_back(0.0);
        quantizeList.push_back(nextRange(0.0, bin_size, s_varience));
    }
    for(int i = 0; i < quantizeList.size(); i++){ 
        if(val < quantizeList[i])
            return(quantizeList[i-1]);
    }
    while(1){
        quantizeList.push_back(nextRange(quantizeList[quantizeList.size() - 1], bin_size, s_varience));
        if(val < quantizeList[quantizeList.size() - 1]){
            return(quantizeList[quantizeList.size() - 2]);
        }
    }
}









    



  File "<ipython-input-139-cd7250970183>", line 1
    unsigned int nextRange(unsigned int cur_range, unsigned int bin_size,double s_varience){
               ^
SyntaxError: invalid syntax



In [38]:

    
print (readline)
print("4334(7)4267(9)4152(9)4055(9)4010(9)4004(7)3918(9)3905(7)3890(7)3834(7)3830(9)3797(9)3776(7)3744(9)3726(7)3641(9)3604(7)3508(9)3375(7)3346(7)3325(9)3316(7)3275(9)3270(9)3118(7)3091(9)3021(9)3010(9)2977(9)2941(9)2913(7)2890(9)2887(7)3690(9)2867(9)2796(9)2488(11)1261(11)3306(10)768(11)2399(7)4035(11)4042(9)2328(7)1919(7)3883(9)2379(11)4225(11)2990(7)521(9)2135(10)2472(11)427(7)836(10)1654(7)2074(8)2035(11)1392(10)983(8)1574(7)1974(11)1959(11)1933(9)1924(8)2117(11)691(9)2261(11)3079(10)1863(7)3090(8)1439(10)3476(11)1431(7)1817(10)1814(10)1819(11)996(11)2405(10)769(10)1189(11)1808(8)3443(7)1984(10)2603(8)1730(7)912(11)3765(11)1666(10)1631(11)1623(11)1589(11)3225(10)3614(11)1552(7)2779(11)1530(10)1517(11)2318(7)2473(10)1478(7)1473(11)3927(9)1452(10)411(12)2618(9)3240(11)3649(11)1542(11)1569(7)746(7)1037(10)3491(8)741(10)2786(7)798(11)1873(9)878(8)1761(10)1853(11)659(8)619(8)2186(9)1363(11)540(8)2590(11)633(8)2279(9)1094(7)3982(9)701(10)502(9)3842(7)1378(11)1599(11)2008(7)206(11)2660(11)803(11)1960(11)314(11)2391(10)3246(9)4069(11)2024(11)378(7)487(8)374(7)875(11)676(8)267(7)1152(8)727(10)921(11)2310(8)519(10)3791(8)2276(11)684(8)249(11)2718(8)2426(7)790(9)823(11)2050(10)2055(11)1232(10)1114(11)3981(9)1686(8)2138(9)492(8)901(11)2537(7)1632(10)3467(11)1638(11)2461(11)869(11)966(11)4238(11)285(10)1103(11)2712(7)1498(9)922(10)2548(9)4194(10)1611(11)2838(10)1213(10)339(10)1157(7)2163(9)973(11)3018(10)1866(11)1893(8)3539(10)1070(11)2706(11)1749(10)2976(11)1111(8)932(8)1131(11)315(7)1138(10)3603(11)4255(9)2609(10)1786(10)1286(10)2204(10)1209(8)2028(11)2846(11)3255(10)3664(10)1046(11)2180(11)913(10)1754(9)2577(10)2607(8)2627(8)1005(11)2677(8)678(7)2731(11)2752(10)1944(11)298(9)2755(10)1118(11)2764(8)2837(8)2866(7)405(8)2874(10)2886(8)3725(7)448(7)2905(7)2929(11)2937(8)2948(11)3766(10)3783(7)2960(11)2142(7)2987(10)3039(8)3049(11)3458(11)3066(8)1849(9)3162(7)3168(11)2350(7)3995(8)3172(12)3190(7)3193(11)3207(7)3216(11)3231(7)2492(11)4133(10)2088(8)3359(8)3374(8)3414(11)3823(11)3419(11)4296(9)3059(8)3882(8)3551(11)3960(11)3617(11)3635(11)2011(11)3657(10)1612(7)3673(7)3680(11)3685(10)2925(11)3748(11)3780(8)3802(11)3820(10)2593(9)546(7)3838(11)576(7)1394(9)563(8)1386(9)3855(11)2210(8)3856(7)4083(8)3265(9)4088(10)4099(7)4110(11)1648(11)4117(11)4191(10)4206(11)3393(9)4216(11)4219(10)936(10)2582(11)4233(11)1779(9)4246(7)4292(10)4301(8)3485(10)2670(8)3521(8)4344(10)234(11)1918(11)272(9)3698(7)406(9)415(9)422(7)1249(7)888(7)940(7)1056(9)2157(11)1334(9)1743(9)3848(8)1379(9)3180(8)1930(9)2949(11)2126(9)3115(9)2292(9)2380(9)2418(7)4260(10)2645(7)2652(7)2724(9)")









    



[199, 315, 3172, 3207, 3216, 3458]
4334(7)4267(9)4152(9)4055(9)4010(9)4004(7)3918(9)3905(7)3890(7)3834(7)3830(9)3797(9)3776(7)3744(9)3726(7)3641(9)3604(7)3508(9)3375(7)3346(7)3325(9)3316(7)3275(9)3270(9)3118(7)3091(9)3021(9)3010(9)2977(9)2941(9)2913(7)2890(9)2887(7)3690(9)2867(9)2796(9)2488(11)1261(11)3306(10)768(11)2399(7)4035(11)4042(9)2328(7)1919(7)3883(9)2379(11)4225(11)2990(7)521(9)2135(10)2472(11)427(7)836(10)1654(7)2074(8)2035(11)1392(10)983(8)1574(7)1974(11)1959(11)1933(9)1924(8)2117(11)691(9)2261(11)3079(10)1863(7)3090(8)1439(10)3476(11)1431(7)1817(10)1814(10)1819(11)996(11)2405(10)769(10)1189(11)1808(8)3443(7)1984(10)2603(8)1730(7)912(11)3765(11)1666(10)1631(11)1623(11)1589(11)3225(10)3614(11)1552(7)2779(11)1530(10)1517(11)2318(7)2473(10)1478(7)1473(11)3927(9)1452(10)411(12)2618(9)3240(11)3649(11)1542(11)1569(7)746(7)1037(10)3491(8)741(10)2786(7)798(11)1873(9)878(8)1761(10)1853(11)659(8)619(8)2186(9)1363(11)540(8)2590(11)633(8)2279(9)1094(7)3982(9)701(10)502(9)3842(7)1378(11)1599(11)2008(7)206(11)2660(11)803(11)1960(11)314(11)2391(10)3246(9)4069(11)2024(11)378(7)487(8)374(7)875(11)676(8)267(7)1152(8)727(10)921(11)2310(8)519(10)3791(8)2276(11)684(8)249(11)2718(8)2426(7)790(9)823(11)2050(10)2055(11)1232(10)1114(11)3981(9)1686(8)2138(9)492(8)901(11)2537(7)1632(10)3467(11)1638(11)2461(11)869(11)966(11)4238(11)285(10)1103(11)2712(7)1498(9)922(10)2548(9)4194(10)1611(11)2838(10)1213(10)339(10)1157(7)2163(9)973(11)3018(10)1866(11)1893(8)3539(10)1070(11)2706(11)1749(10)2976(11)1111(8)932(8)1131(11)315(7)1138(10)3603(11)4255(9)2609(10)1786(10)1286(10)2204(10)1209(8)2028(11)2846(11)3255(10)3664(10)1046(11)2180(11)913(10)1754(9)2577(10)2607(8)2627(8)1005(11)2677(8)678(7)2731(11)2752(10)1944(11)298(9)2755(10)1118(11)2764(8)2837(8)2866(7)405(8)2874(10)2886(8)3725(7)448(7)2905(7)2929(11)2937(8)2948(11)3766(10)3783(7)2960(11)2142(7)2987(10)3039(8)3049(11)3458(11)3066(8)1849(9)3162(7)3168(11)2350(7)3995(8)3172(12)3190(7)3193(11)3207(7)3216(11)3231(7)2492(11)4133(10)2088(8)3359(8)3374(8)3414(11)3823(11)3419(11)4296(9)3059(8)3882(8)3551(11)3960(11)3617(11)3635(11)2011(11)3657(10)1612(7)3673(7)3680(11)3685(10)2925(11)3748(11)3780(8)3802(11)3820(10)2593(9)546(7)3838(11)576(7)1394(9)563(8)1386(9)3855(11)2210(8)3856(7)4083(8)3265(9)4088(10)4099(7)4110(11)1648(11)4117(11)4191(10)4206(11)3393(9)4216(11)4219(10)936(10)2582(11)4233(11)1779(9)4246(7)4292(10)4301(8)3485(10)2670(8)3521(8)4344(10)234(11)1918(11)272(9)3698(7)406(9)415(9)422(7)1249(7)888(7)940(7)1056(9)2157(11)1334(9)1743(9)3848(8)1379(9)3180(8)1930(9)2949(11)2126(9)3115(9)2292(9)2380(9)2418(7)4260(10)2645(7)2652(7)2724(9)



In [29]:

    
readns = [307, 479, 4753, 4799, 4812, 5180]
readline = [598, 946, 9517, 9622, 9649, 10375]
readline = [ read/3 for read in readline]
for readn in readline:
    no = readn + 1
    no = no * 3    
    !head -$no sim_ecoli_XhoI2.valuev > temp
    !tail -3 temp









    



map_307
	XhoI	XhoI	45.579	93.382	1.891	27.653	16.251	11.21	2.008	5.573	26.32	23.138	36.234	26.267	28.865	8.898	3.456

map_479
	XhoI	XhoI	52.704	53.378	45.054	2.542	30.09	19.051	8.51	2.108	3.78	19.176	26.342	0.682

map_4753
	XhoI	XhoI	42.641	56.11	40.657	2.761	46.156	7.729	1.856	4.269	19.067	22.73	30.497	17.071	30.005	14.504	31.304	3.538

map_4799
	XhoI	XhoI	49.011	57.653	39.819	0.869	31.915	16.197	5.933	2.397	5.612	22.168	25.323	27.132	13.226

map_4812
	XhoI	XhoI	45.098	60.278	48.14	2.651	41.243	7.602	2.258	5.23	22.363	24.571	32.816	23.511	26.842	9.581

map_5180
	XhoI	XhoI	99.715	45.867	2.478	27.005	21.551	10.488	22.071	29.744	30.188	23.903	34.781	8.544	31.389	1.072	32.93	1.154



In [338]:

    
#!/usr/bin/python                                                                                                                                                                                                                                                              

import sys
from math import ceil,log,sqrt

if len(sys.argv) < 4:
    print("Usage: ", sys.argv[0], "<input.valuev> <STDDEVs per bin> <STDDEV per kb> <output.valuev>")
    sys.exit(1)

#sigma = float(sys.argv[3])
sigma = 0.58
#m = float(sys.argv[2]) / 2  # divide by two just because I solved the algebra in terms of a delta defined as the bin boundary to the center                                                                                                                                    
m = 2.0
# U(n) = 1/4 m^2 sigma^2 (4 (c_1+n)^2-1)  (c_1 is an arbitrary parameter)                                                                                                                                                                                                      
# let c_1 = 0                                                                                                                                                                                                                                                                  
# position = 1/4 * m**2 * sigma**2 * (4 * n**2 - 1)                                                                                                                                                                                                                            
# formulate a function that gives a real number for bin from a position, inverse of U(n)                                                                                                                                                                                       
# log( (4* position / m**2 / sigma**2 + 1 ) / 4 ),2 =   n                                                                                                                                                                                                                      

# p = 1/4 * m**2 * sigma**2 * (4 * n**2 - 1)                                                                                                                                                                                                                                   
# p * 4 = m**2 * sigma**2 * (4 * n**2 - 1)                                                                                                                                                                                                                                     
# p * 4 / m**2 = sigma**2 * (4 * n**2 - 1)                                                                                                                                                                                                                                     
# p * 4 / m**2 / sigma**2 = 4 * n**2 - 1                                                                                                                                                                                                                                       
# (p * 4 / m**2 / sigma**2) + 1 = 4 * n**2                                                                                                                                                                                                                                     
# ((p * 4 / m**2 / sigma**2) + 1) / 4 =  n**2                                                                                                                                                                                                                                  
# log(((p * 4 / m**2 / sigma**2) + 1) / 4), 2) = n                                                                                                                                                                                                                             

c_1 = 0

def bin_pos(n):
    "Return the bin position"
    return     1/4 * m**2 * sigma**2 * (4 * (c_1 + n)**2 - 1)

def inv_bin_pos(p):
    "Return the inverse bin position"
    return sqrt(((p * 4.0 / m**2 / sigma**2) + 1) / 4) - c_1

#    return log((((p * 4 / m**2 / sigma**2) + 1) / 4), 2)                                                                                                                                                                                                                      
#    return log( ((4* p / m**2 / sigma**2) + 1 ) / 4 ,2)                                                                                                                                                                                                                       

def quantize(n):
    estimated = inv_bin_pos(n)
    bin = ceil(estimated)
    upper = bin_pos(bin)
    lower = bin_pos(bin-1)
    # return the mid point                                                                                                                                                                                                                                                     
    mid_point = lower + (upper-lower)/2
    if lower > n or upper < n:
        print(n,estimated, bin,lower,mid_point,upper)
    return mid_point

# for i in range(1,100):                                                                                                                                                                                                                                                       
#     print(quantize(i))                                                                                                                                                                                                                                                       

# print("xxxxxxxxxxxxxxxxxxxxxxxxxxx")                                                                                                                                                                                                                                         
# for i in range(1,100):                                                                                                                                                                                                                                                       
#     print(i, inv_bin_pos(i), bin_pos(inv_bin_pos(i)))



In [327]:

    
output = open(sys.argv[4], "w")
for lno,line in enumerate(open(sys.argv[1])):

    if lno % 3 == 0 or lno % 3 == 2:
        output.write(line)
        continue
    # otherwise, is a second line of three line sets                                                                                                                                                                                                                           
    fields = line.strip().split("\t")
    newfields = fields[:2] + [str(quantize(float(field))) for field in fields[2:]]
    output.write("\t" + "\t".join(newfields) + "\n")


output.close()



In [339]:

    
quantize(11.521)









    



(11.521, 2.968497900617261, 3.0, 0.0, 0.0, 0.0)






    Out[339]:





0.0



In [340]:

    
quantize(12.521)









    



(12.521, 3.091139383346706, 4.0, 0.0, 0.0, 0.0)






    Out[340]:





0.0



In [341]:

    
inv_bin_pos(11.521)









    Out[341]:





2.968497900617261



In [52]:

    
np.arange(0,1,0.01)









    Out[52]:





array([ 0.  ,  0.01,  0.02,  0.03,  0.04,  0.05,  0.06,  0.07,  0.08,
        0.09,  0.1 ,  0.11,  0.12,  0.13,  0.14,  0.15,  0.16,  0.17,
        0.18,  0.19,  0.2 ,  0.21,  0.22,  0.23,  0.24,  0.25,  0.26,
        0.27,  0.28,  0.29,  0.3 ,  0.31,  0.32,  0.33,  0.34,  0.35,
        0.36,  0.37,  0.38,  0.39,  0.4 ,  0.41,  0.42,  0.43,  0.44,
        0.45,  0.46,  0.47,  0.48,  0.49,  0.5 ,  0.51,  0.52,  0.53,
        0.54,  0.55,  0.56,  0.57,  0.58,  0.59,  0.6 ,  0.61,  0.62,
        0.63,  0.64,  0.65,  0.66,  0.67,  0.68,  0.69,  0.7 ,  0.71,
        0.72,  0.73,  0.74,  0.75,  0.76,  0.77,  0.78,  0.79,  0.8 ,
        0.81,  0.82,  0.83,  0.84,  0.85,  0.86,  0.87,  0.88,  0.89,
        0.9 ,  0.91,  0.92,  0.93,  0.94,  0.95,  0.96,  0.97,  0.98,  0.99])



In [4]:

    
print("example")









    



example



In [5]:

    
!ls









    



align_analysis-master.zip  --profile-dir		    sim_ecoli_XhoI2.origin		   sim_single_molecule_without_error  Untitled0.ipynb
deletion		   sample.jpg			    sim_ecoli_XhoI2.valuev		   simulate-ecoli-read.ipynb	      With Bionano info.ipynb
Oldfunctions.ipynb	   sample.png			    sim_ecoli_XhoI2_without_errors.valuev  temp
Other calculations.ipynb   sim_ecoli_XhoI2.error_locations  sim_single_molecule			   test_notebook.ipynb



In [11]:

    
lines1 = [line for line in open('sim_single_molecule')]
lines2 = [line for line in open('sim_single_molecule_without_error')]



In [57]:

    
NewLines1 = []
NewLines2 = []
for i, line in enumerate(lines1):
    if(i%3 == 1):
        NewLines1.append(line)
        
for i, line in enumerate(lines2):
    if(i%3 == 1):
        NewLines2.append(line)
allerrors = 0 
for i in range(0, len(NewLines1)):
    allerrors = allerrors +  ( len(NewLines2[i].rstrip('\n').split("\t")) - len(NewLines1[i].rstrip('\n').split("\t")))



In [54]:

    
lines = [line for line in open('corrected')]



In [55]:

    
lines = [int(line.rstrip('\n').split(" ")[1]) for line in lines]



In [64]:

    
print allerrors
print (sum(lines))
values = []
values.append(allerrors)
values.append(sum(lines))



In [63]:

    
plt.bar(values, menMeans, width, color='r', yerr=menStd)









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-63-5d8a98b364f0> in <module>()
----> 1 plt.hist(allerrors)

/usr/lib64/python2.7/site-packages/matplotlib/pyplot.pyc in hist(x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, hold, **kwargs)
   2825                       histtype=histtype, align=align, orientation=orientation,
   2826                       rwidth=rwidth, log=log, color=color, label=label,
-> 2827                       stacked=stacked, **kwargs)
   2828         draw_if_interactive()
   2829     finally:

/usr/lib64/python2.7/site-packages/matplotlib/axes.pyc in hist(self, x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, **kwargs)
   8247         # Massage 'x' for processing.
   8248         # NOTE: Be sure any changes here is also done below to 'weights'
-> 8249         if isinstance(x, np.ndarray) or not iterable(x[0]):
   8250             # TODO: support masked arrays;
   8251             x = np.asarray(x)

TypeError: 'int' object has no attribute '__getitem__'



In [ ]: