notebook.community

Edit and run



In [1]:

    
import matplotlib
import Bio.SeqIO
import Bio.Restriction
import scipy
import matplotlib.pyplot as plt
%matplotlib inline
import math
import random
import numpy as np



In [9]:

    
def signmaS(length):
    singma = (0.2*0.2) + length * 0.1 * (-0.1) + length*length*0.04*0.04
    return(singma)    

values = []
lengths = []
for length in np.arange(0,50, 0.5):
    values.append(signmaS(length))
    lengths.append(length)



In [12]:

    
math.sqrt(signmaS(10))









    Out[12]:





0.31622776601683794



In [11]:

    
plt.plot(lengths,values)
plt.xlabel("Fragment length in Kbp")
plt.ylabel("Varience $\sigma^2$")









    Out[11]:





<matplotlib.text.Text at 0x7fbb5647f950>



In [46]:

    
plt.plot(lengths,values)
plt.xlabel("Fragment lengths in Kb")
plt.ylabel("Varience $\sigma^2$")









    Out[46]:





<matplotlib.text.Text at 0x7f7b7bfaa250>



In [24]:

    
def bucketupper(lower, w):
    return(lower + 2 * w * math.sqrt(signmaS(lower)))
l = 0.5
for i in range(0, 20):
    print(str(l) + "   : "+str(i))
    l = bucketupper(l, 4)









    



0.5   : 0
2.00519101778   : 1
3.30457808438   : 2
4.55489893309   : 3
5.88507492227   : 4
7.41480842528   : 5
9.27072101152   : 6
11.6004498377   : 7
14.5863709342   : 8
18.460848141   : 9
23.5247671364   : 10
30.1711053102   : 11
38.9155184749   : 12
50.4363877208   : 13
65.6274647921   : 14
85.6672140411   : 15
112.11023639   : 16
147.007870282   : 17
193.06732652   : 18
253.861702494   : 19



In [ ]:

    
for length in np.arange(0,50, 0.5):
    for i in range(0, 100):



In [28]:

    
d = 0
i = 0
for line in open("sim_single_molecule_longer_k_3_r_r"):
    d+= int(line.strip().split(" ")[2])     
    i += int(line.strip().split(" ")[6])



In [29]:

    
print d
print i



In [7]:

    
def findInsertionDeletion(fname):
    deletions = 0
    insertions = 0
    for line in open(fname):
        deletions += int(line.strip().split(" ")[2])
        insertions += int(line.strip().split(" ")[6])
    print("Deletions: "+ str(deletions))
    print("Insertions: "+ str(insertions))



In [8]:

    
findInsertionDeletion("sim_single_molecule_longer_k_3_r_r")









    



Deletions: 982
Insertions: 425



In [54]:

    
def noOfFragments(line):
    return((len(line.strip().split("\t"))) - 2)

def findTotalErrors(file1, file1_w_e):
    errors = []
    e_count = 0
    lines = [line.strip() for i,line in enumerate(open(file1)) if (i%3==1)]
    lines_w_e = [line.strip() for i,line in enumerate(open(file1_w_e)) if (i%3==1)]
    for i in range(len(lines)):
        errors.append(noOfFragments(lines_w_e[i]) - noOfFragments(lines[i]))
    for e in errors:
        e_count += int(e)
    return(e_count)



In [35]:

    
def findNoOfFragments(fname):
    lengths = []
    lines = [line.strip() for i,line in enumerate(open(fname)) if (i%3==1)]       
    for line in lines:
         lengths.append((len(line.split("\t"))) - 2)
    count = 0.0
    for length in lengths:
        count += length
    print(count/len(lines))
    plt.hist(lengths, bins = 50)



In [57]:

    
findTotalErrors("sim_single_molecule_longer","sim_single_molecule_without_error_longer")









    Out[57]:





1816



In [237]:

    
read_c = "	XhoI	XhoI	85.856	28.703	4.878	24.503	84.549	3.166	25.931	24.461	68.447	70.083	30.152	24.183	11.068	3.284	6.952	28.481	19.404	44.525	38.583	36.019	76.733	3.374	13.436	27.244	13.969	7.883	1.62	58.762	3.167	4.471"
read = "	XhoI	XhoI	88.487	28.277	4.761	108.091	3.003	25.838	23.999	67.249	68.225	30.854	25.782	11.655	3.409	34.424	20.933	43.642	36.799	36.273	74.788	3.333	13.555	27.912	13.954	7.864	1.96	52.081	3.058	4.219"



In [499]:

    
def signmaS(length):
    singma = float((0.2*0.2) + (length * 0.1 * (-0.1)) + (length*length*0.04*0.04))
    return(singma)

def giveBellRange(sigma):
    return(3 * math.sqrt(sigma))

def findErrors(c_read, read):
    deletion_index = []
    deletion_count = 0
    
    c_frag = c_read.strip().split("\t")
    del c_frag[0:2]
    c_frag = [float(f) for f in c_frag]
    
    frag = read.strip().split("\t")
    del frag[0:2]
    frag = [float(f) for f in frag]
    print(frag)
    print(c_frag)
    j = 0
    for i, c_v in enumerate(c_frag):
        d3 = giveBellRange(signmaS(c_v))                
        aggr = 0.0
        while(True):
            aggr = aggr + frag[j]
            if(aggr <= (c_v + d3) and aggr >= (c_v - d3)):
               j+=1;
               break;                                    
            else:
                deletion_index.append(j)
                deletion_count+=1
                j+=1
                print(j)
    return([deletion_count,deletion_index])



In [249]:

    
findErrors(read,read_c)









    



[85.856, 28.703, 4.878, 24.503, 84.549, 3.166, 25.931, 24.461, 68.447, 70.083, 30.152, 24.183, 11.068, 3.284, 6.952, 28.481, 19.404, 44.525, 38.583, 36.019, 76.733, 3.374, 13.436, 27.244, 13.969, 7.883, 1.62, 58.762, 3.167, 4.471]
[88.487, 28.277, 4.761, 108.091, 3.003, 25.838, 23.999, 67.249, 68.225, 30.854, 25.782, 11.655, 3.409, 34.424, 20.933, 43.642, 36.799, 36.273, 74.788, 3.333, 13.555, 27.912, 13.954, 7.864, 1.96, 52.081, 3.058, 4.219]
4
15
28
29
30






    



---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-249-40a440e20495> in <module>()
----> 1 findErrors(read,read_c)

<ipython-input-248-f5d68ea93d2a> in findErrors(c_read, read)
     24         aggr = 0.0
     25         while(True):
---> 26             aggr = aggr + frag[j]
     27             if(aggr <= (c_v + d3) and aggr >= (c_v - d3)):
     28                j+=1;

IndexError: list index out of range



In [370]:



In [366]:

    
c_read = "	XhoI	XhoI	2.412	8.396	4.396	48.964	34.111	55.638	56.159	48.165	2.099	13.095	111.629	66.318	116.01	20.27"
read = "	XhoI	XhoI	15.489	49.737	33.673	54.262	57.077	47.537	2.08	13.176	174.37	115.055	20.503"



In [372]:

    
c_read = "	XhoI	XhoI	85.856	28.703	4.878	24.503	84.549	3.166	25.931	24.461	68.447	70.083	30.152	24.183	11.068	3.284	6.952	28.481	19.404	44.525	38.583	36.019	76.733	3.374	13.436	27.244	13.969	7.883	1.62	58.762	3.167	4.471"
read = "	XhoI	XhoI	88.487	28.277	4.761	108.091	3.003	25.838	23.999	67.249	68.225	30.854	25.782	11.655	3.409	34.424	20.933	43.642	36.799	36.273	74.788	3.333	13.555	27.912	13.954	7.864	1.96	52.081	3.058	4.219"



In [374]:

    
c_read = "	XhoI	XhoI	5.458	36.938	19.058	4.897	32.535	3.642	4.556	17.626	3.007	12.754	28.213	0.73	26.3	33.522	9.614	27.279	28.887	13.417	2.786	21.812	24.018	7.009	14.835	72.12	9.739	10.36	64.508	11.884	18.627	28.582	1.557	2.506	1.913	11.641	15.44	8.697	6.17	45.312	4.218	15.062	20.234	12.854	12.396	48.964	34.111	55.638	56.159	40.422"
read = "	XhoI	XhoI	43.904	17.833	5.052	31.857	8.315	21.459	12.728	27.013	0.714	62.408	9.538	52.917	13.121	2.677	46.168	6.965	14.937	69.98	9.231	10.549	70.151	30.603	27.33	1.557	2.267	13.717	14.854	7.904	6.629	44.587	4.063	14.407	20.164	24.548	49.608	34.996	56.041	54.079	39.948"



In [857]:

    
def noOfFragments(line):
    return((len(line.strip().split("\t"))) - 2)

def signmaS(length):
    singma = float((0.2*0.2) + (length * 0.1 * (-0.1)) + (length*length*0.04*0.04))
    return(singma)

def giveBellRange(sigma):
    return(4.0 * math.sqrt(sigma))

def findErrors(read, c_read):
    deletion_index = []
    deletion_count = 0
    
    c_frag = c_read.strip().split("\t")
    del c_frag[0:2]
    c_frag = [float(f) for f in c_frag]
    
    frag = read.strip().split("\t")
    del frag[0:2]
    frag = [float(f) for f in frag]
    j = 0
    pad = 0
    for i, c_v in enumerate(c_frag):
        c_v = c_v + pad
        d3 = giveBellRange(signmaS(c_v))
        if(j>=len(frag)):
            break;
        #print(str(i)+" => "+str(frag[j]) + " "+ str(c_v - d3) + " "+ str(c_v + d3) + " d3: "+str(d3)+" c_v: "+str(c_v) + " Count = "+ str(deletion_count))
        if(frag[j] <= (c_v + d3) and frag[j] >= (c_v - d3)):
            j+=1
            pad = 0
        elif((c_v - d3) > frag[j]):            
            pad = c_frag[i]
            d3 = giveBellRange(signmaS(c_frag[i]))
            if(frag[j] <= (c_frag[i] + d3) and frag[j] >= (c_frag[i] - d3)):
                j+=1        
                pad = 0
        else:
            pad = pad + c_frag[i]
            deletion_index.append(i)
            deletion_count+=1     
            
    return([deletion_count,deletion_index, (noOfFragments(c_read) - noOfFragments(read))])

def errors(fname, c_fname):    
    lines = [line for i,line in enumerate(open(fname)) if (i%3==1)]
    c_lines = [line for i,line in enumerate(open(c_fname)) if (i%3==1)]
    notFineList = []
    FineList = []
    fullList = []
    
    #print(lines[114])
    #print(c_lines[114])
    #print(findErrors(lines[114], c_lines[114]))
    #return    
    for i,line in enumerate(lines):
        #print(lines[i])
        #print(c_lines[i])
        #print(findErrors(lines[i], c_lines[i]))
        #print(i)
        E = findErrors(lines[i], c_lines[i])
        E = [i] + E # add read number
        fullList.append(E)
        if(E[1] == E[3]):
            FineList.append(E)
        else:
            notFineList.append(E)            
    return([FineList, notFineList, fullList])



In [939]:

    
EC = errors("sim_single_molecule_100_corrected", "sim_single_molecule_100_efree")
print(len(EC[0]))
print(len(EC[1]))
E = errors("sim_single_molecule_100_newDel", "sim_single_molecule_100_efree")
print(len(E[0]))
print(len(E[1]))
print(EC[2][7])









    



179
189
344
24
[7, 18, [0, 5, 7, 13, 19, 20, 21, 22, 23, 25, 26, 29, 32, 36, 42, 44, 45, 46], 18]



In [876]:

    
for i in range(0, len(EC[2])):
    if(EC[2][i][3] < E[2][i][3]):
        print(str(EC[2][i]) + "\t" +  str(E[2][i]))









    



[1, 8, [2, 8, 9, 11, 12, 13, 15, 19], 3]	[1, 4, [1, 7, 9, 19], 4]
[2, 10, [0, 7, 13, 14, 16, 20, 21, 22, 23, 24], 8]	[2, 9, [0, 7, 8, 10, 14, 15, 17, 19, 21], 9]
[5, 6, [0, 3, 4, 8, 19, 22], 2]	[5, 6, [0, 2, 4, 12, 13, 16], 6]
[8, 5, [0, 4, 9, 17, 20], 6]	[8, 8, [1, 4, 11, 13, 17, 20, 26, 30], 8]
[12, 6, [3, 4, 5, 6, 8, 19], 4]	[12, 10, [3, 6, 10, 15, 20, 21, 23, 30, 37, 38], 10]
[13, 7, [1, 5, 20, 21, 22, 29, 32], 3]	[13, 9, [5, 7, 8, 18, 19, 22, 29, 32, 34], 9]
[14, 6, [6, 7, 10, 13, 19, 20], 2]	[14, 4, [1, 8, 10, 19], 4]
[16, 7, [0, 11, 16, 18, 19, 20, 21], 5]	[16, 6, [0, 5, 11, 12, 13, 20], 6]
[17, 4, [0, 4, 11, 14], 3]	[17, 8, [0, 2, 6, 8, 12, 18, 25, 31], 8]
[20, 5, [8, 14, 16, 18, 21], 5]	[20, 8, [0, 1, 7, 8, 9, 19, 23, 32], 8]
[26, 10, [1, 3, 7, 8, 10, 11, 18, 19, 20, 21], 5]	[26, 8, [3, 6, 16, 18, 22, 24, 28, 33], 8]
[27, 14, [0, 3, 5, 15, 16, 17, 19, 24, 25, 26, 27, 28, 29, 31], 5]	[27, 8, [0, 3, 4, 16, 17, 27, 28, 31], 8]
[32, 5, [2, 9, 15, 18, 22], 6]	[32, 7, [7, 9, 11, 15, 18, 22, 23], 7]
[39, 8, [0, 12, 14, 16, 17, 19, 21, 22], 3]	[39, 5, [0, 12, 13, 14, 20], 5]
[40, 1, [3], 2]	[40, 3, [4, 5, 11], 3]
[43, 7, [0, 1, 3, 6, 7, 9, 23], 7]	[43, 10, [3, 4, 7, 8, 12, 14, 32, 34, 35, 37], 10]
[44, 11, [2, 3, 5, 6, 7, 9, 10, 11, 14, 17, 21], 6]	[44, 7, [0, 2, 5, 6, 14, 17, 20], 7]
[46, 7, [6, 12, 20, 21, 22, 23, 33], 3]	[46, 11, [6, 8, 13, 14, 18, 20, 22, 30, 32, 37, 40], 11]
[47, 4, [0, 9, 16, 19], 3]	[47, 4, [0, 9, 16, 19], 4]
[54, 9, [1, 5, 7, 8, 9, 10, 11, 13, 16], 3]	[54, 5, [2, 3, 7, 9, 12], 5]
[55, 7, [0, 1, 18, 20, 21, 22, 23], 3]	[55, 6, [0, 1, 7, 9, 18, 24], 6]
[56, 5, [11, 17, 19, 28, 29], 6]	[56, 13, [6, 8, 11, 17, 19, 20, 24, 25, 26, 29, 36, 38, 49], 13]
[58, 10, [1, 3, 7, 9, 10, 23, 29, 46, 47, 51], 6]	[58, 12, [1, 3, 7, 8, 16, 23, 29, 31, 39, 44, 47, 50], 12]
[61, 2, [0, 16], 3]	[61, 6, [0, 10, 13, 14, 16, 17], 6]
[62, 1, [2], 1]	[62, 5, [2, 8, 9, 11, 16], 5]
[63, 13, [1, 9, 10, 11, 12, 15, 19, 20, 21, 22, 23, 24, 26], 2]	[63, 9, [1, 8, 12, 17, 20, 21, 27, 29, 34], 9]
[66, 11, [1, 3, 6, 8, 9, 14, 15, 18, 19, 21, 22], 8]	[66, 9, [1, 3, 6, 8, 9, 12, 13, 15, 16], 9]
[73, 3, [7, 13, 15], 3]	[73, 5, [5, 7, 9, 13, 15], 5]
[74, 1, [0], 3]	[74, 4, [0, 8, 9, 12], 4]
[75, 3, [6, 8, 21], 2]	[75, 6, [6, 7, 8, 13, 23, 28], 6]
[78, 3, [7, 9, 18], 3]	[78, 5, [2, 9, 14, 17, 20], 5]
[79, 1, [19], 1]	[79, 2, [2, 17], 2]
[80, 8, [0, 4, 8, 15, 23, 24, 28, 31], 8]	[80, 9, [0, 4, 5, 9, 16, 18, 20, 22, 27], 9]
[82, 17, [5, 8, 10, 11, 13, 15, 21, 24, 26, 34, 48, 49, 50, 52, 57, 58, 60], 8]	[82, 11, [5, 8, 15, 19, 20, 30, 36, 37, 39, 43, 54], 12]
[83, 4, [0, 10, 12, 23], 4]	[83, 5, [0, 1, 12, 19, 23], 5]
[85, 4, [1, 2, 5, 6], 6]	[85, 12, [1, 6, 9, 12, 18, 23, 25, 31, 36, 38, 40, 45], 12]
[86, 3, [0, 12, 20], 6]	[86, 8, [7, 9, 15, 18, 19, 21, 30, 33], 8]
[87, 6, [5, 20, 22, 23, 24, 46], 11]	[87, 13, [5, 6, 14, 15, 17, 21, 22, 26, 28, 29, 38, 41, 42], 13]
[89, 16, [1, 3, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 46, 47], 9]	[89, 12, [1, 2, 7, 8, 23, 27, 29, 30, 33, 36, 37, 43], 12]
[90, 17, [0, 10, 12, 15, 17, 23, 30, 31, 35, 39, 41, 43, 46, 53, 54, 55, 56], 17]	[90, 18, [4, 6, 10, 13, 15, 17, 23, 30, 35, 39, 41, 43, 45, 46, 47, 50, 51, 53], 18]
[93, 1, [0], 1]	[93, 3, [0, 8, 10], 3]
[95, 4, [1, 6, 7, 9], 6]	[95, 9, [1, 2, 3, 9, 14, 29, 33, 34, 35], 9]
[96, 10, [0, 9, 13, 17, 19, 20, 40, 49, 74, 75], 14]	[96, 20, [1, 13, 17, 19, 23, 24, 26, 33, 35, 37, 39, 46, 48, 49, 62, 71, 73, 76, 83, 85], 20]
[99, 5, [0, 2, 5, 10, 11], 2]	[99, 4, [0, 2, 5, 10], 4]
[102, 5, [1, 2, 11, 13, 23], 6]	[102, 6, [1, 12, 13, 17, 20, 25], 7]
[104, 8, [7, 11, 13, 14, 16, 18, 20, 27], 5]	[104, 6, [0, 6, 9, 11, 24, 25], 6]
[108, 10, [2, 5, 16, 21, 25, 26, 30, 33, 34, 36], 13]	[108, 15, [2, 5, 8, 9, 13, 14, 17, 22, 25, 26, 30, 37, 39, 43, 47], 15]
[109, 14, [0, 9, 13, 14, 15, 16, 17, 18, 20, 22, 24, 25, 27, 29], 4]	[109, 6, [9, 13, 15, 17, 20, 25], 6]
[113, 3, [0, 4, 19], 3]	[113, 5, [1, 4, 17, 19, 23], 5]
[114, 1, [5], 2]	[114, 4, [5, 6, 7, 9], 4]
[118, 4, [9, 11, 15, 19], 3]	[118, 6, [9, 11, 15, 17, 19, 20], 6]
[120, 6, [0, 3, 5, 6, 7, 19], 4]	[120, 10, [0, 2, 3, 10, 14, 21, 24, 28, 31, 35], 10]
[123, 10, [4, 5, 7, 8, 10, 11, 12, 14, 20, 24], 5]	[123, 6, [0, 4, 5, 6, 10, 16], 6]
[125, 12, [0, 10, 11, 13, 22, 23, 24, 25, 27, 32, 34, 35], 4]	[125, 10, [1, 10, 13, 17, 23, 29, 30, 31, 32, 36], 10]
[126, 2, [1, 5], 2]	[126, 3, [1, 8, 13], 4]
[127, 6, [6, 8, 9, 18, 19, 20], 5]	[127, 6, [6, 8, 9, 11, 12, 16], 6]
[131, 14, [4, 7, 13, 15, 16, 17, 23, 25, 26, 27, 29, 30, 35, 37], 11]	[131, 14, [4, 6, 14, 15, 17, 22, 23, 24, 26, 27, 40, 41, 44, 49], 14]
[135, 5, [1, 2, 12, 13, 14], 1]	[135, 3, [1, 3, 14], 3]
[136, 15, [0, 6, 14, 15, 17, 18, 20, 21, 22, 24, 25, 26, 28, 29, 31], 6]	[136, 7, [3, 5, 7, 15, 24, 29, 31], 7]
[140, 3, [16, 18, 24], 3]	[140, 6, [1, 7, 16, 17, 19, 21], 6]
[141, 7, [0, 13, 18, 19, 20, 21, 23], 3]	[141, 6, [1, 8, 17, 19, 20, 23], 6]
[145, 14, [0, 8, 16, 17, 18, 19, 22, 26, 27, 28, 29, 30, 31, 33], 5]	[145, 8, [0, 8, 14, 19, 25, 32, 36, 39], 8]
[147, 7, [0, 2, 3, 5, 6, 8, 14], 2]	[147, 4, [0, 2, 3, 5], 4]
[148, 11, [0, 2, 10, 15, 16, 19, 20, 28, 30, 31, 41], 10]	[148, 13, [0, 10, 14, 15, 23, 25, 26, 27, 32, 34, 36, 50, 53], 13]
[151, 5, [2, 4, 5, 34, 38], 5]	[151, 9, [2, 13, 16, 19, 23, 26, 28, 31, 34], 9]
[154, 10, [0, 2, 7, 12, 14, 15, 16, 22, 24, 26], 7]	[154, 11, [0, 2, 5, 6, 8, 19, 21, 30, 35, 40, 41], 11]
[155, 9, [3, 8, 9, 10, 11, 12, 13, 15, 17], 3]	[155, 5, [3, 4, 13, 20, 23], 5]
[159, 5, [1, 4, 6, 7, 8], 5]	[159, 8, [1, 7, 10, 13, 16, 25, 29, 35], 8]
[160, 9, [4, 6, 10, 18, 21, 23, 24, 25, 41], 8]	[160, 11, [4, 10, 18, 25, 26, 27, 31, 33, 34, 36, 39], 11]
[166, 17, [4, 9, 13, 29, 31, 32, 34, 36, 38, 39, 40, 41, 42, 44, 60, 61, 66], 10]	[166, 20, [4, 6, 13, 15, 19, 20, 32, 33, 39, 44, 45, 47, 50, 59, 70, 72, 74, 75, 88, 90], 20]
[170, 15, [1, 4, 5, 12, 13, 16, 17, 19, 20, 21, 25, 26, 27, 28, 29], 11]	[170, 12, [1, 4, 13, 15, 16, 17, 18, 19, 22, 26, 27, 28], 12]
[173, 4, [6, 18, 26, 29], 6]	[173, 10, [6, 11, 18, 26, 28, 29, 34, 36, 38, 43], 10]
[174, 4, [0, 2, 17, 20], 2]	[174, 3, [0, 12, 17], 3]
[177, 7, [0, 3, 6, 8, 11, 50, 51], 8]	[177, 13, [3, 6, 8, 20, 29, 40, 43, 51, 52, 55, 57, 61, 62], 13]
[180, 17, [0, 2, 5, 7, 12, 13, 14, 16, 22, 24, 26, 30, 33, 34, 36, 37, 38], 11]	[180, 18, [0, 5, 6, 7, 9, 11, 14, 24, 26, 28, 32, 36, 37, 42, 48, 57, 58, 61], 18]
[181, 2, [13, 15], 2]	[181, 7, [4, 6, 7, 13, 15, 18, 31], 7]
[183, 6, [3, 11, 13, 14, 22, 27], 6]	[183, 7, [3, 7, 11, 12, 14, 23, 27], 7]
[187, 7, [0, 9, 10, 11, 12, 18, 22], 9]	[187, 17, [0, 6, 12, 21, 22, 24, 35, 43, 48, 51, 53, 58, 65, 69, 72, 74, 75], 18]
[193, 6, [1, 6, 8, 9, 10, 22], 8]	[193, 12, [1, 5, 6, 10, 13, 17, 19, 21, 25, 27, 30, 40], 12]
[194, 15, [6, 14, 15, 16, 17, 20, 24, 25, 26, 27, 28, 29, 31, 47, 48], 6]	[194, 12, [6, 13, 15, 21, 24, 29, 38, 39, 41, 46, 47, 49], 12]
[201, 13, [9, 20, 21, 22, 24, 26, 29, 30, 31, 32, 33, 34, 35], 4]	[201, 9, [2, 9, 14, 16, 17, 18, 21, 23, 33], 9]
[206, 4, [3, 7, 9, 18], 4]	[206, 5, [0, 2, 3, 4, 10], 5]
[210, 5, [1, 6, 7, 12, 34], 8]	[210, 10, [1, 7, 12, 13, 15, 17, 22, 25, 34, 41], 10]
[213, 9, [0, 4, 5, 8, 10, 23, 24, 33, 44], 8]	[213, 11, [1, 2, 6, 10, 16, 17, 18, 26, 29, 39, 44], 11]
[216, 0, [], 0]	[216, 1, [3], 1]
[217, 4, [4, 19, 22, 24], 7]	[217, 13, [4, 6, 19, 20, 22, 25, 29, 31, 32, 34, 35, 44, 45], 13]
[221, 5, [20, 21, 22, 24, 28], 3]	[221, 7, [2, 4, 14, 15, 19, 25, 28], 7]
[229, 13, [5, 16, 17, 18, 20, 22, 25, 26, 27, 28, 29, 30, 32], 3]	[229, 8, [5, 18, 20, 23, 24, 25, 30, 32], 8]
[239, 15, [6, 33, 35, 37, 38, 39, 40, 44, 47, 49, 50, 51, 52, 55, 59], 13]	[239, 14, [6, 10, 11, 31, 33, 39, 48, 54, 57, 59, 63, 66, 67, 68], 15]
[245, 1, [4], 7]	[245, 9, [0, 4, 5, 12, 13, 18, 23, 29, 30], 9]
[247, 7, [0, 2, 9, 12, 14, 23, 26], 4]	[247, 7, [0, 2, 9, 10, 12, 14, 27], 7]
[251, 5, [0, 3, 5, 18, 19], 2]	[251, 6, [3, 4, 6, 13, 15, 16], 6]
[259, 11, [0, 1, 8, 10, 24, 42, 43, 51, 53, 54, 56], 13]	[259, 14, [0, 8, 10, 17, 21, 22, 29, 39, 41, 43, 45, 48, 56, 60], 14]
[263, 6, [0, 19, 22, 26, 29, 30], 4]	[263, 9, [2, 4, 11, 13, 15, 16, 18, 26, 30], 9]
[266, 3, [0, 6, 14], 3]	[266, 4, [0, 6, 7, 14], 4]
[267, 6, [2, 12, 16, 18, 19, 20], 3]	[267, 6, [2, 7, 12, 14, 15, 22], 6]
[271, 9, [5, 8, 14, 16, 17, 19, 21, 27, 28], 5]	[271, 6, [5, 9, 11, 13, 19, 23], 6]
[277, 4, [0, 8, 16, 18], 3]	[277, 6, [1, 8, 16, 18, 25, 26], 6]
[284, 2, [3, 5], 4]	[284, 5, [3, 4, 11, 16, 17], 5]
[292, 11, [1, 7, 8, 17, 24, 26, 29, 49, 74, 77, 78], 15]	[292, 17, [1, 7, 8, 15, 17, 22, 23, 31, 32, 36, 38, 40, 48, 53, 57, 63, 64], 17]
[297, 2, [11, 12], 2]	[297, 4, [5, 7, 11, 12], 4]
[300, 3, [7, 12, 20], 5]	[300, 6, [7, 12, 14, 16, 19, 20], 6]
[301, 15, [0, 6, 8, 17, 19, 20, 21, 23, 27, 28, 29, 30, 31, 32, 33], 5]	[301, 7, [6, 8, 9, 12, 22, 26, 29], 7]
[311, 7, [0, 2, 18, 33, 35, 41, 42], 3]	[311, 9, [11, 13, 18, 23, 29, 34, 36, 37, 38], 9]
[332, 19, [3, 5, 6, 10, 12, 18, 23, 37, 38, 41, 48, 56, 59, 62, 63, 64, 65, 71, 74], 21]	[332, 21, [3, 5, 6, 10, 12, 16, 18, 22, 33, 37, 38, 41, 48, 56, 59, 62, 63, 64, 65, 71, 74], 22]
[333, 3, [2, 15, 16], 3]	[333, 4, [2, 5, 15, 16], 4]



In [940]:

    
lines = [line for i,line in enumerate(open("sim_single_molecule_100_newDel")) if (i%3==1)]
c_lines = [line for i,line in enumerate(open("sim_single_molecule_100_efree")) if (i%3==1)]
lines_corrected = [line for i,line in enumerate(open("sim_single_molecule_100_corrected")) if (i%3==1)]
S = (lines[1] + "\n" + c_lines[1] + "\n" + lines_corrected[1])
print(S)
lines = S.split("\n")
print("\nErrored\n")
print("\n".join(lines[0].strip().split("\t")))
print("\nError Free\n")
print("\n".join(lines[2].strip().split("\t")))
print("\nCorrected\n")
print("\n".join(lines[4].strip().split("\t")))









    



	XhoI	XhoI	56.831	20.481	66.57	11.365	18.19	30.223	4.41	10.779	16.155	8.466	6.104	45.186	4.209	15.126	19.258	12.922	63.763	30.327

	XhoI	XhoI	58.202	9.739	10.36	64.508	11.884	18.627	28.582	1.557	2.506	1.913	11.641	15.44	8.697	6.17	45.312	4.218	15.062	20.234	12.854	12.396	48.964	30.026

	XhoI	XhoI	31.948	10.165	66.570	11.365	18.190	30.223	2.741	2.422	10.779	16.155	8.466	6.104	45.186	4.209	15.126	19.258	12.922	63.763	30.327


Errored

XhoI
XhoI
56.831
20.481
66.57
11.365
18.19
30.223
4.41
10.779
16.155
8.466
6.104
45.186
4.209
15.126
19.258
12.922
63.763
30.327

Error Free

XhoI
XhoI
58.202
9.739
10.36
64.508
11.884
18.627
28.582
1.557
2.506
1.913
11.641
15.44
8.697
6.17
45.312
4.218
15.062
20.234
12.854
12.396
48.964
30.026

Corrected

XhoI
XhoI
31.948
10.165
66.570
11.365
18.190
30.223
2.741
2.422
10.779
16.155
8.466
6.104
45.186
4.209
15.126
19.258
12.922
63.763
30.327



In [863]:

    
EC[2][2][3]









    Out[863]:





8



In [9]:

    
lines = [len(line.strip().split(" ")) for line in open("sim_single_molecule_100_elocations")]
count = 0
counttotal = 0
for i, line in enumerate(lines):
    if(line > EC[2][i][3]):
        counttotal+=line
        count += (line - EC[2][i][3])
print("Total: "+ str(counttotal) + "\ncorrected: " + str(count))









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-9-38282d674db5> in <module>()
      3 counttotal = 0
      4 for i, line in enumerate(lines):
----> 5     if(line > EC[2][i][3]):
      6         counttotal+=line
      7         count += (line - EC[2][i][3])

NameError: name 'EC' is not defined



In [4]:

    
lines = [line.strip().split(" ") for line in open("sim_single_molecule_100_elocations")]
lines_ef = [line.strip().split() for i,line in enumerate(open("sim_single_molecule_100_efree")) if (i%3==1)]
for line in lines:
    for i in line:









    



  File "<ipython-input-4-a19136fcf14d>", line 5
    
    ^
IndentationError: expected an indented block



In [17]:

    
def findInsertionDeletion(fname):
    deletions = 0
    insertions = 0
    for line in open(fname):
        deletions += int(line.strip().split(" ")[2])
        insertions += int(line.strip().split(" ")[6])
    print("Deletions: "+ str(deletions))
    print("Insertions: "+ str(insertions))



In [18]:

    
findInsertionDeletion("corrected")









    



Deletions: 860
Insertions: 1699



In [836]:

    
413/593.0









    Out[836]:





0.6964586846543002



In [841]:

    
# your code goes here
sum = 10
A = [3, 7, 2, 5, 6, 4]
D = {}
for a in A:
    D[a] = 1
for a in A:    
    if(D.has_key(sum - a)):
        print(str(a) + " " + str(sum - a))



In [14]:

    
def noOfFragments(line):
    return((len(line.strip().split("\t"))) - 2)

def findTotalErrors(file1, file1_w_e):
    errors = []
    e_count = 0
    lines = [line.strip() for i,line in enumerate(open(file1)) if (i%3==1)]
    lines_w_e = [line.strip() for i,line in enumerate(open(file1_w_e)) if (i%3==1)]
    for i in range(len(lines)):
        errors.append(noOfFragments(lines_w_e[i]) - noOfFragments(lines[i]))
    for e in errors:
        e_count += int(e)
    return(e_count)



In [15]:

    
findTotalErrors("sim_single_molecule_100_newDel","sim_single_molecule_100_efree")









    Out[15]:





5524



In [16]:

    
findTotalErrors("sim_single_molecule_100_corrected","sim_single_molecule_100_efree")









    Out[16]:





800



In [915]:

    
lines = S.split("\n")
print("\n".join(lines[1].strip().split("\t")))



In [912]:

    
A = ["ssa", "tsx"]
B = "sa"
print("\n".join(A))









    



ssa
tsx



In [920]:

    
lines[4]









    Out[920]:





'\tXhoI\tXhoI\t19.206\t64.657\t68.451\t82.015\t35.874\t11.383\t3.258\t6.948\t28.365\t19.128\t51.210\t38.302\t36.137\t78.898\t3.285\t16.386\t27.898\t14.684\t1.603\t58.576\t49.088\t24.635\t44.991\t38.679\t7.808\t4.952\t33.224\t18.560\t22.910\t10.520\t39.991\t38.868\t6.724\t4.591\t16.345\t61.072\t13.336\t12.346\t19.405\t8.636'



In [941]:

    
L = "-1 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 -1\n" + "9 12 12 13 16 20 10 19 22 20 22 20 20 18 21 15 10"
L = L.split("\n")
L = "\n".join(L[0].split(" ")) + "\n------------------ \n" +"\n".join(L[1].split(" "))
print L









    



-1
2
1
1
1
1
2
1
1
1
1
1
1
1
1
1
-1
------------------ 
9
12
12
13
16
20
10
19
22
20
22
20
20
18
21
15
10



In [1169]:



In [1028]:

    
map = dict()
A = ['b','a','d','b','b','b','c','a','b']
first = '0'
for a in A:
    if map.has_key(a):
        if(map[a][0] != '0'):
            map[map[a][0]] = [map[map[a][0]][0], map[a][1]]
            map[a] = ['0', first]   
            map[first] = [a,map[first][1]]
            first = a
    else:           
        if first != '0' :
            map[first] = [a, map[first][1]]
        map[a]= ['0', first]
        first = a
        
def giveLastN(map, n, first):
    cur = first
    while(n > 0 and cur != '0'):        
        print(cur)
        cur = map[cur][1]
        n-=1



In [1029]:

    
print map









    



{'a': ['b', 'c'], 'c': ['a', 'd'], 'b': ['0', 'a'], 'd': ['b', '0']}



In [1033]:

    
giveLastN(map, 1, first)



In [1025]:

    
first









    Out[1025]:





'd'



In [1043]:

    
class TradeTracker(object):
    def __init__(self):
        self.map = dict()
        self.first = '0'

    def addTrade(self, A):
        for a in A:
            if  self.map.has_key(a):
                if(self.map[a][0] != '0'):
                    self.map[self.map[a][0]] = [self.map[self.map[a][0]][0], self.map[a][1]]
                    self.map[a] = ['0', self.first]   
                    self.map[self.first] = [a,self.map[self.first][1]]
                    self.first = a
            else:
                if self.first != '0' :
                    self.map[self.first] = [a, self.map[self.first][1]]
                self.map[a]= ['0', self.first]
                self.first = a
        
    def giveLastN(self, n):
        cur = self.first
        while(n > 0 and cur != '0'):        
            print(cur)
            cur = self.map[cur][1]
            n-=1
            
A = ['b','a','d','b','b','b','c','a','b','a','a','k']
T = TradeTracker();
T.addTrade(A)
T.giveLastN(4)









    



k
a
b
c



In [1069]:

    
def nextRange(cur_range, bin_size, s_varience):
    if(cur_range < 2000):
        return(cur_range + bin_size)
    else:
        return(round(((double)(cur_range/1000.0) + 3 * math.sqrt(((float)(cur_range/1000.0)) * s_varience))*1000))



In [1067]:

    
# standard deviation with Bionano data
def signmaS(len):
    singma = (0.2*0.2) + len * 0.1 * (-0.1) + len*len*0.04*0.04
    return(singma)



In [1068]:

    
signmaS(0.3)









    Out[1068]:





0.037144



In [1094]:

    
quantizeList = []
def quantize(val, bin_size, s_varience):    
    if(val == 0):
        return;
    
    if(quantizeList.size() == 0):
        quantizeList.push_back(0.0)
        quantizeList.push_back(nextRange(0.0, bin_size, s_varience))

    for i in range(0, quantizeList.len):
        if(val < quantizeList[i]):        
            val = quantizeList[i-1];
            return;

    while(1):
        quantizeList.append(nextRange(quantizeList[len(quantizeList) - 1], bin_size, s_varience))
        if(val < quantizeList[len(quantizeList) - 1]):
            val = quantizeList[len(quantizeList) - 2]
            return



In [1094]:

    
len(quantizeList)



In [1095]:

    
quantize(1000, 300, )









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-1095-86a37a19f9cb> in <module>()
----> 1 quantize(1000, 300, )

TypeError: quantize() takes exactly 3 arguments (2 given)



In [1150]:

    
# standard deviation with Bionano data
def signmaS(len):
    singma = (0.2*0.2) + len * 0.1 * (-0.1) + len*len*0.04*0.04
    return(math.sqrt(singma))

def nextRange(pre_val, folds):
    S = signmaS(float(pre_val/1000.0))
    d = (2.0 * folds * S) * 1000
    return(int(pre_val + d))

quantizeList = []
def quantizeNew(val):
    folds = 3
    start = 500
    if(val <= start):
        return;
    if(len(quantizeList) == 0):
        quantizeList.append(start)
        quantizeList.append(nextRange(start, folds))
    
    for i in range(0, len(quantizeList)):
        if(val < quantizeList[i]):        
            val = quantizeList[i-1];
            return(val);
        
    while(1):
        quantizeList.append(nextRange(quantizeList[len(quantizeList) - 1], folds))
        if(val < quantizeList[len(quantizeList) - 1]):
            val = quantizeList[len(quantizeList) - 2]
            return(val)



In [1169]:

    
quantizeNew(20000)









    Out[1169]:





19012



In [1167]:

    
signmaS(0.5)









    Out[1167]:





0.1881488772222678

================================================



In [2]:

    
# standard deviation with Bionano data
def signmaS(len):
    singma = (0.2*0.2) + len * 0.1 * (-0.1) + len*len*0.04*0.04
    return(singma)

handle = open("/s/fir/a/nobackup/data/ECOLI_Reference_Genome/ecoli.fa", "rU")
for record in Bio.SeqIO.parse(handle, "fasta"):
    ref_seq = record.seq
frag_sizes = [len(seq) for seq in Bio.Restriction.XhoI.catalyse(ref_seq)]

def subseq(seq, limits):
    x,y = limits
    if x < y : return seq[x:y]
    return seq[x:len(seq)] + seq[0:y]

molecules = []
for i in range(200):
    breaks = []
    num_breaks = 10 #random.randint(20,50)
    for j in range(num_breaks):
        breaks.append(random.randint(0,len(ref_seq) - 1))
    breaks.sort()        
    pairs = [(x,y) for x,y in zip(breaks[:-1], breaks[1:])] + [(breaks[-1], breaks[0])]
    molecules += [[subseq(ref_seq, pair), pair] for pair in pairs]



In [3]:

    
def generateRandomList(listv, count):
    randomList = []
    randomListIndex = []
    if(count >= len(listv) + 1):
        return(randomListIndex, randomList)
    if(count <= 0):
        return(randomListIndex, randomList)
    if(len(listv) == 0):
        return(randomListIndex, randomList)
    
    index = range(0, len(listv))
    for c in range(0, count):
        i = random.choice(index)
        randomList.append(listv[i])
        randomListIndex.append(i)
        index.remove(i)
    
    randomListIndex, randomList = zip(*sorted(zip(randomListIndex,randomList)))
    return(randomListIndex, randomList)

def deleteRandomly(listv, count):
    deleted = generateRandomList(listv, count)
    deleted_index = list(deleted[0])
    deleted_values = list(deleted[1])    
    for d in deleted_values:
        listv.remove(d)
    return(listv, deleted_index, deleted_values)

def insertVal(A, val):
    idx = A.index(val)
    if(idx + 1 != len(A)):
        A.insert(idx + 1, random.randint(A[idx], A[idx + 1] ))
    else:
        A.insert(idx, random.randint(A[idx-1], A[idx]))
        
def addfalsepositive(listv, count):
    if(count == 0):
        return(listv, count)
    sel_cites,sel_val = generateRandomList(listv, count)
    for val in sel_val:
        insertVal(listv, val)
    return(listv, count)

def sim(mol):
    "Return a simulated set of fragment sizes in bp for a Bio.SeqIO seq"    
    sites = [biosite - 1 for biosite in Bio.Restriction.XhoI.search(mol)]
    e_f_sites = list(sites);    
    sites, dindex, dsites = deleteRandomly( sites, int(round(len(sites)/6.66)))
    sites, ins_err = addfalsepositive(sites, int(round(len(mol)/100000.0)))
    allsites = [0] + sites + [len(mol)]
    e_f_allsites = [0] + e_f_sites + [len(mol)]    
    # TODO add random break sites
    sizes = [y - x for x,y in zip(allsites[:-1], allsites[1:])]
    e_f_sizes = [y - x for x,y in zip(e_f_allsites[:-1], e_f_allsites[1:])]
    sizes = [size/1000.0 for size in sizes] # convert to kb
    e_f_sizes = [size/1000.0 for size in e_f_sizes] # convert to kb    
    sizes = [size + random.gauss(0, math.sqrt(signmaS(size))) for size in sizes if size > 0.5]
    sizes = [size for size in sizes if size > 0.5]
    e_f_sizes = [size for size in e_f_sizes if size > 0.5]
    return (sizes, e_f_sizes, dindex, ins_err)



In [4]:

    
filename = "sim_single_molecule_100"
m = open(filename+"_newDel","w")
mwe = open(filename+"_efree","w")
efile  = open(filename+"_elocations","w")

start = 0
end = 0
length_of_original = []
length_of_errored = []
accum = 0
tot_ins_err = 0
tot_del_err = 0
for i, mol in enumerate((molecule for molecule in molecules if len(molecule[0]) > 250000)):
    simulated = sim(mol[0])      
    lens = [str(round(frag,3)) for frag in simulated[0]]
    if len(lens) < 10: continue
    map_name = "map_" + str(i) + "\n"
    
    lens_e_free = [str(round(frag,3)) for frag in simulated[1]]
    length_of_errored.append(len(lens))
    length_of_original.append(len(lens_e_free))
    #if i == 100: break
    tot_ins_err += simulated[3]
    tot_del_err += len(simulated[2])
    
    if(start == 0 and end ==0):
        start = mol[1][0]
        end = mol[1][1]
    
#    elif(start <= mol[1][0] and mol[1][0] >= end):
#    if((start <= mol[1][0] and mol[1][0] <= end) or (start <= mol[1][1] and mol[1][1] <= end) or (start >= mol[1][0] and mol[1][1] >= end)):
    m.write(map_name)
    m.write("\tXhoI\tXhoI\t" + "\t".join(lens) + "\n")
    m.write("\n")

    mwe.write(map_name)
    mwe.write("\tXhoI\tXhoI\t" + "\t".join(lens_e_free) + "\n")
    mwe.write("\n")

    efile.write(' '.join([str(s) for s in simulated[2]]))
    efile.write("\n")
"""        
    elif(start <= mol[1][1] and mol[1][0] >= end):
        m.write("map_" + str(i) + "\n")
        m.write("\tXhoI\tXhoI\t" + "\t".join(lens) + "\n")        
        m.write("\n")
        
        mwe.write(map_name)
        mwe.write("\tXhoI\tXhoI\t" + "\t".join(lens_e_free) + "\n")
        mwe.write("\n")
                
        efile.write(' '.join([str(s) for s in simulated[2]]))
        efile.write("\n")
"""
m.close()
mwe.close()
efile.close()



In [92]:

    
print(tot_ins_err)
print(tot_del_err)



In [9]:

    
sim(molecules[8][0])









    Out[9]:





([16.661084748208314,
  43.70944988565148,
  36.68483729040059,
  6.166476837458508,
  2.2833071155421356,
  4.678181974795673,
  28.39280830620119,
  4.72748533400852,
  6.349912406371118,
  11.665345742826457,
  11.98442646820368,
  5.1987928289760035,
  10.29788725160603,
  21.450466784232596,
  20.80503321462935,
  45.539693030552435,
  16.693412325513393,
  11.868400426718367,
  3.0677588944895344,
  1.5738562963077127,
  11.069184117362395,
  3.697835722660505,
  17.868432926059306,
  40.853293336586944,
  6.515085925338014,
  6.736002721819806,
  32.26512551441425,
  8.038889568497952,
  16.315186447704882,
  0.9780097524662003,
  26.1015410585528],
 [16.373,
  44.498,
  39.605,
  8.075,
  4.687,
  32.319,
  18.488,
  17.26,
  10.482,
  42.73,
  37.485,
  6.481,
  29.06,
  4.547,
  14.475,
  60.323,
  13.128,
  12.437,
  19.199,
  8.085,
  17.715,
  25.914],
 [10, 17, 18],
 13)



In [57]:

    
A = []
deleteRandomly(A,7)
#random.choice()









    Out[57]:





([], [], [])



In [153]:

    
A = [43196, 225524, 257866, 272748]
addfalsepositive(A, 1)
print(A)









    



[43196, 91066, 225524, 257866, 272748]



In [115]:

    
A = [43196, 225524, 257866, 272748]

        
insertVal(A, 43196)



In [116]:

    
A









    Out[116]:





[43196, 82743, 225524, 257866, 272748]



In [82]:

    
len(A)









    Out[82]:





4



In [163]:

    
round(len(molecules[5][0])/100000.0)









    Out[163]:





3.0



In [6]:

    
!/s/oak/b/nobackup/darshanw/valuev_optmap_alignment-master/ovlp/ovlp









    



Usage:
ovlp <Map_File> <Ouput> <Range num1> <Range num2>



In [139]:

    
def createExcel(copies, tot_del, tol_ins, in_file):
    lines = [line for line in open(in_file)]    
    print("Copies,K,MIN COMMON K IN READS (m),MIN_CONSENSUS (d),Deletions, Insertions,Reads aligned (B),Avg S-score (B),Reads aligned (A),Avg S-score (A)")            
    for i in range(0, len(lines)/7):
        k = lines[0+i*7].strip().split(" ")[1]
        m = lines[0+i*7].strip().split(" ")[4]
        d = lines[0+i*7].strip().split(" ")[8]    
        no_algn_r_B = lines[1+i*7].strip().split(":")[1].strip().split("(")[0]
        tot_r = lines[1+i*7].strip().split(":")[1].strip().split("(")[1].split(")")[0]
        avg_S_B = lines[2+i*7].strip().split(":")[1].strip()
        no_algn_r_A = lines[3+i*7].strip().split(":")[1].strip().split("(")[0]
        avg_S_A = lines[4+i*7].strip().split(":")[1].strip()
        deletion = lines[5+i*7].strip().split(":")[1].strip()
        insertion = lines[6+i*7].strip().split(":")[1].strip()
        print(str(copies)+","+str(k)+","+str(m)+","+str(d)+","+str(deletion)+"["+tot_del+"]"+","+str(insertion)+"["+tol_ins+"]"+","+str(no_algn_r_B)+"["+str(tot_r)+"],"+str(avg_S_B)+","+str(no_algn_r_A)+"["+str(tot_r)+"],"+str(avg_S_A))



In [140]:

    
createExcel("100", "2364", "1005", "/s/oak/b/nobackup/darshanw/COmap/test/with-100")









    



Copies,K,MIN COMMON K IN READS (m),MIN_CONSENSUS (d),Deletions, Insertions,Reads aligned (B),Avg S-score (B),Reads aligned (A),Avg S-score (A)
100,3,3,3,871[2364],674[1005],406[569],83.6360800493,350[569],74.4751637143
100,3,3,4,725[2364],542[1005],406[569],83.6360800493,364[569],76.1814417582
100,3,4,3,722[2364],521[1005],406[569],83.6360800493,373[569],75.1897680965
100,3,4,4,591[2364],409[1005],406[569],83.6360800493,379[569],77.4527548813
100,3,5,3,602[2364],409[1005],406[569],83.6360800493,377[569],76.4011779841
100,3,5,4,452[2364],313[1005],406[569],83.6360800493,386[569],78.6214935233
100,3,6,3,465[2364],311[1005],406[569],83.6360800493,381[569],78.1248698163
100,3,6,4,339[2364],222[1005],406[569],83.6360800493,393[569],79.8416101781
100,4,3,3,541[2364],392[1005],406[569],83.6360800493,376[569],77.6900329787
100,4,3,4,410[2364],284[1005],406[569],83.6360800493,381[569],80.5583664042
100,4,4,3,365[2364],260[1005],406[569],83.6360800493,382[569],79.4010798429
100,4,4,4,233[2364],174[1005],406[569],83.6360800493,390[569],81.1712607692
100,4,5,3,202[2364],162[1005],406[569],83.6360800493,392[569],80.6711729592
100,4,5,4,108[2364],91[1005],406[569],83.6360800493,401[569],82.3390167082
100,4,6,3,118[2364],86[1005],406[569],83.6360800493,397[569],81.6632745592
100,4,6,4,46[2364],42[1005],406[569],83.6360800493,405[569],82.9071224691
100,5,3,3,251[2364],177[1005],406[569],83.6360800493,387[569],80.9344640827
100,5,3,4,143[2364],108[1005],406[569],83.6360800493,395[569],82.3741908861
100,5,4,3,140[2364],111[1005],406[569],83.6360800493,396[569],81.7156444444
100,5,4,4,53[2364],58[1005],406[569],83.6360800493,402[569],82.6482666667
100,5,5,3,69[2364],53[1005],406[569],83.6360800493,403[569],82.4090076923
100,5,5,4,19[2364],22[1005],406[569],83.6360800493,406[569],83.0508559113
100,5,6,3,28[2364],20[1005],406[569],83.6360800493,406[569],83.2058327586
100,5,6,4,8[2364],10[1005],406[569],83.6360800493,406[569],83.4566896552



In [187]:

    
A = """344[2049]"""



In [188]:

    
A = A.split("\n")
for i in range(0, len(A)):    
    val = A[i].strip().split("[")[0]
    tot = A[i].strip().split("[")[1].split("]")[0]
    percent = (float(val)/float(tot))*100.0
    percent = round(percent,2)
    print(str(percent)+"% ["+str(val)+"]")









    



16.79% [344]



In [149]:

    
A[0]









    Out[149]:





'871[2364]'



In [ ]:

    
for i in range(0, 1000000000):
    i = i + 10
    i = i - 10



In [457]:

    
A = """map_0
	XhoI	XhoI	42.775	51.518	16.481	37.904	4.946	35.071	17.61	58.586	28.871	14.231	71.874	55.145	2.763	37.516	2.107	3.698	2.894	16.741	60.041	36.938	19.058	4.897	32.535	3.642	4.556	17.626	3.007	12.754	28.213	26.3	33.522	9.614	27.279	28.887	13.417	2.786	21.812	24.018	7.009	14.835	72.12	9.739	10.36	64.508	11.884	18.627	28.582	1.557	2.506	1.913	11.641	15.44	8.697	6.17	45.312	4.218	15.062	20.234	12.854	12.396	48.964	34.111	55.638	56.159	48.165	2.099	13.095	7.375

"""
B = """map_0
	XhoI	XhoI	44.617	51.091	16.655	38.27	4.801	34.72	18.1	86.118			14.007	69.636	55.972	37.284			2.474	3.766	2.598	16.698	60.875	36.399	18.954	5.025	32.848	7.911	20.872	13.18	26.291	4.137	22.175	44.452	28.029	29.394	11.691	1.801	23.03	23.284	7.059	14.563	69.41	9.619	72.232	11.745	18.805	30.361	1.501	2.471	1.718	11.645	14.989	14.565	42.496	19.018	18.792	9.871	2.773	60.874	32.248	59.05	40.6	15.116	53.477	15.904	7.436

"""
L = "1,11,14,17 : 2"
def countContinuous(V):    
    count = 0;
    V = [int(v) for v in V]
    for i in range(0,len(V)):
        if(i == 0):
            vprev = int(V[i])
        else:
            if(V[i] == (vprev + 1)):
                count+=1
                print("I'm here")
            vprev = V[i]
    return(count)

e_f = A.strip().split("\n")[1].strip().split("\t")[2:]
w_e = B.strip().split("\n")[1].strip().split("\t")[2:]

deletion = L.strip().split(":")[0].strip().split(",")
insertion = L.strip().split(":")[1].strip().split(",")

print(len(e_f))
print(len(w_e))
print(len(e_f) - len(w_e))
print(len(deletion))
print(countContinuous(deletion))
print(len(insertion))
print(countContinuous(insertion))

def findTrueErrorLocation(E):
    deletion = E.strip().split(":")[0].strip().split(",")
    insertion = E.strip().split(":")[1].strip().split(",")
    insertion = [ (int(insertion[i])+i) for i in range(0, len(insertion))]
    deletion = [ (int(deletion[i])-i) for i in range(0, len(deletion))]
    for z in range(0,len(deletion)):
        less = [i for i in insertion if i < deletion[z]]
        deletion[z] = deletion[z] + len(less)        
    return(deletion, insertion)

findTrueErrorLocation(L)









    



68
65
3
4
0
1
0






    Out[457]:





([1, 11, 13, 15], [2])



In [307]:

    
import random 
def insertVal(A, val):
    idx = A.index(val)
    if(idx + 1 != len(A)):
        A.insert(idx + 1, random.randint(A[idx], A[idx + 1] ))
    else:
        A.insert(idx, random.randint(A[idx-1], A[idx]))



In [351]:

    
A = [10,20,30,40,50,60,70,80,90]
insertVal(A, 10)
insertVal(A, 20)
for a in A:
    if(a ==50):
        continue
A =  [if(a == 50) else a for a in A ]
A









    



  File "<ipython-input-351-b909ad99ed5e>", line 4
    A =  [continue if(a == 50) else a for a in A ]
                 ^
SyntaxError: invalid syntax



In [316]:

    
def insertVal(A, val):
    idx = A.index(val)
    if(idx == 0):
        A.insert(idx, random.randint(0, A[idx]))
    else:
        A.insert(idx, random.randint((A[idx - 1 ]), A[idx]))



In [766]:

    
A = """Base read aligned to: 11 map_793
Starting at: 6 -> 0 0 0 0 0 0 0 0 0 0 2 1 1 1 1 1 
Base read aligned to: 6 map_331
Starting at: 3 -> -1 -1 1 8 8 2 1 1 1 2 1 -1 1 1 1 1 
Base read aligned to: 2 map_447
Starting at: 27 -> -1 1 -1 1 3 8 8 1 1 2 3 1 1 1 1 1 
Base read aligned to: 3 map_115
Starting at: 0 -> -1 1 -1 1 3 1 1 1 1 1 -1 1 1 1 0 0 
Base read aligned to: 8 map_1579
Starting at: 0 -> 0 -1 -1 -1 2 1 1 1 1 8 8 1 1 1 2 1 
Base read aligned to: 12 map_19
Starting at: 15 -> -1 1 1 1 -1 4 1 1 -1 1 1 1 1 1 1 1 
Base read aligned to: 1 map_1530
Starting at: 11 -> 0 0 0 0 0 1 1 1 1 1 1 1 1 1 2 1 
Base read aligned to: 13 map_1179
Starting at: 12 -> 0 -1 -1 1 1 2 1 1 1 1 1 -1 1 1 1 1 
Base read aligned to: 7 map_955
Starting at: 0 -> -1 1 -1 1 3 1 -1 1 1 1 1 1 1 1 2 1 
Base read aligned to: 10 map_175
Starting at: 9 -> 0 0 0 0 0 1 -1 1 1 1 1 1 1 1 1 1 
Base read aligned to: 9 map_136
Starting at: 0 -> -1 1 -1 1 2 8 8 1 1 1 1 1 1 1 1 1 
Base read aligned to: 4 map_2320
Starting at: 17 -> -1 1 8 8 2 2 1 1 1 1 2 1 1 1 1 1 
Base read aligned to: 5 map_919
Starting at: 0 -> 8 8 -1 1 2 1 1 1 1 1 2 1 1 1 -1 1
"""



In [769]:

    
from collections import Counter
def getCountStat(A, loc):
    alignment = A.strip().split("\n")
    alignment = [alignment[i] for i in range(0, len(alignment)) if i%2 == 1]
    countLoc = []
    for line in alignment:
        line = line.strip().split("->")[1]
        line = line.strip().split(" ")
        countLoc.append(line[loc])
    print(Counter(countLoc))
    print(len(alignment))
getCountStat(A, 1)









    



Counter({'1': 6, '0': 3, '-1': 3, '8': 1})
13



In [495]:

    
def createExcel(copies, tot_del, tot_ins, in_file):
    lines = [line for line in open(in_file)]    
    print("Copies,K,MIN COMMON K IN READS (m),MIN_CONSENSUS (d),Deletions,True +ve (Del),Insertions,True +ve (Ins),Reads aligned (B),Avg S-score (B),Reads aligned (A),Avg S-score (A)")            
    linepset = 11
    for i in range(0, len(lines)/linepset):
        k = lines[0+i*linepset].strip().split(" ")[1]
        m = lines[0+i*linepset].strip().split(" ")[4]
        d = lines[0+i*linepset].strip().split(" ")[8]    
        no_algn_r_B = lines[1+i*linepset].strip().split(":")[1].strip().split("(")[0]
        tot_r = lines[1+i*linepset].strip().split(":")[1].strip().split("(")[1].split(")")[0]
        avg_S_B = round(float(lines[2+i*linepset].strip().split(":")[1].strip()),3)
        no_algn_r_A = lines[3+i*linepset].strip().split(":")[1].strip().split("(")[0]
        avg_S_A = round(float(lines[4+i*linepset].strip().split(":")[1].strip()), 3)
        no_algn_r_A_per = round(((int(no_algn_r_A)*100.0)/int(no_algn_r_B)),2)
        deletion = lines[5+i*linepset].strip().split(":")[1].strip()
        insertion = lines[6+i*linepset].strip().split(":")[1].strip()
        deletion_per = round(((float(deletion)/float(tot_del))*100.0),2)
        insertion_per = round(((float(insertion)/float(tot_ins))*100.0),2)
        correctd = lines[7+i*linepset].strip().split(":")[1].strip()
        incorrectd = lines[8+i*linepset].strip().split(":")[1].strip()
        correcti = lines[9+i*linepset].strip().split(":")[1].strip()
        incorrecti = lines[10+i*linepset].strip().split(":")[1].strip()
        correctdper = round((int(correctd)*100.0)/int(deletion),2)
        correctiper = round((int(correcti)*100.0)/int(insertion),2)
        print(str(copies)+","+str(k)+","+str(m)+","+str(d)+","+str(deletion_per)+"% ["+str(deletion)+"],"+str(correctdper)+"% ["+str(correctd)+"],"+str(insertion_per)+"% ["+str(insertion)+"],"+str(correctiper)+"% ["+str(correcti)+"],"+str(no_algn_r_B)+"["+str(tot_r)+"],"+str(avg_S_B)+","+str(no_algn_r_A_per)+"% ["+str(no_algn_r_A)+"],"+str(avg_S_A))



In [496]:

    
createExcel("500", "11872", "5077", "/s/oak/b/nobackup/darshanw/COmap/test/with-100")









    



Copies,K,MIN COMMON K IN READS (m),MIN_CONSENSUS (d),Deletions,True +ve (Del),Insertions,True +ve (Ins),Reads aligned (B),Avg S-score (B),Reads aligned (A),Avg S-score (A)
500,3,3,3,60.34% [7163],73.82% [5288],100.26% [5090],53.67% [2732],1892[2757],86.836,97.83% [1851],81.611
500,3,3,4,56.87% [6752],74.14% [5006],92.53% [4698],54.83% [2576],1892[2757],86.836,98.78% [1869],82.322
500,3,3,5,54.13% [6426],74.4% [4781],86.59% [4396],55.55% [2442],1892[2757],86.836,99.84% [1889],82.87
500,3,3,6,51.72% [6140],74.36% [4566],81.31% [4128],56.32% [2325],1892[2757],86.836,100.53% [1902],83.264



In [502]:

    
for i in range(0, 10000):
    for j in range(0, 10000):
        a = i + j
print("Done as Always!!")









    



Done as Always!!



In [719]:

    
R = """map_0
	XhoI	XhoI	2.818	53.177	39.813	1.623	4.768	35.427	28.231	45.115	51.285	51.683	9.39	7.196	39.674	2.666	1.943	35.23	18.15	59.354	27.53	13.615	69.173	56.923	2.672	20.22

"""

print(R)

print(reverseOMRead(R))









    



map_0
	XhoI	XhoI	2.818	53.177	39.813	1.623	4.768	35.427	28.231	45.115	51.285	51.683	9.39	7.196	39.674	2.666	1.943	35.23	18.15	59.354	27.53	13.615	69.173	56.923	2.672	20.22


map_0
	XhoI	XhoI	20.22	2.672	56.923	69.173	13.615	27.53	59.354	18.15	35.23	1.943	2.666	39.674	7.196	9.39	51.683	51.285	45.115	28.231	35.427	4.768	1.623	39.813	53.177	2.818



In [715]:

    
def reverseOMRead(R):
    NR = R.strip().split("\n")[0]    
    NR = NR +"\n"
    NR = NR +"\t"+ "\t".join(R.strip().split("\n")[1].strip().split("\t")[0:2]) + "\t"
    NR = NR + "\t".join(R.strip().split("\n")[1].strip().split("\t")[2:][::-1])
    NR = NR + "\n\n"
    return(NR)

def addReverseReads(fname, sname):
    lines  = [line for line in open(fname)]
    slines = [line for line in open(sname)]
    f = open(fname, 'w')
    s = open(sname, 'w')
    for i in range(0,(len(lines)/3)):
        R = lines[i*3 + 0]
        R = R + lines[i*3 + 1]
        R = R + lines[i*3 + 2]
        
        S = slines[i*3 + 0]
        S = S + slines[i*3 + 1]
        S = S + slines[i*3 + 2]
        
        if(bool(random.getrandbits(1)) == True):
            f.write(reverseOMRead(R))
            s.write(reverseOMRead(S))
        else:
            f.write(R)
            s.write(S)
    f.close()



In [716]:

    
addReverseReads("sim_single_molecule_100_newDel","sim_single_molecule_100_efree")



In [652]:

    
t = 0
f = 0
for i in range (0, 100000):
    if(bool(random.getrandbits(1)) == True):
        t+=1
    else:
        f+=1
t









    Out[652]:





50147



In [704]:

    
2+6









    Out[704]:





8



In [720]:

    
A = [1, 2, 3, 4, 5, 6]



In [724]:

    
A[1:-1]









    Out[724]:





[2, 3, 4, 5]



In [726]:

    
def findTrueErrorLocation(E):
    deletion = E.strip().split(":")[0].strip().split(",")
    insertion = E.strip().split(":")[1].strip().split(",")
    insertion = [ (int(insertion[i])+i) for i in range(0, len(insertion))]
    deletion = [ (int(deletion[i])-i) for i in range(0, len(deletion))]
    for z in range(0,len(deletion)):
        less = [i for i in insertion if i < deletion[z]]
        deletion[z] = deletion[z] + len(less)        
    return(deletion, insertion)



In [765]:

    
print(findTrueErrorLocation("2,4,10 : 0,1"))
print("+0 +2 -4 ")









    



([3, 5, 10], [0, 2])
+0 +2 -4



In [741]:

    
def howManyCorrected(dactual, iactual, dcorrected, icorrected):
    correctlyCorrectedDeletions = 0
    correctlyCorrectedInsertions = 0
    incorrectlyCorrectedDeletions = 0
    incorrectlyCorrecteInsertions = 0    
    for d in dcorrected:        
        for delloc in d[1]:
            if delloc in dactual[d[0]][1]:
                correctlyCorrectedDeletions+=1
            else:
                print(delloc)
                incorrectlyCorrectedDeletions+=1                
    for i in icorrected:
        for insloc in i[1]:
            if insloc in iactual[i[0]][1]:
                correctlyCorrectedInsertions+=1
            else:
                incorrectlyCorrecteInsertions+=1                
    return(correctlyCorrectedDeletions,correctlyCorrectedInsertions,incorrectlyCorrectedDeletions,incorrectlyCorrecteInsertions)



In [742]:

    
dactual = []
iactual = []
dcorrected = []
icorrected  = []
dactual.append([0, [5, 8, 16, 16, 28]])
iactual.append([0, [17, 25]])
dcorrected.append([0, [5,8,16,17]])
icorrected.append([0, [17, 25]])
howManyCorrected(dactual, iactual, dcorrected, icorrected)









    



17






    Out[742]:





(3, 2, 1, 0)



In [747]:

    
import subprocess as sp
sp.call('ls')









    Out[747]:





0



In [819]:

    
from timeit import timeit
prg_path = "/s/oak/b/nobackup/darshanw/COmap/"
prg = prg_path + "bin/COmap"
in_file = prg_path + "test/sim_single_molecule_100_newDel"
out_file = prg_path + "test/output/sim_single_molecule_100_corrected"
eloc_file = prg_path + "test/sim_single_molecule_100_elocations"
e_free = prg_path + "test/sim_single_molecule_100_efree"
eloc_corrected = prg_path + "test/output/corrected"
k = 3
d = 3
m = 3
copies = 400
out_file = out_file+"_k"+str(k)+"m"+str(m)+"d"+str(d)+"_"+str(copies)
eloc_corrected = eloc_corrected+"_k"+str(k)+"m"+str(m)+"d"+str(d)+"_"+str(copies)
print timeit(stmt = "sp.call([prg, "-x","-z", "-k", str(k), "-m", str(m), "-d", str(d), "-f",in_file], stdout = open(out_file, "wb"), stderr=sp.STDOUT)", setup = "import subprocess", number = 1)









    



  File "<ipython-input-819-291383618567>", line 15
    print timeit(stmt = "sp.call([prg, "-x","-z", "-k", str(k), "-m", str(m), "-d", str(d), "-f",in_file], stdout = open(out_file, "wb"), stderr=sp.STDOUT)", setup = "import subprocess", number = 1)
                                            ^
SyntaxError: invalid syntax



In [827]:

    
from timeit import timeit
#aa = '-l'
#stmt = "subprocess.call(['ls','"+aa+"'])"
#k = 3
#stmt = "subprocess.call(['"+prg+"', '-x','-z', '-k', '"+str(k)+"', '-m', '"+str(m)+"', '-d', '"+str(d)+"', '-f','"+in_file+"'], stdout = open(out_file, 'wb'), stderr=sp.STDOUT)"
prg_path = "/s/oak/b/nobackup/darshanw/COmap/"
prg = prg_path + "bin/COmap"
in_file = prg_path + "test/sim_single_molecule_100_newDel"
out_file = prg_path + "test/output/sim_single_molecule_100_corrected"
eloc_file = prg_path + "test/sim_single_molecule_100_elocations"
e_free = prg_path + "test/sim_single_molecule_100_efree"
eloc_corrected = prg_path + "test/output/corrected"
k = 8
d = 6
m = 3
copies = 400
time = (timeit(stmt = "subprocess.call(['"+prg+"', '-x','-z', '-k', '"+str(k)+"', '-m', '"+str(m)+"', '-d', '"+str(d)+"', '-f','"+in_file+"'], stdout = open('"+out_file+"', 'wb'), stderr=subprocess.STDOUT)", setup = "import subprocess", number = 1))
print(time)









    



2.83635997772



In [8]:

    
%install_ext https://raw.githubusercontent.com/meduz/ipython_magics/master/tikzmagic.py









    



Installed tikzmagic.py. To use it, type:
  %load_ext tikzmagic



In [13]:

    
%reload_ext tikzmagic



In [14]:

    
%%tikz

%# the number line
\draw [thick,dashed] (-1,0) -- (0,0);
\draw [thick](0,0) -- (9,0);

%# The bucket boundaries
\draw [thick](0,-.4) -- (0, .4);
\draw [thick](2,-.4) -- (2, .4);
\draw [thick](5,-.4) -- (5, .4);
\draw [thick](9,-.4) -- (9, .4);

%# Labels for the bucket boundaries
\node[below] at (0,-.4) {$U(n-3)$};
\node[below] at (2,-.4) {$U(n-2)$};
\node[below] at (5,-.4) {$U(n-1)$};
\node[below] at (9,-.4) {$U(n)$};

%# A hypothetical ideal actual fragment size centered in a bucket
\draw (7,-.2) -- (7, .2);



%# delta measurements
\draw[|<->|] (5,.6) -- (7, .6);
\node[above] at (6,.6) {$\delta$};
\draw[|<->|] (7,.6) -- (9, .6);
\node[above] at (8,.6) {$\delta$};


%# normal code, taken from http://tex.stackexchange.com/questions/43610/plotting-bell-shaped-curve-in-tikz-pgf
%# define normal distribution function 'normaltwo'
\def\normaltwo{\x,{4*1/exp(((\x-3)^2)/2)}}

%# input y parameter
\def\y{4.4}

%# this line calculates f(y)
\def\fy{4*1/exp(((\y-3)^2)/2)}


%# Draw and label normal distribution function
\draw[color=blue,domain=0:6,shift={(4,-5)}] plot (\normaltwo) node[right] {};









    



LaTeX terminated with signal -1
No image generated.






    





This is pdfTeX, Version 3.14159265-2.6-1.40.15 (TeX Live 2014) (preloaded format=pdflatex 2016.2.26)  16 APR 2016 14:06
entering extended mode
 \write18 enabled.
 %&-line parsing enabled.
**tikz.tex
(./tikz.tex
LaTeX2e <2011/06/27>
Babel <3.9k> and hyphenation patterns for 2 languages loaded.

! LaTeX Error: File `standalone.cls' not found.

Type X to quit or <RETURN> to proceed,
or enter new name. (Default extension: cls)

Enter file name: 
! Emergency stop.
<read *> 
         
l.3 \usepackage
               {xcolor}^^M
End of file on the terminal!

 
Here is how much of TeX's memory you used:
 10 strings out of 495025
 184 string characters out of 6181515
 45959 words of memory out of 5000000
 3322 multiletter control sequences out of 15000+600000
 3640 words of font info for 14 fonts, out of 8000000 for 9000
 14 hyphenation exceptions out of 8191
 10i,0n,7p,96b,8s stack positions out of 5000i,500n,10000p,200000b,80000s
!  ==> Fatal error occurred, no output PDF file produced!



In [2]:

    
import subprocess
import time



In [3]:

    
subprocess.Popen('ls')









    Out[3]:





<subprocess.Popen at 0x7ff2a0824e10>



In [14]:

    
comb = [[2,2,2],[3,2,2],[4,2,2],[5,2,2],[6,2,2],[7,2,2],[8,2,2], [3,3,2], [3,4,2], [3,5,2], [3,6,2], [3,7,2], [3,8,2], [3,2,3], [3,2,4], [3,2,5], [3,2,6], [3,2,7], [3,2,8]]



In [15]:

    
for k,m,d in A:
    print(str(k)+" "+str(m)+" "+str(d))



In [26]:

    
!wget ftp://climb.genomics.cn/pub/10.5524/100001_101000/100084/plum.OM.raw/14088LA.maps









    



--2016-05-28 11:58:47--  ftp://climb.genomics.cn/pub/10.5524/100001_101000/100084/plum.OM.raw/14088LA.maps
           => ‘14088LA.maps’
Resolving climb.genomics.cn (climb.genomics.cn)... 218.188.108.76
Connecting to climb.genomics.cn (climb.genomics.cn)|218.188.108.76|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/10.5524/100001_101000/100084/plum.OM.raw ... done.
==> SIZE 14088LA.maps ... 15430424
==> PASV ... done.    ==> RETR 14088LA.maps ... done.
Length: 15430424 (15M) (unauthoritative)

14088LA.maps        100%[===================>]  14.71M   496KB/s    in 31s     

2016-05-28 11:59:21 (484 KB/s) - ‘14088LA.maps’ saved [15430424]



In [29]:

    
!wc -l 14088LA.maps









    



348900 14088LA.maps



In [30]:

    
lines = [line for line in open("14088LA.maps")]



In [31]:

    
count = 0
for i in range(0,len(lines)):
    if(i%3 != 1):
        continue
    frags = lines[i].strip().split()
    if(len(frags)-1 > 10):
        count+=1
print(count)



In [34]:

    
filename = "/s/oak/b/nobackup/darshanw/COmap/test/output-200/output.txt"
lines = [line for line in open(filename)]
for i in range(0, len(lines)):
    if((i%13) == 6):
        print(lines[i].strip().split()[4])



In [ ]: