notebook.community

Edit and run



In [7]:

    
import subprocess as sp
import matplotlib
import Bio.SeqIO
import Bio.Restriction
import scipy
import matplotlib.pyplot as plt
%matplotlib inline
import math
import random
import numpy as np
from timeit import timeit
import time



In [8]:

    
def FindSandTScores(infile_e, infile_w_e, loc = "/s/oak/b/nobackup/darshanw/valuev_optmap_alignment-master/ovlp/"):     
    prg = loc + "ovlp"
    output = loc+ "TempOutput"
    lines_e = [line for line in open(infile_e)]
    lines_w_e = [line for line in open(infile_w_e)]
    temp_file = loc + "temp"

    Naligned = 0
    Sscores = []
    Tscores = []
    #for i in range(0, 2):
    for i in range(0, len(lines_e)/3):
        t = open(temp_file, 'w')
        for j in range(0,3):
            t.write(lines_e[i*3+j])
        for j in range(0,3):
            t.write(lines_w_e[i*3+j])
        t.close()
        rvalue = sp.check_output([prg, temp_file, output, "0", "1"], stderr=sp.STDOUT)
        rvalue = rvalue.strip().split("\n")
        scores = rvalue[-1].strip().split(":")
        if(len(scores) > 2):
            Naligned += 1
            continue
        else:
            Tscores.append([float(scores[1]), i])
            scores = rvalue[-2].strip().split(":")
            Sscores.append([float(scores[1]), i])
    
    return([Sscores, Tscores, Naligned])



In [9]:

    
def getCorrespondingValAndPlot(A, B, k, m, d, path, output):
    out = []
    if(len(A) == 0 or len(B) == 0):
        return(out);
    Avalues = list(zip(*A)[0])
    Aindex = list(zip(*A)[1])
    Bvalues = list(zip(*B)[0])
    Bindex = list(zip(*B)[1])
    
    for i in range(0, len(Bindex)):
        try:
            indx = Aindex.index(Bindex[i])
        except:
            continue
        out.append([Avalues[indx], Bvalues[i], Bindex[i]])

    plt.plot(zip(*out)[0],zip(*out)[1], "*")
    plt.plot([0, 400], [0, 400], 'r-')
    plt.xlabel("S-score before correction")
    plt.ylabel("S-score after correction")
    plt.savefig((path+'k'+str(k)+'m'+str(m)+'d'+str(d)+'.png'), bbox_inches='tight')
    plt.show()
    
    improv = []
    for v in out:
        improv.append(v[1]/v[0])
    output.write("S-score is improved for: " + str(len([v for v in out if v[1]>v[0]])) + " out of "+str(len(out))+" reads\n")
    plt.hist(improv, 10)
    plt.xlabel("Ratio of S-score after and before correction")
    plt.ylabel("Frequency")
    plt.tight_layout()    
    plt.savefig((path+'k'+str(k)+'m'+str(m)+'d'+str(d)+'-hist'+'.png'), bbox_inches='tight')
    plt.show()



In [10]:

    
def findTrueErrorLocation(E):
    deletion = E.strip().split(":")[0].strip().split(",")
    insertion = E.strip().split(":")[1].strip().split(",")
    insertion = [ (int(insertion[i])+i) for i in range(0, len(insertion))]
    deletion = [ (int(deletion[i])-i) for i in range(0, len(deletion))]
    for z in range(0,len(deletion)):
        less = [i for i in insertion if i < deletion[z]]
        deletion[z] = deletion[z] + len(less)        
    return(deletion, insertion)

def howManyCorrected(dactual, iactual, dcorrected, icorrected):
    correctlyCorrectedDeletions = 0
    correctlyCorrectedInsertions = 0
    incorrectlyCorrectedDeletions = 0
    incorrectlyCorrecteInsertions = 0
    for d in dcorrected:
        for delloc in d[1]:
            if delloc in dactual[d[0]][1]:
                correctlyCorrectedDeletions+=1
            else:
                incorrectlyCorrectedDeletions+=1                
    for i in icorrected:
        for insloc in i[1]:
            if insloc in iactual[i[0]][1]:
                correctlyCorrectedInsertions+=1
            else:
                incorrectlyCorrecteInsertions+=1                
    return(correctlyCorrectedDeletions,correctlyCorrectedInsertions,incorrectlyCorrectedDeletions,incorrectlyCorrecteInsertions)

def findInsertionDeletion(factual, fcorrected, output):
    dactual = []
    iactual = []
    dcorrected = []
    icorrected = []
    deletions = 0
    insertions = 0
    lines = [line for line in open(factual)]
    for i in range(0, len(lines)):
        R = findTrueErrorLocation(lines[i])
        dactual.append([i, R[0]])
        iactual.append([i, R[1]])
    lines = [line for line in open(fcorrected)]
    dloc = []
    iloc = []    
    for i in range(0, len(lines)):        
        if(i%2 == 0):
            line = lines[i].strip().split(" ")
            dloc = []
            iloc = []
            for e in line:
                if(e[0] == '-'):                    
                    e = e[1:]
                    dloc.append(int(e))
                elif(e[0] == '+'):
                    e = e[1:]
                    iloc.append(int(e))
        else:            
            deletions += int(lines[i].strip().split(" ")[2])
            insertions += int(lines[i].strip().split(" ")[6])
            readNo = int(lines[i].strip().split(" ")[0])
            dcorrected.append([readNo, dloc])
            icorrected.append([readNo, iloc])
    
    ccd, cci, icd, ici = howManyCorrected(dactual, iactual, dcorrected, icorrected)
    output.write("Deletions: "+ str(deletions)+"\n")
    output.write("Insertions: "+ str(insertions)+"\n")
    output.write("Correctly-Corrected-Deletions: "+ str(ccd)+"\n")
    output.write("Incorrectly-Corrected-Deletions: "+ str(icd)+"\n")
    output.write("Correctly-Corrected-Insertion: "+ str(cci)+"\n")
    output.write("Incorrectly-Corrected-Insertion: "+ str(ici)+"\n")



In [11]:

    
def getAvg(S):
    add = 0.0
    for x in S:
        add=add+x[0]
    avg = add/len(S)
    return(avg)



In [12]:

    
def seperateFile(inf, outf, elocf):
    inlines = [line for line in open(inf)]
    outlines = [line for line in open(outf)]
    of = open(outf, "w")
    ef = open(elocf, "w")
    for i in range(0, (len(outlines)-len(inlines))):
        ef.write(outlines[i])
    for i in range((len(outlines)-len(inlines)), (len(outlines))):
        of.write(outlines[i])
    of.close()
    ef.close()



In [13]:

    
def createExcel(copies, tot_del, tot_ins, in_file):
    lines = [line for line in open(in_file)]    
    print("Copies,K,MIN COMMON K IN READS (m),MIN_CONSENSUS (d),Deletions,True +ve (Del),Insertions,True +ve (Ins),Reads aligned (B),Avg S-score (B),Reads aligned (A),Avg S-score (A), Run-time")            
    linepset = 13
    for i in range(0, len(lines)/linepset):
        k = lines[0+i*linepset].strip().split(" ")[1]
        m = lines[0+i*linepset].strip().split(" ")[4]
        d = lines[0+i*linepset].strip().split(" ")[8]
        time = round(float(lines[1+i*linepset].strip().split(" ")[4]),2)
        no_algn_r_B = lines[2+i*linepset].strip().split(":")[1].strip().split("(")[0]
        tot_r = lines[2+i*linepset].strip().split(":")[1].strip().split("(")[1].split(")")[0]
        avg_S_B = round(float(lines[3+i*linepset].strip().split(":")[1].strip()),3)
        no_algn_r_A = lines[4+i*linepset].strip().split(":")[1].strip().split("(")[0]
        avg_S_A = round(float(lines[5+i*linepset].strip().split(":")[1].strip()), 3)
        no_algn_r_A_per = round(((int(no_algn_r_A)*100.0)/int(no_algn_r_B)),2)        
        deletion = lines[7+i*linepset].strip().split(":")[1].strip()
        insertion = lines[8+i*linepset].strip().split(":")[1].strip()
        deletion_per = round(((float(deletion)/float(tot_del))*100.0),2)
        insertion_per = round(((float(insertion)/float(tot_ins))*100.0),2)
        correctd = lines[9+i*linepset].strip().split(":")[1].strip()
        incorrectd = lines[10+i*linepset].strip().split(":")[1].strip()
        correcti = lines[11+i*linepset].strip().split(":")[1].strip()
        incorrecti = lines[12+i*linepset].strip().split(":")[1].strip()
        correctdper = round((int(correctd)*100.0)/int(deletion),2)
        correctiper = round((int(correcti)*100.0)/int(insertion),2)
        print(str(copies)+","+str(k)+","+str(m)+","+str(d)+","+str(deletion_per)+"% ["+str(deletion)+"],"+str(correctdper)+"% ["+str(correctd)+"],"+str(insertion_per)+"% ["+str(insertion)+"],"+str(correctiper)+"% ["+str(correcti)+"],"+str(no_algn_r_B)+"["+str(tot_r)+"],"+str(avg_S_B)+","+str(no_algn_r_A_per)+"% ["+str(no_algn_r_A)+"],"+str(avg_S_A)+","+str(time))



In [14]:

    
createExcel("500", "2230", "927", "/s/oak/b/nobackup/darshanw/COmap/test/output/output.txt")









    



Copies,K,MIN COMMON K IN READS (m),MIN_CONSENSUS (d),Deletions,True +ve (Del),Insertions,True +ve (Ins),Reads aligned (B),Avg S-score (B),Reads aligned (A),Avg S-score (A), Run-time



In [16]:

    
copies = 500
prg_path = "/s/oak/b/nobackup/darshanw/COmap/"
prg = prg_path + "bin/COmap"
in_file = prg_path + "test/sim_single_molecule_100_newDel"
eloc_file = prg_path + "test/sim_single_molecule_100_elocations"
e_free = prg_path + "test/sim_single_molecule_100_efree"
output = open(prg_path + "test/output/output.txt", 'w', 0)
for k in range(3,4):
    for m in range(2,3):
        for d in range(2,3):            
            out_file = prg_path + "test/output/sim_single_molecule_100_corrected"
            out_file = out_file+"_k"+str(k)+"m"+str(m)+"d"+str(d)+"_"+str(copies)
            eloc_corrected = prg_path + "test/output/corrected"            
            eloc_corrected = eloc_corrected+"_k"+str(k)+"m"+str(m)+"d"+str(d)+"_"+str(copies)                        
            
            output.write("k: "+str(k)+" | m: "+str(m) + " | d : " + str(d)+ " > \n")            
            time = (timeit(stmt = "subprocess.call(['"+prg+"', '-x','-z', '-k', '"+str(k)+"', '-m', '"+str(m)+"', '-d', '"+str(d)+"', '-f','"+in_file+"'], stdout = open('"+out_file+"', 'wb'), stderr=subprocess.STDOUT)", setup = "import subprocess", number = 1))
            output.write("Time taken to execute: "+ str(time)+" sec\n")
            seperateFile(in_file, out_file, eloc_corrected)
            S1, T1, Na1 = FindSandTScores(in_file, e_free)          
            output.write("aligned without correction :" + str(len(S1)) + "("+ str(len(S1)+Na1) + ")\n")
            output.write("S-score average before correction: "+ str(getAvg(S1))+"\n")
            S2, T2, Na2 = FindSandTScores(out_file, e_free)
            output.write("aligned after correction :" + str(len(S2)) + "("+ str(len(S2)+Na2) + ")\n")
            output.write("S-score average after correction: "+ str(getAvg(S2))+"\n")
            getCorrespondingValAndPlot(S1,S2, k, m, d, (prg_path+'test/images/'), output)
            #sp.call([prg, "-x", "-k", str(k), "-m", str(m), "-d", str(d), "-f",in_file], stdout = open(eloc_corrected, "wb"), stderr=sp.STDOUT)            
            findInsertionDeletion(eloc_file, eloc_corrected, output)
output.close()









    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-16-5ded858b5a22> in <module>()
     15 
     16             output.write("k: "+str(k)+" | m: "+str(m) + " | d : " + str(d)+ " > \n")
---> 17             time = (timeit(stmt = "subprocess.call(['"+prg+"', '-x','-z', '-k', '"+str(k)+"', '-m', '"+str(m)+"', '-d', '"+str(d)+"', '-f','"+in_file+"'], stdout = open('"+out_file+"', 'wb'), stderr=subprocess.STDOUT)", setup = "import subprocess", number = 1))
     18             output.write("Time taken to execute: "+ str(time)+" sec\n")
     19             seperateFile(in_file, out_file, eloc_corrected)

/usr/lib64/python2.7/timeit.pyc in timeit(stmt, setup, timer, number)
    235            number=default_number):
    236     """Convenience function to create Timer object and call timeit method."""
--> 237     return Timer(stmt, setup, timer).timeit(number)
    238 
    239 def repeat(stmt="pass", setup="pass", timer=default_timer,

/usr/lib64/python2.7/timeit.pyc in timeit(self, number)
    200         gc.disable()
    201         try:
--> 202             timing = self.inner(it, self.timer)
    203         finally:
    204             if gcold:

/usr/lib64/python2.7/timeit.pyc in inner(_it, _timer)

/usr/lib64/python2.7/subprocess.pyc in call(*popenargs, **kwargs)
    520     retcode = call(["ls", "-l"])
    521     """
--> 522     return Popen(*popenargs, **kwargs).wait()
    523 
    524 

/usr/lib64/python2.7/subprocess.pyc in wait(self)
   1382             while self.returncode is None:
   1383                 try:
-> 1384                     pid, sts = _eintr_retry_call(os.waitpid, self.pid, 0)
   1385                 except OSError as e:
   1386                     if e.errno != errno.ECHILD:

/usr/lib64/python2.7/subprocess.pyc in _eintr_retry_call(func, *args)
    474     while True:
    475         try:
--> 476             return func(*args)
    477         except (OSError, IOError) as e:
    478             if e.errno == errno.EINTR:

KeyboardInterrupt:



In [ ]:

    
def getCorrespondingValAndPlot(A, B, k, m, d, path, output):
    out = []
    if(len(A) == 0 or len(B) == 0):
        return(out);
    Avalues = list(zip(*A)[0])
    Aindex = list(zip(*A)[1])
    Bvalues = list(zip(*B)[0])
    Bindex = list(zip(*B)[1])
    
    for i in range(0, len(Bindex)):
        try:
            indx = Aindex.index(Bindex[i])
        except:
            continue
        out.append([Avalues[indx], Bvalues[i], Bindex[i]])

    plt.plot(zip(*out)[0],zip(*out)[1], "*")
    plt.plot([0, 400], [0, 400], 'r-')
    plt.xlabel("Rmap quality score before error correction")
    plt.ylabel("Rmap quality score after error correction")
    plt.savefig((path+'k'+str(k)+'m'+str(m)+'d'+str(d)+'.png'), bbox_inches='tight', dpi=300)
    plt.show()
    
    improv = []
    for v in out:
        improv.append(v[1]/v[0])
    print("Rmap quality is improved for: " + str(len([v for v in out if v[1]>v[0]])) + " out of "+str(len(out))+" reads\n")    
    plt.hist(improv, 15)    
    plt.xlabel("Ratio of the quality scores after divided by before error correction")
    plt.ylabel("Number of Rmaps")
    plt.tight_layout()    
    plt.savefig((path+'k'+str(k)+'m'+str(m)+'d'+str(d)+'-hist'+'.png'), bbox_inches='tight', dpi=300)
    plt.show()



In [ ]:

    
getCorrespondingValAndPlot(S1,S2, 3, 2, 2, (prg_path+'test/images/'), output)



In [ ]:



In [ ]:



In [ ]:



In [287]:

    
findInsertionDeletion(eloc_file, eloc_corrected)









    



Deletions: 17
Insertions: 5
Correctly-Corrected-Deletions: 12
Incorrectly-Corrected-Deletions: 5
Correctly-Corrected-Insertion: 3
Incorrectly-Corrected-Insertion: 2



In [410]:

    
N = [[4,5],[6,1],[3,4], [6,10], [6,4]]
print(len([n for n in N if n[1]>=n[0]]))



In [8]:

    
round(4.12521,2)









    Out[8]:





4.13



In [543]:

    
sp.call(["mkdir", "kk"])
sp.call(["mkdir", "keku"])









    Out[543]:





0



In [7]:

    
1+8









    Out[7]:





9



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]: