In [7]:
import subprocess as sp
import matplotlib
import Bio.SeqIO
import Bio.Restriction
import scipy
import matplotlib.pyplot as plt
%matplotlib inline
import math
import random
import numpy as np
from timeit import timeit
import time
In [8]:
def FindSandTScores(infile_e, infile_w_e, loc = "/s/oak/b/nobackup/darshanw/valuev_optmap_alignment-master/ovlp/"):
prg = loc + "ovlp"
output = loc+ "TempOutput"
lines_e = [line for line in open(infile_e)]
lines_w_e = [line for line in open(infile_w_e)]
temp_file = loc + "temp"
Naligned = 0
Sscores = []
Tscores = []
#for i in range(0, 2):
for i in range(0, len(lines_e)/3):
t = open(temp_file, 'w')
for j in range(0,3):
t.write(lines_e[i*3+j])
for j in range(0,3):
t.write(lines_w_e[i*3+j])
t.close()
rvalue = sp.check_output([prg, temp_file, output, "0", "1"], stderr=sp.STDOUT)
rvalue = rvalue.strip().split("\n")
scores = rvalue[-1].strip().split(":")
if(len(scores) > 2):
Naligned += 1
continue
else:
Tscores.append([float(scores[1]), i])
scores = rvalue[-2].strip().split(":")
Sscores.append([float(scores[1]), i])
return([Sscores, Tscores, Naligned])
In [9]:
def getCorrespondingValAndPlot(A, B, k, m, d, path, output):
out = []
if(len(A) == 0 or len(B) == 0):
return(out);
Avalues = list(zip(*A)[0])
Aindex = list(zip(*A)[1])
Bvalues = list(zip(*B)[0])
Bindex = list(zip(*B)[1])
for i in range(0, len(Bindex)):
try:
indx = Aindex.index(Bindex[i])
except:
continue
out.append([Avalues[indx], Bvalues[i], Bindex[i]])
plt.plot(zip(*out)[0],zip(*out)[1], "*")
plt.plot([0, 400], [0, 400], 'r-')
plt.xlabel("S-score before correction")
plt.ylabel("S-score after correction")
plt.savefig((path+'k'+str(k)+'m'+str(m)+'d'+str(d)+'.png'), bbox_inches='tight')
plt.show()
improv = []
for v in out:
improv.append(v[1]/v[0])
output.write("S-score is improved for: " + str(len([v for v in out if v[1]>v[0]])) + " out of "+str(len(out))+" reads\n")
plt.hist(improv, 10)
plt.xlabel("Ratio of S-score after and before correction")
plt.ylabel("Frequency")
plt.tight_layout()
plt.savefig((path+'k'+str(k)+'m'+str(m)+'d'+str(d)+'-hist'+'.png'), bbox_inches='tight')
plt.show()
In [10]:
def findTrueErrorLocation(E):
deletion = E.strip().split(":")[0].strip().split(",")
insertion = E.strip().split(":")[1].strip().split(",")
insertion = [ (int(insertion[i])+i) for i in range(0, len(insertion))]
deletion = [ (int(deletion[i])-i) for i in range(0, len(deletion))]
for z in range(0,len(deletion)):
less = [i for i in insertion if i < deletion[z]]
deletion[z] = deletion[z] + len(less)
return(deletion, insertion)
def howManyCorrected(dactual, iactual, dcorrected, icorrected):
correctlyCorrectedDeletions = 0
correctlyCorrectedInsertions = 0
incorrectlyCorrectedDeletions = 0
incorrectlyCorrecteInsertions = 0
for d in dcorrected:
for delloc in d[1]:
if delloc in dactual[d[0]][1]:
correctlyCorrectedDeletions+=1
else:
incorrectlyCorrectedDeletions+=1
for i in icorrected:
for insloc in i[1]:
if insloc in iactual[i[0]][1]:
correctlyCorrectedInsertions+=1
else:
incorrectlyCorrecteInsertions+=1
return(correctlyCorrectedDeletions,correctlyCorrectedInsertions,incorrectlyCorrectedDeletions,incorrectlyCorrecteInsertions)
def findInsertionDeletion(factual, fcorrected, output):
dactual = []
iactual = []
dcorrected = []
icorrected = []
deletions = 0
insertions = 0
lines = [line for line in open(factual)]
for i in range(0, len(lines)):
R = findTrueErrorLocation(lines[i])
dactual.append([i, R[0]])
iactual.append([i, R[1]])
lines = [line for line in open(fcorrected)]
dloc = []
iloc = []
for i in range(0, len(lines)):
if(i%2 == 0):
line = lines[i].strip().split(" ")
dloc = []
iloc = []
for e in line:
if(e[0] == '-'):
e = e[1:]
dloc.append(int(e))
elif(e[0] == '+'):
e = e[1:]
iloc.append(int(e))
else:
deletions += int(lines[i].strip().split(" ")[2])
insertions += int(lines[i].strip().split(" ")[6])
readNo = int(lines[i].strip().split(" ")[0])
dcorrected.append([readNo, dloc])
icorrected.append([readNo, iloc])
ccd, cci, icd, ici = howManyCorrected(dactual, iactual, dcorrected, icorrected)
output.write("Deletions: "+ str(deletions)+"\n")
output.write("Insertions: "+ str(insertions)+"\n")
output.write("Correctly-Corrected-Deletions: "+ str(ccd)+"\n")
output.write("Incorrectly-Corrected-Deletions: "+ str(icd)+"\n")
output.write("Correctly-Corrected-Insertion: "+ str(cci)+"\n")
output.write("Incorrectly-Corrected-Insertion: "+ str(ici)+"\n")
In [11]:
def getAvg(S):
add = 0.0
for x in S:
add=add+x[0]
avg = add/len(S)
return(avg)
In [12]:
def seperateFile(inf, outf, elocf):
inlines = [line for line in open(inf)]
outlines = [line for line in open(outf)]
of = open(outf, "w")
ef = open(elocf, "w")
for i in range(0, (len(outlines)-len(inlines))):
ef.write(outlines[i])
for i in range((len(outlines)-len(inlines)), (len(outlines))):
of.write(outlines[i])
of.close()
ef.close()
In [13]:
def createExcel(copies, tot_del, tot_ins, in_file):
lines = [line for line in open(in_file)]
print("Copies,K,MIN COMMON K IN READS (m),MIN_CONSENSUS (d),Deletions,True +ve (Del),Insertions,True +ve (Ins),Reads aligned (B),Avg S-score (B),Reads aligned (A),Avg S-score (A), Run-time")
linepset = 13
for i in range(0, len(lines)/linepset):
k = lines[0+i*linepset].strip().split(" ")[1]
m = lines[0+i*linepset].strip().split(" ")[4]
d = lines[0+i*linepset].strip().split(" ")[8]
time = round(float(lines[1+i*linepset].strip().split(" ")[4]),2)
no_algn_r_B = lines[2+i*linepset].strip().split(":")[1].strip().split("(")[0]
tot_r = lines[2+i*linepset].strip().split(":")[1].strip().split("(")[1].split(")")[0]
avg_S_B = round(float(lines[3+i*linepset].strip().split(":")[1].strip()),3)
no_algn_r_A = lines[4+i*linepset].strip().split(":")[1].strip().split("(")[0]
avg_S_A = round(float(lines[5+i*linepset].strip().split(":")[1].strip()), 3)
no_algn_r_A_per = round(((int(no_algn_r_A)*100.0)/int(no_algn_r_B)),2)
deletion = lines[7+i*linepset].strip().split(":")[1].strip()
insertion = lines[8+i*linepset].strip().split(":")[1].strip()
deletion_per = round(((float(deletion)/float(tot_del))*100.0),2)
insertion_per = round(((float(insertion)/float(tot_ins))*100.0),2)
correctd = lines[9+i*linepset].strip().split(":")[1].strip()
incorrectd = lines[10+i*linepset].strip().split(":")[1].strip()
correcti = lines[11+i*linepset].strip().split(":")[1].strip()
incorrecti = lines[12+i*linepset].strip().split(":")[1].strip()
correctdper = round((int(correctd)*100.0)/int(deletion),2)
correctiper = round((int(correcti)*100.0)/int(insertion),2)
print(str(copies)+","+str(k)+","+str(m)+","+str(d)+","+str(deletion_per)+"% ["+str(deletion)+"],"+str(correctdper)+"% ["+str(correctd)+"],"+str(insertion_per)+"% ["+str(insertion)+"],"+str(correctiper)+"% ["+str(correcti)+"],"+str(no_algn_r_B)+"["+str(tot_r)+"],"+str(avg_S_B)+","+str(no_algn_r_A_per)+"% ["+str(no_algn_r_A)+"],"+str(avg_S_A)+","+str(time))
In [14]:
createExcel("500", "2230", "927", "/s/oak/b/nobackup/darshanw/COmap/test/output/output.txt")
In [16]:
copies = 500
prg_path = "/s/oak/b/nobackup/darshanw/COmap/"
prg = prg_path + "bin/COmap"
in_file = prg_path + "test/sim_single_molecule_100_newDel"
eloc_file = prg_path + "test/sim_single_molecule_100_elocations"
e_free = prg_path + "test/sim_single_molecule_100_efree"
output = open(prg_path + "test/output/output.txt", 'w', 0)
for k in range(3,4):
for m in range(2,3):
for d in range(2,3):
out_file = prg_path + "test/output/sim_single_molecule_100_corrected"
out_file = out_file+"_k"+str(k)+"m"+str(m)+"d"+str(d)+"_"+str(copies)
eloc_corrected = prg_path + "test/output/corrected"
eloc_corrected = eloc_corrected+"_k"+str(k)+"m"+str(m)+"d"+str(d)+"_"+str(copies)
output.write("k: "+str(k)+" | m: "+str(m) + " | d : " + str(d)+ " > \n")
time = (timeit(stmt = "subprocess.call(['"+prg+"', '-x','-z', '-k', '"+str(k)+"', '-m', '"+str(m)+"', '-d', '"+str(d)+"', '-f','"+in_file+"'], stdout = open('"+out_file+"', 'wb'), stderr=subprocess.STDOUT)", setup = "import subprocess", number = 1))
output.write("Time taken to execute: "+ str(time)+" sec\n")
seperateFile(in_file, out_file, eloc_corrected)
S1, T1, Na1 = FindSandTScores(in_file, e_free)
output.write("aligned without correction :" + str(len(S1)) + "("+ str(len(S1)+Na1) + ")\n")
output.write("S-score average before correction: "+ str(getAvg(S1))+"\n")
S2, T2, Na2 = FindSandTScores(out_file, e_free)
output.write("aligned after correction :" + str(len(S2)) + "("+ str(len(S2)+Na2) + ")\n")
output.write("S-score average after correction: "+ str(getAvg(S2))+"\n")
getCorrespondingValAndPlot(S1,S2, k, m, d, (prg_path+'test/images/'), output)
#sp.call([prg, "-x", "-k", str(k), "-m", str(m), "-d", str(d), "-f",in_file], stdout = open(eloc_corrected, "wb"), stderr=sp.STDOUT)
findInsertionDeletion(eloc_file, eloc_corrected, output)
output.close()
In [ ]:
def getCorrespondingValAndPlot(A, B, k, m, d, path, output):
out = []
if(len(A) == 0 or len(B) == 0):
return(out);
Avalues = list(zip(*A)[0])
Aindex = list(zip(*A)[1])
Bvalues = list(zip(*B)[0])
Bindex = list(zip(*B)[1])
for i in range(0, len(Bindex)):
try:
indx = Aindex.index(Bindex[i])
except:
continue
out.append([Avalues[indx], Bvalues[i], Bindex[i]])
plt.plot(zip(*out)[0],zip(*out)[1], "*")
plt.plot([0, 400], [0, 400], 'r-')
plt.xlabel("Rmap quality score before error correction")
plt.ylabel("Rmap quality score after error correction")
plt.savefig((path+'k'+str(k)+'m'+str(m)+'d'+str(d)+'.png'), bbox_inches='tight', dpi=300)
plt.show()
improv = []
for v in out:
improv.append(v[1]/v[0])
print("Rmap quality is improved for: " + str(len([v for v in out if v[1]>v[0]])) + " out of "+str(len(out))+" reads\n")
plt.hist(improv, 15)
plt.xlabel("Ratio of the quality scores after divided by before error correction")
plt.ylabel("Number of Rmaps")
plt.tight_layout()
plt.savefig((path+'k'+str(k)+'m'+str(m)+'d'+str(d)+'-hist'+'.png'), bbox_inches='tight', dpi=300)
plt.show()
In [ ]:
getCorrespondingValAndPlot(S1,S2, 3, 2, 2, (prg_path+'test/images/'), output)
In [ ]:
In [ ]:
In [ ]:
In [287]:
findInsertionDeletion(eloc_file, eloc_corrected)
In [410]:
N = [[4,5],[6,1],[3,4], [6,10], [6,4]]
print(len([n for n in N if n[1]>=n[0]]))
In [8]:
round(4.12521,2)
Out[8]:
In [543]:
sp.call(["mkdir", "kk"])
sp.call(["mkdir", "keku"])
Out[543]:
In [7]:
1+8
Out[7]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: