In [1]:
%matplotlib inline
In [2]:
import re
import itertools
In [3]:
r1_file = "R1_10000.fa"
r2_file = "R2_10000.fa"
merge_strategies = {"bayes":"pandaseq_subset/panda_bayes.fa",
"pear": "pandaseq_subset/panda_pear.fa",
"rdp_mle": "pandaseq_subset/panda_rdp.fa",
"usearch": "usearch_subset/usearch_hsp8.fa",
"usearch_mm5": "usearch_subset/usearch_hsp8_mm5.fa"
}
HEADER=0
SEQ=1
In [4]:
def loadFasta(filename):
result = []
fh = open(filename)
current_header = " "
for line in fh:
if line.startswith(">"):
current_header = line.rstrip("\n").lstrip(">").rsplit(" ",1)[0]
else:
result.append( (current_header, line.rstrip("\n")) )
return result
In [5]:
def reverseComplement(dna_seq):
return dna_seq.upper().replace("A","t").replace("C","g").replace("G","c").replace("T","a")[::-1].upper()
In [6]:
r1 = loadFasta(r1_file)
r1 = dict(r1)
r2 = loadFasta(r2_file)
r2 = dict( [(h,reverseComplement(s)) for h,s in r2 ] )
print len(r1)
print len(r2)
In [7]:
#Returns length of alignment and %id
def statsAlignment(r1,r2,merged):
align_len = len(r1) + len(r2) - len(merged)
r1_seg = r1[len(r1)-align_len : len(r1)].upper()
r2_seg = r2[:align_len].upper()
matches = 0
for r1_nucl,r2_nucl in zip(r1_seg,r2_seg):
matches += 1 if r1_nucl == r2_nucl else 0
mismatches = align_len - matches
return (align_len,mismatches)
In [8]:
def visualizeAlignment(r1,r2,merged):
alignment_len = len(r1) + len(r2) - len(merged)
r1_seg = r1_seq[len(r1)-alignment_len:]
merged_seg = merged_seq[len(r1)-alignment_len:len(r1)]
r2_seg = r2_seq[:alignment_len]
assert (len(r1_seg) == len(merged_seg) and len(r2_seg) == len(merged_seg))
matches = []
for idx in range(len(merged_seg)):
matches.append("|" if r2_seg[idx] == r1_seg[idx] else " ")
matches = "".join(matches)
return "\n".join([r1_seg,matches,merged_seg,matches,r2_seg])+"\n\n"
In [9]:
total_merged = {}
total_filtered = {}
alignment_stats = {}
for strategy in merge_strategies:
#Load strategy and remove pandaseq label from the read id
merged_fasta = loadFasta(merge_strategies[strategy])
merged_fasta_mod = [( (h.rsplit(":",1)[0] if re.search(r":[NACGT]+$",h) else h) ,s) for h,s in merged_fasta]
#Initialize variables for the stategy
total_merged[strategy] = len(merged_fasta_mod)
total_filtered[strategy] = 0
alignment_stats[strategy] = []
with open(strategy+".align", "w") as alignment_fh:
for merged_header,merged_seq in merged_fasta_mod:
assert (merged_header in r1 and merged_header in r2)
r1_seq = r1[merged_header]
r2_seq = r2[merged_header] #It is already reverse complemented before
align_len, mismatches = statsAlignment(r1_seq,r2_seq,merged_seq)
#Store alignment stats for further examination
alignment_stats[strategy].append( (align_len,mismatches) )
#Filter sequence if align_len <20 or more than 5 mismatches
if align_len > 20 and mismatches <= 5:
alignment_fh.write(visualizeAlignment(r1_seq,r2_seq,merged_seq)+"\n")
else:
total_filtered[strategy] += 1
In [10]:
#Plot stats
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d
from collections import defaultdict
Plot merged and filtered for each strategy
In [11]:
N = len(merge_strategies)
labels = [x for x in merge_strategies]
kept_seqs = [total_merged[x]-total_filtered[x] for x in labels]
filtered_seqs = [total_filtered[x] for x in labels]
ind = np.arange(N) # the x locations for the groups
width = 0.55 # the width of the bars: can also be len(x) sequence
p1 = plt.bar(ind, kept_seqs, width, color='g', )
p2 = plt.bar(ind, filtered_seqs, width, color='r',
bottom=kept_seqs)
plt.ylabel('# of merged seqs')
plt.title('Merged sequences per algorythm')
plt.xticks(ind+width/2., labels )
plt.legend( (p1[0], p2[0]), ('Kept', 'Filtered') )
plt.show()
In [12]:
#If data points are few , calculate density of each point manually
density = defaultdict(lambda : defaultdict(int))
for strategy in alignment_stats:
for stat in alignment_stats[strategy]:
density[strategy][stat] += 1
In [13]:
align_len_np= {}
pct_id_np = {}
mismatch_np = {}
density_np = {}
for strategy in density:
align_len_np[strategy] = np.array([x[0] for x in density[strategy]])
mismatch_np[strategy] = np.array([x[1] for x in density[strategy]])
pct_id_np[strategy] = (align_len_np[strategy] - mismatch_np[strategy])/ align_len_np[strategy].astype(float)
density_np[strategy] = np.array( [density[strategy][x] for x in density[strategy]] )
In [22]:
plt.figure(figsize=(7,14))
i=1
for strategy in sorted(density):
plt.subplot(len(density)*100+10+i)
plt.title(strategy)
plt.scatter( align_len_np[strategy] , pct_id_np[strategy] ,c=density_np[strategy],vmin=0,vmax=100)
plt.colorbar()
i+=1
plt.tight_layout()
plt.show()
#plt.savefig("alignlen_vs_pctid.pdf")
In [20]:
plt.figure(figsize=(8,16))
i=1
for strategy in sorted(density):
plt.subplot(len(density)*100+10+i)
plt.title(strategy)
plt.scatter( align_len_np[strategy] , mismatch_np[strategy] ,c=density_np[strategy],vmin=0,vmax=100)
plt.colorbar()
i+=1
plt.tight_layout()
plt.show()
In [16]:
i=1
plt.figure(figsize=(8,16))
for strategy in sorted(density):
#strategy="rdp_mle"
plt.subplot(len(density)*100+10+i)
plt.hist( (align_len_np[strategy][mismatch_np[strategy]<=5],align_len_np[strategy][mismatch_np[strategy]>5]), label=['mm <= 5', 'mm > 5' ],color=['green','red'])
plt.legend()
_ = plt.title(strategy.upper()+" - Histogram of seqs according to overlap length")
i+=1
plt.tight_layout()
plt.show()
In [17]:
i=1
plt.figure(figsize=(8,16))
for strategy in sorted(density):
#strategy="rdp_mle"
plt.subplot(len(density)*100+10+i)
plt.hist( mismatch_np[strategy][align_len_np[strategy] >= 20])
_ = plt.title(strategy.upper()+" - Mismatch distribution in when overlap >= 20 bp")
i+=1
plt.tight_layout()
plt.show()