In [1]:
%matplotlib inline

In [2]:
import re
import itertools

In [3]:
r1_file = "R1_10000.fa"
r2_file = "R2_10000.fa"

merge_strategies = {"bayes":"pandaseq_subset/panda_bayes.fa",
                    "pear": "pandaseq_subset/panda_pear.fa",
                    "rdp_mle": "pandaseq_subset/panda_rdp.fa",
                    "usearch": "usearch_subset/usearch_hsp8.fa",
                    "usearch_mm5": "usearch_subset/usearch_hsp8_mm5.fa"
                    }

HEADER=0
SEQ=1

In [4]:
def loadFasta(filename):
    result = []
    fh = open(filename)
    current_header = " "
    for line in fh:
        if line.startswith(">"):
            current_header = line.rstrip("\n").lstrip(">").rsplit(" ",1)[0]
        else:
            result.append( (current_header, line.rstrip("\n")) )
    return result

In [5]:
def reverseComplement(dna_seq):
    return dna_seq.upper().replace("A","t").replace("C","g").replace("G","c").replace("T","a")[::-1].upper()

Load forward and reverse reads


In [6]:
r1 = loadFasta(r1_file)
r1 = dict(r1)
r2 = loadFasta(r2_file)
r2 = dict( [(h,reverseComplement(s)) for h,s in r2  ] )
print len(r1)
print len(r2)


10000
10000

In [7]:
#Returns length of alignment and %id 
def statsAlignment(r1,r2,merged):
    align_len = len(r1) + len(r2) - len(merged)
    r1_seg = r1[len(r1)-align_len : len(r1)].upper()
    r2_seg = r2[:align_len].upper()
    
    matches = 0
    for r1_nucl,r2_nucl in zip(r1_seg,r2_seg):
        matches += 1 if r1_nucl == r2_nucl else 0
    
    mismatches = align_len - matches
    return (align_len,mismatches)

In [8]:
def visualizeAlignment(r1,r2,merged):
    alignment_len = len(r1) + len(r2) - len(merged)
    
    r1_seg = r1_seq[len(r1)-alignment_len:]
    merged_seg = merged_seq[len(r1)-alignment_len:len(r1)]
    r2_seg = r2_seq[:alignment_len]
    
    assert (len(r1_seg) == len(merged_seg) and len(r2_seg) == len(merged_seg))
    
    matches = []
    for idx in range(len(merged_seg)):
        matches.append("|" if r2_seg[idx] == r1_seg[idx] else " ")
        
    matches = "".join(matches)
    return "\n".join([r1_seg,matches,merged_seg,matches,r2_seg])+"\n\n"

In [9]:
total_merged = {}
total_filtered = {}
alignment_stats = {}
for strategy in merge_strategies:
    #Load strategy and remove pandaseq label from the read id
    merged_fasta =  loadFasta(merge_strategies[strategy])
    merged_fasta_mod = [( (h.rsplit(":",1)[0] if re.search(r":[NACGT]+$",h) else h) ,s) for h,s in merged_fasta]
    
    #Initialize variables for the stategy
    total_merged[strategy]  = len(merged_fasta_mod)
    total_filtered[strategy] = 0
    alignment_stats[strategy] = []
    
    with open(strategy+".align", "w") as alignment_fh:
        for merged_header,merged_seq in merged_fasta_mod:
            assert (merged_header in r1 and merged_header in r2)
            r1_seq = r1[merged_header]
            r2_seq = r2[merged_header] #It is already reverse complemented before
            align_len, mismatches = statsAlignment(r1_seq,r2_seq,merged_seq)
            #Store alignment stats for further examination
            alignment_stats[strategy].append( (align_len,mismatches) )
            #Filter sequence if align_len <20 or more than 5 mismatches
            if align_len > 20 and mismatches <= 5:
                alignment_fh.write(visualizeAlignment(r1_seq,r2_seq,merged_seq)+"\n")
            else:
                total_filtered[strategy] += 1

Plotting


In [10]:
#Plot stats
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d
from collections import defaultdict

Plot merged and filtered for each strategy


In [11]:
N = len(merge_strategies)
labels = [x for x in merge_strategies]

kept_seqs   = [total_merged[x]-total_filtered[x] for x in labels]
filtered_seqs = [total_filtered[x] for x in labels]
ind = np.arange(N)    # the x locations for the groups
width = 0.55       # the width of the bars: can also be len(x) sequence

p1 = plt.bar(ind, kept_seqs,   width, color='g', )
p2 = plt.bar(ind, filtered_seqs, width, color='r',
             bottom=kept_seqs)

plt.ylabel('# of merged seqs')
plt.title('Merged sequences per algorythm')
plt.xticks(ind+width/2., labels )
plt.legend( (p1[0], p2[0]), ('Kept', 'Filtered') )

plt.show()



In [12]:
#If data points are few , calculate density of each point manually
density = defaultdict(lambda : defaultdict(int))
for strategy in alignment_stats:
    for stat in alignment_stats[strategy]:
        density[strategy][stat] += 1

In [13]:
align_len_np= {}
pct_id_np = {}
mismatch_np = {}
density_np = {}
for strategy in density:
    align_len_np[strategy] = np.array([x[0] for x in density[strategy]])  
    mismatch_np[strategy] = np.array([x[1] for x in density[strategy]])
    pct_id_np[strategy] = (align_len_np[strategy] - mismatch_np[strategy])/ align_len_np[strategy].astype(float)
    density_np[strategy] = np.array( [density[strategy][x] for x in density[strategy]] )

In [22]:
plt.figure(figsize=(7,14))
i=1
for strategy in sorted(density):
    plt.subplot(len(density)*100+10+i)
    plt.title(strategy)
    plt.scatter( align_len_np[strategy] , pct_id_np[strategy] ,c=density_np[strategy],vmin=0,vmax=100)
    plt.colorbar()
    i+=1
plt.tight_layout()
plt.show()
#plt.savefig("alignlen_vs_pctid.pdf")



In [20]:
plt.figure(figsize=(8,16))
i=1
for strategy in sorted(density):
    plt.subplot(len(density)*100+10+i)
    plt.title(strategy)
    plt.scatter( align_len_np[strategy] , mismatch_np[strategy] ,c=density_np[strategy],vmin=0,vmax=100)
    plt.colorbar()
    i+=1
plt.tight_layout()
plt.show()



In [16]:
i=1
plt.figure(figsize=(8,16))
for strategy in sorted(density):
    #strategy="rdp_mle"
    plt.subplot(len(density)*100+10+i)
    plt.hist(  (align_len_np[strategy][mismatch_np[strategy]<=5],align_len_np[strategy][mismatch_np[strategy]>5]), label=['mm <= 5', 'mm > 5' ],color=['green','red'])
    plt.legend()
    _ = plt.title(strategy.upper()+" - Histogram of seqs according to overlap length")
    i+=1
plt.tight_layout()
plt.show()



In [17]:
i=1
plt.figure(figsize=(8,16))
for strategy in sorted(density):
    #strategy="rdp_mle"
    plt.subplot(len(density)*100+10+i)
    plt.hist( mismatch_np[strategy][align_len_np[strategy] >= 20])
    _ = plt.title(strategy.upper()+" -  Mismatch distribution in when overlap >= 20 bp")
    i+=1
plt.tight_layout()
plt.show()