In [1]:
%matplotlib inline
In [2]:
import pandas as pd
import numpy as np
import itertools
from collections import Counter
import re
from glob import glob
import subprocess
from matplotlib import pyplot as plt
In [3]:
def load_fastqc_txt(fastqc_filename):
fastqc_out = {}
module_name = ""
module_output= []
with open(fastqc_filename) as fastqc_fh:
for line in fastqc_fh:
line = line.rstrip("\n")
if line.startswith(">>END_MODULE"):
fastqc_out[module_name] = module_output
module_output = []
elif line.startswith(">>"):
module_name = line.lstrip(">").split("\t")[0]
else:
module_output.append(line)
return fastqc_out
In [29]:
def parse_fastq(fastq_file):
stats = {}
cat_zcat = "zcat" if fastq_file.endswith(".gz") else "cat"
read_lengths = subprocess.Popen(cat_zcat+' '+fastq_file+' | sed -n "2~4p" | awk "{print length($1)}" ',shell=True,stdout=subprocess.PIPE).stdout
read_lengths = Counter(int(x) for x in read_lengths)
stats["filename"] = fastq_file
stats["n_bases"] = sum( k*read_lengths[k] for k in read_lengths )
stats["n_reads"] = sum( read_lengths[k] for k in read_lengths )
stats["read_len_freqs"] = read_lengths
return stats
In [82]:
def plot_read_len_dist(stats_obj_list):
if len(stats_obj_list) == 3:
fig1, fig_axes = plt.subplots(nrows=3,ncols=1)
else:
fig1, fig_axes = plt.subplots(nrows=2,ncols=2)
fig_axes = itertools.chain(*fig_axes)
for ax,stats_obj in zip(fig_axes,stats_obj_list):
#Calculate histogram range
hist_range = range(min(stats_obj["read_len_freqs"]), max(stats_obj["read_len_freqs"])+1 )
#Get list of freqs for all read lengths
hist_values = map(lambda k: stats_obj["read_len_freqs"][k] if k in stats_obj["read_len_freqs"] else 0 , hist_range)
pos = np.arange(len(hist_range))
width = 1 # gives histogram aspect to the bar diagram
ax.text(0.05,0.95,"# reads: {}".format(stats_obj["n_reads"]),transform=ax.transAxes)
ax.text(0.05,0.9,"# bases: {}".format(stats_obj["n_bases"]),transform=ax.transAxes)
ax.text(0.05,0.85,"Avg read len: {}".format(stats_obj["n_bases"]/stats_obj["n_reads"]),transform=ax.transAxes)
ax.set_title( stats_obj["filename"] )
ax.set_xticks(pos + (width / 2))
#Print every 10 axis labels
ax.set_xticklabels( [x if (idx % 10 == 0) or (idx+1 == len(hist_range)) else "" for idx, x in enumerate(hist_range)] , rotation = 90 )
ax.bar(pos, hist_values, width, color='r')
plt.show()
In [58]:
raw_r1_stats= parse_fastq(glob("reads/*.fastq.gz")[0])
In [66]:
print "Number of (paired-end) reads: {} ".format(raw_r1_stats["n_reads"])
print "Number of bases: 2 * {} = {} ".format(raw_r1_stats["n_bases"],2*raw_r1_stats["n_bases"])
print "Read Length: {} ".format(raw_r1_stats["read_len_freqs"].items()[0][0])
In [64]:
qf_fastq_files = sorted(glob("2_qf/*.fq.gz"))
print qf_fastq_files
qf_stats_list = map(parse_fastq,qf_fastq_files)
In [71]:
qf_total_reads= qf_stats_list[0]["n_reads"]+qf_stats_list[2]["n_reads"]+qf_stats_list[3]["n_reads"]
qf_total_bases = sum( x["n_bases"] for x in qf_stats_list )
print "Reads after quality filtering: {} pe + {} merged pe + {} single = {}".format(qf_stats_list[0]["n_reads"],
qf_stats_list[2]["n_reads"],
qf_stats_list[3]["n_reads"],
qf_total_reads)
print "Num of bases remaining after quality filtering: {}".format(qf_total_bases)
print "Pct of bases remaining after quality filtering: {}%".format(100.0 * qf_total_bases / (2*raw_r1_stats["n_bases"]) )
In [83]:
import pylab
pylab.rcParams['figure.figsize'] = (20.0, 16.0)
plot_read_len_dist( qf_stats_list )
In [72]:
hf_fastq_files = sorted(glob("3_hostfiltering/*.fq"))
print hf_fastq_files
hf_stats_list = map(parse_fastq,hf_fastq_files)
In [86]:
hf_total_reads= (hf_stats_list[0]["n_reads"]) + ( hf_stats_list[1]["n_reads"]/2) + hf_stats_list[2]["n_reads"]
hf_total_bases = sum( x["n_bases"] for x in hf_stats_list )
print "Number of effective reads: {} pe + {} merged pe + {} single = {}".format(hf_stats_list[1]["n_reads"],
hf_stats_list[0]["n_reads"],
hf_stats_list[2]["n_reads"],
hf_total_reads)
print "Num of effective bases: {}".format(hf_total_bases)
print "Pct of bases remaining after host filtering: {}%".format(100.0 * hf_total_bases / (2*raw_r1_stats["n_bases"]) )
In [79]:
#Human pct defined as proportion of bases discarded from qf'ed bases
print "Pct of human in data: {}%".format(100.0 * (qf_total_bases - hf_total_bases) / qf_total_bases )
In [84]:
pylab.rcParams['figure.figsize'] = (15, 20)
plot_read_len_dist( hf_stats_list )
In [ ]: