Main code below
In [13]:
import glob
import os
from collections import OrderedDict
from pprint import pprint
# File name suffixes of the two per-sample reports, and the rows of the summary table
_SWEEPER_SUFFIX = "_FastqSweeper_report.csv"
_TRIM_SUFFIX = "_trim_report.txt"
_SUMMARY_FIELDS = ("Total_read", "Trimmed_read", "Mapped_read", "Unmapped_read")


def _parse_count(token):
    """Convert a count token such as '250,000' into an int."""
    return int(token.replace(",", ""))


def _parse_cutadapt_report(path):
    """Return the Total_read / Trimmed_read counts found in a cutadapt trimming report."""
    counts = OrderedDict()
    with open(path, "r") as fin:
        for line in fin:
            if line.startswith("Total reads processed:"):
                counts["Total_read"] = _parse_count(line.split()[-1])
            elif line.startswith("Reads written (passing filters):"):
                # The count is the second-to-last token; the last one is
                # presumably the percentage that cutadapt appends after it.
                counts["Trimmed_read"] = _parse_count(line.split()[-2])
    return counts


def _parse_fastqsweeper_report(path):
    """Return the Mapped_read / Unmapped_read counts found in a FastqSweeper report."""
    counts = OrderedDict()
    with open(path, "r") as fin:
        for line in fin:
            # NOTE(review): the prefixes start with a single space; confirm this
            # matches the leading whitespace FastqSweeper really writes.
            if line.startswith(" Mapped"):
                counts["Mapped_read"] = _parse_count(line.split()[-1])
            elif line.startswith(" Unmapped"):
                counts["Unmapped_read"] = _parse_count(line.split()[-1])
    return counts


def _format_cell(count, total):
    """Render one table cell as 'count (pct%)', degrading to 'NA' when data is missing."""
    if count is None:
        return "NA"
    if not total:
        # Total is missing or zero: the percentage is undefined
        return "{} (NA%)".format(count)
    return "{} ({}%)".format(count, round(float(count) / total * 100, 2))


def summarize_reports(directory="./", out_prefix=""):
    """Summarize the cutadapt and FastqSweeper reports of every sample in a directory.

    Each file matching ``*_FastqSweeper_report.csv`` in ``directory`` defines a
    sample; the matching ``<sample>_trim_report.txt`` is read from the same
    directory. The read counts of all samples are written to a tab-separated
    table in the current working directory, with one column per sample
    (sorted by report path) and one row per field (Total_read, Trimmed_read,
    Mapped_read, Unmapped_read). Each cell holds the count followed by its
    percentage of Total_read.

    Parameters
    ----------
    directory : str
        Directory containing the reports. A trailing path separator is optional.
    out_prefix : str
        If non-empty, the output file is ``<out_prefix>_summary_report.csv``,
        otherwise ``summary_report.csv``.

    Returns
    -------
    None

    Raises
    ------
    IOError / OSError
        If the trimming report of a sample cannot be opened.
    ValueError
        If a count in a report is not an integer.

    Notes
    -----
    A field that is absent from a report is written as ``NA``; when Total_read
    is zero or absent, the percentage is written as ``NA``.
    """
    # Collect the counts of every sample before writing anything
    summary_dict = OrderedDict()
    pattern = os.path.join(directory, "*" + _SWEEPER_SUFFIX)
    # glob returns paths in arbitrary order: sort for a reproducible column order
    for sweeper_report in sorted(glob.glob(pattern)):
        stem = sweeper_report[:-len(_SWEEPER_SUFFIX)]
        basename = os.path.basename(stem)
        counts = _parse_cutadapt_report(stem + _TRIM_SUFFIX)
        counts.update(_parse_fastqsweeper_report(sweeper_report))
        summary_dict[basename] = counts

    # Write the stored data: header row of sample names, then one row per field
    out_name = out_prefix + "_summary_report.csv" if out_prefix else "summary_report.csv"
    with open(out_name, "w") as fout:
        fout.write(" \t")
        for basename in summary_dict:
            fout.write("{}\t".format(basename))
        for field in _SUMMARY_FIELDS:
            fout.write("\n{}\t".format(field))
            for counts in summary_dict.values():
                cell = _format_cell(counts.get(field), counts.get("Total_read"))
                fout.write("{}\t".format(cell))
        fout.write("\n")
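Indicate the directory containing the output files of FastqSweeper. Each call below writes one summary report, named after its `out_prefix`, in the current working directory.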
In [14]:
# One summary per MAPQ threshold: (FastqSweeper output directory, output file prefix)
report_runs = [
    ("./clean_250k_MAPQ0/", "MapQ0"),
    ("./clean_250k_MAPQ10/", "MapQ10"),
    ("./clean_250k_MAPQ20/", "MapQ20"),
]
for run_directory, run_prefix in report_runs:
    summarize_reports(run_directory, out_prefix=run_prefix)