Notebook to summarize reports generated by FastqSweeper

Main code below


In [13]:
import glob
import os
from collections import OrderedDict
from pprint import pprint

def _parse_count(token):
    """Convert a count token such as "1,234" (thousands separators allowed) to an int."""
    return int(token.replace(",", ""))


def _format_cell(count, total):
    """Render one table cell as "<count> (<percent>%)".

    A missing count (None) is rendered as "NA". When the total is missing or
    zero the percentage is undefined and is rendered as "NA" instead of
    raising ZeroDivisionError.
    """
    if count is None:
        return "NA"
    if total:
        # float() keeps true division under Python 2 as well
        percent = round(float(count) / total * 100, 2)
    else:
        percent = "NA"
    return "{} ({}%)".format(count, percent)


def summarize_reports(directory="./", out_prefix=""):
    """Summarize the cutadapt and FastqSweeper reports found in a directory.

    Every file matching "*_FastqSweeper_report.csv" in `directory` defines one
    sample; the matching "<sample>_trim_report.txt" file is read from the same
    directory. For each sample the total, trimmed, mapped and unmapped read
    counts are collected, then written as a tab-separated table (one column
    per sample, one row per count) where each cell is "<count> (<percent of
    Total_read>%)".

    Parameters
    ----------
    directory : str
        Directory containing the reports, with or without a trailing path
        separator.
    out_prefix : str
        If non-empty, the table is written to "<out_prefix>_summary_report.csv",
        otherwise to "summary_report.csv".

    Returns
    -------
    None. The only effect is the file written to disk.

    Raises
    ------
    IOError / OSError
        If a trimming report is missing or the output file cannot be written.
    ValueError
        If a count field of a report is not an integer.

    Notes
    -----
    Samples are processed in sorted file-name order so that the column order
    is reproducible. A count that is absent from a report is written as "NA".
    """
    report_suffix = "_FastqSweeper_report.csv"
    fields = ["Total_read", "Trimmed_read", "Mapped_read", "Unmapped_read"]

    # Init a dict to store values that will be obtained for all samples
    summary_dict = OrderedDict()

    # Iterate on files based on FastqSweeper_report; glob.glob returns files in
    # arbitrary order, so sort them to get a stable column order
    pattern = os.path.join(directory, "*" + report_suffix)
    for FastqSweeper_report in sorted(glob.glob(pattern)):

        # Extract the base name and generate the name of the trimming report
        sample_path = FastqSweeper_report[:-len(report_suffix)]
        Cutadapt_report = sample_path + "_trim_report.txt"
        basename = os.path.basename(sample_path)

        # Create a dict entry for the ongoing sample
        counts = summary_dict[basename] = OrderedDict()

        # Parse fields of cutadapt trimming report
        with open(Cutadapt_report, "r") as fin:
            for line in fin:
                if line.startswith("Total reads processed:"):
                    counts["Total_read"] = _parse_count(line.split()[-1])
                elif line.startswith("Reads written (passing filters):"):
                    # The last token is the percentage, the count comes just before it
                    counts["Trimmed_read"] = _parse_count(line.split()[-2])

        # Parse fields of FastqSweeper report generated after bwa alignment
        with open(FastqSweeper_report, "r") as fin:
            for line in fin:
                if line.startswith("  Mapped"):
                    counts["Mapped_read"] = _parse_count(line.split()[-1])
                elif line.startswith("  Unmapped"):
                    counts["Unmapped_read"] = _parse_count(line.split()[-1])

    # Build the table: a header row with sample names, then one row per field
    rows = [[" "] + list(summary_dict.keys())]
    for entry in fields:
        cells = [_format_cell(c.get(entry), c.get("Total_read")) for c in summary_dict.values()]
        rows.append([entry] + cells)

    # Write stored data in a csv file (tab separated, each row ends with a tab
    # to stay compatible with the historical layout)
    out_name = out_prefix+"_summary_report.csv" if out_prefix else "summary_report.csv"
    with open(out_name, "w") as fout:
        for row in rows:
            fout.write("\t".join(row) + "\t\n")

Specify the directory containing the FastqSweeper output files (one call per directory to summarize)


In [14]:
# One (input directory, output prefix) pair per MAPQ threshold to summarize
mapq_runs = [
    ("./clean_250k_MAPQ0/", "MapQ0"),
    ("./clean_250k_MAPQ10/", "MapQ10"),
    ("./clean_250k_MAPQ20/", "MapQ20"),
]
for report_dir, prefix in mapq_runs:
    summarize_reports(report_dir, out_prefix=prefix)