Pipeline base level sequence sequence analysis

starting with bam files

Process

  1. remove duplicates
  2. realign around indels
  3. samtools - forcing calls at all positions

In [1]:
import sys
import os
import time
import subprocess
from bwa_mem_pe import *

In [2]:
def bam_group_sort(in_bam, bam_root, out_dir):
    ''' Sorting bam'''
    print "Sorting bam ..."
    
    # prep files
    log_file = open(out_dir + "/bwa_group_sort"+ time.strftime("-%Y-%m-%d-%H-%M-%S.log"),'w')
    stderr_file = open(out_dir + "/bwa_group_sort"+ time.strftime("-%Y-%m-%d-%H-%M-%S.stder"),'w')
    
    # run command
    bam_group_sort_command = ["samtools", "sort", "-n", "-O", "bam", "-o", bam_group_sort_file, "-T", out_dir, in_bam]
    subprocess.call(bam_group_sort_command, stdout=log_file,stderr=stderr_file) 
    log_file.close(); stderr_file.close()

In [3]:
def bam_fixmate(in_bam,bam_fix,out_dir):
    '''Fix mate pairs'''
    print "Fixing mate pairs ..."
    
    ## log files for standard out and error
    #out_file = open(bam_fix,'w')
    log_file = open(out_dir + "/bwa_fixmate"+ time.strftime("-%Y-%m-%d-%H-%M-%S.log"),'w')
    stderr_file = open(out_dir + "/bwa_fixmate"+ time.strftime("-%Y-%m-%d-%H-%M-%S.stder"),'w')
    
    # run command
    fixmate_command = ["samtools","fixmate",in_bam,bam_fix]
    subprocess.call(fixmate_command, stderr=stderr_file, stdout=log_file)  
    log_file.close(); stderr_file.close()

In [4]:
def bam_realign(in_bam, ref, intervals, bam_realign_file, out_dir):
    ''' Indel relignment'''
    print "Realignment Around Indels ..."
    
    # prep files
    log_file = open(out_dir + "/bwa_realign"+ time.strftime("-%Y-%m-%d-%H-%M-%S.log"),'w')
    stderr_file = open(out_dir + "/bwa_realign"+ time.strftime("-%Y-%m-%d-%H-%M-%S.stder"),'w')
    
    # run commands
    GATK_command = ["java","-jar","-Xmx2g","/notebooks/utilities/GenomeAnalysisTK.jar"]
    realigner_target_command = GATK_command + ["-T","RealignerTargetCreator", "-R",ref,"-I",in_bam, "-o", intervals]
    subprocess.call(realigner_target_command,stdout=log_file,stderr=stderr_file)
    
    realigner_command = GATK_command + ["-T","IndelRealigner", "-R", ref,"-I",in_bam,
                                "-targetIntervals", intervals, "-o", bam_realign_file]
    subprocess.call(realigner_command, stdout=log_file,stderr=stderr_file)
    log_file.close(); stderr_file.close()

In [5]:
def bam_markdup(in_bam, bam_markdup_file, metrics_file, out_dir):
    ''' Mark duplicates '''
    print "Marking Duplicates ..."
    
    # prep files
    log_file = open(out_dir + "/bwa_markdup"+ time.strftime("-%Y-%m-%d-%H-%M-%S.log"),'w')
    stderr_file = open(out_dir + "/bwa_markdup"+ time.strftime("-%Y-%m-%d-%H-%M-%S.stder"),'w')
    
    # run command
    markdup_command = ["java","-Xmx2g","-jar","/usr/local/bin/MarkDuplicates.jar","VALIDATION_STRINGENCY=LENIENT",
                        ("INPUT=%s" % (in_bam)),("METRICS_FILE=%s" % (metrics_file)),("OUTPUT=%s" % (bam_markdup_file))]
    subprocess.call(markdup_command, stdout=log_file,stderr=stderr_file)
    log_file.close(); stderr_file.close()

In [6]:
def bam_merge(bam_list, bam_merged_file, outdir):
    ''' Merge list of bams into a single bam file'''
    print "Merging bams"
    
    # prep files
    log_file = open(out_dir + "/merge_bams"+ time.strftime("-%Y-%m-%d-%H-%M-%S.log"),'w')
    stderr_file = open(out_dir + "/merge_bams"+ time.strftime("-%Y-%m-%d-%H-%M-%S.stder"),'w')
    
    # run command
    merge_bam_command = ["samtools","merge", "-b", bam_list, merged_bam_file]
    subprocess.call(merge_bam_command, stdout=log_file,stderr=stderr_file)
    log_file.close(); stderr_file.close()

In [7]:
def genome_calls_mpileup(bams,ref, vcf_file, out_dir):
    ''' Takes a list of bam files and refernece genome then
        performs base level sequence analysis
    '''
    print "Running mpileup ..."
    
    # prep files
    vcf_file = open(vcf_file,'w')
    stderr_file = open(out_dir + "/mpileup"+ time.strftime("-%Y-%m-%d-%H-%M-%S.stder"),'w')
    
    # run command
    mpileup_command = ["samtools","mpileup", "-uv", "-t", "DP", "-t", "DV",
                     "-t", "DPR", "-t", "SP", "-t", "DP4","-f", ref] + bams
    subprocess.call(mpileup_command,stdout=vcf_file,stderr=stderr_file)
    vcf_file.close(); stderr_file.close()

In [8]:
ref='/notebooks/utilities/resources/exampleFASTA.fasta'
bam='/notebooks/utilities/resources/exampleBAM.bam'

bam_split = bam.split("/")[-1]
ref_split = ref.split("/")[-1]
bam_root = ref_split.replace(".fasta","") + "_" + bam_split.replace(".bam","")
bam_group_sort_file = bam_root + "_group_sort.bam"
bam_sort_file = bam_root + "_sort.bam"
bam_fix_file = bam_root + "_fix.bam"
intervals = bam_root + ".intervals"
bam_realign_file = bam_root + "_realign.bam" 
bam_markdup_file = bam_root + "_mrkdup.bam"
metrics_file = bam_root + "_mrkdup.metrics"
vcf_file = bam_root + ".vcf"

out_dir='/notebooks/dev/genome_purity'
%mkdir /notebooks/dev/genome_purity

In [9]:
bam_group_sort(bam, bam_group_sort_file, out_dir)
bam_fixmate(bam_group_sort_file,bam_fix_file,out_dir)
bam_sort(bam_fix_file,bam_sort_file, out_dir)
bam_index(out_dir, bam_sort_file)
bam_realign(bam_sort_file, ref,intervals, bam_realign_file, out_dir)
bam_markdup(bam_realign_file, bam_markdup_file,metrics_file, out_dir)
bam_index(out_dir,bam_markdup_file)

## TODO
# add merging bams and realign around indels for merged bams
genome_calls_mpileup([bam_markdup_file],ref,vcf_file,out_dir)


Sorting bam ...
Fixing mate pairs ...
Realignment Around Indels ...
Marking Duplicates ...
Running mpileup ...

In [8]:
ls -lh


total 116K
-rw------- 1 1000 staff 4.6K Dec 12  2014 bwa_mem_pe.ipynb
-rw-r----- 1 1000 staff 2.7K Dec 12  2014 bwa_mem_pe.py
-rw-r----- 1 1000 staff 2.5K Dec 12  2014 bwa_mem_pe.pyc
-rw-rw-r-- 1 1000 staff  200 Dec 12  2014 bwa_mem_pipeline_params_test.txt
-rw------- 1 1000 staff 5.2K Jan 12  2015 consensus_base_functions.ipynb
-rw-r--r-- 1 1000 staff 3.7K Jan 12  2015 exampleFASTA_exampleBAM_fix.bam
-rw-r--r-- 1 1000 staff 3.6K Jan 12  2015 exampleFASTA_exampleBAM_group_sort.bam
-rw-r--r-- 1 1000 staff    0 Jan 12  2015 exampleFASTA_exampleBAM.intervals
-rw-r--r-- 1 1000 staff 4.5K Jan 12  2015 exampleFASTA_exampleBAM_mrkdup.bam
-rw-r--r-- 1 1000 staff  232 Jan 12  2015 exampleFASTA_exampleBAM_mrkdup.bam.bai
-rw-r--r-- 1 1000 staff 1.1K Jan 12  2015 exampleFASTA_exampleBAM_mrkdup.metrics
-rw-r--r-- 1 1000 staff  232 Jan 12  2015 exampleFASTA_exampleBAM_realign.bai
-rw-r--r-- 1 1000 staff 4.1K Jan 12  2015 exampleFASTA_exampleBAM_realign.bam
-rw-r--r-- 1 1000 staff 3.7K Jan 12  2015 exampleFASTA_exampleBAM_sort.bam
-rw-r--r-- 1 1000 staff  232 Jan 12  2015 exampleFASTA_exampleBAM_sort.bam.bai
drwxr-xr-x 1 1000 staff  510 Jan 12  2015 genome_purity/
-rw-r--r-- 1 1000 staff 4.4K Nov 22 19:59 notebook_2014_11_14.ipynb
-rwxr-xr-x 1 1000 staff 1.2K Nov 22 19:59 notebook_2014_11_16.ipynb*
-rw------- 1 1000 staff 3.5K Jan  9  2015 notebook_2014_12_12.ipynb
drwxr-xr-x 1 1000 staff  102 Nov 22 19:59 old_scripts/
drwxr-xr-x 1 1000 staff  102 Dec 12  2014 __pycache__/
-rw------- 1 1000 staff 6.3K Dec 12  2014 run_bwa_mem_pe.ipynb
-rw------- 1 1000 staff  969 Jan 12  2015 run_consensus_base_pipeline.ipynb
-rw-r--r-- 1 1000 staff  13K Jan 12  2015 sequence_purity_pipeline.ipynb

In [17]:
rm -r example* genome_purity

In [5]:
def prep_bam_for_variant_calling(bam, ref, known_vcf):
    ''' Function for generating variant calls from bam files, 
        adapted from  
        http://www.htslib.org/workflow/#mapping_to_variant
    '''
    import subprocess
    bam_split = bam.split("/")[-1]
    ref_split = ref.split("/")[-1]
    bam_root = ref_split.replace(".fasta","") + "_" + bam_split.replace(".bam","")
    
    ##%%## Need to work out logging code
    log_file = open(bam_root + ".log",'w')
    log_file.write("Fix mate pairs for: %s" % (bam))

In [6]:
def main():
    ref=sys.argv[1]
    known_vcf=sys.argv[2]
    vcf_filename=sys.argv[3]
    # processing multiple bams
    processed_bams = []
    for i in sys.argv[4:]:
        processed_bams.append(prep_bam_for_variant_calling(i, ref,known_vcf))
    genome_calls_mpileup(processed_bams,ref, vcf_filename)
# Need to check also incorporate command line parsing
# def __main__ if name == "":
#     main()

Old functions


In [2]:
def log_stderr_files(out_dir,command):
    ''' Create log and standard error files'''
    log_file = open(out_dir + command + time.strftime("-%Y-%m-%d-%H-%M-%S.log"),'w')
    err_file = open(out_dir + command + time.strftime("-%Y-%m-%d-%H-%M-%S.err"),'w')
    
    return [log_file,err_file]