In [1]:
import sys
import re
import subprocess
from bwa_mem_pe import *
In [2]:
def define_run_params(run_info,parameters):
# hash for storing run specific information
from collections import defaultdict
run_params = defaultdict(str)
run_info_list = run_info.split(":")
run_params['plat'] = run_info_list[1]
run_params['run_id'] = run_info_list[0]
run_params['vial'] = run_info_list[2]
run_params['rep'] = run_info_list[3]
fastq_root = parameters['root_dir'] + parameters['fastq_dir'] + \
run_info_list[1] + '/'+ 'fastq'+'/'
if run_info_list[1] == "MiSeq":
run_params['fastq1'] = fastq_root + run_info_list[0] + "_1.fastq"
run_params['fastq2'] = fastq_root + run_info_list[0] + "_2.fastq"
else:
run_params['fastq1'] = fastq_root + run_info_list[0] + ".fastq"
run_params['fastq2'] = None
run_params['out_dir'] = parameters['root_dir'] + parameters['analysis_out_dir']
run_params['sam'] = run_params['out_dir'] + "/tmp/" + run_params['run_id'] + ".sam"
run_params['bam'] = run_params['out_dir'] + "/tmp/" + run_params['run_id'] + ".bam"
run_params['sorted_bam'] = run_params['out_dir'] + "/"+ run_params['run_id'] + ".bam"
run_params['read_group'] = "@RG\tID:%s\tCN:%s\tLB:%s\tPL:%s\tSM:%s" %(run_params['run_id'],
parameters['center'],
run_params['vial']+"-"+run_params['rep'],
run_params['plat'],
run_params['run_id'])
return run_params
In [3]:
def run_bwa_mem_pipeline(run_params,bwa_params):
## creating file with run parameters
run_log_file = open(run_params['out_dir']+"/" + run_params['run_id'] +"-run_parameters.txt", 'w')
run_log_file.write("Parameter\tValue\n")
for i in run_params.keys():
run_log_file.write("%s\t%s\n" % (i, run_params[i]))
for i in bwa_params.keys():
run_log_file.write("%s\t%s\n" % (i, bwa_params[i]))
run_log_file.close()
## running bwa_mem
bam_map_bwa_mem(ref = bwa_params['root_dir'] + bwa_params['ref'],
fq1 = run_params['fastq1'],
fq2 =run_params['fastq2'],
out_sam = run_params['sam'],
out_dir = run_params['out_dir'],
read_group = run_params['read_group'])
sam_to_bam(in_sam = run_params['sam'],
out_bam = run_params['bam'],
out_dir = run_params['out_dir'])
bam_sort(in_bam = run_params['bam'],
out_sort = run_params['sorted_bam'],
out_dir = run_params['out_dir'])
bam_index(out_dir = run_params['out_dir'], bam = run_params['sorted_bam'])
In [4]:
def read_dat(filename):
#process input file with configuration information
from collections import defaultdict
parameters = defaultdict(str)
with open(filename,'r') as f:
for line in f:
param = line.strip().split("=")
parameters[param[0]] = param[1]
return parameters
In [5]:
def main(filename):
#read run parameters from input file and process using pathoscope
parameters = read_dat(filename)
# creating temp directory
subprocess.call(['mkdir','-p',parameters['root_dir']+ parameters['analysis_out_dir']+"/tmp/"])
# indexing reference
bwa_index_ref(parameters['root_dir'] + parameters['ref'],parameters['root_dir']+parameters['analysis_out_dir'])
for i in parameters['datasets'].split(","):
run_params = define_run_params(i,parameters)
run_bwa_mem_pipeline(run_params, parameters)
In [7]:
main("bwa_mem_pipeline_params_test.txt")
In [ ]:
if __name__ == '__main__':
main(sys.argv[1])