To run this notebook reproducibly, follow these steps:
In [ ]:
# Run configuration for this notebook; edit these values for each sequencing run.

# Processor count passed to parallel_process_paired_reads for the trimming step.
g_num_processors = 3
# Directory holding the raw fastq(.gz) files for the run (searched recursively before unzipping).
g_fastqs_dir = '/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/data/raw/20160504_D00611_0275_AHMM2JBCXX'
# Directory that receives the trimmed fastq files (passed to the trimming code as its output dir).
g_trimmed_fastqs_dir = '/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/data/interim/20160504_D00611_0275_AHMM2JBCXX'
# Scaffold sequences to trim: 5' (5p) and 3' (3p) sequences for the forward (r1) and reverse (r2) reads.
g_full_5p_r1 = 'TATATATCTTGTGGAAAGGACGAAACACCG'
g_full_5p_r2 = 'CCTTATTTTAACTTGCTATTTCTAGCTCTAAAAC'
g_full_3p_r1 = 'GTTTCAGAGCTATGCTGGAAACTGCATAGCAAGTTGAAATAAGGCTAGTCCGTTATCAACTTGAAAAAGTGGCACCGAGTCGGTGCTTTTTTGTACTGAG'
g_full_3p_r2 = 'CAAACAAGGCTTTTCTCCAAGGGATATTTATAGTCTCAAAACACACAATTACTTTACAGTTAGGGTGAGTTTCCTTTTGTGCTGTTTTTTAAAATA'
# Directory appended to sys.path so that the ccbbucsd package can be imported.
g_code_location = '/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python'
In [ ]:
import sys
# Put the project source directory on the import path; the ccbbucsd imports in later cells rely on it.
sys.path.append(g_code_location)
In [ ]:
# %load -s describe_var_list /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/utilities/analysis_run_prefixes.py
def describe_var_list(input_var_name_list):
    """Describe each named variable as a "name: value" line.

    Args:
        input_var_name_list: iterable of strings; each one is evaluated with
            eval(), so it must be an expression (normally a variable name)
            that resolves from where this function is defined.

    Returns:
        str: one newline-terminated "name: value" line per entry, joined
        together in input order; an empty string for an empty input.
    """
    # NOTE: eval() executes whatever string it is handed -- only pass
    # trusted, hard-coded names to this function.
    return "".join(
        "{0}: {1}\n".format(name, eval(name)) for name in input_var_name_list
    )
In [ ]:
from ccbbucsd.utilities.analysis_run_prefixes import check_or_set, get_run_prefix, get_timestamp
# Pass the configured trimmed-fastq directory through check_or_set together with the raw-fastq
# directory, then echo the value that will actually be used.
# NOTE(review): presumably check_or_set falls back to g_fastqs_dir when no trimmed dir is set -- confirm in analysis_run_prefixes.
g_trimmed_fastqs_dir = check_or_set(g_trimmed_fastqs_dir, g_fastqs_dir)
print(describe_var_list(['g_trimmed_fastqs_dir']))
In [ ]:
from ccbbucsd.utilities.files_and_paths import verify_or_make_dir
# NOTE(review): going by its name, this checks that the trimmed-output directory exists and creates it if not -- confirm in files_and_paths.
verify_or_make_dir(g_trimmed_fastqs_dir)
In [ ]:
from ccbbucsd.utilities.notebook_logging import set_stdout_info_logger
# NOTE(review): presumably sends INFO-level log messages to stdout so they appear in the notebook output -- confirm in notebook_logging.
set_stdout_info_logger()
In [ ]:
# %load /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/malicrispr/scaffold_trim.py
# standard libraries
import enum
# third-party libraries
import cutadapt.scripts.cutadapt
# ccbb libraries
from ccbbucsd.utilities.files_and_paths import get_file_name_pieces, make_file_path
__author__ = 'Amanda Birmingham'
__maintainer__ = "Amanda Birmingham"
__email__ = "abirmingham@ucsd.edu"
__status__ = "prototype"
class TrimType(enum.Enum):
    """Which end(s) of a read a trimming pass removes scaffold from.

    Each member's value is the short tag that get_trimmed_suffix embeds in
    the name of the trimmed output file.
    """

    FIVE = "5"  # 5' end only
    THREE = "3"  # 3' end only
    FIVE_THREE = "53"  # both ends in one pass
def get_trimmed_suffix(trimtype):
    """Build the file-name suffix given to fastq files produced by a trim.

    Args:
        trimtype: object whose ``value`` attribute is the tag to embed
            (a TrimType member in this file).

    Returns:
        str: "_trimmed<tag>.fastq", e.g. "_trimmed53.fastq".
    """
    end_tag = trimtype.value
    return "_trimmed{0}.fastq".format(end_tag)
def trim_linked_scaffold(output_dir, fastq_fp, scaffold_seq_5p, scaffold_seq_3p, quiet=True):
    """Trim 5' and 3' scaffold from one fastq file in a single cutadapt run.

    The two sequences are joined as "<5p>...<3p>" and handed to cutadapt's
    -a switch.

    Args:
        output_dir: directory in which the trimmed fastq is written.
        fastq_fp: path of the fastq file to trim.
        scaffold_seq_5p: scaffold sequence expected at the 5' end.
        scaffold_seq_3p: scaffold sequence expected at the 3' end.
        quiet: when true, "--quiet" is added to the cutadapt arguments.

    Returns:
        Path of the trimmed fastq file, as returned by _run_cutadapt.
    """
    linked_adapter = "{0}...{1}".format(scaffold_seq_5p, scaffold_seq_3p)
    cutadapt_args = ["-a", linked_adapter]
    return _run_cutadapt(output_dir, fastq_fp, TrimType.FIVE_THREE, cutadapt_args, quiet)
def trim_global_scaffold(output_dir, fastq_fp, scaffold_seq_5p=None, scaffold_seq_3p=None, quiet=True):
    """Trim scaffold from a fastq file one end at a time.

    A separate cutadapt pass is run for each scaffold sequence that is not
    None: first the 5' sequence, then the 3' sequence. The second pass reads
    the file written by the first.

    Args:
        output_dir: directory in which trimmed fastq files are written.
        fastq_fp: path of the fastq file to trim.
        scaffold_seq_5p: 5' scaffold sequence, or None to skip that pass.
        scaffold_seq_3p: 3' scaffold sequence, or None to skip that pass.
        quiet: when true, "--quiet" is added to the cutadapt arguments.

    Returns:
        Path of the file produced by the last pass run, or fastq_fp itself
        when both sequences are None.
    """
    result_fp = fastq_fp
    # (sequence, is_5p) pairs in the order the passes must run
    trim_passes = ((scaffold_seq_5p, True), (scaffold_seq_3p, False))
    for scaffold_seq, is_5p in trim_passes:
        if scaffold_seq is None:
            continue
        result_fp = _run_cutadapt_global(output_dir, result_fp, scaffold_seq, is_5p, quiet)
    return result_fp
def _run_cutadapt_global(output_dir, input_fastq_fp, seq_to_trim, is_5p, quiet):
    """Run one single-end cutadapt pass for the given sequence.

    Args:
        output_dir: directory in which the trimmed fastq is written.
        input_fastq_fp: path of the fastq file to trim.
        seq_to_trim: sequence passed to cutadapt.
        is_5p: truthy to trim with "-g" (tagged TrimType.FIVE); falsy to
            trim with "-a" (tagged TrimType.THREE).
        quiet: when true, "--quiet" is added to the cutadapt arguments.

    Returns:
        Path of the trimmed fastq file, as returned by _run_cutadapt.
    """
    if is_5p:
        end_switch, end_name = "-g", TrimType.FIVE
    else:
        end_switch, end_name = "-a", TrimType.THREE
    return _run_cutadapt(output_dir, input_fastq_fp, end_name, [end_switch, seq_to_trim], quiet)
def _run_cutadapt(output_dir, input_fastq_fp, trim_name, partial_args, quiet):
    """Invoke cutadapt on one fastq file and return the output path.

    Args:
        output_dir: directory in which the trimmed fastq is written.
        input_fastq_fp: path of the fastq file to trim.
        trim_name: TrimType member used to build the output file suffix.
        partial_args: cutadapt arguments selecting what to trim; the list
            is copied, not modified.
        quiet: when true, "--quiet" is added to the cutadapt arguments.

    Returns:
        Path of the trimmed fastq file passed to cutadapt's -o switch.
    """
    # work on a copy so the caller's argument list is left untouched
    cutadapt_args = list(partial_args)
    if quiet:
        cutadapt_args.append("--quiet")

    # NOTE(review): the middle piece is taken to be the file's base name
    # (without directory or extension) -- confirm in files_and_paths.
    _, base_name, _ = get_file_name_pieces(input_fastq_fp)
    trimmed_fp = make_file_path(output_dir, base_name, get_trimmed_suffix(trim_name))

    cutadapt_args += ["-o", trimmed_fp, input_fastq_fp]
    cutadapt.scripts.cutadapt.main(cutadapt_args)
    return trimmed_fp
In [ ]:
def trim_fw_and_rv_reads(output_dir, full_5p_r1, full_3p_r1, full_5p_r2, full_3p_r2, fw_fastq_fp, rv_fastq_fp):
    """Trim linked scaffold from a forward/reverse pair of fastq files.

    The forward file is trimmed with the r1 scaffold sequences and the
    reverse file with the r2 sequences; both outputs go to output_dir.
    Nothing is returned.
    """
    per_read_inputs = (
        (fw_fastq_fp, full_5p_r1, full_3p_r1),
        (rv_fastq_fp, full_5p_r2, full_3p_r2),
    )
    for fastq_fp, seq_5p, seq_3p in per_read_inputs:
        trim_linked_scaffold(output_dir, fastq_fp, seq_5p, seq_3p)
In [ ]:
# File-name endings used in the cells below to find sequence files:
# ".fastq" for uncompressed files, and ".fastq" + ".gz" for gzipped ones.
g_seq_file_ext_name = ".fastq"
g_gzip_ext_name = ".gz"
In [ ]:
from ccbbucsd.utilities.files_and_paths import summarize_filenames_for_prefix_and_suffix
# Print a summary of the gzipped fastq files (names ending in ".fastq.gz") under the raw-data
# directory, with all_subdirs=True so that subdirectories are included.
print(summarize_filenames_for_prefix_and_suffix(g_fastqs_dir, "",
"{0}{1}".format(g_seq_file_ext_name, g_gzip_ext_name),
all_subdirs=True))
In [ ]:
from ccbbucsd.utilities.files_and_paths import gunzip_wildpath, move_to_dir_and_flatten
def unzip_and_flatten_seq_files(top_fastqs_dir, ext_name, gzip_ext_name, keep_gzs):
    """Gunzip every sequence file under a directory, then flatten the tree.

    Args:
        top_fastqs_dir: top-level directory to search and to move files into.
        ext_name: extension of the uncompressed files (e.g. ".fastq").
        gzip_ext_name: extra extension carried by gzipped files (e.g. ".gz").
        keep_gzs: passed through to gunzip_wildpath; whether to keep the
            gzipped files as well as the expanded ones.
    """
    gzipped_ext = ext_name + gzip_ext_name
    search_subdirs = True

    # step 1: expand all gzipped sequence files anywhere below the top dir
    gunzip_wildpath(top_fastqs_dir, gzipped_ext, keep_gzs, search_subdirs)

    # step 2: pull the expanded files up into the top dir itself so later
    # steps do not need to work recursively
    move_to_dir_and_flatten(top_fastqs_dir, top_fastqs_dir, ext_name)
In [ ]:
# Expand every ".fastq.gz" file under the raw-data directory and move the results into that directory.
# Last argument (keep_gzs): False = don't keep gzs as well as expanding, True = do keep them (True only works for gzip 1.6+)
unzip_and_flatten_seq_files(g_fastqs_dir, g_seq_file_ext_name, g_gzip_ext_name, False)
In [ ]:
# Print a summary of the uncompressed ".fastq" files in the raw-data directory, as a check on the unzip step.
print(summarize_filenames_for_prefix_and_suffix(g_fastqs_dir, "", g_seq_file_ext_name))
In [ ]:
from ccbbucsd.utilities.parallel_process_fastqs import parallel_process_paired_reads, concatenate_parallel_results
# Run trim_fw_and_rv_reads over the ".fastq" files in the raw-data directory using g_num_processors processors.
# The list holds the leading arguments of trim_fw_and_rv_reads: the output dir and the four scaffold sequences.
# NOTE(review): presumably parallel_process_paired_reads appends each forward/reverse fastq pair
# as the last two arguments -- confirm in parallel_process_fastqs.
g_parallel_results = parallel_process_paired_reads(g_fastqs_dir, g_seq_file_ext_name, g_num_processors,
trim_fw_and_rv_reads, [g_trimmed_fastqs_dir, g_full_5p_r1,
g_full_3p_r1, g_full_5p_r2, g_full_3p_r2])
In [ ]:
# Print the combined output of the parallel trimming run.
# NOTE(review): the format of the concatenated results is defined in parallel_process_fastqs -- not visible here.
print(concatenate_parallel_results(g_parallel_results))
In [ ]:
# Print a summary of the trimmed outputs: files in the trimmed directory whose names end with
# the suffix for TrimType.FIVE_THREE ("_trimmed53.fastq").
print(summarize_filenames_for_prefix_and_suffix(g_trimmed_fastqs_dir, "", get_trimmed_suffix(TrimType.FIVE_THREE)))