Amanda Birmingham, CCBB, UCSD (abirmingham@ucsd.edu)
To run this notebook reproducibly, follow these steps:
In [ ]:
# Run parameters for this notebook; the cells below read these module-level values.

# Processor count passed to ns_parallel.parallel_process_paired_reads.
g_num_processors = 3
# Directory passed (together with the filtered-file suffix) to the parallel fastq driver;
# presumably where the filtered fastq pairs live -- confirm against ccbb_pyutils.
g_filtered_fastqs_dir = '~/dual_crispr/test_data/test_set_3'
# Library file path; reaches count_constructs_for_one_fastq_pair as constructs_fp, where
# construct and gRNA information is extracted from it.
g_library_fp = '~/dual_crispr/library_definitions/test_library.txt'
# Reaches count_constructs_for_one_fastq_pair as seq_len: the length used when trimming the
# gRNA probe sequences and when building the gRNA position matcher.
g_len_of_seq_to_match = 19
# Mismatch allowance; the same value is applied to both the forward and the reverse read.
g_num_allowed_mismatches = 1
# Run prefix used as a part of output file names. It is passed through ns_runs.check_or_set
# with a freshly generated prefix as the alternative -- presumably an empty string here means
# "generate one" (TODO confirm in ccbb_pyutils.analysis_run_prefixes).
g_fastq_counts_run_prefix = ''
# Directory that receives the counts files. It is passed through ns_runs.check_or_set with
# g_filtered_fastqs_dir as the alternative value.
g_fastq_counts_dir = '~/dual_crispr/test_outputs/test_set_3'
In [ ]:
import inspect
import ccbb_pyutils.analysis_run_prefixes as ns_runs
import ccbb_pyutils.files_and_paths as ns_files
import ccbb_pyutils.notebook_logging as ns_logs
def describe_var_list(input_var_name_list):
    """Return a printable "name: value" listing for the named module-level variables.

    Each name is looked up directly in the global namespace of the module that
    defines this function (the notebook namespace when run as a notebook cell).
    A plain dictionary lookup is used instead of eval() so that the supplied
    strings are only ever treated as variable names and never executed as code.

    Args:
        input_var_name_list: iterable of strings, each the name of a global variable.

    Returns:
        A single string holding one "name: value" line (newline-terminated) per
        input name, in input order; the empty string for an empty input.

    Raises:
        NameError: if a name is not defined in the global namespace.
    """
    namespace = globals()
    description_list = []
    for name in input_var_name_list:
        try:
            value = namespace[name]
        except KeyError:
            # Keep the exception type callers would have seen for an undefined name.
            raise NameError("name '{0}' is not defined".format(name)) from None
        description_list.append("{0}: {1}\n".format(name, value))
    return "".join(description_list)
# Set up logging for this notebook via ccbb_pyutils; presumably routes INFO-level log
# messages to stdout so they appear in cell output (TODO confirm in notebook_logging).
ns_logs.set_stdout_info_logger()
In [ ]:
# Normalize the user-supplied parameters in place. Statement order matters here: the
# counts directory falls back on the already-expanded fastq directory.
g_filtered_fastqs_dir = ns_files.expand_path(g_filtered_fastqs_dir)
g_library_fp = ns_files.expand_path(g_library_fp)
# check_or_set receives the user value plus an alternative; presumably the alternative is
# used when the user value is empty (TODO confirm in ccbb_pyutils.analysis_run_prefixes).
g_fastq_counts_run_prefix = ns_runs.check_or_set(g_fastq_counts_run_prefix, ns_runs.generate_run_prefix())
g_fastq_counts_dir = ns_files.expand_path(ns_runs.check_or_set(g_fastq_counts_dir, g_filtered_fastqs_dir))
# Echo the resolved values; describe_var_list takes the variable *names* as strings.
print(describe_var_list(['g_filtered_fastqs_dir', 'g_library_fp','g_fastq_counts_run_prefix', 'g_fastq_counts_dir']))
# Presumably checks that the output directory exists and creates it if not (TODO confirm).
ns_files.verify_or_make_dir(g_fastq_counts_dir)
In [ ]:
# Print the source code of get_filtered_file_suffix (only that one function, not the whole
# module); it is the function later used to obtain the suffix of the input fastq files.
import dual_crispr.count_filterer as ns_filter
print(inspect.getsource(ns_filter.get_filtered_file_suffix))
In [ ]:
# Print the full source of the construct-file extractor module, which supplies
# extract_construct_and_grna_info and trim_probes to the counting function below.
import dual_crispr.construct_file_extracter as ns_extractor
print(inspect.getsource(ns_extractor))
In [ ]:
# Print the full source of the gRNA position matcher module, which supplies the
# GrnaPositionMatcher class used by the counting function below.
import dual_crispr.grna_position_matcher as ns_matcher
print(inspect.getsource(ns_matcher))
In [ ]:
# Print the full source of the construct counter module, which supplies
# get_counts_file_suffix and generate_construct_counts to the counting function below.
import dual_crispr.construct_counter as ns_counter
print(inspect.getsource(ns_counter))
In [ ]:
def count_constructs_for_one_fastq_pair(curr_base, run_prefix, seq_len, num_allowed_mismatches, constructs_fp,
                                        output_dir, fw_fastq_fp, rv_fastq_fp):
    """Generate the construct counts for a single forward/reverse fastq pair.

    Reads the construct and gRNA definitions from the library file, trims the gRNA
    sequences to seq_len, builds a gRNA position matcher from them, and hands that
    matcher plus both fastq paths to ns_counter.generate_construct_counts.

    Args:
        curr_base: first part of the output file name (presumably the base name
            identifying this fastq pair -- confirm against the parallel driver).
        run_prefix: second part of the output file name.
        seq_len: length passed to trim_probes and to the matcher.
        num_allowed_mismatches: mismatch allowance; supplied to the matcher twice,
            once for the forward read and once for the reverse read.
        constructs_fp: path of the library file defining constructs and gRNAs.
        output_dir: directory in which the output file path is built.
        fw_fastq_fp: path of the forward-read fastq file.
        rv_fastq_fp: path of the reverse-read fastq file.

    Returns:
        None; the output path is handed to ns_counter.generate_construct_counts.
    """
    library_info = ns_extractor.extract_construct_and_grna_info(constructs_fp)
    construct_names, full_length_grna_pairs = library_info
    grna_pairs_for_matching = ns_extractor.trim_probes(full_length_grna_pairs, seq_len)

    # The forward-read and reverse-read mismatch allowances are separate matcher
    # arguments; today both receive num_allowed_mismatches, but they may be given
    # different values if that is ever wanted.
    position_matcher = ns_matcher.GrnaPositionMatcher(
        grna_pairs_for_matching,
        seq_len,
        num_allowed_mismatches,
        num_allowed_mismatches,
    )

    counts_fp_parts = [curr_base, run_prefix, ns_counter.get_counts_file_suffix()]
    counts_fp = ns_files.build_multipart_fp(output_dir, counts_fp_parts)
    ns_counter.generate_construct_counts(position_matcher, construct_names, counts_fp, fw_fastq_fp, rv_fastq_fp)
In [ ]:
# Run count_constructs_for_one_fastq_pair over the fastq files in g_filtered_fastqs_dir that
# carry the filtered-file suffix, using g_num_processors processors.
import ccbb_pyutils.parallel_process_fastqs as ns_parallel
g_parallel_results = ns_parallel.parallel_process_paired_reads(g_filtered_fastqs_dir,
ns_filter.get_filtered_file_suffix(), g_num_processors, count_constructs_for_one_fastq_pair,
# This list lines up with the function's run_prefix, seq_len, num_allowed_mismatches,
# constructs_fp and output_dir parameters; presumably the driver itself supplies curr_base
# and the forward/reverse fastq paths (TODO confirm in ccbb_pyutils).
[g_fastq_counts_run_prefix, g_len_of_seq_to_match, g_num_allowed_mismatches, g_library_fp,
# NOTE(review): the meaning of the final positional True is defined by
# parallel_process_paired_reads and is not visible here -- confirm.
g_fastq_counts_dir], True)
In [ ]:
# Print the combined output of the parallel run stored in g_parallel_results.
print(ns_parallel.concatenate_parallel_results(g_parallel_results))
In [ ]:
# Final sanity check: look in the output directory for files matching this run's prefix and
# the counts-file suffix, and print the result. The message below is supplied as the
# failure text (presumably reported when no such file is found -- confirm in ccbb_pyutils).
print(ns_files.check_file_presence(g_fastq_counts_dir, g_fastq_counts_run_prefix, ns_counter.get_counts_file_suffix(),
check_failure_msg="Construct counting failed to produce count file(s)."))