To run this notebook reproducibly, follow these steps:
In [ ]:
g_timestamp = ''
g_num_processors = 3
g_filtered_fastqs_dir = '/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/data/interim/20160504_D00611_0275_AHMM2JBCXX'
g_count_alg_name = '19mer_1mm_py'
g_constructs_fp = '/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/data/raw/general/Cancer_rep2_CRV4.txt'
g_col_indices_str = "1,3,5"
g_len_of_seq_to_match = 19
g_num_allowed_mismatches = 1
g_fastq_counts_dir = '/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/data/interim/20160504_D00611_0275_AHMM2JBCXX'
g_fastq_counts_run_prefix = '20160504_CRV4_plasmid_20160701134823'
g_code_location = '/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python'
In [ ]:
import sys
sys.path.append(g_code_location)
In [ ]:
# %load -s describe_var_list /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/utilities/analysis_run_prefixes.py
def describe_var_list(input_var_name_list):
description_list = ["{0}: {1}\n".format(name, eval(name)) for name in input_var_name_list]
return "".join(description_list)
In [ ]:
from ccbbucsd.utilities.analysis_run_prefixes import check_or_set, get_run_prefix, get_timestamp
g_timestamp = check_or_set(g_timestamp, get_timestamp())
g_fastq_counts_dir = check_or_set(g_fastq_counts_dir, g_filtered_fastqs_dir)
g_fastq_counts_run_prefix = check_or_set(g_fastq_counts_run_prefix,
get_run_prefix(None, g_count_alg_name, g_timestamp))
print(describe_var_list(['g_timestamp','g_fastq_counts_dir', 'g_fastq_counts_run_prefix']))
In [ ]:
from ccbbucsd.utilities.files_and_paths import verify_or_make_dir
verify_or_make_dir(g_fastq_counts_dir)
In [ ]:
from ccbbucsd.utilities.notebook_logging import set_stdout_info_logger
set_stdout_info_logger()
In [ ]:
# %load -s get_filtered_file_suffix /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/malicrispr/count_filterer.py
def get_filtered_file_suffix():
return "_len_filtered.fastq"
In [ ]:
# %load /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/malicrispr/construct_file_extracter.py
# third-party libraries
import pandas
# ccbb libraries
from ccbbucsd.utilities.bio_seq_utilities import trim_seq
__author__ = "Amanda Birmingham"
__maintainer__ = "Amanda Birmingham"
__email__ = "abirmingham@ucsd.edu"
__status__ = "prototype"
_CONSTRUCT_ID = "CONSTRUCT_ID"
_GENE_A_SEQ = "GENE_A_SEQ"
_GENE_B_SEQ = "GENE_B_SEQ"
_CONSTRUCT_A_NAME = "CONSTRUCT_A_NAME"
_CONSTRUCT_B_NAME = "CONSTRUCT_B_NAME"
def get_construct_separator():
return "__"
def extract_construct_and_grna_info(constructs_fp, column_indices):
construct_table = _read_in_construct_table(constructs_fp, column_indices, rows_to_skip=1)
seq_name_sets = _extract_grnas_from_construct_table(construct_table)
grna_name_seq_pairs = _format_and_check_grnas_input(seq_name_sets)
construct_names = construct_table[_CONSTRUCT_ID].unique().tolist()
return construct_names, grna_name_seq_pairs
def _read_in_construct_table(constructs_fp, column_indices, rows_to_skip=1):
result = pandas.read_table(constructs_fp, skiprows=rows_to_skip, header=None)
result = _rename_columns(result, column_indices)
return result
def _rename_columns(construct_table, column_indices):
new_names = [_CONSTRUCT_ID, _GENE_A_SEQ, _GENE_B_SEQ]
existing_names = list(construct_table.columns.values)
existing_to_new_names = {}
for curr_index in range(0, len(column_indices)):
curr_col_index = column_indices[curr_index]
curr_existing_name = existing_names[curr_col_index]
existing_to_new_names[curr_existing_name] = new_names[curr_index]
return construct_table.rename(columns=existing_to_new_names)
def _extract_grnas_from_construct_table(construct_table):
grna_name_key = "grna_name"
grna_seq_key = "grna_seq"
# split the construct id in each row into two pieces--the two construct names--and put them into new columns
construct_table[_CONSTRUCT_A_NAME], construct_table[_CONSTRUCT_B_NAME] = \
zip(*construct_table[_CONSTRUCT_ID].str.split(get_construct_separator()).tolist())
# get the gene/sequence pairs for each of the two genes and concatenate them
gene_a_pairs = _extract_grna_name_and_seq_df(construct_table, _CONSTRUCT_A_NAME,
_GENE_A_SEQ, grna_name_key, grna_seq_key)
gene_b_pairs = _extract_grna_name_and_seq_df(construct_table, _CONSTRUCT_B_NAME,
_GENE_B_SEQ, grna_name_key, grna_seq_key)
gene_pairs = pandas.concat([gene_a_pairs, gene_b_pairs])
gene_pairs[grna_seq_key] = gene_pairs[grna_seq_key].str.upper() # upper-case all gRNA sequences
# extract only the unique pairs
grna_seq_name_groups = gene_pairs.groupby([grna_seq_key, grna_name_key]).groups
result = [x for x in grna_seq_name_groups]
return sorted(result) # NB sort so that output order is predictable
def _extract_grna_name_and_seq_df(construct_table, name_key, seq_key, grna_name_key, grna_seq_key):
name_and_seq_df = construct_table[[name_key, seq_key]]
name_and_seq_df.rename(columns={name_key: grna_name_key, seq_key: grna_seq_key}, inplace=True)
return name_and_seq_df
def trim_grnas(grnas_name_and_seq_list, retain_len):
result = []
for name_seq_tuple in grnas_name_and_seq_list:
grna_name = name_seq_tuple[0]
full_seq = name_seq_tuple[1]
trimmed_seq = trim_seq(full_seq, retain_len, False) # False = do not retain from 5p end but from 3p end
result.append((grna_name, trimmed_seq))
return result
def _read_in_grnas(grnas_fp):
list_from_file = _read_grnas_input(grnas_fp)
return _format_and_check_grnas_input(list_from_file)
def _read_grnas_input(grnas_fp, comment_prefix="#", delimiter="\t"):
result = []
with open(grnas_fp, 'r') as file_handle:
for line in file_handle:
if not line.startswith(comment_prefix):
pieces = line.strip().split(delimiter)
result.append(pieces)
return result
def _format_and_check_grnas_input(grnas_seq_and_name_list):
expected_num_pieces = 2
seqs_by_names = {}
names_by_seqs = {}
result = []
for curr_set in grnas_seq_and_name_list:
if len(curr_set) != expected_num_pieces:
raise ValueError(
"input '{0}' has {1} pieces instead of the expected {2}".format(
curr_set, len(curr_set), expected_num_pieces
))
curr_seq = curr_set[0]
curr_name = curr_set[1]
if curr_seq in names_by_seqs:
raise ValueError(
"sequence '{0}' associated with name '{1}' but was already associated with name '{2}'".format(
curr_seq, curr_name, names_by_seqs[curr_seq]
))
if curr_name in seqs_by_names:
raise ValueError(
"name '{0}' associated with sequence '{1}' but was already associated with sequence '{2}'".format(
curr_name, curr_seq, seqs_by_names[curr_name]
))
names_by_seqs[curr_seq] = curr_name
seqs_by_names[curr_name] = curr_seq
result.append((curr_name, curr_seq))
# next pair in
return result
In [ ]:
# %load /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/malicrispr/grna_position_matcher.py
# ccbb libraries
from ccbbucsd.utilities.bio_seq_utilities import rev_comp_canonical_dna_seq
__author__ = "Amanda Birmingham"
__maintainer__ = "Amanda Birmingham"
__email__ = "abirmingham@ucsd.edu"
__status__ = "development"
class GrnaPositionMatcher:
@staticmethod
def _generate_seqs_to_check(fw_whole_seq, rv_whole_seq):
rc_whole_rv_seq = rev_comp_canonical_dna_seq(rv_whole_seq)
return fw_whole_seq, rc_whole_rv_seq
def __init__(self, grna_names_and_seqs, expected_len, num_allowed_fw_mismatches, num_allowed_rv_mismatches):
self._grna_names_and_seqs = grna_names_and_seqs
self._num_allowed_fw_mismatches = num_allowed_fw_mismatches
self._num_allowed_rv_mismatches = num_allowed_rv_mismatches
self._seq_len = expected_len
@property
def num_allowed_fw_mismatches(self):
return self._num_allowed_fw_mismatches
@property
def num_allowed_rv_mismatches(self):
return self._num_allowed_rv_mismatches
def find_fw_and_rv_read_matches(self, fw_whole_seq, rv_whole_seq):
fw_construct_window, rc_rv_construct_window = self._generate_seqs_to_check(fw_whole_seq, rv_whole_seq)
return self._id_pair_matches(fw_construct_window, rc_rv_construct_window)
def _id_pair_matches(self, input1_seq, input2_seq):
input2_match_name = None
input1_match_name = self._id_sequence_match(self.num_allowed_fw_mismatches, input1_seq)
if input1_match_name is not None:
input2_match_name = self._id_sequence_match(self.num_allowed_rv_mismatches, input2_seq)
return input1_match_name, input2_match_name
def _id_sequence_match(self, num_allowed_mismatches, input_seq):
found_name = None
# check for perfect matches
for potential_name, potential_reference in self._grna_names_and_seqs:
if input_seq == potential_reference:
found_name = potential_name
break
# if no perfect matches found, check for mismatches
if found_name is None:
min_found_mismatches = num_allowed_mismatches + 1 # nothing checked yet so num mismatches is maximum
found_name = None
for potential_name, potential_reference in self._grna_names_and_seqs:
# this way is not elegant, but it IS fast :) .... >3x faster than XOR approach
num_mismatches = 0
for x in range(0, self._seq_len):
if input_seq[x] != potential_reference[x]: # NB assumption that both are the same fixed length!
num_mismatches += 1
if num_mismatches > num_allowed_mismatches:
break
if num_mismatches < min_found_mismatches:
min_found_mismatches = num_mismatches
if num_mismatches <= num_allowed_mismatches:
found_name = potential_name
return found_name
In [ ]:
# %load /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/malicrispr/construct_counter.py
"""This module counts almost-perfect matches of small sequences within forward and reverse fastq sequence pairs."""
# standard libraries
import csv
import datetime
import logging
# ccbb libraries
from ccbbucsd.utilities.basic_fastq import FastqHandler, paired_fastq_generator
# project-specific libraries
from ccbbucsd.malicrispr.construct_file_extracter import get_construct_separator
__author__ = "Amanda Birmingham"
__maintainer__ = "Amanda Birmingham"
__email__ = "abirmingham@ucsd.edu"
__status__ = "development"
def get_counts_file_suffix():
return "counts.txt"
def get_counter_from_names(names_to_count):
return {x: 0 for x in names_to_count}
def generate_construct_counts(grna_matcher, construct_names, output_fp, fw_fastq_fp, rv_fastq_fp):
counts_info_tuple = _match_and_count_constructs_from_files(grna_matcher, construct_names, fw_fastq_fp, rv_fastq_fp)
counts_by_construct = counts_info_tuple[0]
counts_by_type = counts_info_tuple[1]
_write_counts(counts_by_construct, counts_by_type, output_fp)
def _match_and_count_constructs_from_files(grna_matcher, construct_names, fw_fastq_fp, rv_fastq_fp):
construct_counts = get_counter_from_names(construct_names)
fw_fastq_handler = FastqHandler(fw_fastq_fp)
rv_fastq_handler = FastqHandler(rv_fastq_fp)
return _match_and_count_constructs(grna_matcher, construct_counts, fw_fastq_handler, rv_fastq_handler)
def _match_and_count_constructs(grna_matcher, construct_counts, fw_fastq_handler, rv_fastq_handler):
summary_counts = {"num_pairs": 0, "num_pairs_unrecognized": 0, "num_constructs_found": 0,
"num_constructs_unrecognized": 0, "num_constructs_recognized": 0}
paired_fastq_seqs = paired_fastq_generator(fw_fastq_handler, rv_fastq_handler)
for curr_pair_seqs in paired_fastq_seqs:
summary_counts["num_pairs"] += 1
_report_progress(summary_counts["num_pairs"])
grna_name_A, grna_name_B = grna_matcher.find_fw_and_rv_read_matches(*curr_pair_seqs)
if grna_name_A is not None and grna_name_B is not None:
construct_name = _generate_construct_name(grna_name_A, grna_name_B)
summary_counts["num_constructs_found"] += 1
if construct_name in construct_counts:
summary_counts["num_constructs_recognized"] += 1
construct_counts[construct_name] += 1
else:
summary_counts["num_constructs_unrecognized"] += 1
logging.debug("Unrecognized construct name: {0}".format(construct_name))
else:
summary_counts["num_pairs_unrecognized"] += 1
logging.debug("Unrecognized sequence: {0},{1}".format(*curr_pair_seqs))
return construct_counts, summary_counts
def _report_progress(num_fastq_pairs):
if num_fastq_pairs % 100000 == 0:
logging.info("On fastq pair number {0} at {1}".format(num_fastq_pairs, datetime.datetime.now()))
def _generate_construct_name(grna_name_A, grna_name_B):
return "{0}{1}{2}".format(grna_name_A, get_construct_separator(), grna_name_B)
def _write_counts(counts_by_construct, counts_by_type, output_fp):
construct_names = list(counts_by_construct.keys())
construct_names.sort()
with open(output_fp, 'w') as file_handle:
summary_pieces = []
for curr_key, curr_value in counts_by_type.items():
summary_pieces.append("{0}:{1}".format(curr_key, curr_value))
summary_comment = ",".join(summary_pieces)
summary_comment = "# " + summary_comment
header = ["construct_id", "counts"]
writer = csv.writer(file_handle, delimiter="\t")
writer.writerow([summary_comment])
writer.writerow(header)
for curr_name in construct_names:
row = [curr_name, counts_by_construct[curr_name]]
writer.writerow(row)
In [ ]:
from ccbbucsd.utilities.files_and_paths import build_multipart_fp
def count_constructs_for_one_fastq_pair(curr_base, run_prefix, seq_len, num_allowed_mismatches, constructs_fp,
col_indices, output_dir, fw_fastq_fp, rv_fastq_fp):
construct_names, grna_name_seq_pairs = extract_construct_and_grna_info(constructs_fp, col_indices)
trimmed_grna_name_seq_pairs = trim_grnas(grna_name_seq_pairs, seq_len)
# Note: currently same value (num_allowed_mismatches) is being used for number of mismatches allowed in forward
# read and number of mismatches allowed in reverse read, but this can be altered if desired
grna_matcher = GrnaPositionMatcher(trimmed_grna_name_seq_pairs, seq_len, num_allowed_mismatches,
num_allowed_mismatches)
output_fp = build_multipart_fp(output_dir, [curr_base, run_prefix, get_counts_file_suffix()])
generate_construct_counts(grna_matcher, construct_names, output_fp, fw_fastq_fp, rv_fastq_fp)
In [ ]:
from ccbbucsd.utilities.parallel_process_fastqs import parallel_process_paired_reads, concatenate_parallel_results
g_col_indices = [int(x.strip()) for x in g_col_indices_str.split(",")]
g_parallel_results = parallel_process_paired_reads(g_filtered_fastqs_dir, get_filtered_file_suffix(), g_num_processors,
count_constructs_for_one_fastq_pair, [g_fastq_counts_run_prefix, g_len_of_seq_to_match,
g_num_allowed_mismatches, g_constructs_fp,
g_col_indices, g_fastq_counts_dir], True)
In [ ]:
print(concatenate_parallel_results(g_parallel_results))
In [ ]: