Dual CRISPR Screen Analysis

Count Plots

Amanda Birmingham, CCBB, UCSD (abirmingham@ucsd.edu)

Instructions

To run this notebook reproducibly, follow these steps:

  1. Click Kernel > Restart & Clear Output
  2. When prompted, click the red Restart & clear all outputs button
  3. Fill in the values for your analysis for each of the variables in the Input Parameters section
  4. Click Cell > Run All

Input Parameters


In [ ]:
# Settings left as "" here are passed through check_or_set in the Automated Set-Up section together
# with a fallback value; presumably the fallback is substituted for a blank -- TODO confirm in check_or_set.
g_timestamp = ""
# Dataset and counting-algorithm names; both are passed to get_run_prefix to build the fallback plots run prefix.
g_dataset_name = "20160510_A549"
g_count_alg_name = "19mer_1mm_py"
# Directory and run prefix of the per-fastq count files (plotted in the Individual fastq Plots section).
g_fastq_counts_dir = '/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/data/interim/20160510_D00611_0278_BHK55CBCXX_A549'
g_fastq_counts_run_prefix = "19mer_1mm_py_20160615223822"
# Directory and run prefix of the collapsed per-sample count files (plotted in the Individual Sample Plots section).
g_collapsed_counts_dir = "/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/data/processed/20160510_A549"
g_collapsed_counts_run_prefix = "20160510_A549_19mer_1mm_py_20160616101309"
# Directory and run prefix of the combined counts file (plotted in the Combined Samples Plots section).
g_combined_counts_dir = ""
g_combined_counts_run_prefix = ""
# Directory and run prefix used for the plot image files written by this notebook.
g_plots_dir = ""
g_plots_run_prefix = ""
# Directory appended to sys.path in the CCBB Library Imports section.
g_code_location = "/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python"

Matplotlib Display


In [ ]:
%matplotlib inline

CCBB Library Imports


In [ ]:
import sys
# Append the configured code directory to the module search path; the ccbbucsd imports in later
# cells presumably rely on this to be found -- TODO confirm the package lives under g_code_location.
sys.path.append(g_code_location)

Automated Set-Up


In [ ]:
# %load -s describe_var_list /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/utilities/analysis_run_prefixes.py
def describe_var_list(input_var_name_list):
    """Build a printable "name: value" report, one line per requested name.

    Every entry of input_var_name_list is a string that is handed to eval()
    to obtain its current value, so entries must be trusted, hard-coded
    names or expressions -- never text that comes from outside the notebook.

    Returns a single string made of one newline-terminated "name: value"
    line per entry, in input order (an empty string for an empty list).
    """
    # The comprehension and its loop-variable name are kept as-is on purpose:
    # eval() resolves names relative to the scope in which it is called.
    return "".join(["{0}: {1}\n".format(name, eval(name)) for name in input_var_name_list])

In [ ]:
from ccbbucsd.utilities.analysis_run_prefixes import check_or_set, get_run_prefix, get_timestamp
# Each setting is passed through check_or_set together with a fallback value; presumably the fallback
# is used only when the setting was left blank in Input Parameters -- TODO confirm in check_or_set.
# Statement order matters: each fallback is the value resolved on an earlier line
# (fastq -> collapsed -> combined -> plots for directories; fastq -> collapsed -> combined for run prefixes).
g_timestamp = check_or_set(g_timestamp, get_timestamp())
g_collapsed_counts_dir = check_or_set(g_collapsed_counts_dir, g_fastq_counts_dir)
g_collapsed_counts_run_prefix = check_or_set(g_collapsed_counts_run_prefix, g_fastq_counts_run_prefix)
g_combined_counts_dir = check_or_set(g_combined_counts_dir, g_collapsed_counts_dir)
g_combined_counts_run_prefix = check_or_set(g_combined_counts_run_prefix, g_collapsed_counts_run_prefix)
g_plots_dir = check_or_set(g_plots_dir, g_combined_counts_dir)
# The fallback plots run prefix is built from the dataset name, count algorithm name, and timestamp.
g_plots_run_prefix = check_or_set(g_plots_run_prefix,
                                  get_run_prefix(g_dataset_name, g_count_alg_name, g_timestamp))
# Echo the resolved settings so they are recorded in the notebook output.
print(describe_var_list(['g_timestamp','g_collapsed_counts_dir', 'g_collapsed_counts_run_prefix', 
                         'g_combined_counts_dir', 'g_combined_counts_run_prefix', 'g_plots_dir', 
                         'g_plots_run_prefix']))

In [ ]:
from ccbbucsd.utilities.files_and_paths import verify_or_make_dir
# Judging by its name, verify_or_make_dir checks that a directory exists and creates it if it does
# not -- TODO confirm. It is applied to every directory this notebook reads from or writes to
# except the fastq counts directory.
verify_or_make_dir(g_collapsed_counts_dir)
verify_or_make_dir(g_combined_counts_dir)
verify_or_make_dir(g_plots_dir)

Count File Suffixes


In [ ]:
# %load -s get_counts_file_suffix /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/malicrispr/construct_counter.py
def get_counts_file_suffix():
    """Return the file-name suffix used to select per-fastq count files.

    This notebook passes the value to plot_raw_counts (and to the file
    summary printed before it) for the Individual fastq Plots section.
    """
    return "counts.txt"

In [ ]:
# %load -s get_collapsed_counts_file_suffix,get_combined_counts_file_suffix /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/malicrispr/count_combination.py
def get_collapsed_counts_file_suffix():
    """Return the file-name suffix used to select collapsed count files.

    This notebook passes the value to plot_raw_counts (and to the file
    summary printed before it) for the Individual Sample Plots section.
    """
    return "collapsed.txt"

def get_combined_counts_file_suffix():
    """Return the file-name suffix of the combined counts file.

    This notebook passes the value to plot_combined_raw_counts (and to the
    file summary printed before it) for the Combined Samples Plots section.
    """
    return "counts_combined.txt"

Count Plots Functions


In [ ]:
# %load /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/malicrispr/count_plots.py
# third-party libraries
import matplotlib.pyplot
import numpy
import pandas

# ccbb libraries
from ccbbucsd.utilities.analysis_run_prefixes import strip_run_prefix
from ccbbucsd.utilities.files_and_paths import build_multipart_fp, get_file_name_pieces, get_filepaths_by_prefix_and_suffix

# project-specific libraries
from ccbbucsd.malicrispr.count_files_and_dataframes import get_counts_df

__author__ = "Amanda Birmingham"
__maintainer__ = "Amanda Birmingham"
__email__ = "abirmingham@ucsd.edu"
__status__ = "prototype"

# Amount added to every raw count before log2 is taken (see make_log2_series); with a value of 1,
# a count of zero maps to log2(1) = 0 instead of -inf. Also reported in each plot title.
DEFAULT_PSEUDOCOUNT = 1


def get_boxplot_suffix():
    """Return the file-name suffix given to saved boxplot images.

    This notebook passes the value as the boxplot_suffix argument of
    plot_raw_counts and plot_combined_raw_counts.
    """
    return "boxplots.png"


def make_log2_series(input_series, pseudocount_val):
    """Return log2(input_series + pseudocount_val) with unusable values removed.

    Args:
        input_series: pandas Series of counts.
        pseudocount_val: number added to every count before the log is taken.

    Returns:
        A new pandas Series of the log2 values in which +/-infinity and NaN
        entries have been dropped and the index renumbered from 0.
    """
    shifted_counts = input_series + pseudocount_val
    finite_or_nan = shifted_counts.apply(numpy.log2).replace([numpy.inf, -numpy.inf], numpy.nan)
    usable_values = finite_or_nan.dropna()
    # Renumbering the index is necessary: matplotlib's boxplot function (perhaps among others)
    # throws an error if the input series doesn't include an item with index 0 -- which can be
    # the case if that first item was NaN and was dropped without the series being reindexed.
    return usable_values.reset_index(drop=True)
    

def show_and_save_histogram(output_fp, title, count_data):
    """Draw a histogram of count_data, save it to output_fp, and display it.

    Args:
        output_fp: path of the image file to write.
        title: text shown as the plot title.
        count_data: values to bin; the x axis is labelled "log2(raw counts)",
            so callers are expected to pass already log2-transformed counts.

    The figure is always closed before returning. This function is called
    once per count file from a loop, and on backends that do not close
    figures automatically each call would otherwise leave a 20x20 inch
    figure open for the life of the process.
    """
    fig = matplotlib.pyplot.figure(figsize=(20, 20))
    try:
        ax = fig.add_subplot(111)
        ax.hist(count_data)
        ax.set_title(title)
        ax.set_xlabel("log2(raw counts)")
        ax.set_ylabel("Frequency")
        fig.savefig(output_fp)
        matplotlib.pyplot.show()
    finally:
        # Closing a figure that the backend has already closed is harmless.
        matplotlib.pyplot.close(fig)


def show_and_save_boxplot(output_fp, title, samples_names, samples_data, rotation_val=0):
    """Draw a boxplot of one or more samples, save it to output_fp, and display it.

    Args:
        output_fp: path of the image file to write.
        title: text shown as the plot title.
        samples_names: x tick labels, one per box drawn.
        samples_data: data handed to Axes.boxplot (a single sequence of
            values, or a list with one sequence per sample); the y axis is
            labelled "log2(raw counts)".
        rotation_val: rotation, in degrees, of the x tick labels (default 0).

    A fresh figure is created on every call rather than reusing figure
    number 1: if an earlier figure 1 were still open, reusing it would draw
    the new boxes on top of the old plot and ignore the requested figsize.
    The figure is always closed before returning so repeated calls do not
    accumulate open figures.
    """
    fig = matplotlib.pyplot.figure(figsize=(20, 20))
    try:
        ax = fig.add_subplot(111)
        ax.boxplot(samples_data)
        ax.set_xticklabels(samples_names, rotation=rotation_val)
        ax.set_xlabel("samples")
        ax.set_ylabel("log2(raw counts)")
        ax.set_title(title)

        fig.savefig(output_fp, bbox_inches='tight')
        matplotlib.pyplot.show()
    finally:
        # Closing a figure that the backend has already closed is harmless.
        matplotlib.pyplot.close(fig)


def plot_raw_counts(input_dir, input_run_prefix, counts_suffix, output_dir, output_run_prefix, boxplot_suffix):
    """Save and show one boxplot and one histogram of log2 counts per counts file.

    Args:
        input_dir: directory searched for counts files.
        input_run_prefix: run prefix the counts files must carry; it is also
            stripped from each file name to obtain the sample name, and used
            in the plot titles and output file names.
        counts_suffix: suffix the counts files must carry.
        output_dir: directory the image files are written to.
        output_run_prefix: currently unused (see note below).
        boxplot_suffix: suffix of each boxplot image file; histogram files
            always end in "hist.png".
    """
    # NOTE(review): output_run_prefix is accepted but never read -- output file names are built
    # from input_run_prefix instead. Confirm whether that is intended before changing it.
    pseudocount_text = str(DEFAULT_PSEUDOCOUNT)
    matching_fps = get_filepaths_by_prefix_and_suffix(input_dir, input_run_prefix, counts_suffix)

    for counts_fp in matching_fps:
        _, prefixed_sample, _ = get_file_name_pieces(counts_fp)
        sample_name = strip_run_prefix(prefixed_sample, input_run_prefix)

        # Relabel the counts column with the bare sample name before pulling it out.
        original_header, counts_df = get_counts_df(counts_fp, input_run_prefix)
        counts_df.rename(columns={original_header: sample_name}, inplace=True)
        log2_counts = make_log2_series(counts_df[sample_name], DEFAULT_PSEUDOCOUNT)

        plot_title = " ".join([input_run_prefix, sample_name, "with pseudocount", pseudocount_text])
        fp_stem = build_multipart_fp(output_dir, [sample_name, input_run_prefix])

        show_and_save_boxplot(fp_stem + "_" + boxplot_suffix, plot_title, [sample_name], log2_counts)
        show_and_save_histogram(fp_stem + "_" + "hist.png", plot_title, log2_counts)
        
        
def plot_combined_raw_counts(input_dir, input_run_prefix, combined_suffix, output_dir, output_run_prefix, boxplot_suffix):
    """Save and show a single boxplot figure with one box per sample of the combined counts file.

    Args:
        input_dir: directory holding the combined counts file.
        input_run_prefix: run prefix of the combined counts file; also used in the plot title.
        combined_suffix: suffix of the combined counts file.
        output_dir: directory the image file is written to.
        output_run_prefix: run prefix of the image file.
        boxplot_suffix: suffix of the image file.
    """
    boxplot_fp = build_multipart_fp(output_dir, [output_run_prefix, boxplot_suffix])
    counts_table_fp = build_multipart_fp(input_dir, [input_run_prefix, combined_suffix])
    counts_table = pandas.read_table(counts_table_fp)

    # TODO: remove hardcode -- every column after the first is treated as a sample
    sample_columns = counts_table.columns.values[1:]
    log2_lists = [make_log2_series(counts_table[sample_col], DEFAULT_PSEUDOCOUNT).tolist()
                  for sample_col in sample_columns]

    plot_title = " ".join([input_run_prefix, "all samples", "with pseudocount", str(DEFAULT_PSEUDOCOUNT)])
    # Tick labels are rotated 90 degrees.
    show_and_save_boxplot(boxplot_fp, plot_title, sample_columns, log2_lists, 90)

Individual fastq Plots


In [ ]:
from ccbbucsd.utilities.files_and_paths import summarize_filenames_for_prefix_and_suffix

In [ ]:
# Print a summary of the per-fastq count files selected by the same directory, run prefix, and
# suffix that the plotting call in the next cell uses.
print(summarize_filenames_for_prefix_and_suffix(g_fastq_counts_dir, g_fastq_counts_run_prefix, get_counts_file_suffix()))

In [ ]:
# this call makes one boxplot and one histogram per raw fastq counts file
plot_raw_counts(g_fastq_counts_dir, g_fastq_counts_run_prefix, get_counts_file_suffix(), g_plots_dir, 
                g_plots_run_prefix, get_boxplot_suffix())

Individual Sample Plots


In [ ]:
# Print a summary of the collapsed count files selected by the same directory, run prefix, and
# suffix that the plotting call in the next cell uses.
print(summarize_filenames_for_prefix_and_suffix(g_collapsed_counts_dir, g_collapsed_counts_run_prefix, 
                                        get_collapsed_counts_file_suffix()))

In [ ]:
# this call makes one boxplot and one histogram per collapsed counts file
plot_raw_counts(g_collapsed_counts_dir, g_collapsed_counts_run_prefix, get_collapsed_counts_file_suffix(), 
                g_plots_dir, g_plots_run_prefix, get_boxplot_suffix())

Combined Samples Plots


In [ ]:
# Print a summary of the files matching the combined counts directory, run prefix, and suffix;
# the plotting call in the next cell reads the combined counts file built from these same three values.
print(summarize_filenames_for_prefix_and_suffix(g_combined_counts_dir, g_combined_counts_run_prefix, 
                                        get_combined_counts_file_suffix()))

In [ ]:
# this call makes a single boxplot figure with one box per sample column of the combined counts file
plot_combined_raw_counts(g_combined_counts_dir, g_combined_counts_run_prefix, get_combined_counts_file_suffix(), 
                         g_plots_dir, g_plots_run_prefix, get_boxplot_suffix())