In [1]:
import matplotlib
matplotlib.rc('text', usetex=True)

%matplotlib inline

In [2]:
import logging

import matplotlib_venn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats

import misc.bio as bio
import misc.latex as latex
import misc.math_utils as math_utils

import riboutils.ribo_utils as ribo_utils

In [3]:
def get_uniprot_nt_lengths(uniprot_file, min_nt_length=0, types=None):
    """ This function parses a file exported from UniProt giving protein
        information. It then filters based on the 'Status' types, and
        finally filters based on the nucleotide lengths of the sequences.
        It returns a list of lengths.
        
        Args:
            uniprot_file (string) : the path to the file
            
            min_nt_length (int) : the minimum length to include in the result
            
            types (None, or list of strings) : the types to keep after
                filtering; or, if None is given, no filtering will be applied
                
            The available types seem to be:
                * 'unreviewed'
                * 'reviewed'
                * 'unreviewed and UniParc'
                * 'reviewed and UniParc'
                * 'partially reviewed and UniParc'
                * 'partially reviewed'
                
        Returns:
            np.array of ints : the lengths (in nucleotides) of the
                proteins remaining after filtering
    """
    uniprot = pd.read_csv(uniprot_file, sep='\t')

    if types is not None:
        m_types = uniprot['Status'].isin(types)
        uniprot = uniprot[m_types]
        
    uniprot_nt_lengths = uniprot['Length'] * 3

    m_uniprot_nt_lengths = uniprot_nt_lengths > min_nt_length
    uniprot_nt_lengths = uniprot_nt_lengths[m_uniprot_nt_lengths]
    return np.array(uniprot_nt_lengths)

In [4]:
def get_orf_lengths(orfs, orf_types):
    m_orf_type = orfs['orf_type'].isin(orf_types)
    lengths = np.array(orfs.loc[m_orf_type, 'orf_len'])
    return lengths

In [5]:
orfs_file = "/prj/grosshans-riboseq/RPF/orf-predictions/early-samples-unique.smooth.predicted-orfs.bed.gz"
orfs = bio.read_bed(orfs_file)
title = "ORF length distributions"
image_name = None # "/data/projects/departments/Christoph_Dieterich/riboseq/orf-coverage.eps"

In [6]:
# read in the ground truth protein lengths
#uniprot_file = "/data/projects/departments/Christoph_Dieterich/riboseq/uniprot-protein-lengths.tab.gz"
uniprot_file = "/genomes/caenorhabditis_elegans/c_elegans-uniref90-proteins.tab.gz"
truth_nt_lengths = get_uniprot_nt_lengths(uniprot_file)
truth_label = 'UniRef90'

In [7]:
orf_lengths = [ get_orf_lengths(orfs, ribo_utils.orf_type_labels_mapping[label]) 
                   for label in ribo_utils.orf_type_labels]

In [9]:
prediction_labels = [latex.get_latex_safe_string(l) for l in ribo_utils.orf_type_labels]
prediction_lengths_list = orf_lengths
#prediction_lengths_list = [bf_lengths, chisq_lengths]
#prediction_labels = ['BF', r'$\chi^2$']

# input: truth_nt_lengths (array-like)
#        prediction_lengths_list (list of array-likes)
#        truth_label (string)
#        prediction_labels (list of array-likes)
#
# if truth_nt_lengths is not defined, then the KL-divergence calculations
# will be skipped (and it will not be shown)

fontsize = 20
legend_fontsize = 20
title_fontsize = 20
linewidth = 4

# plot the empirical distribution of ORF lengths
hist_min = 200
hist_max = 5250
hist_step = 200
hist_range = (hist_min, hist_max)
hist_bins = np.arange(hist_min, hist_max, hist_step)

if truth_nt_lengths is not None:
    truth_hist, _ = np.histogram(truth_nt_lengths, bins=hist_bins, range=hist_range, density=True)
else:
    truth_hist = None
    
prediction_hists = []
for prediction_lengths in prediction_lengths_list:
    prediction_hist, _ = np.histogram(prediction_lengths, bins=hist_bins, range=hist_range, density=True)
    prediction_hists.append(prediction_hist)

# now, normalize the histograms
if truth_hist is not None:
    truth_hist = truth_hist / np.sum(truth_hist)
    truth_hist += 1e-3
    
for i, prediction_hist in enumerate(prediction_hists):
    prediction_hists[i] = prediction_hist / np.sum(prediction_hist)
    prediction_hists[i] += 1e-3

kls = []
if truth_hist is not None:
    for i, prediction_hist in enumerate(prediction_hists):
        kl = math_utils.calculate_symmetric_kl_divergence(truth_hist, prediction_hist, scipy.stats.entropy)
        kls.append(kl)
        
        # and update the label
        prediction_labels[i] = '{}, KL: ${:.2f}$'.format(prediction_labels[i], kl)
        
if truth_hist is not None:
    truth_hist = 100 * truth_hist
    
for i, prediction_hist in enumerate(prediction_hists):
    prediction_hists[i] *= 100

fig, ax = plt.subplots(figsize=(10,5))

cm = plt.cm.gist_earth

x = np.arange(len(hist_bins)-1)

truth_cm_offset = 0.1
if truth_hist is not None:
    color = cm(truth_cm_offset)
    ax.plot(x, truth_hist, label=truth_label, linewidth=linewidth, color=color)
    
color_range = 1 - 2*truth_cm_offset
for i, prediction_hist in enumerate(prediction_hists):
    color = i / len(prediction_hists) * color_range
    color += truth_cm_offset
    color = cm(color)
    ax.plot(x, prediction_hist, label=prediction_labels[i], linewidth=linewidth, color=color)

ax.set_xlabel('Length (bp)', fontsize=fontsize)
ax.set_ylabel('\% of predicted ORFs', fontsize=fontsize)
ax.set_title(title, fontsize=fontsize)

ax.set_xticks(x[::2])
ax.set_xticklabels(hist_bins[::2], fontsize=fontsize, rotation=90)

ax.set_ylim((0, 20))
ax.set_xlim((0, len(hist_bins)))

# hide the "0" tick label
yticks = ax.yaxis.get_major_ticks()
yticks[0].label1.set_visible(False)

# chop off everything from 3000 on
index_of_3000 = 14
ax.set_xlim((0, index_of_3000))
#ax.set_xlim((0, len(uniprot_hist)-1))

ax.legend(loc='center right', fontsize=legend_fontsize, bbox_to_anchor=(1.75,0.5))
ax.tick_params(axis='both', which='major', labelsize=fontsize)

if image_name is not None:
    fig.tight_layout()
    fig.savefig(image_name)



In [29]:
min_signal = 10
min_bf_mean = 5
max_bf_var = 2
min_length = 20
chisq_alpha = 0.01

bf = bio.read_bed(bf_file)

orf_types = None # ['noncoding', 'canonical', 'canonical_truncated', 'within',
#       'three_prime', 'suspect_overlap', 'three_prime_overlap',
#       'five_prime', 'canonical_extended', 'five_prime_overlap']

if orf_types is not None:
    m_orf_type = bf['orf_type'].isin(orf_types)
    bf_filtered = bf[m_orf_type]
else:
    bf_filtered = bf

longest_orfs, bf_orfs, chisq_orfs = rpbp.rpbp_utils.get_predicted_orfs(bf_filtered, 
                                                       min_signal=min_signal, 
                                                       min_bf_mean=min_bf_mean, 
                                                       max_bf_var=max_bf_var, 
                                                       min_length=min_length,
                                                       chisq_alpha=chisq_alpha)

bf_lengths = bf_orfs.apply(bio.get_bed_12_feature_length, axis=1)
chisq_lengths = chisq_orfs.apply(bio.get_bed_12_feature_length, axis=1)