In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from argparse import Namespace
import misc.logging_utils as logging_utils

# Empty argparse Namespace used as a plain attribute container; its fields
# (`config`, `use_groups`) are assigned in a later cell.
args = Namespace()
# Logger used by the helper functions in this notebook to emit warnings.
logger = logging_utils.get_ipython_logger()

In [22]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set(style='white', color_codes=True)

import os

import riboutils.ribo_utils as ribo_utils
import riboutils.ribo_filenames as filenames

import bio_utils.bed_utils as bed_utils
import misc.parallel as parallel
import misc.utils as utils
import yaml

In [32]:
def get_orf_type_counts(name, is_single_sample, config, args):
    """Count the filtered predicted ORFs of one sample per ORF type and strand.

    Parameters
    ----------
    name
        Name of the sample (or merged condition) whose predicted-ORFs file
        is read. It is also stored in the 'sample' column of the result.
    is_single_sample : bool
        If true, the periodic read lengths and offsets for `name` are looked
        up and passed on when building the filename. Otherwise both are
        passed as None.
    config : dict-like
        Loaded configuration. The optional keys 'note', 'smoothing_fraction'
        and 'smoothing_reweighting_iterations' are read with `.get`; the
        mere presence of 'keep_riboseq_multimappers' disables unique-only
        mode; 'riboseq_data' is required.
    args
        Object with a `use_groups` attribute. If true, ORF types are first
        mapped through `ribo_utils.orf_type_labels_reverse_mapping` and
        counted per group; otherwise the raw 'orf_type' values are counted.

    Returns
    -------
    pandas.DataFrame or None
        One row per (ORF type or group, strand) with the columns
        'orf_type_group' or 'orf_type', 'strand', 'count', 'display_name'
        and 'sample'. None is returned, after logging a warning, when the
        metagene periodicity file or the predicted-ORFs file is missing.
    """
    note_str = config.get('note', None)

    # smoothing parameters (None when not configured)
    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations', None)

    # only the presence of the key matters here, not its value
    is_unique = 'keep_riboseq_multimappers' not in config

    # merged samples do not carry lengths and offsets in their filenames
    lengths = None
    offsets = None

    if is_single_sample:
        # single samples use the lengths and offsets which meet the
        # periodicity criteria from the config file
        try:
            lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                config, name, is_unique=is_unique)
        except FileNotFoundError:
            logger.warning(
                "Could not find metagene periodicity file. Skipping. name: {}".format(name,))
            return None

    predicted_orfs = filenames.get_riboseq_predicted_orfs(
        config['riboseq_data'],
        name,
        length=lengths,
        offset=offsets,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations,
        is_filtered=True,
        is_chisq=False
    )

    if not os.path.exists(predicted_orfs):
        logger.warning(
            "Could not find predicted ORFs. name: {}. file: {}".format(name, predicted_orfs))
        return None

    bed = bed_utils.read_bed(predicted_orfs)

    # choose the column to count by and the matching display-name lookup
    if args.use_groups:
        bed['orf_type_group'] = bed['orf_type'].map(
            ribo_utils.orf_type_labels_reverse_mapping)
        type_field = 'orf_type_group'
        display_name_map = ribo_utils.orf_type_labels_display_name_map
    else:
        type_field = 'orf_type'
        display_name_map = ribo_utils.orf_type_display_name_map

    orf_type_counts = bed.groupby([type_field, 'strand']).size().reset_index(name="count")
    orf_type_counts['display_name'] = orf_type_counts[type_field].map(display_name_map)
    orf_type_counts['sample'] = name
    return orf_type_counts

In [24]:
# NOTE: absolute, site-specific path; point this at your own config file
# when running the notebook elsewhere.
args.config = "/prj/riechert-riboseq/analysis/config/control-plus-huebner.yaml"
# Count ORFs per ORF-type group rather than per raw ORF type.
args.use_groups = True

# Read the YAML config inside a context manager so the file handle is closed,
# and use safe_load: calling yaml.load without an explicit Loader is
# deprecated and can construct arbitrary Python objects.
with open(args.config) as config_file:
    config = yaml.safe_load(config_file)

In [41]:
# Counts for every individual sample listed under 'riboseq_samples'.
# Samples for which get_orf_type_counts returned None are dropped.
is_single_sample = True
single_sample_orf_types = utils.remove_nones(
    parallel.apply_iter_simple(
        config['riboseq_samples'].keys(),
        get_orf_type_counts,
        is_single_sample,
        config,
        args
    )
)

# Counts for the merged replicates, again dropping the None results.
# (presumably get_riboseq_replicates yields one name per condition - confirm)
is_single_sample = False
merged_sample_orf_types = utils.remove_nones(
    parallel.apply_iter_simple(
        ribo_utils.get_riboseq_replicates(config),
        get_orf_type_counts,
        is_single_sample,
        config,
        args
    )
)

# Stack the per-sample count tables into a single data frame.
sample_orf_types = single_sample_orf_types + merged_sample_orf_types
sample_orf_types_df = pd.concat(sample_orf_types)


WARNING  : Could not find metagene periodicity file. Skipping. name: sedentary-2wks-wt.riboseq.cell-type-cm.rep-453
WARNING  : Could not find metagene periodicity file. Skipping. name: sedentary-2wks-wt.riboseq.cell-type-cm.rep-454
WARNING  : Could not find metagene periodicity file. Skipping. name: sedentary.riboseq.cell-type-cm.rep-317
WARNING  : Could not find metagene periodicity file. Skipping. name: sedentary.riboseq.cell-type-cm.rep-320
WARNING  : Could not find metagene periodicity file. Skipping. name: sedentary.riboseq.cell-type-endo.rep-198
WARNING  : Could not find metagene periodicity file. Skipping. name: sham-2wks-wt.riboseq.cell-type-cm.rep-542
WARNING  : Could not find metagene periodicity file. Skipping. name: sham-2wks-wt.riboseq.cell-type-cm.rep-745
WARNING  : Could not find metagene periodicity file. Skipping. name: sham-wt.riboseq.cell-type-cm.rep-403
WARNING  : Could not find metagene periodicity file. Skipping. name: sham-wt.riboseq.cell-type-cm.rep-407
WARNING  : Could not find metagene periodicity file. Skipping. name: bl6-wt-boston.riboseq.cell-type-lv.rep-1
WARNING  : Could not find metagene periodicity file. Skipping. name: bl6-wt-boston.riboseq.cell-type-lv.rep-2
WARNING  : Could not find metagene periodicity file. Skipping. name: bl6-wt-boston.riboseq.cell-type-lv.rep-3
WARNING  : Could not find metagene periodicity file. Skipping. name: bl6-wt-signapore.riboseq.cell-type-lv.rep-1
WARNING  : Could not find metagene periodicity file. Skipping. name: bl6-wt-signapore.riboseq.cell-type-lv.rep-2
WARNING  : Could not find metagene periodicity file. Skipping. name: bl6-wt-signapore.riboseq.cell-type-lv.rep-3
INFO     : Found 'riboseq_biological_replicates' key in config file
/home/bmmalone/.virtualenvs/rpbp/lib/python3.6/site-packages/joblib/parallel.py:131: DtypeWarning: Columns (0,16) have mixed types. Specify dtype option on import or set low_memory=False.
  return [func(*args, **kwargs) for func, args, kwargs in self.items]

In [45]:
def get_name(sample, d):
    """Return the entry stored under `sample` in the mapping `d`.

    Uses plain subscripting, so a key that is missing from a dict raises
    KeyError.
    """
    name = d[sample]
    return name

sample_name_map = ribo_utils.get_sample_name_map(config)
condition_name_map = ribo_utils.get_riboseq_condition_name_map(config)

# Translate each row's 'sample' value in two chained lookups: first through
# sample_name_map, then the result of that through condition_name_map.
# NOTE(review): the second lookup is keyed by the *output* of the first one.
# In the recorded run sample_name_map returned the condition key unchanged,
# so this worked; confirm the behavior for keys the first map does rename.
sample_names = parallel.apply_iter_simple(
    parallel.apply_iter_simple(
        sample_orf_types_df['sample'],
        get_name,
        sample_name_map
    ),
    get_name,
    condition_name_map
)

# Attach the display names as a new column and preview the first rows.
sample_orf_types_df['sample_name'] = sample_names
sample_orf_types_df.head()


Out[45]:
orf_type_group strand count display_name sample sample_name
0 canonical + 2828 Canonical sedentary-2wks-wt.riboseq.cell-type-cm Sedentary, WT, 2 weeks, CMs, RPF
1 canonical - 2818 Canonical sedentary-2wks-wt.riboseq.cell-type-cm Sedentary, WT, 2 weeks, CMs, RPF
2 canonical_variant + 887 Canonical variant sedentary-2wks-wt.riboseq.cell-type-cm Sedentary, WT, 2 weeks, CMs, RPF
3 canonical_variant - 868 Canonical variant sedentary-2wks-wt.riboseq.cell-type-cm Sedentary, WT, 2 weeks, CMs, RPF
4 five_prime + 140 uORF sedentary-2wks-wt.riboseq.cell-type-cm Sedentary, WT, 2 weeks, CMs, RPF

In [34]:
# Preview the first rows of the combined ORF-type counts.
# NOTE: the recorded output carries execution count 34, i.e. it was produced
# before the cell that adds the 'sample_name' column (execution count 45),
# which is why that column is absent from it.
sample_orf_types_df.head()


Out[34]:
orf_type_group strand count display_name sample
0 canonical + 2828 Canonical sedentary-2wks-wt.riboseq.cell-type-cm
1 canonical - 2818 Canonical sedentary-2wks-wt.riboseq.cell-type-cm
2 canonical_variant + 887 Canonical variant sedentary-2wks-wt.riboseq.cell-type-cm
3 canonical_variant - 868 Canonical variant sedentary-2wks-wt.riboseq.cell-type-cm
4 five_prime + 140 uORF sedentary-2wks-wt.riboseq.cell-type-cm

In [43]:
# Inspect the mapping that is used as the second lookup when building the
# 'sample_name' column.
ribo_utils.get_riboseq_condition_name_map(config)


Out[43]:
{'bl6-wt-boston.riboseq.cell-type-lv': 'BL6 - Boston, WT, LV, RPF',
 'bl6-wt-signapore.riboseq.cell-type-lv': 'BL6 - Singapore, WT, LV, RPF',
 'sedentary-2wks-wt.riboseq.cell-type-cm': 'Sedentary, WT, 2 weeks, CMs, RPF',
 'sedentary.riboseq.cell-type-cm': 'Sedentary, WT, CMs, RPF',
 'sham-2wks-wt.riboseq.cell-type-cm': 'Sham, WT, 2 weeks, CMs, RPF',
 'sham-wt.riboseq.cell-type-cm': 'Sham, WT, CMs, RPF'}

In [40]:
# Spot check: look up one merged-condition key in sample_name_map
# (the recorded output shows the key being returned unchanged).
sample_name_map['sedentary-2wks-wt.riboseq.cell-type-cm']


Out[40]:
'sedentary-2wks-wt.riboseq.cell-type-cm'

In [ ]: