notebook.community

Edit and run



In [1]:

    
# Reload imported modules automatically before each cell executes, so edits to
# the project packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

from argparse import Namespace
import misc.logging_utils as logging_utils

# Stand-in for parsed command-line arguments; attributes (config, use_groups)
# are assigned in a later cell.
args = Namespace()
# Module-level logger; get_orf_type_counts uses it to report skipped samples.
logger = logging_utils.get_ipython_logger()



In [22]:

    
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set(style='white', color_codes=True)

import os

import riboutils.ribo_utils as ribo_utils
import riboutils.ribo_filenames as filenames

import bio_utils.bed_utils as bed_utils
import misc.parallel as parallel
import misc.utils as utils
import yaml



In [32]:

    
def get_orf_type_counts(name, is_single_sample, config, args):
    """Count the predicted ORFs of one sample (or merged condition) per type and strand.

    Parameters
    ----------
    name:
        Sample or condition identifier used to build the predicted-ORF file name.
    is_single_sample:
        If true, the periodic read lengths and offsets are looked up for
        ``name`` and included in the file name; otherwise both are passed as
        ``None``.
    config:
        Mapping with at least ``'riboseq_data'``. The optional keys ``'note'``,
        ``'smoothing_fraction'``, ``'smoothing_reweighting_iterations'`` and
        ``'keep_riboseq_multimappers'`` are also consulted.
    args:
        Object with a ``use_groups`` attribute. When true, counts are taken
        per ORF-type group rather than per individual ORF type.

    Returns
    -------
    A table with the ORF type (or group) column, ``strand``, ``count``,
    ``display_name`` and ``sample``; or ``None`` (after logging a warning) when
    the metagene periodicity file or the predicted-ORF file cannot be found.
    """
    note = config.get('note', None)

    # smoothing parameters, forwarded to the file name builder
    smoothing_fraction = config.get('smoothing_fraction', None)
    smoothing_iterations = config.get('smoothing_reweighting_iterations', None)

    # only the presence of the key matters here; its value is not inspected
    is_unique = 'keep_riboseq_multimappers' not in config

    # merged samples do not carry lengths and offsets in their file names
    lengths, offsets = None, None
    if is_single_sample:
        try:
            lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                config,
                name,
                is_unique=is_unique
            )
        except FileNotFoundError:
            logger.warning(
                "Could not find metagene periodicity file. Skipping. name: {}".format(name)
            )
            return None

    orfs_path = filenames.get_riboseq_predicted_orfs(
        config['riboseq_data'],
        name,
        length=lengths,
        offset=offsets,
        is_unique=is_unique,
        note=note,
        fraction=smoothing_fraction,
        reweighting_iterations=smoothing_iterations,
        is_filtered=True,
        is_chisq=False
    )

    if not os.path.exists(orfs_path):
        logger.warning(
            "Could not find predicted ORFs. name: {}. file: {}".format(name, orfs_path)
        )
        return None

    bed = bed_utils.read_bed(orfs_path)

    # choose the column to count over and the matching display-name lookup
    if args.use_groups:
        bed['orf_type_group'] = bed['orf_type'].map(
            ribo_utils.orf_type_labels_reverse_mapping)
        type_column = 'orf_type_group'
        display_names = ribo_utils.orf_type_labels_display_name_map
    else:
        type_column = 'orf_type'
        display_names = ribo_utils.orf_type_display_name_map

    counts = bed.groupby([type_column, 'strand']).size().reset_index(name="count")
    counts['display_name'] = counts[type_column].map(display_names)
    counts['sample'] = name
    return counts



In [24]:

    
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this file exists. Consider making it relative to a configurable directory.
args.config = "/prj/riechert-riboseq/analysis/config/control-plus-huebner.yaml"
# Count ORFs per ORF-type group instead of per individual ORF type.
args.use_groups = True

# Use a context manager so the file handle is closed, and safe_load because
# yaml.load without an explicit Loader is deprecated (and an error in recent
# PyYAML releases) and can construct arbitrary Python objects.
with open(args.config) as config_file:
    config = yaml.safe_load(config_file)



In [41]:

    
# Counts for every individual replicate listed under 'riboseq_samples'.
# Samples for which get_orf_type_counts returned None are dropped.
is_single_sample = True
single_sample_orf_types = utils.remove_nones(
    parallel.apply_iter_simple(
        config['riboseq_samples'].keys(),
        get_orf_type_counts,
        is_single_sample,
        config,
        args
    )
)

# Same again for the names returned by get_riboseq_replicates, this time
# without per-sample lengths and offsets.
is_single_sample = False
merged_sample_orf_types = utils.remove_nones(
    parallel.apply_iter_simple(
        ribo_utils.get_riboseq_replicates(config),
        get_orf_type_counts,
        is_single_sample,
        config,
        args
    )
)

# Stack all per-sample tables into one frame. Each table keeps its own
# 0..n-1 index, so index labels repeat across samples in the result.
sample_orf_types = single_sample_orf_types + merged_sample_orf_types
sample_orf_types_df = pd.concat(sample_orf_types)









    



WARNING  : Could not find metagene periodicity file. Skipping. name: sedentary-2wks-wt.riboseq.cell-type-cm.rep-453
WARNING  : Could not find metagene periodicity file. Skipping. name: sedentary-2wks-wt.riboseq.cell-type-cm.rep-454
WARNING  : Could not find metagene periodicity file. Skipping. name: sedentary.riboseq.cell-type-cm.rep-317
WARNING  : Could not find metagene periodicity file. Skipping. name: sedentary.riboseq.cell-type-cm.rep-320
WARNING  : Could not find metagene periodicity file. Skipping. name: sedentary.riboseq.cell-type-endo.rep-198
WARNING  : Could not find metagene periodicity file. Skipping. name: sham-2wks-wt.riboseq.cell-type-cm.rep-542
WARNING  : Could not find metagene periodicity file. Skipping. name: sham-2wks-wt.riboseq.cell-type-cm.rep-745
WARNING  : Could not find metagene periodicity file. Skipping. name: sham-wt.riboseq.cell-type-cm.rep-403
WARNING  : Could not find metagene periodicity file. Skipping. name: sham-wt.riboseq.cell-type-cm.rep-407
WARNING  : Could not find metagene periodicity file. Skipping. name: bl6-wt-boston.riboseq.cell-type-lv.rep-1
WARNING  : Could not find metagene periodicity file. Skipping. name: bl6-wt-boston.riboseq.cell-type-lv.rep-2
WARNING  : Could not find metagene periodicity file. Skipping. name: bl6-wt-boston.riboseq.cell-type-lv.rep-3
WARNING  : Could not find metagene periodicity file. Skipping. name: bl6-wt-signapore.riboseq.cell-type-lv.rep-1
WARNING  : Could not find metagene periodicity file. Skipping. name: bl6-wt-signapore.riboseq.cell-type-lv.rep-2
WARNING  : Could not find metagene periodicity file. Skipping. name: bl6-wt-signapore.riboseq.cell-type-lv.rep-3
INFO     : Found 'riboseq_biological_replicates' key in config file
/home/bmmalone/.virtualenvs/rpbp/lib/python3.6/site-packages/joblib/parallel.py:131: DtypeWarning: Columns (0,16) have mixed types. Specify dtype option on import or set low_memory=False.
  return [func(*args, **kwargs) for func, args, kwargs in self.items]



In [45]:

    
def get_name(sample, d):
    """Return the entry of ``d`` stored under ``sample``.

    A plain ``d[sample]`` lookup wrapped in a function so it can be handed to
    ``parallel.apply_iter_simple`` as a callable. A missing key raises whatever
    ``d`` raises for it (``KeyError`` for a dict).
    """
    name = d[sample]
    return name

sample_name_map = ribo_utils.get_sample_name_map(config)
condition_name_map = ribo_utils.get_riboseq_condition_name_map(config)

# Two-step lookup: each 'sample' value goes through the sample name map first,
# and the result is then looked up in the condition name map.
sample_names = parallel.apply_iter_simple(
    parallel.apply_iter_simple(
        sample_orf_types_df['sample'],
        get_name,
        sample_name_map
    ),
    get_name,
    condition_name_map
)

# NOTE: adds a column to the frame built in an earlier cell (in-place change).
sample_orf_types_df['sample_name'] = sample_names
sample_orf_types_df.head()









    Out[45]:






  
    
      
      orf_type_group
      strand
      count
      display_name
      sample
      sample_name
    
  
  
    
      0
      canonical
      +
      2828
      Canonical
      sedentary-2wks-wt.riboseq.cell-type-cm
      Sedentary, WT, 2 weeks, CMs, RPF
    
    
      1
      canonical
      -
      2818
      Canonical
      sedentary-2wks-wt.riboseq.cell-type-cm
      Sedentary, WT, 2 weeks, CMs, RPF
    
    
      2
      canonical_variant
      +
      887
      Canonical variant
      sedentary-2wks-wt.riboseq.cell-type-cm
      Sedentary, WT, 2 weeks, CMs, RPF
    
    
      3
      canonical_variant
      -
      868
      Canonical variant
      sedentary-2wks-wt.riboseq.cell-type-cm
      Sedentary, WT, 2 weeks, CMs, RPF
    
    
      4
      five_prime
      +
      140
      uORF
      sedentary-2wks-wt.riboseq.cell-type-cm
      Sedentary, WT, 2 weeks, CMs, RPF



In [34]:

    
# Preview the combined counts. The captured output has no 'sample_name' column,
# i.e. this cell was executed before that column was added.
sample_orf_types_df.head()









    Out[34]:






  
    
      
      orf_type_group
      strand
      count
      display_name
      sample
    
  
  
    
      0
      canonical
      +
      2828
      Canonical
      sedentary-2wks-wt.riboseq.cell-type-cm
    
    
      1
      canonical
      -
      2818
      Canonical
      sedentary-2wks-wt.riboseq.cell-type-cm
    
    
      2
      canonical_variant
      +
      887
      Canonical variant
      sedentary-2wks-wt.riboseq.cell-type-cm
    
    
      3
      canonical_variant
      -
      868
      Canonical variant
      sedentary-2wks-wt.riboseq.cell-type-cm
    
    
      4
      five_prime
      +
      140
      uORF
      sedentary-2wks-wt.riboseq.cell-type-cm



In [43]:

    
# Inspect the mapping from condition identifiers to their display names.
ribo_utils.get_riboseq_condition_name_map(config)









    Out[43]:





{'bl6-wt-boston.riboseq.cell-type-lv': 'BL6 - Boston, WT, LV, RPF',
 'bl6-wt-signapore.riboseq.cell-type-lv': 'BL6 - Singapore, WT, LV, RPF',
 'sedentary-2wks-wt.riboseq.cell-type-cm': 'Sedentary, WT, 2 weeks, CMs, RPF',
 'sedentary.riboseq.cell-type-cm': 'Sedentary, WT, CMs, RPF',
 'sham-2wks-wt.riboseq.cell-type-cm': 'Sham, WT, 2 weeks, CMs, RPF',
 'sham-wt.riboseq.cell-type-cm': 'Sham, WT, CMs, RPF'}



In [40]:

    
# Spot-check: a condition identifier looked up in the sample name map comes
# back unchanged (see the output below).
sample_name_map['sedentary-2wks-wt.riboseq.cell-type-cm']









    Out[40]:





'sedentary-2wks-wt.riboseq.cell-type-cm'



In [ ]:

	orf_type_group	strand	count	display_name	sample	sample_name
0	canonical	+	2828	Canonical	sedentary-2wks-wt.riboseq.cell-type-cm	Sedentary, WT, 2 weeks, CMs, RPF
1	canonical	-	2818	Canonical	sedentary-2wks-wt.riboseq.cell-type-cm	Sedentary, WT, 2 weeks, CMs, RPF
2	canonical_variant	+	887	Canonical variant	sedentary-2wks-wt.riboseq.cell-type-cm	Sedentary, WT, 2 weeks, CMs, RPF
3	canonical_variant	-	868	Canonical variant	sedentary-2wks-wt.riboseq.cell-type-cm	Sedentary, WT, 2 weeks, CMs, RPF
4	five_prime	+	140	uORF	sedentary-2wks-wt.riboseq.cell-type-cm	Sedentary, WT, 2 weeks, CMs, RPF