In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
from argparse import Namespace
import misc.logging_utils as logging_utils
args = Namespace()
logger = logging_utils.get_ipython_logger()
In [22]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set(style='white', color_codes=True)
import os
import riboutils.ribo_utils as ribo_utils
import riboutils.ribo_filenames as filenames
import bio_utils.bed_utils as bed_utils
import misc.parallel as parallel
import misc.utils as utils
import yaml
In [32]:
def get_orf_type_counts(name, is_single_sample, config, args):
    note_str = config.get('note', None)

    # the smoothing parameters
    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations', None)

    # keep multimappers?
    is_unique = 'keep_riboseq_multimappers' not in config

    if is_single_sample:
        # get the lengths and offsets which meet the required criteria from
        # the config file
        try:
            lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                config,
                name,
                is_unique=is_unique
            )
        except FileNotFoundError:
            msg = "Could not find metagene periodicity file. Skipping. name: {}".format(name)
            logger.warning(msg)
            return None
    else:
        # we will not use the lengths and offsets in the filenames
        lengths = None
        offsets = None

    predicted_orfs = filenames.get_riboseq_predicted_orfs(
        config['riboseq_data'],
        name,
        length=lengths,
        offset=offsets,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations,
        is_filtered=True,
        is_chisq=False
    )

    if not os.path.exists(predicted_orfs):
        msg = "Could not find predicted ORFs. name: {}. file: {}".format(name, predicted_orfs)
        logger.warning(msg)
        return None

    bed = bed_utils.read_bed(predicted_orfs)

    if args.use_groups:
        bed['orf_type_group'] = bed['orf_type'].map(
            ribo_utils.orf_type_labels_reverse_mapping)

        orf_type_counts = bed.groupby(['orf_type_group', 'strand']).size()
        orf_type_counts = orf_type_counts.reset_index(name="count")
        orf_type_counts['display_name'] = orf_type_counts['orf_type_group'].map(
            ribo_utils.orf_type_labels_display_name_map)
    else:
        orf_type_counts = bed.groupby(['orf_type', 'strand']).size()
        orf_type_counts = orf_type_counts.reset_index(name="count")
        orf_type_counts['display_name'] = orf_type_counts['orf_type'].map(
            ribo_utils.orf_type_display_name_map)

    orf_type_counts['sample'] = name
    return orf_type_counts
In [24]:
args.config = "/prj/riechert-riboseq/analysis/config/control-plus-huebner.yaml"
args.use_groups = True
config = yaml.load(open(args.config), Loader=yaml.FullLoader)
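In [ ]:
# An added sketch, not part of the original notebook: call get_orf_type_counts
# directly for a single sample before running everything in parallel. This just
# takes the first sample key from the config; the result is None if the
# prediction files for that sample are missing.
example_name = next(iter(config['riboseq_samples'].keys()))
example_counts = get_orf_type_counts(example_name, True, config, args)
example_counts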
In [41]:
is_single_sample = True
single_sample_orf_types = parallel.apply_iter_simple(
    config['riboseq_samples'].keys(),
    get_orf_type_counts,
    is_single_sample,
    config,
    args
)
single_sample_orf_types = utils.remove_nones(single_sample_orf_types)

is_single_sample = False
merged_sample_orf_types = parallel.apply_iter_simple(
    ribo_utils.get_riboseq_replicates(config),
    get_orf_type_counts,
    is_single_sample,
    config,
    args
)
merged_sample_orf_types = utils.remove_nones(merged_sample_orf_types)

sample_orf_types = single_sample_orf_types + merged_sample_orf_types
sample_orf_types_df = pd.concat(sample_orf_types)
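In [ ]:
# Quick sanity check (an added sketch, not from the original notebook): total
# number of predicted ORFs per sample, using the 'sample' and 'count' columns
# created above.
sample_orf_types_df.groupby('sample')['count'].sum()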
In [45]:
def get_name(sample, d):
    return d[sample]

sample_name_map = ribo_utils.get_sample_name_map(config)
condition_name_map = ribo_utils.get_riboseq_condition_name_map(config)

sample_names = sample_orf_types_df['sample']
sample_names = parallel.apply_iter_simple(sample_names, get_name, sample_name_map)
sample_names = parallel.apply_iter_simple(sample_names, get_name, condition_name_map)

sample_orf_types_df['sample_name'] = sample_names
sample_orf_types_df.head()
Out[45]:
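In [ ]:
# An added plotting sketch (not from the original notebook): ORF type counts
# per condition, summed over strands, using the 'display_name', 'sample_name',
# and 'count' columns constructed above.
plot_df = sample_orf_types_df.groupby(
    ['sample_name', 'display_name'], as_index=False)['count'].sum()

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x='display_name', y='count', hue='sample_name', data=plot_df, ax=ax)
ax.set_xlabel("ORF type")
ax.set_ylabel("Number of predicted ORFs")
ax.tick_params(axis='x', rotation=45)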
In [34]:
sample_orf_types_df.head()
Out[34]:
In [43]:
ribo_utils.get_riboseq_condition_name_map(config)
Out[43]:
In [40]:
sample_name_map['sedentary-2wks-wt.riboseq.cell-type-cm']
Out[40]:
In [ ]: