``````

In [1]:

%matplotlib inline

``````
``````

In [28]:

import matplotlib.pyplot as plt
import numpy as np
import scipy.io

import misc.bio as bio
import misc.bio_utils.bed_utils as bed_utils
import misc.parallel as parallel
import misc.utils as utils

import misc.math_utils as math_utils

import riboutils.ribo_utils as ribo_utils

``````
``````

In [57]:

def get_windows(profile):

profile = profile / np.max(profile)

orf_len = len(profile)
if orf_len < 42:
# we would return first window and exit
first_window = profile[:21]
return (first_window, None, None)

first_window, middle_window, last_window = np.split(profile, [21, orf_len-21])

# now, pull together and sum up all intermediate windows (of length 21)
# cheat a bit, and just split split the middle into 21-bp windows, drop the last window
indices = np.arange(21, len(middle_window), 21)
middle_windows = np.split(middle_window, indices)[:-1]

return first_window, middle_windows, last_window

def get_profile(orf, profiles):
orf_num = orf['orf_num']
orf_len = orf['orf_len']

if orf_len < 21:
return None

profile = utils.to_dense(profiles, orf_num, length=orf_len)

if sum(profile) < 5:
return None

return profile

def plot_rects(ind, means, variances, ax, ymax=0.5, fontsize=12, width=1):
cm = plt.cm.Blues

x_1 = means[0::3]
x_2 = means[1::3]
x_3 = means[2::3]

x_1_var = variances[0::3]
x_2_var = variances[1::3]
x_3_var = variances[2::3]

x_1_pos = np.arange(len(ind))[0::3]
x_2_pos = np.arange(len(ind))[1::3]
x_3_pos = np.arange(len(ind))[2::3]

x_1_rects = ax.bar(x_1_pos, x_1, width=width, color=cm(0.8), yerr=x_1_var)
x_2_rects = ax.bar(x_2_pos, x_2, width=width, color=cm(0.5), yerr=x_2_var)
x_3_rects = ax.bar(x_3_pos, x_3, width=width, color=cm(0.2), yerr=x_2_var)

ax.set_xticks(x_1_pos + width/2)
ax.set_xticklabels(x_1_pos, fontsize=fontsize)

ax.set_xlim((-width/2, len(ind)+width/2))
ax.set_ylim((0, ymax))

yticks = ax.yaxis.get_major_ticks()
yticks[0].label1.set_visible(False)

def plot_windows(windows, axes, ymax=0.5):

windows_np = np.array(windows)
first_windows = windows_np[:,0]

#print(first_windows)

last_windows = windows_np[:,2]
last_windows = np.array([lw for lw in last_windows if lw is not None])

middle_windows = windows_np[:,1]
middle_windows = [mw for mw in middle_windows if mw is not None]
middle_windows = utils.flatten_lists(middle_windows)
middle_windows = np.array(middle_windows)

ind = np.arange(21)  # the x locations for the groups
width = 0.5       # the width of the bars

cm = plt.cm.Blues

# the first window
first_means = np.mean(first_windows, axis=0)
first_var = np.var(first_windows, axis=0)
plot_rects(ind, first_means, first_var, axes[0], ymax=ymax)

#rects_first = axes[0].bar(ind, first_means, width, color=cm(0.8), yerr=first_var)

# the middle windows
middle_means = np.mean(middle_windows, axis=0)
middle_var = np.var(middle_windows, axis=0)
#rects_middle = axes[1].bar(ind, middle_means, width, color=cm(0.5), yerr=middle_var)
plot_rects(ind, middle_means, middle_var, axes[1], ymax=ymax)

# the last window
last_means = np.mean(last_windows, axis=0)
last_var = np.var(last_windows, axis=0)
#rects_last = axes[2].bar(ind, last_means, width, color=cm(0.2), yerr=last_var)
plot_rects(ind, last_means, last_var, axes[2], ymax=ymax)

``````
``````

In [5]:

#orfs_file = "/genomes/caenorhabditis_elegans/WBcel235.79.plus-de-novo/transcript-index/WBcel235.79.plus-de-novo.genomic-orfs.atg-only.bed.gz"
#profiles_file = "/prj/grosshans-riboseq/RPF/orf-profiles/9h-unique.length-17-20-21-28-29-33.offset-6-3-3-12-12-13.profiles.mtx"

#orfs_file = "/genomes/mus_musculus/GRCm38.79.plus-de-novo.genomic-orfs.atg-only.bed.gz"
#profiles_file = "/prj/shirin-riboseq/RPF/orf-profiles/mouse-325.swim.cm.de-novo-unique.length-25-29-30-32-33-34-35.offset-12-12-12-13-13-13-13.profiles.mtx"

#orfs_file = "/genomes/homo-sapiens/GRCh38.79.plus-de-novo.genomic-orfs.atg-only.bed.gz"
#profiles_file = "/prj/leo-riboseq/RPF/orf-profiles/tgfp-1-unique.length-17-18-19-20-21-22-23-24-26-28-29-30-31-32.offset-0-0-0-3-3-12-12-7-9-12-12-12-13-13.profiles.mtx"
orfs_file = "/genomes/homo-sapiens/GRCh38_79/transcript-index/GRCh38_79.genomic-orfs.aug-only.bed.gz"
orfs_file = "/prj/rpbp-paper/RPF/orf-predictions/hek293-unique.filtered.predicted-orfs.bed.gz"

profiles_file = "/prj/rpbp-paper/RPF/orf-profiles/hek293-unique.length-18-19-21-22-23-24-25-26-27-28-29-30-31.offset-12-12-12-12-12-12-12-12-12-12-12-12-13.profiles.mtx.gz"

``````
``````

/home/bmalone/local/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2825: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.
if self.run_code(code, result):

``````
``````

In [61]:

orfs_file = "/prj/rpbp-paper/RPF/orf-predictions/hek293-unique.filtered.predicted-orfs.bed.gz"
sample_title = "HEK293"

``````
``````

In [68]:

image_file = "/prj/rpbp-paper/paper-figures/orf-type-metagene-profiles.pdf"

orf_types = ribo_utils.orf_types

nrows = len(orf_types)
fig, axes = plt.subplots(nrows=nrows, ncols=3, figsize=(10,5*nrows)) # , sharey=True, sharex=True

for i, orf_type in enumerate(orf_types):
m_orf_type = orfs['orf_type'] == orf_type
g = orfs[m_orf_type]

%xdel windows
%xdel g_profiles
g_profiles = parallel.apply_df_simple(g, get_profile, profiles, progress_bar=True)
g_profiles = [g_profile for g_profile in g_profiles if g_profile is not None]
windows = parallel.apply_iter_simple(g_profiles, get_windows, progress_bar=True)

title = '{} ({})'.format(orf_type, len(windows))
axes[i, 1].set_xlabel(title)

if len(windows) == 0:
print(title)
continue
plot_windows(windows, axes[i])

suptitle = fig.suptitle(sample_title, y=0.91)

if image_file is not None:
fig.savefig(image_file, bbox_inches='tight', bbox_extra_artists=(suptitle,))

``````
``````

100%|██████████| 11056/11056 [00:11<00:00, 964.35it/s]
100%|██████████| 11056/11056 [00:02<00:00, 3969.06it/s]
100%|██████████| 550/550 [00:00<00:00, 997.81it/s]
100%|██████████| 550/550 [00:00<00:00, 4268.26it/s]
100%|██████████| 547/547 [00:00<00:00, 1340.13it/s]
100%|██████████| 547/547 [00:00<00:00, 5022.23it/s]
100%|██████████| 2244/2244 [00:01<00:00, 1860.25it/s]
100%|██████████| 2244/2244 [00:00<00:00, 16885.57it/s]
100%|██████████| 383/383 [00:00<00:00, 1774.92it/s]
100%|██████████| 383/383 [00:00<00:00, 13786.63it/s]
100%|██████████| 2201/2201 [00:01<00:00, 1486.84it/s]
100%|██████████| 2201/2201 [00:00<00:00, 13296.21it/s]
0it [00:00, ?it/s]
novel (0)
100%|██████████| 63/63 [00:00<00:00, 1120.62it/s]
100%|██████████| 63/63 [00:00<00:00, 6075.91it/s]
100%|██████████| 71/71 [00:00<00:00, 1199.50it/s]
100%|██████████| 71/71 [00:00<00:00, 7264.02it/s]
100%|██████████| 41/41 [00:00<00:00, 1401.26it/s]
100%|██████████| 41/41 [00:00<00:00, 7369.15it/s]
100%|██████████| 42/42 [00:00<00:00, 1180.27it/s]
100%|██████████| 42/42 [00:00<00:00, 5671.26it/s]

``````
``````

In [25]:

orf_types = ['canonical', 'five_prime', 'noncoding', 'three_prime'] # 'within']

for orf_type in orf_types:
m_orf_type = orfs['orf_type'] == orf_type
m_reverse = orfs['strand'] == '-'
m_seqname = orfs['seqname'] == 'I'
g = orfs[m_orf_type & m_reverse] # & m_seqname]

%xdel windows
%xdel g_profiles
g_profiles = parallel.apply_df_simple(g, get_profile, profiles, progress_bar=True)
g_profiles = [g_profile for g_profile in g_profiles if g_profile is not None]
windows = parallel.apply_iter_simple(g_profiles, get_windows, progress_bar=True)

title = '{}, reverse ({})'.format(orf_type, len(windows))
plot_windows(windows, title)

%xdel windows
%xdel g_profiles
g = orfs[m_canonical & ~m_reverse] # & m_seqname]
g_profiles = parallel.apply_df_simple(g, get_profile, profiles, progress_bar=True)
g_profiles = [g_profile for g_profile in g_profiles if g_profile is not None]
windows = parallel.apply_iter_simple(g_profiles, get_windows, progress_bar=True)

title = '{}, forward ({})'.format(orf_type, len(windows))
plot_windows(windows, title)

``````
``````

100%|██████████| 11056/11056 [00:06<00:00, 1590.26it/s]
100%|██████████| 11056/11056 [00:01<00:00, 6822.68it/s]
100%|██████████| 194/194 [00:00<00:00, 1622.03it/s]
100%|██████████| 194/194 [00:00<00:00, 8287.03it/s]
100%|██████████| 2244/2244 [00:01<00:00, 1817.15it/s]
100%|██████████| 2244/2244 [00:00<00:00, 17695.12it/s]
100%|██████████| 194/194 [00:00<00:00, 1666.77it/s]
100%|██████████| 194/194 [00:00<00:00, 9888.26it/s]
100%|██████████| 2201/2201 [00:01<00:00, 1819.72it/s]
100%|██████████| 2201/2201 [00:00<00:00, 14489.43it/s]
100%|██████████| 194/194 [00:00<00:00, 1663.72it/s]
100%|██████████| 194/194 [00:00<00:00, 9634.66it/s]
100%|██████████| 383/383 [00:00<00:00, 1807.12it/s]
100%|██████████| 383/383 [00:00<00:00, 12563.79it/s]
100%|██████████| 194/194 [00:00<00:00, 1694.54it/s]
100%|██████████| 194/194 [00:00<00:00, 11857.99it/s]

``````
``````

In [5]:

orf_type = 'within'
m_canonical = orfs['orf_type'] == orf_type
m_reverse = orfs['strand'] == '-'
m_seqname = orfs['seqname'] == 'I'
g = orfs[m_canonical & m_reverse] # & m_seqname]

``````
``````

In [6]:

%xdel windows
%xdel g_profiles
g_profiles = parallel.apply_df_simple(g, get_profile, profiles, progress_bar=True)
g_profiles = [g_profile for g_profile in g_profiles if g_profile is not None]
windows = parallel.apply_iter_simple(g_profiles, get_windows, progress_bar=True)

title = '{}, reverse ({})'.format(orf_type, len(windows))
plot_windows(windows, title)

``````
``````

NameError: name 'windows' is not defined
NameError: name 'g_profiles' is not defined
100%|██████████| 216227/216227 [01:05<00:00, 3299.16it/s]
100%|██████████| 16533/16533 [00:00<00:00, 18341.04it/s]

``````
``````

In [42]:

%xdel windows
%xdel g_profiles
g = orfs[m_canonical & ~m_reverse] # & m_seqname]
g_profiles = parallel.apply_df_simple(g, get_profile, profiles, progress_bar=True)
g_profiles = [g_profile for g_profile in g_profiles if g_profile is not None]
windows = parallel.apply_iter_simple(g_profiles, get_windows, progress_bar=True)

title = '{}, forward ({})'.format(orf_type, len(windows))
plot_windows(windows, title)

``````
``````

100%|██████████| 17083/17083 [00:12<00:00, 1350.57it/s]
100%|██████████| 7989/7989 [00:01<00:00, 6199.56it/s]

``````
``````

In [ ]:

``````
``````

In [ ]:

``````
``````

In [ ]:

def run_all(g, profiles, num_cpus=2):
orf_type = g['orf_type'].iloc[0]

g_profiles = parallel.apply_df_simple(g, get_profile, profiles, progress_bar=True)
windows = parallel.apply_parallel_iter(g_profiles, num_cpus, get_windows, progress_bar=True)

plot_windows(windows, orf_type)

``````
``````

In [ ]:

num_cpus = 2
orf_type_groups.apply(run_all, profiles, num_cpus)

``````
``````

In [ ]:

run_all(g, profiles)

``````
``````

In [ ]:

orf_type_groups.apply()

``````
``````

In [ ]:

run_all('canonical', orf_type_groups, profiles)

``````
``````

In [ ]:

run_all('five_prime_overlap', orf_type_groups, profiles)

``````
``````

In [ ]:

run_all('five_prime', orf_type_groups, profiles)

``````
``````

In [ ]:

run_all('three_prime_overlap', orf_type_groups, profiles)

``````
``````

In [ ]:

``````