In [1]:
# initialise and load WB annotations
%run ~/relmapping/annot/notebooks/__init__.ipynb
fp_ = 'WS260_ce10/WS260_ce10.transcripts.annot.gtf.gz'
df_annot = yp.df_gfftags_unpack(yp.read_wbgtf(fp_, parse_attr=False), name='attribute')
In [2]:
# Load the table of genomic regions (tab-separated). Later cells read its
# chrom / start / end / type columns.
fp_ = 'annot/Fig2D2_genomic_regions/Fig2D2_genomic_regions.tsv'
df_regions = pd.read_csv(fp_, delimiter='\t')
In [3]:
# Preview the first five rows of the regions table.
df_regions.head(n=5)
Out[3]:
In [4]:
# Region types in display order, each mapped to the colour of its pie wedge.
# ('0.5' is a matplotlib grey-level string.)
d_ = collections.OrderedDict([
    ('outronic', yp.RED),
    ('exonic', yp.ORANGE),
    ('gene_end', yp.PURPLE),
    ('intronic', yp.YELLOW),
    ('intergenic', yp.SKYBLUE),
    ('mixed', '0.5'),
])

# Per-region length, computed as end - start
# (assumes half-open coordinates so that this is the length in bp -- TODO confirm).
df_regions['length'] = df_regions['end'] - df_regions['start']

# Total length per region type, ordered as in d_.
# 'sum' (string) instead of np.sum: passing the numpy callable to groupby.agg is
# deprecated in recent pandas; the result is the same.
# list(...) instead of the dict_keys view: matches how d_.keys() is passed elsewhere
# in this notebook and gives .loc a plain list indexer.
df_length = df_regions.groupby('type').agg({'length': 'sum'}).loc[list(d_.keys())]
df_length
Out[4]:
In [33]:
# Pie-chart input for the genome panel: one row per region type (same order as
# df_length), labelled only with its percentage of the summed length.
# Earlier label variants also showed the region name and/or the length in bp.
total_length_ = df_length['length'].sum()
l_pct_label_ = ["%.1f%%" % (100.0*length_ / total_length_) for length_ in df_length.length]
df_plt = pd.DataFrame({'length': df_length['length'].tolist()}, index=l_pct_label_)
df_plt
Out[33]:
In [34]:
# Build a 1-bp interval at the summit of every regulatory element and look up
# which genomic region type(s) it falls in.
df_regl = regl_Apr27()

df_mode = pd.DataFrame()
df_mode['chrom'] = df_regl['chrom'].copy()
df_mode['start'] = summit_pos(df_regl)
df_mode['end'] = summit_pos(df_regl) + 1

# bedtools map: for each summit interval, collect the distinct values of column 4
# ('type') of the overlapping regions.
# NOTE(review): bedtools map normally expects position-sorted inputs -- confirm that
# both df_regl and df_regions are sorted upstream.
bt_summits_ = BedTool.from_dataframe(df_mode)
bt_regions_ = BedTool.from_dataframe(df_regions[['chrom', 'start', 'end', 'type']])
df_mapped_ = bt_summits_.map(b=bt_regions_.fn, c=4, o='distinct').to_dataframe()

# NOTE(review): this assignment aligns on index; df_mapped_ has a fresh 0..n-1 index,
# so it presumably relies on df_regl having a default RangeIndex -- verify.
df_mode['type'] = df_mapped_['name']
In [44]:
# Two-panel pie figure: left = share of summed region length per region type,
# right = share of accessible sites (df_mode rows) per region type.
# Wedges are labelled with percentages only; the colour key for the region types
# is drawn by the separate legend cell (which reuses pie_ from this cell).
plt.figure(figsize=(12,6)).subplots_adjust(wspace=0.6)

# Left panel: genome composition.
plt.subplot(1,2,1)
df_plt['length'].plot(
    label='',
    kind='pie',
    colors=list(d_.values()),
    counterclock=False,
    startangle=90, # avoid 'mixed' label overlapping with the title
    fontsize=12,
);
plt.title('Genome', fontsize=18)
plt.axis('equal');

# Right panel: accessible sites.
# reindex(..., fill_value=0) rather than .loc[...]: with current pandas, .loc raises
# KeyError if any region type has no sites, whereas reindex yields a 0 count for it.
# Values of 'type' that are not keys of d_ are left out, as before.
df_mode_agg = df_mode['type'].value_counts().reindex(list(d_.keys()), fill_value=0)
df_mode_agg.index = ["%.1f%%" % (100.0*frac_,) for frac_ in df_mode_agg.values / df_mode_agg.values.sum()]
plt.subplot(1,2,2)
pie_ = df_mode_agg.plot(
    label='',
    kind='pie',
    colors=list(d_.values()),
    counterclock=False,
    startangle=90, # avoid 'mixed' label overlapping with the title
    fontsize=12,
);
plt.title('Accessible sites', fontsize=18)
plt.axis('equal');

plt.savefig('annot_Apr27/Fig2S2A_genomic_regions.pdf', bbox_inches='tight', transparent=True)
In [29]:
# Stand-alone legend: a tiny figure with hidden axes that carries only the colour
# key, built from the wedges of the last pie (pie_) and the region-type names in d_.
plt.figure(figsize=(0.2,0.2))
ax_legend_ = plt.gca()
ax_legend_.axis('off')
ax_legend_.legend(pie_.patches, list(d_.keys()), loc='upper left')
plt.savefig('annot_Apr27/Fig2S2_legend.pdf', bbox_inches='tight', transparent=True)
In [43]:
def plot_(m_, label_):
    """Draw a pie chart of region types for a subset of sites on the current axes.

    Reads the notebook-level df_mode (one row per site, with a 'type' column) and
    d_ (ordered mapping region type -> colour).

    m_     -- row selector passed to df_mode.loc[...]; the callers pass a boolean
              mask built from df_regl['annot'].
    label_ -- text used as the panel title.

    Wedges are labelled with their percentage of the selected sites. Values of
    'type' that are not keys of d_ are left out of the chart.
    """
    # reindex(..., fill_value=0) rather than .loc[...].fillna(0): with current pandas,
    # .loc raises KeyError when a region type is absent from the subset, so the
    # fillna was never reached; reindex gives the intended 0 count.
    df_mode_agg = df_mode.loc[m_]['type'].value_counts().reindex(list(d_.keys()), fill_value=0)
    df_mode_agg.index = ["%.1f%%" % (100.0*frac_) for frac_ in df_mode_agg.values / df_mode_agg.values.sum()]
    df_mode_agg.plot(
        label='',
        kind='pie',
        colors=list(d_.values()),
        counterclock=False,
        startangle=0, # avoid 'mixed' label overlapping with the title
        fontsize=12,
    );
    plt.title('%s' % (label_,), fontsize=18)
    plt.axis('equal');
# 2x3 grid of pies, one per annotation class of df_regl['annot'], filled row by row.
plt.figure(figsize=(16,12)).subplots_adjust(wspace=0.5)
l_annot_ = [
    'coding_promoter',
    'pseudogene_promoter',
    'unknown_promoter',
    'putative_enhancer',
    'non-coding_RNA',
    'other_element',
]
for i_, annot_ in enumerate(l_annot_, start=1):
    plt.subplot(2,3,i_)
    plot_(df_regl['annot'] == annot_, annot_)
plt.savefig('annot_Apr27/Fig2S2B_genomic_regions_by_annot.pdf', bbox_inches='tight', transparent=True)
In [ ]: