In [1]:
%run ~/relmapping/annot/notebooks/__init__.ipynb
def vp(fp): return os.path.join('annot/Fig2S3_tss/', fp) # "verbose path"
In [2]:
fp_ = 'wget/genome.cshlp.org/content/suppl/2013/04/25/gr.151571.112.DC1/Supplemental_Table_S2.csv'
df_emb_ = pd.read_csv(fp_)
print('%d records raw' % (len(df_emb_)),)
print(df_emb_['TSS type'].value_counts())
print()
df_emb_ = df_emb_.loc[df_emb_['start position'] == df_emb_['start position']]
print('%d records with start position set' % (len(df_emb_)),)
print(df_emb_['TSS type'].value_counts())
http://genome.cshlp.org/content/23/8/1348.long
Among 13,336 genes having Gaussian TSS peaks in the region of length 3000 bp upstream of their translation initiation sites, 7351 genes (55.1%) had only Gaussian outron-TSSs and were therefore defined as trans-spliced genes, and 4906 genes (36.8%) having only Gaussian exon-TSSs were categorized as non-trans-spliced genes (Fig. 1C).
Are 7310 / 4904 close enough to 7351 / 4906?
For genes that could be defined unambiguously as trans-spliced or non-trans-spliced, we assigned gene-unique representative outron/exon-TSSs with the maximum number of aligned reads. This allowed us to pursue further analysis in parallel with (1) all Gaussian peaks as TSSs (“The Gaussian Peak Collection”), and (2) representative gene-unique TSSs (one for each gene).
In [3]:
df_emb = pd.DataFrame()
df_emb['chrom'] = df_emb_['chr']
df_emb['start'] = list(map(int, df_emb_['start position']))
df_emb['end'] = df_emb['start'] + 1
df_emb['name'] = df_emb_['common gene name']
df_emb['score'] = df_emb_['number of aligned reads in the best-fit Gaussian distribution']
df_emb['strand'] = df_emb_['strand']
df_emb['stage'] = 'embryo'
df_emb.head()
Out[3]:
In [4]:
fp_ = 'wget/genome.cshlp.org/content/suppl/2013/04/25/gr.151571.112.DC1/Supplemental_Table_S3.csv'
df_ad_ = pd.read_csv(fp_)
print('%d records raw' % (len(df_ad_)),)
print(df_ad_['TSS type'].value_counts())
print()
df_ad_ = df_ad_.loc[df_ad_['start position'] == df_ad_['start position']].reset_index(drop=True)
print('%d records with start position set' % (len(df_ad_)),)
print(df_ad_['TSS type'].value_counts())
In [5]:
df_ad = pd.DataFrame()
df_ad['chrom'] = df_ad_['chr']
df_ad['start'] = list(map(int, df_ad_['start position']))
df_ad['end'] = df_ad['start'] + 1
df_ad['name'] = df_ad_['common gene name']
df_ad['score'] = df_ad_['number of aligned reads in the best-fit Gaussian distribution']
df_ad['strand'] = df_ad_['strand']
df_ad['stage'] = 'adult'
df_ad.head()
Out[5]:
In [6]:
df_ = pd.concat([df_emb, df_ad], axis=0)\
.sort_values(['chrom', 'start', 'end', 'strand']).reset_index(drop=True)
print('%d emb representative TSSs' % (len(df_emb),))
print('%d adult representative TSSs' % (len(df_ad),))
df_saito_emb_ad = BedTool.from_dataframe(df_).merge(s=True, c='4,5,7', o='distinct,sum,distinct').to_dataframe()
print('%d pooled Embryo+adult records' % (len(df_saito_emb_ad),))
df_saito_emb_ad.columns = ('chrom', 'start', 'end', 'strand', 'name', 'score', 'stage')
df_saito_emb_ad['stage'].value_counts()
Out[6]:
In [7]:
gv = yp.GenomicVenn2(
BedTool.from_dataframe(df_emb),
BedTool.from_dataframe(df_ad),
label_a='(Saito et al., 2013)\nEmb TSSs',
label_b='(Saito et al., 2013)\nadult TSSs',
)
plt.figure(figsize=(8,8))
plt.subplot(1,1,1)
gv.plot()
Out[7]:
In [8]:
fp_ = vp('Saito2013_tss.bed')
write_gffbed(fp_,
chrom = df_saito_emb_ad['chrom'],
start = df_saito_emb_ad['start'],
end = df_saito_emb_ad['end'],
name = df_saito_emb_ad['name'],
strand = df_saito_emb_ad['strand'],
attr = df_saito_emb_ad[['name', 'stage']],
)
In [9]:
# (Saito et al., 2013)
df_regl_ = regl_Apr27(flank_len=150)[['chrom', 'start', 'end', 'annot']]
gv = yp.GenomicVenn2(
BedTool.from_dataframe(df_regl_),
BedTool.from_dataframe(df_saito_emb_ad[yp.NAMES_BED3]),
label_a='Accessible sites',
label_b='(Saito et al., 2013)\nembryo + adult TSSs',
)
plt.figure(figsize=(12,6)).subplots_adjust(wspace=0.5)
plt.subplot(1,2,1)
gv.plot()
plt.subplot(1,2,2)
annot_count_ = gv.df_a_with_b['name'].value_counts()[config['annot']]
annot_count_.index = [
'coding_promoter',
'pseudogene_promoter',
'unknown_promoter',
'putative_enhancer',
'non-coding_RNA',
'\nother_element'
]
#plt.title('Annotation of %d accessible sites that overlap a TSS from (Saito et al., 2013)' % (len(gv.df_a_with_b),))
plt.pie(
annot_count_.values,
labels = ['%s (%d)' % (l, c) for l, c in annot_count_.iteritems()],
colors=[yp.RED, yp.ORANGE, yp.YELLOW, yp.GREEN, '0.4', yp.BLUE],
counterclock=False,
startangle=70,
autopct='%.1f%%',
);
plt.gca().set_aspect('equal')
#plt.savefig('annot/Fig2S5_tss/Saito2013_annot.pdf', bbox_inches='tight', transparent=True)
In [10]:
# (Saito et al., 2013)
df_regl_ = regl_Apr27(flank_len=150)[['chrom', 'start', 'end', 'annot']]
gv = yp.GenomicVenn2(
BedTool.from_dataframe(df_regl_),
BedTool.from_dataframe(df_saito_emb_ad[yp.NAMES_BED3]),
label_a='Accessible sites',
label_b='(Saito et al.,\n2013) embryo\n+ adult TSSs',
)
plt.figure(figsize=(8,4)).subplots_adjust(wspace=0.2)
plt.subplot(1,2,1)
v = gv.plot(style='compact')
v.get_patch_by_id('10').set_color(yp.RED)
v.get_patch_by_id('01').set_color(yp.GREEN)
v.get_patch_by_id('11').set_color(yp.YELLOW)
plt.subplot(1,2,2)
d_reduced_ = collections.OrderedDict([
('coding_promoter', 'coding_promoter, pseudogene_promoter'),
('pseudogene_promoter', 'coding_promoter, pseudogene_promoter'),
('unknown_promoter', 'unknown_promoter'),
('putative_enhancer', 'putative_enhancer'),
('non-coding_RNA', 'other_element, non-coding_RNA'),
('other_element', 'other_element, non-coding_RNA'),
])
d_colour_ = collections.OrderedDict([
('coding_promoter, pseudogene_promoter', yp.RED),
('unknown_promoter', yp.YELLOW),
('putative_enhancer', yp.GREEN),
('other_element, non-coding_RNA', yp.BLUE),
])
gv.df_a_with_b['name_reduced'] = [*map(lambda a: d_reduced_[a], gv.df_a_with_b['name'])]
annot_count_ = gv.df_a_with_b['name_reduced'].value_counts()[d_colour_.keys()]
#plt.title('Annotation of %d accessible sites that overlap a TSS from (Chen et al., 2013)' % (len(gv.df_a_with_b),))
(patches, texts) = plt.pie(
annot_count_.values,
labels = yp.pct_(annot_count_.values),
colors=d_colour_.values(),
counterclock=False,
startangle=45,
);
plt.gca().set_aspect('equal')
#plt.savefig(vp('Saito2013_annot.pdf'), bbox_inches='tight', transparent=True)
plt.savefig('annot_Apr27/Fig2S3C_Saito2013_annot.pdf', bbox_inches='tight', transparent=True)