notebook.community

Edit and run



In [1]:

    
%run ~/relmapping/annot/notebooks/__init__.ipynb
def vp(fp): return os.path.join('annot/Fig2S3_tss/', fp) # "verbose path"









    



/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools






    



os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping



In [2]:

    
fp_ = 'wget/genome.cshlp.org/content/suppl/2013/04/25/gr.151571.112.DC1/Supplemental_Table_S2.csv'
df_emb_ = pd.read_csv(fp_)
print('%d records raw' % (len(df_emb_)),)
print(df_emb_['TSS type'].value_counts())
print()
df_emb_ = df_emb_.loc[df_emb_['start position'] == df_emb_['start position']]
print('%d records with start position set' % (len(df_emb_)),)
print(df_emb_['TSS type'].value_counts())









    



12520 records raw
outron    7310
exon      4904
Name: TSS type, dtype: int64

12214 records with start position set
outron    7310
exon      4904
Name: TSS type, dtype: int64

http://genome.cshlp.org/content/23/8/1348.long

Among 13,336 genes having Gaussian TSS peaks in the region of length 3000 bp upstream of their translation initiation sites, 7351 genes (55.1%) had only Gaussian outron-TSSs and were therefore defined as trans-spliced genes, and 4906 genes (36.8%) having only Gaussian exon-TSSs were categorized as non-trans-spliced genes (Fig. 1C).

Are 7310 / 4904 close enough to 7351 / 4906?

For genes that could be defined unambiguously as trans-spliced or non-trans-spliced, we assigned gene-unique representative outron/exon-TSSs with the maximum number of aligned reads. This allowed us to pursue further analysis in parallel with (1) all Gaussian peaks as TSSs (“The Gaussian Peak Collection”), and (2) representative gene-unique TSSs (one for each gene).



In [3]:

    
df_emb = pd.DataFrame()
df_emb['chrom'] = df_emb_['chr']
df_emb['start'] = list(map(int, df_emb_['start position']))
df_emb['end'] = df_emb['start'] + 1
df_emb['name'] = df_emb_['common gene name']
df_emb['score'] = df_emb_['number of aligned reads in the best-fit Gaussian distribution']
df_emb['strand'] = df_emb_['strand']
df_emb['stage'] = 'embryo'
df_emb.head()



In [4]:

    
fp_ = 'wget/genome.cshlp.org/content/suppl/2013/04/25/gr.151571.112.DC1/Supplemental_Table_S3.csv'
df_ad_ = pd.read_csv(fp_)
print('%d records raw' % (len(df_ad_)),)
print(df_ad_['TSS type'].value_counts())
print()
df_ad_ = df_ad_.loc[df_ad_['start position'] == df_ad_['start position']].reset_index(drop=True)
print('%d records with start position set' % (len(df_ad_)),)
print(df_ad_['TSS type'].value_counts())









    



12520 records raw
exon      2837
outron    2728
Name: TSS type, dtype: int64

5565 records with start position set
exon      2837
outron    2728
Name: TSS type, dtype: int64



In [5]:

    
df_ad = pd.DataFrame()
df_ad['chrom'] = df_ad_['chr']
df_ad['start'] = list(map(int, df_ad_['start position']))
df_ad['end'] = df_ad['start'] + 1
df_ad['name'] = df_ad_['common gene name']
df_ad['score'] = df_ad_['number of aligned reads in the best-fit Gaussian distribution']
df_ad['strand'] = df_ad_['strand']
df_ad['stage'] = 'adult'
df_ad.head()



In [6]:

    
df_ = pd.concat([df_emb, df_ad], axis=0)\
    .sort_values(['chrom', 'start', 'end', 'strand']).reset_index(drop=True)
print('%d emb representative TSSs' % (len(df_emb),))
print('%d adult representative TSSs' % (len(df_ad),))

df_saito_emb_ad = BedTool.from_dataframe(df_).merge(s=True, c='4,5,7', o='distinct,sum,distinct').to_dataframe()
print('%d pooled Embryo+adult records' % (len(df_saito_emb_ad),))

df_saito_emb_ad.columns = ('chrom', 'start', 'end', 'strand', 'name', 'score', 'stage')
df_saito_emb_ad['stage'].value_counts()









    



12214 emb representative TSSs
5565 adult representative TSSs
16011 pooled Embryo+adult records






    Out[6]:





embryo          10446
adult            3797
adult,embryo     1768
Name: stage, dtype: int64



In [7]:

    
gv = yp.GenomicVenn2(
    BedTool.from_dataframe(df_emb),
    BedTool.from_dataframe(df_ad),
    label_a='(Saito et al., 2013)\nEmb TSSs',
    label_b='(Saito et al., 2013)\nadult TSSs',
)
plt.figure(figsize=(8,8))
plt.subplot(1,1,1)
gv.plot()









    Out[7]:





<matplotlib_venn._common.VennDiagram at 0x7f8a45d3a1d0>



In [8]:

    
fp_ = vp('Saito2013_tss.bed')
write_gffbed(fp_,
    chrom = df_saito_emb_ad['chrom'],
    start = df_saito_emb_ad['start'],
    end = df_saito_emb_ad['end'],
    name = df_saito_emb_ad['name'],
    strand = df_saito_emb_ad['strand'],
    attr = df_saito_emb_ad[['name', 'stage']],
)



In [9]:

    
# (Saito et al., 2013)
df_regl_ = regl_Apr27(flank_len=150)[['chrom', 'start', 'end', 'annot']]

gv = yp.GenomicVenn2(
    BedTool.from_dataframe(df_regl_),
    BedTool.from_dataframe(df_saito_emb_ad[yp.NAMES_BED3]),
    label_a='Accessible sites',
    label_b='(Saito et al., 2013)\nembryo + adult TSSs',
)

plt.figure(figsize=(12,6)).subplots_adjust(wspace=0.5)
plt.subplot(1,2,1)
gv.plot()

plt.subplot(1,2,2)
annot_count_ = gv.df_a_with_b['name'].value_counts()[config['annot']]
annot_count_.index = [
    'coding_promoter',
    'pseudogene_promoter',
    'unknown_promoter',
    'putative_enhancer',
    'non-coding_RNA',
    '\nother_element'
]
#plt.title('Annotation of %d accessible sites that overlap a TSS from (Saito et al., 2013)' % (len(gv.df_a_with_b),))
plt.pie(
    annot_count_.values,
    labels = ['%s (%d)' % (l, c) for l, c in annot_count_.iteritems()],
    colors=[yp.RED, yp.ORANGE, yp.YELLOW, yp.GREEN, '0.4', yp.BLUE],
    counterclock=False,
    startangle=70,
    autopct='%.1f%%',
);
plt.gca().set_aspect('equal')
#plt.savefig('annot/Fig2S5_tss/Saito2013_annot.pdf', bbox_inches='tight', transparent=True)









    



/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/ipykernel_launcher.py:64: RuntimeWarning: Mean of empty slice






    



13054 of 42245 sites with CV values via promoter annotation
26764 of 42245 sites with CV values via "associated gene"



In [10]:

    
# (Saito et al., 2013)
df_regl_ = regl_Apr27(flank_len=150)[['chrom', 'start', 'end', 'annot']]

gv = yp.GenomicVenn2(
    BedTool.from_dataframe(df_regl_),
    BedTool.from_dataframe(df_saito_emb_ad[yp.NAMES_BED3]),
    label_a='Accessible sites',
    label_b='(Saito et al.,\n2013) embryo\n+ adult TSSs',
)

plt.figure(figsize=(8,4)).subplots_adjust(wspace=0.2)
plt.subplot(1,2,1)
v = gv.plot(style='compact')
v.get_patch_by_id('10').set_color(yp.RED)
v.get_patch_by_id('01').set_color(yp.GREEN)
v.get_patch_by_id('11').set_color(yp.YELLOW)

plt.subplot(1,2,2)
d_reduced_ = collections.OrderedDict([
    ('coding_promoter', 'coding_promoter, pseudogene_promoter'),
    ('pseudogene_promoter', 'coding_promoter, pseudogene_promoter'),
    ('unknown_promoter', 'unknown_promoter'),
    ('putative_enhancer', 'putative_enhancer'),
    ('non-coding_RNA', 'other_element, non-coding_RNA'),
    ('other_element', 'other_element, non-coding_RNA'),
])

d_colour_ = collections.OrderedDict([
    ('coding_promoter, pseudogene_promoter', yp.RED),
    ('unknown_promoter', yp.YELLOW),
    ('putative_enhancer', yp.GREEN),
    ('other_element, non-coding_RNA', yp.BLUE),
])

gv.df_a_with_b['name_reduced'] = [*map(lambda a: d_reduced_[a], gv.df_a_with_b['name'])]
annot_count_ = gv.df_a_with_b['name_reduced'].value_counts()[d_colour_.keys()]

#plt.title('Annotation of %d accessible sites that overlap a TSS from (Chen et al., 2013)' % (len(gv.df_a_with_b),))
(patches, texts) = plt.pie(
    annot_count_.values,
    labels = yp.pct_(annot_count_.values),
    colors=d_colour_.values(),
    counterclock=False,
    startangle=45,
);
plt.gca().set_aspect('equal')
#plt.savefig(vp('Saito2013_annot.pdf'), bbox_inches='tight', transparent=True)
plt.savefig('annot_Apr27/Fig2S3C_Saito2013_annot.pdf', bbox_inches='tight', transparent=True)









    



/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/ipykernel_launcher.py:64: RuntimeWarning: Mean of empty slice






    



13054 of 42245 sites with CV values via promoter annotation
26764 of 42245 sites with CV values via "associated gene"

	chrom	start	end	name	score	strand	stage
0	chrI	10640	10641	Y74C9A.3	29.0	-	embryo
1	chrI	11285	11286	nlp-40	16.0	+	embryo
2	chrI	27287	27288	Y74C9A.4	30.0	-	embryo
3	chrI	43530	43531	Y74C9A.1	17.0	+	embryo
4	chrI	47202	47203	Y48G1C.12	50.0	+	embryo

	chrom	start	end	name	score	strand	stage
0	chrI	11289	11290	nlp-40	11.0	+	adult
1	chrI	91223	91224	Y48G1C.9	59.0	+	adult
2	chrI	93010	93011	Y48G1C.1	470.0	+	adult
3	chrI	109000	109001	Y48G1C.8	123.0	-	adult
4	chrI	110310	110311	rab-11.1	63.0	-	adult