In [1]:
%run ~/relmapping/annot/notebooks/__init__.ipynb
def vp(fp): return os.path.join('annot/Fig2S3_tss/', fp) # "verbose path"
http://www.sciencedirect.com/science/article/pii/S0092867412014080
Table S1. trans-Splice Sites, Transcription Start Sites, and csRNA Loci for Protein-Coding Genes and Transcription Start Sites for pri-miRNAs, Related to Figure 2. Analysis of C. elegans CapSeq and CIP-TAP, containing lists of trans-splice sites, transcription start sites, and sense and antisense csRNAs derived from protein coding genes. Also included is the list of the transcription start sites for pri-miRNAs.
For C. elegans analysis, reads were mapped to the genome (WormBase release WS215)
In [2]:
#!cd ~/relmapping/wget; wget -m --no-parent https://ars.els-cdn.com/content/image/1-s2.0-S0092867412014080-mmc1.xlsx
fp_ = 'wget/ars.els-cdn.com/content/image/1-s2.0-S0092867412014080-mmc1_B._TS_sites_for_protein_genes.csv'
df_ = pd.read_csv(fp_, skiprows=11)
df_['assigned_to_an_annotation'] = df_['transcript'].map(lambda x: x == x)
print('%d records, ~20,000 not assigned to an annotation:' % (len(df_)))
print(df_['assigned_to_an_annotation'].value_counts())
df_.head()
Out[2]:
Using a cutoff of one CapSeq read per 10 million total reads, and a requirement for a YR motif, our CapSeq data predicted approximately 64,000 candidate TS sites genome wide (Table S1B).
In [3]:
print(df_['transcript type'].value_counts())
m_ = df_['transcript type'] == "coding"
df_ = df_.loc[m_].reset_index(drop=True)
print('%d records with annotated as "coding"' % (len(df_.query('transcript == transcript')),))
In [4]:
# Raw (Gu et al., 2012) TSS sites (=many assigned to multiple transcripts)
df_gu = pd.DataFrame()
df_gu['chrom'] = 'chr' + df_['chromosome']
df_gu['start'] = df_['start']
df_gu['end'] = df_['start'] + 1
df_gu['name'] = df_['transcript']
df_gu['score'] = df_['reads']
df_gu['strand'] = df_['strand']
df_gu = df_gu.sort_values(['chrom', 'start', 'end', 'start']).reset_index(drop=True)
fp_ = vp('Gu2012_tss.bed')
write_gffbed(fp_,
chrom = df_gu['chrom'],
start = df_gu['start'],
end = df_gu['end'],
name = df_gu['name'],
strand = df_gu['strand'],
score = df_gu['score'],
)
!wc -l {fp_}
In [5]:
# Collapse TSS annotations by chrom/start/end/strand (raw TSS assignments are to all "compatible" transcripts)
fp_ = vp('Gu2012_tss_unique.bed')
df_gu.groupby(['chrom', 'start', 'end', 'strand']).agg({
'name': lambda l: os.path.commonprefix(list(l)).rstrip('.'),#lambda l: ','.join(sorted(set(l))),
'score': np.sum,
})\
.reset_index().sort_values(['chrom', 'start', 'end', 'strand'])[['chrom', 'start', 'end', 'name', 'score', 'strand']]\
.to_csv(fp_, sep='\t', index=False, header=False)
!wc -l {fp_}
In [6]:
# Cluster TSS annotations using single-linkage, strand-specific, using a distance cutoff of 50
df_gu_cluster50_ = BedTool.from_dataframe(df_gu).cluster(d=50, s=True).to_dataframe()
df_gu_cluster50_.columns = ('chrom', 'start', 'end', 'transcript_id', 'score', 'strand', 'cluster_id')
fp_ = vp('Gu2012_tss_clustered.bed')
df_gu_cluster50 = df_gu_cluster50_.groupby('cluster_id').agg({
'chrom': lambda s: list(set(s))[0],
'start': np.min,
'end': np.max,
'transcript_id': lambda l: os.path.commonprefix(list(l)).rstrip('.'),#lambda l: ','.join(sorted(set(l))),
'score': np.sum,
'strand': lambda s: list(set(s))[0],
})\
.sort_values(['chrom', 'start', 'end', 'strand']).reset_index(drop=True)
df_gu_cluster50.to_csv(fp_, sep='\t', index=False, header=False)
!wc -l {fp_}
In [7]:
# Overlaps to TSS clusters
df_regl_ = regl_Apr27(flank_len=150)[['chrom', 'start', 'end', 'annot']]
gv = yp.GenomicVenn2(
BedTool.from_dataframe(df_regl_),
BedTool.from_dataframe(df_gu_cluster50[yp.NAMES_BED3]),
label_a='Accessible sites',
label_b='(Gu et al., 2012)\nTSS clusters',
)
plt.figure(figsize=(12,6)).subplots_adjust(wspace=0.5)
plt.subplot(1,2,1)
gv.plot()
plt.subplot(1,2,2)
annot_count_ = gv.df_a_with_b['name'].value_counts()[config['annot']]
annot_count_.index = [
'coding_promoter',
'pseudogene_promoter',
'unknown_promoter',
'putative_enhancer',
'non-coding_RNA',
'\n\nother_element'
]
#plt.title('Annotation of %d accessible sites that overlap a TSS from (Gu et al., 2012)' % (len(gv.df_a_with_b),))
plt.pie(
annot_count_.values,
labels = ['%s (%d)' % (l, c) for l, c in annot_count_.iteritems()],
colors=[yp.RED, yp.ORANGE, yp.YELLOW, yp.GREEN, '0.4', yp.BLUE],
counterclock=False,
startangle=70,
autopct='%.1f%%',
);
plt.gca().set_aspect('equal')
#plt.savefig('annot/Fig2S5_tss/Gu2012_annot.pdf', bbox_inches='tight', transparent=True)
annot_count_
Out[7]:
In [8]:
df_regl_ = regl_Apr27(flank_len=150)[['chrom', 'start', 'end', 'annot']]
gv = yp.GenomicVenn2(
BedTool.from_dataframe(df_regl_),
BedTool.from_dataframe(df_gu_cluster50[yp.NAMES_BED3]),
label_a='Accessible sites',
label_b='(Gu et al., 2012)\nTSS clusters',
)
plt.figure(figsize=(8,4)).subplots_adjust(wspace=0.2)
plt.subplot(1,2,1)
v = gv.plot(style='compact')
v.get_patch_by_id('10').set_color(yp.RED)
v.get_patch_by_id('01').set_color(yp.GREEN)
v.get_patch_by_id('11').set_color(yp.YELLOW)
plt.subplot(1,2,2)
d_reduced_ = collections.OrderedDict([
('coding_promoter', 'coding_promoter, pseudogene_promoter'),
('pseudogene_promoter', 'coding_promoter, pseudogene_promoter'),
('unknown_promoter', 'unknown_promoter'),
('putative_enhancer', 'putative_enhancer'),
('non-coding_RNA', 'other_element, non-coding_RNA'),
('other_element', 'other_element, non-coding_RNA'),
])
d_colour_ = collections.OrderedDict([
('coding_promoter, pseudogene_promoter', yp.RED),
('unknown_promoter', yp.YELLOW),
('putative_enhancer', yp.GREEN),
('other_element, non-coding_RNA', yp.BLUE),
])
gv.df_a_with_b['name_reduced'] = [*map(lambda a: d_reduced_[a], gv.df_a_with_b['name'])]
annot_count_ = gv.df_a_with_b['name_reduced'].value_counts()[d_colour_.keys()]
#plt.title('Annotation of %d accessible sites that overlap a TSS from (Chen et al., 2013)' % (len(gv.df_a_with_b),))
(patches, texts) = plt.pie(
annot_count_.values,
labels = yp.pct_(annot_count_.values),
colors=d_colour_.values(),
counterclock=False,
startangle=45,
);
plt.gca().set_aspect('equal')
#plt.savefig(vp('Gu2012_annot.pdf'), bbox_inches='tight', transparent=True)
plt.savefig('annot_Apr27/Fig2S3D_Gu2012_annot.pdf', bbox_inches='tight', transparent=True)
In [9]:
#fp_ = 'annot/Fig2S4_TSS/Gu2012_not_atac.bed'
#gv.df_b_only.to_csv(fp_, header=False, sep='\t', index=False)
#!wc -l {fp_}