In [1]:
%run ~/relmapping/annot_cb/notebooks/annot__init__.ipynb
In [2]:
def mp(fp): return os.path.join('annot_cb', 'metrics_exon', fp)
fp_atac = 'annot_cb/accessible_sites_cb.tsv'
df_atac = pd.read_csv(fp_atac, sep='\t')
l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
df_atac.head()
Out[2]:
In [3]:
# df_exon1 / df_exon2: first/other exons, coordinates fixed to represent 5' ends
#NAMES_EXON = ('chrom', 'start', 'end', 'transcript_id', 'score', 'strand', 'gene_id', 'gene_biotype', 'display_id')
l_adj_ = [
'chrom', 'start', 'end', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'Name',
'exon_start', 'exon_end', 'utr_start', 'utr_end', 'aoe_end', 'aoe_start', 'gene_biotype',
'exon_id', 'gene_id', 'init_cutoff_pos', #'locus_id',
]
df_pass1_exon1_fwd = read_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon1_fwd_utradj.bed')[l_adj_]
df_pass1_exon1_rev = read_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon1_rev_utradj.bed')[l_adj_]
df_pass1_exon2_fwd = read_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon2_fwd.bed')
df_pass1_exon2_rev = read_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon2_rev.bed')
df_pass2_exon1_fwd = read_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass2_exon1_fwd.bed')
df_pass2_exon1_rev = read_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass2_exon1_rev.bed')
# Set 1st exon matching area to TSS + 250bp
df_pass1_exon1_fwd['end'] = df_pass1_exon1_fwd['start'] + 1 + 250
df_pass1_exon1_rev['start'] = df_pass1_exon1_rev['end'] - 1 - 250
print(len(df_pass1_exon1_fwd), len(df_pass1_exon1_rev),
len(df_pass1_exon2_fwd), len(df_pass1_exon2_rev),
len(df_pass2_exon1_fwd), len(df_pass2_exon1_rev),
)
In [4]:
# Match hypersensitive sites to closest first/other exon
def df_closest(df_a, df_b, b_prefix, **kwargs):
df_a_sort = df_a
df_b_sort = df_b.sort_values(list(df_b.columns[:3]))
fn_ = BedTool.from_dataframe(df_a).closest(BedTool.from_dataframe(df_b_sort).fn, **kwargs).fn
names_ = list(itertools.chain(df_a.columns.values,
['%s_%s' % (b_prefix, col) for col in df_b.columns.values],
['%s_distance' % (b_prefix)]
))
df_ = pd.read_csv(fn_, names=names_, sep='\t')
return df_[names_[len(df_a.columns):]]
# Closest exon1 -- protein_coding & pseudogene
df_ = pd.concat([df_atac['chrom'], l_atac_peak_pos, l_atac_peak_pos + 1], axis=1).copy()
df_pass1_exon1_fwd_out_ = df_closest(df_, df_pass1_exon1_fwd, 'pass1_exon1', D='ref', t='first', iu=True)
df_pass1_exon1_rev_out_ = df_closest(df_, df_pass1_exon1_rev, 'pass1_exon1', D='ref', t='last', id=True)
# Closest exon2 can be both upstream or downstream
df_pass1_exon2_fwd_out_ = df_closest(df_, df_pass1_exon2_fwd, 'pass1_exon2', D='ref', t='first')
df_pass1_exon2_rev_out_ = df_closest(df_, df_pass1_exon2_rev, 'pass1_exon2', D='ref', t='last')
# Closest exon2 can be both upstream or downstream (downstream only)
df_pass1_exon2_downstream_fwd_out_ = df_closest(df_, df_pass1_exon2_fwd, 'pass1_exon2_downstream', D='ref', t='first', iu=True)
df_pass1_exon2_downstream_rev_out_ = df_closest(df_, df_pass1_exon2_rev, 'pass1_exon2_downstream', D='ref', t='last', id=True)
# Closes pass2/exon1 -- tRNA, snoRNA, rRNA, miRNA, etc
df_pass2_exon1_fwd_out_ = df_closest(df_, df_pass2_exon1_fwd, 'pass2_exon1', D='ref', t='first', iu=True)
df_pass2_exon1_rev_out_ = df_closest(df_, df_pass2_exon1_rev, 'pass2_exon1', D='ref', t='last', id=True)
df_exon_fwd = pd.concat([df_pass1_exon1_fwd_out_, df_pass1_exon2_fwd_out_, df_pass1_exon2_downstream_fwd_out_, df_pass2_exon1_fwd_out_], axis=1)
df_exon_rev = pd.concat([df_pass1_exon1_rev_out_, df_pass1_exon2_rev_out_, df_pass1_exon2_downstream_rev_out_, df_pass2_exon1_rev_out_], axis=1)
df_exon_fwd.to_csv(mp('closest_exon_fwd.tsv'), header=True, index=False, sep='\t')
df_exon_rev.to_csv(mp('closest_exon_rev.tsv'), header=True, index=False, sep='\t')
#l_ = ['pass1_exon1_gene_id', 'pass1_exon1_locus_id', 'pass1_exon1_distance']
l_ = ['pass1_exon1_gene_id', 'pass1_exon1_distance']
write_gffbed(mp('closest_pass1_exon1_fwd.bed'),
chrom = df_atac['chrom'],
start = df_atac['start'],
end = df_atac['end'],
name = df_exon_fwd['pass1_exon1_Name'],
strand = '+',
attr = df_exon_fwd[l_],
)
write_gffbed(mp('closest_pass1_exon1_rev.bed'),
chrom = df_atac['chrom'],
start = df_atac['start'],
end = df_atac['end'],
name = df_exon_rev['pass1_exon1_Name'],
strand = '-',
attr = df_exon_rev[l_],
)
#l_ = ['pass1_exon2_gene_id', 'pass1_exon2_locus_id', 'pass1_exon2_distance']
l_ = ['pass1_exon2_gene_id', 'pass1_exon2_distance']
write_gffbed(mp('closest_pass1_exon2_fwd.bed'),
chrom = df_atac['chrom'],
start = df_atac['start'],
end = df_atac['end'],
name = df_exon_fwd['pass1_exon2_Name'],
strand = '+',
attr = df_exon_fwd[l_],
)
write_gffbed(mp('closest_pass1_exon2_rev.bed'),
chrom = df_atac['chrom'],
start = df_atac['start'],
end = df_atac['end'],
name = df_exon_rev['pass1_exon2_Name'],
strand = '-',
attr = df_exon_rev[l_],
)
#l_ = ['pass1_exon2_downstream_gene_id', 'pass1_exon2_downstream_locus_id', 'pass1_exon2_downstream_distance']
l_ = ['pass1_exon2_downstream_gene_id', 'pass1_exon2_downstream_distance']
write_gffbed(mp('closest_pass1_exon2_downstream_fwd.bed'),
chrom = df_atac['chrom'],
start = df_atac['start'],
end = df_atac['end'],
name = df_exon_fwd['pass1_exon2_downstream_Name'],
strand = '+',
attr = df_exon_fwd[l_],
)
write_gffbed(mp('closest_pass1_exon2_downstream_rev.bed'),
chrom = df_atac['chrom'],
start = df_atac['start'],
end = df_atac['end'],
name = df_exon_rev['pass1_exon2_downstream_Name'],
strand = '-',
attr = df_exon_rev[l_],
)
#l_ = ['pass2_exon1_gene_id', 'pass2_exon1_locus_id', 'pass2_exon1_distance']
l_ = ['pass2_exon1_gene_id', 'pass2_exon1_distance']
write_gffbed(mp('closest_pass2_exon1_fwd.bed'),
chrom = df_atac['chrom'],
start = df_atac['start'],
end = df_atac['end'],
name = df_exon_fwd['pass2_exon1_Name'],
strand = '+',
attr = df_exon_fwd[l_],
)
write_gffbed(mp('closest_pass2_exon1_rev.bed'),
chrom = df_atac['chrom'],
start = df_atac['start'],
end = df_atac['end'],
name = df_exon_rev['pass2_exon1_Name'],
strand = '-',
attr = df_exon_rev[l_],
)
In [ ]: