In [1]:
# Set annotation to annot_ce10
%run ~/relmapping/annot/notebooks/annot__init__.ipynb
#annot_ = 'annot_ce10_eLife_full'
annot_ = 'annot_ce10_tissues'
def mp(fp, annot_=annot_): return os.path.join(annot_, 'metrics_exon', fp)


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping

In [2]:
# Load long cap jump & exon data
df_atac = pd.read_csv(os.path.join(annot_, 'accessible_sites.tsv'), sep='\t')
l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
#df_sites = df_sites.head(100)
print(len(df_atac), 'ATAC-seq sites')


42874 ATAC-seq sites

In [3]:
# df_exon1 / df_exon2: first/other exons, coordinates fixed to represent 5' ends
#NAMES_EXON = ('chrom', 'start', 'end', 'transcript_id', 'score', 'strand', 'gene_id', 'gene_biotype', 'display_id')
l_adj_ = [
    'chrom', 'start', 'end', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'Name', 
    'exon_start', 'exon_end', 'utr_start', 'utr_end', 'aoe_end', 'aoe_start', 'gene_biotype', 
    'exon_id', 'gene_id', 'locus_id', 'init_cutoff_pos', 
]

#df_pass1_exon1_fwd = read_gffbed('WS260_ce10/WS260_ce10.transcripts.annot_pass1_exon1_fwd.bed')
#df_pass1_exon1_rev = read_gffbed('WS260_ce10/WS260_ce10.transcripts.annot_pass1_exon1_rev.bed')
df_pass1_exon1_fwd = read_gffbed('WS260_ce10/WS260_ce10.transcripts.annot_pass1_exon1_fwd_utradj.bed')[l_adj_]
df_pass1_exon1_rev = read_gffbed('WS260_ce10/WS260_ce10.transcripts.annot_pass1_exon1_rev_utradj.bed')[l_adj_]
df_pass1_exon2_fwd = read_gffbed('WS260_ce10/WS260_ce10.transcripts.annot_pass1_exon2_fwd.bed')
df_pass1_exon2_rev = read_gffbed('WS260_ce10/WS260_ce10.transcripts.annot_pass1_exon2_rev.bed')
df_pass2_exon1_fwd = read_gffbed('WS260_ce10/WS260_ce10.transcripts.annot_pass2_exon1_fwd.bed')
df_pass2_exon1_rev = read_gffbed('WS260_ce10/WS260_ce10.transcripts.annot_pass2_exon1_rev.bed')

# Set 1st exon matching area to TSS + 250bp
df_pass1_exon1_fwd['end'] = df_pass1_exon1_fwd['start'] + 1 + 250
df_pass1_exon1_rev['start'] = df_pass1_exon1_rev['end'] - 1 - 250

print(len(df_pass1_exon1_fwd), len(df_pass1_exon1_rev),
      len(df_pass1_exon2_fwd), len(df_pass1_exon2_rev),
      len(df_pass2_exon1_fwd), len(df_pass2_exon1_rev),
)


17922 17074 105494 99143 808 753

In [4]:
# Match hypersensitive sites to closest first/other exon
def df_closest(df_a, df_b, b_prefix, **kwargs):
    df_a_sort = df_a
    df_b_sort = df_b.sort_values(list(df_b.columns[:3]))
    fn_ = BedTool.from_dataframe(df_a).closest(BedTool.from_dataframe(df_b_sort).fn, **kwargs).fn
    names_ = list(itertools.chain(df_a.columns.values,
        ['%s_%s' % (b_prefix, col) for col in df_b.columns.values],
        ['%s_distance' % (b_prefix)]
    ))
    df_ = pd.read_csv(fn_, names=names_, sep='\t')
    return df_[names_[len(df_a.columns):]]

# Closest exon1 -- protein_coding & pseudogene
df_ = pd.concat([df_atac['chrom'], l_atac_peak_pos, l_atac_peak_pos + 1], axis=1).copy()
df_pass1_exon1_fwd_out_ = df_closest(df_, df_pass1_exon1_fwd, 'pass1_exon1', D='ref', t='first', iu=True)
df_pass1_exon1_rev_out_ = df_closest(df_, df_pass1_exon1_rev, 'pass1_exon1', D='ref', t='last', id=True)

# Closest exon2 can be both upstream or downstream
df_pass1_exon2_fwd_out_ = df_closest(df_, df_pass1_exon2_fwd, 'pass1_exon2', D='ref', t='first')
df_pass1_exon2_rev_out_ = df_closest(df_, df_pass1_exon2_rev, 'pass1_exon2', D='ref', t='last')

# Closest exon2 can be both upstream or downstream (downstream only)
df_pass1_exon2_downstream_fwd_out_ = df_closest(df_, df_pass1_exon2_fwd, 'pass1_exon2_downstream', D='ref', t='first', iu=True)
df_pass1_exon2_downstream_rev_out_ = df_closest(df_, df_pass1_exon2_rev, 'pass1_exon2_downstream', D='ref', t='last', id=True)

# Closes pass2/exon1 -- tRNA, snoRNA, rRNA, miRNA, etc
df_pass2_exon1_fwd_out_ = df_closest(df_, df_pass2_exon1_fwd, 'pass2_exon1', D='ref', t='first', iu=True)
df_pass2_exon1_rev_out_ = df_closest(df_, df_pass2_exon1_rev, 'pass2_exon1', D='ref', t='last', id=True)

df_exon_fwd = pd.concat([df_pass1_exon1_fwd_out_, df_pass1_exon2_fwd_out_, df_pass1_exon2_downstream_fwd_out_, df_pass2_exon1_fwd_out_], axis=1)
df_exon_rev = pd.concat([df_pass1_exon1_rev_out_, df_pass1_exon2_rev_out_, df_pass1_exon2_downstream_rev_out_, df_pass2_exon1_rev_out_], axis=1)

df_exon_fwd.to_csv(mp('closest_exon_fwd.tsv'), header=True, index=False, sep='\t')
df_exon_rev.to_csv(mp('closest_exon_rev.tsv'), header=True, index=False, sep='\t')

l_ = ['pass1_exon1_gene_id', 'pass1_exon1_locus_id', 'pass1_exon1_distance']
write_gffbed(mp('closest_pass1_exon1_fwd.bed'),
    chrom = df_atac['chrom'],
    start = df_atac['start'],
    end = df_atac['end'],
    name = df_exon_fwd['pass1_exon1_Name'],
    strand = '+',
    attr = df_exon_fwd[l_], 
)
write_gffbed(mp('closest_pass1_exon1_rev.bed'),
    chrom = df_atac['chrom'],
    start = df_atac['start'],
    end = df_atac['end'],
    name = df_exon_rev['pass1_exon1_Name'],
    strand = '-',
    attr = df_exon_rev[l_], 
)

l_ = ['pass1_exon2_gene_id', 'pass1_exon2_locus_id', 'pass1_exon2_distance']
write_gffbed(mp('closest_pass1_exon2_fwd.bed'),
    chrom = df_atac['chrom'],
    start = df_atac['start'],
    end = df_atac['end'],
    name = df_exon_fwd['pass1_exon2_Name'],
    strand = '+',
    attr = df_exon_fwd[l_], 
)
write_gffbed(mp('closest_pass1_exon2_rev.bed'),
    chrom = df_atac['chrom'],
    start = df_atac['start'],
    end = df_atac['end'],
    name = df_exon_rev['pass1_exon2_Name'],
    strand = '-',
    attr = df_exon_rev[l_], 
)

l_ = ['pass1_exon2_downstream_gene_id', 'pass1_exon2_downstream_locus_id', 'pass1_exon2_downstream_distance']
write_gffbed(mp('closest_pass1_exon2_downstream_fwd.bed'),
    chrom = df_atac['chrom'],
    start = df_atac['start'],
    end = df_atac['end'],
    name = df_exon_fwd['pass1_exon2_downstream_Name'],
    strand = '+',
    attr = df_exon_fwd[l_], 
)
write_gffbed(mp('closest_pass1_exon2_downstream_rev.bed'),
    chrom = df_atac['chrom'],
    start = df_atac['start'],
    end = df_atac['end'],
    name = df_exon_rev['pass1_exon2_downstream_Name'],
    strand = '-',
    attr = df_exon_rev[l_], 
)

l_ = ['pass2_exon1_gene_id', 'pass2_exon1_locus_id', 'pass2_exon1_distance']
write_gffbed(mp('closest_pass2_exon1_fwd.bed'),
    chrom = df_atac['chrom'],
    start = df_atac['start'],
    end = df_atac['end'],
    name = df_exon_fwd['pass2_exon1_Name'],
    strand = '+',
    attr = df_exon_fwd[l_], 
)
write_gffbed(mp('closest_pass2_exon1_rev.bed'),
    chrom = df_atac['chrom'],
    start = df_atac['start'],
    end = df_atac['end'],
    name = df_exon_rev['pass2_exon1_Name'],
    strand = '-',
    attr = df_exon_rev[l_], 
)