In [1]:
%run ~/relmapping/annot_cb/notebooks/annot__init__.ipynb


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping

In [2]:
def mp(fp): return os.path.join('annot_cb', 'metrics_exon', fp)

fp_atac = 'annot_cb/accessible_sites_cb.tsv'
df_atac = pd.read_csv(fp_atac, sep='\t')
l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
df_atac.head()


Out[2]:
chrom concave_start concave_end mode wt_ya_rep1_score wt_ya_rep2_score min_rank start end wt_ya_globalIDR max_globalIDR
0 I 7954 8180 8048 218.772699 431.677619 6379.0 7973 8124 3.671405 3.671405
1 I 16609 16809 16695 1089.472352 752.060789 1192.0 16620 16771 4.686630 4.686630
2 I 17076 17279 17167 634.236105 392.992569 2525.0 17092 17243 3.956980 3.956980
3 I 21469 21679 21560 2845.210881 3857.913439 65.0 21485 21636 5.000000 5.000000
4 I 39778 40055 39915 1768.688064 7326.561438 95.0 39840 39991 5.000000 5.000000

In [3]:
# df_exon1 / df_exon2: first/other exons, coordinates fixed to represent 5' ends
#NAMES_EXON = ('chrom', 'start', 'end', 'transcript_id', 'score', 'strand', 'gene_id', 'gene_biotype', 'display_id')
l_adj_ = [
    'chrom', 'start', 'end', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'Name', 
    'exon_start', 'exon_end', 'utr_start', 'utr_end', 'aoe_end', 'aoe_start', 'gene_biotype', 
    'exon_id', 'gene_id', 'init_cutoff_pos', #'locus_id', 
]

df_pass1_exon1_fwd = read_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon1_fwd_utradj.bed')[l_adj_]
df_pass1_exon1_rev = read_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon1_rev_utradj.bed')[l_adj_]
df_pass1_exon2_fwd = read_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon2_fwd.bed')
df_pass1_exon2_rev = read_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon2_rev.bed')
df_pass2_exon1_fwd = read_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass2_exon1_fwd.bed')
df_pass2_exon1_rev = read_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass2_exon1_rev.bed')

# Set 1st exon matching area to TSS + 250bp
df_pass1_exon1_fwd['end'] = df_pass1_exon1_fwd['start'] + 1 + 250
df_pass1_exon1_rev['start'] = df_pass1_exon1_rev['end'] - 1 - 250

print(len(df_pass1_exon1_fwd), len(df_pass1_exon1_rev),
      len(df_pass1_exon2_fwd), len(df_pass1_exon2_rev),
      len(df_pass2_exon1_fwd), len(df_pass2_exon1_rev),
)


12860 12771 69067 67641 477 497

In [4]:
# Match hypersensitive sites to closest first/other exon
def df_closest(df_a, df_b, b_prefix, **kwargs):
    df_a_sort = df_a
    df_b_sort = df_b.sort_values(list(df_b.columns[:3]))
    fn_ = BedTool.from_dataframe(df_a).closest(BedTool.from_dataframe(df_b_sort).fn, **kwargs).fn
    names_ = list(itertools.chain(df_a.columns.values,
        ['%s_%s' % (b_prefix, col) for col in df_b.columns.values],
        ['%s_distance' % (b_prefix)]
    ))
    df_ = pd.read_csv(fn_, names=names_, sep='\t')
    return df_[names_[len(df_a.columns):]]

# Closest exon1 -- protein_coding & pseudogene
df_ = pd.concat([df_atac['chrom'], l_atac_peak_pos, l_atac_peak_pos + 1], axis=1).copy()
df_pass1_exon1_fwd_out_ = df_closest(df_, df_pass1_exon1_fwd, 'pass1_exon1', D='ref', t='first', iu=True)
df_pass1_exon1_rev_out_ = df_closest(df_, df_pass1_exon1_rev, 'pass1_exon1', D='ref', t='last', id=True)

# Closest exon2 can be both upstream or downstream
df_pass1_exon2_fwd_out_ = df_closest(df_, df_pass1_exon2_fwd, 'pass1_exon2', D='ref', t='first')
df_pass1_exon2_rev_out_ = df_closest(df_, df_pass1_exon2_rev, 'pass1_exon2', D='ref', t='last')

# Closest exon2 can be both upstream or downstream (downstream only)
df_pass1_exon2_downstream_fwd_out_ = df_closest(df_, df_pass1_exon2_fwd, 'pass1_exon2_downstream', D='ref', t='first', iu=True)
df_pass1_exon2_downstream_rev_out_ = df_closest(df_, df_pass1_exon2_rev, 'pass1_exon2_downstream', D='ref', t='last', id=True)

# Closes pass2/exon1 -- tRNA, snoRNA, rRNA, miRNA, etc
df_pass2_exon1_fwd_out_ = df_closest(df_, df_pass2_exon1_fwd, 'pass2_exon1', D='ref', t='first', iu=True)
df_pass2_exon1_rev_out_ = df_closest(df_, df_pass2_exon1_rev, 'pass2_exon1', D='ref', t='last', id=True)

df_exon_fwd = pd.concat([df_pass1_exon1_fwd_out_, df_pass1_exon2_fwd_out_, df_pass1_exon2_downstream_fwd_out_, df_pass2_exon1_fwd_out_], axis=1)
df_exon_rev = pd.concat([df_pass1_exon1_rev_out_, df_pass1_exon2_rev_out_, df_pass1_exon2_downstream_rev_out_, df_pass2_exon1_rev_out_], axis=1)

df_exon_fwd.to_csv(mp('closest_exon_fwd.tsv'), header=True, index=False, sep='\t')
df_exon_rev.to_csv(mp('closest_exon_rev.tsv'), header=True, index=False, sep='\t')

#l_ = ['pass1_exon1_gene_id', 'pass1_exon1_locus_id', 'pass1_exon1_distance']
l_ = ['pass1_exon1_gene_id', 'pass1_exon1_distance']
write_gffbed(mp('closest_pass1_exon1_fwd.bed'),
    chrom = df_atac['chrom'],
    start = df_atac['start'],
    end = df_atac['end'],
    name = df_exon_fwd['pass1_exon1_Name'],
    strand = '+',
    attr = df_exon_fwd[l_], 
)
write_gffbed(mp('closest_pass1_exon1_rev.bed'),
    chrom = df_atac['chrom'],
    start = df_atac['start'],
    end = df_atac['end'],
    name = df_exon_rev['pass1_exon1_Name'],
    strand = '-',
    attr = df_exon_rev[l_], 
)

#l_ = ['pass1_exon2_gene_id', 'pass1_exon2_locus_id', 'pass1_exon2_distance']
l_ = ['pass1_exon2_gene_id', 'pass1_exon2_distance']
write_gffbed(mp('closest_pass1_exon2_fwd.bed'),
    chrom = df_atac['chrom'],
    start = df_atac['start'],
    end = df_atac['end'],
    name = df_exon_fwd['pass1_exon2_Name'],
    strand = '+',
    attr = df_exon_fwd[l_], 
)
write_gffbed(mp('closest_pass1_exon2_rev.bed'),
    chrom = df_atac['chrom'],
    start = df_atac['start'],
    end = df_atac['end'],
    name = df_exon_rev['pass1_exon2_Name'],
    strand = '-',
    attr = df_exon_rev[l_], 
)

#l_ = ['pass1_exon2_downstream_gene_id', 'pass1_exon2_downstream_locus_id', 'pass1_exon2_downstream_distance']
l_ = ['pass1_exon2_downstream_gene_id', 'pass1_exon2_downstream_distance']
write_gffbed(mp('closest_pass1_exon2_downstream_fwd.bed'),
    chrom = df_atac['chrom'],
    start = df_atac['start'],
    end = df_atac['end'],
    name = df_exon_fwd['pass1_exon2_downstream_Name'],
    strand = '+',
    attr = df_exon_fwd[l_], 
)
write_gffbed(mp('closest_pass1_exon2_downstream_rev.bed'),
    chrom = df_atac['chrom'],
    start = df_atac['start'],
    end = df_atac['end'],
    name = df_exon_rev['pass1_exon2_downstream_Name'],
    strand = '-',
    attr = df_exon_rev[l_], 
)

#l_ = ['pass2_exon1_gene_id', 'pass2_exon1_locus_id', 'pass2_exon1_distance']
l_ = ['pass2_exon1_gene_id', 'pass2_exon1_distance']
write_gffbed(mp('closest_pass2_exon1_fwd.bed'),
    chrom = df_atac['chrom'],
    start = df_atac['start'],
    end = df_atac['end'],
    name = df_exon_fwd['pass2_exon1_Name'],
    strand = '+',
    attr = df_exon_fwd[l_], 
)
write_gffbed(mp('closest_pass2_exon1_rev.bed'),
    chrom = df_atac['chrom'],
    start = df_atac['start'],
    end = df_atac['end'],
    name = df_exon_rev['pass2_exon1_Name'],
    strand = '-',
    attr = df_exon_rev[l_], 
)

In [ ]: