notebook.community

Edit and run



In [1]:

    
# Init, specify annotation target, load pre-computed lcap, scap, ... tables
%run ~/relmapping/annot/notebooks/annot__init__.ipynb

#annot_ = 'annot_ce10'
#annot_ = 'annot_mapq0_ce10'
#annot_ = 'annot_ce10_eLife_full'
annot_ = 'annot_ce10_tissues'
print('annotation target: %s' % (annot_,))

def mp(fp, annot_=annot_): return os.path.join(annot_, 'metrics_type', fp)

if annot_ == 'annot_ce10':
    l_cond = config['stages']
    df_atac = pd.read_csv(os.path.join(annot_, 'accessible_sites.tsv'), sep='\t')
    l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
    df_lcap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_lcap', 'lcap_all_fwd.tsv'), sep='\t', low_memory=False)
    df_lcap_rev = pd.read_csv(os.path.join(annot_, 'metrics_lcap', 'lcap_all_rev.tsv'), sep='\t', low_memory=False)
    df_exon_fwd = pd.read_csv(os.path.join(annot_, 'metrics_exon', 'closest_exon_fwd.tsv'), sep='\t', low_memory=False)
    df_exon_rev = pd.read_csv(os.path.join(annot_, 'metrics_exon', 'closest_exon_rev.tsv'), sep='\t', low_memory=False)
    df_maxgap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_maxgap', 'maxgap_fwd.tsv'), sep='\t')
    df_maxgap_rev = pd.read_csv(os.path.join(annot_, 'metrics_maxgap', 'maxgap_rev.tsv'), sep='\t')
    df_scap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_scap', 'scap_fwd.tsv'), sep='\t')
    df_scap_rev = pd.read_csv(os.path.join(annot_, 'metrics_scap', 'scap_rev.tsv'), sep='\t')

elif annot_ == 'annot_ce10_eLife_full':
    l_cond = config['stages']
    df_atac = pd.read_csv(os.path.join(annot_, 'accessible_sites.tsv'), sep='\t')
    l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
    df_lcap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_lcap', 'lcap_all_fwd.tsv'), sep='\t', low_memory=False)
    df_lcap_rev = pd.read_csv(os.path.join(annot_, 'metrics_lcap', 'lcap_all_rev.tsv'), sep='\t', low_memory=False)
    df_exon_fwd = pd.read_csv(os.path.join(annot_, 'metrics_exon', 'closest_exon_fwd.tsv'), sep='\t', low_memory=False)
    df_exon_rev = pd.read_csv(os.path.join(annot_, 'metrics_exon', 'closest_exon_rev.tsv'), sep='\t', low_memory=False)
    df_maxgap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_maxgap', 'maxgap_fwd.tsv'), sep='\t')
    df_maxgap_rev = pd.read_csv(os.path.join(annot_, 'metrics_maxgap', 'maxgap_rev.tsv'), sep='\t')
    df_scap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_scap', 'scap_fwd.tsv'), sep='\t')
    df_scap_rev = pd.read_csv(os.path.join(annot_, 'metrics_scap', 'scap_rev.tsv'), sep='\t')

elif annot_ == 'annot_ce10_tissues':
    l_cond = config['stages'] + config['lcap823_tissues']
    df_atac = pd.read_csv(os.path.join(annot_, 'accessible_sites.tsv'), sep='\t')
    l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
    df_lcap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_lcap', 'lcap_all_fwd.tsv'), sep='\t', low_memory=False)
    df_lcap_rev = pd.read_csv(os.path.join(annot_, 'metrics_lcap', 'lcap_all_rev.tsv'), sep='\t', low_memory=False)
    df_exon_fwd = pd.read_csv(os.path.join(annot_, 'metrics_exon', 'closest_exon_fwd.tsv'), sep='\t', low_memory=False)
    df_exon_rev = pd.read_csv(os.path.join(annot_, 'metrics_exon', 'closest_exon_rev.tsv'), sep='\t', low_memory=False)
    df_maxgap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_maxgap', 'maxgap_fwd.tsv'), sep='\t')
    df_maxgap_rev = pd.read_csv(os.path.join(annot_, 'metrics_maxgap', 'maxgap_rev.tsv'), sep='\t')
    df_scap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_scap', 'scap_fwd.tsv'), sep='\t')
    df_scap_rev = pd.read_csv(os.path.join(annot_, 'metrics_scap', 'scap_rev.tsv'), sep='\t')

elif annot_ == 'annot_mapq0_ce10':
    l_cond = config['stages']
    df_atac = pd.read_csv(os.path.join('annot_ce10', 'accessible_sites.tsv'), sep='\t')
    l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
    df_lcap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_lcap', 'lcap_all_fwd.tsv'), sep='\t', low_memory=False)
    df_lcap_rev = pd.read_csv(os.path.join(annot_, 'metrics_lcap', 'lcap_all_rev.tsv'), sep='\t', low_memory=False)
    df_exon_fwd = pd.read_csv(os.path.join('annot_ce10', 'metrics_exon', 'closest_exon_fwd.tsv'), sep='\t', low_memory=False)
    df_exon_rev = pd.read_csv(os.path.join('annot_ce10', 'metrics_exon', 'closest_exon_rev.tsv'), sep='\t', low_memory=False)
    df_maxgap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_maxgap', 'maxgap_fwd.tsv'), sep='\t')
    df_maxgap_rev = pd.read_csv(os.path.join(annot_, 'metrics_maxgap', 'maxgap_rev.tsv'), sep='\t')
    df_scap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_scap', 'scap_fwd.tsv'), sep='\t')
    df_scap_rev = pd.read_csv(os.path.join(annot_, 'metrics_scap', 'scap_rev.tsv'), sep='\t')

else:
    assert False

df_prom_fwd = df_atac[['chrom', 'start', 'end']].copy()
df_prom_rev = df_atac[['chrom', 'start', 'end']].copy()
df_prom_fwd['strand'] = '+'
df_prom_rev['strand'] = '-'









    



/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools






    



os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping
annotation target: annot_ce10_tissues



In [2]:

    
# df_pass1_gene_%strand: WS260_ce10 gene coordinates, excluding 250bp downstream of TSS
df_ = pd.concat([
    yp.read_wbgtf('WS260_ce10/WS260_ce10.genes.pseudogene.gtf.gz', parse_attr=True, coords_adj=True),
    yp.read_wbgtf('WS260_ce10/WS260_ce10.genes.protein_coding.gtf.gz', parse_attr=True, coords_adj=True),
], axis=0)

df_pass1_gene_fwd = df_.query('strand == "+"').reset_index(drop=True)[['chrom', 'start', 'end', 'gene_id']]
df_pass1_gene_rev = df_.query('strand == "-"').reset_index(drop=True)[['chrom', 'start', 'end', 'gene_id']]

print(len(df_pass1_gene_fwd), len(df_pass1_gene_rev), 'fwd/rev genes')

# do not mask 250bp downstream of TSS
df_pass1_gene_fwd['start'] = df_pass1_gene_fwd['start'] + 250
df_pass1_gene_rev['end'] = df_pass1_gene_rev['end'] - 250

df_pass1_gene_fwd = df_pass1_gene_fwd.query('start < end')
df_pass1_gene_rev = df_pass1_gene_rev.query('start < end')
print(len(df_pass1_gene_fwd), len(df_pass1_gene_rev), 'fwd/rev genes longer than 250bp kept')









    



11073 10928 fwd/rev genes
10829 10668 fwd/rev genes longer than 250bp kept



In [3]:

    
# df_pass1_gpos_%strand[gene_chrom/start/end]: closest gene; if overlaps multiple genes, choose one with the more distal TSS
def df_pos():
    l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
    df_ = pd.concat([df_atac['chrom'], l_atac_peak_pos, l_atac_peak_pos + 1], axis=1).copy()
    df_.columns = ['chrom', 'start', 'end']
    return df_

def df_closest_(df_a, df_b, b_prefix, **kwargs):
    df_a_sort = df_a
    df_b_sort = df_b.sort_values(list(df_b.columns[:3]))
    fn_ = BedTool.from_dataframe(df_a).closest(BedTool.from_dataframe(df_b_sort).fn, **kwargs).fn
    names_ = list(itertools.chain(df_a.columns.values,
        ['%s_%s' % (b_prefix, col) for col in df_b.columns.values],
    ))
    df_ = pd.read_csv(fn_, sep='\t', names=names_)
    return df_#[names_[len(df_a.columns):]]

df_pass1_gpos_fwd_ = df_closest_(df_pos(), df_pass1_gene_fwd, 'pass1_gpos', t='all')
df_pass1_gpos_rev_ = df_closest_(df_pos(), df_pass1_gene_rev, 'pass1_gpos', t='all')
print(len(df_pass1_gpos_fwd_), len(df_pass1_gpos_rev_), 'fwd/rev gene body hits')









    



42895 42938 fwd/rev gene body hits



In [4]:

    
# df_pass1_gpos_%strand[is_nearly_intergenic]: site is outside of gene bodies, or up to 250bp from the TSS
df_pass1_gpos_fwd_['is_nearly_intergenic'] = ~((df_pass1_gpos_fwd_['pass1_gpos_start'] <= df_pass1_gpos_fwd_['start'])\
                                           & (df_pass1_gpos_fwd_['end'] <= df_pass1_gpos_fwd_['pass1_gpos_end']))
df_pass1_gpos_rev_['is_nearly_intergenic'] = ~((df_pass1_gpos_rev_['pass1_gpos_start'] <= df_pass1_gpos_rev_['start'])\
                                           & (df_pass1_gpos_rev_['end'] <= df_pass1_gpos_rev_['pass1_gpos_end']))

l_ = 'is_nearly_intergenic'
d_ = {'is_nearly_intergenic': np.any}
df_exon_fwd[l_] = df_pass1_gpos_fwd_.groupby(['chrom', 'start', 'end']).agg(d_).reset_index()[l_]
df_exon_rev[l_] = df_pass1_gpos_rev_.groupby(['chrom', 'start', 'end']).agg(d_).reset_index()[l_]
print(len(df_exon_fwd), len(df_exon_rev))
print(len(df_exon_fwd.query('is_nearly_intergenic')))
print(len(df_exon_rev.query('is_nearly_intergenic')))

# Visualise maxgap as .bed-files
def itemRgb_(is_nearly_intergenic_):
    if is_nearly_intergenic_:
        return yp.ORANGE
    else:
        return yp.BLUE

write_gffbed(mp('is_nearly_intergenic_fwd.bed'),
    chrom = df_atac['chrom'],
    start = df_atac['start'],
    end = df_atac['end'],
    #name = df_maxgap_fwd.min(axis=1),
    strand = '+',
    #attr = df_pass1_gpos_fwd_,
    itemRgb = map(itemRgb_, df_exon_fwd['is_nearly_intergenic']),
)

write_gffbed(mp('is_nearly_intergenic_rev.bed'),
    chrom = df_atac['chrom'],
    start = df_atac['start'],
    end = df_atac['end'],
    #name = df_maxgap_rev.min(axis=1),
    strand = '-',
    #attr = df_maxgap_rev,
    itemRgb = map(itemRgb_, df_exon_rev['is_nearly_intergenic']),
)









    



42874 42874
34002
34272



In [5]:

    
# df_prom_%strand[annot_detailed_%stage]: stage-specific detailed annotation, excluding lowconf
d_detailed_annot = collections.OrderedDict([
    ('coding_promoter_jump_scap', 'coding_promoter'),
    ('coding_promoter_incr_scap', 'coding_promoter'),
    ('coding_promoter_jump_incr', 'coding_promoter'),
    ('coding_promoter_lowconf', 'coding_promoter'),
    ('pseudogene_promoter_jump_scap', 'pseudogene_promoter'),
    ('pseudogene_promoter_incr_scap', 'pseudogene_promoter'),
    ('pseudogene_promoter_jump_incr', 'pseudogene_promoter'),
    ('pseudogene_promoter_lowconf', 'pseudogene_promoter'),
    ('non-coding_RNA', 'non-coding_RNA'),
    ('unknown_promoter_jump_scap', 'unknown_promoter'),
    #('unknown_promoter_jump_incr', 'unknown_promoter'),
    ('transcription_initiation', 'transcription_initiation'),
    ('no_transcription', 'no_transcription'),
])

d_annot_strand_legend = collections.OrderedDict([
    ('coding_promoter', yp.RED),
    ('pseudogene_promoter', yp.ORANGE),
    ('non-coding_RNA', yp.BLACK),
    ('unknown_promoter', yp.YELLOW),
    ('transcription_initiation', yp.GREEN),
    ('no_transcription', yp.BLUE),
])

d_annot_legend = collections.OrderedDict([
    ('coding_promoter', yp.RED),
    ('pseudogene_promoter', yp.ORANGE),
    ('non-coding_RNA', yp.BLACK),
    ('unknown_promoter', yp.YELLOW),
    ('putative_enhancer', yp.GREEN),
    ('other_element', yp.BLUE),
])

def annot_detailed_(chrom, start, end, jump, incr, maxgap, exon1_dist, exon2_dist, pass2_exon1_dist, 
                    scap_pass, scap_diff, gene_biotype, is_exon1_first, is_nearly_intergenic):
    """
    #is_gene_biotype = (gene_biotype == 'protein_coding') | (gene_biotype == 'pseudogene')
    #v = False
    #if chrom == "chrI" and start == 183421 and end == 183568: v = True
    #if (chrom == "chrX") and (start == 1125552) and (end == 1125703):
    #    v = True
    #    print(jump, maxgap <= 0, scap_diff >= 0, has_scap)
    #if start == 11829601-1: print(incr, jump, has_scap, n_txn_evidence)
    """

    near_exon1 = abs(exon1_dist) < 250
    near_exon2 = abs(exon2_dist) < 250
    near_pass2_exon1 = abs(pass2_exon1_dist) < 250
    #nearly_upstream_of_tss = exon1_dist > -250

    # We annotated accessible sites as coding_promoter or pseudogene_promoter if they fulfilled the following
    # four criteria (p1 to p4).
    # 1) The accessible site had transcription initiation, and passed at least one of the elongation tests 
    #    (jump or incr), or passed both elongation tests (jump and incr).
    p1 = (int(scap_pass) + int(jump) + int(incr)) >= 2
    # 2) Transcription initiation mode at the accessible site was either upstream of the closest first exon, 
    #    or, in the presence of a UTR, up to 250bp downstream within the UTR.
    #p2 = nearly_upstream_of_tss and (scap_diff >= 0)
    p2 = (scap_diff >= 0)
    # 3) The region from peak accessibility to the closest first exon did not contain the 5' end of a non-first exon.
    p3 = is_exon1_first
    # 4) Distal sites (peak accessibility >250bp from the closest first exon) were additionally required to 
    # (a) have continuous long cap coverage from 250bp downstream of peak accessibility to the closest first exon, and 
    # (b) be further than 250bp away from any non-first exon.
    p4 = near_exon1 or ((maxgap <= 0) and not near_exon2)
    if p1 and p2 and p3 and p4:
        #if v: print('coding_promoter')
        if scap_pass and jump:
            return 'coding_promoter_jump_scap' if (gene_biotype == 'protein_coding') else 'pseudogene_promoter_jump_scap'
        elif scap_pass and incr:
            return 'coding_promoter_incr_scap' if (gene_biotype == 'protein_coding') else 'pseudogene_promoter_incr_scap'
        elif jump and incr:
            return 'coding_promoter_jump_incr' if (gene_biotype == 'protein_coding') else 'pseudogene_promoter_jump_incr'
        else:
            assert False

    # Sites within 250bp of the 5' end of an annotated tRNA, snoRNA, miRNA, snRNA or rRNA were annotated as 
    # non-coding_RNA.
    elif near_pass2_exon1:
        #if v: print('non-coding_RNA')
        return 'non-coding_RNA'

    # Next, intergenic sites more than 250bp away from annotated exons that had initiating transcription, 
    # and passed the jump test were annotated as unknown_promoter.
    elif is_nearly_intergenic and not(near_exon1 or near_exon2) and scap_pass and jump:
        return 'unknown_promoter_jump_scap'

    # All remaining sites were annotated as transcription_initiation or no_transcription based on 
    # whether they had transcription initiation.
    elif scap_pass:
        #if v: print('transcription_initiation')
        return 'transcription_initiation'
    else:
        #if v: print('no_transcription')
        return 'no_transcription'

for stage in itertools.islice(l_cond, None):
    print(stage, 'fwd')
    df_prom_fwd['annot_detailed_%s' % (stage,)] = [*map(annot_detailed_,
        # chrom, start, end
        df_atac['chrom'], df_atac['start'], df_atac['end'],
        # jump, incr
        df_lcap_fwd['lcap_%s_fwd_passed_jump' % (stage,)], df_lcap_fwd['lcap_%s_fwd_passed_incr' % (stage,)],
        # maxgap
        df_maxgap_fwd['maxgap_%s_fwd' % (stage,)],
        # exon1_dist
        df_exon_fwd['pass1_exon1_start'] - l_atac_peak_pos,
        # exon2_dist
        df_exon_fwd['pass1_exon2_start'] - l_atac_peak_pos,
        # pass2_exon2_dist
        df_exon_fwd['pass2_exon1_start'] - l_atac_peak_pos,
        # scap_pass
        df_scap_fwd['scap_pass'],
        # scap_diff
        pd.to_numeric(df_exon_fwd['pass1_exon1_init_cutoff_pos'], errors='coerce') - df_scap_fwd['scap_mode'],
        # gene_biotype
        df_exon_fwd['pass1_exon1_gene_biotype'],
        # is_exon1_first
        df_exon_fwd['pass1_exon1_distance'] <= df_exon_fwd['pass1_exon2_downstream_distance'],
        # is_nearly_intergenic
        df_exon_fwd['is_nearly_intergenic'],
    )]
    print(stage, 'rev')
    df_prom_rev['annot_detailed_%s' % (stage,)] = [*map(annot_detailed_,
        # chrom, start, end
        df_atac['chrom'], df_atac['start'], df_atac['end'],
        # jump, incr
        df_lcap_rev['lcap_%s_rev_passed_jump' % (stage,)], df_lcap_rev['lcap_%s_rev_passed_incr' % (stage,)],
        # maxgap
        df_maxgap_rev['maxgap_%s_rev' % (stage,)],
        # exon1_dist
        l_atac_peak_pos - (df_exon_rev['pass1_exon1_end'] - 1),
        # exon2_dist
        l_atac_peak_pos - (df_exon_rev['pass1_exon2_end'] - 1),
        # pass2_exon2_dist
        l_atac_peak_pos - (df_exon_rev['pass2_exon1_end'] - 1),
        # scap_pass
        df_scap_rev['scap_pass'],
        # scap_diff
        df_scap_rev['scap_mode'] - pd.to_numeric(df_exon_rev['pass1_exon1_init_cutoff_pos'], errors='coerce'),
        # gene_biotype
        df_exon_rev['pass1_exon1_gene_biotype'],
        # is_exon1_first
        df_exon_rev['pass1_exon2_downstream_distance'] <= df_exon_rev['pass1_exon1_distance'],
        # is_nearly_intergenic
        df_exon_rev['is_nearly_intergenic'],
    )]









    



wt_emb fwd
wt_emb rev
wt_l1 fwd
wt_l1 rev
wt_l2 fwd
wt_l2 rev
wt_l3 fwd
wt_l3 rev
wt_l4 fwd
wt_l4 rev
wt_ya fwd
wt_ya rev
glp1_d1 fwd
glp1_d1 rev
glp1_d2 fwd
glp1_d2 rev
glp1_d6 fwd
glp1_d6 rev
glp1_d9 fwd
glp1_d9 rev
glp1_d13 fwd
glp1_d13 rev
neurons fwd
neurons rev
gonad fwd
gonad rev
muscle fwd
muscle rev
intest fwd
intest rev



In [6]:

    
# df_prom_%strand[annot_detailed_all]: stage-aggregated detailed annotation, excluding lowconf
def annot_detailed_summary_(l_annot):
    l_ranking = list(d_detailed_annot.keys())
    l_annot_i = [l_ranking.index(annot) for annot in l_annot]
    return l_ranking[min(l_annot_i)]

df_prom_fwd['annot_detailed_summary'] = list(map(lambda ir: annot_detailed_summary_(ir[1].tolist()),
    df_prom_fwd[['annot_detailed_%(stage)s' % locals() for stage in l_cond]].iterrows()))

df_prom_rev['annot_detailed_summary'] = list(map(lambda ir: annot_detailed_summary_(ir[1].tolist()),
    df_prom_rev[['annot_detailed_%(stage)s' % locals() for stage in l_cond]].iterrows()))



In [7]:

    
# Forward strand annotation summary table (excl. lowconf)
df_fwd_ = pd.concat([df_prom_fwd['annot_detailed_%(stage)s' % locals()].value_counts() for stage in l_cond + ['summary']], axis=1).loc[d_detailed_annot.keys()]
df_fwd_.columns = l_cond + ['summary']
#df_fwd_.to_latex('%(dm_step)s_fig/prom_summary_fwd.latex' % locals())
df_fwd_.fillna(0, downcast='infer')









    Out[7]:







  
    
      
      wt_emb
      wt_l1
      wt_l2
      wt_l3
      wt_l4
      wt_ya
      glp1_d1
      glp1_d2
      glp1_d6
      glp1_d9
      glp1_d13
      neurons
      gonad
      muscle
      intest
      summary
    
  
  
    
      coding_promoter_jump_scap
      3241
      3627
      4271
      3513
      3610
      3173
      2474
      2117
      2470
      2421
      2453
      3125
      3068
      2620
      3191
      6454
    
    
      coding_promoter_incr_scap
      389
      282
      282
      440
      340
      454
      408
      320
      483
      309
      522
      405
      305
      240
      365
      810
    
    
      coding_promoter_jump_incr
      53
      73
      100
      74
      102
      104
      74
      97
      71
      94
      78
      102
      132
      104
      98
      461
    
    
      coding_promoter_lowconf
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      pseudogene_promoter_jump_scap
      50
      44
      45
      33
      41
      49
      24
      13
      25
      19
      21
      25
      36
      19
      35
      98
    
    
      pseudogene_promoter_incr_scap
      11
      8
      9
      13
      9
      4
      9
      4
      9
      10
      11
      13
      10
      7
      7
      27
    
    
      pseudogene_promoter_jump_incr
      8
      8
      11
      7
      11
      8
      6
      1
      5
      4
      2
      7
      7
      3
      8
      26
    
    
      pseudogene_promoter_lowconf
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      non-coding_RNA
      501
      495
      490
      496
      499
      496
      502
      504
      499
      497
      495
      496
      502
      503
      502
      470
    
    
      unknown_promoter_jump_scap
      469
      604
      834
      406
      328
      263
      188
      151
      215
      285
      363
      241
      223
      229
      282
      1722
    
    
      transcription_initiation
      24688
      24288
      23414
      24447
      24519
      24906
      25741
      26238
      25645
      25805
      25482
      25042
      25202
      25728
      24966
      19757
    
    
      no_transcription
      13464
      13445
      13418
      13445
      13415
      13417
      13448
      13429
      13452
      13430
      13447
      13418
      13389
      13421
      13420
      13049



In [8]:

    
# Reverse strand annotation summary table (excl. lowconf)
df_rev_ = pd.concat([df_prom_rev['annot_detailed_%(stage)s' % locals()].value_counts() for stage in l_cond + ['summary']], axis=1).loc[d_detailed_annot.keys()]
df_rev_.columns = l_cond + ['summary']
#df_rev_.to_latex('%(dm_step)s_fig/prom_summary_rev.latex' % locals())
df_rev_.fillna(0, downcast='infer')









    Out[8]:







  
    
      
      wt_emb
      wt_l1
      wt_l2
      wt_l3
      wt_l4
      wt_ya
      glp1_d1
      glp1_d2
      glp1_d6
      glp1_d9
      glp1_d13
      neurons
      gonad
      muscle
      intest
      summary
    
  
  
    
      coding_promoter_jump_scap
      3301
      3624
      4256
      3718
      3590
      3213
      2477
      2155
      2466
      2373
      2559
      3071
      3012
      2748
      3178
      6416
    
    
      coding_promoter_incr_scap
      342
      286
      295
      365
      362
      424
      382
      349
      474
      366
      460
      403
      340
      220
      393
      770
    
    
      coding_promoter_jump_incr
      72
      76
      91
      96
      114
      115
      85
      89
      80
      94
      96
      103
      139
      116
      104
      496
    
    
      coding_promoter_lowconf
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      pseudogene_promoter_jump_scap
      55
      54
      62
      52
      48
      46
      22
      11
      27
      23
      26
      36
      52
      18
      48
      113
    
    
      pseudogene_promoter_incr_scap
      13
      11
      6
      11
      10
      15
      6
      9
      10
      6
      7
      11
      11
      4
      7
      19
    
    
      pseudogene_promoter_jump_incr
      7
      8
      11
      8
      11
      11
      5
      3
      5
      6
      5
      8
      9
      5
      11
      33
    
    
      pseudogene_promoter_lowconf
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      non-coding_RNA
      503
      503
      499
      503
      505
      500
      510
      512
      510
      504
      507
      508
      505
      507
      503
      481
    
    
      unknown_promoter_jump_scap
      444
      618
      868
      483
      355
      257
      180
      171
      209
      268
      378
      215
      229
      254
      298
      1794
    
    
      transcription_initiation
      24588
      24151
      23259
      24114
      24377
      24789
      25672
      26042
      25553
      25705
      25311
      25004
      25099
      25498
      24820
      19647
    
    
      no_transcription
      13549
      13543
      13527
      13524
      13502
      13504
      13535
      13533
      13540
      13529
      13525
      13515
      13478
      13504
      13512
      13105



In [9]:

    
# df_lowconf_%strand[annot_lowconf_raw_%stage] <- candidate "raw" low-confidence annotation (with aggregating)
def lowconf_raw_stage(chrom, start, end, jump, incr, log2FoldChange, maxgap, exon1_dist, exon2_dist,
                      pass2_exon1_dist, scap_pass, scap_diff, gene_biotype, is_exon1_first, is_nearly_intergenic):
    near_exon1 = abs(exon1_dist) < 250
    near_exon2 = abs(exon2_dist) < 250
    near_pass2_exon1 = abs(pass2_exon1_dist) < 250
    nearly_upstream_of_tss = exon1_dist > -250
    has_txn_evidence = (log2FoldChange > 1) #incr or (log2FoldChange > 1 and (has_scap))

    v = False
    #if chrom == "chrI" and start == 183421 and end == 183568: v = True
    if (near_exon1 or not(near_exon2)) and nearly_upstream_of_tss and is_exon1_first and\
        is_nearly_intergenic and (not (maxgap > 0)) and has_txn_evidence and (scap_diff >= 0): #-50
        #if pass1_exon1_locus_id == "F59F4.3":
        #    print(chrom, start, end)
        return 'coding_promoter_lowconf' if (gene_biotype == 'protein_coding') else 'pseudogene_promoter_lowconf'
    else:
        return 'no_lowconf_annotation'

df_lowconf_fwd = df_atac[['chrom', 'start', 'end']].copy()
df_lowconf_rev = df_atac[['chrom', 'start', 'end']].copy()
df_lowconf_fwd['strand'] = '+'
df_lowconf_rev['strand'] = '-'

for stage in itertools.islice(l_cond, None):
    print(stage, 'fwd')
    df_lowconf_fwd['annot_lowconf_raw_%s' % (stage,)] = [*map(lowconf_raw_stage,
        #chrom, start, end
        df_atac['chrom'], df_atac['start'], df_atac['end'],
        # jump, incr
        df_lcap_fwd['lcap_%s_fwd_passed_jump' % (stage,)], df_lcap_fwd['lcap_%s_fwd_passed_incr' % (stage,)],
        # log2FoldChange
        df_lcap_fwd['lcap_%s_fwd_log2FoldChange' % (stage,)],
        # maxgap
        df_maxgap_fwd['maxgap_%s_fwd' % (stage,)],
        # exon1_dist
        df_exon_fwd['pass1_exon1_start'] - l_atac_peak_pos,
        # exon2_dist
        df_exon_fwd['pass1_exon2_start'] - l_atac_peak_pos,
        # pass2_exon1_dist
        df_exon_fwd['pass2_exon1_start'] - l_atac_peak_pos,
        # scap_pass
        df_scap_fwd['scap_pass'],
        # scap_diff
        pd.to_numeric(df_exon_fwd['pass1_exon1_init_cutoff_pos'], errors='coerce') - df_scap_fwd['scap_mode'],
        # gene_biotype
        df_exon_fwd['pass1_exon1_gene_biotype'],
        # is_exon1_first
        df_exon_fwd['pass1_exon1_distance'] <= df_exon_fwd['pass1_exon2_downstream_distance'],
        # is_nearly_intergenic
        df_exon_fwd['is_nearly_intergenic'],
    )]

    print(stage, 'rev')
    df_lowconf_rev['annot_lowconf_raw_%s' % (stage,)] = [*map(lowconf_raw_stage,
        # chrom, start, end
        df_atac['chrom'], df_atac['start'], df_atac['end'],
        # jump, incr
        df_lcap_rev['lcap_%s_rev_passed_jump' % (stage,)], df_lcap_rev['lcap_%s_rev_passed_incr' % (stage,)],
        # log2FoldChange
        df_lcap_rev['lcap_%s_rev_log2FoldChange' % (stage,)],
        # maxgap
        df_maxgap_rev['maxgap_%s_rev' % (stage,)],
        # exon1_dist
        l_atac_peak_pos - (df_exon_rev['pass1_exon1_end'] - 1),
        # exon2_dist
        l_atac_peak_pos - (df_exon_rev['pass1_exon2_end'] - 1),
        # pass2_exon1_dist
        l_atac_peak_pos - (df_exon_rev['pass2_exon1_end'] - 1),
        # scap_pass
        df_scap_fwd['scap_pass'],
        # scap_diff
        df_scap_rev['scap_mode'] - pd.to_numeric(df_exon_rev['pass1_exon1_init_cutoff_pos'], errors='coerce'),
        # gene_biotype
        df_exon_rev['pass1_exon1_gene_biotype'],
        # is_exon1_first
        df_exon_rev['pass1_exon2_downstream_distance'] <= df_exon_rev['pass1_exon1_distance'],
        #is_nearly_intergenic
        df_exon_rev['is_nearly_intergenic'],
    )]









    



wt_emb fwd
wt_emb rev
wt_l1 fwd
wt_l1 rev
wt_l2 fwd
wt_l2 rev
wt_l3 fwd
wt_l3 rev
wt_l4 fwd
wt_l4 rev
wt_ya fwd
wt_ya rev
glp1_d1 fwd
glp1_d1 rev
glp1_d2 fwd
glp1_d2 rev
glp1_d6 fwd
glp1_d6 rev
glp1_d9 fwd
glp1_d9 rev
glp1_d13 fwd
glp1_d13 rev
neurons fwd
neurons rev
gonad fwd
gonad rev
muscle fwd
muscle rev
intest fwd
intest rev



In [10]:

    
# df_lowconf_%strand[annot_lowconf_raw_summary] <- candidate "raw" low-confidence annotation across stages
d_annot_lowconf_raw_legend = collections.OrderedDict([
    ('coding_promoter_lowconf', yp.RED),
    ('pseudogene_promoter_lowconf', yp.ORANGE),
    ('no_lowconf_annotation', yp.BLACK),
])

def annot_lowconf_raw_summary_(l_annot):
    l_ranking = list(d_annot_lowconf_raw_legend.keys())
    l_annot_i = [l_ranking.index(annot) for annot in l_annot]
    return l_ranking[min(l_annot_i)]

df_lowconf_fwd['annot_lowconf_raw_summary'] = list(map(lambda ir: annot_lowconf_raw_summary_(ir[1].tolist()),
    df_lowconf_fwd[['annot_lowconf_raw_%(stage)s' % locals() for stage in l_cond]].iterrows()))
df_lowconf_rev['annot_lowconf_raw_summary'] = list(map(lambda ir: annot_lowconf_raw_summary_(ir[1].tolist()),
    df_lowconf_rev[['annot_lowconf_raw_%(stage)s' % locals() for stage in l_cond]].iterrows()))



In [11]:

    
# df_lowconf_%strand[lcap_log2FoldChange_max] <- max log2FoldChange over all candidates (lowconf_raw) by gene_id

# copy fields over in order to group by
df_lowconf_fwd['pass1_exon1_gene_id'] = df_exon_fwd['pass1_exon1_gene_id'] 
df_lowconf_rev['pass1_exon1_gene_id'] = df_exon_rev['pass1_exon1_gene_id']

q_ = "(annot_lowconf_raw_summary == 'coding_promoter_lowconf') | (annot_lowconf_raw_summary == 'pseudogene_promoter_lowconf')"
df_lowconf_fwd['lcap_log2FoldChange'] = df_lcap_fwd[['lcap_%s_fwd_log2FoldChange' % (s_,) for s_ in l_cond]].max(axis=1)
df_lowconf_rev['lcap_log2FoldChange'] = df_lcap_rev[['lcap_%s_rev_log2FoldChange' % (s_,) for s_ in l_cond]].max(axis=1)

df_lowconf_fwd['lcap_log2FoldChange_max'] = df_lowconf_fwd.query(q_)\
    .groupby('pass1_exon1_gene_id')['lcap_log2FoldChange'].transform(np.nanmax)
df_lowconf_rev['lcap_log2FoldChange_max'] = df_lowconf_rev.query(q_)\
    .groupby('pass1_exon1_gene_id')['lcap_log2FoldChange'].transform(np.nanmax)



In [12]:

    
# df_lowconf_%strand[annot_lowconf_summary] <- low-confidence annotations candidates (=only for "promoterless genes")
df_lowconf_fwd['annot_detailed_summary'] = df_prom_fwd['annot_detailed_summary']
df_lowconf_rev['annot_detailed_summary'] = df_prom_rev['annot_detailed_summary']

# list of gene_id without jump/incr promoters, but at least one candidate lowconf promoter
l1_ = ['coding_promoter_jump_scap', 'coding_promoter_incr_scap', 'coding_promoter_jump_incr',
    'pseudogene_promoter_jump_scap', 'pseudogene_promoter_incr_scap', 'pseudogene_promoter_jump_incr',
]
l2_ = ['coding_promoter_lowconf', 'pseudogene_promoter_lowconf']
l_prom_lowconf_fwd_gene_id = df_lowconf_fwd.groupby('pass1_exon1_gene_id').agg({
        'annot_detailed_summary': lambda a: len([a_i for a_i in a if a_i in l1_]),
        'annot_lowconf_raw_summary': lambda a: len([a_i for a_i in a if a_i in l2_]),
}).query('(annot_detailed_summary == 0) & (annot_lowconf_raw_summary > 0)').index.tolist()

l_prom_lowconf_rev_gene_id = df_lowconf_rev.groupby('pass1_exon1_gene_id').agg({
        'annot_detailed_summary': lambda a: len([a_i for a_i in a if a_i in l1_]),
        'annot_lowconf_raw_summary': lambda a: len([a_i for a_i in a if a_i in l2_]),
}).query('(annot_detailed_summary == 0) & (annot_lowconf_raw_summary > 0)').index.tolist()

l3_ = ["coding_promoter_lowconf", "pseudogene_promoter_lowconf"]
def annot_lowconf_fwd_summary(ir):
    (i, r) = (ir[0], ir[1])
    if (r.annot_lowconf_raw_summary in l3_) and (r.pass1_exon1_gene_id in l_prom_lowconf_fwd_gene_id) and (r.lcap_log2FoldChange_max == r.lcap_log2FoldChange):
        return r.annot_lowconf_raw_summary
    else:
        return '.'

def annot_lowconf_rev_summary(ir):
    (i, r) = (ir[0], ir[1])
    if (r.annot_lowconf_raw_summary in l3_) and (r.pass1_exon1_gene_id in l_prom_lowconf_rev_gene_id) and (r.lcap_log2FoldChange_max == r.lcap_log2FoldChange):
        return r.annot_lowconf_raw_summary
    else:
        return '.'

df_lowconf_fwd['annot_lowconf_summary'] = list(map(annot_lowconf_fwd_summary, df_lowconf_fwd.iterrows()))
df_lowconf_rev['annot_lowconf_summary'] = list(map(annot_lowconf_rev_summary, df_lowconf_rev.iterrows()))
print('Fwd strand:')
print(df_lowconf_fwd['annot_lowconf_summary'].value_counts())
print('Rev strand:')
print(df_lowconf_rev['annot_lowconf_summary'].value_counts())









    



Fwd strand:
.                              42213
coding_promoter_lowconf          614
pseudogene_promoter_lowconf       47
Name: annot_lowconf_summary, dtype: int64
Rev strand:
.                              42260
coding_promoter_lowconf          559
pseudogene_promoter_lowconf       55
Name: annot_lowconf_summary, dtype: int64



In [13]:

    
# df_lowconf_%strand[annot_lowconf_summary] => .bed-files of the low-confidence annotation
d_ = collections.OrderedDict([
    ('coding_promoter_lowconf', yp.RED),
    ('pseudogene_promoter_lowconf', yp.ORANGE),
    ('.', yp.BLACK),
])

write_gffbed(mp('regulatory_annotation_lowconf_fwd.bed'),
        chrom = df_atac['chrom'],
        start = df_atac['start'],
        end = df_atac['end'],
        name = df_exon_fwd['pass1_exon1_locus_id'],
        strand = '+',
        itemRgb = list(map(lambda lowconf: d_[lowconf], df_lowconf_fwd['annot_lowconf_summary'])),
        attr = df_lowconf_fwd['annot_lowconf_summary'],
)
write_gffbed(mp('regulatory_annotation_lowconf_rev.bed'),
        chrom = df_atac['chrom'],
        start = df_atac['start'],
        end = df_atac['end'],
        name = df_exon_rev['pass1_exon1_locus_id'],
        strand = '-',
        itemRgb = list(map(lambda lowconf: d_[lowconf], df_lowconf_rev['annot_lowconf_summary'])),
        attr = df_lowconf_rev['annot_lowconf_summary'],
)



In [14]:

    
# df_prom_%strand[annot_detailed_all] <- update with low-confidence annotations
m_fwd_ = df_lowconf_fwd['annot_lowconf_summary'] != "."
m_rev_ = df_lowconf_rev['annot_lowconf_summary'] != "."
df_prom_fwd.loc[m_fwd_, 'annot_detailed_summary'] = df_lowconf_fwd.loc[m_fwd_, 'annot_lowconf_summary']
df_prom_rev.loc[m_rev_, 'annot_detailed_summary'] = df_lowconf_rev.loc[m_rev_, 'annot_lowconf_summary']
print('%d fwd strand lowconf (pseudo)gene promoters added' % (sum(m_fwd_),))
print('%d rev strand lowconf (pseudo)gene promoters added' % (sum(m_rev_),))









    



661 fwd strand lowconf (pseudo)gene promoters added
614 rev strand lowconf (pseudo)gene promoters added



In [15]:

    
# Forward strand annotation summary table (incl. lowconf across all stages but not the origin stage)
df_fwd_ = pd.concat([df_prom_fwd['annot_detailed_%(stage)s' % locals()].value_counts() for stage in l_cond + ['summary']], axis=1).loc[d_detailed_annot.keys()]
df_fwd_.columns = l_cond + ['summary']
#df_fwd_.to_latex('%(dm_step)s_fig/prom_summary_fwd.latex' % locals())
df_fwd_.fillna(0, downcast='infer')









    Out[15]:







  
    
      
      wt_emb
      wt_l1
      wt_l2
      wt_l3
      wt_l4
      wt_ya
      glp1_d1
      glp1_d2
      glp1_d6
      glp1_d9
      glp1_d13
      neurons
      gonad
      muscle
      intest
      summary
    
  
  
    
      coding_promoter_jump_scap
      3241
      3627
      4271
      3513
      3610
      3173
      2474
      2117
      2470
      2421
      2453
      3125
      3068
      2620
      3191
      6454
    
    
      coding_promoter_incr_scap
      389
      282
      282
      440
      340
      454
      408
      320
      483
      309
      522
      405
      305
      240
      365
      810
    
    
      coding_promoter_jump_incr
      53
      73
      100
      74
      102
      104
      74
      97
      71
      94
      78
      102
      132
      104
      98
      461
    
    
      coding_promoter_lowconf
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      614
    
    
      pseudogene_promoter_jump_scap
      50
      44
      45
      33
      41
      49
      24
      13
      25
      19
      21
      25
      36
      19
      35
      98
    
    
      pseudogene_promoter_incr_scap
      11
      8
      9
      13
      9
      4
      9
      4
      9
      10
      11
      13
      10
      7
      7
      27
    
    
      pseudogene_promoter_jump_incr
      8
      8
      11
      7
      11
      8
      6
      1
      5
      4
      2
      7
      7
      3
      8
      26
    
    
      pseudogene_promoter_lowconf
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      47
    
    
      non-coding_RNA
      501
      495
      490
      496
      499
      496
      502
      504
      499
      497
      495
      496
      502
      503
      502
      462
    
    
      unknown_promoter_jump_scap
      469
      604
      834
      406
      328
      263
      188
      151
      215
      285
      363
      241
      223
      229
      282
      1704
    
    
      transcription_initiation
      24688
      24288
      23414
      24447
      24519
      24906
      25741
      26238
      25645
      25805
      25482
      25042
      25202
      25728
      24966
      19449
    
    
      no_transcription
      13464
      13445
      13418
      13445
      13415
      13417
      13448
      13429
      13452
      13430
      13447
      13418
      13389
      13421
      13420
      12722



In [16]:

    
# Reverse strand annotation summary table (incl. lowconf across all stages but not the origin stage)
df_rev_ = pd.concat([df_prom_rev['annot_detailed_%(stage)s' % locals()].value_counts() for stage in l_cond + ['summary']], axis=1).loc[d_detailed_annot.keys()]
df_rev_.columns = l_cond + ['summary']
#df_rev_.to_latex('%(dm_step)s_fig/prom_summary_rev.latex' % locals())
df_rev_.fillna(0, downcast='infer')









    Out[16]:







  
    
      
      wt_emb
      wt_l1
      wt_l2
      wt_l3
      wt_l4
      wt_ya
      glp1_d1
      glp1_d2
      glp1_d6
      glp1_d9
      glp1_d13
      neurons
      gonad
      muscle
      intest
      summary
    
  
  
    
      coding_promoter_jump_scap
      3301
      3624
      4256
      3718
      3590
      3213
      2477
      2155
      2466
      2373
      2559
      3071
      3012
      2748
      3178
      6416
    
    
      coding_promoter_incr_scap
      342
      286
      295
      365
      362
      424
      382
      349
      474
      366
      460
      403
      340
      220
      393
      770
    
    
      coding_promoter_jump_incr
      72
      76
      91
      96
      114
      115
      85
      89
      80
      94
      96
      103
      139
      116
      104
      496
    
    
      coding_promoter_lowconf
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      559
    
    
      pseudogene_promoter_jump_scap
      55
      54
      62
      52
      48
      46
      22
      11
      27
      23
      26
      36
      52
      18
      48
      113
    
    
      pseudogene_promoter_incr_scap
      13
      11
      6
      11
      10
      15
      6
      9
      10
      6
      7
      11
      11
      4
      7
      19
    
    
      pseudogene_promoter_jump_incr
      7
      8
      11
      8
      11
      11
      5
      3
      5
      6
      5
      8
      9
      5
      11
      33
    
    
      pseudogene_promoter_lowconf
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      55
    
    
      non-coding_RNA
      503
      503
      499
      503
      505
      500
      510
      512
      510
      504
      507
      508
      505
      507
      503
      475
    
    
      unknown_promoter_jump_scap
      444
      618
      868
      483
      355
      257
      180
      171
      209
      268
      378
      215
      229
      254
      298
      1781
    
    
      transcription_initiation
      24588
      24151
      23259
      24114
      24377
      24789
      25672
      26042
      25553
      25705
      25311
      25004
      25099
      25498
      24820
      19378
    
    
      no_transcription
      13549
      13543
      13527
      13524
      13502
      13504
      13535
      13533
      13540
      13529
      13525
      13515
      13478
      13504
      13512
      12779



In [17]:

    
# annot_detailed => collapse by method
for s_ in l_cond + ['summary']:
    df_prom_fwd['annot_%s' % (s_,)] = list(map(lambda annot_detailed: d_detailed_annot[annot_detailed], 
                                               df_prom_fwd['annot_detailed_%s' % (s_,)]))
    df_prom_rev['annot_%s' % (s_,)] = list(map(lambda annot_detailed: d_detailed_annot[annot_detailed], 
                                               df_prom_rev['annot_detailed_%s' % (s_,)]))



In [18]:

    
# Forward strand annotation (collapsed across methods)
df_fwd_ = pd.concat([df_prom_fwd['annot_%(stage)s' % locals()].value_counts() for stage in l_cond + ['summary']], axis=1).loc[d_annot_strand_legend.keys()]
df_fwd_.columns = l_cond + ['summary']
#df_fwd_.to_latex('%(dm_step)s_fig/prom_summary_rev.latex' % locals())
#plt.subplot(111, frame_on=False)
#plt.gca().xaxis.set_visible(False)
#plt.gca().yaxis.set_visible(False)
#pd.plotting.table(data=df_fwd_, ax=plt.gca(), loc='center left')
#plt.savefig('annot/FigA_mapping/annot_crosstab_fwd.pdf', bbox_inches='tight')
df_fwd_









    Out[18]:







  
    
      
      wt_emb
      wt_l1
      wt_l2
      wt_l3
      wt_l4
      wt_ya
      glp1_d1
      glp1_d2
      glp1_d6
      glp1_d9
      glp1_d13
      neurons
      gonad
      muscle
      intest
      summary
    
  
  
    
      coding_promoter
      3683
      3982
      4653
      4027
      4052
      3731
      2956
      2534
      3024
      2824
      3053
      3632
      3505
      2964
      3654
      8339
    
    
      pseudogene_promoter
      69
      60
      65
      53
      61
      61
      39
      18
      39
      33
      34
      45
      53
      29
      50
      198
    
    
      non-coding_RNA
      501
      495
      490
      496
      499
      496
      502
      504
      499
      497
      495
      496
      502
      503
      502
      462
    
    
      unknown_promoter
      469
      604
      834
      406
      328
      263
      188
      151
      215
      285
      363
      241
      223
      229
      282
      1704
    
    
      transcription_initiation
      24688
      24288
      23414
      24447
      24519
      24906
      25741
      26238
      25645
      25805
      25482
      25042
      25202
      25728
      24966
      19449
    
    
      no_transcription
      13464
      13445
      13418
      13445
      13415
      13417
      13448
      13429
      13452
      13430
      13447
      13418
      13389
      13421
      13420
      12722



In [19]:

    
# Reverse strand annotation summary (collapsed across methods)
df_rev_ = pd.concat([df_prom_rev['annot_%(stage)s' % locals()].value_counts() for stage in l_cond + ['summary']], axis=1).loc[d_annot_strand_legend.keys()]
df_rev_.columns = l_cond + ['summary']
#df_rev_.to_latex('%(dm_step)s_fig/prom_summary_rev.latex' % locals())
#plt.subplot(111, frame_on=False)
#plt.gca().xaxis.set_visible(False)
#plt.gca().yaxis.set_visible(False)
#pd.plotting.table(data=df_rev_, ax=plt.gca(), loc='center left')
#plt.savefig('annot/FigA_mapping/annot_crosstab_rev.pdf', bbox_inches='tight')
df_rev_









    Out[19]:







  
    
      
      wt_emb
      wt_l1
      wt_l2
      wt_l3
      wt_l4
      wt_ya
      glp1_d1
      glp1_d2
      glp1_d6
      glp1_d9
      glp1_d13
      neurons
      gonad
      muscle
      intest
      summary
    
  
  
    
      coding_promoter
      3715
      3986
      4642
      4179
      4066
      3752
      2944
      2593
      3020
      2833
      3115
      3577
      3491
      3084
      3675
      8241
    
    
      pseudogene_promoter
      75
      73
      79
      71
      69
      72
      33
      23
      42
      35
      38
      55
      72
      27
      66
      220
    
    
      non-coding_RNA
      503
      503
      499
      503
      505
      500
      510
      512
      510
      504
      507
      508
      505
      507
      503
      475
    
    
      unknown_promoter
      444
      618
      868
      483
      355
      257
      180
      171
      209
      268
      378
      215
      229
      254
      298
      1781
    
    
      transcription_initiation
      24588
      24151
      23259
      24114
      24377
      24789
      25672
      26042
      25553
      25705
      25311
      25004
      25099
      25498
      24820
      19378
    
    
      no_transcription
      13549
      13543
      13527
      13524
      13502
      13504
      13535
      13533
      13540
      13529
      13525
      13515
      13478
      13504
      13512
      12779



In [20]:

    
# Cross-tabulation of strand-specific summary annotation
pd.crosstab(pd.Categorical(df_prom_fwd['annot_summary']), pd.Categorical(df_prom_rev['annot_summary'])).loc[list(d_annot_strand_legend.keys()), list(d_annot_strand_legend.keys())]









    Out[20]:







  
    
      col_0
      coding_promoter
      pseudogene_promoter
      non-coding_RNA
      unknown_promoter
      transcription_initiation
      no_transcription
    
    
      row_0
      
      
      
      
      
      
    
  
  
    
      coding_promoter
      2163
      50
      29
      718
      3829
      1550
    
    
      pseudogene_promoter
      47
      12
      1
      12
      72
      54
    
    
      non-coding_RNA
      28
      0
      49
      6
      96
      283
    
    
      unknown_promoter
      686
      11
      5
      46
      794
      162
    
    
      transcription_initiation
      3788
      73
      92
      815
      10604
      4077
    
    
      no_transcription
      1529
      74
      299
      184
      3983
      6653



In [21]:

    
# Promoter short cap statistics for text
m_prom = (df_prom_fwd['annot_summary'] == 'coding_promoter') | (df_prom_rev['annot_summary'] == 'coding_promoter')
m_both_scap = (df_scap_fwd['scap_mode_count'] >= 2) & (df_scap_rev['scap_mode_count'] >= 2)
f_ = sum(m_prom & m_both_scap) / sum(m_prom)
print('%d bidirect, %d with scap, %d both, %.2f' % (sum(m_prom), sum(m_both_scap), sum(m_prom & m_both_scap), 100*f_))









    



14417 bidirect, 23064 with scap, 10550 both, 73.18



In [22]:

    
# S2_outron-extended_genes.bed <- genes extended by outron regions as defined by (aggregated) promoter annotations
def df_outron_extended_genes_(df_prom_fwd_, df_prom_rev_):

    # df_prom_bounds <- prom min/max boundaries, based on promoter annotation
    df_prom_bounds_fwd = df_prom_fwd_.groupby(['pass1_exon1_gene_id', 'pass1_exon1_locus_id']).agg({
        'chrom': lambda s: list(set(s))[0],
        'start': np.min,
        'end': np.max,
    }).reset_index()[['chrom', 'start', 'end', 'pass1_exon1_gene_id', 'pass1_exon1_locus_id']]
    df_prom_bounds_fwd.columns = ['chrom', 'start', 'end', 'gene_id', 'locus_id']

    df_prom_bounds_rev = df_prom_rev_.groupby(['pass1_exon1_gene_id', 'pass1_exon1_locus_id']).agg({
        'chrom': lambda s: list(set(s))[0],
        'start': np.min,
        'end': np.max,
    }).reset_index()[['chrom', 'start', 'end', 'pass1_exon1_gene_id', 'pass1_exon1_locus_id']]
    df_prom_bounds_rev.columns = ['chrom', 'start', 'end', 'gene_id', 'locus_id']

    df_prom_bounds = pd.concat([df_prom_bounds_fwd, df_prom_bounds_rev], axis=0, ignore_index=False)\
        .sort_values(['chrom', 'start', 'end']).reset_index(drop=True)
    print('%d fwd %d rev %d pooled' % (len(df_prom_bounds_fwd), len(df_prom_bounds_rev), len(df_prom_bounds)))

    # Write a .bed-file of all the regions that changed
    df_prom_bounds.to_csv(mp('_df_prom_bounds.bed'), index=False, header=False, sep='\t')
    !wc -l {mp('_df_prom_bounds.bed')}

    # df_gene_bounds <- gene min/max boundaries, based on annotation
    df_transcripts_ = yp.read_wbgtf('WS260_ce10/WS260_ce10.transcripts.annot.gtf.gz', parse_attr=False)
    df_transcripts = yp.df_gfftags_unpack(df_transcripts_, name='attribute')
    
    # Add display_id
    #fp_geneIDs = 'wget/ftp.wormbase.org/pub/wormbase/releases/WS260/species/c_elegans/PRJNA13758/annotation/c_elegans.PRJNA13758.WS260.geneIDs.txt.gz'
    #df_geneIDs = pd.read_csv(fp_geneIDs, sep=',', names=('na', 'gene_id', 'locus', 'sequence_id', 'status'))[['gene_id', 'locus', 'sequence_id', 'status']]
    #
    #def display_id_(locus, sequence_id, gene_id):
    #    if locus == locus:
    #        return locus
    #    elif sequence_id == sequence_id:
    #        return sequence_id
    #    else:
    #        return gene_id
    #
    #df_geneIDs['locus_id'] = list(map(display_id_, df_geneIDs['locus'], df_geneIDs['sequence_id'], df_geneIDs['gene_id']))
    #df_transcripts = pd.merge(df_transcripts, df_geneIDs[['gene_id', 'locus_id']], left_on='gene_id', right_on='gene_id')

    df_gene_bounds = df_transcripts.groupby(['gene_id', 'locus_id']).agg({
        'chrom': lambda s: list(set(s))[0],
        'start': np.min,
        'end': np.max,
    }).reset_index()[['chrom', 'start', 'end', 'gene_id', 'locus_id']].sort_values(['chrom', 'start', 'end']).reset_index(drop=True)

    df_gene_bounds.to_csv(mp('_df_gene_bounds.bed'), index=False, header=False, sep='\t')
    !wc -l {mp('_df_gene_bounds.bed')}
    
    df_ = df_prom_bounds.merge(df_gene_bounds, how='outer', on=['gene_id', 'locus_id'], suffixes=('_prom', '_gene'))
    df_['start'] = list(map(int, map(np.nanmin, zip(df_['start_prom'], df_['start_gene']))))
    df_['end'] = list(map(int, map(np.nanmax, zip(df_['end_prom'], df_['end_gene']))))
    df_ = df_[['chrom_gene', 'start', 'end', 'gene_id', 'locus_id']].sort_values(['chrom_gene', 'start', 'end']).reset_index(drop=True)
    df_.columns = [['chrom', 'start', 'end', 'gene_id', 'locus_id']]
    return df_

q_ = '(annot_summary == "coding_promoter") | (annot_summary == "pseudogene_promoter") | (annot_summary == "non-coding_RNA")'
df_outron_extended_genes = df_outron_extended_genes_(
    pd.concat([df_prom_fwd, df_exon_fwd['pass1_exon1_gene_id'], df_exon_fwd['pass1_exon1_locus_id']], axis=1).query(q_)[['chrom', 'start', 'end', 'annot_summary', 'pass1_exon1_gene_id', 'pass1_exon1_locus_id']],
    pd.concat([df_prom_rev, df_exon_rev['pass1_exon1_gene_id'], df_exon_rev['pass1_exon1_locus_id']], axis=1).query(q_)[['chrom', 'start', 'end', 'annot_summary', 'pass1_exon1_gene_id', 'pass1_exon1_locus_id']],
)
df_outron_extended_genes.to_csv(mp('outron-extended_genes.bed'), index=False, header=False, sep='\t')
!wc -l {mp('outron-extended_genes.bed')}









    



6029 fwd 5973 rev 12002 pooled
12002 annot_ce10_tissues/metrics_type/_df_prom_bounds.bed
23155 annot_ce10_tissues/metrics_type/_df_gene_bounds.bed
23156 annot_ce10_tissues/metrics_type/outron-extended_genes.bed



In [23]:

    
# df_associated_gene <- associated gene assignments
df_associated_gene = pd.read_csv(BedTool.from_dataframe(df_atac[['chrom', 'start', 'end']]).map(
    BedTool.from_dataframe(df_outron_extended_genes).fn, c='4,5', o='distinct,distinct',).fn, sep='\t',
    names=yp.NAMES_BED9[:5])
df_associated_gene.columns = ['chrom', 'start', 'end', 'associated_gene_id', 'associated_locus_id']









    



***** WARNING: File /tmp/pybedtools.5k7abx1a.tmp has inconsistent naming convention for record:
	78	3977	.	.

***** WARNING: File /tmp/pybedtools.5k7abx1a.tmp has inconsistent naming convention for record:
	78	3977	.	.



In [24]:

    
# Table S2 -- unstranded summary annotation
def name_(annot_fwd, annot_rev, locus_id_fwd, locus_id_rev, enhancer_locus_id):
    if (annot_fwd in config['annot_with_gene_id']) and not(annot_rev in config['annot_with_gene_id']):
        return locus_id_fwd
    elif not(annot_fwd in config['annot_with_gene_id']) and (annot_rev in config['annot_with_gene_id']):
        return locus_id_rev
    elif (annot_fwd in config['annot_with_gene_id']) and (annot_rev in config['annot_with_gene_id']):
        return '%s / %s' % (locus_id_rev, locus_id_fwd)
    elif enhancer_locus_id != '.': # show "likeliest target gene for an enhancer" in brackets for all remaining (=not coding_promoter or non_coding_promoter)
        return '(%s)' % (enhancer_locus_id,)
    else:
        return ''

def annot_merge_(annot_fwd, annot_rev):
    if annot_fwd == 'coding_promoter' or annot_rev == 'coding_promoter':
        return 'coding_promoter'
    elif annot_fwd == 'pseudogene_promoter' or annot_rev == 'pseudogene_promoter':
        return 'pseudogene_promoter'
    elif annot_fwd == 'non-coding_RNA' or annot_rev == 'non-coding_RNA':
        return 'non-coding_RNA'
    elif annot_fwd == 'unknown_promoter' or annot_rev == 'unknown_promoter':
        return 'unknown_promoter'
    elif annot_fwd == 'transcription_initiation' or annot_rev == 'transcription_initiation':
        return 'putative_enhancer'
    elif annot_fwd == 'no_transcription' or annot_rev == 'no_transcription':
        return 'other_element'
    assert False

def strand_merge_(annot_fwd, annot_rev):
    promoter_fwd = (annot_fwd in config['annot_with_gene_id'])
    promoter_rev = (annot_rev in config['annot_with_gene_id'])
    lcap_fwd = annot_fwd == 'unknown_promoter'
    lcap_rev = annot_rev == 'unknown_promoter'
    if promoter_fwd and promoter_rev:
        return '.'
    elif promoter_fwd and (not promoter_rev):
        return '+'
    elif (not promoter_fwd) and promoter_rev:
        return '-'
    # in the absence of promoter annotations with an "annotated" target, use unknown_promoter annotations for strand
    elif lcap_fwd and lcap_rev:
        return '.'
    elif lcap_fwd and (not lcap_rev):
        return '+'
    elif (not lcap_fwd) and lcap_rev:
        return '-'
    else:
        return '.'

df_regl = df_atac[['chrom', 'start', 'end']].copy()
df_regl['tss_fwd'] = df_scap_fwd['scap_mode']
df_regl['tss_rev'] = df_scap_rev['scap_mode']
df_regl['scap_pass_fwd'] = df_scap_fwd['scap_pass']
df_regl['scap_pass_rev'] = df_scap_rev['scap_pass']
df_regl['scap_mode_fwd'] = df_scap_fwd['scap_mode']
df_regl['scap_mode_rev'] = df_scap_rev['scap_mode']
df_regl['scap_mode_count_fwd'] = df_scap_fwd['scap_mode_count']
df_regl['scap_mode_count_rev'] = df_scap_rev['scap_mode_count']
df_regl['scap_count_fwd'] = df_scap_fwd['scap_count']
df_regl['scap_count_rev'] = df_scap_rev['scap_count']
df_regl['annot_fwd'] = df_prom_fwd['annot_summary']
df_regl['annot_rev'] = df_prom_rev['annot_summary']
df_regl['annot_detailed_fwd'] = df_prom_fwd['annot_detailed_summary']
df_regl['annot_detailed_rev'] = df_prom_rev['annot_detailed_summary']
df_regl['annot'] = list(map(annot_merge_, df_prom_fwd['annot_summary'], df_prom_rev['annot_summary']))
df_regl['strand'] = list(map(strand_merge_, df_prom_fwd['annot_summary'], df_prom_rev['annot_summary']))

def agg_pass1_pass2(annot, pass1_val, pass2_val):
    if annot in ['coding_promoter', 'pseudogene_promoter']:
        return pass1_val
    elif annot == 'non-coding_RNA':
        return pass2_val
    else:
        return ''

df_regl['promoter_gene_id_fwd'] = list(map(agg_pass1_pass2, df_regl['annot_fwd'],
                                    df_exon_fwd['pass1_exon1_gene_id'], df_exon_fwd['pass2_exon1_gene_id']))
df_regl['promoter_gene_id_rev'] = list(map(agg_pass1_pass2, df_regl['annot_rev'],
                                    df_exon_rev['pass1_exon1_gene_id'], df_exon_rev['pass2_exon1_gene_id']))

df_regl['promoter_locus_id_fwd'] = list(map(agg_pass1_pass2, df_regl['annot_fwd'],
                                    df_exon_fwd['pass1_exon1_locus_id'], df_exon_fwd['pass2_exon1_locus_id']))
df_regl['promoter_locus_id_rev'] = list(map(agg_pass1_pass2, df_regl['annot_rev'],
                                    df_exon_rev['pass1_exon1_locus_id'], df_exon_rev['pass2_exon1_locus_id']))

df_regl['promoter_gene_biotype_fwd'] = list(map(agg_pass1_pass2, df_regl['annot_fwd'],
                                    df_exon_fwd['pass1_exon1_gene_biotype'], df_exon_fwd['pass2_exon1_gene_biotype']))
df_regl['promoter_gene_biotype_rev'] = list(map(agg_pass1_pass2, df_regl['annot_rev'],
                                    df_exon_rev['pass1_exon1_gene_biotype'], df_exon_rev['pass2_exon1_gene_biotype']))

df_regl['associated_gene_id'] = df_associated_gene['associated_gene_id']
df_regl['associated_locus_id'] = df_associated_gene['associated_locus_id']

df_regl['label'] = list(map(name_,
    df_regl['annot_fwd'], df_regl['annot_rev'],                     
    df_regl['promoter_locus_id_fwd'], df_regl['promoter_locus_id_rev'],
    df_regl['associated_locus_id']
))

if annot_ != 'annot_ce10_tissues':
    for stage in config['stages_wt'] + config['stages_glp1']:
        df_regl['atac_%s_height' % (stage,)] = df_atac['atac_%s_height' % (stage,)]



In [25]:

    
# chrom_,start_,end_ce11: site coordinates in ce11, obtained by lifting over the position with peak accessibility
def liftover_(chroms, starts):#, ends, strands): # Liftover wrapper
    """
    # http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4512556/
    # note that while annotations changed, the N2 genome sequence remained identical from WS215 through WS234
    Test: 
    list(liftover_WS230_WS220(
    chroms=['chrI', 'chrX', 'chrIII', 'chrV', 'chrIV', 'chrII'],
    starts=[1,1,4,1,1, 1],
    ends=[12507921,13713648,471811,15465174,11718826, 11718826],
    strands=['+', '-', '+', '-', '+']))
    """
    fp_inp = mp('_liftover_fp_inp.tsv')
    fp_out = mp('_liftover_fp_out.tsv')
    with open(fp_inp, 'w') as fh_inp:
        #for (chrom, start, end, strand) in zip(chroms, starts, ends, strands):        
        for (chrom, start) in zip(chroms, starts):
            print('%s\t%s' % (chrom, start), file=fh_inp)

    liftover_bin = '/mnt/home1/ahringer/jj374/lab/raw_data/datastore/Kenneth_Evans/ConvertWormAssembly-1-0-0/ConvertWormAssembly.1.0.0.pl'
    liftover_dir = '/mnt/home1/ahringer/jj374/lab/raw_data/datastore/Kenneth_Evans/ConvertWormAssembly-1-0-0/CHROMOSOME-DIFFERENCES-to-WS244'
    !{liftover_bin} \
        --input={fp_inp} \
        --output={fp_out} \
        --release1=WS220 \
        --release2=WS235 \
        --chrfield=1 \
        --posfield=2 \
        --postype=1 \
        --refdir={liftover_dir}
    
    df_out = pd.read_csv(fp_out, sep='\t', names=['chrom', 'start'])
    for i, r in df_out.iterrows():
        yield((r['chrom'], r['start']))
        
    !rm {fp_inp}
    !rm {fp_out}

df_ce11_pos = pd.DataFrame.from_records(liftover_(df_regl['chrom'], l_atac_peak_pos), columns=['chrom', 'pos'])
df_regl['chrom_ce11'] = df_ce11_pos['chrom']
df_regl['start_ce11'] = df_ce11_pos['pos'] - 75
df_regl['end_ce11'] = df_ce11_pos['pos'] + 75 + 1









    



Input file  = annot_ce10_tissues/metrics_type/_liftover_fp_inp.tsv 
Output file = annot_ce10_tissues/metrics_type/_liftover_fp_out.tsv 
Lines processed from input file              = 42874
Comment / blank lines written to output file = 0
Data lines written to output file            = 42874
Number of records changed                    = 41446



In [26]:

    
df_prom_fwd.to_csv(mp('prom_fwd.tsv'), sep='\t', index=False)
df_prom_rev.to_csv(mp('prom_rev.tsv'), sep='\t', index=False)
df_regl.to_csv(mp('regl.tsv'), sep='\t', index=False)



In [27]:

    
# regulatory_annotation_%strand.bed: stranded .bed-files showing the annotation in individual stages
write_gffbed(os.path.join(annot_, 'regulatory_annotation_fwd.bed'),
        chrom = df_regl['chrom'],
        start = df_regl['start'],
        end = df_regl['end'],
        name = df_regl['promoter_locus_id_fwd'],
        strand = '+',
        itemRgb = list(map(lambda annot: d_annot_strand_legend[annot], df_regl['annot_fwd'])),
        attr = df_prom_fwd[['annot_%s' % (stage,) for stage in l_cond]],
)

write_gffbed(os.path.join(annot_, 'regulatory_annotation_rev.bed'),
        chrom = df_regl['chrom'],
        start = df_regl['start'],
        end = df_regl['end'],
        name = df_regl['promoter_locus_id_rev'],
        strand = '-',
        itemRgb = list(map(lambda annot: d_annot_strand_legend[annot], df_regl['annot_rev'])),
        attr = df_prom_rev[['annot_%s' % (stage,) for stage in l_cond]],
)



In [28]:

    
# regulatory_annotation_ce10.bed: final annotation as a single, unstranded .bed-file (ce10)
def write_ce10_(fp_, df_regl_):
    write_gffbed(fp_,
        chrom = df_regl_['chrom'],
        start = df_regl_['start'],
        end = df_regl_['end'],
        name = df_regl_['label'],
        strand = df_regl_['strand'],
        itemRgb = list(map(lambda annot: d_annot_legend[annot], df_regl_['annot'])),
        attr = df_regl_[['annot', 'annot_fwd', 'annot_rev', 'annot_detailed_fwd', 'annot_detailed_rev']],
    )

write_ce10_(os.path.join(annot_, 'regulatory_annotation_ce10.bed'), df_regl)



In [29]:

    
# metrics_type/regulatory_annotation.coding_promoter_only.bed.bed-files of specific types of elements
for annot in config['annot']:
    fp_ = mp('regulatory_annotation.%s.bed' % (annot,))
    write_ce10_(fp_, df_regl.query('annot == "%s"' % (annot,)).reset_index(drop=True))
    !wc -l {fp_}









    



14418 annot_ce10_tissues/metrics_type/regulatory_annotation.coding_promoter.bed
310 annot_ce10_tissues/metrics_type/regulatory_annotation.pseudogene_promoter.bed
2002 annot_ce10_tissues/metrics_type/regulatory_annotation.unknown_promoter.bed
18665 annot_ce10_tissues/metrics_type/regulatory_annotation.putative_enhancer.bed
831 annot_ce10_tissues/metrics_type/regulatory_annotation.non-coding_RNA.bed
6654 annot_ce10_tissues/metrics_type/regulatory_annotation.other_element.bed



In [30]:

    
# regulatory_annotation_ce11.bed: final annotation as a single, unstranded .bed-file (ce11)
def write_ce11_(fp_, df_regl_):
    write_gffbed(fp_,
        chrom = df_regl_['chrom_ce11'],
        start = df_regl_['start_ce11'],
        end = df_regl_['end_ce11'],
        name = df_regl_['label'],
        strand = df_regl_['strand'],
        itemRgb = list(map(lambda annot: d_annot_legend[annot], df_regl_['annot'])),
        attr = df_regl_[['annot', 'annot_fwd', 'annot_rev', 'annot_detailed_fwd', 'annot_detailed_rev']],
    )

write_ce11_(os.path.join(annot_, 'regulatory_annotation_ce11.bed'), df_regl)



In [31]:

    
# Table of short cap bidirectionality statistics by type of element
def directionality_(pass_fwd, pass_rev):
    if pass_fwd and pass_rev:
        return 'bidirectional'
    elif pass_fwd:
        return 'forward'
    elif pass_rev:
        return 'reverse'
    else:
        return 'none/weak'

df_has_scap = pd.DataFrame()
df_has_scap['annot'] = df_regl['annot']
df_has_scap['directionality'] = [*map(directionality_, df_scap_fwd['scap_pass'], df_scap_rev['scap_pass'])]
df_has_scap['directionality'].value_counts()

df_ = pd.crosstab(pd.Categorical(df_has_scap['annot']), pd.Categorical(df_has_scap['directionality']), margins=True)\
.loc[config['annot'] + ['All'], ['bidirectional', 'forward', 'reverse', 'none/weak']]

fig = plt.figure(figsize=(4,2))
plt.subplot(111, frame_on=False)
plt.gca().xaxis.set_visible(False)
plt.gca().yaxis.set_visible(False)
pd.plotting.table(data=df_, ax=plt.gca(), loc='center left')
#plt.savefig('annot/FigA_mapping/annot_crosstab_scap.pdf', bbox_inches='tight')









    Out[31]:





<matplotlib.table.Table at 0x7fd7718a1d68>



In [32]:

    
# Pie chart of different types of promoters
n_bid = len(df_regl.query('(annot == "coding_promoter") & (strand == ".")'))
n_fwd = len(df_regl.query('(annot == "coding_promoter") & (strand == "+")'))
n_rev = len(df_regl.query('(annot == "coding_promoter") & (strand == "-")'))
print('%d bidirectional promoters' % (n_bid))
print('%d forward-only promoters' % (n_fwd,))
print('%d reverse-only promoters' % (n_rev,))
plt.figure(figsize=(16,6)).subplots_adjust(wspace=0.4)
plt.subplot(1,2,1, aspect=1)
plt.title('%d of %d sites (%.1f%%) annotated\nas protein-coding promoters' % \
    (n_fwd + n_rev + n_bid, len(df_regl), 100 * (n_fwd + n_rev + n_bid) / float(len(df_regl)),))
plt.pie([n_fwd, n_rev, n_bid], labels=[
        '%(n_fwd)d forward\npromoters' % locals(),
        '%(n_rev)d reverse\npromoters' % locals(), 
        '%(n_bid)d bidirectional\npromoters' % locals(),
])[-1][-1]









    



2317 bidirectional promoters
6097 forward-only promoters
6003 reverse-only promoters






    Out[32]:





Text(0.962748,-0.532087,'2317 bidirectional\npromoters')



In [33]:

    
# Most (82.6%) are unidirectional promoters, producing a protein-coding transcript in one direction
n_prom = sum((df_regl['annot'] == 'coding_promoter'))
n_prom_bs = sum((df_regl['annot'] == 'coding_promoter') & df_scap_fwd['scap_pass'] & df_scap_rev['scap_pass'])
print(n_prom_bs/n_prom)

m_fwd = (df_regl['annot_fwd'] == 'coding_promoter') & (df_regl['annot_rev'] != 'coding_promoter')
m_rev = (df_regl['annot_fwd'] != 'coding_promoter') & (df_regl['annot_rev'] == 'coding_promoter')
n_prom_bs_ud = sum((m_fwd | m_rev) & df_scap_fwd['scap_pass'] & df_scap_rev['scap_pass'])
print(n_prom_bs_ud/n_prom_bs)









    



0.731774987862
0.821800947867



In [35]:

    
# if annot_ == 'annot_ce10_tissues':
m_fwd = (df_prom_fwd[['annot_%s' % (cond,) for cond in config['stages']]] == 'coding_promoter').any(axis=1)
t_fwd = (df_prom_fwd[['annot_%s' % (cond,) for cond in config['lcap823_tissues']]] == 'coding_promoter').any(axis=1)

m_rev = (df_prom_rev[['annot_%s' % (cond,) for cond in config['stages']]] == 'coding_promoter').any(axis=1)
t_rev = (df_prom_rev[['annot_%s' % (cond,) for cond in config['lcap823_tissues']]] == 'coding_promoter').any(axis=1)

print('%d additional fwd promoters' % (len(df_prom_fwd[~m_fwd & t_fwd])))
print('%d additional rev promoters' % (len(df_prom_rev[~m_rev & t_rev])))
#df_prom_fwd[~m & t].sample(20)









    



497 additional fwd promoters
507 additional rev promoters



In [ ]:

	wt_emb	wt_l1	wt_l2	wt_l3	wt_l4	wt_ya	glp1_d1	glp1_d2	glp1_d6	glp1_d9	glp1_d13	neurons	gonad	muscle	intest	summary
coding_promoter_jump_scap	3241	3627	4271	3513	3610	3173	2474	2117	2470	2421	2453	3125	3068	2620	3191	6454
coding_promoter_incr_scap	389	282	282	440	340	454	408	320	483	309	522	405	305	240	365	810
coding_promoter_jump_incr	53	73	100	74	102	104	74	97	71	94	78	102	132	104	98	461
coding_promoter_lowconf	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
pseudogene_promoter_jump_scap	50	44	45	33	41	49	24	13	25	19	21	25	36	19	35	98
pseudogene_promoter_incr_scap	11	8	9	13	9	4	9	4	9	10	11	13	10	7	7	27
pseudogene_promoter_jump_incr	8	8	11	7	11	8	6	1	5	4	2	7	7	3	8	26
pseudogene_promoter_lowconf	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
non-coding_RNA	501	495	490	496	499	496	502	504	499	497	495	496	502	503	502	470
unknown_promoter_jump_scap	469	604	834	406	328	263	188	151	215	285	363	241	223	229	282	1722
transcription_initiation	24688	24288	23414	24447	24519	24906	25741	26238	25645	25805	25482	25042	25202	25728	24966	19757
no_transcription	13464	13445	13418	13445	13415	13417	13448	13429	13452	13430	13447	13418	13389	13421	13420	13049

	wt_emb	wt_l1	wt_l2	wt_l3	wt_l4	wt_ya	glp1_d1	glp1_d2	glp1_d6	glp1_d9	glp1_d13	neurons	gonad	muscle	intest	summary
coding_promoter_jump_scap	3301	3624	4256	3718	3590	3213	2477	2155	2466	2373	2559	3071	3012	2748	3178	6416
coding_promoter_incr_scap	342	286	295	365	362	424	382	349	474	366	460	403	340	220	393	770
coding_promoter_jump_incr	72	76	91	96	114	115	85	89	80	94	96	103	139	116	104	496
coding_promoter_lowconf	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
pseudogene_promoter_jump_scap	55	54	62	52	48	46	22	11	27	23	26	36	52	18	48	113
pseudogene_promoter_incr_scap	13	11	6	11	10	15	6	9	10	6	7	11	11	4	7	19
pseudogene_promoter_jump_incr	7	8	11	8	11	11	5	3	5	6	5	8	9	5	11	33
pseudogene_promoter_lowconf	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
non-coding_RNA	503	503	499	503	505	500	510	512	510	504	507	508	505	507	503	481
unknown_promoter_jump_scap	444	618	868	483	355	257	180	171	209	268	378	215	229	254	298	1794
transcription_initiation	24588	24151	23259	24114	24377	24789	25672	26042	25553	25705	25311	25004	25099	25498	24820	19647
no_transcription	13549	13543	13527	13524	13502	13504	13535	13533	13540	13529	13525	13515	13478	13504	13512	13105

	wt_emb	wt_l1	wt_l2	wt_l3	wt_l4	wt_ya	glp1_d1	glp1_d2	glp1_d6	glp1_d9	glp1_d13	neurons	gonad	muscle	intest	summary
coding_promoter	3683	3982	4653	4027	4052	3731	2956	2534	3024	2824	3053	3632	3505	2964	3654	8339
pseudogene_promoter	69	60	65	53	61	61	39	18	39	33	34	45	53	29	50	198
non-coding_RNA	501	495	490	496	499	496	502	504	499	497	495	496	502	503	502	462
unknown_promoter	469	604	834	406	328	263	188	151	215	285	363	241	223	229	282	1704
transcription_initiation	24688	24288	23414	24447	24519	24906	25741	26238	25645	25805	25482	25042	25202	25728	24966	19449
no_transcription	13464	13445	13418	13445	13415	13417	13448	13429	13452	13430	13447	13418	13389	13421	13420	12722

	wt_emb	wt_l1	wt_l2	wt_l3	wt_l4	wt_ya	glp1_d1	glp1_d2	glp1_d6	glp1_d9	glp1_d13	neurons	gonad	muscle	intest	summary
coding_promoter	3715	3986	4642	4179	4066	3752	2944	2593	3020	2833	3115	3577	3491	3084	3675	8241
pseudogene_promoter	75	73	79	71	69	72	33	23	42	35	38	55	72	27	66	220
non-coding_RNA	503	503	499	503	505	500	510	512	510	504	507	508	505	507	503	475
unknown_promoter	444	618	868	483	355	257	180	171	209	268	378	215	229	254	298	1781
transcription_initiation	24588	24151	23259	24114	24377	24789	25672	26042	25553	25705	25311	25004	25099	25498	24820	19378
no_transcription	13549	13543	13527	13524	13502	13504	13535	13533	13540	13529	13525	13515	13478	13504	13512	12779

col_0	coding_promoter	pseudogene_promoter	non-coding_RNA	unknown_promoter	transcription_initiation	no_transcription
row_0
coding_promoter	2163	50	29	718	3829	1550
pseudogene_promoter	47	12	1	12	72	54
non-coding_RNA	28	0	49	6	96	283
unknown_promoter	686	11	5	46	794	162
transcription_initiation	3788	73	92	815	10604	4077
no_transcription	1529	74	299	184	3983	6653