In [40]:
# Initialise
%run ~/relmapping/annot/notebooks/__init__.ipynb
In [39]:
# Read the tab-separated regulatory annotation table used by the corner-case checks.
# Earlier input, kept for reference:
#fp_regl = 'annot/S2_regulatory_annotation/S2_regulatory_annotation.tsv'
fp_regl = 'annot/Fig2D2_regulatory_annotation_Apr27/Fig2D2_regulatory_annotation_Apr27.tsv'
df_regl = pd.read_csv(fp_regl, delimiter='\t')
# Report the row count of the loaded table.
print('%d regions loaded' % len(df_regl))
def check_(igvstr, **kwargs):
    """Assert that the region covering the midpoint of `igvstr` has the expected column values.

    Parameters
    ----------
    igvstr : str
        Region string passed to `yp.parse_igvstr`, whose result is unpacked as (chrom, start, end).
    **kwargs
        Column name -> expected value pairs, each compared against the single row of the
        module-level `df_regl` whose [start, end) interval contains the midpoint.

    Raises
    ------
    AssertionError
        If the midpoint is not covered by exactly one row of `df_regl`, or if any
        expected value differs from the value found in that row.
    """
    (chrom, start, end) = yp.parse_igvstr(igvstr)
    pos = int((start + end) / 2)
    df_ = df_regl.query('(chrom == "%s") & (start <= %d) & (%d < end)' % (chrom, pos, pos))
    # Fail with context instead of a bare AssertionError so a broken corner case is easy to locate.
    assert len(df_) == 1, \
        '%s: expected exactly 1 region covering %s:%d, found %d' % (igvstr, chrom, pos, len(df_))
    for (k, v) in kwargs.items():
        observed = df_[k].tolist()[0]
        # Tab-separated trace line: region, column, expected value, observed value.
        print('\t'.join([igvstr, '%25s' % (k,), '%25s' % (v,), '%25s' % (observed,)]))
        assert observed == v, \
            '%s: %s expected %r, observed %r' % (igvstr, k, v, observed)
# Regression checks for known corner cases: each check_ call asserts the annotation
# of the region covering the midpoint of the given interval.
#> rfc-4 (black element - promoter inside masked??)
#> zen-4 (black element - should be promoter?) snoRNA, I guess could mask promoter
#Rare but compelling cases of bidirectional promoters where one direction was protein-coding, and the other was an annotated non-coding RNA. Addressed by relaxing the non-coding RNA annotation. They're now treated more similarly to protein-coding annotations, so a site can be annotated as protein_coding on one strand, and snoRNA on the other strand. (Previously, any site that overlapped a non-coding RNA was excluded from any other annotation...)
#> attf-2 (black element maybe promoter?)
#Now annotated as a promoter for F09G2.2 (protein-coding gene, forward strand), and F09G2.14 (snoRNA, reverse strand). Could also be a promoter for attf-2, but can't think of much one could do about this given current fragmented long cap data.
check_('chrIV:6117863-6118043',
annot_fwd='coding_promoter', promoter_locus_id_fwd='zen-4',
annot_rev='non-coding_RNA', promoter_locus_id_rev='M03D4.78')
## "Dual" coding/non-coding annotations: coding_promoter >> non-coding_RNA
check_('chrIII:6971981-6972101',
annot_fwd='coding_promoter', promoter_locus_id_fwd='rfc-4',
annot_rev='coding_promoter', promoter_locus_id_rev='eef-1A.1') # alternative: mir-5549
check_('chrV:7196589-7196718',
annot_fwd='coding_promoter', promoter_locus_id_fwd='F09G2.2',
annot_rev='coding_promoter', promoter_locus_id_rev='attf-2') # alternative: F09G2.14
check_('chrII:13596635-13596738', annot_rev='coding_promoter', promoter_locus_id_rev='fbxc-23')
#> fbxc-15
#Fixed by tweaking the long cap jump test.
check_('chrII:979338-979455', annot_fwd='coding_promoter', promoter_locus_id_fwd='fbxc-15')
#> sri-5 (promoter annotated to rsr-1 only)
#> C25F9.6 (promoter annotated to C25F9.6 only) - looks like there is jump in txn?
#> F08F3.8 (promoter annotated to acl-6 only) - looks like there is longcap signal
#> rsd-2 (maybe one of the yellow elements promoter?) Looks like there is txn from furthest right yellow element
#Weak outron signal; now captured by requiring 1 long cap tag in each replicate, and at least 3 tags total (previously at least 2 tags in both replicates).
check_('chrI:11182502-11182624', annot_rev='coding_promoter', promoter_locus_id_rev='rsr-1')
check_('chrV:19405507-19405707',
annot_fwd='coding_promoter', promoter_locus_id_fwd='C25F9.10',
annot_rev='coding_promoter', promoter_locus_id_rev='C25F9.6')
check_('chrV:5433170-5433353', annot_fwd='coding_promoter', promoter_locus_id_fwd='acl-6')
check_('chrIV:13541820-13541934', annot_rev='coding_promoter', promoter_locus_id_rev='rsd-2')
# Sites downstream within a long exon that should not be called promoters
check_('chrII:7150014-7150236', annot_fwd='no_transcription', annot_rev='transcription_initiation')
check_('chrI:183422-183568', annot_fwd='transcription_initiation', associated_locus_id='atm-1')
# "Long distance" ncRNA annotations (should not happen)
check_('chrII:6891890-6892040', annot_fwd='coding_promoter', annot_rev='coding_promoter')
# addressed via closest_exon_flank ~< 250 (ncRNA downstream of site; exon1 from upstream gets mis-assigned)
check_('chrI:111191-111378', annot_fwd='non-coding_RNA', promoter_locus_id_fwd='F53G12.12')
# Non-coding RNAs within outrons should not 'block' upstream promoter annotation
check_('chrI:13324560-13324714', annot_fwd='coding_promoter', promoter_locus_id_fwd='gcy-35') # Reverse strand shows weak-gapped-but-compelling long cap?
check_('chrI:13372998-13373215', annot_rev='coding_promoter', promoter_locus_id_rev='sptf-3')
# NOTE(review): the next three checks repeat assertions already made earlier in this cell
# (rfc-4 / eef-1A.1 at chrIII:6971981-6972101, and sptf-3 on the line directly above).
check_('chrIII:6971981-6972101', annot_fwd='coding_promoter', promoter_locus_id_fwd='rfc-4')
check_('chrIII:6971981-6972101', annot_rev='coding_promoter', promoter_locus_id_rev='eef-1A.1')
check_('chrI:13372998-13373215', annot_rev='coding_promoter', promoter_locus_id_rev='sptf-3')
# Long cap jump -- weak regions
check_('chrI:14316482-14316632', annot_fwd='pseudogene_promoter', promoter_locus_id_fwd='F49B2.4') # Also, reverse strand shows weak-gapped-but-compelling long cap (however no short cap!)?
# Short cap 100..125bp from peak accessibility...
#check_('chrI:6341909-6342059', annot_rev='unknown_promoter')
check_('chrII:14468955-14469118', annot_fwd='unknown_promoter')
# Short cap >125bp from peak accessibility...
#check_('chrII:14468955-14469118', annot_rev='unknown_promoter')
# Promoters initiating transcription within the UTR, downstream of the annotated TSS
check_('chrI:6011491-6011614', annot_rev='coding_promoter', promoter_locus_id_rev='rnf-1')
check_('chrI:7643236-7643386', annot_rev='coding_promoter', promoter_locus_id_rev='ncbp-2')
check_('chrII:2320231-2320381', annot_fwd='coding_promoter', promoter_locus_id_fwd='ZK1240.3')
check_('chrII:6775038-6775173', annot_rev='coding_promoter', promoter_locus_id_rev='agr-1')
check_('chrII:2296175-2296310', annot_rev='coding_promoter', promoter_locus_id_rev='fbxc-25')
# Various weak-ish/corner cases where a clear unique promoter candidate exists
check_('chrI:5509301-5509451', annot_rev='coding_promoter', promoter_locus_id_rev='zipt-11') # upstream long cap overlaps upstream gene 3' end
check_('chrX:14706659-14706809', annot_rev='transcription_initiation') # should not be an unknown_promoter, as is proximal to first exon
check_('chrIII:1163615-1163765', annot_fwd='no_transcription', annot_rev='no_transcription') # Should not be a promoter of Y119D3B.13
check_('chrX:14524974-14525124', annot_rev='coding_promoter', promoter_locus_id_rev='nspc-17')
check_('chrV:5433187-5433337', annot_rev='coding_promoter', promoter_locus_id_rev='F08F3.8') # distal; site has no long cap (but has continuous txn)
check_('chrI:13376619-13376731', annot_fwd='coding_promoter', promoter_locus_id_fwd='C01A2.9') # should be detected in d13_fwd -- padj 0.32, missed
# No short cap
check_('chrI:11182502-11182624', annot_fwd='coding_promoter', promoter_locus_id_fwd='sri-5') # no short cap
check_('chrII:13596635-13596738', annot_fwd='coding_promoter', promoter_locus_id_fwd='csc-1') # "distal"; no scap
# txn init within coding region
#check_('chrII:2772185-2772335', annot_rev='coding_promoter', promoter_locus_id_rev='sdz-10')
#check_('chrII:1818315-1818462', annot_rev='coding_promoter', promoter_locus_id_rev='fbxb-10') # short cap within 1st exon
#check_('chrII:14563893-14564084', annot_rev='pseudogene_promoter', promoter_locus_id_rev='C01G12.12') # pseudogene promoters without short cap!
# afd-1 locus -- two sites with long cap gaps, should not be called promoters (but not necessarily unknown_promoter either)
check_('chrI:2167906-2168056', annot_fwd='transcription_initiation')
check_('chrI:2179963-2180113', annot_fwd='transcription_initiation')
# Y37E3.11 locus -- too far from the annotated TSS to be associated with that UTR
check_('chrI:2084937-2085087', annot_fwd='non-coding_RNA', promoter_locus_id_fwd='Y37E3.20')
# Weak promoter with several non-first alternative isoforms overlapping the first exon
check_('chrI:2084303-2084453', annot_fwd='coding_promoter', promoter_locus_id_fwd='Y37E3.11')
check_('chrI:5485932-5486082', annot_rev='coding_promoter', promoter_locus_id_rev='rpl-19')
check_('chrI:5485932-5486082', annot_fwd='unknown_promoter')
# Only apply the low-confidence promoter rule to intergenic sites
check_('chrI:2190535-2190685', annot_fwd='transcription_initiation') # afd-1 (in intron)
check_('chrI:4202291-4202441', annot_fwd='transcription_initiation') # C18E3.4
In [ ]:
# Compare the freshly loaded annotation (df_regl) with an earlier snapshot, row by row.
# Alternative baselines that were used before (kept for reference):
#   - HEAD of the git repository
#fp_head_bed = 'annot/S2_regulatory_annotation/S2_regulatory_annotation.bed_HEAD.bed'
#fp_head_tsv = 'annot/S2_regulatory_annotation/S2_regulatory_annotation.tsv_HEAD.tsv'
#!git show HEAD:{fp_regl} > {fp_head_tsv}
#!git show HEAD:annot/S2_regulatory_annotation/S2_regulatory_annotation.bed > {fp_head_bed}
#   - a specific earlier commit
#fp_head_bed = 'annot/S2_regulatory_annotation/S2_regulatory_annotation.bed_1030.bed'
#fp_head_tsv = 'annot/S2_regulatory_annotation/S2_regulatory_annotation.tsv_1030.tsv'
#!git show 2040a6:annot/S2_regulatory_annotation/S2_regulatory_annotation.bed > {fp_head_bed}
#!git show 2040a6:annot/S2_regulatory_annotation/S2_regulatory_annotation.tsv > {fp_head_tsv}
# Baseline currently in use: the 6Dec17 snapshot.
fp_head_tsv = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_6Dec17.tsv'
fp_head_bed = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_6Dec17.bed'
df_head = pd.read_csv(fp_head_tsv, delimiter='\t')
# A row counts as changed when the annotation differs on either strand.
m_diff_ = (
    (df_head['annot_fwd'] != df_regl['annot_fwd'])
    | (df_head['annot_rev'] != df_regl['annot_rev'])
    # Stricter criteria, currently disabled:
    #| ((df_head['promoter_gene_id_fwd'] != df_regl['promoter_gene_id_fwd']))
    #| ((df_head['promoter_gene_id_rev'] != df_regl['promoter_gene_id_rev']))
    #| ((df_head['associated_gene_id'] != df_regl['associated_gene_id']))
)
# NOTE(review): the message says "last commit", but the baseline read above is the 6Dec17 file.
print('%d regions annotated differently compared to last commit' % int(m_diff_.sum()))
In [ ]:
# Side-by-side table of old ("head") and new ("regl") labels/annotations per region,
# followed by a random sample of 50 of the rows flagged in m_diff_.
df_diff = df_regl[['chrom', 'start', 'end']].assign(
    label_head=df_head['label'],
    label_regl=df_regl['label'],
    annot_rev_head=df_head['annot_rev'],
    annot_rev_regl=df_regl['annot_rev'],
    annot_fwd_head=df_head['annot_fwd'],
    annot_fwd_regl=df_regl['annot_fwd'],
)
yp.df_sample(df_diff.loc[m_diff_], 50)
In [ ]:
# Write a .bed-file of all the regions that changed
fp_diff = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_diff.bed'
df_diff[m_diff_][['chrom', 'start', 'end']].to_csv(fp_diff, index=False, header=False, sep='\t')
!wc -l {fp_diff}
Possible additional improvements...
In [ ]:
# exon2 overlap rule is too conservative & masks even when txn clearly does not originate from exon
# => test for jump/incr at exon boundary & discard from annotation only if it tests positive
#check_annot_summary('chrIV:13588263-13588441', 'Y45F10B.12') trans-spliced promoter in L4?
In [ ]:
# extra long outrons???
# snt-2 has a distal promoter spanning 40kb and multiple genes (chrIII:787,433-829,218)
# => cut-off at e.g. 10kb?
#check_annot_summary('chrIII:828052-828187', '??')
#chrIII:4369629-4369711
In [ ]:
#check_annot_summary('chrII:11371902-11372117', '(C33B4.2)') # short cap 100bp downstream of annotated coding region
#check_annot_summary('chrI:5485952-5486062', 'rpl-19') # promoter assigned to un-translated tiny exon; scap not align
#check_annot_summary('chrII:11347905-11348076', 'B0491.6 / B0491.1') # long UTR; short cap precedes TSS
#check_annot_summary('chrII:1818315-1818462', 'fbxb-10') # true promoter; scap ~60bp chrII:1821387-1821512
#check_annot_summary('chrIII:5876290-5876433', 'aldo-2')
In [ ]:
#> lir-2 (green element may be promoter?) Looks like a promoter, I think
#check_annot_summary('chrII:7667887-7668072', 'lir-2') # Tricky region, as short cap does not align with 5'
In [ ]:
# Load two annotation versions for comparison: A = 6Dec17, B = 20Apr18.
fp_reglA = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_6Dec17.bed'
df_reglA = read_gffbed(fp_reglA)

fp_reglB = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_20Apr18.bed'
df_reglB = read_gffbed(fp_reglB)
In [ ]:
fp_ = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_6Dec17_to_20Apr18_lost_promoters_fwd.bed'
df_reglA[(df_reglA['annot_fwd'] == 'coding_promoter') & (df_reglB['annot_fwd'] != 'coding_promoter')]\
[['chrom', 'start', 'end']].to_csv(fp_, index=False, header=False, sep='\t')
!wc -l {fp_}
fp_ = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_6Dec17_to_20Apr18_lost_promoters_rev.bed'
df_reglA[(df_reglA['annot_rev'] == 'coding_promoter') & (df_reglB['annot_rev'] != 'coding_promoter')]\
[['chrom', 'start', 'end']].to_csv(fp_, index=False, header=False, sep='\t')
!wc -l {fp_}
In [ ]:
# Per-strand low-confidence metric files: set both paths first, then read both.
fp_reglB_lowconf_fwd = 'annot/S2_regulatory_annotation/metrics/regulatory_annotation_lowconf_fwd.bed'
fp_reglB_lowconf_rev = 'annot/S2_regulatory_annotation/metrics/regulatory_annotation_lowconf_rev.bed'
df_reglB_lowconf_fwd = read_gffbed(fp_reglB_lowconf_fwd)
df_reglB_lowconf_rev = read_gffbed(fp_reglB_lowconf_rev)
In [ ]:
# Forward strand: overlap between promoter calls (coding or pseudogene) in df_reglA
# and the sites whose annot_lowconf field equals the string "True".
df_a_ = (
    df_reglA
    .query('(annot_fwd == "coding_promoter") | (annot_fwd == "pseudogene_promoter")')
    .loc[:, ['chrom', 'start', 'end']]
)
df_b_ = (
    df_reglB_lowconf_fwd
    .query('annot_lowconf == "True"')
    .loc[:, ['chrom', 'start', 'end']]
)
gv_f = yp.GenomicVenn2(
    BedTool.from_dataframe(df_a_),
    BedTool.from_dataframe(df_b_),
    label_b='Apr fwd low-confidence promoters',
    label_a='Dec fwd promoters',
)
gv_f.plot()
In [ ]:
# Display the `df_b_only` attribute of the forward-strand Venn object
# (presumably the regions found only in set b -- TODO confirm against yp.GenomicVenn2).
gv_f.df_b_only
In [ ]:
# Reverse strand: overlap between promoter calls (coding or pseudogene) in df_reglA
# and the sites whose annot_lowconf field equals the string "True".
df_a_ = (
    df_reglA
    .query('(annot_rev == "coding_promoter") | (annot_rev == "pseudogene_promoter")')
    .loc[:, ['chrom', 'start', 'end']]
)
df_b_ = (
    df_reglB_lowconf_rev
    .query('annot_lowconf == "True"')
    .loc[:, ['chrom', 'start', 'end']]
)
gv_r = yp.GenomicVenn2(
    BedTool.from_dataframe(df_a_),
    BedTool.from_dataframe(df_b_),
    label_b='Apr rev low-confidence promoters',
    label_a='Dec rev promoters',
)
gv_r.plot()
In [ ]:
# Load five annotation versions (A..E); each path is followed directly by its read.
fp_reglA = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_6Dec17.bed'
df_reglA = read_gffbed(fp_reglA)

fp_reglB = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_20Apr18.bed'
df_reglB = read_gffbed(fp_reglB)

fp_reglC = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_22Apr18.bed'
df_reglC = read_gffbed(fp_reglC)

fp_reglD = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_23Apr18.bed'
df_reglD = read_gffbed(fp_reglD)

fp_reglE = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_23A_Apr18.bed'
df_reglE = read_gffbed(fp_reglE)
In [ ]:
def overlap_(df_A, df_B):
    """Count rows annotated "coding_promoter" in both frames, summed over both strands.

    The forward (`annot_fwd`) and reverse (`annot_rev`) columns are compared
    element-wise between `df_A` and `df_B`; the two per-strand counts are added.
    """
    n_shared = 0
    for strand_col in ('annot_fwd', 'annot_rev'):
        both_promoter = (df_A[strand_col] == "coding_promoter") & (df_B[strand_col] == "coding_promoter")
        n_shared += int(both_promoter.sum())
    return n_shared
# Pairwise overlap table: one printed row per version, holding the label, overlap_ against
# each of the five versions (A..E, in that order), and the number of rows with annot == "coding_promoter".
versions_ = [
    ('6Dec', df_reglA),
    ('20Apr18', df_reglB),
    ('22Apr18', df_reglC),
    ('23Apr18', df_reglD),
    ('23A_Apr18', df_reglE),
]
for label_, df_row_ in versions_:
    print(
        label_,
        *[overlap_(df_row_, df_col_) for (_label_, df_col_) in versions_],
        len(df_row_.query('annot=="coding_promoter"'))
    )
In [ ]:
# Ten random sites that are forward-strand coding promoters in df_reglD but not in df_reglE.
df_reglD.loc[
    (df_reglD['annot_fwd'] == 'coding_promoter') & (df_reglE['annot_fwd'] != 'coding_promoter'),
    ['chrom', 'start', 'end'],
].sample(10)
In [ ]:
# Ten random sites that are reverse-strand coding promoters in df_reglD but not in df_reglE.
df_reglD.loc[
    (df_reglD['annot_rev'] == 'coding_promoter') & (df_reglE['annot_rev'] != 'coding_promoter'),
    ['chrom', 'start', 'end'],
].sample(10)
In [28]:
def diff_coding_promoter_lost(verA, verB):
fp_reglA = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_%s.bed' % (verA,)
fp_reglB = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_%s.bed' % (verB,)
df_reglA = read_gffbed(fp_reglA)
df_reglB = read_gffbed(fp_reglB)
m_AtoB_fwd = (df_reglA['annot_fwd'] == 'coding_promoter') & (df_reglB['annot_fwd'] != 'coding_promoter')
m_AtoB_rev = (df_reglA['annot_rev'] == 'coding_promoter') & (df_reglB['annot_rev'] != 'coding_promoter')
#print(sum(m_AtoB_fwd), sum(m_AtoB_rev))
fp_fwd = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_%s_to_%s_coding_promoter_lost_fwd.bed' % (verA, verB)
fp_rev = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_%s_to_%s_coding_promoter_lost_rev.bed' % (verA, verB)
df_reglA[m_AtoB_fwd][['chrom', 'start', 'end']].to_csv(fp_fwd, index=False, header=False, sep='\t')
df_reglB[m_AtoB_rev][['chrom', 'start', 'end']].to_csv(fp_rev, index=False, header=False, sep='\t')
!wc -l {fp_fwd}
!wc -l {fp_rev}
def diff_coding_promoter_gained(verA, verB):
fp_reglA = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_%s.bed' % (verA,)
fp_reglB = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_%s.bed' % (verB,)
df_reglA = read_gffbed(fp_reglA)
df_reglB = read_gffbed(fp_reglB)
m_AtoB_fwd = (df_reglA['annot_fwd'] != 'coding_promoter') & (df_reglB['annot_fwd'] == 'coding_promoter')
m_AtoB_rev = (df_reglA['annot_rev'] != 'coding_promoter') & (df_reglB['annot_rev'] == 'coding_promoter')
print(sum(m_AtoB_fwd), sum(m_AtoB_rev))
fp_fwd = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_%s_to_%s_coding_promoter_gained_fwd.bed' % (verA, verB)
fp_rev = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_%s_to_%s_coding_promoter_gained_rev.bed' % (verA, verB)
df_reglA[m_AtoB_fwd][['chrom', 'start', 'end']].to_csv(fp_fwd, index=False, header=False, sep='\t')
df_reglB[m_AtoB_rev][['chrom', 'start', 'end']].to_csv(fp_rev, index=False, header=False, sep='\t')
!wc -l {fp_fwd}
!wc -l {fp_rev}
# Version-to-version comparisons. Earlier pairs are kept commented out for reference;
# only the two most recent transitions (Apr24c -> Apr25a -> Apr25b) are run.
#diff_coding_promoter_lost('23Apr18', '23A_Apr18')
#diff_coding_promoter_gained('23Apr18', '23A_Apr18')
#diff_coding_promoter_lost('Apr24a', 'Apr24b')
#diff_coding_promoter_gained('Apr24a', 'Apr24b')
#diff_coding_promoter_lost('Apr24b', 'Apr24c')
#diff_coding_promoter_gained('Apr24b', 'Apr24c')
#diff_coding_promoter_lost('Apr24c', 'TMP')
#diff_coding_promoter_gained('Apr24c', 'TMP')
diff_coding_promoter_lost('Apr24c', 'Apr25a')
diff_coding_promoter_gained('Apr24c', 'Apr25a')
diff_coding_promoter_lost('Apr25a', 'Apr25b')
diff_coding_promoter_gained('Apr25a', 'Apr25b')
In [ ]:
fp_ = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_6Dec17_to_22Apr18_lost_promoters_fwd.bed'
df_reglA[(df_reglA['annot_fwd'] == 'coding_promoter') & (df_reglC['annot_fwd'] != 'coding_promoter')]\
[['chrom', 'start', 'end']].to_csv(fp_, index=False, header=False, sep='\t')
!wc -l {fp_}
fp_ = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_6Dec17_to_22Apr18_lost_promoters_rev.bed'
df_reglA[(df_reglA['annot_rev'] == 'coding_promoter') & (df_reglC['annot_rev'] != 'coding_promoter')]\
[['chrom', 'start', 'end']].to_csv(fp_, index=False, header=False, sep='\t')
!wc -l {fp_}
fp_ = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_6Dec17_to_22Apr18_gained_promoters_fwd.bed'
df_reglA[(df_reglA['annot_fwd'] != 'coding_promoter') & (df_reglC['annot_fwd'] == 'coding_promoter')]\
[['chrom', 'start', 'end']].to_csv(fp_, index=False, header=False, sep='\t')
!wc -l {fp_}
fp_ = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_6Dec17_to_22Apr18_gained_promoters_rev.bed'
df_reglA[(df_reglA['annot_rev'] != 'coding_promoter') & (df_reglC['annot_rev'] == 'coding_promoter')]\
[['chrom', 'start', 'end']].to_csv(fp_, index=False, header=False, sep='\t')
!wc -l {fp_}
In [ ]:
fp_ = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_6Dec17_to_23Apr18_lost_promoters_fwd.bed'
df_reglA[(df_reglA['annot_fwd'] == 'coding_promoter') & (df_reglC['annot_fwd'] != 'coding_promoter')]\
[['chrom', 'start', 'end']].to_csv(fp_, index=False, header=False, sep='\t')
!wc -l {fp_}
fp_ = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_6Dec17_to_23Apr18_lost_promoters_rev.bed'
df_reglA[(df_reglA['annot_rev'] == 'coding_promoter') & (df_reglC['annot_rev'] != 'coding_promoter')]\
[['chrom', 'start', 'end']].to_csv(fp_, index=False, header=False, sep='\t')
!wc -l {fp_}
fp_ = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_6Dec17_to_23Apr18_gained_promoters_fwd.bed'
df_reglA[(df_reglA['annot_fwd'] != 'coding_promoter') & (df_reglC['annot_fwd'] == 'coding_promoter')]\
[['chrom', 'start', 'end']].to_csv(fp_, index=False, header=False, sep='\t')
!wc -l {fp_}
fp_ = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_6Dec17_to_23Apr18_gained_promoters_rev.bed'
df_reglA[(df_reglA['annot_rev'] != 'coding_promoter') & (df_reglC['annot_rev'] == 'coding_promoter')]\
[['chrom', 'start', 'end']].to_csv(fp_, index=False, header=False, sep='\t')
!wc -l {fp_}
In [7]:
# Show 10 random rows of the TMP annotation whose detailed forward-strand annotation
# is "unknown_promoter_jump_incr".
fp_ = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_TMP.bed'
read_gffbed(fp_).query('annot_detailed_fwd == "unknown_promoter_jump_incr"').sample(10)
Out[7]:
In [43]:
# Load the Apr27 and May11 annotation tables (tab-separated) for a direct comparison.
df_Apr27 = pd.read_csv(
    'annot_Apr27/Fig2D2_regulatory_annotation_Apr27.tsv',
    delimiter='\t',
)
df_May11 = pd.read_csv(
    'annot_May11/S2_regulatory_annotation_May11.tsv',
    delimiter='\t',
)
In [47]:
# Rows of the Apr27 table whose reverse-strand annotation differs from May11.
# The comparison is element-wise, so both tables must have identically labelled rows.
df_Apr27[df_Apr27['annot_rev'] != df_May11['annot_rev']]
Out[47]:
In [ ]: