In [1]:
%run ~/relmapping/annot/notebooks/__init__.ipynb
In [5]:
# Load the regulatory-element table via regl_Apr27() and export the columns
# listed in yp.NAMES_BED3 as a headerless, tab-separated file.
df_regl = regl_Apr27()
print('%d regions loaded' % len(df_regl))
df_regl[yp.NAMES_BED3].to_csv(
    'annot_Apr27/Fig1D1_accessible_sites.bed',
    sep='\t',
    header=False,
    index=False,
)
In [20]:
# annot_Apr27/Fig2D2_regulatory_annotation_Apr27.tsv
# The table is read from annot/ and written back out under annot_Apr27/.
# The author's note for this file mentions adding ce11 coordinates by lifting
# over the peak position; no liftover happens in this cell.
fp_regl = 'annot/Fig2D2_regulatory_annotation_Apr27/Fig2D2_regulatory_annotation_Apr27.tsv'
df_regl = pd.read_csv(fp_regl, sep='\t')
print('%d regions loaded' % len(df_regl))
df_regl.to_csv(
    'annot_Apr27/Fig2D2_regulatory_annotation_Apr27.tsv',
    index=False,
    sep='\t',
)
In [21]:
# Rewrite the Apr27 annotation BED in place: coordinates and names are taken
# from the file as it currently exists, while strand, colour and the extra
# attributes are taken from df_regl_.
# NOTE(review): df_regl_ and d_annot_legend are not defined in the visible
# cells -- confirm they exist in the kernel and that df_regl_ has the same row
# order as the BED file being rewritten.
fp_ = 'annot/Fig2D2_regulatory_annotation_Apr27/Fig2D2_regulatory_annotation_Apr27.bed'
df_ = read_gffbed(fp_)
write_gffbed(
    fp_,
    chrom=df_['chrom'],
    start=df_['start'],
    end=df_['end'],
    name=df_['Name'],
    strand=df_regl_['strand'],
    # one colour per row, looked up from the annotation label
    itemRgb=[d_annot_legend[annot] for annot in df_regl_['annot']],
    attr=df_regl_[[
        'annot',
        'annot_fwd',
        'annot_rev',
        'annot_detailed_fwd',
        'annot_detailed_rev',
    ]],
)
Out[21]:
In [19]:
# Write df_regl to the S2 annotation track with write_regl_bed9.
# File names noted by the author for this step:
#   annot_Apr27/Fig2D2_regulatory_annotation_Apr27_ce10.bed
#   annot_Apr27/Fig2D2_regulatory_annotation_Apr27_ce11.bed
write_regl_bed9(
    'annot/S2_regulatory_annotation/S2_regulatory_annotation.bed',
    df_regl,
)
In [4]:
Out[4]:
In [10]:
q_ = 'annot_fwd == "coding_promoter" | annot_fwd == "pseudogene_promoter" | annot_fwd == "non-coding_RNA" | annot_fwd == "unknown_promoter"'
q_incr_ = ' | '.join(['(lcap_incr_%s_fwd)' % (stage,) for stage in config['lcap_geo_by_stage']])
q_njump_ = ' & '.join(['(~lcap_jump_%s_fwd)' % (stage,) for stage in config['lcap_geo_by_stage']])
q_incronly_ = '(' + q_incr_ + ') & (' + q_njump_ + ')'
df_ = df_regl.query(q_).reset_index(drop=True).query(q_incronly_).reset_index(drop=True)
write_gffbed(
fp = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_incr_only_fwd.bed',
chrom = df_['chrom'],
start = df_['start'],
end = df_['end'],
attr = df_[['lcap_incr_%s_fwd' % (stage,) for stage in config['lcap_geo_by_stage']]],
strand = '+',
itemRgb = list(map(lambda f: yp.RED if f else yp.BLUE,
df_[['lcap_incr_%s_fwd' % (stage,) for stage in config['lcap_geo_by_stage']]].any(axis=1),
)),
)
q_ = 'annot_rev == "coding_promoter" | annot_rev == "pseudogene_promoter" | annot_rev == "non-coding_RNA" | annot_rev == "unknown_promoter"'
q_incr_ = ' | '.join(['(lcap_incr_%s_rev)' % (stage,) for stage in config['lcap_geo_by_stage']])
q_njump_ = ' & '.join(['(~lcap_jump_%s_rev)' % (stage,) for stage in config['lcap_geo_by_stage']])
q_incronly_ = '(' + q_incr_ + ') & (' + q_njump_ + ')'
df_ = df_regl.query(q_).reset_index(drop=True).query(q_incronly_).reset_index(drop=True)
write_gffbed(
fp = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_incr_only_rev.bed',
chrom = df_['chrom'],
start = df_['start'],
end = df_['end'],
attr = df_[['lcap_incr_%s_rev' % (stage,) for stage in config['lcap_geo_by_stage']]],
strand = '-',
itemRgb = list(map(lambda f: yp.RED if f else yp.BLUE,
df_[['lcap_incr_%s_rev' % (stage,) for stage in config['lcap_geo_by_stage']]].any(axis=1),
)),
)
!wc -l annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_incr_only_fwd.bed
!wc -l annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_incr_only_rev.bed
In [ ]:
# Per-strand promoter tracks for the two long-cap tests ("jump" and "incr").
# For each strand, every promoter-annotated site is written twice: once with
# its lcap_jump_* flags and once with its lcap_incr_* flags as attributes.
# A site is coloured yp.RED when any stage flag of that test is set, else yp.BLUE.
# The four outputs were previously four copy-pasted calls; the loop writes the
# same files in the same order (jump_fwd, incr_fwd, jump_rev, incr_rev).
for strand_name_, strand_symbol_ in [('fwd', '+'), ('rev', '-')]:
    # Sites annotated as any kind of promoter on this strand.
    q_ = ' | '.join([
        'annot_%s == "%s"' % (strand_name_, annot)
        for annot in ['coding_promoter', 'pseudogene_promoter', 'non-coding_RNA', 'unknown_promoter']
    ])
    df_ = df_regl.query(q_).reset_index(drop=True)

    for test_name_ in ['jump', 'incr']:
        flag_cols_ = [
            'lcap_%s_%s_%s' % (test_name_, stage, strand_name_)
            for stage in config['lcap_geo_by_stage']
        ]
        write_gffbed(
            fp = 'annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_%s_%s.bed' % (test_name_, strand_name_),
            chrom = df_['chrom'],
            start = df_['start'],
            end = df_['end'],
            attr = df_[flag_cols_],
            strand = strand_symbol_,
            itemRgb = [yp.RED if f else yp.BLUE for f in df_[flag_cols_].any(axis=1)],
        )
In [ ]:
# annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_jump_fwd.bed
# annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_jump_rev.bed
# annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_incr_fwd.bed
# annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_incr_rev.bed
# summary results
def itemRgb_(is_jump, is_incr):
    """Choose the track colour for one site from its two test flags.

    Args:
        is_jump: truthy when the site passed the "jump" test.
        is_incr: truthy when the site passed the "incr" test.

    Returns:
        RED if is_jump is set (regardless of is_incr), YELLOW if only
        is_incr is set, BLUE otherwise.
    """
    # NOTE(review): other cells refer to the colours as yp.RED / yp.BLUE;
    # confirm the bare names RED, YELLOW and BLUE exist in the kernel namespace.
    if is_jump:
        return RED
    if is_incr:
        return YELLOW
    return BLUE
# Summary track of the long-cap test results for the forward strand: padj and
# log2FoldChange columns of the first six entries of config['lcap_geo_by_stage']
# as attributes, coloured by itemRgb_ from the per-site "any jump" / "any incr" flags.
# NOTE(review): df_, df_jump and df_incr are not defined in this cell -- they
# come from kernel state. Confirm they exist and share the same row order,
# otherwise colours and coordinates will not correspond.
strand = 'fwd'
stages_ = config['lcap_geo_by_stage'][:6]
write_gffbed(
    fp = 'annot/S2_regulatory_annotation/_lcap_jump/lcap_tests_wt_%(strand)s.bed' % {'strand': strand},
    chrom = df_['chrom'],
    start = df_['start'],
    end = df_['end'],
    attr = df_[
        ['lcap_%s_%s_padj' % (stage, strand) for stage in stages_] +
        ['lcap_%s_%s_log2FoldChange' % (stage, strand) for stage in stages_]
    ],
    strand = '+',
    # Materialise the colours as a list (as the other cells do): a bare map()
    # is a one-shot iterator in Python 3.
    itemRgb = list(map(
        itemRgb_,
        df_jump[['lcap_jump_%s_%s' % (stage, strand) for stage in stages_]].any(axis=1),
        df_incr[['lcap_incr_%s_%s' % (stage, strand) for stage in stages_]].any(axis=1),
    )),
)
In [22]:
# Build one table of coding promoters (both strands) with a common column
# layout, then attach operon_rank / operon_id from the gene table by gene_id.
df_prom_fwd = df_regl.query('(annot_fwd == "coding_promoter")')
df_prom_rev = df_regl.query('(annot_rev == "coding_promoter")')

df_prom_fwd = df_prom_fwd[['chrom', 'start', 'end', 'tss_fwd', 'promoter_gene_id_fwd', 'promoter_locus_id_fwd']].copy()
df_prom_rev = df_prom_rev[['chrom', 'start', 'end', 'tss_rev', 'promoter_gene_id_rev', 'promoter_locus_id_rev']].copy()

df_prom_fwd['strand'] = '+'
df_prom_rev['strand'] = '-'

# Strand-specific column names are replaced by shared ones so the two halves stack.
df_prom_fwd.columns = ['chrom', 'start', 'end', 'tss', 'gene_id', 'locus_id', 'strand']
df_prom_rev.columns = ['chrom', 'start', 'end', 'tss', 'gene_id', 'locus_id', 'strand']

print(len(df_prom_fwd), 'fwd promoters')
print(len(df_prom_rev), 'rev promoters')

df_prom = pd.concat([df_prom_fwd, df_prom_rev], axis=0).reset_index(drop=True)
print(len(df_prom), 'total (coding) promtoters')

df_gene = pd.read_csv('WS260_ce10/WS260_ce10.genes_by_CV.tsv', sep='\t')

# Look the operon columns up by gene_id instead of assigning from an inner
# merge: the merge result gets a fresh 0..n-1 index, so a promoter whose
# gene_id is absent from df_gene (row dropped) or a repeated gene_id in df_gene
# (row duplicated) would shift every following value onto the wrong promoter.
# With the lookup, unmatched promoters simply get NaN.
assert df_gene['gene_id'].is_unique, 'duplicate gene_id in genes_by_CV table'
df_gene_by_id_ = df_gene.set_index('gene_id')
df_prom['operon_rank'] = df_prom['gene_id'].map(df_gene_by_id_['operon_rank'])
df_prom['operon_id'] = df_prom['gene_id'].map(df_gene_by_id_['operon_id'])
df_prom
Out[22]:
In [23]:
df_hybrid = df_prom.query('operon_rank > 1').sort_values(['chrom', 'start', 'end', 'strand']).reset_index(drop=True)
fp_ = 'annot/S2_regulatory_annotation/S2d_promoters_within_operons.tsv'
df_hybrid.to_csv(fp_, header=True, index=False, sep='\t')
!wc -l {fp_}
fp_ = 'annot/S2_regulatory_annotation/S2d_promoters_within_operons.bed'
write_gffbed(fp_,
chrom = df_hybrid['chrom'],
start = df_hybrid['start'],
end = df_hybrid['end'],
name = df_hybrid['locus_id'],
strand = df_hybrid['strand'],
attr = df_hybrid[['tss', 'gene_id', 'locus_id', 'operon_rank', 'operon_id']],
)
!wc -l {fp_}
In [ ]:
# Position-sorted promoter table, saved as a tab-separated file with a header row.
df_ = (
    df_prom
    .sort_values(['chrom', 'start', 'end', 'strand'])
    .reset_index(drop=True)
)
fp_ = 'annot/S2_regulatory_annotation/S2b_promoter_annotation.tsv'
df_.to_csv(fp_, sep='\t', index=False, header=True)
In [ ]:
# BED version of the promoter table held in df_ (set by an earlier cell):
# one feature per promoter, named by locus_id, with tss / gene_id / locus_id
# passed as attributes.
fp_ = 'annot/S2_regulatory_annotation/S2b_promoter_annotation.bed'
write_gffbed(
    fp_,
    chrom=df_['chrom'],
    start=df_['start'],
    end=df_['end'],
    name=df_['locus_id'],
    strand=df_['strand'],
    attr=df_[['tss', 'gene_id', 'locus_id']],
)
In [11]:
# Annotate current annotation by date
l_fp_inp = [
#'annot/S2_regulatory_annotation/S2_regulatory_annotation.bed',
#'annot/S2_regulatory_annotation/S2_regulatory_annotation.tsv',
#'annot/S2_regulatory_annotation/S2b_promoter_annotation.bed',
#'annot/S2_regulatory_annotation/S2b_promoter_annotation.tsv',
#'annot/S2_regulatory_annotation/S2c_outron-extended_genes.bed',
#'annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_incr_fwd.bed',
#'annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_jump_fwd.bed',
#'annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_incr_rev.bed',
#'annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_jump_rev.bed',
'annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_incr_only_fwd.bed',
'annot/S2_regulatory_annotation/S2_regulatory_annotation_promoter_incr_only_rev.bed',
]
date_ = "6Dec17"
for fp_inp in l_fp_inp:
#print(fp_inp)
#!git rm --cached {fp_inp}
fp_out = '%s_%s.%s' % (fp_inp[:-4], date_, fp_inp[-3:])
!cp {fp_inp} {fp_out}
!git add {fp_out}
In [ ]: