In [1]:
# Set annotation to annot_ce10
%run ~/relmapping/annot/notebooks/annot__init__.ipynb
annot_ = 'annot_ce10_eLife_full'
df_atac = pd.read_csv(os.path.join(annot_, 'accessible_sites.tsv'), sep='\t')
l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
df_lcap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_lcap', 'lcap_all_fwd.tsv'), sep='\t', low_memory=False)
df_lcap_rev = pd.read_csv(os.path.join(annot_, 'metrics_lcap', 'lcap_all_rev.tsv'), sep='\t', low_memory=False)
df_exon_fwd = pd.read_csv(os.path.join(annot_, 'metrics_exon', 'closest_exon_fwd.tsv'), sep='\t', low_memory=False)
df_exon_rev = pd.read_csv(os.path.join(annot_, 'metrics_exon', 'closest_exon_rev.tsv'), sep='\t', low_memory=False)
df_maxgap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_maxgap', 'maxgap_fwd.tsv'), sep='\t')
df_maxgap_rev = pd.read_csv(os.path.join(annot_, 'metrics_maxgap', 'maxgap_rev.tsv'), sep='\t')
df_scap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_scap', 'scap_fwd.tsv'), sep='\t')
df_scap_rev = pd.read_csv(os.path.join(annot_, 'metrics_scap', 'scap_rev.tsv'), sep='\t')
df_prom_fwd = pd.read_csv(os.path.join(annot_, 'metrics_type', 'prom_fwd.tsv'), sep='\t')
df_prom_rev = pd.read_csv(os.path.join(annot_, 'metrics_type', 'prom_rev.tsv'), sep='\t')
df_regl = pd.read_csv(os.path.join(annot_, 'metrics_type', 'regl.tsv'), sep='\t')
def mp(fp, annot_=annot_):
    """Return the path of file name `fp` inside the 'Source Data' folder of `annot_`.

    `annot_` defaults to the value the notebook-level `annot_` had when this
    function was defined.
    """
    source_data_dir = os.path.join(annot_, 'Source Data')
    return os.path.join(source_data_dir, fp)
In [2]:
# Fig 2 - source data 1 (regulatory annotation): a stricter subset of the (most
# informative) annotation metrics, written as one tab-separated table.
fp_ = mp('Fig 2 - source data 1. Regulatory annotation.txt')
l_cols_ = [
    'chrom', 'start', 'end',
    'annot', 'annot_fwd', 'annot_rev',
    'promoter_gene_id_fwd', 'promoter_locus_id_fwd', 'promoter_gene_biotype_fwd',
    'promoter_gene_id_rev', 'promoter_locus_id_rev', 'promoter_gene_biotype_rev',
    'associated_gene_id', 'associated_locus_id',
    'tss_fwd', 'tss_rev', 'scap_pass_fwd', 'scap_pass_rev',
]
# Per-stage lcap pass flags: every "jump" column first, then every "incr" column.
l_cols_fwd_ = ['lcap_%s_fwd_passed_%s' % (stage, test_)
               for test_ in ('jump', 'incr') for stage in config['stages']]
l_cols_rev_ = ['lcap_%s_rev_passed_%s' % (stage, test_)
               for test_ in ('jump', 'incr') for stage in config['stages']]

# pd.concat(axis=1) aligns on the index and pads unmatched rows with NaN, so make
# sure the three tables are row-aligned before gluing their columns together.
assert df_lcap_fwd.index.equals(df_regl.index) and df_lcap_rev.index.equals(df_regl.index), \
    'df_regl / df_lcap_fwd / df_lcap_rev are not row-aligned'
df_ = pd.concat([df_regl[l_cols_], df_lcap_fwd[l_cols_fwd_], df_lcap_rev[l_cols_rev_]], axis=1)\
    .rename(columns={'scap_pass_fwd': 'scap_fwd_passed', 'scap_pass_rev': 'scap_rev_passed'})

# Replace the promoter gene fields with '.' on each strand whose strand-specific
# annotation is not one of the types listed below.
l_promoter_annots_ = ['coding_promoter', 'pseudogene_promoter', 'non-coding_RNA']
p_fwd = df_['annot_fwd'].isin(l_promoter_annots_)
p_rev = df_['annot_rev'].isin(l_promoter_annots_)
for col_ in ['promoter_gene_id_fwd', 'promoter_locus_id_fwd', 'promoter_gene_biotype_fwd']:
    df_.loc[~p_fwd, col_] = '.'
for col_ in ['promoter_gene_id_rev', 'promoter_locus_id_rev', 'promoter_gene_biotype_rev']:
    df_.loc[~p_rev, col_] = '.'

# Associated-gene fields are kept only for these three annotation types; every
# other row gets '.'.
assoc = df_['annot'].isin(['unknown_promoter', 'putative_enhancer', 'other_element'])
for col_ in ['associated_gene_id', 'associated_locus_id']:
    df_.loc[~assoc, col_] = '.'

df_.to_csv(fp_, header=True, index=False, sep='\t')
In [3]:
# Load the supplementary table stored under metrics_atac_dynamics and preview it.
df_clust = pd.read_csv(
    os.path.join(annot_, 'metrics_atac_dynamics/20180530.supp-table.txt'),
    sep='\t',
)
df_clust.head(5)
Out[3]:
In [4]:
# For each coordinate column, print how many rows agree between df_regl and df_clust.
for col_regl_, col_clust_ in [('chrom', 'chr'), ('start', 'start'), ('end', 'stop')]:
    print((df_regl[col_regl_] == df_clust[col_clust_]).sum())
In [5]:
# Rows per 'clustersATAC.dev' label, most frequent first; missing labels are excluded.
df_clust['clustersATAC.dev'].value_counts(sort=True, dropna=True)
Out[5]:
In [6]:
# Rows per 'clustersATAC.age' label, most frequent first; missing labels are excluded.
df_clust['clustersATAC.age'].value_counts(sort=True, dropna=True)
Out[6]:
In [7]:
# Contingency table: annotation type (rows) against 'clustersATAC.dev' label (columns).
pd.crosstab(index=df_clust['annot'], columns=df_clust['clustersATAC.dev'])
Out[7]:
In [8]:
# Contingency table: annotation type (rows) against 'clustersATAC.age' label (columns).
pd.crosstab(index=df_clust['annot'], columns=df_clust['clustersATAC.age'])
Out[8]:
In [9]:
# Compact view of df_clust: the site coordinates plus the four dynamics/cluster
# columns, renamed to the names used in the exported tables.
df_clust_compact = (
    df_clust[['chr', 'start', 'stop', 'ATACdc.dev', 'ATACdc.age', 'clustersATAC.dev', 'clustersATAC.age']]
    .rename(columns={
        'chr': 'chrom',
        'stop': 'end',
        'ATACdc.dev': 'devel_is_dynamic',
        'ATACdc.age': 'ageing_is_dynamic',
        'clustersATAC.dev': 'devel_cluster_label',
        'clustersATAC.age': 'ageing_cluster_label',
    })
    .reset_index(drop=True)
)
print(len(df_clust_compact))
# Mark missing cluster labels with '.'. This is done with a frame-level fillna and
# a rebind: `df[col].fillna(..., inplace=True)` acts on a temporary Series, which
# warns in pandas 2.x and leaves the frame unchanged under Copy-on-Write.
df_clust_compact = df_clust_compact.fillna({'devel_cluster_label': '.', 'ageing_cluster_label': '.'})
df_clust_compact.head(20)
Out[9]:
In [10]:
# Write the compact cluster table into the 'Source Data' folder, tab-separated,
# with a header row and without the index.
df_clust_compact.to_csv(
    mp('Fig 4 - source data 1. Promoter accessibility.txt'),
    sep='\t',
    header=True,
    index=False,
)
In [37]:
# Expanded regulatory-element table (reg_elements_eLife_full_review_expanded.tsv):
# the annotation columns (including the *_ce11 coordinates and the detailed
# per-strand annotations) joined column-wise with the ATAC height columns, the
# lcap pass flags, both maxgap tables and the cluster/dynamics columns.
fp_ = os.path.join(annot_, 'reg_elements_eLife_full_review_expanded.tsv')

# Everything below is combined by index (column assignment and pd.concat(axis=1)),
# and unmatched rows would silently become NaN. Require identical indexes up front.
l_sources_ = [df_regl, df_prom_fwd, df_prom_rev, df_atac, df_lcap_fwd, df_lcap_rev,
              df_maxgap_fwd, df_maxgap_rev, df_clust_compact]
assert all(src_.index.equals(df_regl.index) for src_ in l_sources_), \
    'per-site tables are not row-aligned; column-wise concatenation would pad rows with NaN'

# NOTE: these two assignments add columns to the shared df_regl frame in place.
df_regl['annot_detailed_fwd'] = df_prom_fwd['annot_detailed_summary']
df_regl['annot_detailed_rev'] = df_prom_rev['annot_detailed_summary']
l_cols_ = [
    'chrom', 'start', 'end', 'chrom_ce11', 'start_ce11', 'end_ce11',
    'annot', 'annot_fwd', 'annot_rev', 'annot_detailed_fwd', 'annot_detailed_rev',
    'promoter_gene_id_fwd', 'promoter_locus_id_fwd', 'promoter_gene_biotype_fwd',
    'promoter_gene_id_rev', 'promoter_locus_id_rev', 'promoter_gene_biotype_rev',
    'associated_gene_id', 'associated_locus_id',
    'tss_fwd', 'tss_rev', 'scap_pass_fwd', 'scap_pass_rev',
]
# Per-stage lcap pass flags: every "jump" column first, then every "incr" column.
l_cols_lcap_fwd_ = ['lcap_%s_fwd_passed_%s' % (stage, test_)
                    for test_ in ('jump', 'incr') for stage in config['stages']]
l_cols_lcap_rev_ = ['lcap_%s_rev_passed_%s' % (stage, test_)
                    for test_ in ('jump', 'incr') for stage in config['stages']]
l_cols_clust_ = ['devel_is_dynamic', 'ageing_is_dynamic', 'devel_cluster_label', 'ageing_cluster_label']

df_ = pd.concat([
    df_regl[l_cols_].rename(columns={'scap_pass_fwd': 'scap_fwd_passed', 'scap_pass_rev': 'scap_rev_passed'}),
    df_atac[['atac_%s_height' % stage_ for stage_ in config['stages']] + ['atac_source']],
    df_lcap_fwd[l_cols_lcap_fwd_],
    df_lcap_rev[l_cols_lcap_rev_],
    df_maxgap_fwd,
    df_maxgap_rev,
    df_clust_compact[l_cols_clust_],
], axis=1)

# Replace the promoter gene fields with '.' on each strand whose strand-specific
# annotation is not one of the types listed below.
l_promoter_annots_ = ['coding_promoter', 'pseudogene_promoter', 'non-coding_RNA']
p_fwd = df_['annot_fwd'].isin(l_promoter_annots_)
p_rev = df_['annot_rev'].isin(l_promoter_annots_)
for col_ in ['promoter_gene_id_fwd', 'promoter_locus_id_fwd', 'promoter_gene_biotype_fwd']:
    df_.loc[~p_fwd, col_] = '.'
for col_ in ['promoter_gene_id_rev', 'promoter_locus_id_rev', 'promoter_gene_biotype_rev']:
    df_.loc[~p_rev, col_] = '.'

# Associated-gene fields are kept only for these three annotation types; every
# other row gets '.'.
assoc = df_['annot'].isin(['unknown_promoter', 'putative_enhancer', 'other_element'])
for col_ in ['associated_gene_id', 'associated_locus_id']:
    df_.loc[~assoc, col_] = '.'

df_.to_csv(fp_, header=True, index=False, sep='\t')